Merge branch 'master' into master

alesapin 2018-08-14 13:35:32 +03:00 committed by GitHub
commit edc2dc45b6
143 changed files with 8025 additions and 352 deletions

View File

@ -13,6 +13,7 @@
#include <Common/typeid_cast.h>
#include <Poco/String.h>
#include <DataTypes/DataTypeWithDictionary.h>
namespace DB
@ -41,6 +42,20 @@ void AggregateFunctionFactory::registerFunction(const String & name, Creator cre
ErrorCodes::LOGICAL_ERROR);
}
static DataTypes convertTypesWithDictionaryToNested(const DataTypes & types)
{
DataTypes res_types;
res_types.reserve(types.size());
for (const auto & type : types)
{
if (auto * type_with_dict = typeid_cast<const DataTypeWithDictionary *>(type.get()))
res_types.push_back(type_with_dict->getDictionaryType());
else
res_types.push_back(type);
}
return res_types;
}
AggregateFunctionPtr AggregateFunctionFactory::get(
const String & name,
@ -48,6 +63,8 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
const Array & parameters,
int recursion_level) const
{
auto type_without_dictionary = convertTypesWithDictionaryToNested(argument_types);
/// If one of types is Nullable, we apply aggregate function combinator "Null".
if (std::any_of(argument_types.begin(), argument_types.end(),
@ -57,7 +74,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
if (!combinator)
throw Exception("Logical error: cannot find aggregate function combinator to apply a function to Nullable arguments.", ErrorCodes::LOGICAL_ERROR);
DataTypes nested_types = combinator->transformArguments(argument_types);
DataTypes nested_types = combinator->transformArguments(type_without_dictionary);
AggregateFunctionPtr nested_function;
@ -70,7 +87,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
return combinator->transformAggregateFunction(nested_function, argument_types, parameters);
}
auto res = getImpl(name, argument_types, parameters, recursion_level);
auto res = getImpl(name, type_without_dictionary, parameters, recursion_level);
if (!res)
throw Exception("Logical error: AggregateFunctionFactory returned nullptr", ErrorCodes::LOGICAL_ERROR);
return res;
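
The new helper strips the dictionary wrapper from every argument type before the lookup, so an aggregate function over a dictionary-encoded column is resolved exactly as it would be over the plain nested type. A minimal standalone sketch of this unwrap-before-dispatch pattern (plain C++ with hypothetical type names, not the ClickHouse API):

#include <memory>
#include <vector>

struct IType { virtual ~IType() = default; };
struct WithDictionaryType : IType
{
    std::shared_ptr<IType> nested;
    explicit WithDictionaryType(std::shared_ptr<IType> nested_) : nested(std::move(nested_)) {}
};

/// Replace every dictionary-encoded type by its nested type before lookup.
std::vector<std::shared_ptr<IType>> unwrapDictionaryTypes(const std::vector<std::shared_ptr<IType>> & types)
{
    std::vector<std::shared_ptr<IType>> res;
    res.reserve(types.size());
    for (const auto & type : types)
    {
        if (auto * with_dict = dynamic_cast<const WithDictionaryType *>(type.get()))
            res.push_back(with_dict->nested);
        else
            res.push_back(type);
    }
    return res;
}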

View File

@ -4,6 +4,7 @@
#include <IO/WriteBufferFromArena.h>
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnsCommon.h>
namespace DB
{
@ -161,6 +162,25 @@ ColumnPtr ColumnAggregateFunction::permute(const Permutation & perm, size_t limi
return std::move(res);
}
ColumnPtr ColumnAggregateFunction::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename Type>
ColumnPtr ColumnAggregateFunction::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
auto res = createView();
res->getData().resize(limit);
for (size_t i = 0; i < limit; ++i)
res->getData()[i] = getData()[indexes[i]];
return std::move(res);
}
INSTANTIATE_INDEX_IMPL(ColumnAggregateFunction);
/// Is required to support operations with Set
void ColumnAggregateFunction::updateHashWithValue(size_t n, SipHash & hash) const
{

View File

@ -156,6 +156,11 @@ public:
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type>
ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;

View File

@ -626,6 +626,44 @@ ColumnPtr ColumnArray::permute(const Permutation & perm, size_t limit) const
return std::move(res);
}
ColumnPtr ColumnArray::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename T>
ColumnPtr ColumnArray::indexImpl(const PaddedPODArray<T> & indexes, size_t limit) const
{
if (limit == 0)
return ColumnArray::create(data);
/// Convert indexes to UInt64 in case of overflow.
auto nested_indexes_column = ColumnUInt64::create();
PaddedPODArray<UInt64> & nested_indexes = nested_indexes_column->getData();
nested_indexes.reserve(getOffsets().back());
auto res = ColumnArray::create(data->cloneEmpty());
Offsets & res_offsets = res->getOffsets();
res_offsets.resize(limit);
size_t current_offset = 0;
for (size_t i = 0; i < limit; ++i)
{
for (size_t j = 0; j < sizeAt(indexes[i]); ++j)
nested_indexes.push_back(offsetAt(indexes[i]) + j);
current_offset += sizeAt(indexes[i]);
res_offsets[i] = current_offset;
}
if (current_offset != 0)
res->data = data->index(*nested_indexes_column, current_offset);
return std::move(res);
}
INSTANTIATE_INDEX_IMPL(ColumnArray);
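
Indexing an array column cannot copy rows directly: each selected row maps to a variable-length range of nested elements, so the row indexes are first translated into nested element indexes and the offsets are rebuilt. A simplified standalone version of that translation (plain C++; the limit == 0 and empty-result special cases of the real code are elided):

#include <cstdint>
#include <vector>

struct FlatArrays
{
    std::vector<int> data;            /// all elements, concatenated
    std::vector<uint64_t> offsets;    /// offsets[i] = end of row i in data

    uint64_t offsetAt(size_t i) const { return i == 0 ? 0 : offsets[i - 1]; }
    uint64_t sizeAt(size_t i) const { return offsets[i] - offsetAt(i); }
};

FlatArrays indexRows(const FlatArrays & src, const std::vector<uint64_t> & indexes)
{
    FlatArrays res;
    res.offsets.reserve(indexes.size());
    uint64_t current_offset = 0;
    for (uint64_t row : indexes)
    {
        /// Translate the row index into a range of nested element indexes.
        for (uint64_t j = 0; j < src.sizeAt(row); ++j)
            res.data.push_back(src.data[src.offsetAt(row) + j]);
        current_offset += src.sizeAt(row);
        res.offsets.push_back(current_offset);
    }
    return res;
}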
void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
size_t s = size();

View File

@ -71,6 +71,8 @@ public:
void popBack(size_t n) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type> ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void reserve(size_t n) override;

View File

@ -63,6 +63,18 @@ ColumnPtr ColumnConst::permute(const Permutation & perm, size_t limit) const
return ColumnConst::create(data, limit);
}
ColumnPtr ColumnConst::index(const IColumn & indexes, size_t limit) const
{
if (limit == 0)
limit = indexes.size();
if (indexes.size() < limit)
throw Exception("Size of indexes (" + toString(indexes.size()) + ") is less than required (" + toString(limit) + ")",
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
return ColumnConst::create(data, limit);
}
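
Because every row of a constant column is the same value, the index values themselves are never read; only the number of requested rows matters. The same invariant as a one-function sketch (plain C++, not the ClickHouse API):

#include <cstddef>
#include <vector>

/// Gathering any `limit` rows from a column of identical values just
/// materializes `limit` copies of that value.
std::vector<int> indexConstant(int value, size_t limit)
{
    return std::vector<int>(limit, value);
}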
MutableColumns ColumnConst::scatter(ColumnIndex num_columns, const Selector & selector) const
{
if (s != selector.size())

View File

@ -153,6 +153,7 @@ public:
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
size_t byteSize() const override

View File

@ -1,4 +1,5 @@
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnsCommon.h>
#include <Common/Arena.h>
#include <Common/SipHash.h>
@ -258,6 +259,32 @@ ColumnPtr ColumnFixedString::permute(const Permutation & perm, size_t limit) con
return std::move(res);
}
ColumnPtr ColumnFixedString::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename Type>
ColumnPtr ColumnFixedString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
if (limit == 0)
return ColumnFixedString::create(n);
auto res = ColumnFixedString::create(n);
Chars_t & res_chars = res->chars;
res_chars.resize(n * limit);
size_t offset = 0;
for (size_t i = 0; i < limit; ++i, offset += n)
memcpySmallAllowReadWriteOverflow15(&res_chars[offset], &chars[indexes[i] * n], n);
return std::move(res);
}
ColumnPtr ColumnFixedString::replicate(const Offsets & offsets) const
{
size_t col_size = size();

View File

@ -108,6 +108,11 @@ public:
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type>
ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override

View File

@ -88,6 +88,15 @@ ColumnPtr ColumnFunction::permute(const Permutation & perm, size_t limit) const
return ColumnFunction::create(limit, function, capture);
}
ColumnPtr ColumnFunction::index(const IColumn & indexes, size_t limit) const
{
ColumnsWithTypeAndName capture = captured_columns;
for (auto & column : capture)
column.column = column.column->index(indexes, limit);
return ColumnFunction::create(limit, function, capture);
}
std::vector<MutableColumnPtr> ColumnFunction::scatter(IColumn::ColumnIndex num_columns,
const IColumn::Selector & selector) const
{

View File

@ -33,6 +33,7 @@ public:
ColumnPtr replicate(const Offsets & offsets) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
void insertDefault() override;
void popBack(size_t n) override;
std::vector<MutableColumnPtr> scatter(IColumn::ColumnIndex num_columns,

View File

@ -166,6 +166,13 @@ ColumnPtr ColumnNullable::permute(const Permutation & perm, size_t limit) const
return ColumnNullable::create(permuted_data, permuted_null_map);
}
ColumnPtr ColumnNullable::index(const IColumn & indexes, size_t limit) const
{
ColumnPtr indexed_data = getNestedColumn().index(indexes, limit);
ColumnPtr indexed_null_map = getNullMapColumn().index(indexes, limit);
return ColumnNullable::create(indexed_data, indexed_null_map);
}
int ColumnNullable::compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const
{
/// NULL values share the properties of NaN values.

View File

@ -65,6 +65,7 @@ public:
void popBack(size_t n) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;
void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void reserve(size_t n) override;

View File

@ -159,6 +159,48 @@ ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const
}
ColumnPtr ColumnString::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename Type>
ColumnPtr ColumnString::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
if (limit == 0)
return ColumnString::create();
auto res = ColumnString::create();
Chars_t & res_chars = res->chars;
Offsets & res_offsets = res->offsets;
size_t new_chars_size = 0;
for (size_t i = 0; i < limit; ++i)
new_chars_size += sizeAt(indexes[i]);
res_chars.resize(new_chars_size);
res_offsets.resize(limit);
Offset current_new_offset = 0;
for (size_t i = 0; i < limit; ++i)
{
size_t j = indexes[i];
size_t string_offset = j == 0 ? 0 : offsets[j - 1];
size_t string_size = offsets[j] - string_offset;
memcpySmallAllowReadWriteOverflow15(&res_chars[current_new_offset], &chars[string_offset], string_size);
current_new_offset += string_size;
res_offsets[i] = current_new_offset;
}
return std::move(res);
}
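
Unlike fixed-width columns, the result size here is data-dependent, so the copy is done in two passes: first sum the selected string lengths to size the chars buffer once, then copy and rebuild the offsets. A simplified standalone version (plain C++; edge cases elided):

#include <cstdint>
#include <cstring>
#include <vector>

struct FlatStrings
{
    std::vector<char> chars;          /// all strings, concatenated
    std::vector<uint64_t> offsets;    /// offsets[i] = end of string i in chars
};

FlatStrings indexStrings(const FlatStrings & src, const std::vector<uint64_t> & indexes)
{
    FlatStrings res;
    auto offset_at = [&](uint64_t i) { return i == 0 ? 0 : src.offsets[i - 1]; };

    /// Pass 1: compute the exact number of bytes to allocate.
    uint64_t total = 0;
    for (uint64_t i : indexes)
        total += src.offsets[i] - offset_at(i);
    res.chars.resize(total);

    /// Pass 2: copy each selected string and record its end offset.
    uint64_t current = 0;
    for (uint64_t i : indexes)
    {
        uint64_t size = src.offsets[i] - offset_at(i);
        memcpy(res.chars.data() + current, src.chars.data() + offset_at(i), size);
        current += size;
        res.offsets.push_back(current);
    }
    return res;
}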
template <bool positive>
struct ColumnString::less
{

View File

@ -220,6 +220,11 @@ public:
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type>
ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
void insertDefault() override
{
chars.push_back(0);

View File

@ -181,6 +181,17 @@ ColumnPtr ColumnTuple::permute(const Permutation & perm, size_t limit) const
return ColumnTuple::create(new_columns);
}
ColumnPtr ColumnTuple::index(const IColumn & indexes, size_t limit) const
{
const size_t tuple_size = columns.size();
Columns new_columns(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
new_columns[i] = columns[i]->index(indexes, limit);
return ColumnTuple::create(new_columns);
}
ColumnPtr ColumnTuple::replicate(const Offsets & offsets) const
{
const size_t tuple_size = columns.size();

View File

@ -60,6 +60,7 @@ public:
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
void gather(ColumnGathererStream & gatherer_stream) override;

View File

@ -0,0 +1,512 @@
#pragma once
#include <Columns/IColumnUnique.h>
#include <Columns/ReverseIndex.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/NumberTraits.h>
#include <Common/typeid_cast.h>
#include <ext/range.h>
namespace DB
{
template <typename ColumnType>
class ColumnUnique final : public COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType>>
{
friend class COWPtrHelper<IColumnUnique, ColumnUnique<ColumnType>>;
private:
explicit ColumnUnique(MutableColumnPtr && holder, bool is_nullable);
explicit ColumnUnique(const IDataType & type);
ColumnUnique(const ColumnUnique & other);
public:
MutableColumnPtr cloneEmpty() const override;
const ColumnPtr & getNestedColumn() const override;
const ColumnPtr & getNestedNotNullableColumn() const override { return column_holder; }
size_t uniqueInsert(const Field & x) override;
size_t uniqueInsertFrom(const IColumn & src, size_t n) override;
MutableColumnPtr uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length) override;
IColumnUnique::IndexesWithOverflow uniqueInsertRangeWithOverflow(const IColumn & src, size_t start, size_t length,
size_t max_dictionary_size) override;
size_t uniqueInsertData(const char * pos, size_t length) override;
size_t uniqueInsertDataWithTerminatingZero(const char * pos, size_t length) override;
size_t uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos) override;
size_t getDefaultValueIndex() const override { return is_nullable ? 1 : 0; }
size_t getNullValueIndex() const override;
bool canContainNulls() const override { return is_nullable; }
Field operator[](size_t n) const override { return (*getNestedColumn())[n]; }
void get(size_t n, Field & res) const override { getNestedColumn()->get(n, res); }
StringRef getDataAt(size_t n) const override { return getNestedColumn()->getDataAt(n); }
StringRef getDataAtWithTerminatingZero(size_t n) const override
{
return getNestedColumn()->getDataAtWithTerminatingZero(n);
}
UInt64 get64(size_t n) const override { return getNestedColumn()->get64(n); }
UInt64 getUInt(size_t n) const override { return getNestedColumn()->getUInt(n); }
Int64 getInt(size_t n) const override { return getNestedColumn()->getInt(n); }
bool isNullAt(size_t n) const override { return is_nullable && n == getNullValueIndex(); }
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override
{
return column_holder->serializeValueIntoArena(n, arena, begin);
}
void updateHashWithValue(size_t n, SipHash & hash) const override
{
return getNestedColumn()->updateHashWithValue(n, hash);
}
int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override
{
auto & column_unique = static_cast<const IColumnUnique&>(rhs);
return getNestedColumn()->compareAt(n, m, *column_unique.getNestedColumn(), nan_direction_hint);
}
void getExtremes(Field & min, Field & max) const override { column_holder->getExtremes(min, max); }
bool valuesHaveFixedSize() const override { return column_holder->valuesHaveFixedSize(); }
bool isFixedAndContiguous() const override { return column_holder->isFixedAndContiguous(); }
size_t sizeOfValueIfFixed() const override { return column_holder->sizeOfValueIfFixed(); }
bool isNumeric() const override { return column_holder->isNumeric(); }
size_t byteSize() const override { return column_holder->byteSize(); }
size_t allocatedBytes() const override
{
return column_holder->allocatedBytes()
+ index.allocatedBytes()
+ (cached_null_mask ? cached_null_mask->allocatedBytes() : 0);
}
void forEachSubcolumn(IColumn::ColumnCallback callback) override
{
callback(column_holder);
index.setColumn(getRawColumnPtr());
}
private:
ColumnPtr column_holder;
bool is_nullable;
ReverseIndex<UInt64, ColumnType> index;
/// For DataTypeNullable, stores null map.
mutable ColumnPtr cached_null_mask;
mutable ColumnPtr cached_column_nullable;
static size_t numSpecialValues(bool is_nullable) { return is_nullable ? 2 : 1; }
size_t numSpecialValues() const { return numSpecialValues(is_nullable); }
ColumnType * getRawColumnPtr() { return static_cast<ColumnType *>(column_holder->assumeMutable().get()); }
const ColumnType * getRawColumnPtr() const { return static_cast<const ColumnType *>(column_holder.get()); }
template <typename IndexType>
MutableColumnPtr uniqueInsertRangeImpl(
const IColumn & src,
size_t start,
size_t length,
size_t num_added_rows,
typename ColumnVector<IndexType>::MutablePtr && positions_column,
ReverseIndex<UInt64, ColumnType> * secondary_index,
size_t max_dictionary_size);
};
template <typename ColumnType>
MutableColumnPtr ColumnUnique<ColumnType>::cloneEmpty() const
{
return ColumnUnique<ColumnType>::create(column_holder->cloneResized(numSpecialValues()), is_nullable);
}
template <typename ColumnType>
ColumnUnique<ColumnType>::ColumnUnique(const ColumnUnique & other)
: column_holder(other.column_holder)
, is_nullable(other.is_nullable)
, index(numSpecialValues(is_nullable), 0)
{
index.setColumn(getRawColumnPtr());
}
template <typename ColumnType>
ColumnUnique<ColumnType>::ColumnUnique(const IDataType & type)
: is_nullable(type.isNullable())
, index(numSpecialValues(is_nullable), 0)
{
const auto & holder_type = is_nullable ? *static_cast<const DataTypeNullable &>(type).getNestedType() : type;
column_holder = holder_type.createColumn()->cloneResized(numSpecialValues());
index.setColumn(getRawColumnPtr());
}
template <typename ColumnType>
ColumnUnique<ColumnType>::ColumnUnique(MutableColumnPtr && holder, bool is_nullable)
: column_holder(std::move(holder))
, is_nullable(is_nullable)
, index(numSpecialValues(is_nullable), 0)
{
if (column_holder->size() < numSpecialValues())
throw Exception("Too small holder column for ColumnUnique.", ErrorCodes::ILLEGAL_COLUMN);
if (column_holder->isColumnNullable())
throw Exception("Holder column for ColumnUnique can't be nullable.", ErrorCodes::ILLEGAL_COLUMN);
index.setColumn(getRawColumnPtr());
}
template <typename ColumnType>
const ColumnPtr & ColumnUnique<ColumnType>::getNestedColumn() const
{
if (is_nullable)
{
size_t size = getRawColumnPtr()->size();
if (!cached_null_mask)
{
ColumnUInt8::MutablePtr null_mask = ColumnUInt8::create(size, UInt8(0));
null_mask->getData()[getNullValueIndex()] = 1;
cached_null_mask = std::move(null_mask);
cached_column_nullable = ColumnNullable::create(column_holder, cached_null_mask);
}
if (cached_null_mask->size() != size)
{
MutableColumnPtr null_mask = (*std::move(cached_null_mask)).mutate();
static_cast<ColumnUInt8 &>(*null_mask).getData().resize_fill(size);
cached_null_mask = std::move(null_mask);
cached_column_nullable = ColumnNullable::create(column_holder, cached_null_mask);
}
return cached_column_nullable;
}
return column_holder;
}
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::getNullValueIndex() const
{
if (!is_nullable)
throw Exception("ColumnUnique can't contain null values.", ErrorCodes::LOGICAL_ERROR);
return 0;
}
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsert(const Field & x)
{
if (x.getType() == Field::Types::Null)
return getNullValueIndex();
auto column = getRawColumnPtr();
auto prev_size = static_cast<UInt64>(column->size());
if ((*column)[getDefaultValueIndex()] == x)
return getDefaultValueIndex();
column->insert(x);
auto pos = index.insert(prev_size);
if (pos != prev_size)
column->popBack(1);
return pos;
}
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsertFrom(const IColumn & src, size_t n)
{
if (is_nullable && src.isNullAt(n))
return getNullValueIndex();
if (auto * nullable = typeid_cast<const ColumnNullable *>(&src))
return uniqueInsertFrom(nullable->getNestedColumn(), n);
auto ref = src.getDataAt(n);
return uniqueInsertData(ref.data, ref.size);
}
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsertData(const char * pos, size_t length)
{
auto column = getRawColumnPtr();
if (column->getDataAt(getDefaultValueIndex()) == StringRef(pos, length))
return getDefaultValueIndex();
UInt64 size = column->size();
UInt64 insertion_point = index.getInsertionPoint(StringRef(pos, length));
if (insertion_point == size)
{
column->insertData(pos, length);
index.insertFromLastRow();
}
return insertion_point;
}
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueInsertDataWithTerminatingZero(const char * pos, size_t length)
{
if (std::is_same<ColumnType, ColumnString>::value)
return uniqueInsertData(pos, length - 1);
if (column_holder->valuesHaveFixedSize())
return uniqueInsertData(pos, length);
/// Don't know if data actually has terminating zero. So insert it first.
auto column = getRawColumnPtr();
size_t prev_size = column->size();
column->insertDataWithTerminatingZero(pos, length);
if (column->compareAt(getDefaultValueIndex(), prev_size, *column, 1) == 0)
{
column->popBack(1);
return getDefaultValueIndex();
}
auto position = index.insert(prev_size);
if (position != prev_size)
column->popBack(1);
return static_cast<size_t>(position);
}
template <typename ColumnType>
size_t ColumnUnique<ColumnType>::uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos)
{
auto column = getRawColumnPtr();
size_t prev_size = column->size();
new_pos = column->deserializeAndInsertFromArena(pos);
if (column->compareAt(getDefaultValueIndex(), prev_size, *column, 1) == 0)
{
column->popBack(1);
return getDefaultValueIndex();
}
auto index_pos = index.insert(prev_size);
if (index_pos != prev_size)
column->popBack(1);
return static_cast<size_t>(index_pos);
}
template <typename IndexType>
static void checkIndexes(const ColumnVector<IndexType> & indexes, size_t max_dictionary_size)
{
auto & data = indexes.getData();
for (size_t i = 0; i < data.size(); ++i)
{
if (data[i] >= max_dictionary_size)
{
throw Exception("Found index " + toString(data[i]) + " at position " + toString(i)
+ " which is grated or equal than dictionary size " + toString(max_dictionary_size),
ErrorCodes::LOGICAL_ERROR);
}
}
}
template <typename ColumnType>
template <typename IndexType>
MutableColumnPtr ColumnUnique<ColumnType>::uniqueInsertRangeImpl(
const IColumn & src,
size_t start,
size_t length,
size_t num_added_rows,
typename ColumnVector<IndexType>::MutablePtr && positions_column,
ReverseIndex<UInt64, ColumnType> * secondary_index,
size_t max_dictionary_size)
{
const ColumnType * src_column;
const NullMap * null_map = nullptr;
auto & positions = positions_column->getData();
auto update_position = [&](UInt64 & next_position) -> MutableColumnPtr
{
constexpr auto next_size = NumberTraits::nextSize(sizeof(IndexType));
using SuperiorIndexType = typename NumberTraits::Construct<false, false, next_size>::Type;
++next_position;
if (next_position > std::numeric_limits<IndexType>::max())
{
if (sizeof(SuperiorIndexType) == sizeof(IndexType))
throw Exception("Can't find superior index type for type " + demangle(typeid(IndexType).name()),
ErrorCodes::LOGICAL_ERROR);
auto expanded_column = ColumnVector<SuperiorIndexType>::create(length);
auto & expanded_data = expanded_column->getData();
for (size_t i = 0; i < num_added_rows; ++i)
expanded_data[i] = positions[i];
return uniqueInsertRangeImpl<SuperiorIndexType>(
src,
start,
length,
num_added_rows,
std::move(expanded_column),
secondary_index,
max_dictionary_size);
}
return nullptr;
};
if (auto nullable_column = typeid_cast<const ColumnNullable *>(&src))
{
src_column = typeid_cast<const ColumnType *>(&nullable_column->getNestedColumn());
null_map = &nullable_column->getNullMapData();
}
else
src_column = typeid_cast<const ColumnType *>(&src);
if (src_column == nullptr)
throw Exception("Invalid column type for ColumnUnique::insertRangeFrom. Expected " + column_holder->getName() +
", got " + src.getName(), ErrorCodes::ILLEGAL_COLUMN);
auto column = getRawColumnPtr();
UInt64 next_position = column->size();
if (secondary_index)
next_position += secondary_index->size();
auto check_inserted_position = [&next_position](UInt64 inserted_position)
{
if (inserted_position != next_position)
throw Exception("Inserted position " + toString(inserted_position)
+ " is not equal with expected " + toString(next_position), ErrorCodes::LOGICAL_ERROR);
};
auto insert_key = [&](const StringRef & ref, ReverseIndex<UInt64, ColumnType> * cur_index)
{
positions[num_added_rows] = next_position;
cur_index->getColumn()->insertData(ref.data, ref.size);
auto inserted_pos = cur_index->insertFromLastRow();
check_inserted_position(inserted_pos);
return update_position(next_position);
};
for (; num_added_rows < length; ++num_added_rows)
{
auto row = start + num_added_rows;
if (null_map && (*null_map)[row])
positions[num_added_rows] = getNullValueIndex();
else if (column->compareAt(getDefaultValueIndex(), row, *src_column, 1) == 0)
positions[num_added_rows] = getDefaultValueIndex();
else
{
auto ref = src_column->getDataAt(row);
auto cur_index = &index;
bool inserted = false;
while (!inserted)
{
auto insertion_point = cur_index->getInsertionPoint(ref);
if (insertion_point == cur_index->lastInsertionPoint())
{
if (secondary_index && cur_index != secondary_index && next_position >= max_dictionary_size)
{
cur_index = secondary_index;
continue;
}
if (auto res = insert_key(ref, cur_index))
return res;
}
else
positions[num_added_rows] = insertion_point;
inserted = true;
}
}
}
// checkIndexes(*positions_column, column->size() + (overflowed_keys ? overflowed_keys->size() : 0));
return std::move(positions_column);
}
template <typename ColumnType>
MutableColumnPtr ColumnUnique<ColumnType>::uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length)
{
auto callForType = [this, &src, start, length](auto x) -> MutableColumnPtr
{
size_t size = getRawColumnPtr()->size();
using IndexType = decltype(x);
if (size <= std::numeric_limits<IndexType>::max())
{
auto positions = ColumnVector<IndexType>::create(length);
return this->uniqueInsertRangeImpl<IndexType>(src, start, length, 0, std::move(positions), nullptr, 0);
}
return nullptr;
};
MutableColumnPtr positions_column;
if (!positions_column)
positions_column = callForType(UInt8());
if (!positions_column)
positions_column = callForType(UInt16());
if (!positions_column)
positions_column = callForType(UInt32());
if (!positions_column)
positions_column = callForType(UInt64());
if (!positions_column)
throw Exception("Can't find index type for ColumnUnique", ErrorCodes::LOGICAL_ERROR);
return positions_column;
}
template <typename ColumnType>
IColumnUnique::IndexesWithOverflow ColumnUnique<ColumnType>::uniqueInsertRangeWithOverflow(
const IColumn & src,
size_t start,
size_t length,
size_t max_dictionary_size)
{
auto overflowed_keys = column_holder->cloneEmpty();
auto overflowed_keys_ptr = typeid_cast<ColumnType *>(overflowed_keys.get());
if (!overflowed_keys_ptr)
throw Exception("Invalid keys type for ColumnUnique.", ErrorCodes::LOGICAL_ERROR);
auto callForType = [this, &src, start, length, overflowed_keys_ptr, max_dictionary_size](auto x) -> MutableColumnPtr
{
size_t size = getRawColumnPtr()->size();
using IndexType = decltype(x);
if (size <= std::numeric_limits<IndexType>::max())
{
auto positions = ColumnVector<IndexType>::create(length);
ReverseIndex<UInt64, ColumnType> secondary_index(0, max_dictionary_size);
secondary_index.setColumn(overflowed_keys_ptr);
return this->uniqueInsertRangeImpl<IndexType>(src, start, length, 0, std::move(positions),
&secondary_index, max_dictionary_size);
}
return nullptr;
};
MutableColumnPtr positions_column;
if (!positions_column)
positions_column = callForType(UInt8());
if (!positions_column)
positions_column = callForType(UInt16());
if (!positions_column)
positions_column = callForType(UInt32());
if (!positions_column)
positions_column = callForType(UInt64());
if (!positions_column)
throw Exception("Can't find index type for ColumnUnique", ErrorCodes::LOGICAL_ERROR);
IColumnUnique::IndexesWithOverflow indexes_with_overflow;
indexes_with_overflow.indexes = std::move(positions_column);
indexes_with_overflow.overflowed_keys = std::move(overflowed_keys);
return indexes_with_overflow;
}
}
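
The core contract of ColumnUnique is that uniqueInsert* returns a stable position per distinct key, appending the key only on first sight; ReverseIndex serves as the hash index over the rows of the key column. A minimal analogue using std::unordered_map in its place (a sketch of the contract, not the ClickHouse implementation; here position 0 holds the default value, while a nullable dictionary would reserve position 0 for NULL and position 1 for the default):

#include <string>
#include <unordered_map>
#include <vector>

struct UniqueDict
{
    std::vector<std::string> keys{""};                 /// position 0: default value
    std::unordered_map<std::string, size_t> index{{"", 0}};

    size_t uniqueInsert(const std::string & key)
    {
        auto [it, inserted] = index.try_emplace(key, keys.size());
        if (inserted)
            keys.push_back(key);                       /// new key: append to dictionary
        return it->second;                             /// existing key: reuse its position
    }
};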

View File

@ -17,6 +17,8 @@
#if __SSE2__
#include <emmintrin.h>
#include <Columns/ColumnsCommon.h>
#endif
@ -230,6 +232,12 @@ ColumnPtr ColumnVector<T>::permute(const IColumn::Permutation & perm, size_t lim
return std::move(res);
}
template <typename T>
ColumnPtr ColumnVector<T>::index(const IColumn & indexes, size_t limit) const
{
return selectIndexImpl(*this, indexes, limit);
}
template <typename T>
ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets & offsets) const
{

View File

@ -252,6 +252,11 @@ public:
ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type>
ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
ColumnPtr replicate(const IColumn::Offsets & offsets) const override;
void getExtremes(Field & min, Field & max) const override;
@ -295,5 +300,23 @@ protected:
Container data;
};
template <typename T>
template <typename Type>
ColumnPtr ColumnVector<T>::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
{
size_t size = indexes.size();
if (limit == 0)
limit = size;
else
limit = std::min(size, limit);
auto res = this->create(limit);
typename Self::Container & res_data = res->getData();
for (size_t i = 0; i < limit; ++i)
res_data[i] = data[indexes[i]];
return std::move(res);
}
}

View File

@ -0,0 +1,619 @@
#include <Columns/ColumnWithDictionary.h>
#include <Columns/ColumnsNumber.h>
#include <DataStreams/ColumnGathererStream.h>
#include <DataTypes/NumberTraits.h>
#include <Common/HashTable/HashMap.h>
namespace DB
{
namespace
{
template <typename T>
PaddedPODArray<T> * getIndexesData(IColumn & indexes)
{
auto * column = typeid_cast<ColumnVector<T> *>(&indexes);
if (column)
return &column->getData();
return nullptr;
}
template <typename T>
MutableColumnPtr mapUniqueIndexImplRef(PaddedPODArray<T> & index)
{
PaddedPODArray<T> copy(index.cbegin(), index.cend());
HashMap<T, T> hash_map;
for (auto val : index)
hash_map.insert({val, hash_map.size()});
auto res_col = ColumnVector<T>::create();
auto & data = res_col->getData();
data.resize(hash_map.size());
for (auto val : hash_map)
data[val.second] = val.first;
for (auto & ind : index)
ind = hash_map[ind];
for (size_t i = 0; i < index.size(); ++i)
if (data[index[i]] != copy[i])
throw Exception("Expected " + toString(data[index[i]]) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR);
return std::move(res_col);
}
template <typename T>
MutableColumnPtr mapUniqueIndexImpl(PaddedPODArray<T> & index)
{
if (index.empty())
return ColumnVector<T>::create();
auto size = index.size();
T max_val = index[0];
for (size_t i = 1; i < size; ++i)
max_val = std::max(max_val, index[i]);
/// May happen when dictionary is shared.
if (max_val > size)
return mapUniqueIndexImplRef(index);
auto map_size = UInt64(max_val) + 1;
PaddedPODArray<T> map(map_size, 0);
T zero_pos_value = index[0];
index[0] = 0;
T cur_pos = 0;
for (size_t i = 1; i < size; ++i)
{
T val = index[i];
if (val != zero_pos_value && map[val] == 0)
{
++cur_pos;
map[val] = cur_pos;
}
index[i] = map[val];
}
auto res_col = ColumnVector<T>::create(UInt64(cur_pos) + 1);
auto & data = res_col->getData();
data[0] = zero_pos_value;
for (size_t i = 0; i < map_size; ++i)
{
auto val = map[i];
if (val)
data[val] = static_cast<T>(i);
}
return std::move(res_col);
}
/// Returns unique values of column. Write new index to column.
MutableColumnPtr mapUniqueIndex(IColumn & column)
{
if (auto * data_uint8 = getIndexesData<UInt8>(column))
return mapUniqueIndexImpl(*data_uint8);
else if (auto * data_uint16 = getIndexesData<UInt16>(column))
return mapUniqueIndexImpl(*data_uint16);
else if (auto * data_uint32 = getIndexesData<UInt32>(column))
return mapUniqueIndexImpl(*data_uint32);
else if (auto * data_uint64 = getIndexesData<UInt64>(column))
return mapUniqueIndexImpl(*data_uint64);
else
throw Exception("Indexes column for getUniqueIndex must be ColumnUInt, got" + column.getName(),
ErrorCodes::LOGICAL_ERROR);
}
}
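
In other words, mapUniqueIndex renumbers positions by order of first appearance and returns, for each new position, the old dictionary slot it replaces: positions {7, 2, 7, 5} become {0, 1, 0, 2} and the returned slots are {7, 2, 5}. A compact analogue with a hash map (plain C++; the real code keeps index[0]'s value at new position 0, which is exactly what first-appearance ordering gives):

#include <cstdint>
#include <unordered_map>
#include <vector>

std::vector<uint64_t> mapUnique(std::vector<uint64_t> & positions)
{
    std::unordered_map<uint64_t, uint64_t> remap;   /// old slot -> new slot
    std::vector<uint64_t> used;                     /// new slot -> old slot
    for (auto & p : positions)
    {
        auto [it, inserted] = remap.try_emplace(p, used.size());
        if (inserted)
            used.push_back(p);
        p = it->second;
    }
    return used;
}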
ColumnWithDictionary::ColumnWithDictionary(MutableColumnPtr && column_unique_, MutableColumnPtr && indexes_)
: dictionary(std::move(column_unique_)), idx(std::move(indexes_))
{
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insert(const Field & x)
{
compactIfSharedDictionary();
idx.insertPosition(dictionary.getColumnUnique().uniqueInsert(x));
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertDefault()
{
idx.insertPosition(getDictionary().getDefaultValueIndex());
}
void ColumnWithDictionary::insertFrom(const IColumn & src, size_t n)
{
auto * src_with_dict = typeid_cast<const ColumnWithDictionary *>(&src);
if (!src_with_dict)
throw Exception("Expected ColumnWithDictionary, got" + src.getName(), ErrorCodes::ILLEGAL_COLUMN);
size_t position = src_with_dict->getIndexes().getUInt(n);
if (&src_with_dict->getDictionary() == &getDictionary())
{
/// Dictionary is shared with src column. Insert only index.
idx.insertPosition(position);
}
else
{
compactIfSharedDictionary();
const auto & nested = *src_with_dict->getDictionary().getNestedColumn();
idx.insertPosition(dictionary.getColumnUnique().uniqueInsertFrom(nested, position));
}
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertFromFullColumn(const IColumn & src, size_t n)
{
compactIfSharedDictionary();
idx.insertPosition(dictionary.getColumnUnique().uniqueInsertFrom(src, n));
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
auto * src_with_dict = typeid_cast<const ColumnWithDictionary *>(&src);
if (!src_with_dict)
throw Exception("Expected ColumnWithDictionary, got" + src.getName(), ErrorCodes::ILLEGAL_COLUMN);
if (&src_with_dict->getDictionary() == &getDictionary())
{
/// Dictionary is shared with src column. Insert only indexes.
idx.insertPositionsRange(src_with_dict->getIndexes(), start, length);
}
else
{
compactIfSharedDictionary();
/// TODO: Support native insertion from other unique column. It will help to avoid null map creation.
auto sub_idx = (*src_with_dict->getIndexes().cut(start, length)).mutate();
auto idx_map = mapUniqueIndex(*sub_idx);
auto src_nested = src_with_dict->getDictionary().getNestedColumn();
auto used_keys = src_nested->index(*idx_map, 0);
auto inserted_indexes = dictionary.getColumnUnique().uniqueInsertRangeFrom(*used_keys, 0, used_keys->size());
idx.insertPositionsRange(*inserted_indexes->index(*sub_idx, 0), 0, length);
}
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertRangeFromFullColumn(const IColumn & src, size_t start, size_t length)
{
compactIfSharedDictionary();
auto inserted_indexes = dictionary.getColumnUnique().uniqueInsertRangeFrom(src, start, length);
idx.insertPositionsRange(*inserted_indexes, 0, length);
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertRangeFromDictionaryEncodedColumn(const IColumn & keys, const IColumn & positions)
{
Index(positions.getPtr()).check(keys.size());
compactIfSharedDictionary();
auto inserted_indexes = dictionary.getColumnUnique().uniqueInsertRangeFrom(keys, 0, keys.size());
idx.insertPositionsRange(*inserted_indexes->index(positions, 0), 0, positions.size());
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertData(const char * pos, size_t length)
{
compactIfSharedDictionary();
idx.insertPosition(dictionary.getColumnUnique().uniqueInsertData(pos, length));
idx.check(getDictionary().size());
}
void ColumnWithDictionary::insertDataWithTerminatingZero(const char * pos, size_t length)
{
compactIfSharedDictionary();
idx.insertPosition(dictionary.getColumnUnique().uniqueInsertDataWithTerminatingZero(pos, length));
idx.check(getDictionary().size());
}
StringRef ColumnWithDictionary::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{
return getDictionary().serializeValueIntoArena(getIndexes().getUInt(n), arena, begin);
}
const char * ColumnWithDictionary::deserializeAndInsertFromArena(const char * pos)
{
compactIfSharedDictionary();
const char * new_pos;
idx.insertPosition(dictionary.getColumnUnique().uniqueDeserializeAndInsertFromArena(pos, new_pos));
idx.check(getDictionary().size());
return new_pos;
}
void ColumnWithDictionary::gather(ColumnGathererStream & gatherer)
{
gatherer.gather(*this);
}
MutableColumnPtr ColumnWithDictionary::cloneResized(size_t size) const
{
auto unique_ptr = dictionary.getColumnUniquePtr();
return ColumnWithDictionary::create((*std::move(unique_ptr)).mutate(), getIndexes().cloneResized(size));
}
int ColumnWithDictionary::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
{
const auto & column_with_dictionary = static_cast<const ColumnWithDictionary &>(rhs);
size_t n_index = getIndexes().getUInt(n);
size_t m_index = column_with_dictionary.getIndexes().getUInt(m);
return getDictionary().compareAt(n_index, m_index, column_with_dictionary.getDictionary(), nan_direction_hint);
}
void ColumnWithDictionary::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
if (limit == 0)
limit = size();
size_t unique_limit = std::min(limit, getDictionary().size());
Permutation unique_perm;
getDictionary().getNestedColumn()->getPermutation(reverse, unique_limit, nan_direction_hint, unique_perm);
/// TODO: optimize with sse.
/// Get indexes per row in column_unique.
std::vector<std::vector<size_t>> indexes_per_row(getDictionary().size());
size_t indexes_size = getIndexes().size();
for (size_t row = 0; row < indexes_size; ++row)
indexes_per_row[getIndexes().getUInt(row)].push_back(row);
/// Replicate permutation.
size_t perm_size = std::min(indexes_size, limit);
res.resize(perm_size);
size_t perm_index = 0;
for (size_t row = 0; row < indexes_size && perm_index < perm_size; ++row)
{
const auto & row_indexes = indexes_per_row[unique_perm[row]];
for (auto row_index : row_indexes)
{
res[perm_index] = row_index;
++perm_index;
if (perm_index == perm_size)
break;
}
}
}
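
The point of this permutation scheme is that comparisons happen only on the dictionary, which is typically far smaller than the column; rows are then emitted bucket by bucket in sorted dictionary order. A simplified standalone version (plain C++; reverse ordering, the limit, and NaN hints are elided):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <string>
#include <vector>

std::vector<size_t> sortedRowOrder(const std::vector<std::string> & dict,
                                   const std::vector<uint32_t> & positions)
{
    /// Sort dictionary slots by value (the dictionary is small).
    std::vector<size_t> slot_order(dict.size());
    std::iota(slot_order.begin(), slot_order.end(), 0);
    std::sort(slot_order.begin(), slot_order.end(),
              [&](size_t a, size_t b) { return dict[a] < dict[b]; });

    /// Bucket row numbers by the dictionary slot they reference.
    std::vector<std::vector<size_t>> rows_per_slot(dict.size());
    for (size_t row = 0; row < positions.size(); ++row)
        rows_per_slot[positions[row]].push_back(row);

    /// Emit rows slot by slot in sorted slot order.
    std::vector<size_t> res;
    res.reserve(positions.size());
    for (size_t slot : slot_order)
        for (size_t row : rows_per_slot[slot])
            res.push_back(row);
    return res;
}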
std::vector<MutableColumnPtr> ColumnWithDictionary::scatter(ColumnIndex num_columns, const Selector & selector) const
{
auto columns = getIndexes().scatter(num_columns, selector);
for (auto & column : columns)
{
auto unique_ptr = dictionary.getColumnUniquePtr();
column = ColumnWithDictionary::create((*std::move(unique_ptr)).mutate(), std::move(column));
}
return columns;
}
void ColumnWithDictionary::setSharedDictionary(const ColumnPtr & column_unique)
{
if (!empty())
throw Exception("Can't set ColumnUnique for ColumnWithDictionary because is't not empty.",
ErrorCodes::LOGICAL_ERROR);
dictionary.setShared(column_unique);
}
ColumnWithDictionary::MutablePtr ColumnWithDictionary::compact()
{
auto positions = idx.getPositions();
/// Create column with new indexes and old dictionary.
auto column = ColumnWithDictionary::create(getDictionary().assumeMutable(), (*std::move(positions)).mutate());
/// Will create new dictionary.
column->compactInplace();
return column;
}
ColumnWithDictionary::MutablePtr ColumnWithDictionary::cutAndCompact(size_t start, size_t length) const
{
auto sub_positions = (*idx.getPositions()->cut(start, length)).mutate();
/// Create column with new indexes and old dictionary.
auto column = ColumnWithDictionary::create(getDictionary().assumeMutable(), std::move(sub_positions));
/// Will create new dictionary.
column->compactInplace();
return column;
}
void ColumnWithDictionary::compactInplace()
{
auto positions = idx.detachPositions();
dictionary.compact(positions);
idx.attachPositions(std::move(positions));
}
void ColumnWithDictionary::compactIfSharedDictionary()
{
if (dictionary.isShared())
compactInplace();
}
ColumnWithDictionary::DictionaryEncodedColumn
ColumnWithDictionary::getMinimalDictionaryEncodedColumn(size_t offset, size_t limit) const
{
MutableColumnPtr sub_indexes = (*std::move(idx.getPositions()->cut(offset, limit))).mutate();
auto indexes_map = mapUniqueIndex(*sub_indexes);
auto sub_keys = getDictionary().getNestedColumn()->index(*indexes_map, 0);
return {std::move(sub_keys), std::move(sub_indexes)};
}
ColumnWithDictionary::Index::Index() : positions(ColumnUInt8::create()), size_of_type(sizeof(UInt8)) {}
ColumnWithDictionary::Index::Index(MutableColumnPtr && positions) : positions(std::move(positions))
{
updateSizeOfType();
}
ColumnWithDictionary::Index::Index(ColumnPtr positions) : positions(std::move(positions))
{
updateSizeOfType();
}
template <typename Callback>
void ColumnWithDictionary::Index::callForType(Callback && callback, size_t size_of_type)
{
switch (size_of_type)
{
case sizeof(UInt8): { callback(UInt8()); break; }
case sizeof(UInt16): { callback(UInt16()); break; }
case sizeof(UInt32): { callback(UInt32()); break; }
case sizeof(UInt64): { callback(UInt64()); break; }
default: {
throw Exception("Unexpected size of index type for ColumnWithDictionary: " + toString(size_of_type),
ErrorCodes::LOGICAL_ERROR);
}
}
}
size_t ColumnWithDictionary::Index::getSizeOfIndexType(const IColumn & column, size_t hint)
{
auto checkFor = [&](auto type) { return typeid_cast<const ColumnVector<decltype(type)> *>(&column) != nullptr; };
auto tryGetSizeFor = [&](auto type) -> size_t { return checkFor(type) ? sizeof(decltype(type)) : 0; };
if (hint)
{
size_t size = 0;
callForType([&](auto type) { size = tryGetSizeFor(type); }, hint);
if (size)
return size;
}
if (auto size = tryGetSizeFor(UInt8()))
return size;
if (auto size = tryGetSizeFor(UInt16()))
return size;
if (auto size = tryGetSizeFor(UInt32()))
return size;
if (auto size = tryGetSizeFor(UInt64()))
return size;
throw Exception("Unexpected indexes type for ColumnWithDictionary. Expected UInt, got " + column.getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
void ColumnWithDictionary::Index::attachPositions(ColumnPtr positions_)
{
positions = std::move(positions_);
updateSizeOfType();
}
template <typename IndexType>
typename ColumnVector<IndexType>::Container & ColumnWithDictionary::Index::getPositionsData()
{
auto * positions_ptr = typeid_cast<ColumnVector<IndexType> *>(positions->assumeMutable().get());
if (!positions_ptr)
throw Exception("Invalid indexes type for ColumnWithDictionary."
" Expected UInt" + toString(8 * sizeof(IndexType)) + ", got " + positions->getName(),
ErrorCodes::LOGICAL_ERROR);
return positions_ptr->getData();
}
template <typename IndexType>
void ColumnWithDictionary::Index::convertPositions()
{
auto convert = [&](auto x)
{
using CurIndexType = decltype(x);
auto & data = getPositionsData<CurIndexType>();
if (sizeof(CurIndexType) > sizeof(IndexType))
throw Exception("Converting indexes to smaller type: from " + toString(sizeof(CurIndexType)) +
" to " + toString(sizeof(IndexType)), ErrorCodes::LOGICAL_ERROR);
if (sizeof(CurIndexType) != sizeof(IndexType))
{
size_t size = data.size();
auto new_positions = ColumnVector<IndexType>::create(size);
auto & new_data = new_positions->getData();
/// TODO: Optimize with SSE?
for (size_t i = 0; i < size; ++i)
new_data[i] = data[i];
positions = std::move(new_positions);
size_of_type = sizeof(IndexType);
}
};
callForType(std::move(convert), size_of_type);
checkSizeOfType();
}
void ColumnWithDictionary::Index::expandType()
{
auto expand = [&](auto type)
{
using CurIndexType = decltype(type);
constexpr auto next_size = NumberTraits::nextSize(sizeof(CurIndexType));
if (next_size == sizeof(CurIndexType))
throw Exception("Can't expand indexes type for ColumnWithDictionary from type: "
+ demangle(typeid(CurIndexType).name()), ErrorCodes::LOGICAL_ERROR);
using NewIndexType = typename NumberTraits::Construct<false, false, next_size>::Type;
convertPositions<NewIndexType>();
};
callForType(std::move(expand), size_of_type);
}
UInt64 ColumnWithDictionary::Index::getMaxPositionForCurrentType() const
{
UInt64 value = 0;
callForType([&](auto type) { value = std::numeric_limits<decltype(type)>::max(); }, size_of_type);
return value;
}
void ColumnWithDictionary::Index::insertPosition(UInt64 position)
{
while (position > getMaxPositionForCurrentType())
expandType();
positions->assumeMutableRef().insert(UInt64(position));
checkSizeOfType();
}
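
insertPosition grows the index type lazily: positions start as UInt8 and are widened one step at a time only when a new position no longer fits. A standard-C++ analogue using std::variant in place of the type-erased column (a sketch under that assumption, not the ClickHouse API):

#include <cstdint>
#include <limits>
#include <type_traits>
#include <variant>
#include <vector>

using Positions = std::variant<std::vector<uint8_t>, std::vector<uint16_t>,
                               std::vector<uint32_t>, std::vector<uint64_t>>;

/// Largest position the current element type can represent.
uint64_t maxForCurrentType(const Positions & pos)
{
    return std::visit([](const auto & vec)
    {
        using T = typename std::decay_t<decltype(vec)>::value_type;
        return static_cast<uint64_t>(std::numeric_limits<T>::max());
    }, pos);
}

/// Widen by one step: uint8_t -> uint16_t -> uint32_t -> uint64_t.
void expandType(Positions & pos)
{
    std::visit([&](auto & vec)
    {
        using T = typename std::decay_t<decltype(vec)>::value_type;
        if constexpr (sizeof(T) < sizeof(uint64_t))
        {
            using Wider = std::conditional_t<sizeof(T) == 1, uint16_t,
                          std::conditional_t<sizeof(T) == 2, uint32_t, uint64_t>>;
            pos = std::vector<Wider>(vec.begin(), vec.end());
        }
    }, pos);
}

void insertPosition(Positions & pos, uint64_t position)
{
    while (position > maxForCurrentType(pos))
        expandType(pos);
    std::visit([&](auto & vec)
    {
        using T = typename std::decay_t<decltype(vec)>::value_type;
        vec.push_back(static_cast<T>(position));
    }, pos);
}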
void ColumnWithDictionary::Index::insertPositionsRange(const IColumn & column, size_t offset, size_t limit)
{
auto insertForType = [&](auto type)
{
using ColumnType = decltype(type);
const auto * column_ptr = typeid_cast<const ColumnVector<ColumnType> *>(&column);
if (!column_ptr)
return false;
if (size_of_type < sizeof(ColumnType))
convertPositions<ColumnType>();
if (size_of_type == sizeof(ColumnType))
positions->assumeMutableRef().insertRangeFrom(column, offset, limit);
else
{
auto copy = [&](auto cur_type)
{
using CurIndexType = decltype(cur_type);
auto & positions_data = getPositionsData<CurIndexType>();
const auto & column_data = column_ptr->getData();
size_t size = positions_data.size();
positions_data.resize(size + limit);
for (size_t i = 0; i < limit; ++i)
positions_data[size + i] = column_data[offset + i];
};
callForType(std::move(copy), size_of_type);
}
return true;
};
if (!insertForType(UInt8()) &&
!insertForType(UInt16()) &&
!insertForType(UInt32()) &&
!insertForType(UInt64()))
throw Exception("Invalid column for ColumnWithDictionary index. Expected UInt, got " + column.getName(),
ErrorCodes::ILLEGAL_COLUMN);
checkSizeOfType();
}
void ColumnWithDictionary::Index::check(size_t /*max_dictionary_size*/)
{
/// TODO: remove
/*
auto check = [&](auto cur_type)
{
using CurIndexType = decltype(cur_type);
auto & positions_data = getPositionsData<CurIndexType>();
for (size_t i = 0; i < positions_data.size(); ++i)
{
if (positions_data[i] >= max_dictionary_size)
{
throw Exception("Found index " + toString(positions_data[i]) + " at position " + toString(i)
+ " which is grated or equal than dictionary size " + toString(max_dictionary_size),
ErrorCodes::LOGICAL_ERROR);
}
}
};
callForType(std::move(check), size_of_type);
*/
}
void ColumnWithDictionary::Index::checkSizeOfType()
{
if (size_of_type != getSizeOfIndexType(*positions, size_of_type))
throw Exception("Invalid size of type. Expected " + toString(8 * size_of_type) +
", but positions are " + positions->getName(), ErrorCodes::LOGICAL_ERROR);
}
ColumnWithDictionary::Dictionary::Dictionary(MutableColumnPtr && column_unique_)
: column_unique(std::move(column_unique_))
{
checkColumn(*column_unique);
}
ColumnWithDictionary::Dictionary::Dictionary(ColumnPtr column_unique_)
: column_unique(std::move(column_unique_))
{
checkColumn(*column_unique);
}
void ColumnWithDictionary::Dictionary::checkColumn(const IColumn & column)
{
if (!dynamic_cast<const IColumnUnique *>(&column))
throw Exception("ColumnUnique expected as an argument of ColumnWithDictionary.", ErrorCodes::ILLEGAL_COLUMN);
}
void ColumnWithDictionary::Dictionary::setShared(const ColumnPtr & dictionary)
{
checkColumn(*dictionary);
column_unique = dictionary;
shared = true;
}
void ColumnWithDictionary::Dictionary::compact(ColumnPtr & positions)
{
auto new_column_unique = column_unique->cloneEmpty();
auto & unique = getColumnUnique();
auto & new_unique = static_cast<IColumnUnique &>(*new_column_unique);
auto indexes = mapUniqueIndex(positions->assumeMutableRef());
auto sub_keys = unique.getNestedColumn()->index(*indexes, 0);
auto new_indexes = new_unique.uniqueInsertRangeFrom(*sub_keys, 0, sub_keys->size());
positions = (*new_indexes->index(*positions, 0)).mutate();
column_unique = std::move(new_column_unique);
shared = false;
}
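
To make the compaction concrete (assumed semantics, hypothetical values): with dictionary keys {"", "a", "b", "c"} and positions {3, 1, 3}, mapUniqueIndex rewrites the positions in place to {0, 1, 0} and returns the used slots {3, 1}; indexing the old keys by those slots yields sub_keys {"c", "a"}; inserting the sub-keys into the fresh dictionary {""} places them at positions {1, 2}, and re-indexing {0, 1, 0} through that mapping gives final positions {1, 2, 1}, which reference only the compacted dictionary {"", "c", "a"}.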
}

View File

@ -0,0 +1,248 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/IColumnUnique.h>
#include <Common/typeid_cast.h>
#include <AggregateFunctions/AggregateFunctionCount.h>
#include "ColumnsNumber.h"
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
}
class ColumnWithDictionary final : public COWPtrHelper<IColumn, ColumnWithDictionary>
{
friend class COWPtrHelper<IColumn, ColumnWithDictionary>;
ColumnWithDictionary(MutableColumnPtr && column_unique, MutableColumnPtr && indexes);
ColumnWithDictionary(const ColumnWithDictionary & other) = default;
public:
/** Create immutable column using immutable arguments. These arguments may be shared with other columns.
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
*/
using Base = COWPtrHelper<IColumn, ColumnWithDictionary>;
static Ptr create(const ColumnPtr & column_unique_, const ColumnPtr & indexes_)
{
return ColumnWithDictionary::create(column_unique_->assumeMutable(), indexes_->assumeMutable());
}
template <typename ... Args, typename = typename std::enable_if<IsMutableColumns<Args ...>::value>::type>
static MutablePtr create(Args &&... args) { return Base::create(std::forward<Args>(args)...); }
std::string getName() const override { return "ColumnWithDictionary"; }
const char * getFamilyName() const override { return "ColumnWithDictionary"; }
ColumnPtr convertToFullColumn() const { return getDictionary().getNestedColumn()->index(getIndexes(), 0); }
ColumnPtr convertToFullColumnIfWithDictionary() const override { return convertToFullColumn(); }
MutableColumnPtr cloneResized(size_t size) const override;
size_t size() const override { return getIndexes().size(); }
Field operator[](size_t n) const override { return getDictionary()[getIndexes().getUInt(n)]; }
void get(size_t n, Field & res) const override { getDictionary().get(getIndexes().getUInt(n), res); }
StringRef getDataAt(size_t n) const override { return getDictionary().getDataAt(getIndexes().getUInt(n)); }
StringRef getDataAtWithTerminatingZero(size_t n) const override
{
return getDictionary().getDataAtWithTerminatingZero(getIndexes().getUInt(n));
}
UInt64 get64(size_t n) const override { return getDictionary().get64(getIndexes().getUInt(n)); }
UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); }
Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); }
bool isNullAt(size_t n) const override { return getDictionary().isNullAt(getIndexes().getUInt(n)); }
ColumnPtr cut(size_t start, size_t length) const override
{
return ColumnWithDictionary::create(dictionary.getColumnUniquePtr(), getIndexes().cut(start, length));
}
void insert(const Field & x) override;
void insertDefault() override;
void insertFrom(const IColumn & src, size_t n) override;
void insertFromFullColumn(const IColumn & src, size_t n);
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
void insertRangeFromFullColumn(const IColumn & src, size_t start, size_t length);
void insertRangeFromDictionaryEncodedColumn(const IColumn & keys, const IColumn & positions);
void insertData(const char * pos, size_t length) override;
void insertDataWithTerminatingZero(const char * pos, size_t length) override;
void popBack(size_t n) override { idx.popBack(n); }
StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override;
const char * deserializeAndInsertFromArena(const char * pos) override;
void updateHashWithValue(size_t n, SipHash & hash) const override
{
return getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash);
}
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override
{
return ColumnWithDictionary::create(dictionary.getColumnUniquePtr(), getIndexes().filter(filt, result_size_hint));
}
ColumnPtr permute(const Permutation & perm, size_t limit) const override
{
return ColumnWithDictionary::create(dictionary.getColumnUniquePtr(), getIndexes().permute(perm, limit));
}
ColumnPtr index(const IColumn & indexes_, size_t limit) const override
{
return ColumnWithDictionary::create(dictionary.getColumnUniquePtr(), getIndexes().index(indexes_, limit));
}
int compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
ColumnPtr replicate(const Offsets & offsets) const override
{
return ColumnWithDictionary::create(dictionary.getColumnUniquePtr(), getIndexes().replicate(offsets));
}
std::vector<MutableColumnPtr> scatter(ColumnIndex num_columns, const Selector & selector) const override;
void gather(ColumnGathererStream & gatherer_stream) override;
void getExtremes(Field & min, Field & max) const override
{
return getDictionary().index(getIndexes(), 0)->getExtremes(min, max); /// TODO: optimize
}
void reserve(size_t n) override { idx.reserve(n); }
size_t byteSize() const override { return idx.getPositions()->byteSize() + getDictionary().byteSize(); }
size_t allocatedBytes() const override { return idx.getPositions()->allocatedBytes() + getDictionary().allocatedBytes(); }
void forEachSubcolumn(ColumnCallback callback) override
{
callback(idx.getPositionsPtr());
/// Column doesn't own dictionary if it's shared.
if (!dictionary.isShared())
callback(dictionary.getColumnUniquePtr());
}
bool valuesHaveFixedSize() const override { return getDictionary().valuesHaveFixedSize(); }
bool isFixedAndContiguous() const override { return getDictionary().isFixedAndContiguous(); }
size_t sizeOfValueIfFixed() const override { return getDictionary().sizeOfValueIfFixed(); }
bool isNumeric() const override { return getDictionary().isNumeric(); }
bool withDictionary() const override { return true; }
const IColumnUnique & getDictionary() const { return dictionary.getColumnUnique(); }
/// IColumnUnique & getUnique() { return static_cast<IColumnUnique &>(*column_unique->assumeMutable()); }
/// ColumnPtr getUniquePtr() const { return column_unique; }
/// IColumn & getIndexes() { return idx.getPositions()->assumeMutableRef(); }
const IColumn & getIndexes() const { return *idx.getPositions(); }
const ColumnPtr & getIndexesPtr() const { return idx.getPositions(); }
///void setIndexes(MutableColumnPtr && indexes_) { indexes = std::move(indexes_); }
/// Set shared ColumnUnique for empty column with dictionary.
void setSharedDictionary(const ColumnPtr & column_unique);
/// Create a column with a new dictionary containing only the keys that are mentioned in the index.
MutablePtr compact();
/// Cut + compact.
MutablePtr cutAndCompact(size_t start, size_t length) const;
struct DictionaryEncodedColumn
{
ColumnPtr dictionary;
ColumnPtr indexes;
};
DictionaryEncodedColumn getMinimalDictionaryEncodedColumn(size_t offset, size_t limit) const;
class Index
{
public:
Index();
Index(const Index & other) = default;
explicit Index(MutableColumnPtr && positions);
explicit Index(ColumnPtr positions);
const ColumnPtr & getPositions() const { return positions; }
ColumnPtr & getPositionsPtr() { return positions; }
void insertPosition(UInt64 position);
void insertPositionsRange(const IColumn & column, size_t offset, size_t limit);
void popBack(size_t n) { positions->assumeMutableRef().popBack(n); }
void reserve(size_t n) { positions->assumeMutableRef().reserve(n); }
UInt64 getMaxPositionForCurrentType() const;
static size_t getSizeOfIndexType(const IColumn & column, size_t hint);
void check(size_t max_dictionary_size);
void checkSizeOfType();
ColumnPtr detachPositions() { return std::move(positions); }
void attachPositions(ColumnPtr positions_);
private:
ColumnPtr positions;
size_t size_of_type = 0;
void updateSizeOfType() { size_of_type = getSizeOfIndexType(*positions, size_of_type); }
void expandType();
template <typename IndexType>
typename ColumnVector<IndexType>::Container & getPositionsData();
template <typename IndexType>
void convertPositions();
template <typename Callback>
static void callForType(Callback && callback, size_t size_of_type);
};
private:
class Dictionary
{
public:
Dictionary(const Dictionary & other) = default;
explicit Dictionary(MutableColumnPtr && column_unique);
explicit Dictionary(ColumnPtr column_unique);
const ColumnPtr & getColumnUniquePtr() const { return column_unique; }
ColumnPtr & getColumnUniquePtr() { return column_unique; }
const IColumnUnique & getColumnUnique() const { return static_cast<const IColumnUnique &>(*column_unique); }
IColumnUnique & getColumnUnique() { return static_cast<IColumnUnique &>(column_unique->assumeMutableRef()); }
/// Dictionary may be shared for several mutable columns.
/// Immutable columns may have the same column unique, which isn't necessarily a shared dictionary.
void setShared(const ColumnPtr & dictionary);
bool isShared() const { return shared; }
/// Create a new dictionary with only the keys that are mentioned in positions.
void compact(ColumnPtr & positions);
private:
ColumnPtr column_unique;
bool shared = false;
void checkColumn(const IColumn & column);
};
Dictionary dictionary;
Index idx;
void compactInplace();
void compactIfSharedDictionary();
};
}

View File

@ -3,6 +3,10 @@
#endif
#include <Columns/IColumn.h>
#include <Common/typeid_cast.h>
#include <Columns/ColumnVector.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/HashMap.h>
namespace DB
@ -308,4 +312,22 @@ INSTANTIATE(Float64)
#undef INSTANTIATE
namespace detail
{
template <typename T>
const PaddedPODArray<T> * getIndexesData(const IColumn & indexes)
{
auto * column = typeid_cast<const ColumnVector<T> *>(&indexes);
if (column)
return &column->getData();
return nullptr;
}
template const PaddedPODArray<UInt8> * getIndexesData<UInt8>(const IColumn & indexes);
template const PaddedPODArray<UInt16> * getIndexesData<UInt16>(const IColumn & indexes);
template const PaddedPODArray<UInt32> * getIndexesData<UInt32>(const IColumn & indexes);
template const PaddedPODArray<UInt64> * getIndexesData<UInt64>(const IColumn & indexes);
}
}

View File

@ -8,6 +8,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
/// Counts how many bytes of `filt` are greater than zero.
size_t countBytesInFilter(const IColumn::Filter & filt);
@ -33,4 +38,38 @@ void filterArraysImplOnlyData(
PaddedPODArray<T> & res_elems,
const IColumn::Filter & filt, ssize_t result_size_hint);
namespace detail
{
template <typename T>
const PaddedPODArray<T> * getIndexesData(const IColumn & indexes);
}
/// Checks that limit <= indexes.size() and calls column.indexImpl(const PaddedPODArray<Type> & indexes, size_t limit).
template <typename Column>
ColumnPtr selectIndexImpl(const Column & column, const IColumn & indexes, size_t limit)
{
if (limit == 0)
limit = indexes.size();
if (indexes.size() < limit)
throw Exception("Size of indexes is less than required.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
if (auto * data_uint8 = detail::getIndexesData<UInt8>(indexes))
return column.template indexImpl<UInt8>(*data_uint8, limit);
else if (auto * data_uint16 = detail::getIndexesData<UInt16>(indexes))
return column.template indexImpl<UInt16>(*data_uint16, limit);
else if (auto * data_uint32 = detail::getIndexesData<UInt32>(indexes))
return column.template indexImpl<UInt32>(*data_uint32, limit);
else if (auto * data_uint64 = detail::getIndexesData<UInt64>(indexes))
return column.template indexImpl<UInt64>(*data_uint64, limit);
else
throw Exception("Indexes column for IColumn::select must be ColumnUInt, got" + indexes.getName(),
ErrorCodes::LOGICAL_ERROR);
}
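/// A minimal sketch (not part of this commit) of how a concrete column wires IColumn::index
/// through this helper; ExampleColumn and its getData() are hypothetical:
///
///     ColumnPtr ExampleColumn::index(const IColumn & indexes, size_t limit) const
///     {
///         return selectIndexImpl(*this, indexes, limit);
///     }
///
///     template <typename Type>
///     ColumnPtr ExampleColumn::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
///     {
///         auto res = ExampleColumn::create();
///         res->getData().reserve(limit);
///         for (size_t i = 0; i < limit; ++i)
///             res->getData().push_back(getData()[indexes[i]]);
///         return std::move(res);
///     }
///
///     INSTANTIATE_INDEX_IMPL(ExampleColumn)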
#define INSTANTIATE_INDEX_IMPL(Column) \
template ColumnPtr Column::indexImpl<UInt8>(const PaddedPODArray<UInt8> & indexes, size_t limit) const; \
template ColumnPtr Column::indexImpl<UInt16>(const PaddedPODArray<UInt16> & indexes, size_t limit) const; \
template ColumnPtr Column::indexImpl<UInt32>(const PaddedPODArray<UInt32> & indexes, size_t limit) const; \
template ColumnPtr Column::indexImpl<UInt64>(const PaddedPODArray<UInt64> & indexes, size_t limit) const;
}

View File

@ -27,14 +27,14 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column)
if (column.isColumnConst())
{
const ColumnConst & column_const = static_cast<const ColumnConst &>(column);
const IColumn & column_nested = column_const.getDataColumn();
ColumnPtr column_nested = column_const.getDataColumnPtr()->convertToFullColumnIfWithDictionary();
if (!typeid_cast<const ColumnUInt8 *>(&column_nested))
if (!typeid_cast<const ColumnUInt8 *>(column_nested.get()))
{
const ColumnNullable * column_nested_nullable = typeid_cast<const ColumnNullable *>(&column_nested);
const ColumnNullable * column_nested_nullable = typeid_cast<const ColumnNullable *>(column_nested.get());
if (!column_nested_nullable || !typeid_cast<const ColumnUInt8 *>(&column_nested_nullable->getNestedColumn()))
{
throw Exception("Illegal type " + column_nested.getName() + " of column for constant filter. Must be UInt8 or Nullable(UInt8).",
throw Exception("Illegal type " + column_nested->getName() + " of column for constant filter. Must be UInt8 or Nullable(UInt8).",
ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER);
}
}
@ -48,8 +48,13 @@ ConstantFilterDescription::ConstantFilterDescription(const IColumn & column)
}
FilterDescription::FilterDescription(const IColumn & column)
FilterDescription::FilterDescription(const IColumn & column_)
{
if (column_.withDictionary())
data_holder = column_.convertToFullColumnIfWithDictionary();
const auto & column = data_holder ? *data_holder : column_;
if (const ColumnUInt8 * concrete_column = typeid_cast<const ColumnUInt8 *>(&column))
{
data = &concrete_column->getData();

View File

@ -47,6 +47,10 @@ public:
*/
virtual Ptr convertToFullColumnIfConst() const { return {}; }
/// If column isn't ColumnWithDictionary, return itself.
/// If column is ColumnWithDictionary, transforms it to a full column.
virtual Ptr convertToFullColumnIfWithDictionary() const { return getPtr(); }
/// Creates empty column with the same type.
virtual MutablePtr cloneEmpty() const { return cloneResized(0); }
@ -188,6 +192,10 @@ public:
using Permutation = PaddedPODArray<size_t>;
virtual Ptr permute(const Permutation & perm, size_t limit) const = 0;
/// Creates a new column with values column[indexes[:limit]]. If limit is 0, all indexes are used.
/// Indexes must be one of the ColumnUInt types. For the default implementation, see selectIndexImpl in ColumnsCommon.h.
virtual Ptr index(const IColumn & indexes, size_t limit) const = 0;
/** Compares (*this)[n] and rhs[m].
* Returns negative number, 0, or positive number (*this)[n] is less, equal, greater than rhs[m] respectively.
* Is used in sortings.
@ -325,6 +333,8 @@ public:
/// Can be inside ColumnNullable.
virtual bool canBeInsideNullable() const { return false; }
virtual bool withDictionary() const { return false; }
virtual ~IColumn() {}

View File

@ -87,6 +87,14 @@ public:
return cloneDummy(limit ? std::min(s, limit) : s);
}
ColumnPtr index(const IColumn & indexes, size_t limit) const override
{
if (indexes.size() < limit)
throw Exception("Size of indexes is less than required.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
return cloneDummy(limit ? limit : s);
}
void getPermutation(bool /*reverse*/, size_t /*limit*/, int /*nan_direction_hint*/, Permutation & res) const override
{
res.resize(s);

View File

@ -0,0 +1,129 @@
#pragma once
#include <Columns/IColumn.h>
namespace DB
{
class IColumnUnique : public IColumn
{
public:
using ColumnUniquePtr = IColumn::template immutable_ptr<IColumnUnique>;
using MutableColumnUniquePtr = IColumn::template mutable_ptr<IColumnUnique>;
/// The column always contains Null if it's Nullable, and the empty string if it's String or Nullable(String).
/// So size() may be greater than the number of explicitly inserted unique values.
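/// For example, an IColumnUnique over Nullable(String) holds two entries (Null and the empty string)
/// before anything is inserted explicitly, so its size() is already 2.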
virtual const ColumnPtr & getNestedColumn() const = 0;
/// The same as getNestedColumn, but removes null map if nested column is nullable.
virtual const ColumnPtr & getNestedNotNullableColumn() const = 0;
size_t size() const override { return getNestedColumn()->size(); }
/// Appends new value at the end of column (column's size is increased by 1).
/// Is used to transform raw strings to Blocks (for example, inside input format parsers)
virtual size_t uniqueInsert(const Field & x) = 0;
virtual size_t uniqueInsertFrom(const IColumn & src, size_t n) = 0;
/// Appends range of elements from other column.
/// Could be used to concatenate columns.
virtual MutableColumnPtr uniqueInsertRangeFrom(const IColumn & src, size_t start, size_t length) = 0;
struct IndexesWithOverflow
{
MutableColumnPtr indexes;
MutableColumnPtr overflowed_keys;
};
/// Like uniqueInsertRangeFrom, but doesn't insert keys if the inner dictionary already has more than max_dictionary_size keys.
/// Keys that don't fit into the dictionary are put into overflowed_keys; indexes are calculated against the
/// concatenation of the nested column (which can be obtained via getNestedColumn()) and overflowed_keys.
virtual IndexesWithOverflow uniqueInsertRangeWithOverflow(const IColumn & src, size_t start,
size_t length, size_t max_dictionary_size) = 0;
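/// Illustrative example (the numbers are assumed): with max_dictionary_size = 3 and a dictionary that
/// already holds 3 keys, inserting a range with 2 new distinct keys puts those 2 keys into overflowed_keys,
/// and their indexes (3 and 4) point past the end of the nested column into overflowed_keys.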
/// Appends data located in specified memory chunk if it is possible (throws an exception if it cannot be implemented).
/// Is used to optimize some computations (in aggregation, for example).
/// Parameter length could be ignored if column values have fixed size.
virtual size_t uniqueInsertData(const char * pos, size_t length) = 0;
virtual size_t uniqueInsertDataWithTerminatingZero(const char * pos, size_t length) = 0;
virtual size_t getDefaultValueIndex() const = 0;
virtual size_t getNullValueIndex() const = 0;
virtual bool canContainNulls() const = 0;
virtual size_t uniqueDeserializeAndInsertFromArena(const char * pos, const char *& new_pos) = 0;
const char * getFamilyName() const override { return "ColumnUnique"; }
void insert(const Field &) override
{
throw Exception("Method insert is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
void insertRangeFrom(const IColumn &, size_t, size_t) override
{
throw Exception("Method insertRangeFrom is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
void insertData(const char *, size_t) override
{
throw Exception("Method insertData is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
void insertDefault() override
{
throw Exception("Method insertDefault is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
void popBack(size_t) override
{
throw Exception("Method popBack is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
void gather(ColumnGathererStream &) override
{
throw Exception("Method gather is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
const char * deserializeAndInsertFromArena(const char *) override
{
throw Exception("Method deserializeAndInsertFromArena is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
ColumnPtr index(const IColumn &, size_t) const override
{
throw Exception("Method index is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
ColumnPtr cut(size_t, size_t) const override
{
throw Exception("Method cut is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
ColumnPtr filter(const IColumn::Filter &, ssize_t) const override
{
throw Exception("Method filter is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
ColumnPtr permute(const IColumn::Permutation &, size_t) const override
{
throw Exception("Method permute is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
ColumnPtr replicate(const IColumn::Offsets &) const override
{
throw Exception("Method replicate is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
void getPermutation(bool, size_t, int, IColumn::Permutation &) const override
{
throw Exception("Method getPermutation is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
std::vector<MutableColumnPtr> scatter(IColumn::ColumnIndex, const IColumn::Selector &) const override
{
throw Exception("Method scatter is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
};
using ColumnUniquePtr = IColumnUnique::ColumnUniquePtr;
using MutableColumnUniquePtr = IColumnUnique::MutableColumnUniquePtr;
}

View File

@ -0,0 +1,412 @@
#pragma once
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashTable.h>
#include <Common/HashTable/HashTableAllocator.h>
#include <Columns/ColumnString.h>
#include <ext/range.h>
namespace DB
{
namespace
{
template <typename ColumnType, bool with_saved_hash, bool has_base_index>
struct ReverseIndexHashTableState;
template <typename ColumnType>
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ false>
{
constexpr static bool with_saved_hash = false;
constexpr static bool has_base_index = false;
ColumnType * index_column;
};
template <typename ColumnType>
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash */ false, /* has_base_index */ true>
{
constexpr static bool with_saved_hash = false;
constexpr static bool has_base_index = true;
ColumnType * index_column;
size_t base_index;
};
template <typename ColumnType>
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ false>
{
constexpr static bool with_saved_hash = true;
constexpr static bool has_base_index = false;
ColumnType * index_column;
typename ColumnVector<UInt64>::Container * saved_hash_column;
};
template <typename ColumnType>
struct ReverseIndexHashTableState<ColumnType, /* with_saved_hash = */ true, /* has_base_index */ true>
{
constexpr static bool with_saved_hash = true;
constexpr static bool has_base_index = true;
ColumnType * index_column;
typename ColumnVector<UInt64>::Container * saved_hash_column;
size_t base_index;
};
template <typename Hash>
struct ReverseIndexHash : public Hash
{
template <typename T>
size_t operator()(T) const
{
throw Exception("operator()(key) is not implemented for ReverseIndexHash.", ErrorCodes::LOGICAL_ERROR);
}
template <typename State, typename T>
size_t operator()(const State & state, T key) const
{
auto index = key;
if constexpr (State::has_base_index)
index -= state.base_index;
return Hash::operator()(state.index_column->getElement(index));
}
};
using ReverseIndexStringHash = ReverseIndexHash<StringRefHash>;
template <typename IndexType>
using ReverseIndexNumberHash = ReverseIndexHash<DefaultHash<IndexType>>;
template <typename IndexType, typename Hash, typename HashTable, typename ColumnType, bool string_hash, bool has_base_index>
struct ReverseIndexHashTableCell
: public HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>
{
using Base = HashTableCell<IndexType, Hash, ReverseIndexHashTableState<ColumnType, string_hash, has_base_index>>;
using State = typename Base::State;
using Base::Base;
using Base::key;
using Base::keyEquals;
using Base::isZero;
template <typename T>
static bool isZero(const T &, const State & /*state*/)
{
static_assert(!std::is_same_v<typename std::decay<T>::type, typename std::decay<IndexType>::type>);
return false;
}
/// Special case for comparing with something that is not in index_column.
/// When comparing elements inside the column, the default keyEquals checks only that the row numbers are equal.
bool keyEquals(const StringRef & object, size_t hash_ [[maybe_unused]], const State & state) const
{
auto index = key;
if constexpr (has_base_index)
index -= state.base_index;
if constexpr (string_hash)
return hash_ == (*state.saved_hash_column)[index] && object == state.index_column->getDataAt(index);
else
return object == state.index_column->getDataAt(index);
}
size_t getHash(const Hash & hash) const
{
auto index = key;
/// Hack: the HashTable itself is the Hash, so the state can be recovered from it.
const auto & state = static_cast<const State &>(static_cast<const HashTable &>(hash));
if constexpr (has_base_index)
index -= state.base_index;
if constexpr (string_hash)
return (*state.saved_hash_column)[index];
else
return hash(state, key);
}
};
template <typename Key, typename Cell, typename Hash>
class HashTableWithPublicState : public HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>
{
using State = typename Cell::State;
using Base = HashTable<Key, Cell, Hash, HashTableGrower<>, HashTableAllocator>;
public:
using Base::Base;
State & getState() { return *this; }
};
template <typename IndexType, typename ColumnType, bool has_base_index>
class ReverseIndexStringHashTable : public HashTableWithPublicState<
IndexType,
ReverseIndexHashTableCell<
IndexType,
ReverseIndexStringHash,
ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
ColumnType,
true,
has_base_index>,
ReverseIndexStringHash>
{
using Base = HashTableWithPublicState<
IndexType,
ReverseIndexHashTableCell<
IndexType,
ReverseIndexStringHash,
ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
ColumnType,
true,
has_base_index>,
ReverseIndexStringHash>;
public:
using Base::Base;
friend struct ReverseIndexHashTableCell<
IndexType,
ReverseIndexStringHash,
ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>,
ColumnType,
true,
has_base_index>;
};
template <typename IndexType, typename ColumnType, bool has_base_index>
class ReverseIndexNumberHashTable : public HashTableWithPublicState<
IndexType,
ReverseIndexHashTableCell<
IndexType,
ReverseIndexNumberHash<typename ColumnType::value_type>,
ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
ColumnType,
false,
has_base_index>,
ReverseIndexNumberHash<typename ColumnType::value_type>>
{
using Base = HashTableWithPublicState<
IndexType,
ReverseIndexHashTableCell<
IndexType,
ReverseIndexNumberHash<typename ColumnType::value_type>,
ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
ColumnType,
false,
has_base_index>,
ReverseIndexNumberHash<typename ColumnType::value_type>>;
public:
using Base::Base;
friend struct ReverseIndexHashTableCell<
IndexType,
ReverseIndexNumberHash<typename ColumnType::value_type>,
ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>,
ColumnType,
false,
has_base_index>;
};
template <typename IndexType, typename ColumnType, bool has_base_index, bool is_numeric_column>
struct SelectReverseIndexHashTable;
template <typename IndexType, typename ColumnType, bool has_base_index>
struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, true>
{
using Type = ReverseIndexNumberHashTable<IndexType, ColumnType, has_base_index>;
};
template <typename IndexType, typename ColumnType, bool has_base_index>
struct SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index, false>
{
using Type = ReverseIndexStringHashTable<IndexType, ColumnType, has_base_index>;
};
template <typename T>
constexpr bool isNumericColumn(const T *) { return false; }
template <typename T>
constexpr bool isNumericColumn(const ColumnVector<T> *) { return true; }
static_assert(isNumericColumn(static_cast<ColumnVector<UInt8> *>(nullptr)));
static_assert(!isNumericColumn(static_cast<ColumnString *>(nullptr)));
template <typename IndexType, typename ColumnType, bool has_base_index>
using ReverseIndexHashTable = typename SelectReverseIndexHashTable<IndexType, ColumnType, has_base_index,
isNumericColumn(static_cast<ColumnType *>(nullptr))>::Type;
}
template <typename IndexType, typename ColumnType>
class ReverseIndex
{
public:
explicit ReverseIndex(UInt64 num_prefix_rows_to_skip, UInt64 base_index)
: num_prefix_rows_to_skip(num_prefix_rows_to_skip), base_index(base_index) {}
void setColumn(ColumnType * column_);
static constexpr bool is_numeric_column = isNumericColumn(static_cast<ColumnType *>(nullptr));
static constexpr bool use_saved_hash = !is_numeric_column;
UInt64 insert(UInt64 from_position); /// Insert into index column[from_position];
UInt64 insertFromLastRow();
UInt64 getInsertionPoint(const StringRef & data);
UInt64 lastInsertionPoint() const { return size() + base_index; }
ColumnType * getColumn() const { return column; }
size_t size() const;
size_t allocatedBytes() const { return index ? index->getBufferSizeInBytes() : 0; }
private:
ColumnType * column = nullptr;
UInt64 num_prefix_rows_to_skip; /// The number of prefix rows in the column which won't be stored in the index.
UInt64 base_index; /// This value will be added to the row number which is inserted into the index.
using IndexMapType = ReverseIndexHashTable<IndexType, ColumnType, true>;
/// Lazy initialized.
std::unique_ptr<IndexMapType> index;
ColumnUInt64::MutablePtr saved_hash;
void buildIndex();
UInt64 getHash(const StringRef & ref) const
{
if constexpr (is_numeric_column)
{
using ValueType = typename ColumnType::value_type;
ValueType value = *reinterpret_cast<const ValueType *>(ref.data);
return DefaultHash<ValueType>()(value);
}
else
return StringRefHash()(ref);
}
};
template <typename IndexType, typename ColumnType>
void ReverseIndex<IndexType, ColumnType>::setColumn(ColumnType * column_)
{
if (column != column_)
index = nullptr;
column = column_;
}
template <typename IndexType, typename ColumnType>
size_t ReverseIndex<IndexType, ColumnType>::size() const
{
if (!column)
throw Exception("ReverseIndex has not size because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
return column->size();
}
template <typename IndexType, typename ColumnType>
void ReverseIndex<IndexType, ColumnType>::buildIndex()
{
if (index)
return;
if (!column)
throw Exception("ReverseIndex can't build index because index column wasn't set.", ErrorCodes::LOGICAL_ERROR);
auto size = column->size();
index = std::make_unique<IndexMapType>(size);
if constexpr (use_saved_hash)
saved_hash = ColumnUInt64::create(size);
auto & state = index->getState();
state.index_column = column;
state.base_index = base_index;
if constexpr (use_saved_hash)
state.saved_hash_column = &saved_hash->getData();
using IteratorType = typename IndexMapType::iterator;
IteratorType iterator;
bool inserted;
for (auto row : ext::range(num_prefix_rows_to_skip, size))
{
auto hash = getHash(column->getDataAt(row));
if constexpr (use_saved_hash)
saved_hash->getElement(row) = hash;
index->emplace(row + base_index, iterator, inserted, hash);
if (!inserted)
throw Exception("Duplicating keys found in ReverseIndex.", ErrorCodes::LOGICAL_ERROR);
}
}
template <typename IndexType, typename ColumnType>
UInt64 ReverseIndex<IndexType, ColumnType>::insert(UInt64 from_position)
{
if (!index)
buildIndex();
using IteratorType = typename IndexMapType::iterator;
IteratorType iterator;
bool inserted;
auto hash = getHash(column->getDataAt(from_position));
if constexpr (use_saved_hash)
{
auto & data = saved_hash->getData();
if (data.size() <= from_position)
data.resize(from_position + 1);
data[from_position] = hash;
}
index->emplace(from_position + base_index, iterator, inserted, hash);
return *iterator;
}
template <typename IndexType, typename ColumnType>
UInt64 ReverseIndex<IndexType, ColumnType>::insertFromLastRow()
{
if (!column)
throw Exception("ReverseIndex can't insert row from column because index column wasn't set.",
ErrorCodes::LOGICAL_ERROR);
UInt64 num_rows = size();
if (num_rows == 0)
throw Exception("ReverseIndex can't insert row from column because it is empty.", ErrorCodes::LOGICAL_ERROR);
UInt64 position = num_rows - 1;
UInt64 inserted_pos = insert(position);
if (position + base_index != inserted_pos)
throw Exception("Can't insert into reverse index from last row (" + toString(position + base_index)
+ ") because the same row is in position " + toString(inserted_pos), ErrorCodes::LOGICAL_ERROR);
return inserted_pos;
}
template <typename IndexType, typename ColumnType>
UInt64 ReverseIndex<IndexType, ColumnType>::getInsertionPoint(const StringRef & data)
{
if (!index)
buildIndex();
using IteratorType = typename IndexMapType::iterator;
IteratorType iterator;
auto hash = getHash(data);
iterator = index->find(data, hash);
return iterator == index->end() ? size() + base_index : *iterator;
}
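/// Usage sketch (not part of this commit; exampleInsertLastRow is hypothetical, the ReverseIndex
/// calls are the ones declared above). Indexes the given column and registers its last row.
template <typename ColumnType>
UInt64 exampleInsertLastRow(ColumnType & keys)
{
    ReverseIndex<UInt32, ColumnType> reverse_index(/* num_prefix_rows_to_skip */ 0, /* base_index */ 0);
    reverse_index.setColumn(&keys);
    /// Builds the hash index lazily, then inserts the last row of `keys`;
    /// throws if an equal key is already present at an earlier row.
    return reverse_index.insertFromLastRow();
}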
}

View File

@ -380,9 +380,10 @@ namespace ErrorCodes
extern const int INVALID_JOIN_ON_EXPRESSION = 403;
extern const int BAD_ODBC_CONNECTION_STRING = 404;
extern const int PARTITION_SIZE_EXCEEDS_MAX_DROP_SIZE_LIMIT = 405;
extern const int BAD_REQUEST_PARAMETER = 406;
extern const int EXTERNAL_EXECUTABLE_NOT_FOUND = 407;
extern const int EXTERNAL_SERVER_IS_NOT_RESPONDING = 408;
extern const int TOP_AND_LIMIT_TOGETHER = 406;
extern const int BAD_REQUEST_PARAMETER = 407;
extern const int EXTERNAL_EXECUTABLE_NOT_FOUND = 408;
extern const int EXTERNAL_SERVER_IS_NOT_RESPONDING = 409;
extern const int KEEPER_EXCEPTION = 999;
extern const int POCO_EXCEPTION = 1000;

View File

@ -53,6 +53,7 @@ struct HashMapCell
bool keyEquals(const Key & key_) const { return value.first == key_; }
bool keyEquals(const Key & key_, size_t /*hash_*/) const { return value.first == key_; }
bool keyEquals(const Key & key_, size_t /*hash_*/, const State & /*state*/) const { return value.first == key_; }
void setHash(size_t /*hash_value*/) {}
size_t getHash(const Hash & hash) const { return hash(value.first); }
@ -112,6 +113,7 @@ struct HashMapCellWithSavedHash : public HashMapCell<Key, TMapped, Hash, TState>
bool keyEquals(const Key & key_) const { return this->value.first == key_; }
bool keyEquals(const Key & key_, size_t hash_) const { return saved_hash == hash_ && this->value.first == key_; }
bool keyEquals(const Key & key_, size_t hash_, const typename Base::State &) const { return keyEquals(key_, hash_); }
void setHash(size_t hash_value) { saved_hash = hash_value; }
size_t getHash(const Hash & /*hash_function*/) const { return saved_hash; }

View File

@ -75,6 +75,7 @@ struct HashSetCellWithSavedHash : public HashTableCell<Key, Hash, TState>
bool keyEquals(const Key & key_) const { return this->key == key_; }
bool keyEquals(const Key & key_, size_t hash_) const { return saved_hash == hash_ && this->key == key_; }
bool keyEquals(const Key & key_, size_t hash_, const typename Base::State &) const { return keyEquals(key_, hash_); }
void setHash(size_t hash_value) { saved_hash = hash_value; }
size_t getHash(const Hash & /*hash_function*/) const { return saved_hash; }

View File

@ -108,6 +108,7 @@ struct HashTableCell
/// Are the keys at the cells equal?
bool keyEquals(const Key & key_) const { return key == key_; }
bool keyEquals(const Key & key_, size_t /*hash_*/) const { return key == key_; }
bool keyEquals(const Key & key_, size_t /*hash_*/, const State & /*state*/) const { return key == key_; }
/// If the cell can remember the value of the hash function, then remember it.
void setHash(size_t /*hash_value*/) {}
@ -280,9 +281,10 @@ protected:
#endif
/// Find a cell with the same key or an empty cell, starting from the specified position and further along the collision resolution chain.
size_t ALWAYS_INLINE findCell(const Key & x, size_t hash_value, size_t place_value) const
template <typename ObjectToCompareWith>
size_t ALWAYS_INLINE findCell(const ObjectToCompareWith & x, size_t hash_value, size_t place_value) const
{
while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value))
while (!buf[place_value].isZero(*this) && !buf[place_value].keyEquals(x, hash_value, *this))
{
place_value = grower.next(place_value);
#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
@ -734,7 +736,8 @@ public:
}
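/// The find() overloads below are templated on the object to compare with, so the table can be probed
/// with any type for which the cell implements keyEquals (e.g. a StringRef against a ReverseIndex cell).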
iterator ALWAYS_INLINE find(Key x)
template <typename ObjectToCompareWith>
iterator ALWAYS_INLINE find(ObjectToCompareWith x)
{
if (Cell::isZero(x, *this))
return this->hasZero() ? iteratorToZero() : end();
@ -745,7 +748,8 @@ public:
}
const_iterator ALWAYS_INLINE find(Key x) const
template <typename ObjectToCompareWith>
const_iterator ALWAYS_INLINE find(ObjectToCompareWith x) const
{
if (Cell::isZero(x, *this))
return this->hasZero() ? iteratorToZero() : end();
@ -756,7 +760,8 @@ public:
}
iterator ALWAYS_INLINE find(Key x, size_t hash_value)
template <typename ObjectToCompareWith>
iterator ALWAYS_INLINE find(ObjectToCompareWith x, size_t hash_value)
{
if (Cell::isZero(x, *this))
return this->hasZero() ? iteratorToZero() : end();
@ -766,7 +771,8 @@ public:
}
const_iterator ALWAYS_INLINE find(Key x, size_t hash_value) const
template <typename ObjectToCompareWith>
const_iterator ALWAYS_INLINE find(ObjectToCompareWith x, size_t hash_value) const
{
if (Cell::isZero(x, *this))
return this->hasZero() ? iteratorToZero() : end();

View File

@ -368,6 +368,18 @@ Block Block::cloneWithColumns(const Columns & columns) const
}
Block Block::cloneWithoutColumns() const
{
Block res;
size_t num_columns = data.size();
for (size_t i = 0; i < num_columns; ++i)
res.insert({ nullptr, data[i].type, data[i].name });
return res;
}
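/// The clone keeps the names and types but leaves every column pointer null,
/// which makes it convenient for building header blocks.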
Block Block::sortColumns() const
{
Block sorted_block;

View File

@ -109,6 +109,7 @@ public:
Columns getColumns() const;
void setColumns(const Columns & columns);
Block cloneWithColumns(const Columns & columns) const;
Block cloneWithoutColumns() const;
/** Get empty columns with the same types as in block. */
MutableColumns cloneEmptyColumns() const;

View File

@ -0,0 +1,57 @@
#pragma once
#include <DataStreams/IProfilingBlockInputStream.h>
#include <Columns/ColumnWithDictionary.h>
#include <DataTypes/DataTypeWithDictionary.h>
namespace DB
{
/** Converts columns of type ColumnWithDictionary in each block into full columns
* of the dictionary's nested type (and adjusts the block's types accordingly).
*/
class ConvertColumnWithDictionaryToFullBlockInputStream : public IProfilingBlockInputStream
{
public:
explicit ConvertColumnWithDictionaryToFullBlockInputStream(const BlockInputStreamPtr & input)
{
children.push_back(input);
}
String getName() const override { return "ConvertColumnWithDictionaryToFull"; }
Block getHeader() const override { return convert(children.at(0)->getHeader()); }
protected:
Block readImpl() override { return convert(children.back()->read()); }
private:
Block convert(Block && block) const
{
for (auto & column : block)
{
auto * type_with_dict = typeid_cast<const DataTypeWithDictionary *>(column.type.get());
auto * col_with_dict = typeid_cast<const ColumnWithDictionary *>(column.column.get());
if (type_with_dict && !col_with_dict)
throw Exception("Invalid column for " + type_with_dict->getName() + ": " + column.column->getName(),
ErrorCodes::LOGICAL_ERROR);
if (!type_with_dict && col_with_dict)
throw Exception("Invalid type for " + col_with_dict->getName() + ": " + column.type->getName(),
ErrorCodes::LOGICAL_ERROR);
if (type_with_dict && col_with_dict)
{
column.column = col_with_dict->convertToFullColumn();
column.type = type_with_dict->getDictionaryType();
}
}
return std::move(block);
}
};
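/// Usage sketch (hypothetical wiring; the class is the one defined above):
///
///     BlockInputStreamPtr full_stream
///         = std::make_shared<ConvertColumnWithDictionaryToFullBlockInputStream>(source_stream);
///
/// Every block read from full_stream then carries plain columns of the dictionary's nested type.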
}

View File

@ -59,8 +59,14 @@ NativeBlockInputStream::NativeBlockInputStream(ReadBuffer & istr_, UInt64 server
void NativeBlockInputStream::readData(const IDataType & type, IColumn & column, ReadBuffer & istr, size_t rows, double avg_value_size_hint)
{
IDataType::InputStreamGetter input_stream_getter = [&] (const IDataType::SubstreamPath &) { return &istr; };
type.deserializeBinaryBulkWithMultipleStreams(column, input_stream_getter, rows, avg_value_size_hint, false, {});
IDataType::DeserializeBinaryBulkSettings settings;
settings.getter = [&](IDataType::SubstreamPath) -> ReadBuffer * { return &istr; };
settings.avg_value_size_hint = avg_value_size_hint;
settings.position_independent_encoding = false;
IDataType::DeserializeBinaryBulkStatePtr state;
type.deserializeBinaryBulkStatePrefix(settings, state);
type.deserializeBinaryBulkWithMultipleStreams(column, rows, settings, state);
if (column.size() != rows)
throw Exception("Cannot read all data in NativeBlockInputStream. Rows read: " + toString(column.size()) + ". Rows expected: " + toString(rows) + ".",

View File

@ -52,8 +52,15 @@ void NativeBlockOutputStream::writeData(const IDataType & type, const ColumnPtr
else
full_column = column;
IDataType::OutputStreamGetter output_stream_getter = [&] (const IDataType::SubstreamPath &) { return &ostr; };
type.serializeBinaryBulkWithMultipleStreams(*full_column, output_stream_getter, offset, limit, false, {});
IDataType::SerializeBinaryBulkSettings settings;
settings.getter = [&ostr](IDataType::SubstreamPath) -> WriteBuffer * { return &ostr; };
settings.position_independent_encoding = false;
settings.low_cardinality_max_dictionary_size = 0;
IDataType::SerializeBinaryBulkStatePtr state;
type.serializeBinaryBulkStatePrefix(settings, state);
type.serializeBinaryBulkWithMultipleStreams(*full_column, offset, limit, settings, state);
type.serializeBinaryBulkStateSuffix(settings, state);
}

View File

@ -145,37 +145,67 @@ namespace
}
void DataTypeArray::enumerateStreams(StreamCallback callback, SubstreamPath path) const
void DataTypeArray::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::ArraySizes);
callback(path);
path.back() = Substream::ArrayElements;
nested->enumerateStreams(callback, path);
path.pop_back();
}
void DataTypeArray::serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::ArrayElements);
nested->serializeBinaryBulkStatePrefix(settings, state);
settings.path.pop_back();
}
void DataTypeArray::serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::ArrayElements);
nested->serializeBinaryBulkStateSuffix(settings, state);
settings.path.pop_back();
}
void DataTypeArray::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::ArrayElements);
nested->deserializeBinaryBulkStatePrefix(settings, state);
settings.path.pop_back();
}
void DataTypeArray::serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool position_independent_encoding,
SubstreamPath path) const
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
const ColumnArray & column_array = typeid_cast<const ColumnArray &>(column);
/// First serialize array sizes.
path.push_back(Substream::ArraySizes);
if (auto stream = getter(path))
settings.path.push_back(Substream::ArraySizes);
if (auto stream = settings.getter(settings.path))
{
if (position_independent_encoding)
if (settings.position_independent_encoding)
serializeArraySizesPositionIndependent(column, *stream, offset, limit);
else
DataTypeNumber<ColumnArray::Offset>().serializeBinaryBulk(*column_array.getOffsetsPtr(), *stream, offset, limit);
}
/// Then serialize contents of arrays.
path.back() = Substream::ArrayElements;
settings.path.back() = Substream::ArrayElements;
const ColumnArray::Offsets & offset_values = column_array.getOffsets();
if (offset > offset_values.size())
@ -197,30 +227,29 @@ void DataTypeArray::serializeBinaryBulkWithMultipleStreams(
: 0;
if (limit == 0 || nested_limit)
nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), getter, nested_offset, nested_limit, position_independent_encoding, path);
nested->serializeBinaryBulkWithMultipleStreams(column_array.getData(), nested_offset, nested_limit, settings, state);
settings.path.pop_back();
}
void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double /*avg_value_size_hint*/,
bool position_independent_encoding,
SubstreamPath path) const
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
ColumnArray & column_array = typeid_cast<ColumnArray &>(column);
path.push_back(Substream::ArraySizes);
if (auto stream = getter(path))
settings.path.push_back(Substream::ArraySizes);
if (auto stream = settings.getter(settings.path))
{
if (position_independent_encoding)
if (settings.position_independent_encoding)
deserializeArraySizesPositionIndependent(column, *stream, limit);
else
DataTypeNumber<ColumnArray::Offset>().deserializeBinaryBulk(column_array.getOffsetsColumn(), *stream, limit, 0);
}
path.back() = Substream::ArrayElements;
settings.path.back() = Substream::ArrayElements;
ColumnArray::Offsets & offset_values = column_array.getOffsets();
IColumn & nested_column = column_array.getData();
@ -230,7 +259,8 @@ void DataTypeArray::deserializeBinaryBulkWithMultipleStreams(
if (last_offset < nested_column.size())
throw Exception("Nested column is longer than last offset", ErrorCodes::LOGICAL_ERROR);
size_t nested_limit = last_offset - nested_column.size();
nested->deserializeBinaryBulkWithMultipleStreams(nested_column, getter, nested_limit, 0, position_independent_encoding, path);
nested->deserializeBinaryBulkWithMultipleStreams(nested_column, nested_limit, settings, state);
settings.path.pop_back();
/// Check consistency between offsets and elements subcolumns.
/// But if elements column is empty - it's ok for columns of Nested types that was added by ALTER.

View File

@ -61,23 +61,32 @@ public:
* This is necessary, because when implementing nested structures, several arrays can have common sizes.
*/
void enumerateStreams(StreamCallback callback, SubstreamPath path) const override;
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool position_independent_encoding,
SubstreamPath path) const override;
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double avg_value_size_hint,
bool position_independent_encoding,
SubstreamPath path) const override;
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
MutableColumnPtr createColumn() const override;

View File

@ -6,6 +6,7 @@
#include <Parsers/ASTLiteral.h>
#include <Common/typeid_cast.h>
#include <Poco/String.h>
#include <Common/StringUtils/StringUtils.h>
namespace DB
@ -55,6 +56,23 @@ DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr
{
String family_name = getAliasToOrName(family_name_param);
if (endsWith(family_name, "WithDictionary"))
{
ASTPtr low_cardinality_params = std::make_shared<ASTExpressionList>();
String param_name = family_name.substr(0, family_name.size() - strlen("WithDictionary"));
if (parameters)
{
auto func = std::make_shared<ASTFunction>();
func->name = param_name;
func->arguments = parameters;
low_cardinality_params->children.push_back(func);
}
else
low_cardinality_params->children.push_back(std::make_shared<ASTIdentifier>(param_name));
return get("LowCardinality", low_cardinality_params);
}
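/// The branch above rewrites the *WithDictionary spellings, e.g. "StringWithDictionary" becomes
/// LowCardinality(String), and "FixedStringWithDictionary" with parameter 16 becomes
/// LowCardinality(FixedString(16)).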
{
DataTypesDictionary::const_iterator it = data_types.find(family_name);
if (data_types.end() != it)
@ -124,6 +142,7 @@ void registerDataTypeUUID(DataTypeFactory & factory);
void registerDataTypeAggregateFunction(DataTypeFactory & factory);
void registerDataTypeNested(DataTypeFactory & factory);
void registerDataTypeInterval(DataTypeFactory & factory);
void registerDataTypeWithDictionary(DataTypeFactory & factory);
DataTypeFactory::DataTypeFactory()
@ -142,6 +161,7 @@ DataTypeFactory::DataTypeFactory()
registerDataTypeAggregateFunction(*this);
registerDataTypeNested(*this);
registerDataTypeInterval(*this);
registerDataTypeWithDictionary(*this);
}
}

View File

@ -37,53 +37,83 @@ bool DataTypeNullable::onlyNull() const
}
void DataTypeNullable::enumerateStreams(StreamCallback callback, SubstreamPath path) const
void DataTypeNullable::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::NullMap);
callback(path);
path.back() = Substream::NullableElements;
nested_data_type->enumerateStreams(callback, path);
path.pop_back();
}
void DataTypeNullable::serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::NullableElements);
nested_data_type->serializeBinaryBulkStatePrefix(settings, state);
settings.path.pop_back();
}
void DataTypeNullable::serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::NullableElements);
nested_data_type->serializeBinaryBulkStateSuffix(settings, state);
settings.path.pop_back();
}
void DataTypeNullable::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::NullableElements);
nested_data_type->deserializeBinaryBulkStatePrefix(settings, state);
settings.path.pop_back();
}
void DataTypeNullable::serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool position_independent_encoding,
SubstreamPath path) const
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
const ColumnNullable & col = static_cast<const ColumnNullable &>(column);
col.checkConsistency();
/// First serialize null map.
path.push_back(Substream::NullMap);
if (auto stream = getter(path))
settings.path.push_back(Substream::NullMap);
if (auto stream = settings.getter(settings.path))
DataTypeUInt8().serializeBinaryBulk(col.getNullMapColumn(), *stream, offset, limit);
/// Then serialize contents of arrays.
path.back() = Substream::NullableElements;
nested_data_type->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), getter, offset, limit, position_independent_encoding, path);
settings.path.back() = Substream::NullableElements;
nested_data_type->serializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), offset, limit, settings, state);
settings.path.pop_back();
}
void DataTypeNullable::deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double avg_value_size_hint,
bool position_independent_encoding,
SubstreamPath path) const
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
ColumnNullable & col = static_cast<ColumnNullable &>(column);
path.push_back(Substream::NullMap);
if (auto stream = getter(path))
settings.path.push_back(Substream::NullMap);
if (auto stream = settings.getter(settings.path))
DataTypeUInt8().deserializeBinaryBulk(col.getNullMapColumn(), *stream, limit, 0);
path.back() = Substream::NullableElements;
nested_data_type->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), getter, limit, avg_value_size_hint, position_independent_encoding, path);
settings.path.back() = Substream::NullableElements;
nested_data_type->deserializeBinaryBulkWithMultipleStreams(col.getNestedColumn(), limit, settings, state);
settings.path.pop_back();
}

View File

@ -13,27 +13,36 @@ class DataTypeNullable final : public IDataType
public:
static constexpr bool is_parametric = true;
DataTypeNullable(const DataTypePtr & nested_data_type_);
explicit DataTypeNullable(const DataTypePtr & nested_data_type_);
std::string getName() const override { return "Nullable(" + nested_data_type->getName() + ")"; }
const char * getFamilyName() const override { return "Nullable"; }
void enumerateStreams(StreamCallback callback, SubstreamPath path) const override;
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool position_independent_encoding,
SubstreamPath path) const override;
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double avg_value_size_hint,
bool position_independent_encoding,
SubstreamPath path) const override;
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinary(const Field & field, WriteBuffer & ostr) const override { nested_data_type->serializeBinary(field, ostr); }
void deserializeBinary(Field & field, ReadBuffer & istr) const override { nested_data_type->deserializeBinary(field, istr); }

View File

@ -283,7 +283,7 @@ void DataTypeTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, cons
});
}
void DataTypeTuple::enumerateStreams(StreamCallback callback, SubstreamPath path) const
void DataTypeTuple::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::TupleElement);
for (const auto i : ext::range(0, ext::size(elems)))
@ -291,40 +291,139 @@ void DataTypeTuple::enumerateStreams(StreamCallback callback, SubstreamPath path
path.back().tuple_element_name = names[i];
elems[i]->enumerateStreams(callback, path);
}
path.pop_back();
}
struct SerializeBinaryBulkStateTuple : public IDataType::SerializeBinaryBulkState
{
std::vector<IDataType::SerializeBinaryBulkStatePtr> states;
};
struct DeserializeBinaryBulkStateTuple : public IDataType::DeserializeBinaryBulkState
{
std::vector<IDataType::DeserializeBinaryBulkStatePtr> states;
};
static SerializeBinaryBulkStateTuple * checkAndGetTupleSerializeState(IDataType::SerializeBinaryBulkStatePtr & state)
{
if (!state)
throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
auto * tuple_state = typeid_cast<SerializeBinaryBulkStateTuple *>(state.get());
if (!tuple_state)
{
auto & state_ref = *state;
throw Exception("Invalid SerializeBinaryBulkState for DataTypeTuple. Expected: "
+ demangle(typeid(SerializeBinaryBulkStateTuple).name()) + ", got "
+ demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
}
return tuple_state;
}
static DeserializeBinaryBulkStateTuple * checkAndGetTupleDeserializeState(IDataType::DeserializeBinaryBulkStatePtr & state)
{
if (!state)
throw Exception("Got empty state for DataTypeTuple.", ErrorCodes::LOGICAL_ERROR);
auto * tuple_state = typeid_cast<DeserializeBinaryBulkStateTuple *>(state.get());
if (!tuple_state)
{
auto & state_ref = *state;
throw Exception("Invalid DeserializeBinaryBulkState for DataTypeTuple. Expected: "
+ demangle(typeid(DeserializeBinaryBulkStateTuple).name()) + ", got "
+ demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
}
return tuple_state;
}
void DataTypeTuple::serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
auto tuple_state = std::make_shared<SerializeBinaryBulkStateTuple>();
tuple_state->states.resize(elems.size());
settings.path.push_back(Substream::TupleElement);
for (size_t i = 0; i < elems.size(); ++i)
{
settings.path.back().tuple_element_name = names[i];
elems[i]->serializeBinaryBulkStatePrefix(settings, tuple_state->states[i]);
}
settings.path.pop_back();
state = std::move(tuple_state);
}
void DataTypeTuple::serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
auto * tuple_state = checkAndGetTupleSerializeState(state);
settings.path.push_back(Substream::TupleElement);
for (size_t i = 0; i < elems.size(); ++i)
{
settings.path.back().tuple_element_name = names[i];
elems[i]->serializeBinaryBulkStateSuffix(settings, tuple_state->states[i]);
}
settings.path.pop_back();
}
void DataTypeTuple::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
auto tuple_state = std::make_shared<DeserializeBinaryBulkStateTuple>();
tuple_state->states.resize(elems.size());
settings.path.push_back(Substream::TupleElement);
for (size_t i = 0; i < elems.size(); ++i)
{
settings.path.back().tuple_element_name = names[i];
elems[i]->deserializeBinaryBulkStatePrefix(settings, tuple_state->states[i]);
}
settings.path.pop_back();
state = std::move(tuple_state);
}
void DataTypeTuple::serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool position_independent_encoding,
SubstreamPath path) const
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
path.push_back(Substream::TupleElement);
auto * tuple_state = checkAndGetTupleSerializeState(state);
settings.path.push_back(Substream::TupleElement);
for (const auto i : ext::range(0, ext::size(elems)))
{
path.back().tuple_element_name = names[i];
elems[i]->serializeBinaryBulkWithMultipleStreams(
extractElementColumn(column, i), getter, offset, limit, position_independent_encoding, path);
settings.path.back().tuple_element_name = names[i];
auto & element_col = extractElementColumn(column, i);
elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]);
}
settings.path.pop_back();
}
void DataTypeTuple::deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double avg_value_size_hint,
bool position_independent_encoding,
SubstreamPath path) const
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
path.push_back(Substream::TupleElement);
auto * tuple_state = checkAndGetTupleDeserializeState(state);
settings.path.push_back(Substream::TupleElement);
for (const auto i : ext::range(0, ext::size(elems)))
{
path.back().tuple_element_name = names[i];
elems[i]->deserializeBinaryBulkWithMultipleStreams(
extractElementColumn(column, i), getter, limit, avg_value_size_hint, position_independent_encoding, path);
settings.path.back().tuple_element_name = names[i];
auto & element_col = extractElementColumn(column, i);
elems[i]->deserializeBinaryBulkWithMultipleStreams(element_col, limit, settings, tuple_state->states[i]);
}
settings.path.pop_back();
}
MutableColumnPtr DataTypeTuple::createColumn() const

View File

@ -53,23 +53,32 @@ public:
/** Each sub-column in a tuple is serialized in separate stream.
*/
void enumerateStreams(StreamCallback callback, SubstreamPath path) const override;
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool position_independent_encoding,
SubstreamPath path) const override;
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double avg_value_size_hint,
bool position_independent_encoding,
SubstreamPath path) const override;
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
MutableColumnPtr createColumn() const override;

View File

@ -0,0 +1,840 @@
#include <Columns/ColumnWithDictionary.h>
#include <Columns/ColumnUnique.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnsCommon.h>
#include <Common/HashTable/HashMap.h>
#include <Common/typeid_cast.h>
#include <Core/TypeListNumber.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/DataTypeWithDictionary.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Parsers/IAST.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
const ColumnWithDictionary & getColumnWithDictionary(const IColumn & column)
{
return typeid_cast<const ColumnWithDictionary &>(column);
}
ColumnWithDictionary & getColumnWithDictionary(IColumn & column)
{
return typeid_cast<ColumnWithDictionary &>(column);
}
}
DataTypeWithDictionary::DataTypeWithDictionary(DataTypePtr dictionary_type_)
: dictionary_type(std::move(dictionary_type_))
{
auto inner_type = dictionary_type;
if (dictionary_type->isNullable())
inner_type = static_cast<const DataTypeNullable &>(*dictionary_type).getNestedType();
if (!inner_type->isStringOrFixedString()
&& !inner_type->isDateOrDateTime()
&& !inner_type->isNumber())
throw Exception("DataTypeWithDictionary is supported only for numbers, strings, Date or DateTime, but got "
+ dictionary_type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
void DataTypeWithDictionary::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
path.push_back(Substream::DictionaryKeys);
dictionary_type->enumerateStreams(callback, path);
path.back() = Substream::DictionaryIndexes;
callback(path);
path.pop_back();
}
struct KeysSerializationVersion
{
enum Value
{
/// The version is written at the start of <name.dict.bin>.
/// The dictionary is written as a number N followed by N keys.
/// A dictionary can be shared for a continuous range of granules, so some marks may point to the same position.
/// The shared dictionary is stored in the state and is read once.
SharedDictionariesWithAdditionalKeys = 1,
};
Value value;
static void checkVersion(UInt64 version)
{
if (version != SharedDictionariesWithAdditionalKeys)
throw Exception("Invalid version for DataTypeWithDictionary key column.", ErrorCodes::LOGICAL_ERROR);
}
KeysSerializationVersion(UInt64 version) : value(static_cast<Value>(version)) { checkVersion(version); }
};
/// Version is stored at the start of each granule. It's used to store indexes type and flags.
struct IndexesSerializationType
{
using SerializationType = UInt64;
/// Need to read the dictionary if it wasn't read yet.
static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u;
/// Need to read additional keys. Additional keys are stored before the indexes as a number N followed by N keys.
static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u;
/// Need to update the dictionary. It means that the previous granule had a different dictionary.
static constexpr SerializationType NeedUpdateDictionary = 1u << 10u;
enum Type
{
TUInt8 = 0,
TUInt16,
TUInt32,
TUInt64,
};
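/// Worked example: an UInt16 index column that also stores additional keys is serialized as
/// val = TUInt16 | HasAdditionalKeysBit = 1 | (1u << 9u) = 513 (0x201).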
Type type;
bool has_additional_keys;
bool need_global_dictionary;
bool need_update_dictionary;
static constexpr SerializationType resetFlags(SerializationType type)
{
return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary));
}
static void checkType(SerializationType type)
{
UInt64 value = resetFlags(type);
if (value <= TUInt64)
return;
throw Exception("Invalid type for DataTypeWithDictionary index column.", ErrorCodes::LOGICAL_ERROR);
}
void serialize(WriteBuffer & buffer) const
{
SerializationType val = type;
if (has_additional_keys)
val |= HasAdditionalKeysBit;
if (need_global_dictionary)
val |= NeedGlobalDictionaryBit;
if (need_update_dictionary)
val |= NeedUpdateDictionary;
writeIntBinary(val, buffer);
}
void deserialize(ReadBuffer & buffer)
{
SerializationType val;
readIntBinary(val, buffer);
checkType(val);
has_additional_keys = (val & HasAdditionalKeysBit) != 0;
need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0;
need_update_dictionary = (val & NeedUpdateDictionary) != 0;
type = static_cast<Type>(resetFlags(val));
}
IndexesSerializationType(const IColumn & column,
bool has_additional_keys,
bool need_global_dictionary,
bool enumerate_dictionaries)
: has_additional_keys(has_additional_keys)
, need_global_dictionary(need_global_dictionary)
, need_update_dictionary(enumerate_dictionaries)
{
if (typeid_cast<const ColumnUInt8 *>(&column))
type = TUInt8;
else if (typeid_cast<const ColumnUInt16 *>(&column))
type = TUInt16;
else if (typeid_cast<const ColumnUInt32 *>(&column))
type = TUInt32;
else if (typeid_cast<const ColumnUInt64 *>(&column))
type = TUInt64;
else
throw Exception("Invalid Indexes column for IndexesSerializationType. Expected ColumnUInt*, got "
+ column.getName(), ErrorCodes::LOGICAL_ERROR);
}
DataTypePtr getDataType() const
{
if (type == TUInt8)
return std::make_shared<DataTypeUInt8>();
if (type == TUInt16)
return std::make_shared<DataTypeUInt16>();
if (type == TUInt32)
return std::make_shared<DataTypeUInt32>();
if (type == TUInt64)
return std::make_shared<DataTypeUInt64>();
throw Exception("Can't create DataType from IndexesSerializationType.", ErrorCodes::LOGICAL_ERROR);
}
IndexesSerializationType() = default;
};
struct SerializeStateWithDictionary : public IDataType::SerializeBinaryBulkState
{
KeysSerializationVersion key_version;
MutableColumnUniquePtr shared_dictionary;
explicit SerializeStateWithDictionary(UInt64 key_version) : key_version(key_version) {}
};
struct DeserializeStateWithDictionary : public IDataType::DeserializeBinaryBulkState
{
KeysSerializationVersion key_version;
ColumnUniquePtr global_dictionary;
IndexesSerializationType index_type;
ColumnPtr additional_keys;
UInt64 num_pending_rows = 0;
explicit DeserializeStateWithDictionary(UInt64 key_version) : key_version(key_version) {}
};
static SerializeStateWithDictionary * checkAndGetWithDictionarySerializeState(
IDataType::SerializeBinaryBulkStatePtr & state)
{
if (!state)
throw Exception("Got empty state for DataTypeWithDictionary.", ErrorCodes::LOGICAL_ERROR);
auto * with_dictionary_state = typeid_cast<SerializeStateWithDictionary *>(state.get());
if (!with_dictionary_state)
{
auto & state_ref = *state;
throw Exception("Invalid SerializeBinaryBulkState for DataTypeWithDictionary. Expected: "
+ demangle(typeid(SerializeStateWithDictionary).name()) + ", got "
+ demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
}
return with_dictionary_state;
}
static DeserializeStateWithDictionary * checkAndGetWithDictionaryDeserializeState(
IDataType::DeserializeBinaryBulkStatePtr & state)
{
if (!state)
throw Exception("Got empty state for DataTypeWithDictionary.", ErrorCodes::LOGICAL_ERROR);
auto * with_dictionary_state = typeid_cast<DeserializeStateWithDictionary *>(state.get());
if (!with_dictionary_state)
{
auto & state_ref = *state;
throw Exception("Invalid DeserializeBinaryBulkState for DataTypeWithDictionary. Expected: "
+ demangle(typeid(DeserializeStateWithDictionary).name()) + ", got "
+ demangle(typeid(state_ref).name()), ErrorCodes::LOGICAL_ERROR);
}
return with_dictionary_state;
}
void DataTypeWithDictionary::serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::DictionaryKeys);
auto * stream = settings.getter(settings.path);
settings.path.pop_back();
if (!stream)
throw Exception("Got empty stream in DataTypeWithDictionary::serializeBinaryBulkStatePrefix",
ErrorCodes::LOGICAL_ERROR);
/// Write version and create SerializeBinaryBulkState.
UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys;
writeIntBinary(key_version, *stream);
state = std::make_shared<SerializeStateWithDictionary>(key_version);
}
void DataTypeWithDictionary::serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
auto * state_with_dictionary = checkAndGetWithDictionarySerializeState(state);
KeysSerializationVersion::checkVersion(state_with_dictionary->key_version.value);
if (state_with_dictionary->shared_dictionary && settings.low_cardinality_max_dictionary_size)
{
auto nested_column = state_with_dictionary->shared_dictionary->getNestedNotNullableColumn();
settings.path.push_back(Substream::DictionaryKeys);
auto * stream = settings.getter(settings.path);
settings.path.pop_back();
if (!stream)
throw Exception("Got empty stream in DataTypeWithDictionary::serializeBinaryBulkStateSuffix",
ErrorCodes::LOGICAL_ERROR);
UInt64 num_keys = nested_column->size();
writeIntBinary(num_keys, *stream);
removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *stream, 0, num_keys);
state_with_dictionary->shared_dictionary = nullptr;
}
}
void DataTypeWithDictionary::deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::DictionaryKeys);
auto * stream = settings.getter(settings.path);
settings.path.pop_back();
if (!stream)
throw Exception("Got empty stream in DataTypeWithDictionary::deserializeBinaryBulkStatePrefix",
ErrorCodes::LOGICAL_ERROR);
UInt64 keys_version;
readIntBinary(keys_version, *stream);
state = std::make_shared<DeserializeStateWithDictionary>(keys_version);
}
namespace
{
template <typename T>
PaddedPODArray<T> * getIndexesData(IColumn & indexes)
{
auto * column = typeid_cast<ColumnVector<T> *>(&indexes);
if (column)
return &column->getData();
return nullptr;
}
struct IndexMapsWithAdditionalKeys
{
MutableColumnPtr dictionary_map;
MutableColumnPtr additional_keys_map;
};
template <typename T>
IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeysRef(PaddedPODArray<T> & index, size_t dict_size)
{
PaddedPODArray<T> copy(index.cbegin(), index.cend());
HashMap<T, T> dict_map;
HashMap<T, T> add_keys_map;
for (auto val : index)
{
if (val < dict_size)
dict_map.insert({val, dict_map.size()});
else
add_keys_map.insert({val, add_keys_map.size()});
}
auto dictionary_map = ColumnVector<T>::create(dict_map.size());
auto additional_keys_map = ColumnVector<T>::create(add_keys_map.size());
auto & dict_data = dictionary_map->getData();
auto & add_keys_data = additional_keys_map->getData();
for (auto val : dict_map)
dict_data[val.second] = val.first;
for (auto val : add_keys_map)
add_keys_data[val.second] = val.first - dict_size;
for (auto & val : index)
val = val < dict_size ? dict_map[val]
: add_keys_map[val] + dict_map.size();
for (size_t i = 0; i < index.size(); ++i)
{
T expected = index[i] < dict_data.size() ? dict_data[index[i]]
: add_keys_data[index[i] - dict_data.size()] + dict_size;
if (expected != copy[i])
throw Exception("Expected " + toString(expected) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR);
}
return {std::move(dictionary_map), std::move(additional_keys_map)};
}
template <typename T>
IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray<T> & index, size_t dict_size)
{
T max_less_dict_size = 0;
T max_value = 0;
auto size = index.size();
if (size == 0)
return {ColumnVector<T>::create(), ColumnVector<T>::create()};
for (size_t i = 0; i < size; ++i)
{
auto val = index[i];
if (val < dict_size)
max_less_dict_size = std::max(max_less_dict_size, val);
max_value = std::max(max_value, val);
}
auto map_size = UInt64(max_less_dict_size) + 1;
auto overflow_map_size = max_value >= dict_size ? (UInt64(max_value - dict_size) + 1) : 0;
PaddedPODArray<T> map(map_size, 0);
PaddedPODArray<T> overflow_map(overflow_map_size, 0);
T zero_pos_value = 0;
T zero_pos_overflowed_value = 0;
UInt64 cur_pos = 0;
UInt64 cur_overflowed_pos = 0;
for (size_t i = 0; i < size; ++i)
{
T val = index[i];
if (val < dict_size)
{
if (cur_pos == 0)
{
zero_pos_value = val;
++cur_pos;
}
else if (map[val] == 0 && val != zero_pos_value)
{
map[val] = cur_pos;
++cur_pos;
}
}
else
{
T shifted_val = val - dict_size;
if (cur_overflowed_pos == 0)
{
zero_pos_overflowed_value = shifted_val;
++cur_overflowed_pos;
}
else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value)
{
overflow_map[shifted_val] = cur_overflowed_pos;
++cur_overflowed_pos;
}
}
}
auto dictionary_map = ColumnVector<T>::create(cur_pos);
auto additional_keys_map = ColumnVector<T>::create(cur_overflowed_pos);
auto & dict_data = dictionary_map->getData();
auto & add_keys_data = additional_keys_map->getData();
for (size_t i = 0; i < map_size; ++i)
if (map[i])
dict_data[map[i]] = static_cast<T>(i);
for (size_t i = 0; i < overflow_map_size; ++i)
if (overflow_map[i])
add_keys_data[overflow_map[i]] = static_cast<T>(i);
if (cur_pos)
dict_data[0] = zero_pos_value;
if (cur_overflowed_pos)
add_keys_data[0] = zero_pos_overflowed_value;
for (size_t i = 0; i < size; ++i)
{
T & val = index[i];
if (val < dict_size)
val = map[val];
else
val = overflow_map[val - dict_size] + cur_pos;
}
return {std::move(dictionary_map), std::move(additional_keys_map)};
}
/// Updates the column in place and returns maps to the old indexes.
/// Let N be the number of distinct values which are less than dict_size;
/// old_column - column before the call;
/// new_column - column after the call:
/// * if old_column[i] < dict_size, then
///     dictionary_map[new_column[i]] = old_column[i]
/// * else
///     additional_keys_map[new_column[i] - N] = old_column[i] - dict_size
IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size)
{
if (auto * data_uint8 = getIndexesData<UInt8>(column))
return mapIndexWithAdditionalKeys(*data_uint8, dict_size);
else if (auto * data_uint16 = getIndexesData<UInt16>(column))
return mapIndexWithAdditionalKeys(*data_uint16, dict_size);
else if (auto * data_uint32 = getIndexesData<UInt32>(column))
return mapIndexWithAdditionalKeys(*data_uint32, dict_size);
else if (auto * data_uint64 = getIndexesData<UInt64>(column))
return mapIndexWithAdditionalKeys(*data_uint64, dict_size);
else
throw Exception("Indexes column for mapIndexWithAdditionalKeys must be UInt, got" + column.getName(),
ErrorCodes::LOGICAL_ERROR);
}
}
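A worked example may help (derived by hand from mapIndexWithAdditionalKeys above, not taken from the commit): with dict_size = 3 and index = {0, 5, 0, 4, 2}, the distinct in-dictionary values are 0 and 2, and the overflow values 5 and 4 shift to additional keys 2 and 1. The call yields dictionary_map = {0, 2}, additional_keys_map = {2, 1}, and rewrites the index in place to {0, 2, 0, 3, 1}: positions below dictionary_map->size() select from dictionary_map, and the remaining positions, offset by that size, select from additional_keys_map.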
void DataTypeWithDictionary::serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const
{
settings.path.push_back(Substream::DictionaryKeys);
auto * keys_stream = settings.getter(settings.path);
settings.path.back() = Substream::DictionaryIndexes;
auto * indexes_stream = settings.getter(settings.path);
settings.path.pop_back();
if (!keys_stream && !indexes_stream)
return;
if (!keys_stream)
throw Exception("Got empty stream for DataTypeWithDictionary keys.", ErrorCodes::LOGICAL_ERROR);
if (!indexes_stream)
throw Exception("Got empty stream for DataTypeWithDictionary indexes.", ErrorCodes::LOGICAL_ERROR);
const ColumnWithDictionary & column_with_dictionary = typeid_cast<const ColumnWithDictionary &>(column);
auto * state_with_dictionary = checkAndGetWithDictionarySerializeState(state);
auto & global_dictionary = state_with_dictionary->shared_dictionary;
KeysSerializationVersion::checkVersion(state_with_dictionary->key_version.value);
bool need_update_dictionary = global_dictionary == nullptr;
if (need_update_dictionary)
global_dictionary = createColumnUnique(*dictionary_type);
size_t max_limit = column.size() - offset;
limit = limit ? std::min(limit, max_limit) : max_limit;
auto sub_column = column_with_dictionary.cutAndCompact(offset, limit);
ColumnPtr positions = sub_column->getIndexesPtr();
ColumnPtr keys = sub_column->getDictionary().getNestedColumn();
if (settings.low_cardinality_max_dictionary_size)
{
/// Insert the used keys into the global dictionary and remap the positions.
auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(),
settings.low_cardinality_max_dictionary_size);
size_t max_size = settings.low_cardinality_max_dictionary_size + indexes_with_overflow.overflowed_keys->size();
ColumnWithDictionary::Index(indexes_with_overflow.indexes->getPtr()).check(max_size);
if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size)
throw Exception("Got dictionary with size " + toString(global_dictionary->size()) +
" but max dictionary size is " + toString(settings.low_cardinality_max_dictionary_size),
ErrorCodes::LOGICAL_ERROR);
positions = indexes_with_overflow.indexes->index(*positions, 0);
keys = std::move(indexes_with_overflow.overflowed_keys);
if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty())
throw Exception("Has additional keys, but dict size is " + toString(global_dictionary->size()) +
" which is less then max dictionary size (" + toString(settings.low_cardinality_max_dictionary_size) + ")",
ErrorCodes::LOGICAL_ERROR);
}
if (auto nullable_keys = typeid_cast<const ColumnNullable *>(keys.get()))
keys = nullable_keys->getNestedColumnPtr();
bool need_additional_keys = !keys->empty();
bool need_dictionary = settings.low_cardinality_max_dictionary_size != 0;
bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part
&& global_dictionary->size() >= settings.low_cardinality_max_dictionary_size;
IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary);
index_version.serialize(*indexes_stream);
if (need_write_dictionary)
{
const auto & nested_column = global_dictionary->getNestedNotNullableColumn();
UInt64 num_keys = nested_column->size();
writeIntBinary(num_keys, *keys_stream);
removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys);
state_with_dictionary->shared_dictionary = nullptr;
}
if (need_additional_keys)
{
UInt64 num_keys = keys->size();
writeIntBinary(num_keys, *indexes_stream);
removeNullable(dictionary_type)->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys);
}
UInt64 num_rows = positions->size();
writeIntBinary(num_rows, *indexes_stream);
index_version.getDataType()->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows);
}
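Summarizing the wire format produced above (my own summary inferred from the code, not normative documentation):

/// DictionaryKeys stream:
///   UInt64 keys_version                     -- once, in serializeBinaryBulkStatePrefix
///   [UInt64 num_keys, keys...]              -- whenever the shared dictionary is flushed
///                                              (need_write_dictionary above, or the suffix)
/// DictionaryIndexes stream, per granule:
///   UInt64 index_version                    -- type byte plus the three flag bits
///   [UInt64 num_keys, additional keys...]   -- only if HasAdditionalKeysBit is set
///   UInt64 num_rows
///   positions[num_rows]                     -- UInt8/16/32/64, per index_version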
void DataTypeWithDictionary::deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const
{
ColumnWithDictionary & column_with_dictionary = typeid_cast<ColumnWithDictionary &>(column);
auto * state_with_dictionary = checkAndGetWithDictionaryDeserializeState(state);
KeysSerializationVersion::checkVersion(state_with_dictionary->key_version.value);
settings.path.push_back(Substream::DictionaryKeys);
auto * keys_stream = settings.getter(settings.path);
settings.path.back() = Substream::DictionaryIndexes;
auto * indexes_stream = settings.getter(settings.path);
settings.path.pop_back();
if (!keys_stream && !indexes_stream)
return;
if (!keys_stream)
throw Exception("Got empty stream for DataTypeWithDictionary keys.", ErrorCodes::LOGICAL_ERROR);
if (!indexes_stream)
throw Exception("Got empty stream for DataTypeWithDictionary indexes.", ErrorCodes::LOGICAL_ERROR);
auto readDictionary = [this, state_with_dictionary, keys_stream]()
{
UInt64 num_keys;
readIntBinary(num_keys, *keys_stream);
auto keys_type = removeNullable(dictionary_type);
auto global_dict_keys = keys_type->createColumn();
keys_type->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0);
auto column_unique = createColumnUnique(*dictionary_type, std::move(global_dict_keys));
state_with_dictionary->global_dictionary = std::move(column_unique);
};
auto readAdditionalKeys = [this, state_with_dictionary, indexes_stream]()
{
UInt64 num_keys;
readIntBinary(num_keys, *indexes_stream);
auto keys_type = removeNullable(dictionary_type);
auto additional_keys = keys_type->createColumn();
keys_type->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0);
state_with_dictionary->additional_keys = std::move(additional_keys);
};
auto readIndexes = [this, state_with_dictionary, indexes_stream, &column_with_dictionary](UInt64 num_rows)
{
auto indexes_type = state_with_dictionary->index_type.getDataType();
MutableColumnPtr indexes_column = indexes_type->createColumn();
indexes_type->deserializeBinaryBulk(*indexes_column, *indexes_stream, num_rows, 0);
auto & global_dictionary = state_with_dictionary->global_dictionary;
const auto & additional_keys = state_with_dictionary->additional_keys;
bool has_additional_keys = state_with_dictionary->index_type.has_additional_keys;
bool column_is_empty = column_with_dictionary.empty();
if (!state_with_dictionary->index_type.need_global_dictionary)
{
column_with_dictionary.insertRangeFromDictionaryEncodedColumn(*additional_keys, *indexes_column);
}
else if (!has_additional_keys)
{
if (column_is_empty)
column_with_dictionary.setSharedDictionary(global_dictionary);
auto local_column = ColumnWithDictionary::create(global_dictionary, std::move(indexes_column));
column_with_dictionary.insertRangeFrom(*local_column, 0, num_rows);
}
else
{
auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size());
ColumnWithDictionary::Index(maps.additional_keys_map->getPtr()).check(additional_keys->size());
ColumnWithDictionary::Index(indexes_column->getPtr()).check(
maps.dictionary_map->size() + maps.additional_keys_map->size());
auto used_keys = (*std::move(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0))).mutate();
if (!maps.additional_keys_map->empty())
{
auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0);
if (dictionary_type->isNullable())
{
ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0);
used_add_keys = ColumnNullable::create(used_add_keys, null_map);
}
used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size());
}
column_with_dictionary.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column);
}
};
while (limit)
{
if (state_with_dictionary->num_pending_rows == 0)
{
if (indexes_stream->eof())
break;
auto & index_type = state_with_dictionary->index_type;
auto & global_dictionary = state_with_dictionary->global_dictionary;
index_type.deserialize(*indexes_stream);
if (index_type.need_global_dictionary && (!global_dictionary || index_type.need_update_dictionary))
readDictionary();
if (state_with_dictionary->index_type.has_additional_keys)
readAdditionalKeys();
else
state_with_dictionary->additional_keys = nullptr;
readIntBinary(state_with_dictionary->num_pending_rows, *indexes_stream);
}
size_t num_rows_to_read = std::min(limit, state_with_dictionary->num_pending_rows);
readIndexes(num_rows_to_read);
limit -= num_rows_to_read;
state_with_dictionary->num_pending_rows -= num_rows_to_read;
}
}
void DataTypeWithDictionary::serializeBinary(const Field & field, WriteBuffer & ostr) const
{
dictionary_type->serializeBinary(field, ostr);
}
void DataTypeWithDictionary::deserializeBinary(Field & field, ReadBuffer & istr) const
{
dictionary_type->deserializeBinary(field, istr);
}
template <typename ... Args>
void DataTypeWithDictionary::serializeImpl(
const IColumn & column, size_t row_num, WriteBuffer & ostr,
DataTypeWithDictionary::SerealizeFunctionPtr<Args ...> func, Args & ... args) const
{
auto & column_with_dictionary = getColumnWithDictionary(column);
size_t unique_row_number = column_with_dictionary.getIndexes().getUInt(row_num);
(dictionary_type.get()->*func)(*column_with_dictionary.getDictionary().getNestedColumn(), unique_row_number, ostr, std::forward<Args>(args)...);
}
template <typename ... Args>
void DataTypeWithDictionary::deserializeImpl(
IColumn & column, ReadBuffer & istr,
DataTypeWithDictionary::DeserealizeFunctionPtr<Args ...> func, Args & ... args) const
{
auto & column_with_dictionary = getColumnWithDictionary(column);
auto temp_column = column_with_dictionary.getDictionary().getNestedColumn()->cloneEmpty();
(dictionary_type.get()->*func)(*temp_column, istr, std::forward<Args>(args)...);
column_with_dictionary.insertFromFullColumn(*temp_column, 0);
}
namespace
{
template <typename Creator>
struct CreateColumnVector
{
MutableColumnUniquePtr & column;
const IDataType & keys_type;
const Creator & creator;
CreateColumnVector(MutableColumnUniquePtr & column, const IDataType & keys_type, const Creator & creator)
: column(column), keys_type(keys_type), creator(creator)
{
}
template <typename T, size_t>
void operator()()
{
if (typeid_cast<const DataTypeNumber<T> *>(&keys_type))
column = creator((ColumnVector<T> *)(nullptr));
}
};
}
template <typename Creator>
MutableColumnUniquePtr DataTypeWithDictionary::createColumnUniqueImpl(const IDataType & keys_type,
const Creator & creator)
{
auto * type = &keys_type;
if (auto * nullable_type = typeid_cast<const DataTypeNullable *>(&keys_type))
type = nullable_type->getNestedType().get();
if (type->isString())
return creator((ColumnString *)(nullptr));
if (type->isFixedString())
return creator((ColumnFixedString *)(nullptr));
if (typeid_cast<const DataTypeDate *>(type))
return creator((ColumnVector<UInt16> *)(nullptr));
if (typeid_cast<const DataTypeDateTime *>(type))
return creator((ColumnVector<UInt32> *)(nullptr));
if (type->isNumber())
{
MutableColumnUniquePtr column;
TypeListNumbers::forEach(CreateColumnVector(column, *type, creator));
if (!column)
throw Exception("Unexpected numeric type: " + type->getName(), ErrorCodes::LOGICAL_ERROR);
return column;
}
throw Exception("Unexpected dictionary type for DataTypeWithDictionary: " + type->getName(),
ErrorCodes::LOGICAL_ERROR);
}
MutableColumnUniquePtr DataTypeWithDictionary::createColumnUnique(const IDataType & keys_type)
{
auto creator = [&](auto x)
{
using ColumnType = typename std::remove_pointer<decltype(x)>::type;
return ColumnUnique<ColumnType>::create(keys_type);
};
return createColumnUniqueImpl(keys_type, creator);
}
MutableColumnUniquePtr DataTypeWithDictionary::createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys)
{
auto creator = [&](auto x)
{
using ColumnType = typename std::remove_pointer<decltype(x)>::type;
return ColumnUnique<ColumnType>::create(std::move(keys), keys_type.isNullable());
};
return createColumnUniqueImpl(keys_type, creator);
}
MutableColumnPtr DataTypeWithDictionary::createColumn() const
{
MutableColumnPtr indexes = DataTypeUInt8().createColumn();
MutableColumnPtr dictionary = createColumnUnique(*dictionary_type);
return ColumnWithDictionary::create(std::move(dictionary), std::move(indexes));
}
bool DataTypeWithDictionary::equals(const IDataType & rhs) const
{
if (typeid(rhs) != typeid(*this))
return false;
auto & rhs_with_dictionary = static_cast<const DataTypeWithDictionary &>(rhs);
return dictionary_type->equals(*rhs_with_dictionary.dictionary_type);
}
static DataTypePtr create(const ASTPtr & arguments)
{
if (!arguments || arguments->children.size() != 1)
throw Exception("LowCardinality data type family must have single argument - type of elements",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
return std::make_shared<DataTypeWithDictionary>(DataTypeFactory::instance().get(arguments->children[0]));
}
void registerDataTypeWithDictionary(DataTypeFactory & factory)
{
factory.registerDataType("LowCardinality", create);
}
}
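For completeness, a usage sketch of the new bulk-serialization API with this type (my own code, modeled on the tests updated later in this commit; the file names and the dictionary-size limit are illustrative):

#include <DataTypes/DataTypeWithDictionary.h>
#include <DataTypes/DataTypeString.h>
#include <IO/WriteBufferFromFile.h>

using namespace DB;

void writeLowCardinalityColumn(const IColumn & column)  /// expects a ColumnWithDictionary
{
    DataTypeWithDictionary data_type(std::make_shared<DataTypeString>());

    WriteBufferFromFile keys_buf("col.dict.bin");
    WriteBufferFromFile indexes_buf("col.bin");

    IDataType::SerializeBinaryBulkSettings settings;
    settings.low_cardinality_max_dictionary_size = 8192;  /// illustrative limit
    settings.getter = [&](const IDataType::SubstreamPath & path) -> WriteBuffer *
    {
        /// Route the DictionaryKeys substream to its own file, everything else to the indexes file.
        if (!path.empty() && path.back().type == IDataType::Substream::DictionaryKeys)
            return &keys_buf;
        return &indexes_buf;
    };

    IDataType::SerializeBinaryBulkStatePtr state;
    data_type.serializeBinaryBulkStatePrefix(settings, state);
    data_type.serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, state);
    data_type.serializeBinaryBulkStateSuffix(settings, state);
}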

View File

@ -0,0 +1,170 @@
#pragma once
#include <DataTypes/IDataType.h>
#include <Columns/IColumnUnique.h>
namespace DB
{
class DataTypeWithDictionary : public IDataType
{
private:
DataTypePtr dictionary_type;
public:
DataTypeWithDictionary(DataTypePtr dictionary_type_);
const DataTypePtr & getDictionaryType() const { return dictionary_type; }
String getName() const override
{
return "LowCardinality(" + dictionary_type->getName() + ")";
}
const char * getFamilyName() const override { return "LowCardinality"; }
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
size_t offset,
size_t limit,
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & state) const override;
void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
size_t limit,
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & state) const override;
void serializeBinary(const Field & field, WriteBuffer & ostr) const override;
void deserializeBinary(Field & field, ReadBuffer & istr) const override;
void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeBinary);
}
void deserializeBinary(IColumn & column, ReadBuffer & istr) const override
{
deserializeImpl(column, istr, &IDataType::deserializeBinary);
}
void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeTextEscaped, settings);
}
void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override
{
deserializeImpl(column, istr, &IDataType::deserializeTextEscaped, settings);
}
void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeTextQuoted, settings);
}
void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override
{
deserializeImpl(column, istr, &IDataType::deserializeTextQuoted, settings);
}
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeTextCSV, settings);
}
void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override
{
deserializeImpl(column, istr, &IDataType::deserializeTextCSV, settings);
}
void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeText, settings);
}
void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeTextJSON, settings);
}
void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override
{
deserializeImpl(column, istr, &IDataType::deserializeTextJSON, settings);
}
void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
{
serializeImpl(column, row_num, ostr, &IDataType::serializeTextXML, settings);
}
MutableColumnPtr createColumn() const override;
Field getDefault() const override { return dictionary_type->getDefault(); }
bool equals(const IDataType & rhs) const override;
bool isParametric() const override { return true; }
bool haveSubtypes() const override { return true; }
bool cannotBeStoredInTables() const override { return dictionary_type->cannotBeStoredInTables(); }
bool shouldAlignRightInPrettyFormats() const override { return dictionary_type->shouldAlignRightInPrettyFormats(); }
bool textCanContainOnlyValidUTF8() const override { return dictionary_type->textCanContainOnlyValidUTF8(); }
bool isComparable() const override { return dictionary_type->isComparable(); }
bool canBeComparedWithCollation() const override { return dictionary_type->canBeComparedWithCollation(); }
bool canBeUsedAsVersion() const override { return dictionary_type->canBeUsedAsVersion(); }
bool isSummable() const override { return dictionary_type->isSummable(); }
bool canBeUsedInBitOperations() const override { return dictionary_type->canBeUsedInBitOperations(); }
bool canBeUsedInBooleanContext() const override { return dictionary_type->canBeUsedInBooleanContext(); }
bool isNumber() const override { return false; }
bool isInteger() const override { return false; }
bool isUnsignedInteger() const override { return false; }
bool isDateOrDateTime() const override { return false; }
bool isValueRepresentedByNumber() const override { return dictionary_type->isValueRepresentedByNumber(); }
bool isValueRepresentedByInteger() const override { return dictionary_type->isValueRepresentedByInteger(); }
bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; }
bool isString() const override { return false; }
bool isFixedString() const override { return false; }
bool haveMaximumSizeOfValue() const override { return dictionary_type->haveMaximumSizeOfValue(); }
size_t getMaximumSizeOfValueInMemory() const override { return dictionary_type->getMaximumSizeOfValueInMemory(); }
size_t getSizeOfValueInMemory() const override { return dictionary_type->getSizeOfValueInMemory(); }
bool isCategorial() const override { return false; }
bool isEnum() const override { return false; }
bool isNullable() const override { return false; }
bool onlyNull() const override { return false; }
bool withDictionary() const override { return true; }
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type);
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys);
private:
template <typename ... Args>
using SerealizeFunctionPtr = void (IDataType::*)(const IColumn &, size_t, WriteBuffer &, Args & ...) const;
template <typename ... Args>
void serializeImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr,
SerealizeFunctionPtr<Args ...> func, Args & ... args) const;
template <typename ... Args>
using DeserealizeFunctionPtr = void (IDataType::*)(IColumn &, ReadBuffer &, Args & ...) const;
template <typename ... Args>
void deserializeImpl(IColumn & column, ReadBuffer & istr,
DeserealizeFunctionPtr<Args ...> func, Args & ... args) const;
template <typename Creator>
static MutableColumnUniquePtr createColumnUniqueImpl(const IDataType & keys_type, const Creator & creator);
};
}

View File

@ -97,6 +97,8 @@ String IDataType::getFileNameForStream(const String & column_name, const IDataTy
/// and name is encoded as a whole.
stream_name += "%2E" + escapeForFileName(elem.tuple_element_name);
}
else if (elem.type == Substream::DictionaryKeys)
stream_name += ".dict";
}
return stream_name;
}
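So a column s declared as LowCardinality(String) gets its dictionary keys in a separate stream named s.dict, and (assuming the usual .bin suffix added by the storage layer) a part ends up with s.dict.bin next to s.bin for the indexes.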

View File

@ -79,6 +79,9 @@ public:
NullMap,
TupleElement,
DictionaryKeys,
DictionaryIndexes,
};
Type type;
@ -91,14 +94,64 @@ public:
using SubstreamPath = std::vector<Substream>;
using StreamCallback = std::function<void(const SubstreamPath &)>;
virtual void enumerateStreams(StreamCallback callback, SubstreamPath path) const
virtual void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
callback(path);
}
void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
struct SerializeBinaryBulkState
{
virtual ~SerializeBinaryBulkState() = default;
};
struct DeserializeBinaryBulkState
{
virtual ~DeserializeBinaryBulkState() = default;
};
using SerializeBinaryBulkStatePtr = std::shared_ptr<SerializeBinaryBulkState>;
using DeserializeBinaryBulkStatePtr = std::shared_ptr<DeserializeBinaryBulkState>;
struct SerializeBinaryBulkSettings
{
OutputStreamGetter getter;
SubstreamPath path;
size_t low_cardinality_max_dictionary_size = 0;
bool low_cardinality_use_single_dictionary_for_part = true;
bool position_independent_encoding = true;
};
struct DeserializeBinaryBulkSettings
{
InputStreamGetter getter;
SubstreamPath path;
bool position_independent_encoding = true;
/// If not zero, may be used to avoid reallocations while reading columns of String type.
double avg_value_size_hint = 0;
};
/// Call before the serializeBinaryBulkWithMultipleStreams chain to write something before the first mark.
virtual void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
/// Call after serializeBinaryBulkWithMultipleStreams chain to finish serialization.
virtual void serializeBinaryBulkStateSuffix(
SerializeBinaryBulkSettings & /*settings*/,
SerializeBinaryBulkStatePtr & /*state*/) const {}
/// Call before the deserializeBinaryBulkWithMultipleStreams chain to get DeserializeBinaryBulkStatePtr.
virtual void deserializeBinaryBulkStatePrefix(
DeserializeBinaryBulkSettings & /*settings*/,
DeserializeBinaryBulkStatePtr & /*state*/) const {}
/** 'offset' and 'limit' are used to specify a range.
* limit = 0 means no limit.
* offset must not be greater than the size of the column.
@ -107,29 +160,24 @@ public:
*/
virtual void serializeBinaryBulkWithMultipleStreams(
const IColumn & column,
OutputStreamGetter getter,
size_t offset,
size_t limit,
bool /*position_independent_encoding*/,
SubstreamPath path) const
SerializeBinaryBulkSettings & settings,
SerializeBinaryBulkStatePtr & /*state*/) const
{
if (WriteBuffer * stream = getter(path))
if (WriteBuffer * stream = settings.getter(settings.path))
serializeBinaryBulk(column, *stream, offset, limit);
}
/** Read no more than limit values and append them into column.
* avg_value_size_hint - if not zero, may be used to avoid reallocations while reading column of String type.
*/
/// Read no more than limit values and append them to the column.
virtual void deserializeBinaryBulkWithMultipleStreams(
IColumn & column,
InputStreamGetter getter,
size_t limit,
double avg_value_size_hint,
bool /*position_independent_encoding*/,
SubstreamPath path) const
DeserializeBinaryBulkSettings & settings,
DeserializeBinaryBulkStatePtr & /*state*/) const
{
if (ReadBuffer * stream = getter(path))
deserializeBinaryBulk(column, *stream, limit, avg_value_size_hint);
if (ReadBuffer * stream = settings.getter(settings.path))
deserializeBinaryBulk(column, *stream, limit, settings.avg_value_size_hint);
}
/** Override these methods for data types that require just single stream (most of data types).
@ -359,6 +407,8 @@ public:
*/
virtual bool canBeInsideNullable() const { return false; }
virtual bool withDictionary() const { return false; }
/// Updates avg_value_size_hint for a newly read column. Used to optimize deserialization. Zero is expected for the first column.
static void updateAvgValueSizeHint(const IColumn & column, double & avg_value_size_hint);

View File

@ -22,6 +22,7 @@ try
size_t size = strlen(s) + 1;
DataTypeString data_type;
{
auto column = ColumnString::create();
ColumnString::Chars_t & data = column->getChars();
@ -37,8 +38,14 @@ try
WriteBufferFromFile out_buf("test");
IDataType::SerializeBinaryBulkSettings settings;
IDataType::SerializeBinaryBulkStatePtr state;
settings.getter = [&](const IDataType::SubstreamPath &){ return &out_buf; };
stopwatch.restart();
data_type.serializeBinaryBulkWithMultipleStreams(*column, [&](const IDataType::SubstreamPath &){ return &out_buf; }, 0, 0, true, {});
data_type.serializeBinaryBulkStatePrefix(settings, state);
data_type.serializeBinaryBulkWithMultipleStreams(*column, 0, 0, settings, state);
data_type.serializeBinaryBulkStateSuffix(settings, state);
stopwatch.stop();
std::cout << "Writing, elapsed: " << stopwatch.elapsedSeconds() << std::endl;
@ -49,8 +56,13 @@ try
ReadBufferFromFile in_buf("test");
IDataType::DeserializeBinaryBulkSettings settings;
IDataType::DeserializeBinaryBulkStatePtr state;
settings.getter = [&](const IDataType::SubstreamPath &){ return &in_buf; };
stopwatch.restart();
data_type.deserializeBinaryBulkWithMultipleStreams(*column, [&](const IDataType::SubstreamPath &){ return &in_buf; }, n, 0, true, {});
data_type.deserializeBinaryBulkStatePrefix(settings, state);
data_type.deserializeBinaryBulkWithMultipleStreams(*column, n, settings, state);
stopwatch.stop();
std::cout << "Reading, elapsed: " << stopwatch.elapsedSeconds() << std::endl;

View File

@ -27,7 +27,12 @@ int main(int, char **)
WriteBufferFromOStream out_buf(ostr);
stopwatch.restart();
data_type.serializeBinaryBulkWithMultipleStreams(*column, [&](const IDataType::SubstreamPath &){ return &out_buf; }, 0, 0, true, {});
IDataType::SerializeBinaryBulkSettings settings;
settings.getter = [&](const IDataType::SubstreamPath &){ return &out_buf; };
IDataType::SerializeBinaryBulkStatePtr state;
data_type.serializeBinaryBulkStatePrefix(settings, state);
data_type.serializeBinaryBulkWithMultipleStreams(*column, 0, 0, settings, state);
data_type.serializeBinaryBulkStateSuffix(settings, state);
stopwatch.stop();
std::cout << "Elapsed: " << stopwatch.elapsedSeconds() << std::endl;

View File

@ -749,7 +749,7 @@ static bool castTypeToEither(const IDataType * type, F && f)
}
template <template <typename, typename> class Op, typename Name>
template <template <typename, typename> class Op, typename Name, bool CanBeExecutedOnDefaultArguments = true>
class FunctionBinaryArithmetic : public IFunction
{
const Context & context;
@ -993,6 +993,8 @@ public:
return result;
}
#endif
bool canBeExecutedOnDefaultArguments() const override { return CanBeExecutedOnDefaultArguments; }
};
@ -1141,9 +1143,9 @@ using FunctionPlus = FunctionBinaryArithmetic<PlusImpl, NamePlus>;
using FunctionMinus = FunctionBinaryArithmetic<MinusImpl, NameMinus>;
using FunctionMultiply = FunctionBinaryArithmetic<MultiplyImpl, NameMultiply>;
using FunctionDivideFloating = FunctionBinaryArithmetic<DivideFloatingImpl, NameDivideFloating>;
using FunctionDivideIntegral = FunctionBinaryArithmetic<DivideIntegralImpl, NameDivideIntegral>;
using FunctionDivideIntegral = FunctionBinaryArithmetic<DivideIntegralImpl, NameDivideIntegral, false>;
using FunctionDivideIntegralOrZero = FunctionBinaryArithmetic<DivideIntegralOrZeroImpl, NameDivideIntegralOrZero>;
using FunctionModulo = FunctionBinaryArithmetic<ModuloImpl, NameModulo>;
using FunctionModulo = FunctionBinaryArithmetic<ModuloImpl, NameModulo, false>;
using FunctionNegate = FunctionUnaryArithmetic<NegateImpl, NameNegate, true>;
using FunctionAbs = FunctionUnaryArithmetic<AbsImpl, NameAbs, false>;
using FunctionBitAnd = FunctionBinaryArithmetic<BitAndImpl, NameBitAnd>;
@ -1157,8 +1159,8 @@ using FunctionBitRotateRight = FunctionBinaryArithmetic<BitRotateRightImpl, Name
using FunctionBitTest = FunctionBinaryArithmetic<BitTestImpl, NameBitTest>;
using FunctionLeast = FunctionBinaryArithmetic<LeastImpl, NameLeast>;
using FunctionGreatest = FunctionBinaryArithmetic<GreatestImpl, NameGreatest>;
using FunctionGCD = FunctionBinaryArithmetic<GCDImpl, NameGCD>;
using FunctionLCM = FunctionBinaryArithmetic<LCMImpl, NameLCM>;
using FunctionGCD = FunctionBinaryArithmetic<GCDImpl, NameGCD, false>;
using FunctionLCM = FunctionBinaryArithmetic<LCMImpl, NameLCM, false>;
/// Assumed to be injective for the purpose of query optimization, but in fact it is not injective because of possible overflow.
using FunctionIntExp2 = FunctionUnaryArithmetic<IntExp2Impl, NameIntExp2, true>;
using FunctionIntExp10 = FunctionUnaryArithmetic<IntExp10Impl, NameIntExp10, true>;

View File

@ -1339,7 +1339,7 @@ void FunctionArrayDistinct::executeImpl(Block & block, const ColumnNumbers & arg
|| executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
|| executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
|| executeString(*inner_col, offsets, res_data, res_offsets, nullable_col)))
executeHashed(offsets, original_data_columns, res_data, res_offsets);
executeHashed(offsets, original_data_columns, res_data, res_offsets, nullable_col);
block.getByPosition(result).column = std::move(res_ptr);
}
@ -1449,13 +1449,21 @@ void FunctionArrayDistinct::executeHashed(
const ColumnArray::Offsets & offsets,
const ColumnRawPtrs & columns,
IColumn & res_data_col,
ColumnArray::Offsets & res_offsets)
ColumnArray::Offsets & res_offsets,
const ColumnNullable * nullable_col)
{
size_t count = columns.size();
using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;
const PaddedPODArray<UInt8> * src_null_map = nullptr;
if (nullable_col)
{
src_null_map = &static_cast<const ColumnUInt8 *>(&nullable_col->getNullMapColumn())->getData();
}
Set set;
size_t prev_off = 0;
for (size_t i = 0; i < offsets.size(); ++i)
@ -1465,7 +1473,7 @@ void FunctionArrayDistinct::executeHashed(
for (size_t j = prev_off; j < off; ++j)
{
auto hash = hash128(j, count, columns);
if (set.find(hash) == set.end())
if (set.find(hash) == set.end() && (!nullable_col || (*src_null_map)[j] == 0))
{
set.insert(hash);
res_data_col.insertFrom(*columns[0], j);

View File

@ -1255,7 +1255,8 @@ private:
const ColumnArray::Offsets & offsets,
const ColumnRawPtrs & columns,
IColumn & res_data_col,
ColumnArray::Offsets & res_offsets);
ColumnArray::Offsets & res_offsets,
const ColumnNullable * nullable_col);
};

View File

@ -1111,7 +1111,7 @@ public:
return std::make_shared<DataTypeUUID>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{

View File

@ -36,6 +36,8 @@
#include <Functions/FunctionsMiscellaneous.h>
#include <Functions/FunctionsDateTime.h>
#include <Functions/FunctionHelpers.h>
#include <DataTypes/DataTypeWithDictionary.h>
#include <Columns/ColumnWithDictionary.h>
namespace DB
@ -728,6 +730,7 @@ public:
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
bool canBeExecutedOnDefaultArguments() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -1245,6 +1248,7 @@ protected:
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForConstants() const override { return true; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return false; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
private:
@ -1273,7 +1277,8 @@ public:
PreparedFunctionPtr prepare(const Block & /*sample_block*/) const override
{
return std::make_shared<PreparedFunctionCast>(prepare(getArgumentTypes()[0], getReturnType()), name);
return std::make_shared<PreparedFunctionCast>(
prepareUnpackDictionaries(getArgumentTypes()[0], getReturnType()), name);
}
String getName() const override { return name; }
@ -1378,7 +1383,7 @@ private:
throw Exception{"CAST AS Array can only be performed between same-dimensional array types or from String", ErrorCodes::TYPE_MISMATCH};
/// Prepare nested type conversion
const auto nested_function = prepare(from_nested_type, to_nested_type);
const auto nested_function = prepareUnpackDictionaries(from_nested_type, to_nested_type);
return [nested_function, from_nested_type, to_nested_type](
Block & block, const ColumnNumbers & arguments, const size_t result, size_t /*input_rows_count*/)
@ -1432,7 +1437,7 @@ private:
/// Create conversion wrapper for each element in tuple
for (const auto & idx_type : ext::enumerate(from_type->getElements()))
element_wrappers.push_back(prepare(idx_type.second, to_element_types[idx_type.first]));
element_wrappers.push_back(prepareUnpackDictionaries(idx_type.second, to_element_types[idx_type.first]));
return [element_wrappers, from_element_types, to_element_types]
(Block & block, const ColumnNumbers & arguments, const size_t result, size_t input_rows_count)
@ -1576,16 +1581,11 @@ private:
};
}
WrapperType prepare(const DataTypePtr & from_type, const DataTypePtr & to_type) const
WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const
{
/// Determine whether pre-processing and/or post-processing must take place during conversion.
bool source_is_nullable = from_type->isNullable();
bool result_is_nullable = to_type->isNullable();
if (from_type->onlyNull())
{
if (!result_is_nullable)
if (!to_type->isNullable())
throw Exception{"Cannot convert NULL to a non-nullable type", ErrorCodes::CANNOT_CONVERT_TYPE};
return [](Block & block, const ColumnNumbers &, const size_t result, size_t input_rows_count)
@ -1595,6 +1595,88 @@ private:
};
}
const auto * from_with_dict = typeid_cast<const DataTypeWithDictionary *>(from_type.get());
const auto * to_with_dict = typeid_cast<const DataTypeWithDictionary *>(to_type.get());
const auto & from_nested = from_with_dict ? from_with_dict->getDictionaryType() : from_type;
const auto & to_nested = to_with_dict ? to_with_dict->getDictionaryType() : to_type;
auto wrapper = prepareRemoveNullable(from_nested, to_nested);
if (!from_with_dict && !to_with_dict)
return wrapper;
return [wrapper, from_with_dict, to_with_dict]
(Block & block, const ColumnNumbers & arguments, const size_t result, size_t input_rows_count)
{
auto & arg = block.getByPosition(arguments[0]);
auto & res = block.getByPosition(result);
ColumnPtr res_indexes;
/// For some types the default value can't be cast (for example, String to Int). In that case, convert the column to a full one.
bool src_converted_to_full_column = false;
{
/// Replace the argument and result columns (and types) with the dictionary key columns (and types).
/// Call the nested wrapper to cast the dictionary keys, then restore the block.
auto prev_arg_col = arg.column;
auto prev_arg_type = arg.type;
auto prev_res_type = res.type;
auto tmp_rows_count = input_rows_count;
if (to_with_dict)
res.type = to_with_dict->getDictionaryType();
if (from_with_dict)
{
auto * col_with_dict = typeid_cast<const ColumnWithDictionary *>(prev_arg_col.get());
arg.column = col_with_dict->getDictionary().getNestedColumn();
arg.type = from_with_dict->getDictionaryType();
/// TODO: Make map with defaults conversion.
src_converted_to_full_column = !removeNullable(arg.type)->equals(*removeNullable(res.type));
if (src_converted_to_full_column)
arg.column = arg.column->index(col_with_dict->getIndexes(), 0);
else
res_indexes = col_with_dict->getIndexesPtr();
tmp_rows_count = arg.column->size();
}
/// Perform the requested conversion.
wrapper(block, arguments, result, tmp_rows_count);
arg.column = prev_arg_col;
arg.type = prev_arg_type;
res.type = prev_res_type;
}
if (to_with_dict)
{
auto res_column = to_with_dict->createColumn();
auto * col_with_dict = typeid_cast<ColumnWithDictionary *>(res_column.get());
if (from_with_dict && !src_converted_to_full_column)
{
auto res_keys = std::move(res.column);
col_with_dict->insertRangeFromDictionaryEncodedColumn(*res_keys, *res_indexes);
}
else
col_with_dict->insertRangeFromFullColumn(*res.column, 0, res.column->size());
res.column = std::move(res_column);
}
else if (!src_converted_to_full_column)
res.column = res.column->index(*res_indexes, 0);
};
}
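In effect, when the source and destination key types coincide up to Nullable (say LowCardinality(String) to LowCardinality(Nullable(String))), only the dictionary keys go through the nested wrapper and the index column is reattached afterwards, so the work is proportional to the dictionary size; otherwise (String to Int, for instance, where defaults cannot yet be mapped, per the TODO above) the source is expanded to a full column via index() and re-encoded on the way out.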
WrapperType prepareRemoveNullable(const DataTypePtr & from_type, const DataTypePtr & to_type) const
{
/// Determine whether pre-processing and/or post-processing must take place during conversion.
bool source_is_nullable = from_type->isNullable();
bool result_is_nullable = to_type->isNullable();
auto wrapper = prepareImpl(removeNullable(from_type), removeNullable(to_type), result_is_nullable);
if (result_is_nullable)
@ -1740,6 +1822,7 @@ protected:
}
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return false; }
private:
template <typename DataType>

View File

@ -1206,7 +1206,7 @@ public:
return std::make_shared<DataTypeDateTime>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{
@ -1235,7 +1235,7 @@ public:
return std::make_shared<DataTypeDate>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{
@ -1264,7 +1264,7 @@ public:
return std::make_shared<DataTypeDate>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{

View File

@ -220,7 +220,7 @@ public:
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
@ -316,7 +316,7 @@ public:
return std::make_shared<DataTypeUInt8>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
@ -452,7 +452,7 @@ public:
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
@ -728,7 +728,7 @@ public:
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{

View File

@ -94,7 +94,7 @@ private:
return std::make_shared<DataTypeUInt8>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -259,7 +259,7 @@ private:
return std::make_shared<DataTypeString>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -489,7 +489,7 @@ private:
return std::make_shared<DataTypeString>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -755,7 +755,7 @@ private:
return std::make_shared<DataType>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -1030,7 +1030,7 @@ private:
return std::make_shared<DataType>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -1263,7 +1263,7 @@ private:
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{
@ -1423,7 +1423,7 @@ private:
return std::make_shared<DataTypeUInt8>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override
{

View File

@ -23,7 +23,7 @@ public:
bool isVariadic() const override { return true; }
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
size_t getNumberOfArguments() const override { return 0; }

View File

@ -11,6 +11,7 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnWithDictionary.h>
#include <Functions/FunctionHelpers.h>
#include <Common/UnicodeBar.h>
#include <Common/UTF8Helpers.h>
@ -24,6 +25,7 @@
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeWithDictionary.h>
#include <DataTypes/NumberTraits.h>
#include <Formats/FormatSettings.h>
#include <Functions/FunctionFactory.h>
@ -111,7 +113,7 @@ public:
return std::make_shared<DataTypeString>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{
@ -135,9 +137,9 @@ public:
return name;
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -213,6 +215,7 @@ public:
}
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return false; }
size_t getNumberOfArguments() const override
{
@ -404,9 +407,9 @@ public:
return name;
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -448,9 +451,9 @@ public:
return 0;
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -497,9 +500,9 @@ public:
return 0;
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -541,9 +544,9 @@ public:
return 0;
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -914,9 +917,9 @@ public:
}
/** It could return many different values for single argument. */
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -1317,7 +1320,7 @@ public:
return std::make_shared<DataTypeUInt32>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{
@ -1354,7 +1357,7 @@ public:
return std::make_shared<DataTypeString>();
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers &, size_t result, size_t input_rows_count) override
{
@ -1388,9 +1391,9 @@ public:
return 1;
}
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -1557,7 +1560,7 @@ public:
return 1;
}
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
return false;
}
@ -1698,7 +1701,7 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override;
bool isDeterministic() override { return false; }
bool isDeterministic() const override { return false; }
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override;
@ -1866,6 +1869,119 @@ public:
};
class FunctionToLowCardinality: public IFunction
{
public:
static constexpr auto name = "toLowCardinality";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionToLowCardinality>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForConstants() const override { return true; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return false; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments[0]->withDictionary())
return arguments[0];
return std::make_shared<DataTypeWithDictionary>(arguments[0]);
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
auto arg_num = arguments[0];
const auto & arg = block.getByPosition(arg_num);
auto & res = block.getByPosition(result);
if (arg.type->withDictionary())
res.column = arg.column;
else
{
auto column = res.type->createColumn();
typeid_cast<ColumnWithDictionary &>(*column).insertRangeFromFullColumn(*arg.column, 0, arg.column->size());
res.column = std::move(column);
}
}
};
class FunctionLowCardinalityIndexes: public IFunction
{
public:
static constexpr auto name = "lowCardinalityIndexes";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionLowCardinalityIndexes>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForConstants() const override { return true; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return false; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
auto * type = typeid_cast<const DataTypeWithDictionary *>(arguments[0].get());
if (!type)
throw Exception("First first argument of function lowCardinalityIndexes must be ColumnWithDictionary, but got"
+ arguments[0]->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeUInt64>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
auto arg_num = arguments[0];
const auto & arg = block.getByPosition(arg_num);
auto & res = block.getByPosition(result);
auto indexes_col = typeid_cast<const ColumnWithDictionary *>(arg.column.get())->getIndexesPtr();
auto new_indexes_col = ColumnUInt64::create(indexes_col->size());
auto & data = new_indexes_col->getData();
for (size_t i = 0; i < data.size(); ++i)
data[i] = indexes_col->getUInt(i);
res.column = std::move(new_indexes_col);
}
};
class FunctionLowCardinalityKeys: public IFunction
{
public:
static constexpr auto name = "lowCardinalityKeys";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionLowCardinalityKeys>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool useDefaultImplementationForNulls() const override { return false; }
bool useDefaultImplementationForConstants() const override { return true; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return false; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
auto * type = typeid_cast<const DataTypeWithDictionary *>(arguments[0].get());
if (!type)
throw Exception("First first argument of function lowCardinalityKeys must be ColumnWithDictionary, but got"
+ arguments[0]->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return type->getDictionaryType();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
auto arg_num = arguments[0];
const auto & arg = block.getByPosition(arg_num);
auto & res = block.getByPosition(result);
const auto * column_with_dictionary = typeid_cast<const ColumnWithDictionary *>(arg.column.get());
res.column = column_with_dictionary->getDictionary().getNestedColumn()->cloneResized(arg.column->size());
}
};
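
Taken together, these two introspection functions expose both halves of the encoding: for a LowCardinality(String) column with values 'a', 'b', 'a', lowCardinalityKeys returns the dictionary ('a', 'b', padded by cloneResized to the column's row count) and lowCardinalityIndexes returns the positions 0, 1, 0 as UInt64.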
std::string FunctionVersion::getVersion() const
{
return VERSION_STRING;
@ -1915,5 +2031,9 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
factory.registerFunction<FunctionRunningDifference>();
factory.registerFunction<FunctionRunningIncome>();
factory.registerFunction<FunctionFinalizeAggregation>();
factory.registerFunction<FunctionToLowCardinality>();
factory.registerFunction<FunctionLowCardinalityIndexes>();
factory.registerFunction<FunctionLowCardinalityKeys>();
}
}

View File

@ -6,6 +6,7 @@
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <Columns/ColumnFunction.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{

View File

@ -138,7 +138,7 @@ public:
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool isDeterministicInScopeOfQuery() override { return false; }
bool isDeterministicInScopeOfQuery() const override { return false; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{

View File

@ -5,6 +5,9 @@
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/Native.h>
#include <DataTypes/DataTypeWithDictionary.h>
#include <DataTypes/getLeastSupertype.h>
#include <Columns/ColumnWithDictionary.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <Interpreters/ExpressionActions.h>
@ -187,7 +190,7 @@ bool PreparedFunctionImpl::defaultImplementationForConstantArguments(Block & blo
for (size_t i = 0; i < arguments_size; ++i)
temporary_argument_numbers[i] = i;
execute(temporary_block, temporary_argument_numbers, arguments_size, temporary_block.rows());
executeWithoutColumnsWithDictionary(temporary_block, temporary_argument_numbers, arguments_size, temporary_block.rows());
block.getByPosition(result).column = ColumnConst::create(temporary_block.getByPosition(arguments_size).column, input_rows_count);
return true;
@ -211,7 +214,7 @@ bool PreparedFunctionImpl::defaultImplementationForNulls(Block & block, const Co
if (null_presence.has_nullable)
{
Block temporary_block = createBlockWithNestedColumns(block, args, result);
execute(temporary_block, args, result, temporary_block.rows());
executeWithoutColumnsWithDictionary(temporary_block, args, result, temporary_block.rows());
block.getByPosition(result).column = wrapInNullable(temporary_block.getByPosition(result).column, block, args,
result, input_rows_count);
return true;
@ -220,7 +223,7 @@ bool PreparedFunctionImpl::defaultImplementationForNulls(Block & block, const Co
return false;
}
void PreparedFunctionImpl::execute(Block & block, const ColumnNumbers & args, size_t result, size_t input_rows_count)
void PreparedFunctionImpl::executeWithoutColumnsWithDictionary(Block & block, const ColumnNumbers & args, size_t result, size_t input_rows_count)
{
if (defaultImplementationForConstantArguments(block, args, result, input_rows_count))
return;
@ -231,6 +234,115 @@ void PreparedFunctionImpl::execute(Block & block, const ColumnNumbers & args, si
executeImpl(block, args, result, input_rows_count);
}
static ColumnPtr replaceColumnsWithDictionaryByNestedAndGetDictionaryIndexes(Block & block, const ColumnNumbers & args,
bool can_be_executed_on_default_arguments)
{
size_t num_rows = 0;
ColumnPtr indexes;
for (auto arg : args)
{
ColumnWithTypeAndName & column = block.getByPosition(arg);
if (auto * column_with_dict = checkAndGetColumn<ColumnWithDictionary>(column.column.get()))
{
if (indexes)
throw Exception("Expected single dictionary argument for function.", ErrorCodes::LOGICAL_ERROR);
indexes = column_with_dict->getIndexesPtr();
num_rows = column_with_dict->getDictionary().size();
}
}
if (!indexes)
throw Exception("Expected column with dictionary for any function argument.", ErrorCodes::LOGICAL_ERROR);
for (auto arg : args)
{
ColumnWithTypeAndName & column = block.getByPosition(arg);
if (auto * column_const = checkAndGetColumn<ColumnConst>(column.column.get()))
column.column = column_const->cloneResized(num_rows);
else if (auto * column_with_dict = checkAndGetColumn<ColumnWithDictionary>(column.column.get()))
{
auto * type_with_dict = checkAndGetDataType<DataTypeWithDictionary>(column.type.get());
if (!type_with_dict)
throw Exception("Incompatible type for column with dictionary: " + column.type->getName(),
ErrorCodes::LOGICAL_ERROR);
if (can_be_executed_on_default_arguments)
column.column = column_with_dict->getDictionary().getNestedColumn();
else
{
auto dict_encoded = column_with_dict->getMinimalDictionaryEncodedColumn(0, column_with_dict->size());
column.column = dict_encoded.dictionary;
indexes = dict_encoded.indexes;
}
column.type = type_with_dict->getDictionaryType();
}
}
return indexes;
}
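
replaceColumnsWithDictionaryByNestedAndGetDictionaryIndexes is the heart of the shortcut: the function body is evaluated once per distinct dictionary key, and the per-row indexes are then reused to assemble the result. A hedged standalone sketch of that idea follows; transform and all other names here are illustrative, not ClickHouse API.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static std::string transform(const std::string & s)
{
    return s + "!";   // stands in for an arbitrary deterministic function
}

int main()
{
    std::vector<std::string> keys = {"moscow", "paris", "berlin"};
    std::vector<uint64_t> indexes = {0, 1, 0, 2, 1};   // 5 rows encoded over 3 keys

    // Apply the function to the dictionary only: 3 calls instead of 5.
    std::vector<std::string> transformed_keys;
    transformed_keys.reserve(keys.size());
    for (const auto & key : keys)
        transformed_keys.push_back(transform(key));

    // The result keeps the same indexes over the transformed dictionary.
    for (auto index : indexes)
        std::cout << transformed_keys[index] << '\n';
}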
static void convertColumnsWithDictionaryToFull(Block & block, const ColumnNumbers & args)
{
for (auto arg : args)
{
ColumnWithTypeAndName & column = block.getByPosition(arg);
if (auto * column_with_dict = checkAndGetColumn<ColumnWithDictionary>(column.column.get()))
{
auto * type_with_dict = checkAndGetDataType<DataTypeWithDictionary>(column.type.get());
if (!type_with_dict)
throw Exception("Incompatible type for column with dictionary: " + column.type->getName(),
ErrorCodes::LOGICAL_ERROR);
column.column = column_with_dict->convertToFullColumn();
column.type = type_with_dict->getDictionaryType();
}
}
}
void PreparedFunctionImpl::execute(Block & block, const ColumnNumbers & args, size_t result, size_t input_rows_count)
{
if (useDefaultImplementationForColumnsWithDictionary())
{
auto & res = block.safeGetByPosition(result);
Block block_without_dicts = block.cloneWithoutColumns();
for (auto arg : args)
block_without_dicts.safeGetByPosition(arg).column = block.safeGetByPosition(arg).column;
if (auto * res_type_with_dict = typeid_cast<const DataTypeWithDictionary *>(res.type.get()))
{
block_without_dicts.safeGetByPosition(result).type = res_type_with_dict->getDictionaryType();
ColumnPtr indexes = replaceColumnsWithDictionaryByNestedAndGetDictionaryIndexes(
block_without_dicts, args, canBeExecutedOnDefaultArguments());
executeWithoutColumnsWithDictionary(block_without_dicts, args, result, block_without_dicts.rows());
auto res_column = res.type->createColumn();
auto * column_with_dictionary = typeid_cast<ColumnWithDictionary *>(res_column.get());
if (!column_with_dictionary)
throw Exception("Expected ColumnWithDictionary, got" + res_column->getName(), ErrorCodes::LOGICAL_ERROR);
const auto & keys = block_without_dicts.safeGetByPosition(result).column;
column_with_dictionary->insertRangeFromDictionaryEncodedColumn(*keys, *indexes);
res.column = std::move(res_column);
}
else
{
convertColumnsWithDictionaryToFull(block_without_dicts, args);
executeWithoutColumnsWithDictionary(block_without_dicts, args, result, input_rows_count);
res.column = block_without_dicts.safeGetByPosition(result).column;
}
}
else
executeWithoutColumnsWithDictionary(block, args, result, input_rows_count);
}
void FunctionBuilderImpl::checkNumberOfArguments(size_t number_of_arguments) const
{
if (isVariadic())
@ -244,19 +356,19 @@ void FunctionBuilderImpl::checkNumberOfArguments(size_t number_of_arguments) con
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
}
DataTypePtr FunctionBuilderImpl::getReturnType(const ColumnsWithTypeAndName & arguments) const
DataTypePtr FunctionBuilderImpl::getReturnTypeWithoutDictionary(const ColumnsWithTypeAndName & arguments) const
{
checkNumberOfArguments(arguments.size());
if (!arguments.empty() && useDefaultImplementationForNulls())
{
NullPresence null_presense = getNullPresense(arguments);
NullPresence null_presence = getNullPresense(arguments);
if (null_presense.has_null_constant)
if (null_presence.has_null_constant)
{
return makeNullable(std::make_shared<DataTypeNothing>());
}
if (null_presense.has_nullable)
if (null_presence.has_nullable)
{
Block nested_block = createBlockWithNestedColumns(Block(arguments), ext::collection_cast<ColumnNumbers>(ext::range(0, arguments.size())));
auto return_type = getReturnTypeImpl(ColumnsWithTypeAndName(nested_block.begin(), nested_block.end()));
@ -334,4 +446,39 @@ llvm::Value * IFunction::compile(llvm::IRBuilderBase & builder, const DataTypes
#endif
DataTypePtr FunctionBuilderImpl::getReturnType(const ColumnsWithTypeAndName & arguments) const
{
if (useDefaultImplementationForColumnsWithDictionary())
{
bool has_type_with_dictionary = false;
bool can_run_function_on_dictionary = true;
ColumnsWithTypeAndName args_without_dictionary(arguments);
for (ColumnWithTypeAndName & arg : args_without_dictionary)
{
if (arg.column && arg.column->isColumnConst())
continue;
if (auto * type_with_dictionary = typeid_cast<const DataTypeWithDictionary *>(arg.type.get()))
{
if (has_type_with_dictionary)
can_run_function_on_dictionary = false;
has_type_with_dictionary = true;
arg.type = type_with_dictionary->getDictionaryType();
}
else
can_run_function_on_dictionary = false;
}
if (canBeExecutedOnLowCardinalityDictionary() && has_type_with_dictionary && can_run_function_on_dictionary)
return std::make_shared<DataTypeWithDictionary>(getReturnTypeWithoutDictionary(args_without_dictionary));
else
return getReturnTypeWithoutDictionary(args_without_dictionary);
}
return getReturnTypeWithoutDictionary(arguments);
}
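
The net effect of this getReturnType: the result is wrapped in DataTypeWithDictionary only when exactly one non-constant argument has a dictionary type, every other argument is constant, and canBeExecutedOnLowCardinalityDictionary() holds (for IFunction this defaults to isDeterministicInScopeOfQuery(), see below); in every other case the dictionary types are stripped and an ordinary return type is computed.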
}

View File

@ -68,15 +68,28 @@ protected:
*/
virtual bool useDefaultImplementationForConstants() const { return false; }
/** If the function arguments contain a single column with dictionary and all other arguments are constants, call the function on the nested column.
* Otherwise, convert all columns with dictionary to ordinary columns.
* Returns ColumnWithDictionary if at least one argument is ColumnWithDictionary.
*/
virtual bool useDefaultImplementationForColumnsWithDictionary() const { return true; }
/** Some arguments could remain constant during this implementation.
*/
virtual ColumnNumbers getArgumentsThatAreAlwaysConstant() const { return {}; }
/** True if the function can be called on default arguments (including Nullables) and won't throw.
* Counterexample: modulo(0, 0)
*/
virtual bool canBeExecutedOnDefaultArguments() const { return true; }
private:
bool defaultImplementationForNulls(Block & block, const ColumnNumbers & args, size_t result,
size_t input_rows_count);
bool defaultImplementationForConstantArguments(Block & block, const ColumnNumbers & args, size_t result,
size_t input_rows_count);
void executeWithoutColumnsWithDictionary(Block & block, const ColumnNumbers & arguments, size_t result,
size_t input_rows_count);
};
using ValuePlaceholders = std::vector<std::function<llvm::Value * ()>>;
@ -159,9 +172,9 @@ public:
* Example: now(). Another example: functions that work with periodically updated dictionaries.
*/
virtual bool isDeterministic() { return true; }
virtual bool isDeterministic() const { return true; }
virtual bool isDeterministicInScopeOfQuery() { return true; }
virtual bool isDeterministicInScopeOfQuery() const { return true; }
/** Lets you know if the function is monotonic in a range of values.
* This is used to work with the index in a sorted chunk of data.
@ -266,12 +279,25 @@ protected:
*/
virtual bool useDefaultImplementationForNulls() const { return true; }
/** If useDefaultImplementationForNulls() is true, then change arguments for getReturnType() and buildImpl().
* If the function arguments have types with dictionary, convert them to ordinary types.
* getReturnType returns ColumnWithDictionary if at least one argument type is ColumnWithDictionary.
*/
virtual bool useDefaultImplementationForColumnsWithDictionary() const { return true; }
/// If not, all ColumnWithDictionary arguments will be converted to full columns.
virtual bool canBeExecutedOnLowCardinalityDictionary() const { return true; }
virtual FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const = 0;
virtual void getLambdaArgumentTypesImpl(DataTypes & /*arguments*/) const
{
throw Exception("Function " + getName() + " can't have lambda-expressions as arguments", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
private:
DataTypePtr getReturnTypeWithoutDictionary(const ColumnsWithTypeAndName & arguments) const;
};
/// Previous function interface.
@ -286,7 +312,10 @@ public:
/// Override this functions to change default implementation behavior. See details in IMyFunction.
bool useDefaultImplementationForNulls() const override { return true; }
bool useDefaultImplementationForConstants() const override { return false; }
bool useDefaultImplementationForColumnsWithDictionary() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {}; }
bool canBeExecutedOnDefaultArguments() const override { return true; }
bool canBeExecutedOnLowCardinalityDictionary() const override { return isDeterministicInScopeOfQuery(); }
using PreparedFunctionImpl::execute;
using FunctionBuilderImpl::getReturnTypeImpl;
@ -365,7 +394,9 @@ protected:
}
bool useDefaultImplementationForNulls() const final { return function->useDefaultImplementationForNulls(); }
bool useDefaultImplementationForConstants() const final { return function->useDefaultImplementationForConstants(); }
bool useDefaultImplementationForColumnsWithDictionary() const final { return function->useDefaultImplementationForColumnsWithDictionary(); }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const final { return function->getArgumentsThatAreAlwaysConstant(); }
bool canBeExecutedOnDefaultArguments() const override { return function->canBeExecutedOnDefaultArguments(); }
private:
std::shared_ptr<IFunction> function;
@ -396,9 +427,9 @@ public:
bool isInjective(const Block & sample_block) override { return function->isInjective(sample_block); }
bool isDeterministic() override { return function->isDeterministic(); }
bool isDeterministic() const override { return function->isDeterministic(); }
bool isDeterministicInScopeOfQuery() override { return function->isDeterministicInScopeOfQuery(); }
bool isDeterministicInScopeOfQuery() const override { return function->isDeterministicInScopeOfQuery(); }
bool hasInformationAboutMonotonicity() const override { return function->hasInformationAboutMonotonicity(); }
@ -431,6 +462,8 @@ protected:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override { return function->getReturnTypeImpl(arguments); }
bool useDefaultImplementationForNulls() const override { return function->useDefaultImplementationForNulls(); }
bool useDefaultImplementationForColumnsWithDictionary() const override { return function->useDefaultImplementationForColumnsWithDictionary(); }
bool canBeExecutedOnLowCardinalityDictionary() const override { return function->canBeExecutedOnLowCardinalityDictionary(); }
FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
{

View File

@ -618,6 +618,13 @@ void Context::checkDatabaseAccessRights(const std::string & database_name) const
checkDatabaseAccessRightsImpl(database_name);
}
bool Context::hasDatabaseAccessRights(const String & database_name) const
{
auto lock = getLock();
return client_info.current_user.empty() || (database_name == "system") ||
shared->security_manager->hasAccessToDatabase(client_info.current_user, database_name);
}
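
hasDatabaseAccessRights mirrors checkDatabaseAccessRightsImpl but returns a flag instead of throwing: an empty current_user (e.g. internal queries) and the system database are always visible, otherwise the security manager decides. The system table storages changed further below use it to hide databases the current user may not access.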
void Context::checkDatabaseAccessRightsImpl(const std::string & database_name) const
{
if (client_info.current_user.empty() || (database_name == "system"))
@ -1793,6 +1800,7 @@ std::shared_ptr<ActionLocksManager> Context::getActionLocksManager()
return shared->action_locks_manager;
}
SessionCleaner::~SessionCleaner()
{
try

View File

@ -180,6 +180,7 @@ public:
bool isTableExist(const String & database_name, const String & table_name) const;
bool isDatabaseExist(const String & database_name) const;
bool isExternalTableExist(const String & table_name) const;
bool hasDatabaseAccessRights(const String & database_name) const;
void assertTableExists(const String & database_name, const String & table_name) const;
/** The parameter check_database_access_rights exists to not check the permissions of the database again,

View File

@ -476,7 +476,7 @@ public:
PreparedFunctionPtr prepare(const Block &) const override { return std::make_shared<LLVMPreparedFunction>(name, context); }
bool isDeterministic() override
bool isDeterministic() const override
{
for (const auto & f : originals)
if (!f->isDeterministic())
@ -484,7 +484,7 @@ public:
return true;
}
bool isDeterministicInScopeOfQuery() override
bool isDeterministicInScopeOfQuery() const override
{
for (const auto & f : originals)
if (!f->isDeterministicInScopeOfQuery())

View File

@ -53,6 +53,7 @@ namespace ErrorCodes
extern const int UNKNOWN_DATABASE_ENGINE;
extern const int DUPLICATE_COLUMN;
extern const int READONLY;
extern const int ILLEGAL_COLUMN;
}
@ -348,6 +349,33 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription(const ASTExpres
}
void InterpreterCreateQuery::checkSupportedTypes(const ColumnsDescription & columns, const Context & context)
{
const auto & settings = context.getSettingsRef();
bool allow_low_cardinality = settings.allow_experimental_low_cardinality_type != 0;
if (allow_low_cardinality)
return;
auto check_types = [&](const NamesAndTypesList & list)
{
for (const auto & column : list)
{
if (!allow_low_cardinality && column.type && column.type->withDictionary())
{
String message = "Cannot create table with column " + column.name + " which type is "
+ column.type->getName() + " because LowCardinality type is not allowed. "
+ "Set setting allow_experimental_low_cardinality_type = 1 in order to allow it.";
throw Exception(message, ErrorCodes::ILLEGAL_COLUMN);
}
}
};
check_types(columns.ordinary);
check_types(columns.materialized);
}
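
With the default allow_experimental_low_cardinality_type = 0, a CREATE TABLE with a LowCardinality column (among ordinary or materialized columns) fails with ILLEGAL_COLUMN and a message suggesting the setting; note in createTable below that ATTACH skips this check, so already existing tables continue to load.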
ColumnsDescription InterpreterCreateQuery::setColumns(
ASTCreateQuery & create, const Block & as_select_sample, const StoragePtr & as_storage) const
{
@ -487,6 +515,10 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
/// Set and retrieve list of columns.
ColumnsDescription columns = setColumns(create, as_select_sample, as_storage);
/// Some column types may be not allowed according to settings.
if (!create.attach)
checkSupportedTypes(columns, context);
/// Set the table engine if it was not specified explicitly.
setEngine(create);

View File

@ -46,6 +46,8 @@ public:
/// Obtain information about columns, their types and default values, for case when columns in CREATE query is specified explicitly.
static ColumnsDescription getColumnsDescription(const ASTExpressionList & columns, const Context & context);
/// Check that column types are allowed for usage in table according to settings.
static void checkSupportedTypes(const ColumnsDescription & columns, const Context & context);
private:
BlockIO createDatabase(ASTCreateQuery & create);

View File

@ -18,6 +18,7 @@
#include <DataStreams/CreatingSetsBlockInputStream.h>
#include <DataStreams/MaterializingBlockInputStream.h>
#include <DataStreams/ConcatBlockInputStream.h>
#include <DataStreams/ConvertColumnWithDictionaryToFullBlockInputStream.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSelectWithUnionQuery.h>
@ -775,7 +776,8 @@ void InterpreterSelectQuery::executeAggregation(Pipeline & pipeline, const Expre
{
pipeline.transform([&](auto & stream)
{
stream = std::make_shared<ExpressionBlockInputStream>(stream, expression);
stream = std::make_shared<ConvertColumnWithDictionaryToFullBlockInputStream>(
std::make_shared<ExpressionBlockInputStream>(stream, expression));
});
Names key_names;

View File

@ -268,6 +268,11 @@ struct Settings
M(SettingUInt64, enable_conditional_computation, 0, "Enable conditional computations") \
\
M(SettingDateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.") \
\
M(SettingUInt64, low_cardinality_max_dictionary_size, 8192, "Maximum size (in rows) of shared global dictionary for LowCardinality type.") \
M(SettingBool, low_cardinality_use_single_dictionary_for_part, false, "LowCardinality type serialization setting. If true, then additional keys are used when the global dictionary overflows. Otherwise, several shared dictionaries are created.") \
M(SettingBool, allow_experimental_low_cardinality_type, false, "Allows creating tables with LowCardinality types.") \
\
M(SettingBool, prefer_localhost_replica, 1, "1 - always send query to local replica, if it exists. 0 - choose replica to send query between local and remote ones according to load_balancing") \
M(SettingUInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.") \

View File

@ -18,6 +18,7 @@
#include <Common/typeid_cast.h>
#include <Common/NaNUtils.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeWithDictionary.h>
namespace DB
@ -217,12 +218,10 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co
if (from_type_hint && from_type_hint->equals(to_type))
return from_value;
if (to_type.isNullable())
{
const DataTypeNullable & nullable_type = static_cast<const DataTypeNullable &>(to_type);
const DataTypePtr & nested_type = nullable_type.getNestedType();
return convertFieldToTypeImpl(from_value, *nested_type);
}
if (auto * with_dict_type = typeid_cast<const DataTypeWithDictionary *>(&to_type))
return convertFieldToType(from_value, *with_dict_type->getDictionaryType(), from_type_hint);
else if (auto * nullable_type = typeid_cast<const DataTypeNullable *>(&to_type))
return convertFieldToTypeImpl(from_value, *nullable_type->getNestedType());
else
return convertFieldToTypeImpl(from_value, to_type);
}

View File

@ -17,6 +17,7 @@ namespace DB
namespace ErrorCodes
{
extern const int SYNTAX_ERROR;
extern const int TOP_AND_LIMIT_TOGETHER;
}
@ -38,6 +39,8 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
ParserKeyword s_limit("LIMIT");
ParserKeyword s_settings("SETTINGS");
ParserKeyword s_by("BY");
ParserKeyword s_top("TOP");
ParserKeyword s_offset("OFFSET");
ParserNotEmptyExpressionList exp_list(false);
ParserNotEmptyExpressionList exp_list_for_with_clause(false, true); /// Set prefer_alias_to_column_name for each alias.
@ -62,6 +65,26 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
if (s_distinct.ignore(pos, expected))
select_query->distinct = true;
if (s_top.ignore(pos, expected))
{
ParserToken open_bracket(TokenType::OpeningRoundBracket);
ParserToken close_bracket(TokenType::ClosingRoundBracket);
ParserNumber num;
if (open_bracket.ignore(pos, expected))
{
if (!num.parse(pos, select_query->limit_length, expected))
return false;
if (!close_bracket.ignore(pos, expected))
return false;
}
else
{
if (!num.parse(pos, select_query->limit_length, expected))
return false;
}
}
if (!exp_list_for_select_clause.parse(pos, select_query->select_expression_list, expected))
return false;
}
@ -120,6 +143,9 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
/// LIMIT length | LIMIT offset, length | LIMIT count BY expr-list
if (s_limit.ignore(pos, expected))
{
if (select_query->limit_length)
throw Exception("Can not use TOP and LIMIT together", ErrorCodes::TOP_AND_LIMIT_TOGETHER);
ParserToken s_comma(TokenType::Comma);
ParserNumber num;
@ -140,6 +166,11 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
if (!exp_list.parse(pos, select_query->limit_by_expression_list, expected))
return false;
}
else if (s_offset.ignore(pos, expected))
{
if (!num.parse(pos, select_query->limit_offset, expected))
return false;
}
}
/// LIMIT length | LIMIT offset, length
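
In short, the parser now accepts the TOP form found in some SQL dialects: SELECT TOP 10 ... and SELECT TOP (10) ... fill the same limit_length as LIMIT 10, combining TOP with LIMIT raises TOP_AND_LIMIT_TOGETHER, and LIMIT n OFFSET m is accepted as an alternative spelling of LIMIT m, n.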

View File

@ -586,12 +586,13 @@ void MergeTreeDataPart::accumulateColumnSizes(ColumnToSize & column_to_size) con
for (const NameAndTypePair & name_type : storage.getColumns().getAllPhysical())
{
IDataType::SubstreamPath path;
name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path)
{
Poco::File bin_file(getFullPath() + IDataType::getFileNameForStream(name_type.name, substream_path) + ".bin");
if (bin_file.exists())
column_to_size[name_type.name] += bin_file.getSize();
}, {});
}, path);
}
}
@ -637,6 +638,7 @@ void MergeTreeDataPart::checkConsistency(bool require_part_metadata)
{
for (const NameAndTypePair & name_type : columns)
{
IDataType::SubstreamPath stream_path;
name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path)
{
String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);
@ -648,7 +650,7 @@ void MergeTreeDataPart::checkConsistency(bool require_part_metadata)
if (!checksums.files.count(bin_file_name))
throw Exception("No " + bin_file_name + " file checksum for column " + name + " in part " + path,
ErrorCodes::NO_FILE_IN_DATA_PART);
}, {});
}, stream_path);
}
}
@ -721,7 +723,7 @@ void MergeTreeDataPart::checkConsistency(bool require_part_metadata)
throw Exception("Part " + path + " is broken: marks have different sizes.",
ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART);
}
}, {});
});
}
}
}

View File

@ -42,7 +42,7 @@ MergeTreeReader::MergeTreeReader(const String & path,
clockid_t clock_type)
: avg_value_size_hints(avg_value_size_hints), path(path), data_part(data_part), columns(columns)
, uncompressed_cache(uncompressed_cache), mark_cache(mark_cache), save_marks_in_cache(save_marks_in_cache), storage(storage)
, all_mark_ranges(all_mark_ranges), aio_threshold(aio_threshold), max_read_buffer_size(max_read_buffer_size)
, all_mark_ranges(all_mark_ranges), aio_threshold(aio_threshold), max_read_buffer_size(max_read_buffer_size), index_granularity(storage.index_granularity)
{
try
{
@ -200,6 +200,11 @@ MergeTreeReader::Stream::Stream(
getMark(right).offset_in_compressed_file - getMark(all_mark_ranges[i].begin).offset_in_compressed_file);
}
/// Avoid an empty buffer. This may happen while reading the dictionary for DataTypeWithDictionary.
/// For example: the part has a single dictionary and all marks point to the same position.
if (max_mark_range == 0)
max_mark_range = max_read_buffer_size;
size_t buffer_size = std::min(max_read_buffer_size, max_mark_range);
/// Estimate size of the data to be read.
@ -329,6 +334,26 @@ void MergeTreeReader::Stream::seekToMark(size_t index)
}
void MergeTreeReader::Stream::seekToStart()
{
try
{
if (cached_buffer)
cached_buffer->seek(0, 0);
if (non_cached_buffer)
non_cached_buffer->seek(0, 0);
}
catch (Exception & e)
{
/// Better diagnostics.
if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND)
e.addMessage("(while seeking to start of column " + path_prefix + ")");
throw;
}
}
void MergeTreeReader::addStreams(const String & name, const IDataType & type, const MarkRanges & all_mark_ranges,
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type)
{
@ -353,7 +378,8 @@ void MergeTreeReader::addStreams(const String & name, const IDataType & type, co
uncompressed_cache, aio_threshold, max_read_buffer_size, profile_callback, clock_type));
};
type.enumerateStreams(callback, {});
IDataType::SubstreamPath path;
type.enumerateStreams(callback, path);
}
@ -362,28 +388,47 @@ void MergeTreeReader::readData(
size_t from_mark, bool continue_reading, size_t max_rows_to_read,
bool with_offsets)
{
IDataType::InputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & path) -> ReadBuffer *
auto get_stream_getter = [&](bool stream_for_prefix) -> IDataType::InputStreamGetter
{
/// If offsets for arrays have already been read.
if (!with_offsets && path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes)
return nullptr;
return [&, stream_for_prefix](const IDataType::SubstreamPath & path) -> ReadBuffer *
{
/// If offsets for arrays have already been read.
if (!with_offsets && path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes)
return nullptr;
String stream_name = IDataType::getFileNameForStream(name, path);
String stream_name = IDataType::getFileNameForStream(name, path);
auto it = streams.find(stream_name);
if (it == streams.end())
return nullptr;
auto it = streams.find(stream_name);
if (it == streams.end())
return nullptr;
Stream & stream = *it->second;
Stream & stream = *it->second;
if (!continue_reading)
stream.seekToMark(from_mark);
if (stream_for_prefix)
{
stream.seekToStart();
continue_reading = false;
}
else if (!continue_reading)
stream.seekToMark(from_mark);
return stream.data_buffer;
return stream.data_buffer;
};
};
double & avg_value_size_hint = avg_value_size_hints[name];
type.deserializeBinaryBulkWithMultipleStreams(column, stream_getter, max_rows_to_read, avg_value_size_hint, true, {});
IDataType::DeserializeBinaryBulkSettings settings;
settings.avg_value_size_hint = avg_value_size_hint;
if (deserialize_binary_bulk_state_map.count(name) == 0)
{
settings.getter = get_stream_getter(true);
type.deserializeBinaryBulkStatePrefix(settings, deserialize_binary_bulk_state_map[name]);
}
settings.getter = get_stream_getter(false);
auto & deserialize_state = deserialize_binary_bulk_state_map[name];
type.deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, settings, deserialize_state);
IDataType::updateAvgValueSizeHint(column, avg_value_size_hint);
}
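
readData now splits deserialization into two phases: a one-time prefix read per column (streams seeked to their start, e.g. to load a shared dictionary) and ordinary bulk reads that continue from a cached state. A minimal sketch of that pattern, assuming nothing about IDataType; the State type and helpers are illustrative.

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

struct DeserializeState
{
    bool prefix_read = false;   // set once the serialization prefix has been consumed
    size_t rows_read = 0;       // position the next bulk read continues from
};

void readPrefix(DeserializeState & state)
{
    state.prefix_read = true;   // e.g. read the per-part shared dictionary here
}

void readBulk(DeserializeState & state, size_t rows)
{
    state.rows_read += rows;    // continue from wherever the previous read stopped
}

int main()
{
    // Per-column states, like deserialize_binary_bulk_state_map above.
    std::map<std::string, DeserializeState> states;

    for (int pass = 0; pass < 3; ++pass)
    {
        auto & state = states["city"];
        if (!state.prefix_read)
            readPrefix(state);  // only before the very first read of this column
        readBulk(state, 8192);
    }
    std::cout << states["city"].rows_read << " rows read\n";  // 24576
}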

View File

@ -23,6 +23,7 @@ class MergeTreeReader : private boost::noncopyable
{
public:
using ValueSizeMap = std::map<std::string, double>;
using DeserializeBinaryBulkStateMap = std::map<std::string, IDataType::DeserializeBinaryBulkStatePtr>;
MergeTreeReader(const String & path, /// Path to the directory containing the part
const MergeTreeData::DataPartPtr & data_part, const NamesAndTypesList & columns,
@ -63,6 +64,7 @@ private:
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type);
void seekToMark(size_t index);
void seekToStart();
ReadBuffer * data_buffer;
@ -91,6 +93,8 @@ private:
/// avg_value_size_hints are used to reduce the number of reallocations when creating columns of variable size.
ValueSizeMap avg_value_size_hints;
/// Stores states for IDataType::deserializeBinaryBulk
DeserializeBinaryBulkStateMap deserialize_binary_bulk_state_map;
String path;
MergeTreeData::DataPartPtr data_part;
@ -108,6 +112,7 @@ private:
MarkRanges all_mark_ranges;
size_t aio_threshold;
size_t max_read_buffer_size;
size_t index_granularity;
void addStreams(const String & name, const IDataType & type, const MarkRanges & all_mark_ranges,
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type);

View File

@ -64,7 +64,28 @@ void IMergedBlockOutputStream::addStreams(
aio_threshold);
};
type.enumerateStreams(callback, {});
IDataType::SubstreamPath stream_path;
type.enumerateStreams(callback, stream_path);
}
IDataType::OutputStreamGetter IMergedBlockOutputStream::createStreamGetter(
const String & name, OffsetColumns & offset_columns, bool skip_offsets)
{
return [&, skip_offsets] (const IDataType::SubstreamPath & substream_path) -> WriteBuffer *
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
if (is_offsets && skip_offsets)
return nullptr;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
/// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name))
return nullptr;
return &column_streams[stream_name]->compressed;
};
}
@ -73,8 +94,15 @@ void IMergedBlockOutputStream::writeData(
const IDataType & type,
const IColumn & column,
OffsetColumns & offset_columns,
bool skip_offsets)
bool skip_offsets,
IDataType::SerializeBinaryBulkStatePtr & serialization_state)
{
auto & settings = storage.context.getSettingsRef();
IDataType::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.getter = createStreamGetter(name, offset_columns, skip_offsets);
serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;
serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part != 0;
size_t size = column.size();
size_t prev_mark = 0;
while (prev_mark < size)
@ -109,25 +137,10 @@ void IMergedBlockOutputStream::writeData(
writeIntBinary(stream.plain_hashing.count(), stream.marks);
writeIntBinary(stream.compressed.offset(), stream.marks);
}, {});
}, serialize_settings.path);
}
IDataType::OutputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & substream_path) -> WriteBuffer *
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
if (is_offsets && skip_offsets)
return nullptr;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
/// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name))
return nullptr;
return &column_streams[stream_name]->compressed;
};
type.serializeBinaryBulkWithMultipleStreams(column, stream_getter, prev_mark, limit, true, {});
type.serializeBinaryBulkWithMultipleStreams(column, prev_mark, limit, serialize_settings, serialization_state);
/// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one.
type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path)
@ -143,7 +156,7 @@ void IMergedBlockOutputStream::writeData(
return;
column_streams[stream_name]->compressed.nextIfAtEnd();
}, {});
}, serialize_settings.path);
prev_mark += limit;
}
@ -157,7 +170,7 @@ void IMergedBlockOutputStream::writeData(
String stream_name = IDataType::getFileNameForStream(name, substream_path);
offset_columns.insert(stream_name);
}
}, {});
}, serialize_settings.path);
}
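
The write side gains the matching lifecycle: serializeBinaryBulkStatePrefix once per column before the first block, serializeBinaryBulkWithMultipleStreams for every block, and serializeBinaryBulkStateSuffix when the part is finalized. A hedged standalone sketch of that ordering; the State type and helpers are illustrative, not the ClickHouse API.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct SerializeState
{
    size_t rows_written = 0;
    bool finished = false;
};

void serializePrefix(SerializeState &) { /* e.g. write a dictionary header */ }
void serializeBulk(SerializeState & state, size_t rows) { state.rows_written += rows; }
void serializeSuffix(SerializeState & state) { state.finished = true; /* e.g. flush remaining keys */ }

int main()
{
    std::vector<std::string> columns = {"city", "user"};
    std::vector<SerializeState> states(columns.size());

    for (auto & state : states)
        serializePrefix(state);             // once, before the first block is written

    for (int block = 0; block < 4; ++block) // writeImpl is called once per block
        for (auto & state : states)
            serializeBulk(state, 8192);

    for (auto & state : states)
        serializeSuffix(state);             // writeSuffixAndFinalizePart / writeSuffixAndGetChecksums

    std::cout << states[0].rows_written << " rows, finished=" << states[0].finished << '\n';
}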
@ -284,6 +297,19 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
const NamesAndTypesList * total_column_list,
MergeTreeData::DataPart::Checksums * additional_column_checksums)
{
/// Finish columns serialization.
auto & settings = storage.context.getSettingsRef();
IDataType::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;
serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part != 0;
OffsetColumns offset_columns;
auto it = columns_list.begin();
for (size_t i = 0; i < columns_list.size(); ++i, ++it)
{
serialize_settings.getter = createStreamGetter(it->name, offset_columns, false);
it->type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[i]);
}
if (!total_column_list)
total_column_list = &columns_list;
@ -395,28 +421,44 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
index_columns[i] = primary_columns[i].column->cloneEmpty();
}
/// Now write the data.
for (const auto & it : columns_list)
if (serialization_states.empty())
{
const ColumnWithTypeAndName & column = block.getByName(it.name);
serialization_states.reserve(columns_list.size());
OffsetColumns tmp_offset_columns;
IDataType::SerializeBinaryBulkSettings settings;
for (const auto & col : columns_list)
{
settings.getter = createStreamGetter(col.name, tmp_offset_columns, false);
serialization_states.emplace_back(nullptr);
col.type->serializeBinaryBulkStatePrefix(settings, serialization_states.back());
}
}
/// Now write the data.
auto it = columns_list.begin();
for (size_t i = 0; i < columns_list.size(); ++i, ++it)
{
const ColumnWithTypeAndName & column = block.getByName(it->name);
if (permutation)
{
auto primary_column_it = primary_columns_name_to_position.find(it.name);
auto primary_column_it = primary_columns_name_to_position.find(it->name);
if (primary_columns_name_to_position.end() != primary_column_it)
{
writeData(column.name, *column.type, *primary_columns[primary_column_it->second].column, offset_columns, false);
auto & primary_column = *primary_columns[primary_column_it->second].column;
writeData(column.name, *column.type, primary_column, offset_columns, false, serialization_states[i]);
}
else
{
/// We rearrange the columns that are not included in the primary key here; Then the result is released - to save RAM.
ColumnPtr permutted_column = column.column->permute(*permutation, 0);
writeData(column.name, *column.type, *permutted_column, offset_columns, false);
ColumnPtr permuted_column = column.column->permute(*permutation, 0);
writeData(column.name, *column.type, *permuted_column, offset_columns, false, serialization_states[i]);
}
}
else
{
writeData(column.name, *column.type, *column.column, offset_columns, false);
writeData(column.name, *column.type, *column.column, offset_columns, false, serialization_states[i]);
}
}
@ -470,11 +512,21 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
if (!initialized)
{
column_streams.clear();
serialization_states.clear();
serialization_states.reserve(block.columns());
OffsetColumns tmp_offset_columns;
IDataType::SerializeBinaryBulkSettings settings;
for (size_t i = 0; i < block.columns(); ++i)
{
addStreams(part_path, block.safeGetByPosition(i).name,
*block.safeGetByPosition(i).type, 0, skip_offsets);
const auto & col = block.safeGetByPosition(i);
addStreams(part_path, col.name, *col.type, 0, skip_offsets);
serialization_states.emplace_back(nullptr);
settings.getter = createStreamGetter(col.name, tmp_offset_columns, false);
col.type->serializeBinaryBulkStatePrefix(settings, serialization_states.back());
}
initialized = true;
}
@ -484,7 +536,7 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
for (size_t i = 0; i < block.columns(); ++i)
{
const ColumnWithTypeAndName & column = block.safeGetByPosition(i);
writeData(column.name, *column.type, *column.column, offset_columns, skip_offsets);
writeData(column.name, *column.type, *column.column, offset_columns, skip_offsets, serialization_states[i]);
}
size_t written_for_last_mark = (storage.index_granularity - index_offset + rows) % storage.index_granularity;
@ -498,6 +550,19 @@ void MergedColumnOnlyOutputStream::writeSuffix()
MergeTreeData::DataPart::Checksums MergedColumnOnlyOutputStream::writeSuffixAndGetChecksums()
{
/// Finish columns serialization.
auto & settings = storage.context.getSettingsRef();
IDataType::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;
serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part != 0;
OffsetColumns offset_columns;
for (size_t i = 0; i < header.columns(); ++i)
{
auto & column = header.safeGetByPosition(i);
serialize_settings.getter = createStreamGetter(column.name, offset_columns, skip_offsets);
column.type->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[i]);
}
MergeTreeData::DataPart::Checksums checksums;
for (auto & column_stream : column_streams)
@ -510,6 +575,7 @@ MergeTreeData::DataPart::Checksums MergedColumnOnlyOutputStream::writeSuffixAndG
}
column_streams.clear();
serialization_states.clear();
initialized = false;
return checksums;

View File

@ -25,6 +25,8 @@ public:
protected:
using OffsetColumns = std::set<std::string>;
using SerializationState = IDataType::SerializeBinaryBulkStatePtr;
using SerializationStates = std::vector<SerializationState>;
struct ColumnStream
{
@ -64,8 +66,12 @@ protected:
void addStreams(const String & path, const String & name, const IDataType & type, size_t estimated_size, bool skip_offsets);
IDataType::OutputStreamGetter createStreamGetter(const String & name, OffsetColumns & offset_columns, bool skip_offsets);
/// Write data of one column.
void writeData(const String & name, const IDataType & type, const IColumn & column, OffsetColumns & offset_columns, bool skip_offsets);
void writeData(const String & name, const IDataType & type, const IColumn & column, OffsetColumns & offset_columns,
bool skip_offsets, IDataType::SerializeBinaryBulkStatePtr & serialization_state);
MergeTreeData & storage;
@ -132,6 +138,7 @@ private:
private:
NamesAndTypesList columns_list;
SerializationStates serialization_states;
String part_path;
size_t rows_count = 0;
@ -158,6 +165,7 @@ public:
private:
Block header;
SerializationStates serialization_states;
String part_path;
bool initialized = false;

View File

@ -249,6 +249,8 @@ MergeTreeData::DataPart::Checksums checkDataPart(
while (true)
{
IDataType::DeserializeBinaryBulkSettings settings;
/// Check that mark points to current position in file.
bool marks_eof = false;
name_type.type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path)
@ -270,7 +272,7 @@ MergeTreeData::DataPart::Checksums checkDataPart(
+ ", mrk file offset: " + toString(stream.mrk_hashing_buf.count()));
throw;
}
}, {});
}, settings.path);
++mark_num;
@ -278,18 +280,18 @@ MergeTreeData::DataPart::Checksums checkDataPart(
/// NOTE Shared array sizes of Nested columns are read more than once. That's Ok.
MutableColumnPtr tmp_column = name_type.type->createColumn();
name_type.type->deserializeBinaryBulkWithMultipleStreams(
*tmp_column,
[&](const IDataType::SubstreamPath & substream_path)
{
String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);
auto stream_it = streams.find(file_name);
if (stream_it == streams.end())
throw Exception("Logical error: cannot find stream " + file_name);
return &stream_it->second.uncompressed_hashing_buf;
},
index_granularity,
0, true, {});
settings.getter = [&](const IDataType::SubstreamPath & substream_path)
{
String file_name = IDataType::getFileNameForStream(name_type.name, substream_path);
auto stream_it = streams.find(file_name);
if (stream_it == streams.end())
throw Exception("Logical error: cannot find stream " + file_name);
return &stream_it->second.uncompressed_hashing_buf;
};
IDataType::DeserializeBinaryBulkStatePtr state;
name_type.type->deserializeBinaryBulkStatePrefix(settings, state);
name_type.type->deserializeBinaryBulkWithMultipleStreams(*tmp_column, index_granularity, settings, state);
size_t read_size = tmp_column->size();
column_size += read_size;

View File

@ -76,6 +76,7 @@ protected:
Block readImpl() override;
private:
size_t block_size;
NamesAndTypesList columns;
StorageLog & storage;
@ -101,6 +102,10 @@ private:
using FileStreams = std::map<std::string, Stream>;
FileStreams streams;
using DeserializeState = IDataType::DeserializeBinaryBulkStatePtr;
using DeserializeStates = std::map<String, DeserializeState>;
DeserializeStates deserialize_states;
void readData(const String & name, const IDataType & type, IColumn & column, size_t max_rows_to_read);
};
@ -168,6 +173,12 @@ private:
WriteBufferFromFile marks_stream; /// Declared below `lock` to make the file open when rwlock is captured.
using SerializeState = IDataType::SerializeBinaryBulkStatePtr;
using SerializeStates = std::map<String, SerializeState>;
SerializeStates serialize_states;
IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenStreams & written_streams);
void writeData(const String & name, const IDataType & type, const IColumn & column,
MarksForColumns & out_marks,
WrittenStreams & written_streams);
@ -226,25 +237,36 @@ Block LogBlockInputStream::readImpl()
void LogBlockInputStream::readData(const String & name, const IDataType & type, IColumn & column, size_t max_rows_to_read)
{
IDataType::InputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & path) -> ReadBuffer *
IDataType::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
auto createStreamGetter = [&](bool stream_for_prefix)
{
String stream_name = IDataType::getFileNameForStream(name, path);
return [&] (const IDataType::SubstreamPath & path) -> ReadBuffer *
{
String stream_name = IDataType::getFileNameForStream(name, path);
const auto & file_it = storage.files.find(stream_name);
if (storage.files.end() == file_it)
throw Exception("Logical error: no information about file " + stream_name + " in StorageLog", ErrorCodes::LOGICAL_ERROR);
const auto & file_it = storage.files.find(stream_name);
if (storage.files.end() == file_it)
throw Exception("Logical error: no information about file " + stream_name + " in StorageLog", ErrorCodes::LOGICAL_ERROR);
auto it = streams.try_emplace(stream_name,
file_it->second.data_file.path(),
mark_number
? file_it->second.marks[mark_number].offset
: 0,
max_read_buffer_size).first;
UInt64 offset = 0;
if (!stream_for_prefix && mark_number)
offset = file_it->second.marks[mark_number].offset;
return &it->second.compressed;
auto & data_file_path = file_it->second.data_file.path();
auto it = streams.try_emplace(stream_name, data_file_path, offset, max_read_buffer_size).first;
return &it->second.compressed;
};
};
type.deserializeBinaryBulkWithMultipleStreams(column, stream_getter, max_rows_to_read, 0, true, {}); /// TODO Use avg_value_size_hint.
if (deserialize_states.count(name) == 0)
{
settings.getter = createStreamGetter(true);
type.deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]);
}
settings.getter = createStreamGetter(false);
type.deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, settings, deserialize_states[name]);
}
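
LogBlockInputStream::readData above applies the same convention as MergeTreeReader::readData: the first read of a column opens its streams at offset 0 to consume the serialization prefix (stream_for_prefix), caches the resulting state in deserialize_states, and only subsequent reads start from the requested mark offset. TinyLog below follows suit.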
@ -274,6 +296,18 @@ void LogBlockOutputStream::writeSuffix()
return;
done = true;
WrittenStreams written_streams;
IDataType::SerializeBinaryBulkSettings settings;
for (const auto & column : getHeader())
{
auto it = serialize_states.find(column.name);
if (it != serialize_states.end())
{
settings.getter = createStreamGetter(column.name, written_streams);
column.type->serializeBinaryBulkStateSuffix(settings, it->second);
}
}
/// Finish write.
marks_stream.next();
@ -291,27 +325,10 @@ void LogBlockOutputStream::writeSuffix()
}
void LogBlockOutputStream::writeData(const String & name, const IDataType & type, const IColumn & column,
MarksForColumns & out_marks,
WrittenStreams & written_streams)
IDataType::OutputStreamGetter LogBlockOutputStream::createStreamGetter(const String & name,
WrittenStreams & written_streams)
{
type.enumerateStreams([&] (const IDataType::SubstreamPath & path)
{
String stream_name = IDataType::getFileNameForStream(name, path);
if (written_streams.count(stream_name))
return;
const auto & file = storage.files[stream_name];
const auto stream_it = streams.try_emplace(stream_name, storage.files[stream_name].data_file.path(), storage.max_compress_block_size).first;
Mark mark;
mark.rows = (file.marks.empty() ? 0 : file.marks.back().rows) + column.size();
mark.offset = stream_it->second.plain_offset + stream_it->second.plain.count();
out_marks.emplace_back(file.column_index, mark);
}, {});
IDataType::OutputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & path) -> WriteBuffer *
return [&] (const IDataType::SubstreamPath & path) -> WriteBuffer *
{
String stream_name = IDataType::getFileNameForStream(name, path);
if (written_streams.count(stream_name))
@ -319,11 +336,50 @@ void LogBlockOutputStream::writeData(const String & name, const IDataType & type
auto it = streams.find(stream_name);
if (streams.end() == it)
throw Exception("Logical error: stream was not created when writing data in LogBlockOutputStream", ErrorCodes::LOGICAL_ERROR);
throw Exception("Logical error: stream was not created when writing data in LogBlockOutputStream",
ErrorCodes::LOGICAL_ERROR);
return &it->second.compressed;
};
}
type.serializeBinaryBulkWithMultipleStreams(column, stream_getter, 0, 0, true, {});
void LogBlockOutputStream::writeData(const String & name, const IDataType & type, const IColumn & column,
MarksForColumns & out_marks,
WrittenStreams & written_streams)
{
IDataType::SerializeBinaryBulkSettings settings;
type.enumerateStreams([&] (const IDataType::SubstreamPath & path)
{
String stream_name = IDataType::getFileNameForStream(name, path);
if (written_streams.count(stream_name))
return;
streams.try_emplace(stream_name, storage.files[stream_name].data_file.path(), storage.max_compress_block_size);
}, settings.path);
settings.getter = createStreamGetter(name, written_streams);
if (serialize_states.count(name) == 0)
type.serializeBinaryBulkStatePrefix(settings, serialize_states[name]);
type.enumerateStreams([&] (const IDataType::SubstreamPath & path)
{
String stream_name = IDataType::getFileNameForStream(name, path);
if (written_streams.count(stream_name))
return;
const auto & file = storage.files[stream_name];
const auto stream_it = streams.find(stream_name);
Mark mark;
mark.rows = (file.marks.empty() ? 0 : file.marks.back().rows) + column.size();
mark.offset = stream_it->second.plain_offset + stream_it->second.plain.count();
out_marks.emplace_back(file.column_index, mark);
}, settings.path);
type.serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, serialize_states[name]);
type.enumerateStreams([&] (const IDataType::SubstreamPath & path)
{
@ -335,7 +391,7 @@ void LogBlockOutputStream::writeData(const String & name, const IDataType & type
if (streams.end() == it)
throw Exception("Logical error: stream was not created when writing data in LogBlockOutputStream", ErrorCodes::LOGICAL_ERROR);
it->second.compressed.next();
}, {});
}, settings.path);
}
@ -401,7 +457,8 @@ void StorageLog::addFiles(const String & column_name, const IDataType & type)
}
};
type.enumerateStreams(stream_callback, {});
IDataType::SubstreamPath path;
type.enumerateStreams(stream_callback, path);
}
@ -497,11 +554,12 @@ const StorageLog::Marks & StorageLog::getMarksWithRealRowCount() const
* If this is a data type with multiple streams, get the first stream, which we assume has the real row count.
* (Example: for the Array data type, the first stream contains array sizes; the number of array sizes is the number of arrays).
*/
IDataType::SubstreamPath path;
column_type.enumerateStreams([&](const IDataType::SubstreamPath & substream_path)
{
if (filename.empty())
filename = IDataType::getFileNameForStream(column_name, substream_path);
}, {});
}, path);
Files_t::const_iterator it = files.find(filename);
if (files.end() == it)

View File

@ -95,6 +95,10 @@ private:
using FileStreams = std::map<std::string, std::unique_ptr<Stream>>;
FileStreams streams;
using DeserializeState = IDataType::DeserializeBinaryBulkStatePtr;
using DeserializeStates = std::map<String, DeserializeState>;
DeserializeStates deserialize_states;
void readData(const String & name, const IDataType & type, IColumn & column, size_t limit);
};
@ -149,8 +153,13 @@ private:
using FileStreams = std::map<std::string, std::unique_ptr<Stream>>;
FileStreams streams;
using SerializeState = IDataType::SerializeBinaryBulkStatePtr;
using SerializeStates = std::map<String, SerializeState>;
SerializeStates serialize_states;
using WrittenStreams = std::set<std::string>;
IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenStreams & written_streams);
void writeData(const String & name, const IDataType & type, const IColumn & column, WrittenStreams & written_streams);
};
@ -206,7 +215,8 @@ Block TinyLogBlockInputStream::readImpl()
void TinyLogBlockInputStream::readData(const String & name, const IDataType & type, IColumn & column, size_t limit)
{
IDataType::InputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & path) -> ReadBuffer *
IDataType::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
settings.getter = [&] (const IDataType::SubstreamPath & path) -> ReadBuffer *
{
String stream_name = IDataType::getFileNameForStream(name, path);
@ -216,13 +226,17 @@ void TinyLogBlockInputStream::readData(const String & name, const IDataType & ty
return &streams[stream_name]->compressed;
};
type.deserializeBinaryBulkWithMultipleStreams(column, stream_getter, limit, 0, true, {}); /// TODO Use avg_value_size_hint.
if (deserialize_states.count(name) == 0)
type.deserializeBinaryBulkStatePrefix(settings, deserialize_states[name]);
type.deserializeBinaryBulkWithMultipleStreams(column, limit, settings, deserialize_states[name]);
}
void TinyLogBlockOutputStream::writeData(const String & name, const IDataType & type, const IColumn & column, WrittenStreams & written_streams)
IDataType::OutputStreamGetter TinyLogBlockOutputStream::createStreamGetter(const String & name,
WrittenStreams & written_streams)
{
IDataType::OutputStreamGetter stream_getter = [&] (const IDataType::SubstreamPath & path) -> WriteBuffer *
return [&] (const IDataType::SubstreamPath & path) -> WriteBuffer *
{
String stream_name = IDataType::getFileNameForStream(name, path);
@ -230,12 +244,23 @@ void TinyLogBlockOutputStream::writeData(const String & name, const IDataType &
return nullptr;
if (!streams.count(stream_name))
streams[stream_name] = std::make_unique<Stream>(storage.files[stream_name].data_file.path(), storage.max_compress_block_size);
streams[stream_name] = std::make_unique<Stream>(storage.files[stream_name].data_file.path(),
storage.max_compress_block_size);
return &streams[stream_name]->compressed;
};
}
type.serializeBinaryBulkWithMultipleStreams(column, stream_getter, 0, 0, true, {});
void TinyLogBlockOutputStream::writeData(const String & name, const IDataType & type, const IColumn & column, WrittenStreams & written_streams)
{
IDataType::SerializeBinaryBulkSettings settings;
settings.getter = createStreamGetter(name, written_streams);
if (serialize_states.count(name) == 0)
type.serializeBinaryBulkStatePrefix(settings, serialize_states[name]);
type.serializeBinaryBulkWithMultipleStreams(column, 0, 0, settings, serialize_states[name]);
}
@ -249,6 +274,18 @@ void TinyLogBlockOutputStream::writeSuffix()
if (streams.empty())
return;
WrittenStreams written_streams;
IDataType::SerializeBinaryBulkSettings settings;
for (const auto & column : getHeader())
{
auto it = serialize_states.find(column.name);
if (it != serialize_states.end())
{
settings.getter = createStreamGetter(column.name, written_streams);
column.type->serializeBinaryBulkStateSuffix(settings, it->second);
}
}
/// Finish write.
for (auto & stream : streams)
stream.second->finalize();
@ -324,7 +361,8 @@ void StorageTinyLog::addFiles(const String & column_name, const IDataType & type
}
};
type.enumerateStreams(stream_callback, {});
IDataType::SubstreamPath path;
type.enumerateStreams(stream_callback, path);
}

View File

@ -42,7 +42,11 @@ void StorageSystemColumns::fillData(MutableColumns & res_columns, const Context
/// Add `database` column.
MutableColumnPtr database_column_mut = ColumnString::create();
for (const auto & database : databases)
database_column_mut->insert(database.first);
{
if (context.hasDatabaseAccessRights(database.first))
database_column_mut->insert(database.first);
}
block_to_filter.insert(ColumnWithTypeAndName(std::move(database_column_mut), std::make_shared<DataTypeString>(), "database"));
/// Filter block with `database` column.

View File

@ -22,10 +22,13 @@ void StorageSystemDatabases::fillData(MutableColumns & res_columns, const Contex
auto databases = context.getDatabases();
for (const auto & database : databases)
{
res_columns[0]->insert(database.first);
res_columns[1]->insert(database.second->getEngineName());
res_columns[2]->insert(database.second->getDataPath());
res_columns[3]->insert(database.second->getMetadataPath());
if (context.hasDatabaseAccessRights(database.first))
{
res_columns[0]->insert(database.first);
res_columns[1]->insert(database.second->getEngineName());
res_columns[2]->insert(database.second->getDataPath());
res_columns[3]->insert(database.second->getMetadataPath());
}
}
}
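The same guard repeats through the system-table hunks in this diff: a database contributes rows only when context.hasDatabaseAccessRights(database_name) holds for the querying user, which the integration test further below ties to the per-user <allow_databases> config. A hedged, self-contained sketch of the recurring filter shape; the Context stand-in and its whitelist semantics are illustrative, the real check lives inside ClickHouse's Context:

#include <iostream>
#include <map>
#include <set>
#include <string>

/// Illustrative stand-in: an empty whitelist is assumed to mean "no restriction".
struct Context
{
    std::set<std::string> allow_databases;
    bool hasDatabaseAccessRights(const std::string & database) const
    {
        return allow_databases.empty() || allow_databases.count(database) > 0;
    }
};

int main()
{
    Context context{{"default"}};   /// like the test_allow user in the config below
    std::map<std::string, std::string> databases{{"default", "Ordinary"}, {"db1", "Ordinary"}};

    /// The pattern shared by system.databases, system.columns, system.merges,
    /// system.mutations, system.parts, system.replicas and system.tables:
    for (const auto & db : databases)
    {
        if (!context.hasDatabaseAccessRights(db.first))
            continue;               /// invisible to this user: emit no rows
        std::cout << db.first << '\n';
    }
}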


@ -33,23 +33,26 @@ void StorageSystemMerges::fillData(MutableColumns & res_columns, const Context &
{
for (const auto & merge : context.getMergeList().get())
{
size_t i = 0;
res_columns[i++]->insert(merge.database);
res_columns[i++]->insert(merge.table);
res_columns[i++]->insert(merge.elapsed);
res_columns[i++]->insert(merge.progress);
res_columns[i++]->insert(merge.num_parts);
res_columns[i++]->insert(merge.source_part_names);
res_columns[i++]->insert(merge.result_part_name);
res_columns[i++]->insert(merge.total_size_bytes_compressed);
res_columns[i++]->insert(merge.total_size_marks);
res_columns[i++]->insert(merge.bytes_read_uncompressed);
res_columns[i++]->insert(merge.rows_read);
res_columns[i++]->insert(merge.bytes_written_uncompressed);
res_columns[i++]->insert(merge.rows_written);
res_columns[i++]->insert(merge.columns_written);
res_columns[i++]->insert(merge.memory_usage);
res_columns[i++]->insert(merge.thread_number);
if (context.hasDatabaseAccessRights(merge.database))
{
size_t i = 0;
res_columns[i++]->insert(merge.database);
res_columns[i++]->insert(merge.table);
res_columns[i++]->insert(merge.elapsed);
res_columns[i++]->insert(merge.progress);
res_columns[i++]->insert(merge.num_parts);
res_columns[i++]->insert(merge.source_part_names);
res_columns[i++]->insert(merge.result_part_name);
res_columns[i++]->insert(merge.total_size_bytes_compressed);
res_columns[i++]->insert(merge.total_size_marks);
res_columns[i++]->insert(merge.bytes_read_uncompressed);
res_columns[i++]->insert(merge.rows_read);
res_columns[i++]->insert(merge.bytes_written_uncompressed);
res_columns[i++]->insert(merge.rows_written);
res_columns[i++]->insert(merge.columns_written);
res_columns[i++]->insert(merge.memory_usage);
res_columns[i++]->insert(merge.thread_number);
}
}
}


@ -38,12 +38,15 @@ void StorageSystemMutations::fillData(MutableColumns & res_columns, const Contex
std::map<String, std::map<String, StoragePtr>> merge_tree_tables;
for (const auto & db : context.getDatabases())
{
for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
if (context.hasDatabaseAccessRights(db.first))
{
if (dynamic_cast<const StorageMergeTree *>(iterator->table().get())
|| dynamic_cast<const StorageReplicatedMergeTree *>(iterator->table().get()))
for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
{
merge_tree_tables[db.first][iterator->name()] = iterator->table();
if (dynamic_cast<const StorageMergeTree *>(iterator->table().get())
|| dynamic_cast<const StorageReplicatedMergeTree *>(iterator->table().get()))
{
merge_tree_tables[db.first][iterator->name()] = iterator->table();
}
}
}
}


@ -59,7 +59,10 @@ public:
/// Add column 'database'.
MutableColumnPtr database_column_mut = ColumnString::create();
for (const auto & database : databases)
database_column_mut->insert(database.first);
{
if (context.hasDatabaseAccessRights(database.first))
database_column_mut->insert(database.first);
}
block_to_filter.insert(ColumnWithTypeAndName(
std::move(database_column_mut), std::make_shared<DataTypeString>(), "database"));


@ -65,9 +65,15 @@ BlockInputStreams StorageSystemReplicas::read(
/// We collect a set of replicated tables.
std::map<String, std::map<String, StoragePtr>> replicated_tables;
for (const auto & db : context.getDatabases())
for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
if (dynamic_cast<const StorageReplicatedMergeTree *>(iterator->table().get()))
replicated_tables[db.first][iterator->name()] = iterator->table();
{
if (context.hasDatabaseAccessRights(db.first))
{
for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
if (dynamic_cast<const StorageReplicatedMergeTree *>(iterator->table().get()))
replicated_tables[db.first][iterator->name()] = iterator->table();
}
}
/// Whether we need columns whose computation requires a request to ZooKeeper.
bool with_zk_fields = false;


@ -50,9 +50,15 @@ void StorageSystemReplicationQueue::fillData(MutableColumns & res_columns, const
{
std::map<String, std::map<String, StoragePtr>> replicated_tables;
for (const auto & db : context.getDatabases())
for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
if (dynamic_cast<const StorageReplicatedMergeTree *>(iterator->table().get()))
replicated_tables[db.first][iterator->name()] = iterator->table();
{
if (context.hasDatabaseAccessRights(db.first))
{
for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
if (dynamic_cast<const StorageReplicatedMergeTree *>(iterator->table().get()))
replicated_tables[db.first][iterator->name()] = iterator->table();
}
}
MutableColumnPtr col_database_mut = ColumnString::create();
MutableColumnPtr col_table_mut = ColumnString::create();


@ -95,7 +95,7 @@ BlockInputStreams StorageSystemTables::read(
auto database = context.tryGetDatabase(database_name);
if (!database)
if (!database || !context.hasDatabaseAccessRights(database_name))
{
/// Database was deleted just now.
continue;


@ -11,11 +11,11 @@ class Client:
self.command = [command, '--host', self.host, '--port', str(self.port), '--stacktrace']
def query(self, sql, stdin=None, timeout=None, settings=None):
return self.get_query_request(sql, stdin=stdin, timeout=timeout, settings=settings).get_answer()
def query(self, sql, stdin=None, timeout=None, settings=None, user=None):
return self.get_query_request(sql, stdin=stdin, timeout=timeout, settings=settings, user=user).get_answer()
def get_query_request(self, sql, stdin=None, timeout=None, settings=None):
def get_query_request(self, sql, stdin=None, timeout=None, settings=None, user=None):
command = self.command[:]
if stdin is None:
@ -28,6 +28,9 @@ class Client:
for setting, value in settings.iteritems():
command += ['--' + setting, str(value)]
if user is not None:
command += ['--user', user]
return CommandRequest(command, stdin, timeout)


@ -0,0 +1,20 @@
<yandex>
<users>
<default>
<password></password>
<profile>default</profile>
<quota>default</quota>
</default>
<test_allow>
<password></password>
<profile>default</profile>
<quota>default</quota>
<networks>
<ip>::/0</ip>
</networks>
<allow_databases>
<database>default</database>
</allow_databases>
</test_allow>
</users>
</yandex>


@ -8,6 +8,7 @@ node1 = cluster.add_instance('node1', main_configs=['configs/config_no_substs.xm
node2 = cluster.add_instance('node2', main_configs=['configs/config_env.xml'], env_variables={"MAX_QUERY_SIZE": "55555"})
node3 = cluster.add_instance('node3', main_configs=['configs/config_zk.xml'], with_zookeeper=True)
node4 = cluster.add_instance('node4', main_configs=['configs/config_incl.xml', 'configs/max_query_size.xml']) # include value 77777
node5 = cluster.add_instance('node5', main_configs=['configs/config_allow_databases.xml'])
@pytest.fixture(scope="module")
def start_cluster():
@ -26,3 +27,19 @@ def test_config(start_cluster):
assert node2.query("select value from system.settings where name = 'max_query_size'") == "55555\n"
assert node3.query("select value from system.settings where name = 'max_query_size'") == "77777\n"
assert node4.query("select value from system.settings where name = 'max_query_size'") == "99999\n"
def test_allow_databases(start_cluster):
node5.query("CREATE DATABASE db1")
node5.query("CREATE TABLE db1.test_table(date Date, k1 String, v1 Int32) ENGINE = MergeTree(date, (k1, date), 8192)")
node5.query("INSERT INTO db1.test_table VALUES('2000-01-01', 'test_key', 1)")
assert node5.query("SELECT name FROM system.databases WHERE name = 'db1'") == "db1\n"
assert node5.query("SELECT name FROM system.tables WHERE database = 'db1' AND name = 'test_table' ") == "test_table\n"
assert node5.query("SELECT name FROM system.columns WHERE database = 'db1' AND table = 'test_table'") == "date\nk1\nv1\n"
assert node5.query("SELECT name FROM system.parts WHERE database = 'db1' AND table = 'test_table'") == "20000101_20000101_1_1_0\n"
assert node5.query("SELECT name FROM system.parts_columns WHERE database = 'db1' AND table = 'test_table'") == "20000101_20000101_1_1_0\n20000101_20000101_1_1_0\n20000101_20000101_1_1_0\n"
assert node5.query("SELECT name FROM system.databases WHERE name = 'db1'", user="test_allow") == "\n"
assert node5.query("SELECT name FROM system.tables WHERE database = 'db1' AND name = 'test_table'", user="test_allow") == "\n"
assert node5.query("SELECT name FROM system.columns WHERE database = 'db1' AND table = 'test_table'", user="test_allow") == "\n"
assert node5.query("SELECT name FROM system.parts WHERE database = 'db1' AND table = 'test_table'", user="test_allow") == "\n"
assert node5.query("SELECT name FROM system.parts_columns WHERE database = 'db1' AND table = 'test_table'", user="test_allow") == "\n"


@ -1 +1,16 @@
SELECT DISTINCT eq FROM (WITH range(number % 10) AS arr, arrayMap(x -> x = intDiv(number, 10) ? nan : x, arr) AS arr_with_nan, arrayFilter(x -> x != intDiv(number, 10), arr) AS arr_filtered SELECT number, arrayReduce('quantileExact', arr_with_nan) AS q1, arrayReduce('quantileExact', arr_filtered) AS q2, q1 = q2 AS eq FROM numbers(100));
SELECT DISTINCT
eq
FROM
(
WITH
range(2 + number % 10) AS arr, -- at least two elements, to avoid a nan result
arrayMap(x -> x = intDiv(number, 10) ? nan : x, arr) AS arr_with_nan,
arrayFilter(x -> x != intDiv(number, 10), arr) AS arr_filtered
SELECT
number,
arrayReduce('quantileExact', arr_with_nan) AS q1,
arrayReduce('quantileExact', arr_filtered) AS q2,
q1 = q2 AS eq
FROM
numbers(100)
);
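The rewritten test pins down an invariant rather than a single value: for every row, quantileExact over an array containing a nan must equal quantileExact over the same array with that element filtered out, and each array carries at least two real elements so the result itself cannot be nan. A simplified, self-contained model of that invariant; ClickHouse's actual quantileExact implementation differs in detail:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

/// Simplified model: an exact quantile that ignores NaN inputs. Assumes the
/// filtered array is non-empty, which the "2 + number % 10" change guarantees.
double quantileExactIgnoringNaN(std::vector<double> xs, double level = 0.5)
{
    xs.erase(std::remove_if(xs.begin(), xs.end(),
                            [](double x) { return std::isnan(x); }),
             xs.end());
    std::sort(xs.begin(), xs.end());
    return xs[static_cast<size_t>(level * (xs.size() - 1))];
}

int main()
{
    std::vector<double> with_nan{0, 1, NAN, 3, 4};
    std::vector<double> filtered{0, 1, 3, 4};
    /// The invariant the test checks for every generated row: q1 == q2.
    std::cout << (quantileExactIgnoringNaN(with_nan) == quantileExactIgnoringNaN(filtered)) << '\n';
}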


@ -10,3 +10,4 @@
['123']
[['1212'],['sef'],['343r4']]
[(1,2),(1,3),(1,5)]
0


@ -19,3 +19,4 @@ DROP TABLE arrayDistinct_test;
SELECT arrayDistinct([['1212'], ['sef'], ['343r4'], ['1212']]);
SELECT arrayDistinct([(1, 2), (1, 3), (1, 2), (1, 2), (1, 2), (1, 5)]);
SELECT length(arrayDistinct([NULL, NULL, NULL]));
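The new test line documents that arrayDistinct drops NULLs entirely, so an all-NULL input yields an empty array, matching the 0 added to the reference output above. A simplified, self-contained model of that behavior; illustrative only, not ClickHouse's implementation:

#include <iostream>
#include <optional>
#include <set>
#include <vector>

/// Simplified model: NULLs never reach the result, and each remaining value
/// appears once (the order of the output is an implementation detail).
std::vector<int> arrayDistinct(const std::vector<std::optional<int>> & xs)
{
    std::set<int> seen;
    std::vector<int> res;
    for (const auto & x : xs)
        if (x && seen.insert(*x).second)
            res.push_back(*x);
    return res;
}

int main()
{
    std::cout << arrayDistinct({std::nullopt, std::nullopt, std::nullopt}).size() << '\n';  /// 0
    std::cout << arrayDistinct({1, 2, std::nullopt, 2}).size() << '\n';                     /// 2
}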


@ -0,0 +1,6 @@
1
2
1
2
3
4


@ -0,0 +1,21 @@
DROP TABLE IF EXISTS test.test;
CREATE TABLE test.test(val Int64) engine = Memory;
INSERT INTO test.test VALUES (1);
INSERT INTO test.test VALUES (2);
INSERT INTO test.test VALUES (3);
INSERT INTO test.test VALUES (4);
INSERT INTO test.test VALUES (5);
INSERT INTO test.test VALUES (6);
INSERT INTO test.test VALUES (7);
INSERT INTO test.test VALUES (8);
INSERT INTO test.test VALUES (9);
SELECT TOP 2 * FROM test.test ORDER BY val;
SELECT TOP (2) * FROM test.test ORDER BY val;
SELECT * FROM test.test ORDER BY val LIMIT 2 OFFSET 2;
SELECT TOP 2 * FROM test.test ORDER BY val LIMIT 2; -- { clientError 406 }
SELECT * FROM test.test ORDER BY val LIMIT 2,3 OFFSET 2; -- { clientError 62 }
DROP TABLE test.test;
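These cases exercise the new TOP syntax: TOP n and TOP (n) behave as LIMIT n, mixing TOP with an explicit LIMIT is rejected with client error 406, and LIMIT 2,3 OFFSET 2 stays a plain syntax error (62). A hedged sketch of how TOP can be folded into the existing limit slot at parse time; the structure and names are illustrative, not the actual ClickHouse parser:

#include <iostream>
#include <optional>
#include <stdexcept>

/// Illustrative query skeleton: TOP and LIMIT fill the same slot.
struct Select
{
    std::optional<int> limit_length;
};

void applyTop(Select & select, int n)
{
    select.limit_length = n;   /// TOP n and TOP (n) both land here
}

void applyLimit(Select & select, int n)
{
    if (select.limit_length)   /// already set by TOP: reject, as clientError 406 above
        throw std::runtime_error("Can not use TOP and LIMIT together");
    select.limit_length = n;
}

int main()
{
    Select ok;
    applyTop(ok, 2);
    std::cout << *ok.limit_length << '\n';   /// behaves like LIMIT 2

    Select bad;
    applyTop(bad, 2);
    try { applyLimit(bad, 2); }
    catch (const std::exception & e) { std::cout << e.what() << '\n'; }
}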


@ -0,0 +1,4 @@
1
1
0
1


@ -0,0 +1,4 @@
select CAST(toLowCardinality(val) as UInt64) from (select arrayJoin(['1']) as val);
select toUInt64(toLowCardinality(val)) from (select arrayJoin(['1']) as val);
select 1 % toLowCardinality(val) from (select arrayJoin([1]) as val);
select gcd(1, toLowCardinality(val)) from (select arrayJoin([1]) as val);
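These queries verify that casts and ordinary functions accept LowCardinality arguments by operating on the underlying value type. A toy, self-contained model of that idea; illustrative only, not ClickHouse's dictionary-column machinery:

#include <iostream>
#include <string>
#include <vector>

/// Toy model of a low-cardinality column: distinct values plus an index array.
struct LowCardinality
{
    std::vector<std::string> dictionary;
    std::vector<size_t> indexes;
};

/// To feed an ordinary function, materialize values through the dictionary;
/// conceptually this is the "unwrap to the nested type" step the tests rely on.
std::vector<std::string> materialize(const LowCardinality & col)
{
    std::vector<std::string> res;
    res.reserve(col.indexes.size());
    for (size_t i : col.indexes)
        res.push_back(col.dictionary[i]);
    return res;
}

int main()
{
    LowCardinality col{{"1"}, {0, 0}};
    for (const auto & v : materialize(col))
        std::cout << std::stoull(v) << '\n';   /// like toUInt64(toLowCardinality(val))
}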

Some files were not shown because too many files have changed in this diff.