2018-09-27 15:55:22 +00:00
|
|
|
#include <Columns/ColumnLowCardinality.h>
|
2018-05-03 14:53:35 +00:00
|
|
|
#include <Columns/ColumnUnique.h>
|
|
|
|
#include <Columns/ColumnFixedString.h>
|
2018-05-04 16:28:53 +00:00
|
|
|
#include <Columns/ColumnsCommon.h>
|
2018-08-02 15:16:40 +00:00
|
|
|
#include <Common/HashTable/HashMap.h>
|
2018-05-03 14:53:35 +00:00
|
|
|
#include <Common/typeid_cast.h>
|
2019-08-21 02:28:04 +00:00
|
|
|
#include <Common/assert_cast.h>
|
2019-10-04 17:46:36 +00:00
|
|
|
#include <Core/Field.h>
|
2018-05-03 14:53:35 +00:00
|
|
|
#include <Core/TypeListNumber.h>
|
2018-05-03 14:00:24 +00:00
|
|
|
#include <DataTypes/DataTypeFactory.h>
|
2018-09-27 15:55:22 +00:00
|
|
|
#include <DataTypes/DataTypeLowCardinality.h>
|
2018-05-03 14:53:35 +00:00
|
|
|
#include <DataTypes/DataTypeNullable.h>
|
|
|
|
#include <DataTypes/DataTypeDate.h>
|
|
|
|
#include <DataTypes/DataTypeDateTime.h>
|
2018-05-03 14:00:24 +00:00
|
|
|
#include <Parsers/IAST.h>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
2018-05-03 14:53:35 +00:00
|
|
|
extern const int LOGICAL_ERROR;
|
|
|
|
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
2018-05-03 14:00:24 +00:00
|
|
|
}
|
|
|
|
|
2018-05-03 15:17:10 +00:00
|
|
|
namespace
|
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
const ColumnLowCardinality & getColumnLowCardinality(const IColumn & column)
|
2018-05-03 15:17:10 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
return typeid_cast<const ColumnLowCardinality &>(column);
|
2018-05-03 15:17:10 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
ColumnLowCardinality & getColumnLowCardinality(IColumn & column)
|
2018-05-03 15:17:10 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
return typeid_cast<ColumnLowCardinality &>(column);
|
2018-05-03 15:17:10 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Creates a LowCardinality wrapper around `dictionary_type_`.
/// If the dictionary type is Nullable, the check below is applied to its nested type.
/// Throws ILLEGAL_TYPE_OF_ARGUMENT when the (nested) type reports that it
/// cannot be placed inside LowCardinality.
DataTypeLowCardinality::DataTypeLowCardinality(DataTypePtr dictionary_type_)
    : dictionary_type(std::move(dictionary_type_))
{
    /// Type that is actually validated: the dictionary type with Nullable stripped.
    const DataTypePtr checked_type = dictionary_type->isNullable()
        ? static_cast<const DataTypeNullable &>(*dictionary_type).getNestedType()
        : dictionary_type;

    if (checked_type->canBeInsideLowCardinality())
        return;

    throw Exception("DataTypeLowCardinality is supported only for numbers, strings, Date or DateTime, but got "
                    + dictionary_type->getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-05-04 16:28:53 +00:00
|
|
|
path.push_back(Substream::DictionaryKeys);
|
2018-05-03 14:53:35 +00:00
|
|
|
dictionary_type->enumerateStreams(callback, path);
|
|
|
|
path.back() = Substream::DictionaryIndexes;
|
2018-07-09 18:19:03 +00:00
|
|
|
callback(path);
|
2018-06-07 18:14:37 +00:00
|
|
|
path.pop_back();
|
2018-05-03 14:53:35 +00:00
|
|
|
}
|
|
|
|
|
2018-11-26 00:56:50 +00:00
|
|
|
struct KeysSerializationVersion
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
enum Value
|
|
|
|
{
|
2018-08-06 15:26:44 +00:00
|
|
|
/// Version is written at the start of <name.dict.bin>.
|
|
|
|
/// Dictionary is written as number N and N keys after them.
|
2018-08-07 18:32:06 +00:00
|
|
|
/// Dictionary can be shared for continuous range of granules, so some marks may point to the same position.
|
|
|
|
/// Shared dictionary is stored in state and is read once.
|
2018-08-06 15:26:44 +00:00
|
|
|
SharedDictionariesWithAdditionalKeys = 1,
|
2018-06-07 18:14:37 +00:00
|
|
|
};
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
Value value;
|
|
|
|
|
|
|
|
static void checkVersion(UInt64 version)
|
|
|
|
{
|
2018-08-06 15:26:44 +00:00
|
|
|
if (version != SharedDictionariesWithAdditionalKeys)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Invalid version for DataTypeLowCardinality key column.", ErrorCodes::LOGICAL_ERROR);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2020-03-18 03:27:32 +00:00
|
|
|
explicit KeysSerializationVersion(UInt64 version) : value(static_cast<Value>(version)) { checkVersion(version); }
|
2018-06-07 18:14:37 +00:00
|
|
|
};
|
|
|
|
|
2018-08-07 18:32:06 +00:00
|
|
|
/// Version is stored at the start of each granule. It's used to store indexes type and flags.
|
2018-06-07 18:14:37 +00:00
|
|
|
struct IndexesSerializationType
|
|
|
|
{
|
|
|
|
using SerializationType = UInt64;
|
2018-08-07 18:32:06 +00:00
|
|
|
/// Need to read dictionary if it wasn't.
|
2018-07-19 14:37:22 +00:00
|
|
|
static constexpr SerializationType NeedGlobalDictionaryBit = 1u << 8u;
|
2018-08-07 18:32:06 +00:00
|
|
|
/// Need to read additional keys. Additional keys are stored before indexes as value N and N keys after them.
|
2018-07-19 14:37:22 +00:00
|
|
|
static constexpr SerializationType HasAdditionalKeysBit = 1u << 9u;
|
2018-08-07 18:32:06 +00:00
|
|
|
/// Need to update dictionary. It means that previous granule has different dictionary.
|
2018-07-19 14:37:22 +00:00
|
|
|
static constexpr SerializationType NeedUpdateDictionary = 1u << 10u;
|
2018-05-04 19:38:41 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
enum Type
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
TUInt8 = 0,
|
|
|
|
TUInt16,
|
|
|
|
TUInt32,
|
|
|
|
TUInt64,
|
|
|
|
};
|
2018-05-04 16:28:53 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
Type type;
|
|
|
|
bool has_additional_keys;
|
|
|
|
bool need_global_dictionary;
|
2018-07-19 14:37:22 +00:00
|
|
|
bool need_update_dictionary;
|
2018-05-21 16:21:15 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
static constexpr SerializationType resetFlags(SerializationType type)
|
|
|
|
{
|
2018-07-19 14:37:22 +00:00
|
|
|
return type & (~(HasAdditionalKeysBit | NeedGlobalDictionaryBit | NeedUpdateDictionary));
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
2018-05-21 16:21:15 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
static void checkType(SerializationType type)
|
|
|
|
{
|
|
|
|
UInt64 value = resetFlags(type);
|
|
|
|
if (value <= TUInt64)
|
|
|
|
return;
|
2018-05-04 20:11:32 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Invalid type for DataTypeLowCardinality index column.", ErrorCodes::LOGICAL_ERROR);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void serialize(WriteBuffer & buffer) const
|
|
|
|
{
|
|
|
|
SerializationType val = type;
|
|
|
|
if (has_additional_keys)
|
|
|
|
val |= HasAdditionalKeysBit;
|
|
|
|
if (need_global_dictionary)
|
|
|
|
val |= NeedGlobalDictionaryBit;
|
2018-07-19 14:37:22 +00:00
|
|
|
if (need_update_dictionary)
|
|
|
|
val |= NeedUpdateDictionary;
|
2018-06-07 18:14:37 +00:00
|
|
|
writeIntBinary(val, buffer);
|
|
|
|
}
|
|
|
|
|
|
|
|
void deserialize(ReadBuffer & buffer)
|
|
|
|
{
|
|
|
|
SerializationType val;
|
|
|
|
readIntBinary(val, buffer);
|
|
|
|
checkType(val);
|
|
|
|
has_additional_keys = (val & HasAdditionalKeysBit) != 0;
|
|
|
|
need_global_dictionary = (val & NeedGlobalDictionaryBit) != 0;
|
2018-07-19 14:37:22 +00:00
|
|
|
need_update_dictionary = (val & NeedUpdateDictionary) != 0;
|
2018-06-07 18:14:37 +00:00
|
|
|
type = static_cast<Type>(resetFlags(val));
|
|
|
|
}
|
|
|
|
|
2018-07-19 14:37:22 +00:00
|
|
|
IndexesSerializationType(const IColumn & column,
|
2019-08-03 11:02:40 +00:00
|
|
|
bool has_additional_keys_,
|
|
|
|
bool need_global_dictionary_,
|
2018-07-19 14:37:22 +00:00
|
|
|
bool enumerate_dictionaries)
|
2019-08-03 11:02:40 +00:00
|
|
|
: has_additional_keys(has_additional_keys_)
|
|
|
|
, need_global_dictionary(need_global_dictionary_)
|
2018-07-19 14:37:22 +00:00
|
|
|
, need_update_dictionary(enumerate_dictionaries)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
if (typeid_cast<const ColumnUInt8 *>(&column))
|
2018-06-07 18:14:37 +00:00
|
|
|
type = TUInt8;
|
2018-07-09 18:19:03 +00:00
|
|
|
else if (typeid_cast<const ColumnUInt16 *>(&column))
|
2018-06-07 18:14:37 +00:00
|
|
|
type = TUInt16;
|
2018-07-09 18:19:03 +00:00
|
|
|
else if (typeid_cast<const ColumnUInt32 *>(&column))
|
2018-06-07 18:14:37 +00:00
|
|
|
type = TUInt32;
|
2018-07-09 18:19:03 +00:00
|
|
|
else if (typeid_cast<const ColumnUInt64 *>(&column))
|
2018-06-07 18:14:37 +00:00
|
|
|
type = TUInt64;
|
|
|
|
else
|
2018-07-09 18:19:03 +00:00
|
|
|
throw Exception("Invalid Indexes column for IndexesSerializationType. Expected ColumnUInt*, got "
|
|
|
|
+ column.getName(), ErrorCodes::LOGICAL_ERROR);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
DataTypePtr getDataType() const
|
|
|
|
{
|
|
|
|
if (type == TUInt8)
|
|
|
|
return std::make_shared<DataTypeUInt8>();
|
|
|
|
if (type == TUInt16)
|
|
|
|
return std::make_shared<DataTypeUInt16>();
|
|
|
|
if (type == TUInt32)
|
|
|
|
return std::make_shared<DataTypeUInt32>();
|
|
|
|
if (type == TUInt64)
|
|
|
|
return std::make_shared<DataTypeUInt64>();
|
|
|
|
|
|
|
|
throw Exception("Can't create DataType from IndexesSerializationType.", ErrorCodes::LOGICAL_ERROR);
|
2018-05-04 16:28:53 +00:00
|
|
|
}
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
IndexesSerializationType() = default;
|
|
|
|
};
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
struct SerializeStateLowCardinality : public IDataType::SerializeBinaryBulkState
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
|
|
|
KeysSerializationVersion key_version;
|
2018-08-07 18:32:06 +00:00
|
|
|
MutableColumnUniquePtr shared_dictionary;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
explicit SerializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {}
|
2018-06-07 18:14:37 +00:00
|
|
|
};
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
struct DeserializeStateLowCardinality : public IDataType::DeserializeBinaryBulkState
|
2018-05-21 16:21:15 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
KeysSerializationVersion key_version;
|
|
|
|
ColumnUniquePtr global_dictionary;
|
2018-05-21 16:21:15 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
IndexesSerializationType index_type;
|
2018-07-09 18:19:03 +00:00
|
|
|
ColumnPtr additional_keys;
|
2018-09-19 14:34:29 +00:00
|
|
|
ColumnPtr null_map;
|
2018-06-07 18:14:37 +00:00
|
|
|
UInt64 num_pending_rows = 0;
|
|
|
|
|
2019-03-29 14:06:46 +00:00
|
|
|
/// If dictionary should be updated.
|
|
|
|
/// Can happen is some granules was skipped while reading from MergeTree.
|
2019-03-29 15:07:53 +00:00
|
|
|
/// We should store this flag in State because
|
|
|
|
/// in case of long block of empty arrays we may not need read dictionary at first reading.
|
2019-03-29 14:06:46 +00:00
|
|
|
bool need_update_dictionary = false;
|
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
explicit DeserializeStateLowCardinality(UInt64 key_version_) : key_version(key_version_) {}
|
2018-05-21 16:21:15 +00:00
|
|
|
};
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Returns `state` downcast to SerializeStateLowCardinality.
/// Throws LOGICAL_ERROR if `state` is empty or holds an object of another type.
static SerializeStateLowCardinality * checkAndGetLowCardinalitySerializeState(
    IDataType::SerializeBinaryBulkStatePtr & state)
{
    if (!state)
        throw Exception("Got empty state for DataTypeLowCardinality.", ErrorCodes::LOGICAL_ERROR);

    if (auto * typed_state = typeid_cast<SerializeStateLowCardinality *>(state.get()))
        return typed_state;

    /// Bind to a reference first so that typeid() is applied to an lvalue, not to an expression.
    auto & actual_state = *state;
    throw Exception("Invalid SerializeBinaryBulkState for DataTypeLowCardinality. Expected: "
                    + demangle(typeid(SerializeStateLowCardinality).name()) + ", got "
                    + demangle(typeid(actual_state).name()), ErrorCodes::LOGICAL_ERROR);
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Returns `state` downcast to DeserializeStateLowCardinality.
/// Throws LOGICAL_ERROR if `state` is empty or holds an object of another type.
static DeserializeStateLowCardinality * checkAndGetLowCardinalityDeserializeState(
    IDataType::DeserializeBinaryBulkStatePtr & state)
{
    if (!state)
        throw Exception("Got empty state for DataTypeLowCardinality.", ErrorCodes::LOGICAL_ERROR);

    if (auto * typed_state = typeid_cast<DeserializeStateLowCardinality *>(state.get()))
        return typed_state;

    /// Bind to a reference first so that typeid() is applied to an lvalue, not to an expression.
    auto & actual_state = *state;
    throw Exception("Invalid DeserializeBinaryBulkState for DataTypeLowCardinality. Expected: "
                    + demangle(typeid(DeserializeStateLowCardinality).name()) + ", got "
                    + demangle(typeid(actual_state).name()), ErrorCodes::LOGICAL_ERROR);
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::serializeBinaryBulkStatePrefix(
|
2018-06-07 18:14:37 +00:00
|
|
|
SerializeBinaryBulkSettings & settings,
|
|
|
|
SerializeBinaryBulkStatePtr & state) const
|
|
|
|
{
|
|
|
|
settings.path.push_back(Substream::DictionaryKeys);
|
|
|
|
auto * stream = settings.getter(settings.path);
|
|
|
|
settings.path.pop_back();
|
|
|
|
|
|
|
|
if (!stream)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Got empty stream in DataTypeLowCardinality::serializeBinaryBulkStatePrefix",
|
2018-06-07 18:14:37 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
/// Write version and create SerializeBinaryBulkState.
|
2018-08-06 15:26:44 +00:00
|
|
|
UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
|
|
|
writeIntBinary(key_version, *stream);
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
state = std::make_shared<SerializeStateLowCardinality>(key_version);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::serializeBinaryBulkStateSuffix(
|
2018-06-07 18:14:37 +00:00
|
|
|
SerializeBinaryBulkSettings & settings,
|
|
|
|
SerializeBinaryBulkStatePtr & state) const
|
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state);
|
|
|
|
KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
if (low_cardinality_state->shared_dictionary && settings.low_cardinality_max_dictionary_size)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
auto nested_column = low_cardinality_state->shared_dictionary->getNestedNotNullableColumn();
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
settings.path.push_back(Substream::DictionaryKeys);
|
|
|
|
auto * stream = settings.getter(settings.path);
|
|
|
|
settings.path.pop_back();
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
if (!stream)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Got empty stream in DataTypeLowCardinality::serializeBinaryBulkStateSuffix",
|
2018-07-09 18:19:03 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
UInt64 num_keys = nested_column->size();
|
|
|
|
writeIntBinary(num_keys, *stream);
|
|
|
|
removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *stream, 0, num_keys);
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->shared_dictionary = nullptr;
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::deserializeBinaryBulkStatePrefix(
|
2018-06-07 18:14:37 +00:00
|
|
|
DeserializeBinaryBulkSettings & settings,
|
|
|
|
DeserializeBinaryBulkStatePtr & state) const
|
|
|
|
{
|
|
|
|
settings.path.push_back(Substream::DictionaryKeys);
|
|
|
|
auto * stream = settings.getter(settings.path);
|
|
|
|
settings.path.pop_back();
|
|
|
|
|
|
|
|
if (!stream)
|
2018-09-19 10:27:46 +00:00
|
|
|
return;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
|
|
|
UInt64 keys_version;
|
|
|
|
readIntBinary(keys_version, *stream);
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
state = std::make_shared<DeserializeStateLowCardinality>(keys_version);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
template <typename T>
|
|
|
|
PaddedPODArray<T> * getIndexesData(IColumn & indexes)
|
|
|
|
{
|
|
|
|
auto * column = typeid_cast<ColumnVector<T> *>(&indexes);
|
|
|
|
if (column)
|
|
|
|
return &column->getData();
|
|
|
|
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
struct IndexMapsWithAdditionalKeys
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
MutableColumnPtr dictionary_map;
|
|
|
|
MutableColumnPtr additional_keys_map;
|
|
|
|
};
|
2018-06-07 18:14:37 +00:00
|
|
|
|
|
|
|
template <typename T>
|
2018-08-06 15:26:44 +00:00
|
|
|
IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeysRef(PaddedPODArray<T> & index, size_t dict_size)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
PaddedPODArray<T> copy(index.cbegin(), index.cend());
|
|
|
|
|
|
|
|
HashMap<T, T> dict_map;
|
|
|
|
HashMap<T, T> add_keys_map;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
|
|
|
for (auto val : index)
|
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
if (val < dict_size)
|
|
|
|
dict_map.insert({val, dict_map.size()});
|
|
|
|
else
|
|
|
|
add_keys_map.insert({val, add_keys_map.size()});
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
auto dictionary_map = ColumnVector<T>::create(dict_map.size());
|
|
|
|
auto additional_keys_map = ColumnVector<T>::create(add_keys_map.size());
|
|
|
|
auto & dict_data = dictionary_map->getData();
|
|
|
|
auto & add_keys_data = additional_keys_map->getData();
|
|
|
|
|
|
|
|
for (auto val : dict_map)
|
|
|
|
dict_data[val.second] = val.first;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
for (auto val : add_keys_map)
|
|
|
|
add_keys_data[val.second] = val.first - dict_size;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
|
|
|
for (auto & val : index)
|
2018-07-09 18:19:03 +00:00
|
|
|
val = val < dict_size ? dict_map[val]
|
|
|
|
: add_keys_map[val] + dict_map.size();
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
for (size_t i = 0; i < index.size(); ++i)
|
|
|
|
{
|
|
|
|
T expected = index[i] < dict_data.size() ? dict_data[index[i]]
|
|
|
|
: add_keys_data[index[i] - dict_data.size()] + dict_size;
|
|
|
|
if (expected != copy[i])
|
|
|
|
throw Exception("Expected " + toString(expected) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return {std::move(dictionary_map), std::move(additional_keys_map)};
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2018-08-06 15:26:44 +00:00
|
|
|
template <typename T>
|
|
|
|
IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(PaddedPODArray<T> & index, size_t dict_size)
|
|
|
|
{
|
|
|
|
T max_less_dict_size = 0;
|
|
|
|
T max_value = 0;
|
|
|
|
|
|
|
|
auto size = index.size();
|
|
|
|
if (size == 0)
|
|
|
|
return {ColumnVector<T>::create(), ColumnVector<T>::create()};
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
auto val = index[i];
|
|
|
|
if (val < dict_size)
|
|
|
|
max_less_dict_size = std::max(max_less_dict_size, val);
|
|
|
|
|
|
|
|
max_value = std::max(max_value, val);
|
|
|
|
}
|
|
|
|
|
|
|
|
auto map_size = UInt64(max_less_dict_size) + 1;
|
|
|
|
auto overflow_map_size = max_value >= dict_size ? (UInt64(max_value - dict_size) + 1) : 0;
|
|
|
|
PaddedPODArray<T> map(map_size, 0);
|
|
|
|
PaddedPODArray<T> overflow_map(overflow_map_size, 0);
|
|
|
|
|
|
|
|
T zero_pos_value = 0;
|
|
|
|
T zero_pos_overflowed_value = 0;
|
|
|
|
UInt64 cur_pos = 0;
|
|
|
|
UInt64 cur_overflowed_pos = 0;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
T val = index[i];
|
|
|
|
if (val < dict_size)
|
|
|
|
{
|
|
|
|
if (cur_pos == 0)
|
|
|
|
{
|
|
|
|
zero_pos_value = val;
|
|
|
|
++cur_pos;
|
|
|
|
}
|
|
|
|
else if (map[val] == 0 && val != zero_pos_value)
|
|
|
|
{
|
|
|
|
map[val] = cur_pos;
|
|
|
|
++cur_pos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
T shifted_val = val - dict_size;
|
|
|
|
if (cur_overflowed_pos == 0)
|
|
|
|
{
|
|
|
|
zero_pos_overflowed_value = shifted_val;
|
|
|
|
++cur_overflowed_pos;
|
|
|
|
}
|
|
|
|
else if (overflow_map[shifted_val] == 0 && shifted_val != zero_pos_overflowed_value)
|
|
|
|
{
|
|
|
|
overflow_map[shifted_val] = cur_overflowed_pos;
|
|
|
|
++cur_overflowed_pos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
auto dictionary_map = ColumnVector<T>::create(cur_pos);
|
|
|
|
auto additional_keys_map = ColumnVector<T>::create(cur_overflowed_pos);
|
|
|
|
auto & dict_data = dictionary_map->getData();
|
|
|
|
auto & add_keys_data = additional_keys_map->getData();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < map_size; ++i)
|
|
|
|
if (map[i])
|
|
|
|
dict_data[map[i]] = static_cast<T>(i);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < overflow_map_size; ++i)
|
|
|
|
if (overflow_map[i])
|
|
|
|
add_keys_data[overflow_map[i]] = static_cast<T>(i);
|
|
|
|
|
|
|
|
if (cur_pos)
|
|
|
|
dict_data[0] = zero_pos_value;
|
|
|
|
if (cur_overflowed_pos)
|
|
|
|
add_keys_data[0] = zero_pos_overflowed_value;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < size; ++i)
|
|
|
|
{
|
|
|
|
T & val = index[i];
|
|
|
|
if (val < dict_size)
|
|
|
|
val = map[val];
|
|
|
|
else
|
|
|
|
val = overflow_map[val - dict_size] + cur_pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
return {std::move(dictionary_map), std::move(additional_keys_map)};
|
|
|
|
}
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
/// Update column and return map with old indexes.
|
|
|
|
/// Let N is the number of distinct values which are less than max_size;
|
|
|
|
/// old_column - column before function call;
|
2018-08-06 15:26:44 +00:00
|
|
|
/// new_column - column after function call:
|
2018-07-09 18:19:03 +00:00
|
|
|
/// * if old_column[i] < max_size, than
|
2018-08-06 15:26:44 +00:00
|
|
|
/// dictionary_map[new_column[i]] = old_column[i]
|
2018-07-09 18:19:03 +00:00
|
|
|
/// * else
|
2018-08-06 15:26:44 +00:00
|
|
|
/// additional_keys_map[new_column[i]] = old_column[i] - dict_size + N
|
2018-07-09 18:19:03 +00:00
|
|
|
IndexMapsWithAdditionalKeys mapIndexWithAdditionalKeys(IColumn & column, size_t dict_size)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
|
|
|
if (auto * data_uint8 = getIndexesData<UInt8>(column))
|
2018-07-09 18:19:03 +00:00
|
|
|
return mapIndexWithAdditionalKeys(*data_uint8, dict_size);
|
2018-06-07 18:14:37 +00:00
|
|
|
else if (auto * data_uint16 = getIndexesData<UInt16>(column))
|
2018-07-09 18:19:03 +00:00
|
|
|
return mapIndexWithAdditionalKeys(*data_uint16, dict_size);
|
2018-06-07 18:14:37 +00:00
|
|
|
else if (auto * data_uint32 = getIndexesData<UInt32>(column))
|
2018-07-09 18:19:03 +00:00
|
|
|
return mapIndexWithAdditionalKeys(*data_uint32, dict_size);
|
2018-06-07 18:14:37 +00:00
|
|
|
else if (auto * data_uint64 = getIndexesData<UInt64>(column))
|
2018-07-09 18:19:03 +00:00
|
|
|
return mapIndexWithAdditionalKeys(*data_uint64, dict_size);
|
2018-06-07 18:14:37 +00:00
|
|
|
else
|
2018-07-09 18:19:03 +00:00
|
|
|
throw Exception("Indexes column for mapIndexWithAdditionalKeys must be UInt, got" + column.getName(),
|
2018-06-07 18:14:37 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::serializeBinaryBulkWithMultipleStreams(
|
2018-06-07 18:14:37 +00:00
|
|
|
const IColumn & column,
|
|
|
|
size_t offset,
|
2019-02-19 00:41:24 +00:00
|
|
|
size_t limit,
|
2018-06-07 18:14:37 +00:00
|
|
|
SerializeBinaryBulkSettings & settings,
|
|
|
|
SerializeBinaryBulkStatePtr & state) const
|
2018-05-21 16:21:15 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
settings.path.push_back(Substream::DictionaryKeys);
|
|
|
|
auto * keys_stream = settings.getter(settings.path);
|
|
|
|
settings.path.back() = Substream::DictionaryIndexes;
|
|
|
|
auto * indexes_stream = settings.getter(settings.path);
|
|
|
|
settings.path.pop_back();
|
|
|
|
|
|
|
|
if (!keys_stream && !indexes_stream)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!keys_stream)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Got empty stream for DataTypeLowCardinality keys.", ErrorCodes::LOGICAL_ERROR);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
|
|
|
if (!indexes_stream)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Got empty stream for DataTypeLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
const ColumnLowCardinality & low_cardinality_column = typeid_cast<const ColumnLowCardinality &>(column);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto * low_cardinality_state = checkAndGetLowCardinalitySerializeState(state);
|
|
|
|
auto & global_dictionary = low_cardinality_state->shared_dictionary;
|
|
|
|
KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-07-19 14:37:22 +00:00
|
|
|
bool need_update_dictionary = global_dictionary == nullptr;
|
|
|
|
if (need_update_dictionary)
|
|
|
|
global_dictionary = createColumnUnique(*dictionary_type);
|
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
size_t max_limit = column.size() - offset;
|
|
|
|
limit = limit ? std::min(limit, max_limit) : max_limit;
|
|
|
|
|
2019-01-09 12:16:03 +00:00
|
|
|
/// Do not write anything for empty column. (May happen while writing empty arrays.)
|
|
|
|
if (limit == 0)
|
|
|
|
return;
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto sub_column = low_cardinality_column.cutAndCompact(offset, limit);
|
2018-07-09 18:19:03 +00:00
|
|
|
ColumnPtr positions = sub_column->getIndexesPtr();
|
|
|
|
ColumnPtr keys = sub_column->getDictionary().getNestedColumn();
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-08-08 16:40:50 +00:00
|
|
|
if (settings.low_cardinality_max_dictionary_size)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
|
|
|
/// Insert used_keys into global dictionary and update sub_index.
|
2018-07-09 18:19:03 +00:00
|
|
|
auto indexes_with_overflow = global_dictionary->uniqueInsertRangeWithOverflow(*keys, 0, keys->size(),
|
2018-08-08 16:40:50 +00:00
|
|
|
settings.low_cardinality_max_dictionary_size);
|
|
|
|
size_t max_size = settings.low_cardinality_max_dictionary_size + indexes_with_overflow.overflowed_keys->size();
|
2018-09-27 15:55:22 +00:00
|
|
|
ColumnLowCardinality::Index(indexes_with_overflow.indexes->getPtr()).check(max_size);
|
2018-07-09 18:19:03 +00:00
|
|
|
|
2018-08-08 16:40:50 +00:00
|
|
|
if (global_dictionary->size() > settings.low_cardinality_max_dictionary_size)
|
2018-07-09 18:19:03 +00:00
|
|
|
throw Exception("Got dictionary with size " + toString(global_dictionary->size()) +
|
2018-08-08 16:40:50 +00:00
|
|
|
" but max dictionary size is " + toString(settings.low_cardinality_max_dictionary_size),
|
2018-07-09 18:19:03 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
|
|
positions = indexes_with_overflow.indexes->index(*positions, 0);
|
|
|
|
keys = std::move(indexes_with_overflow.overflowed_keys);
|
|
|
|
|
2018-08-08 16:40:50 +00:00
|
|
|
if (global_dictionary->size() < settings.low_cardinality_max_dictionary_size && !keys->empty())
|
2018-07-09 18:19:03 +00:00
|
|
|
throw Exception("Has additional keys, but dict size is " + toString(global_dictionary->size()) +
|
2018-08-08 16:40:50 +00:00
|
|
|
" which is less then max dictionary size (" + toString(settings.low_cardinality_max_dictionary_size) + ")",
|
2018-07-09 18:19:03 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2019-06-27 18:50:20 +00:00
|
|
|
if (auto * nullable_keys = checkAndGetColumn<ColumnNullable>(*keys))
|
2018-07-09 18:19:03 +00:00
|
|
|
keys = nullable_keys->getNestedColumnPtr();
|
|
|
|
|
|
|
|
bool need_additional_keys = !keys->empty();
|
2018-08-08 16:40:50 +00:00
|
|
|
bool need_dictionary = settings.low_cardinality_max_dictionary_size != 0;
|
|
|
|
bool need_write_dictionary = !settings.low_cardinality_use_single_dictionary_for_part
|
|
|
|
&& global_dictionary->size() >= settings.low_cardinality_max_dictionary_size;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-07-19 14:37:22 +00:00
|
|
|
IndexesSerializationType index_version(*positions, need_additional_keys, need_dictionary, need_update_dictionary);
|
2018-06-07 18:14:37 +00:00
|
|
|
index_version.serialize(*indexes_stream);
|
|
|
|
|
|
|
|
if (need_write_dictionary)
|
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
const auto & nested_column = global_dictionary->getNestedNotNullableColumn();
|
|
|
|
UInt64 num_keys = nested_column->size();
|
2018-06-07 18:14:37 +00:00
|
|
|
writeIntBinary(num_keys, *keys_stream);
|
2018-07-09 18:19:03 +00:00
|
|
|
removeNullable(dictionary_type)->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys);
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->shared_dictionary = nullptr;
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (need_additional_keys)
|
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
UInt64 num_keys = keys->size();
|
2018-06-07 18:14:37 +00:00
|
|
|
writeIntBinary(num_keys, *indexes_stream);
|
2018-07-09 18:19:03 +00:00
|
|
|
removeNullable(dictionary_type)->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
UInt64 num_rows = positions->size();
|
2018-06-07 18:14:37 +00:00
|
|
|
writeIntBinary(num_rows, *indexes_stream);
|
2018-07-09 18:19:03 +00:00
|
|
|
index_version.getDataType()->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows);
|
2018-05-21 16:21:15 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::deserializeBinaryBulkWithMultipleStreams(
|
2018-06-07 18:14:37 +00:00
|
|
|
IColumn & column,
|
2019-02-19 00:41:24 +00:00
|
|
|
size_t limit,
|
2018-06-07 18:14:37 +00:00
|
|
|
DeserializeBinaryBulkSettings & settings,
|
|
|
|
DeserializeBinaryBulkStatePtr & state) const
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
ColumnLowCardinality & low_cardinality_column = typeid_cast<ColumnLowCardinality &>(column);
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
settings.path.push_back(Substream::DictionaryKeys);
|
|
|
|
auto * keys_stream = settings.getter(settings.path);
|
|
|
|
settings.path.back() = Substream::DictionaryIndexes;
|
|
|
|
auto * indexes_stream = settings.getter(settings.path);
|
|
|
|
settings.path.pop_back();
|
|
|
|
|
|
|
|
if (!keys_stream && !indexes_stream)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!keys_stream)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Got empty stream for DataTypeLowCardinality keys.", ErrorCodes::LOGICAL_ERROR);
|
2018-05-21 16:21:15 +00:00
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
if (!indexes_stream)
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Got empty stream for DataTypeLowCardinality indexes.", ErrorCodes::LOGICAL_ERROR);
|
2018-06-06 13:43:16 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto * low_cardinality_state = checkAndGetLowCardinalityDeserializeState(state);
|
|
|
|
KeysSerializationVersion::checkVersion(low_cardinality_state->key_version.value);
|
2018-09-19 10:27:46 +00:00
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
auto read_dictionary = [this, low_cardinality_state, keys_stream]()
|
2018-06-06 13:43:16 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
UInt64 num_keys;
|
|
|
|
readIntBinary(num_keys, *keys_stream);
|
|
|
|
|
|
|
|
auto keys_type = removeNullable(dictionary_type);
|
|
|
|
auto global_dict_keys = keys_type->createColumn();
|
|
|
|
keys_type->deserializeBinaryBulk(*global_dict_keys, *keys_stream, num_keys, 0);
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
auto column_unique = createColumnUnique(*dictionary_type, std::move(global_dict_keys));
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->global_dictionary = std::move(column_unique);
|
2018-06-06 13:43:16 +00:00
|
|
|
};
|
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
auto read_additional_keys = [this, low_cardinality_state, indexes_stream]()
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
UInt64 num_keys;
|
|
|
|
readIntBinary(num_keys, *indexes_stream);
|
|
|
|
auto keys_type = removeNullable(dictionary_type);
|
2018-07-09 18:19:03 +00:00
|
|
|
auto additional_keys = keys_type->createColumn();
|
|
|
|
keys_type->deserializeBinaryBulk(*additional_keys, *indexes_stream, num_keys, 0);
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->additional_keys = std::move(additional_keys);
|
2018-09-19 14:34:29 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
if (!low_cardinality_state->index_type.need_global_dictionary && dictionary_type->isNullable())
|
2018-09-19 14:34:29 +00:00
|
|
|
{
|
|
|
|
auto null_map = ColumnUInt8::create(num_keys, 0);
|
|
|
|
if (num_keys)
|
|
|
|
null_map->getElement(0) = 1;
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->null_map = std::move(null_map);
|
2018-09-19 14:34:29 +00:00
|
|
|
}
|
2018-05-21 16:21:15 +00:00
|
|
|
};
|
|
|
|
|
2020-03-23 02:12:31 +00:00
|
|
|
auto read_indexes = [this, low_cardinality_state, indexes_stream, &low_cardinality_column](UInt64 num_rows)
|
2018-05-04 16:28:53 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
auto indexes_type = low_cardinality_state->index_type.getDataType();
|
2018-06-07 18:14:37 +00:00
|
|
|
MutableColumnPtr indexes_column = indexes_type->createColumn();
|
|
|
|
indexes_type->deserializeBinaryBulk(*indexes_column, *indexes_stream, num_rows, 0);
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto & global_dictionary = low_cardinality_state->global_dictionary;
|
|
|
|
const auto & additional_keys = low_cardinality_state->additional_keys;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
bool has_additional_keys = low_cardinality_state->index_type.has_additional_keys;
|
|
|
|
bool column_is_empty = low_cardinality_column.empty();
|
2018-05-21 16:21:15 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
if (!low_cardinality_state->index_type.need_global_dictionary)
|
2018-05-21 16:21:15 +00:00
|
|
|
{
|
2018-09-19 14:34:29 +00:00
|
|
|
ColumnPtr keys_column = additional_keys;
|
2018-09-27 15:55:22 +00:00
|
|
|
if (low_cardinality_state->null_map)
|
|
|
|
keys_column = ColumnNullable::create(additional_keys, low_cardinality_state->null_map);
|
|
|
|
low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*keys_column, *indexes_column);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
2018-07-09 18:19:03 +00:00
|
|
|
else if (!has_additional_keys)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
if (column_is_empty)
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_column.setSharedDictionary(global_dictionary);
|
2018-07-09 18:19:03 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto local_column = ColumnLowCardinality::create(global_dictionary, std::move(indexes_column));
|
|
|
|
low_cardinality_column.insertRangeFrom(*local_column, 0, num_rows);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
auto maps = mapIndexWithAdditionalKeys(*indexes_column, global_dictionary->size());
|
2018-06-26 13:10:46 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
ColumnLowCardinality::Index(maps.additional_keys_map->getPtr()).check(additional_keys->size());
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
ColumnLowCardinality::Index(indexes_column->getPtr()).check(
|
2018-07-09 18:19:03 +00:00
|
|
|
maps.dictionary_map->size() + maps.additional_keys_map->size());
|
|
|
|
|
|
|
|
auto used_keys = (*std::move(global_dictionary->getNestedColumn()->index(*maps.dictionary_map, 0))).mutate();
|
|
|
|
|
|
|
|
if (!maps.additional_keys_map->empty())
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
auto used_add_keys = additional_keys->index(*maps.additional_keys_map, 0);
|
|
|
|
|
|
|
|
if (dictionary_type->isNullable())
|
|
|
|
{
|
|
|
|
ColumnPtr null_map = ColumnUInt8::create(used_add_keys->size(), 0);
|
|
|
|
used_add_keys = ColumnNullable::create(used_add_keys, null_map);
|
|
|
|
}
|
|
|
|
|
|
|
|
used_keys->insertRangeFrom(*used_add_keys, 0, used_add_keys->size());
|
2018-05-21 16:21:15 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_column.insertRangeFromDictionaryEncodedColumn(*used_keys, *indexes_column);
|
2018-05-21 16:21:15 +00:00
|
|
|
}
|
2018-06-07 18:14:37 +00:00
|
|
|
};
|
|
|
|
|
2018-08-22 12:47:06 +00:00
|
|
|
if (!settings.continuous_reading)
|
2019-04-05 16:31:53 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->num_pending_rows = 0;
|
2018-08-22 12:47:06 +00:00
|
|
|
|
2019-03-29 15:07:53 +00:00
|
|
|
/// Remember in state that some granules were skipped and we need to update dictionary.
|
2019-03-29 14:06:46 +00:00
|
|
|
low_cardinality_state->need_update_dictionary = true;
|
|
|
|
}
|
|
|
|
|
2018-06-07 18:14:37 +00:00
|
|
|
while (limit)
|
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
if (low_cardinality_state->num_pending_rows == 0)
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
|
|
|
if (indexes_stream->eof())
|
|
|
|
break;
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto & index_type = low_cardinality_state->index_type;
|
|
|
|
auto & global_dictionary = low_cardinality_state->global_dictionary;
|
2018-07-19 14:37:22 +00:00
|
|
|
|
|
|
|
index_type.deserialize(*indexes_stream);
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2019-03-29 14:06:46 +00:00
|
|
|
bool need_update_dictionary =
|
|
|
|
!global_dictionary || index_type.need_update_dictionary || low_cardinality_state->need_update_dictionary;
|
|
|
|
if (index_type.need_global_dictionary && need_update_dictionary)
|
2018-08-21 12:31:09 +00:00
|
|
|
{
|
2020-03-23 02:12:31 +00:00
|
|
|
read_dictionary();
|
2019-03-29 14:06:46 +00:00
|
|
|
low_cardinality_state->need_update_dictionary = false;
|
2018-08-21 12:31:09 +00:00
|
|
|
}
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
if (low_cardinality_state->index_type.has_additional_keys)
|
2020-03-23 02:12:31 +00:00
|
|
|
read_additional_keys();
|
2018-06-07 18:14:37 +00:00
|
|
|
else
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->additional_keys = nullptr;
|
2018-06-07 18:14:37 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
readIntBinary(low_cardinality_state->num_pending_rows, *indexes_stream);
|
2018-06-07 18:14:37 +00:00
|
|
|
}
|
|
|
|
|
2019-01-14 11:52:12 +00:00
|
|
|
size_t num_rows_to_read = std::min<UInt64>(limit, low_cardinality_state->num_pending_rows);
|
2020-03-23 02:12:31 +00:00
|
|
|
read_indexes(num_rows_to_read);
|
2018-06-07 18:14:37 +00:00
|
|
|
limit -= num_rows_to_read;
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_state->num_pending_rows -= num_rows_to_read;
|
2018-05-04 16:28:53 +00:00
|
|
|
}
|
2018-05-03 14:53:35 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::serializeBinary(const Field & field, WriteBuffer & ostr) const
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
|
|
|
dictionary_type->serializeBinary(field, ostr);
|
|
|
|
}
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Reads a single Field in binary form; the work is delegated to the dictionary type.
void DataTypeLowCardinality::deserializeBinary(Field & field, ReadBuffer & istr) const
{
    dictionary_type->deserializeBinary(field, istr);
}
|
|
|
|
|
2019-09-20 14:45:25 +00:00
|
|
|
void DataTypeLowCardinality::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeBinary, ostr);
|
|
|
|
}
|
|
|
|
/// Reads one value in binary form and appends it to `column`.
/// Goes through deserializeImpl, which parses with the dictionary type into a temporary column.
void DataTypeLowCardinality::deserializeBinary(IColumn & column, ReadBuffer & istr) const
{
    deserializeImpl(column, &IDataType::deserializeBinary, istr);
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeAsTextEscaped, ostr, settings);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Reads one escaped-text value with the dictionary type's deserializeAsTextEscaped and appends it to `column`.
void DataTypeLowCardinality::deserializeTextEscaped(
    IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    deserializeImpl(column, &IDataType::deserializeAsTextEscaped, istr, settings);
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeAsTextQuoted, ostr, settings);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Reads one quoted-text value with the dictionary type's deserializeAsTextQuoted and appends it to `column`.
void DataTypeLowCardinality::deserializeTextQuoted(
    IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    deserializeImpl(column, &IDataType::deserializeAsTextQuoted, istr, settings);
}
|
|
|
|
|
|
|
|
/// Reads one value that occupies the whole buffer and appends it to `column`.
///
/// Delegates to the dictionary type's whole-text entry point, in line with the other wrappers
/// in this file, each of which forwards to its own matching `...As...` method of IDataType.
/// Forwarding to the escaped-text parser here would apply escaping rules to input that
/// is not escaped.
void DataTypeLowCardinality::deserializeWholeText(
    IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    deserializeImpl(column, &IDataType::deserializeAsWholeText, istr, settings);
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeAsTextCSV, ostr, settings);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Reads one CSV value with the dictionary type's deserializeAsTextCSV and appends it to `column`.
void DataTypeLowCardinality::deserializeTextCSV(
    IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    deserializeImpl(column, &IDataType::deserializeAsTextCSV, istr, settings);
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeAsText, ostr, settings);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeAsTextJSON, ostr, settings);
|
|
|
|
}
|
|
|
|
/// Reads one JSON value with the dictionary type's deserializeAsTextJSON and appends it to `column`.
void DataTypeLowCardinality::deserializeTextJSON(
    IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
    deserializeImpl(column, &IDataType::deserializeAsTextJSON, istr, settings);
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeAsTextXML, ostr, settings);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DataTypeLowCardinality::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const
|
|
|
|
{
|
|
|
|
serializeImpl(column, row_num, &IDataType::serializeProtobuf, protobuf, value_index);
|
|
|
|
}
|
|
|
|
|
2019-02-19 20:01:31 +00:00
|
|
|
/// Reads a value from the Protobuf reader into `column`.
///
/// If `allow_add_row` is true, the value is read through deserializeImpl and `row_added`
/// is set by the dictionary type.
///
/// Otherwise no row is added (`row_added` is set to false): the last row of the column is copied
/// into a temporary column of the dictionary's nested type, the dictionary type deserializes
/// into that temporary column with `allow_add_row = false`, and the result replaces the last row.
///
/// Throws LOGICAL_ERROR if `allow_add_row` is false and the column has no rows, because there is
/// no last row to replace (the row index would otherwise underflow).
void DataTypeLowCardinality::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const
{
    if (allow_add_row)
    {
        deserializeImpl(column, &IDataType::deserializeProtobuf, protobuf, true, row_added);
        return;
    }

    row_added = false;
    auto & low_cardinality_column = getColumnLowCardinality(column);

    if (low_cardinality_column.empty())
        throw Exception("Cannot replace the last row of an empty LowCardinality column while deserializing Protobuf",
            ErrorCodes::LOGICAL_ERROR);

    /// Copy the current last value into a standalone column of the nested type.
    auto nested_column = low_cardinality_column.getDictionary().getNestedColumn();
    auto temp_column = nested_column->cloneEmpty();
    size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(low_cardinality_column.size() - 1);
    temp_column->insertFrom(*nested_column, unique_row_number);

    /// The nested call is not allowed to add rows, so its `row_added` result is of no interest.
    bool dummy = false;
    dictionary_type.get()->deserializeProtobuf(*temp_column, protobuf, false, dummy);

    /// Replace the last row with the value produced by the dictionary type.
    low_cardinality_column.popBack(1);
    low_cardinality_column.insertFromFullColumn(*temp_column, 0);
}
|
|
|
|
|
|
|
|
template <typename... Params, typename... Args>
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::serializeImpl(
|
2019-02-19 20:01:31 +00:00
|
|
|
const IColumn & column, size_t row_num, DataTypeLowCardinality::SerializeFunctionPtr<Params...> func, Args &&... args) const
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
auto & low_cardinality_column = getColumnLowCardinality(column);
|
|
|
|
size_t unique_row_number = low_cardinality_column.getIndexes().getUInt(row_num);
|
2019-02-19 20:01:31 +00:00
|
|
|
(dictionary_type.get()->*func)(*low_cardinality_column.getDictionary().getNestedColumn(), unique_row_number, std::forward<Args>(args)...);
|
2018-05-03 14:53:35 +00:00
|
|
|
}
|
|
|
|
|
2019-02-19 20:01:31 +00:00
|
|
|
template <typename... Params, typename... Args>
|
2018-09-27 15:55:22 +00:00
|
|
|
void DataTypeLowCardinality::deserializeImpl(
|
2019-02-19 20:01:31 +00:00
|
|
|
IColumn & column, DataTypeLowCardinality::DeserializeFunctionPtr<Params...> func, Args &&... args) const
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-09-27 15:55:22 +00:00
|
|
|
auto & low_cardinality_column= getColumnLowCardinality(column);
|
|
|
|
auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty();
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2019-02-19 20:01:31 +00:00
|
|
|
(dictionary_type.get()->*func)(*temp_column, std::forward<Args>(args)...);
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
low_cardinality_column.insertFromFullColumn(*temp_column, 0);
|
2018-05-03 14:53:35 +00:00
|
|
|
}
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
namespace
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
template <typename Creator>
|
|
|
|
struct CreateColumnVector
|
2018-06-07 18:14:37 +00:00
|
|
|
{
|
2018-07-09 18:19:03 +00:00
|
|
|
MutableColumnUniquePtr & column;
|
|
|
|
const IDataType & keys_type;
|
|
|
|
const Creator & creator;
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2019-08-03 11:02:40 +00:00
|
|
|
CreateColumnVector(MutableColumnUniquePtr & column_, const IDataType & keys_type_, const Creator & creator_)
|
|
|
|
: column(column_), keys_type(keys_type_), creator(creator_)
|
2018-07-09 18:19:03 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename T, size_t>
|
|
|
|
void operator()()
|
|
|
|
{
|
|
|
|
if (typeid_cast<const DataTypeNumber<T> *>(&keys_type))
|
2019-08-21 02:42:50 +00:00
|
|
|
column = creator(static_cast<ColumnVector<T> *>(nullptr));
|
2018-07-09 18:19:03 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
}
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
template <typename Creator>
|
2018-09-27 15:55:22 +00:00
|
|
|
MutableColumnUniquePtr DataTypeLowCardinality::createColumnUniqueImpl(const IDataType & keys_type,
|
2018-07-09 18:19:03 +00:00
|
|
|
const Creator & creator)
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
auto * type = &keys_type;
|
2018-07-09 18:19:03 +00:00
|
|
|
if (auto * nullable_type = typeid_cast<const DataTypeNullable *>(&keys_type))
|
|
|
|
type = nullable_type->getNestedType().get();
|
2018-05-03 14:53:35 +00:00
|
|
|
|
2018-09-07 14:37:26 +00:00
|
|
|
if (isString(type))
|
2019-08-21 02:42:50 +00:00
|
|
|
return creator(static_cast<ColumnString *>(nullptr));
|
2018-09-07 14:37:26 +00:00
|
|
|
if (isFixedString(type))
|
2019-08-21 02:42:50 +00:00
|
|
|
return creator(static_cast<ColumnFixedString *>(nullptr));
|
2018-06-07 18:14:37 +00:00
|
|
|
if (typeid_cast<const DataTypeDate *>(type))
|
2019-08-21 02:42:50 +00:00
|
|
|
return creator(static_cast<ColumnVector<UInt16> *>(nullptr));
|
2018-06-07 18:14:37 +00:00
|
|
|
if (typeid_cast<const DataTypeDateTime *>(type))
|
2019-08-21 02:42:50 +00:00
|
|
|
return creator(static_cast<ColumnVector<UInt32> *>(nullptr));
|
2019-05-24 12:11:03 +00:00
|
|
|
if (isColumnedAsNumber(type))
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
2018-06-07 18:14:37 +00:00
|
|
|
MutableColumnUniquePtr column;
|
2019-11-11 16:18:37 +00:00
|
|
|
TypeListNativeNumbers::forEach(CreateColumnVector(column, *type, creator));
|
2018-05-03 14:53:35 +00:00
|
|
|
|
|
|
|
if (!column)
|
|
|
|
throw Exception("Unexpected numeric type: " + type->getName(), ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
2018-05-21 12:29:52 +00:00
|
|
|
return column;
|
2018-05-03 14:53:35 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
throw Exception("Unexpected dictionary type for DataTypeLowCardinality: " + type->getName(),
|
2018-05-03 14:53:35 +00:00
|
|
|
ErrorCodes::LOGICAL_ERROR);
|
|
|
|
}
|
|
|
|
|
2018-07-09 18:19:03 +00:00
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Creates a ColumnUnique for `keys_type`, constructed from the type itself.
/// The concrete ColumnUnique specialization is chosen by createColumnUniqueImpl.
MutableColumnUniquePtr DataTypeLowCardinality::createColumnUnique(const IDataType & keys_type)
{
    /// `type_tag` is a null pointer whose only purpose is to carry the column type.
    auto make_unique_column = [&](auto type_tag)
    {
        using ColumnType = std::remove_pointer_t<decltype(type_tag)>;
        return ColumnUnique<ColumnType>::create(keys_type);
    };

    return createColumnUniqueImpl(keys_type, make_unique_column);
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Creates a ColumnUnique for `keys_type` from an existing column of keys, which is moved from.
/// The concrete ColumnUnique specialization is chosen by createColumnUniqueImpl.
MutableColumnUniquePtr DataTypeLowCardinality::createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys)
{
    /// `type_tag` is a null pointer whose only purpose is to carry the column type.
    auto make_unique_column = [&](auto type_tag)
    {
        using ColumnType = std::remove_pointer_t<decltype(type_tag)>;
        return ColumnUnique<ColumnType>::create(std::move(keys), keys_type.isNullable());
    };

    return createColumnUniqueImpl(keys_type, make_unique_column);
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Creates a ColumnLowCardinality built from a new ColumnUnique for the dictionary type
/// and a new UInt8 column of indexes.
MutableColumnPtr DataTypeLowCardinality::createColumn() const
{
    MutableColumnPtr index_column = DataTypeUInt8().createColumn();
    MutableColumnPtr unique_column = createColumnUnique(*dictionary_type);

    return ColumnLowCardinality::create(std::move(unique_column), std::move(index_column));
}
|
|
|
|
|
2019-10-04 17:46:36 +00:00
|
|
|
/// The default value is the default value of the dictionary type.
Field DataTypeLowCardinality::getDefault() const
{
    return dictionary_type->getDefault();
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
bool DataTypeLowCardinality::equals(const IDataType & rhs) const
|
2018-05-03 14:53:35 +00:00
|
|
|
{
|
|
|
|
if (typeid(rhs) != typeid(*this))
|
|
|
|
return false;
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
auto & low_cardinality_rhs= static_cast<const DataTypeLowCardinality &>(rhs);
|
|
|
|
return dictionary_type->equals(*low_cardinality_rhs.dictionary_type);
|
2018-05-03 14:53:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-03-10 18:16:14 +00:00
|
|
|
/// Factory callback: builds a DataTypeLowCardinality from the AST of its arguments.
/// Exactly one argument is required; it is resolved to the dictionary type via DataTypeFactory.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH otherwise.
static DataTypePtr create(const ASTPtr & arguments)
{
    const bool has_single_argument = arguments && arguments->children.size() == 1;
    if (!has_single_argument)
        throw Exception("LowCardinality data type family must have single argument - type of elements",
                        ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

    return std::make_shared<DataTypeLowCardinality>(DataTypeFactory::instance().get(arguments->children[0]));
}
|
|
|
|
|
2018-09-27 15:55:22 +00:00
|
|
|
/// Registers the "LowCardinality" data type family in the given factory.
void registerDataTypeLowCardinality(DataTypeFactory & factory)
{
    factory.registerDataType("LowCardinality", create);
}
|
|
|
|
|
2018-08-21 14:53:51 +00:00
|
|
|
|
|
|
|
/// Returns the dictionary type if `type` is a DataTypeLowCardinality; otherwise returns `type` itself.
DataTypePtr removeLowCardinality(const DataTypePtr & type)
{
    auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(type.get());
    if (!low_cardinality_type)
        return type;

    return low_cardinality_type->getDictionaryType();
}
|
|
|
|
|
2018-05-03 14:00:24 +00:00
|
|
|
}
|