Added ComplexKeyRangeHashed dictionary

This commit is contained in:
Maksim Kita 2021-08-13 00:39:20 +03:00
parent 40f5e06a8d
commit e12820ecb2
15 changed files with 731 additions and 394 deletions

View File

@ -0,0 +1,48 @@
#include "DictionaryHelpers.h"
namespace DB
{
/// Rebuilds key columns from serialized keys.
/// One column is created per attribute of `dictionary_structure.key`, then every key
/// at indexes [start, end) of `keys` is decoded into those columns, in attribute order.
MutableColumns deserializeColumnsFromKeys(
    const DictionaryStructure & dictionary_structure,
    const PaddedPODArray<StringRef> & keys,
    size_t start,
    size_t end)
{
    const auto & key_attributes = *dictionary_structure.key;

    MutableColumns result_columns;
    result_columns.reserve(key_attributes.size());

    for (const auto & key_attribute : key_attributes)
        result_columns.emplace_back(key_attribute.type->createColumn());

    for (size_t row = start; row < end; ++row)
    {
        /// Each column reads its value and returns the position where the next value begins.
        const auto * position = keys[row].data;

        for (auto & column : result_columns)
            position = column->deserializeAndInsertFromArena(position);
    }

    return result_columns;
}
/// Decodes keys at indexes [start, end) into columns and pairs every column
/// with the type and name of the matching attribute of `dictionary_structure.key`.
ColumnsWithTypeAndName deserializeColumnsWithTypeAndNameFromKeys(
    const DictionaryStructure & dictionary_structure,
    const PaddedPODArray<StringRef> & keys,
    size_t start,
    size_t end)
{
    MutableColumns key_columns = deserializeColumnsFromKeys(dictionary_structure, keys, start, end);
    const auto & key_attributes = *dictionary_structure.key;

    ColumnsWithTypeAndName result;

    size_t column_index = 0;
    for (auto & key_column : key_columns)
    {
        const auto & key_attribute = key_attributes[column_index];
        result.emplace_back(ColumnWithTypeAndName{std::move(key_column), key_attribute.type, key_attribute.name});
        ++column_index;
    }

    return result;
}
}

View File

@ -497,6 +497,20 @@ private:
Arena * complex_key_arena;
};
/// Deserialize columns from keys array using dictionary structure.
/// One column is created per attribute of `dictionary_structure.key`;
/// only the keys at indexes [start, end) of `keys` are deserialized.
MutableColumns deserializeColumnsFromKeys(
const DictionaryStructure & dictionary_structure,
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t end);
/// Deserialize columns with type and name from keys array using dictionary structure.
/// Keys at indexes [start, end) are deserialized with deserializeColumnsFromKeys and each
/// resulting column is returned with the type and name of the matching key attribute.
ColumnsWithTypeAndName deserializeColumnsWithTypeAndNameFromKeys(
const DictionaryStructure & dictionary_structure,
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t end);
/** Merge block with blocks from stream. If there are duplicate keys in block they are filtered out.
* In result block_to_update will be merged with blocks from stream.
* Note: readPrefix readImpl readSuffix will be called on stream object during function execution.

View File

@ -29,7 +29,7 @@ DictionarySourceData::DictionarySourceData(
, key_type(DictionaryInputStreamKeyType::ComplexKey)
{
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns);
key_columns = deserializeColumnsWithTypeAndNameFromKeys(dictionary_structure, keys, 0, keys.size());
}
DictionarySourceData::DictionarySourceData(
@ -158,32 +158,4 @@ Block DictionarySourceData::fillBlock(
return Block(block_columns);
}
void DictionarySourceData::fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & result)
{
MutableColumns columns;
columns.reserve(dictionary_structure.key->size());
for (const DictionaryAttribute & attribute : *dictionary_structure.key)
columns.emplace_back(attribute.type->createColumn());
for (size_t index = start; index < size; ++index)
{
const auto & key = keys[index];
const auto *ptr = key.data;
for (auto & column : columns)
ptr = column->deserializeAndInsertFromArena(ptr);
}
for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i)
{
const auto & dictionary_attribute = (*dictionary_structure.key)[i];
result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name});
}
}
}

View File

@ -51,13 +51,6 @@ private:
const DataTypes & types,
ColumnsWithTypeAndName && view) const;
static void fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & result);
const size_t num_rows;
std::shared_ptr<const IDictionary> dictionary;
std::unordered_set<std::string> column_names;

View File

@ -134,42 +134,11 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration
if (id->name.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "'id' cannot be empty");
const char * range_default_type = "Date";
if (config.has(structure_prefix + ".range_min"))
range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type));
if (config.has(structure_prefix + ".range_max"))
range_max.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_max", range_default_type));
if (range_min.has_value() != range_max.has_value())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");
}
if (range_min && range_max && !range_min->type->equals(*range_max->type))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure 'range_min' and 'range_max' should have same type, "
"'range_min' type: {},"
"'range_max' type: {}",
range_min->type->getName(),
range_max->type->getName());
}
if (range_min)
{
if (!range_min->type->isValueRepresentedByInteger())
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
" Actual 'range_min' and 'range_max' type is {}",
range_min->type->getName());
}
if (!id->expression.empty() || (range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty()))
if (!id->expression.empty())
has_expressions = true;
}
parseRangeConfiguration(config, structure_prefix);
attributes = getAttributes(config, structure_prefix, /*complex_key_attributes =*/ false);
for (size_t i = 0; i < attributes.size(); ++i)
@ -439,4 +408,42 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
return res_attributes;
}
/// Reads the 'range_min' and 'range_max' special attributes (type defaults to "Date")
/// from `config` under `structure_prefix` and validates them:
///  - both must be specified or both omitted;
///  - both must have the same type;
///  - the type must be represented by an integer.
/// Sets `has_expressions` when either bound has a non-empty expression.
/// Throws BAD_ARGUMENTS when a validation fails.
void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix)
{
    const char * default_type_name = "Date";

    const std::string range_min_path = structure_prefix + ".range_min";
    const std::string range_max_path = structure_prefix + ".range_max";

    if (config.has(range_min_path))
        range_min.emplace(makeDictionaryTypedSpecialAttribute(config, range_min_path, default_type_name));

    if (config.has(range_max_path))
        range_max.emplace(makeDictionaryTypedSpecialAttribute(config, range_max_path, default_type_name));

    const bool has_range_min = range_min.has_value();

    if (has_range_min != range_max.has_value())
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");

    /// Past this point the bounds are either both present or both absent.
    if (!has_range_min)
        return;

    const auto & min_type = range_min->type;
    const auto & max_type = range_max->type;

    if (!min_type->equals(*max_type))
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure 'range_min' and 'range_max' should have same type, "
            "'range_min' type: {},"
            "'range_max' type: {}",
            min_type->getName(),
            max_type->getName());

    if (!min_type->isValueRepresentedByInteger())
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
            " Actual 'range_min' and 'range_max' type is {}",
            min_type->getName());

    if (!range_min->expression.empty() || !range_max->expression.empty())
        has_expressions = true;
}
}

View File

@ -67,8 +67,9 @@ using DictionaryLifetime = ExternalLoadableLifetime;
* - null_value, used as a default value for non-existent entries in the dictionary,
* decimal representation for numeric attributes;
* - hierarchical, whether this attribute defines a hierarchy;
* - injective, whether the mapping to parent is injective (can be used for optimization of GROUP BY?)
* - is_object_id, used in mongo dictionary, converts string key to objectid
* - injective, whether the mapping to parent is injective (can be used for optimization of GROUP BY?);
* - is_object_id, used in mongo dictionary, converts string key to objectid;
* - is_nullable, is attribute nullable;
*/
struct DictionaryAttribute final
{
@ -153,6 +154,10 @@ private:
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
bool complex_key_attributes);
/// parse range_min and range_max
void parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix);
};
}

View File

@ -133,6 +133,29 @@ void ExternalQueryBuilder::composeLoadAllQuery(WriteBuffer & out) const
writeQuoted(key.name, out);
}
if (dict_struct.range_min && dict_struct.range_max)
{
writeString(", ", out);
if (!dict_struct.range_min->expression.empty())
{
writeParenthesisedString(dict_struct.range_min->expression, out);
writeString(" AS ", out);
}
writeQuoted(dict_struct.range_min->name, out);
writeString(", ", out);
if (!dict_struct.range_max->expression.empty())
{
writeParenthesisedString(dict_struct.range_max->expression, out);
writeString(" AS ", out);
}
writeQuoted(dict_struct.range_max->name, out);
}
}
for (const auto & attr : dict_struct.attributes)

View File

@ -64,7 +64,7 @@ public:
/// Returns the `injective` flag that the dictionary structure holds for the named attribute.
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; }

View File

@ -14,170 +14,213 @@
namespace DB
{
template <typename RangeType>
/// Selects the key representation used by RangeDictionarySourceData (see its KeyType alias).
enum class RangeDictionaryType
{
/// Keys are UInt64 values.
simple,
/// Keys are StringRef values.
complex
};
/// Keeps the keys and range bounds of a range dictionary and builds blocks out of them.
template <RangeDictionaryType range_dictionary_type, typename RangeType>
class RangeDictionarySourceData
{
public:
using Key = UInt64;
/// UInt64 for RangeDictionaryType::simple, StringRef otherwise.
using KeyType = std::conditional_t<range_dictionary_type == RangeDictionaryType::simple, UInt64, StringRef>;
RangeDictionarySourceData(
std::shared_ptr<const IDictionary> dictionary,
const Names & column_names,
PaddedPODArray<Key> && ids_to_fill,
PaddedPODArray<KeyType> && keys,
PaddedPODArray<RangeType> && start_dates,
PaddedPODArray<RangeType> && end_dates);
/// Builds a block from `length` rows beginning at row `start`.
Block getBlock(size_t start, size_t length) const;
/// Number of stored rows (one per stored key).
size_t getNumRows() const { return ids.size(); }
size_t getNumRows() const { return keys.size(); }
private:
/// Assembles the resulting block from already sliced keys and range bounds.
Block fillBlock(
const PaddedPODArray<Key> & ids_to_fill,
const PaddedPODArray<KeyType> & keys_to_fill,
const PaddedPODArray<RangeType> & block_start_dates,
const PaddedPODArray<RangeType> & block_end_dates) const;
const PaddedPODArray<RangeType> & block_end_dates,
size_t start,
size_t end) const;
/// Produces one Int64 value per row, chosen from the start or the end bound of that row.
PaddedPODArray<Int64> makeDateKey(
PaddedPODArray<Int64> makeDateKeys(
const PaddedPODArray<RangeType> & block_start_dates,
const PaddedPODArray<RangeType> & block_end_dates) const;
std::shared_ptr<const IDictionary> dictionary;
/// Names of the columns that should appear in produced blocks.
NameSet column_names;
PaddedPODArray<Key> ids;
PaddedPODArray<KeyType> keys;
PaddedPODArray<RangeType> start_dates;
PaddedPODArray<RangeType> end_dates;
};
/// Stores the dictionary pointer, copies the requested column names into a set
/// and moves the key and range bound arrays into the object.
template <typename RangeType>
RangeDictionarySourceData<RangeType>::RangeDictionarySourceData(
template <RangeDictionaryType range_dictionary_type, typename RangeType>
RangeDictionarySourceData<range_dictionary_type, RangeType>::RangeDictionarySourceData(
std::shared_ptr<const IDictionary> dictionary_,
const Names & column_names_,
PaddedPODArray<Key> && ids_,
PaddedPODArray<KeyType> && keys,
PaddedPODArray<RangeType> && block_start_dates,
PaddedPODArray<RangeType> && block_end_dates)
: dictionary(dictionary_)
, column_names(column_names_.begin(), column_names_.end())
, ids(std::move(ids_))
/// The parameter `keys` shadows the member of the same name: the member is initialized from the parameter.
, keys(std::move(keys))
, start_dates(std::move(block_start_dates))
, end_dates(std::move(block_end_dates))
{
}
template <typename RangeType>
Block RangeDictionarySourceData<RangeType>::getBlock(size_t start, size_t length) const
template <RangeDictionaryType range_dictionary_type, typename RangeType>
Block RangeDictionarySourceData<range_dictionary_type, RangeType>::getBlock(size_t start, size_t length) const
{
PaddedPODArray<Key> block_ids;
PaddedPODArray<KeyType> block_keys;
PaddedPODArray<RangeType> block_start_dates;
PaddedPODArray<RangeType> block_end_dates;
block_ids.reserve(length);
block_keys.reserve(length);
block_start_dates.reserve(length);
block_end_dates.reserve(length);
for (auto idx : collections::range(start, start + length))
for (size_t index = start; index < start + length; ++index )
{
block_ids.push_back(ids[idx]);
block_start_dates.push_back(start_dates[idx]);
block_end_dates.push_back(end_dates[idx]);
block_keys.push_back(block_keys[index]);
block_start_dates.push_back(start_dates[index]);
block_end_dates.push_back(end_dates[index]);
}
return fillBlock(block_ids, block_start_dates, block_end_dates);
return fillBlock(block_keys, block_start_dates, block_end_dates, start, start + length);
}
template <typename RangeType>
PaddedPODArray<Int64> RangeDictionarySourceData<RangeType>::makeDateKey(
const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const
{
PaddedPODArray<Int64> key(block_start_dates.size());
for (size_t i = 0; i < key.size(); ++i)
{
if (RangeHashedDictionary::Range::isCorrectDate(block_start_dates[i]))
key[i] = block_start_dates[i];
else
key[i] = block_end_dates[i];
}
return key;
}
template <typename RangeType>
Block RangeDictionarySourceData<RangeType>::fillBlock(
const PaddedPODArray<Key> & ids_to_fill,
/// Produces one Int64 lookup value per row: the start bound of the row when
/// Range::isCorrectDate accepts it, the end bound of the row otherwise.
template <RangeDictionaryType range_dictionary_type, typename RangeType>
PaddedPODArray<Int64> RangeDictionarySourceData<range_dictionary_type, RangeType>::makeDateKeys(
    const PaddedPODArray<RangeType> & block_start_dates,
    const PaddedPODArray<RangeType> & block_end_dates) const
{
    const size_t rows = block_start_dates.size();
    PaddedPODArray<Int64> date_keys(rows);

    for (size_t row = 0; row < rows; ++row)
    {
        const auto & start_date = block_start_dates[row];
        date_keys[row] = Range::isCorrectDate(start_date) ? start_date : block_end_dates[row];
    }

    return date_keys;
}
/// Assembles a block with the requested columns (those listed in `column_names`).
///
/// keys_to_fill      - keys of the rows, used directly for simple dictionaries;
/// block_start_dates - range lower bounds of the rows;
/// block_end_dates   - range upper bounds of the rows;
/// start, end        - interval [start, end) of the same rows inside the stored `keys`,
///                     used to deserialize key columns for complex dictionaries.
///
/// Key columns and range columns are produced from the arguments, attribute columns
/// are requested from the dictionary with the key columns plus one extra range lookup column.
template <RangeDictionaryType range_dictionary_type, typename RangeType>
Block RangeDictionarySourceData<range_dictionary_type, RangeType>::fillBlock(
    const PaddedPODArray<KeyType> & keys_to_fill,
    const PaddedPODArray<RangeType> & block_start_dates,
    const PaddedPODArray<RangeType> & block_end_dates,
    size_t start,
    size_t end) const
{
    ColumnsWithTypeAndName columns;
    const DictionaryStructure & dictionary_structure = dictionary->getStructure();

    DataTypes keys_types;
    Columns keys_columns;
    Strings keys_names = dictionary_structure.getKeysNames();

    if constexpr (range_dictionary_type == RangeDictionaryType::simple)
    {
        keys_columns = {getColumnFromPODArray(keys_to_fill)};
        keys_types = {std::make_shared<DataTypeUInt64>()};
    }
    else
    {
        for (const auto & attribute : *dictionary_structure.key)
            keys_types.emplace_back(attribute.type);

        auto deserialized_columns = deserializeColumnsFromKeys(dictionary_structure, keys, start, end);
        for (auto & deserialized_column : deserialized_columns)
            keys_columns.emplace_back(std::move(deserialized_column));
    }

    size_t keys_size = keys_names.size();

    assert(keys_columns.size() == keys_size);
    assert(keys_types.size() == keys_size);

    for (size_t i = 0; i < keys_size; ++i)
    {
        auto & key_name = keys_names[i];

        if (column_names.find(key_name) != column_names.end())
            columns.emplace_back(keys_columns[i], keys_types[i], key_name);
    }

    /// The range lookup column goes last, after all key columns.
    auto date_key = makeDateKeys(block_start_dates, block_end_dates);
    auto date_column = getColumnFromPODArray(date_key);

    keys_columns.emplace_back(std::move(date_column));
    keys_types.emplace_back(std::make_shared<DataTypeInt64>());

    const auto & range_min_column_name = dictionary_structure.range_min->name;
    if (column_names.find(range_min_column_name) != column_names.end())
    {
        auto range_min_column = getColumnFromPODArray(block_start_dates);
        columns.emplace_back(range_min_column, dictionary_structure.range_min->type, range_min_column_name);
    }

    const auto & range_max_column_name = dictionary_structure.range_max->name;
    if (column_names.find(range_max_column_name) != column_names.end())
    {
        auto range_max_column = getColumnFromPODArray(block_end_dates);
        columns.emplace_back(range_max_column, dictionary_structure.range_max->type, range_max_column_name);
    }

    size_t attributes_size = dictionary_structure.attributes.size();
    for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index)
    {
        const auto & attribute = dictionary_structure.attributes[attribute_index];
        if (column_names.find(attribute.name) == column_names.end())
            continue;

        auto column = dictionary->getColumn(
            attribute.name,
            attribute.type,
            keys_columns,
            keys_types,
            nullptr /* default_values_column*/);

        columns.emplace_back(std::move(column), attribute.type, attribute.name);
    }

    return Block(columns);
}
/*
* BlockInputStream implementation for external dictionaries
* read() returns single block consisting of the in-memory contents of the dictionaries
*/
template <typename RangeType>
template <RangeDictionaryType range_dictionary_type, typename RangeType>
class RangeDictionarySource : public DictionarySourceBase
{
public:
using Key = UInt64;
/// Takes the source data by value; `max_block_size` is forwarded to DictionarySourceBase.
RangeDictionarySource(RangeDictionarySourceData<RangeType> data_, size_t max_block_size);
RangeDictionarySource(RangeDictionarySourceData<range_dictionary_type, RangeType> data_, size_t max_block_size);
String getName() const override { return "RangeDictionarySource"; }
protected:
/// Returns the block for `length` rows beginning at row `start`, taken from `data`.
Block getBlock(size_t start, size_t length) const override;
RangeDictionarySourceData<RangeType> data;
RangeDictionarySourceData<range_dictionary_type, RangeType> data;
};
/// Initializes the base class with a zero-row block (data_.getBlock(0, 0)), the number of rows
/// and the maximum block size, then moves the source data into the object.
/// NOTE(review): the zero-row block is presumably used as the header by DictionarySourceBase - confirm.
template <typename RangeType>
RangeDictionarySource<RangeType>::RangeDictionarySource(RangeDictionarySourceData<RangeType> data_, size_t max_block_size)
template <RangeDictionaryType range_dictionary_type, typename RangeType>
RangeDictionarySource<range_dictionary_type, RangeType>::RangeDictionarySource(RangeDictionarySourceData<range_dictionary_type, RangeType> data_, size_t max_block_size)
: DictionarySourceBase(data_.getBlock(0, 0), data_.getNumRows(), max_block_size)
, data(std::move(data_))
{
}
/// Forwards the request to the stored source data.
template <typename RangeType>
Block RangeDictionarySource<RangeType>::getBlock(size_t start, size_t length) const
template <RangeDictionaryType range_dictionary_type, typename RangeType>
Block RangeDictionarySource<range_dictionary_type, RangeType>::getBlock(size_t start, size_t length) const
{
return data.getBlock(start, length);
}

View File

@ -10,7 +10,8 @@
namespace
{
using RangeStorageType = DB::RangeHashedDictionary::RangeStorageType;
using RangeStorageType = DB::RangeStorageType;
// Null values mean that specified boundary, either min or max is not set on range.
// To simplify comparison, null value of min bound should be bigger than any other value,
@ -25,7 +26,7 @@ RangeStorageType getColumnIntValueOrDefault(const DB::IColumn & column, size_t i
return default_value;
const RangeStorageType result = static_cast<RangeStorageType>(column.getInt(index));
if (isDate && !DB::RangeHashedDictionary::Range::isCorrectDate(result))
if (isDate && !DB::Range::isCorrectDate(result))
return default_value;
return result;
@ -54,23 +55,23 @@ namespace ErrorCodes
extern const int UNSUPPORTED_METHOD;
}
/// Returns true when `date` is greater than zero and not greater than DATE_LUT_MAX_DAY_NUM.
bool RangeHashedDictionary::Range::isCorrectDate(const RangeStorageType & date)
bool Range::isCorrectDate(const RangeStorageType & date)
{
return 0 < date && date <= DATE_LUT_MAX_DAY_NUM;
}
/// Returns true when `value` lies inside the range; both bounds are inclusive.
bool RangeHashedDictionary::Range::contains(const RangeStorageType & value) const
bool Range::contains(const RangeStorageType & value) const
{
return left <= value && value <= right;
}
/// Orders ranges lexicographically: by the left bound first, then by the right bound.
static bool operator<(const RangeHashedDictionary::Range & left, const RangeHashedDictionary::Range & right)
static bool operator<(const Range & left, const Range & right)
{
return std::tie(left.left, left.right) < std::tie(right.left, right.right);
}
RangeHashedDictionary::RangeHashedDictionary(
template <DictionaryKeyType dictionary_key_type>
RangeHashedDictionary<dictionary_key_type>::RangeHashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
@ -87,7 +88,8 @@ RangeHashedDictionary::RangeHashedDictionary(
calculateBytesAllocated();
}
ColumnPtr RangeHashedDictionary::getColumn(
template <DictionaryKeyType dictionary_key_type>
ColumnPtr RangeHashedDictionary<dictionary_key_type>::getColumn(
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
@ -96,20 +98,18 @@ ColumnPtr RangeHashedDictionary::getColumn(
{
ColumnPtr result;
const auto & attribute = getAttribute(attribute_name);
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
auto keys_size = key_columns.front()->size();
const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
const auto & attribute = attributes[attribute_index];
/// Cast second column to storage type
Columns modified_key_columns = key_columns;
auto range_storage_column = key_columns[1];
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types[1], ""};
auto range_storage_column = key_columns.back();
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""};
auto range_column_storage_type = std::make_shared<DataTypeInt64>();
modified_key_columns[1] = castColumnAccurate(column_to_cast, range_column_storage_type);
modified_key_columns.back() = castColumnAccurate(column_to_cast, range_column_storage_type);
size_t keys_size = key_columns.front()->size();
bool is_attribute_nullable = attribute.is_nullable;
ColumnUInt8::MutablePtr col_null_map_to;
@ -204,24 +204,26 @@ ColumnPtr RangeHashedDictionary::getColumn(
return result;
}
ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
template <DictionaryKeyType dictionary_key_type>
ColumnUInt8::Ptr RangeHashedDictionary<dictionary_key_type>::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
{
auto range_storage_column = key_columns[1];
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types[1], ""};
auto range_column_storage_type = std::make_shared<DataTypeInt64>();
auto range_storage_column = key_columns.back();
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types[1], ""};
auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type);
PaddedPODArray<UInt64> key_backup_storage;
PaddedPODArray<RangeStorageType> range_backup_storage;
const PaddedPODArray<UInt64> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
const PaddedPODArray<RangeStorageType> & dates = getColumnVectorData(this, range_column_updated, range_backup_storage);
auto key_columns_copy = key_columns;
key_columns_copy.pop_back();
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns_copy, arena_holder.getComplexKeyArena());
const size_t keys_size = keys_extractor.getKeysSize();
const auto & attribute = attributes.front();
ColumnUInt8::Ptr result;
auto result = ColumnUInt8::create(keys_size);
auto & out = result->getData();
size_t keys_found = 0;
auto type_call = [&](const auto & dictionary_attribute_type)
@ -229,58 +231,48 @@ ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, con
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
result = hasKeysImpl<ValueType>(attribute, ids, dates, keys_found);
const auto & collection = std::get<CollectionType<ValueType>>(attribute.maps);
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
const auto key = keys_extractor.extractCurrentKey();
const auto it = collection.find(key);
if (it)
{
const auto date = dates[key_index];
const auto & ranges_and_values = it->getMapped();
const auto val_it = std::find_if(
std::begin(ranges_and_values),
std::end(ranges_and_values),
[date](const Value<ValueType> & v)
{
return v.range.contains(date);
});
out[key_index] = val_it != std::end(ranges_and_values);
keys_found += out[key_index];
}
else
{
out[key_index] = false;
}
keys_extractor.rollbackCurrentKey();
}
};
callOnDictionaryAttributeType(attribute.type, type_call);
query_count.fetch_add(ids.size(), std::memory_order_relaxed);
query_count.fetch_add(keys_size, std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
return result;
}
template <typename AttributeType>
ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl(
const Attribute & attribute,
const PaddedPODArray<UInt64> & ids,
const PaddedPODArray<RangeStorageType> & dates,
size_t & keys_found) const
{
auto result = ColumnUInt8::create(ids.size());
auto& out = result->getData();
const auto & attr = *std::get<Ptr<AttributeType>>(attribute.maps);
keys_found = 0;
for (const auto row : collections::range(0, ids.size()))
{
const auto it = attr.find(ids[row]);
if (it)
{
const auto date = dates[row];
const auto & ranges_and_values = it->getMapped();
const auto val_it = std::find_if(
std::begin(ranges_and_values),
std::end(ranges_and_values),
[date](const Value<AttributeType> & v)
{
return v.range.contains(date);
});
out[row] = val_it != std::end(ranges_and_values);
keys_found += out[row];
}
else
out[row] = false;
}
return result;
}
void RangeHashedDictionary::createAttributes()
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
@ -296,7 +288,8 @@ void RangeHashedDictionary::createAttributes()
}
}
void RangeHashedDictionary::loadData()
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::loadData()
{
QueryPipeline pipeline;
pipeline.init(source_ptr->loadAll());
@ -305,38 +298,57 @@ void RangeHashedDictionary::loadData()
Block block;
/// Every pulled block has the layout: key columns, range_min, range_max, attribute columns.
while (executor.pull(block))
{
    size_t skip_keys_size_offset = dict_struct.getKeysSize();

    Columns key_columns;
    key_columns.reserve(skip_keys_size_offset);

    /// Split into keys columns and attribute columns
    for (size_t i = 0; i < skip_keys_size_offset; ++i)
        key_columns.emplace_back(block.safeGetByPosition(i).column);

    DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
    DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, arena_holder.getComplexKeyArena());
    const size_t keys_size = keys_extractor.getKeysSize();

    element_count += keys_size;

    // Support old behaviour, where invalid date means 'open range'.
    const bool is_date = isDate(dict_struct.range_min->type);

    const auto & min_range_column = unwrapNullableColumn(*block.safeGetByPosition(skip_keys_size_offset).column);
    const auto & max_range_column = unwrapNullableColumn(*block.safeGetByPosition(skip_keys_size_offset + 1).column);

    /// Attribute columns start after the two range columns.
    skip_keys_size_offset += 2;

    for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
    {
        const auto & attribute_column = *block.safeGetByPosition(attribute_index + skip_keys_size_offset).column;
        auto & attribute = attributes[attribute_index];

        for (size_t key_index = 0; key_index < keys_size; ++key_index)
        {
            auto key = keys_extractor.extractCurrentKey();

            RangeStorageType lower_bound;
            RangeStorageType upper_bound;

            if (is_date)
            {
                lower_bound = getColumnIntValueOrDefault(min_range_column, key_index, is_date, 0);
                upper_bound = getColumnIntValueOrDefault(max_range_column, key_index, is_date, DATE_LUT_MAX_DAY_NUM + 1);
            }
            else
            {
                lower_bound = getColumnIntValueOrDefault(min_range_column, key_index, is_date, RANGE_MIN_NULL_VALUE);
                upper_bound = getColumnIntValueOrDefault(max_range_column, key_index, is_date, RANGE_MAX_NULL_VALUE);
            }

            /// The extracted complex key is copied into the dictionary arena before it is stored.
            if constexpr (std::is_same_v<KeyType, StringRef>)
                key = copyKeyInArena(key);

            setAttributeValue(attribute, key, Range{lower_bound, upper_bound}, attribute_column[key_index]);
            keys_extractor.rollbackCurrentKey();
        }

        /// All keys were consumed for this attribute: rewind the extractor,
        /// otherwise the next attribute would read past the last key.
        keys_extractor.reset();
    }
}
@ -346,22 +358,8 @@ void RangeHashedDictionary::loadData()
"{}: dictionary source is empty and 'require_nonempty' property is set.");
}
template <typename T>
void RangeHashedDictionary::addAttributeSize(const Attribute & attribute)
{
const auto & map_ref = std::get<Ptr<T>>(attribute.maps);
bytes_allocated += sizeof(Collection<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
template <>
void RangeHashedDictionary::addAttributeSize<String>(const Attribute & attribute)
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
}
void RangeHashedDictionary::calculateBytesAllocated()
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::calculateBytesAllocated()
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
@ -371,14 +369,25 @@ void RangeHashedDictionary::calculateBytesAllocated()
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
addAttributeSize<AttributeType>(attribute);
using ValueType = DictionaryValueType<AttributeType>;
const auto & collection = std::get<CollectionType<ValueType>>(attribute.maps);
bytes_allocated += sizeof(CollectionType<ValueType>) + collection.getBufferSizeInBytes();
bucket_count = collection.getBufferSizeInCells();
if constexpr (std::is_same_v<ValueType, StringRef>)
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
bytes_allocated += complex_key_arena.size();
}
RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute)
template <DictionaryKeyType dictionary_key_type>
typename RangeHashedDictionary<dictionary_key_type>::Attribute RangeHashedDictionary<dictionary_key_type>::createAttribute(const DictionaryAttribute & dictionary_attribute)
{
Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}, {}};
@ -391,7 +400,7 @@ RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const Di
if constexpr (std::is_same_v<AttributeType, String>)
attribute.string_arena = std::make_unique<Arena>();
attribute.maps = std::make_unique<Collection<ValueType>>();
attribute.maps = CollectionType<ValueType>();
};
callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call);
@ -399,29 +408,35 @@ RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const Di
return attribute;
}
template <DictionaryKeyType dictionary_key_type>
template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
void RangeHashedDictionary::getItemsImpl(
void RangeHashedDictionary<dictionary_key_type>::getItemsImpl(
const Attribute & attribute,
const Columns & key_columns,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const
{
PaddedPODArray<UInt64> key_backup_storage;
PaddedPODArray<RangeStorageType> range_backup_storage;
const PaddedPODArray<UInt64> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
const PaddedPODArray<RangeStorageType> & dates = getColumnVectorData(this, key_columns[1], range_backup_storage);
const auto & attr = *std::get<Ptr<AttributeType>>(attribute.maps);
const auto & collection = std::get<CollectionType<AttributeType>>(attribute.maps);
size_t keys_found = 0;
for (const auto row : collections::range(0, ids.size()))
PaddedPODArray<RangeStorageType> range_backup_storage;
const auto & dates = getColumnVectorData(this, key_columns.back(), range_backup_storage);
auto key_columns_copy = key_columns;
key_columns_copy.pop_back();
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns_copy, arena_holder.getComplexKeyArena());
const size_t keys_size = keys_extractor.getKeysSize();
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
const auto it = attr.find(ids[row]);
auto key = keys_extractor.extractCurrentKey();
const auto it = collection.find(key);
if (it)
{
const auto date = dates[row];
const auto date = dates[key_index];
const auto & ranges_and_values = it->getMapped();
const auto val_it = std::find_if(
std::begin(ranges_and_values),
@ -439,35 +454,38 @@ void RangeHashedDictionary::getItemsImpl(
if constexpr (is_nullable)
{
if (value.has_value())
set_value(row, *value, false);
set_value(key_index, *value, false);
else
set_value(row, default_value_extractor[row], true);
set_value(key_index, default_value_extractor[key_index], true);
}
else
{
set_value(row, *value, false);
set_value(key_index, *value, false);
}
keys_extractor.rollbackCurrentKey();
continue;
}
}
if constexpr (is_nullable)
set_value(row, default_value_extractor[row], default_value_extractor.isNullAt(row));
set_value(key_index, default_value_extractor[key_index], default_value_extractor.isNullAt(key_index));
else
set_value(row, default_value_extractor[row], false);
set_value(key_index, default_value_extractor[key_index], false);
keys_extractor.rollbackCurrentKey();
}
query_count.fetch_add(ids.size(), std::memory_order_relaxed);
query_count.fetch_add(keys_size, std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
}
template <DictionaryKeyType dictionary_key_type>
template <typename T>
void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value)
void RangeHashedDictionary<dictionary_key_type>::setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value)
{
using ValueType = std::conditional_t<std::is_same_v<T, String>, StringRef, T>;
auto & map = *std::get<Ptr<ValueType>>(attribute.maps);
auto & collection = std::get<CollectionType<ValueType>>(attribute.maps);
Value<ValueType> value_to_insert;
@ -490,14 +508,17 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const U
}
}
const auto it = map.find(id);
const auto it = collection.find(key);
if (it)
{
auto & values = it->getMapped();
const auto insert_it
= std::lower_bound(std::begin(values), std::end(values), range, [](const Value<ValueType> & lhs, const Range & rhs_range)
const auto insert_it = std::lower_bound(
std::begin(values),
std::end(values),
range,
[](const Value<ValueType> & lhs, const Range & rhs_range)
{
return lhs.range < rhs_range;
});
@ -505,46 +526,29 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const U
values.insert(insert_it, std::move(value_to_insert));
}
else
map.insert({id, Values<ValueType>{std::move(value_to_insert)}});
{
collection.insert({key, Values<ValueType>{std::move(value_to_insert)}});
}
}
void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value)
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value)
{
auto type_call = [&](const auto &dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
setAttributeValueImpl<AttributeType>(attribute, id, range, value);
setAttributeValueImpl<AttributeType>(attribute, key, range, value);
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
const RangeHashedDictionary::Attribute & RangeHashedDictionary::getAttribute(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "{}: no such attribute '{}'", full_name, attribute_name);
return attributes[it->second];
}
const RangeHashedDictionary::Attribute &
RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name, const AttributeUnderlyingType type) const
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != type)
throw Exception(ErrorCodes::TYPE_MISMATCH, "attribute {} has type {}",
attribute_name,
toString(attribute.type));
return attribute;
}
template <DictionaryKeyType dictionary_key_type>
template <typename RangeType>
void RangeHashedDictionary::getIdsAndDates(
PaddedPODArray<UInt64> & ids,
void RangeHashedDictionary<dictionary_key_type>::getKeysAndDates(
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const
{
@ -556,32 +560,33 @@ void RangeHashedDictionary::getIdsAndDates(
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
getIdsAndDates<ValueType>(attribute, ids, start_dates, end_dates);
getKeysAndDates<ValueType>(attribute, keys, start_dates, end_dates);
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
template <DictionaryKeyType dictionary_key_type>
template <typename T, typename RangeType>
void RangeHashedDictionary::getIdsAndDates(
void RangeHashedDictionary<dictionary_key_type>::getKeysAndDates(
const Attribute & attribute,
PaddedPODArray<UInt64> & ids,
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const
{
const HashMap<UInt64, Values<T>> & attr = *std::get<Ptr<T>>(attribute.maps);
const auto & collection = std::get<CollectionType<T>>(attribute.maps);
ids.reserve(attr.size());
start_dates.reserve(attr.size());
end_dates.reserve(attr.size());
keys.reserve(collection.size());
start_dates.reserve(collection.size());
end_dates.reserve(collection.size());
const bool is_date = isDate(dict_struct.range_min->type);
for (const auto & key : attr)
for (const auto & key : collection)
{
for (const auto & value : key.getMapped())
{
ids.push_back(key.getKey());
keys.push_back(key.getKey());
start_dates.push_back(value.range.left);
end_dates.push_back(value.range.right);
@ -592,22 +597,23 @@ void RangeHashedDictionary::getIdsAndDates(
}
}
template <DictionaryKeyType dictionary_key_type>
template <typename RangeType>
Pipe RangeHashedDictionary::readImpl(const Names & column_names, size_t max_block_size) const
Pipe RangeHashedDictionary<dictionary_key_type>::readImpl(const Names & column_names, size_t max_block_size) const
{
PaddedPODArray<UInt64> ids;
PaddedPODArray<KeyType> keys;
PaddedPODArray<RangeType> start_dates;
PaddedPODArray<RangeType> end_dates;
getIdsAndDates(ids, start_dates, end_dates);
getKeysAndDates(keys, start_dates, end_dates);
using RangeDictionarySourceType = RangeDictionarySource<RangeType>;
static constexpr RangeDictionaryType range_dictionary_type = (dictionary_key_type == DictionaryKeyType::simple) ? RangeDictionaryType::simple : RangeDictionaryType::complex;
using RangeDictionarySourceType = RangeDictionarySource<range_dictionary_type, RangeType>;
auto source = std::make_shared<RangeDictionarySourceType>(
RangeDictionarySourceData<RangeType>(
RangeDictionarySourceData<range_dictionary_type, RangeType>(
shared_from_this(),
column_names,
std::move(ids),
std::move(keys),
std::move(start_dates),
std::move(end_dates)),
max_block_size);
@ -615,10 +621,21 @@ Pipe RangeHashedDictionary::readImpl(const Names & column_names, size_t max_bloc
return Pipe(source);
}
/// Copies the bytes referenced by `key` into `complex_key_arena` and returns a StringRef
/// that points at the arena-owned copy (same size as the input).
template <DictionaryKeyType dictionary_key_type>
StringRef RangeHashedDictionary<dictionary_key_type>::copyKeyInArena(StringRef key)
{
    const size_t bytes_to_copy = key.size;

    /// Storage comes from the dictionary-owned arena.
    char * const arena_place = complex_key_arena.alloc(bytes_to_copy);
    memcpy(arena_place, key.data, bytes_to_copy);

    return StringRef{arena_place, bytes_to_copy};
}
template <DictionaryKeyType dictionary_key_type>
struct RangeHashedDictionaryCallGetSourceImpl
{
Pipe pipe;
const RangeHashedDictionary * dict;
const RangeHashedDictionary<dictionary_key_type> * dict;
const Names * column_names;
size_t max_block_size;
@ -627,15 +644,16 @@ struct RangeHashedDictionaryCallGetSourceImpl
{
const auto & type = dict->dict_struct.range_min->type;
if (pipe.empty() && dynamic_cast<const DataTypeNumberBase<RangeType> *>(type.get()))
pipe = dict->readImpl<RangeType>(*column_names, max_block_size);
pipe = dict->template readImpl<RangeType>(*column_names, max_block_size);
}
};
Pipe RangeHashedDictionary::read(const Names & column_names, size_t max_block_size) const
template <DictionaryKeyType dictionary_key_type>
Pipe RangeHashedDictionary<dictionary_key_type>::read(const Names & column_names, size_t max_block_size) const
{
using ListType = TypeList<UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Int128, Float32, Float64>;
RangeHashedDictionaryCallGetSourceImpl callable;
RangeHashedDictionaryCallGetSourceImpl<dictionary_key_type> callable;
callable.dict = this;
callable.column_names = &column_names;
callable.max_block_size = max_block_size;
@ -653,7 +671,7 @@ Pipe RangeHashedDictionary::read(const Names & column_names, size_t max_block_si
void registerDictionaryRangeHashed(DictionaryFactory & factory)
{
auto create_layout = [=](const std::string & full_name,
auto create_layout_simple = [=](const std::string & full_name,
const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
@ -672,9 +690,32 @@ void registerDictionaryRangeHashed(DictionaryFactory & factory)
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
return std::make_unique<RangeHashedDictionary>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
return std::make_unique<RangeHashedDictionary<DictionaryKeyType::simple>>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
};
factory.registerLayout("range_hashed", create_layout, false);
factory.registerLayout("range_hashed", create_layout_simple, false);
/// Factory callback for the 'complex_key_range_hashed' layout.
/// Rejects a structure that declares 'id' or that lacks range_min/range_max,
/// then builds a RangeHashedDictionary<DictionaryKeyType::complex> from the configuration.
auto create_layout_complex = [=](const std::string & full_name,
    const DictionaryStructure & dict_struct,
    const Poco::Util::AbstractConfiguration & config,
    const std::string & config_prefix,
    DictionarySourcePtr source_ptr,
    ContextPtr /* context */,
    bool /*created_from_ddl*/) -> DictionaryPtr
{
    if (dict_struct.id)
        throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for dictionary of layout 'complex_key_range_hashed'");

    /// Both range bounds must be described in the structure.
    const bool has_both_range_bounds = dict_struct.range_min && dict_struct.range_max;
    if (!has_both_range_bounds)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "{}: dictionary of layout 'complex_key_range_hashed' requires .structure.range_min and .structure.range_max",
            full_name);

    const auto storage_id = StorageID::fromDictionaryConfig(config, config_prefix);
    const DictionaryLifetime lifetime{config, config_prefix + ".lifetime"};
    const bool must_be_nonempty = config.getBool(config_prefix + ".require_nonempty", false);

    return std::make_unique<RangeHashedDictionary<DictionaryKeyType::complex>>(
        storage_id, dict_struct, std::move(source_ptr), lifetime, must_be_nonempty);
};
factory.registerLayout("complex_key_range_hashed", create_layout_complex, true);
}
}

View File

@ -16,9 +16,25 @@
namespace DB
{
using RangeStorageType = Int64;
/// Pair of range bounds, both stored as RangeStorageType.
/// NOTE(review): how the bounds are compared (inclusive or not) is decided by contains(),
/// whose definition is not in this header — confirm there.
struct Range
{
/// Left (start) bound of the range.
RangeStorageType left;
/// Right (end) bound of the range.
RangeStorageType right;
/// Validity check for a single date value; the criteria live in the out-of-line definition.
static bool isCorrectDate(const RangeStorageType & date);
/// Presumably reports whether `value` lies between `left` and `right` — verify against the definition.
bool contains(const RangeStorageType & value) const;
};
template <DictionaryKeyType dictionary_key_type>
class RangeHashedDictionary final : public IDictionary
{
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary");
RangeHashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
@ -59,7 +75,7 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::range; }
@ -73,19 +89,8 @@ public:
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
using RangeStorageType = Int64;
Pipe read(const Names & column_names, size_t max_block_size) const override;
struct Range
{
RangeStorageType left;
RangeStorageType right;
static bool isCorrectDate(const RangeStorageType & date);
bool contains(const RangeStorageType & value) const;
};
private:
template <typename T>
struct Value final
@ -96,10 +101,12 @@ private:
template <typename T>
using Values = std::vector<Value<T>>;
template <typename T>
using Collection = HashMap<UInt64, Values<T>>;
template <typename T>
using Ptr = std::unique_ptr<Collection<T>>;
template <typename Value>
using CollectionType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
HashMap<UInt64, Values<Value>>,
HashMapWithSavedHash<StringRef, Values<Value>, DefaultHash<StringRef>>>;
struct Attribute final
{
@ -108,27 +115,27 @@ private:
bool is_nullable;
std::variant<
Ptr<UInt8>,
Ptr<UInt16>,
Ptr<UInt32>,
Ptr<UInt64>,
Ptr<UInt128>,
Ptr<UInt256>,
Ptr<Int8>,
Ptr<Int16>,
Ptr<Int32>,
Ptr<Int64>,
Ptr<Int128>,
Ptr<Int256>,
Ptr<Decimal32>,
Ptr<Decimal64>,
Ptr<Decimal128>,
Ptr<Decimal256>,
Ptr<Float32>,
Ptr<Float64>,
Ptr<UUID>,
Ptr<StringRef>,
Ptr<Array>>
CollectionType<UInt8>,
CollectionType<UInt16>,
CollectionType<UInt32>,
CollectionType<UInt64>,
CollectionType<UInt128>,
CollectionType<UInt256>,
CollectionType<Int8>,
CollectionType<Int16>,
CollectionType<Int32>,
CollectionType<Int64>,
CollectionType<Int128>,
CollectionType<Int256>,
CollectionType<Decimal32>,
CollectionType<Decimal64>,
CollectionType<Decimal128>,
CollectionType<Decimal256>,
CollectionType<Float32>,
CollectionType<Float64>,
CollectionType<UUID>,
CollectionType<StringRef>,
CollectionType<Array>>
maps;
std::unique_ptr<Arena> string_arena;
};
@ -137,9 +144,6 @@ private:
void loadData();
template <typename T>
void addAttributeSize(const Attribute & attribute);
void calculateBytesAllocated();
static Attribute createAttribute(const DictionaryAttribute & dictionary_attribute);
@ -151,35 +155,30 @@ private:
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename AttributeType>
ColumnUInt8::Ptr hasKeysImpl(
const Attribute & attribute,
const PaddedPODArray<UInt64> & ids,
const PaddedPODArray<RangeStorageType> & dates,
size_t & keys_found) const;
template <typename T>
static void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value);
static void setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value);
static void setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value);
const Attribute & getAttribute(const std::string & attribute_name) const;
const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const;
static void setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value);
template <typename RangeType>
void getIdsAndDates(PaddedPODArray<UInt64> & ids, PaddedPODArray<RangeType> & start_dates, PaddedPODArray<RangeType> & end_dates) const;
void getKeysAndDates(
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const;
template <typename T, typename RangeType>
void getIdsAndDates(
void getKeysAndDates(
const Attribute & attribute,
PaddedPODArray<UInt64> & ids,
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const;
template <typename RangeType>
Pipe readImpl(const Names & column_names, size_t max_block_size) const;
StringRef copyKeyInArena(StringRef key);
template <DictionaryKeyType>
friend struct RangeHashedDictionaryCallGetSourceImpl;
const DictionaryStructure dict_struct;
@ -189,6 +188,7 @@ private:
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;
Arena complex_key_arena;
size_t bytes_allocated = 0;
size_t element_count = 0;

View File

@ -6,6 +6,7 @@
namespace DB
{
using DictionaryConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
/// Convert dictionary AST to Poco::AbstractConfiguration
@ -13,4 +14,5 @@ using DictionaryConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfigurati
/// Can throw exception if query is ill-formed
DictionaryConfigurationPtr
getDictionaryConfigurationFromAST(const ASTCreateQuery & query, ContextPtr context, const std::string & database_ = "");
}

View File

@ -250,12 +250,23 @@ public:
}
else
{
if (!WhichDataType(key_column_type).isUInt64())
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Second argument of function {} must be UInt64 when dictionary is range. Actual type {}.",
getName(),
key_column_with_type.type->getName());
/// Functions in external dictionaries_loader only support full-value (not constant) columns with keys.
ColumnPtr key_column = key_column_with_type.column->convertToFullColumnIfConst();
DataTypePtr key_column_type = key_column_with_type.type;
Columns key_columns;
DataTypes key_types;
if (isTuple(key_column_type))
{
key_columns = assert_cast<const ColumnTuple &>(*key_column).getColumnsCopy();
key_types = assert_cast<const DataTypeTuple &>(*key_column_type).getElements();
}
else
{
key_columns = {key_column, range_col};
key_types = {std::make_shared<DataTypeUInt64>(), range_col_type};
}
return dictionary->hasKeys({key_column, range_col}, {std::make_shared<DataTypeUInt64>(), range_col_type});
}
@ -487,18 +498,29 @@ public:
}
else if (dictionary_key_type == DictionaryKeyType::range)
{
if (!WhichDataType(key_col_with_type.type).isUInt64())
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Third argument of function {} must be UInt64 when dictionary is range. Actual type {}.",
getName(),
key_col_with_type.type->getName());
/// Functions in external dictionaries_loader only support full-value (not constant) columns with keys.
ColumnPtr key_column = key_col_with_type.column->convertToFullColumnIfConst();
DataTypePtr key_column_type = key_col_with_type.type;
Columns key_columns;
DataTypes key_types;
if (isTuple(key_column_type))
{
key_columns = assert_cast<const ColumnTuple &>(*key_column).getColumnsCopy();
key_types = assert_cast<const DataTypeTuple &>(*key_column_type).getElements();
}
else
{
key_columns = {key_column, range_col};
key_types = {std::make_shared<DataTypeUInt64>(), range_col_type};
}
result = executeDictionaryRequest(
dictionary,
attribute_names,
{key_column, range_col},
{std::make_shared<DataTypeUInt64>(), range_col_type},
key_columns,
key_types,
result_type,
default_cols);
}

View File

@ -0,0 +1,58 @@
Dictionary not nullable
dictGet
0.2
0.2
0.2
0.2
0.4
dictHas
1
1
1
0
select columns from dictionary
allColumns
2019-05-05 2019-05-20 1 1 0.33
2019-05-21 2019-05-30 1 1 0.42
2019-05-21 2019-05-30 2 2 0.46
noColumns
1
1
1
onlySpecificColumns
1 2019-05-05 0.33
1 2019-05-21 0.42
2 2019-05-21 0.46
onlySpecificColumn
0.33
0.42
0.46
Dictionary nullable
dictGet
0.2
0.2
0.2
0.2
0.4
dictHas
1
1
1
0
select columns from dictionary
allColumns
2019-05-05 2019-05-20 1 1 0.33
2019-05-21 2019-05-30 1 1 0.42
2019-05-21 2019-05-30 2 2 \N
noColumns
1
1
1
onlySpecificColumns
1 2019-05-05 0.33
1 2019-05-21 0.42
2 2019-05-21 \N
onlySpecificColumn
0.33
0.42
\N

View File

@ -0,0 +1,109 @@
-- Scenario 1: complex-key range dictionary with a non-nullable attribute.
-- Source table: one row per (CountryID, CountryKey) and date interval, carrying the Tax value for that interval.
DROP TABLE IF EXISTS date_table;
CREATE TABLE date_table
(
CountryID UInt64,
CountryKey String,
StartDate Date,
EndDate Date,
Tax Float64
)
ENGINE = MergeTree()
ORDER BY CountryID;
-- Key (1, '1') gets two consecutive intervals; key (2, '2') gets a single interval.
INSERT INTO date_table VALUES(1, '1', toDate('2019-05-05'), toDate('2019-05-20'), 0.33);
INSERT INTO date_table VALUES(1, '1', toDate('2019-05-21'), toDate('2019-05-30'), 0.42);
INSERT INTO date_table VALUES(2, '2', toDate('2019-05-21'), toDate('2019-05-30'), 0.46);
-- Dictionary over date_table with a composite primary key (CountryID, CountryKey);
-- the range bounds are taken from StartDate / EndDate and Tax defaults to 0.2.
DROP DICTIONARY IF EXISTS range_dictionary;
CREATE DICTIONARY range_dictionary
(
CountryID UInt64,
CountryKey String,
StartDate Date,
EndDate Date,
Tax Float64 DEFAULT 0.2
)
PRIMARY KEY CountryID, CountryKey
SOURCE(CLICKHOUSE(TABLE 'date_table'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(COMPLEX_KEY_RANGE_HASHED())
RANGE(MIN StartDate MAX EndDate);
SELECT 'Dictionary not nullable';
-- Lookups pass the key as a tuple followed by the date to search for.
-- 2019-05-31 is later than every inserted EndDate.
SELECT 'dictGet';
SELECT dictGet('range_dictionary', 'Tax', (toUInt64(1), '1'), toDate('2019-05-15'));
SELECT dictGet('range_dictionary', 'Tax', (toUInt64(1), '1'), toDate('2019-05-29'));
SELECT dictGet('range_dictionary', 'Tax', (toUInt64(2), '2'), toDate('2019-05-29'));
SELECT dictGet('range_dictionary', 'Tax', (toUInt64(2), '2'), toDate('2019-05-31'));
SELECT dictGetOrDefault('range_dictionary', 'Tax', (toUInt64(2), '2'), toDate('2019-05-31'), 0.4);
-- Same keys and dates, checked for presence only.
SELECT 'dictHas';
SELECT dictHas('range_dictionary', (toUInt64(1), '1'), toDate('2019-05-15'));
SELECT dictHas('range_dictionary', (toUInt64(1), '1'), toDate('2019-05-29'));
SELECT dictHas('range_dictionary', (toUInt64(2), '2'), toDate('2019-05-29'));
SELECT dictHas('range_dictionary', (toUInt64(2), '2'), toDate('2019-05-31'));
-- Reading the dictionary as a table with different column subsets.
SELECT 'select columns from dictionary';
SELECT 'allColumns';
SELECT * FROM range_dictionary;
SELECT 'noColumns';
SELECT 1 FROM range_dictionary;
SELECT 'onlySpecificColumns';
SELECT CountryID, StartDate, Tax FROM range_dictionary;
SELECT 'onlySpecificColumn';
SELECT Tax FROM range_dictionary;
-- Cleanup for scenario 1.
DROP TABLE date_table;
DROP DICTIONARY range_dictionary;
-- Scenario 2: same layout, but the Tax attribute is Nullable(Float64).
CREATE TABLE date_table
(
CountryID UInt64,
CountryKey String,
StartDate Date,
EndDate Date,
Tax Nullable(Float64)
)
ENGINE = MergeTree()
ORDER BY CountryID;
-- Same keys and intervals as scenario 1; the row for key (2, '2') stores NULL as its Tax.
INSERT INTO date_table VALUES(1, '1', toDate('2019-05-05'), toDate('2019-05-20'), 0.33);
INSERT INTO date_table VALUES(1, '1', toDate('2019-05-21'), toDate('2019-05-30'), 0.42);
INSERT INTO date_table VALUES(2, '2', toDate('2019-05-21'), toDate('2019-05-30'), NULL);
-- Dictionary with the composite primary key (CountryID, CountryKey) and a nullable Tax attribute defaulting to 0.2.
CREATE DICTIONARY range_dictionary_nullable
(
CountryID UInt64,
CountryKey String,
StartDate Date,
EndDate Date,
Tax Nullable(Float64) DEFAULT 0.2
)
PRIMARY KEY CountryID, CountryKey
SOURCE(CLICKHOUSE(TABLE 'date_table'))
LIFETIME(MIN 1 MAX 1000)
LAYOUT(COMPLEX_KEY_RANGE_HASHED())
RANGE(MIN StartDate MAX EndDate);
SELECT 'Dictionary nullable';
-- Lookups pass the key as a tuple followed by the date to search for.
-- 2019-05-31 is later than every inserted EndDate.
SELECT 'dictGet';
SELECT dictGet('range_dictionary_nullable', 'Tax', (toUInt64(1), '1'), toDate('2019-05-15'));
SELECT dictGet('range_dictionary_nullable', 'Tax', (toUInt64(1), '1'), toDate('2019-05-29'));
SELECT dictGet('range_dictionary_nullable', 'Tax', (toUInt64(2), '2'), toDate('2019-05-29'));
SELECT dictGet('range_dictionary_nullable', 'Tax', (toUInt64(2), '2'), toDate('2019-05-31'));
SELECT dictGetOrDefault('range_dictionary_nullable', 'Tax', (toUInt64(2), '2'), toDate('2019-05-31'), 0.4);
-- Same keys and dates, checked for presence only.
SELECT 'dictHas';
SELECT dictHas('range_dictionary_nullable', (toUInt64(1), '1'), toDate('2019-05-15'));
SELECT dictHas('range_dictionary_nullable', (toUInt64(1), '1'), toDate('2019-05-29'));
SELECT dictHas('range_dictionary_nullable', (toUInt64(2), '2'), toDate('2019-05-29'));
SELECT dictHas('range_dictionary_nullable', (toUInt64(2), '2'), toDate('2019-05-31'));
-- Reading the dictionary as a table with different column subsets.
SELECT 'select columns from dictionary';
SELECT 'allColumns';
SELECT * FROM range_dictionary_nullable;
SELECT 'noColumns';
SELECT 1 FROM range_dictionary_nullable;
SELECT 'onlySpecificColumns';
SELECT CountryID, StartDate, Tax FROM range_dictionary_nullable;
SELECT 'onlySpecificColumn';
SELECT Tax FROM range_dictionary_nullable;
-- Cleanup for scenario 2.
DROP TABLE date_table;
DROP DICTIONARY range_dictionary_nullable;