From 2547fa6c885fec2218d07a3e80f019a2104f6554 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 17 Sep 2018 18:04:57 +0300 Subject: [PATCH 1/2] Fix BlockInputStream for RangeHashed dictionary. --- .../RangeDictionaryBlockInputStream.h | 54 +++++++++--------- .../Dictionaries/RangeHashedDictionary.cpp | 55 ++++++++++++++++--- dbms/src/Dictionaries/RangeHashedDictionary.h | 12 +++- 3 files changed, 82 insertions(+), 39 deletions(-) diff --git a/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h b/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h index beef86cc9a5..93c937e7140 100644 --- a/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -18,7 +18,7 @@ namespace DB * BlockInputStream implementation for external dictionaries * read() returns single block consisting of the in-memory contents of the dictionaries */ -template +template class RangeDictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: @@ -26,7 +26,7 @@ public: RangeDictionaryBlockInputStream( DictionaryPtr dictionary, size_t max_block_size, const Names & column_names, PaddedPODArray && ids_to_fill, - PaddedPODArray && start_dates, PaddedPODArray && end_dates); + PaddedPODArray && start_dates, PaddedPODArray && end_dates); String getName() const override { @@ -57,35 +57,35 @@ private: const PaddedPODArray & values, ColumnsWithTypeAndName & columns) const; Block fillBlock(const PaddedPODArray & ids_to_fill, - const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; + const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; PaddedPODArray makeDateKey( - const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; + const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const; DictionaryPtr dictionary; Names column_names; PaddedPODArray ids; - PaddedPODArray start_dates; - PaddedPODArray end_dates; + PaddedPODArray start_dates; + PaddedPODArray end_dates; }; -template -RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( +template +RangeDictionaryBlockInputStream::RangeDictionaryBlockInputStream( DictionaryPtr dictionary, size_t max_column_size, const Names & column_names, PaddedPODArray && ids, - PaddedPODArray && block_start_dates, PaddedPODArray && block_end_dates) + PaddedPODArray && block_start_dates, PaddedPODArray && block_end_dates) : DictionaryBlockInputStreamBase(ids.size(), max_column_size), dictionary(dictionary), column_names(column_names), ids(std::move(ids)), start_dates(std::move(block_start_dates)), end_dates(std::move(block_end_dates)) { } -template -Block RangeDictionaryBlockInputStream::getBlock(size_t start, size_t length) const +template +Block RangeDictionaryBlockInputStream::getBlock(size_t start, size_t length) const { PaddedPODArray block_ids; - PaddedPODArray block_start_dates; - PaddedPODArray block_end_dates; + PaddedPODArray block_start_dates; + PaddedPODArray block_end_dates; block_ids.reserve(length); block_start_dates.reserve(length); block_end_dates.reserve(length); @@ -100,9 +100,9 @@ Block RangeDictionaryBlockInputStream::getBlock(size_t star return fillBlock(block_ids, block_start_dates, block_end_dates); } -template +template template -ColumnPtr RangeDictionaryBlockInputStream::getColumnFromAttribute( +ColumnPtr RangeDictionaryBlockInputStream::getColumnFromAttribute( DictionaryGetter getter, const PaddedPODArray & ids_to_fill, const PaddedPODArray & dates, const DictionaryAttribute & attribute, const DictionaryType & concrete_dictionary) const { @@ -111,8 +111,8 @@ ColumnPtr RangeDictionaryBlockInputStream::getColumnFromAtt return column_vector; } -template -ColumnPtr RangeDictionaryBlockInputStream::getColumnFromAttributeString( +template +ColumnPtr RangeDictionaryBlockInputStream::getColumnFromAttributeString( const PaddedPODArray & ids_to_fill, const PaddedPODArray & dates, const DictionaryAttribute & attribute, const DictionaryType & concrete_dictionary) const { @@ -121,9 +121,9 @@ ColumnPtr RangeDictionaryBlockInputStream::getColumnFromAtt return column_string; } -template +template template -ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPODArray(const PaddedPODArray & array) const +ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPODArray(const PaddedPODArray & array) const { auto column_vector = ColumnVector::create(); column_vector->getData().reserve(array.size()); @@ -133,9 +133,9 @@ ColumnPtr RangeDictionaryBlockInputStream::getColumnFromPOD } -template +template template -void RangeDictionaryBlockInputStream::addSpecialColumn( +void RangeDictionaryBlockInputStream::addSpecialColumn( const std::optional & attribute, DataTypePtr type, const std::string & default_name, const std::unordered_set & column_names_set, const PaddedPODArray & values, ColumnsWithTypeAndName & columns) const @@ -148,9 +148,9 @@ void RangeDictionaryBlockInputStream::addSpecialColumn( columns.emplace_back(getColumnFromPODArray(values), type, name); } -template -PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( - const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const +template +PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( + const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const { PaddedPODArray key(block_start_dates.size()); for (size_t i = 0; i < key.size(); ++i) @@ -165,10 +165,10 @@ PaddedPODArray RangeDictionaryBlockInputStream::make } -template -Block RangeDictionaryBlockInputStream::fillBlock( +template +Block RangeDictionaryBlockInputStream::fillBlock( const PaddedPODArray & ids_to_fill, - const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const + const PaddedPODArray & block_start_dates, const PaddedPODArray & block_end_dates) const { ColumnsWithTypeAndName columns; const DictionaryStructure & structure = dictionary->getStructure(); diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.cpp b/dbms/src/Dictionaries/RangeHashedDictionary.cpp index e96e83fc078..2cee14db4a3 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.cpp +++ b/dbms/src/Dictionaries/RangeHashedDictionary.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -416,9 +417,10 @@ const RangeHashedDictionary::Attribute & RangeHashedDictionary::getAttributeWith return attribute; } +template void RangeHashedDictionary::getIdsAndDates(PaddedPODArray & ids, - PaddedPODArray & start_dates, - PaddedPODArray & end_dates) const + PaddedPODArray & start_dates, + PaddedPODArray & end_dates) const { const auto & attribute = attributes.front(); @@ -439,10 +441,10 @@ void RangeHashedDictionary::getIdsAndDates(PaddedPODArray & ids, } } -template +template void RangeHashedDictionary::getIdsAndDates(const Attribute & attribute, PaddedPODArray & ids, - PaddedPODArray & start_dates, - PaddedPODArray & end_dates) const + PaddedPODArray & start_dates, + PaddedPODArray & end_dates) const { const HashMap> & attr = *std::get>(attribute.maps); @@ -461,18 +463,53 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute & attribute, PaddedPO } } -BlockInputStreamPtr RangeHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const + +template +BlockInputStreamPtr RangeHashedDictionary::getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const { PaddedPODArray ids; - PaddedPODArray start_dates; - PaddedPODArray end_dates; + PaddedPODArray start_dates; + PaddedPODArray end_dates; getIdsAndDates(ids, start_dates, end_dates); - using BlockInputStreamType = RangeDictionaryBlockInputStream; + using BlockInputStreamType = RangeDictionaryBlockInputStream; auto dict_ptr = std::static_pointer_cast(shared_from_this()); return std::make_shared( dict_ptr, max_block_size, column_names, std::move(ids), std::move(start_dates), std::move(end_dates)); } +struct RangeHashedDIctionaryCallGetBlockInputStreamImpl +{ + BlockInputStreamPtr stream; + const RangeHashedDictionary * dict; + const Names * column_names; + size_t max_block_size; + + template + void operator()() + { + auto & type = dict->dict_struct.range_min->type; + if (!stream && dynamic_cast *>(type.get())) + stream = dict->getBlockInputStreamImpl(*column_names, max_block_size); + } +}; + +BlockInputStreamPtr RangeHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const +{ + using ListType = TypeList; + + RangeHashedDIctionaryCallGetBlockInputStreamImpl callable; + callable.dict = this; + callable.column_names = &column_names; + callable.max_block_size = max_block_size; + + ListType::forEach(callable); + + if (!callable.stream) + throw Exception("Unexpected range type for RangeHashed dictionary: " + dict_struct.range_min->type->getName(), + ErrorCodes::LOGICAL_ERROR); + + return callable.stream; +} } diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.h b/dbms/src/Dictionaries/RangeHashedDictionary.h index 73cb7ff4610..23153afd12c 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.h +++ b/dbms/src/Dictionaries/RangeHashedDictionary.h @@ -162,12 +162,18 @@ private: const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const; + template void getIdsAndDates(PaddedPODArray & ids, - PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; + PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; - template + template void getIdsAndDates(const Attribute & attribute, PaddedPODArray & ids, - PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; + PaddedPODArray & start_dates, PaddedPODArray & end_dates) const; + + template + BlockInputStreamPtr getBlockInputStreamImpl(const Names & column_names, size_t max_block_size) const; + + friend struct RangeHashedDIctionaryCallGetBlockInputStreamImpl; const std::string dictionary_name; const DictionaryStructure dict_struct; From e0521b1abd29a1b072d7fca418eaa06ea89deada Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 17 Sep 2018 20:30:28 +0300 Subject: [PATCH 2/2] Fix range boundaries for RangeHashed dictionary. --- .../Dictionaries/RangeHashedDictionary.cpp | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.cpp b/dbms/src/Dictionaries/RangeHashedDictionary.cpp index 2cee14db4a3..57a948f1037 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.cpp +++ b/dbms/src/Dictionaries/RangeHashedDictionary.cpp @@ -15,7 +15,7 @@ using RangeStorageType = DB::RangeHashedDictionary::RangeStorageType; // To simplify comparison, null value of min bound should be bigger than any other value, // and null value of maxbound - less than any value. const RangeStorageType RANGE_MIN_NULL_VALUE = std::numeric_limits::max(); -const RangeStorageType RANGE_MAX_NULL_VALUE = std::numeric_limits::min(); +const RangeStorageType RANGE_MAX_NULL_VALUE = std::numeric_limits::lowest(); // Handle both kinds of null values: explicit nulls of NullableColumn and 'implicit' nulls of Date type. RangeStorageType getColumnIntValueOrDefault(const DB::IColumn & column, size_t index, bool isDate, const RangeStorageType & default_value) @@ -171,7 +171,7 @@ void RangeHashedDictionary::loadData() const auto & id_column = *block.safeGetByPosition(0).column; // Support old behaviour, where invalid date means 'open range'. - const bool is_date = isDate(block.safeGetByPosition(1).type); + const bool is_date = isDate(dict_struct.range_min->type); const auto & min_range_column = unwrapNullableColumn(*block.safeGetByPosition(1).column); const auto & max_range_column = unwrapNullableColumn(*block.safeGetByPosition(2).column); @@ -185,11 +185,22 @@ void RangeHashedDictionary::loadData() for (const auto row_idx : ext::range(0, id_column.size())) { - const auto min = getColumnIntValueOrDefault(min_range_column, row_idx, is_date, RANGE_MIN_NULL_VALUE); - const auto max = getColumnIntValueOrDefault(max_range_column, row_idx, is_date, RANGE_MAX_NULL_VALUE); + RangeStorageType lower_bound; + RangeStorageType upper_bound; + + if (is_date) + { + lower_bound = getColumnIntValueOrDefault(min_range_column, row_idx, is_date, 0); + upper_bound = getColumnIntValueOrDefault(max_range_column, row_idx, is_date, DATE_LUT_MAX_DAY_NUM + 1); + } + else + { + lower_bound = getColumnIntValueOrDefault(min_range_column, row_idx, is_date, RANGE_MIN_NULL_VALUE); + upper_bound = getColumnIntValueOrDefault(max_range_column, row_idx, is_date, RANGE_MAX_NULL_VALUE); + } setAttributeValue(attribute, id_column.getUInt(row_idx), - Range{min, max}, + Range{lower_bound, upper_bound}, attribute_column[row_idx]); } } @@ -452,6 +463,8 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute & attribute, PaddedPO start_dates.reserve(attr.size()); end_dates.reserve(attr.size()); + const bool is_date = isDate(dict_struct.range_min->type); + for (const auto & key : attr) { for (const auto & value : key.second) @@ -459,6 +472,9 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute & attribute, PaddedPO ids.push_back(key.first); start_dates.push_back(value.range.left); end_dates.push_back(value.range.right); + + if (is_date && static_cast(end_dates.back()) > DATE_LUT_MAX_DAY_NUM) + end_dates.back() = 0; } } }