From 2ed66515ffd3b6ae3dfff4fdca742a3995299645 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 25 Dec 2017 21:29:04 +0300 Subject: [PATCH 1/5] Date type for range_min and range_max in Dictionare(range_hashed) [#CLICKHOUSE-3521] --- dbms/src/Storages/StorageDictionary.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/StorageDictionary.cpp b/dbms/src/Storages/StorageDictionary.cpp index eedd412e844..4e55aa92a17 100644 --- a/dbms/src/Storages/StorageDictionary.cpp +++ b/dbms/src/Storages/StorageDictionary.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -89,10 +90,10 @@ NamesAndTypesListPtr StorageDictionary::getNamesAndTypes(const DictionaryStructu std::make_shared())); if (dictionaryStructure.range_min) dictionaryNamesAndTypes->push_back(NameAndTypePair(dictionaryStructure.range_min->name, - std::make_shared())); + std::make_shared())); if (dictionaryStructure.range_max) dictionaryNamesAndTypes->push_back(NameAndTypePair(dictionaryStructure.range_max->name, - std::make_shared())); + std::make_shared())); if (dictionaryStructure.key) for (const auto & attribute : *dictionaryStructure.key) dictionaryNamesAndTypes->push_back(NameAndTypePair(attribute.name, attribute.type)); From 0c082b134d310c6a194feb5a8f3063fcabb8c649 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 25 Dec 2017 22:00:48 +0300 Subject: [PATCH 2/5] fixed engine Dictionary(range_hashed) for open interval date keys [#CLICKHOUSE-3521] --- .../RangeDictionaryBlockInputStream.h | 31 ++++++++++++++++--- .../Dictionaries/RangeHashedDictionary.cpp | 2 +- dbms/src/Dictionaries/RangeHashedDictionary.h | 8 +++-- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h b/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h index 9b6382ee851..02c327ed6eb 100644 --- a/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -8,6 +8,7 @@ #include #include #include +#include #include namespace DB @@ -57,6 +58,9 @@ private: Block fillBlock(const PaddedPODArray & ids, const PaddedPODArray & start_dates, const PaddedPODArray & end_dates) const; + PaddedPODArray makeDateKey( + const PaddedPODArray & start_dates, const PaddedPODArray & end_dates) const; + DictionatyPtr dictionary; Names column_names; PaddedPODArray ids; @@ -88,8 +92,8 @@ Block RangeDictionaryBlockInputStream::getBlock(size_t star for (auto idx : ext::range(start, start + length)) { block_ids.push_back(ids[idx]); - block_start_dates.push_back(block_start_dates[idx]); - block_end_dates.push_back(block_end_dates[idx]); + block_start_dates.push_back(start_dates[idx]); + block_end_dates.push_back(end_dates[idx]); } return fillBlock(block_ids, block_start_dates, block_end_dates); @@ -144,6 +148,23 @@ void RangeDictionaryBlockInputStream::addSpecialColumn( } } +template +PaddedPODArray RangeDictionaryBlockInputStream::makeDateKey( + const PaddedPODArray & start_dates, const PaddedPODArray & end_dates) const +{ + PaddedPODArray key(start_dates.size()); + for (size_t i = 0; i < key.size(); ++i) + { + if (RangeHashedDictionary::Range::isCorrectDate(start_dates[i])) + key[i] = start_dates[i]; + else + key[i] = end_dates[i]; + } + + return key; +} + + template Block RangeDictionaryBlockInputStream::fillBlock( const PaddedPODArray& ids, @@ -158,6 +179,8 @@ Block RangeDictionaryBlockInputStream::fillBlock( addSpecialColumn(structure.range_min, std::make_shared(), "Range Start", names, start_dates, columns); addSpecialColumn(structure.range_max, std::make_shared(), "Range End", names, end_dates, columns); + auto date_key = makeDateKey(start_dates, end_dates); + for (const auto idx : ext::range(0, structure.attributes.size())) { const DictionaryAttribute& attribute = structure.attributes[idx]; @@ -165,7 +188,7 @@ Block RangeDictionaryBlockInputStream::fillBlock( { ColumnPtr column; #define GET_COLUMN_FORM_ATTRIBUTE(TYPE)\ - column = getColumnFromAttribute(&DictionaryType::get##TYPE, ids, start_dates, attribute, *dictionary) + column = getColumnFromAttribute(&DictionaryType::get##TYPE, ids, date_key, attribute, *dictionary) switch (attribute.underlying_type) { case AttributeUnderlyingType::UInt8: @@ -202,7 +225,7 @@ Block RangeDictionaryBlockInputStream::fillBlock( GET_COLUMN_FORM_ATTRIBUTE(Float64); break; case AttributeUnderlyingType::String: - column = getColumnFromAttributeString(ids, start_dates, attribute, *dictionary); + column = getColumnFromAttributeString(ids, date_key, attribute, *dictionary); break; } diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.cpp b/dbms/src/Dictionaries/RangeHashedDictionary.cpp index 7afe71f86dc..68c44683515 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.cpp +++ b/dbms/src/Dictionaries/RangeHashedDictionary.cpp @@ -392,9 +392,9 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute& attribute, PaddedPOD end_dates.reserve(attr.size()); for (const auto & key : attr) { - ids.push_back(key.first); for (const auto & value : key.second) { + ids.push_back(key.first); start_dates.push_back(value.range.first); end_dates.push_back(value.range.second); } diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.h b/dbms/src/Dictionaries/RangeHashedDictionary.h index 5424cf1a206..30a23a247af 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.h +++ b/dbms/src/Dictionaries/RangeHashedDictionary.h @@ -82,11 +82,12 @@ public: BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; -private: struct Range : std::pair { using std::pair::pair; + static bool isCorrectDate(const UInt16 date) { return 0 < date && date <= DATE_LUT_MAX_DAY_NUM; } + bool contains(const UInt16 date) const { const auto & left = first; @@ -95,8 +96,8 @@ private: if (left <= date && date <= right) return true; - const auto has_left_bound = 0 < left && left <= DATE_LUT_MAX_DAY_NUM; - const auto has_right_bound = 0 < right && right <= DATE_LUT_MAX_DAY_NUM; + const auto has_left_bound = isCorrectDate(left); + const auto has_right_bound = isCorrectDate(right); if ((!has_left_bound || left <= date) && (!has_right_bound || date <= right)) return true; @@ -105,6 +106,7 @@ private: } }; +private: template struct Value final { From 64fd3fe35a9b0ffc5ddabc1ceb390c9052bc35bf Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 25 Dec 2017 23:53:57 +0300 Subject: [PATCH 3/5] added test for Dictionary(range_hashed) [#CLICKHOUSE-3521] --- .../configs/dictionaries/source.tsv | 8 +++--- .../generate_dictionaries.py | 27 ++++++++++++++++--- .../integration/test_dictionaries/test.py | 4 ++- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/dbms/tests/integration/test_dictionaries/configs/dictionaries/source.tsv b/dbms/tests/integration/test_dictionaries/configs/dictionaries/source.tsv index d3fc6e1f3cc..438c619ab69 100644 --- a/dbms/tests/integration/test_dictionaries/configs/dictionaries/source.tsv +++ b/dbms/tests/integration/test_dictionaries/configs/dictionaries/source.tsv @@ -1,4 +1,4 @@ -0 0 0 0 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0 -1 1 1 1 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0 -2 2 2 2 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1 -3 3 3 3 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2 +0 0 0 0 0000-00-00 0000-00-00 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0 +1 1 1 1 1970-01-02 0000-00-00 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0 +2 2 2 2 0000-00-00 1970-01-02 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1 +3 3 3 3 1990-01-03 1990-01-03 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2 \ No newline at end of file diff --git a/dbms/tests/integration/test_dictionaries/generate_dictionaries.py b/dbms/tests/integration/test_dictionaries/generate_dictionaries.py index edae0bd451c..0a315a079ba 100644 --- a/dbms/tests/integration/test_dictionaries/generate_dictionaries.py +++ b/dbms/tests/integration/test_dictionaries/generate_dictionaries.py @@ -43,6 +43,9 @@ def generate_structure(): # Complex key dictionaries with (String, UInt8) key ['clickhouse_complex_mixed_key_hashed', 2, False], ['clickhouse_complex_mixed_key_cache', 2, False], + + # Range hashed dictionary + ['clickhouse_range_hashed', 3, False], ] @@ -102,6 +105,7 @@ def generate_dictionaries(path, structure): layout_cache = '128' layout_complex_key_hashed = '' layout_complex_key_cache = '128' + layout_range_hashed = '' key_simple = ''' @@ -135,7 +139,19 @@ def generate_dictionaries(path, structure): ''' - keys = [key_simple, key_complex_integers, key_complex_mixed] + key_range_hashed = ''' + + id + + + StartDate + + + EndDate + + ''' + + keys = [key_simple, key_complex_integers, key_complex_mixed, key_range_hashed] parent_attribute = ''' @@ -158,6 +174,9 @@ def generate_dictionaries(path, structure): # Complex key dictionaries with (String, UInt8) key [source_clickhouse, layout_complex_key_hashed], [source_clickhouse, layout_complex_key_cache], + + # Range hashed dictionary + [source_clickhouse, layout_range_hashed], ] file_names = [] @@ -176,6 +195,7 @@ def generate_dictionaries(path, structure): class DictionaryTestTable: def __init__(self, source_file_name): self.structure = '''id UInt64, key0 UInt8, key0_str String, key1 UInt8, + StartDate Date, EndDate Date, UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64, Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64, Float32_ Float32, Float64_ Float64, @@ -183,8 +203,8 @@ class DictionaryTestTable: Date_ Date, DateTime_ DateTime, Parent UInt64''' self.names_and_types = map(str.split, self.structure.split(',')) - self.keys_names_and_types = self.names_and_types[:4] - self.values_names_and_types = self.names_and_types[4:] + self.keys_names_and_types = self.names_and_types[:6] + self.values_names_and_types = self.names_and_types[6:] self.source_file_name = source_file_name self.rows = None @@ -213,6 +233,7 @@ class DictionaryTestTable: return '(' + ','.join(map(wrap_value, zip(row, types))) + ')' values = ','.join(map(make_tuple, lines)) + print query % (self.structure, values) instance.query(query % (self.structure, values)) def get_structure_for_keys(self, keys, enable_parent=True): diff --git a/dbms/tests/integration/test_dictionaries/test.py b/dbms/tests/integration/test_dictionaries/test.py index 96caf969606..b6eb7e33be8 100644 --- a/dbms/tests/integration/test_dictionaries/test.py +++ b/dbms/tests/integration/test_dictionaries/test.py @@ -45,10 +45,12 @@ def started_cluster(): ('clickhouse_flat', ('id',), True), ('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False), ('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False), + ('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False), ], ids=['clickhouse_hashed', 'clickhouse_flat', 'clickhouse_complex_integers_key_hashed', - 'clickhouse_complex_mixed_key_hashed'] + 'clickhouse_complex_mixed_key_hashed', + 'clickhouse_range_hashed'] ) def dictionary_structure(started_cluster, request): return request.param From ca77f3d8617a4acc66d43027d1d9e1ee1e32a62c Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 26 Dec 2017 15:40:45 +0300 Subject: [PATCH 4/5] fixed NULL value for range hased dictionary range_min and range_max keys [#CLICKHOUSE-3521] --- dbms/src/Dictionaries/DictionarySourceFactory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Dictionaries/DictionarySourceFactory.cpp b/dbms/src/Dictionaries/DictionarySourceFactory.cpp index 2cffe5df00b..da5244d0347 100644 --- a/dbms/src/Dictionaries/DictionarySourceFactory.cpp +++ b/dbms/src/Dictionaries/DictionarySourceFactory.cpp @@ -54,7 +54,7 @@ Block createSampleBlock(const DictionaryStructure & dict_struct) if (dict_struct.id) block.insert(ColumnWithTypeAndName{ - ColumnUInt64::create(1), std::make_shared(), dict_struct.id->name}); + ColumnUInt64::create(1, 0), std::make_shared(), dict_struct.id->name}); if (dict_struct.key) { @@ -70,7 +70,7 @@ Block createSampleBlock(const DictionaryStructure & dict_struct) if (dict_struct.range_min) for (const auto & attribute : { dict_struct.range_min, dict_struct.range_max }) block.insert(ColumnWithTypeAndName{ - ColumnUInt16::create(1), std::make_shared(), attribute->name}); + ColumnUInt16::create(1, 0), std::make_shared(), attribute->name}); for (const auto & attribute : dict_struct.attributes) { From 7a0281dd20fcd110f17cce448a389bb9800492f0 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 26 Dec 2017 20:37:21 +0300 Subject: [PATCH 5/5] Update RangeHashedDictionary.cpp --- dbms/src/Dictionaries/RangeHashedDictionary.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/Dictionaries/RangeHashedDictionary.cpp b/dbms/src/Dictionaries/RangeHashedDictionary.cpp index 68c44683515..544ac19bd76 100644 --- a/dbms/src/Dictionaries/RangeHashedDictionary.cpp +++ b/dbms/src/Dictionaries/RangeHashedDictionary.cpp @@ -391,7 +391,8 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute& attribute, PaddedPOD start_dates.reserve(attr.size()); end_dates.reserve(attr.size()); - for (const auto & key : attr) { + for (const auto & key : attr) + { for (const auto & value : key.second) { ids.push_back(key.first);