Merge pull request #1702 from yandex/CLICKHOUSE-3521

Dictionary(range_hashed) fix
This commit is contained in:
alexey-milovidov 2017-12-26 20:38:05 +03:00 committed by GitHub
commit 1d36d61f8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 71 additions and 21 deletions

View File

@ -54,7 +54,7 @@ Block createSampleBlock(const DictionaryStructure & dict_struct)
if (dict_struct.id) if (dict_struct.id)
block.insert(ColumnWithTypeAndName{ block.insert(ColumnWithTypeAndName{
ColumnUInt64::create(1), std::make_shared<DataTypeUInt64>(), dict_struct.id->name}); ColumnUInt64::create(1, 0), std::make_shared<DataTypeUInt64>(), dict_struct.id->name});
if (dict_struct.key) if (dict_struct.key)
{ {
@ -70,7 +70,7 @@ Block createSampleBlock(const DictionaryStructure & dict_struct)
if (dict_struct.range_min) if (dict_struct.range_min)
for (const auto & attribute : { dict_struct.range_min, dict_struct.range_max }) for (const auto & attribute : { dict_struct.range_min, dict_struct.range_max })
block.insert(ColumnWithTypeAndName{ block.insert(ColumnWithTypeAndName{
ColumnUInt16::create(1), std::make_shared<DataTypeDate>(), attribute->name}); ColumnUInt16::create(1, 0), std::make_shared<DataTypeDate>(), attribute->name});
for (const auto & attribute : dict_struct.attributes) for (const auto & attribute : dict_struct.attributes)
{ {

View File

@ -8,6 +8,7 @@
#include <Dictionaries/DictionaryBlockInputStreamBase.h> #include <Dictionaries/DictionaryBlockInputStreamBase.h>
#include <Dictionaries/DictionaryStructure.h> #include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionary.h> #include <Dictionaries/IDictionary.h>
#include <Dictionaries/RangeHashedDictionary.h>
#include <ext/range.h> #include <ext/range.h>
namespace DB namespace DB
@ -57,6 +58,9 @@ private:
Block fillBlock(const PaddedPODArray<Key> & ids, Block fillBlock(const PaddedPODArray<Key> & ids,
const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const; const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const;
PaddedPODArray<UInt16> makeDateKey(
const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const;
DictionatyPtr dictionary; DictionatyPtr dictionary;
Names column_names; Names column_names;
PaddedPODArray<Key> ids; PaddedPODArray<Key> ids;
@ -88,8 +92,8 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::getBlock(size_t star
for (auto idx : ext::range(start, start + length)) for (auto idx : ext::range(start, start + length))
{ {
block_ids.push_back(ids[idx]); block_ids.push_back(ids[idx]);
block_start_dates.push_back(block_start_dates[idx]); block_start_dates.push_back(start_dates[idx]);
block_end_dates.push_back(block_end_dates[idx]); block_end_dates.push_back(end_dates[idx]);
} }
return fillBlock(block_ids, block_start_dates, block_end_dates); return fillBlock(block_ids, block_start_dates, block_end_dates);
@ -144,6 +148,23 @@ void RangeDictionaryBlockInputStream<DictionaryType, Key>::addSpecialColumn(
} }
} }
/// Builds, for every row, the single date that is passed to the dictionary getters
/// as the lookup date for that row's [start, end] range.
///
/// The row's start date is used when Range::isCorrectDate accepts it; otherwise the
/// row's end date is used instead. The result has exactly start_dates.size() elements,
/// and end_dates is indexed with the same row numbers.
template <typename DictionaryType, typename Key>
PaddedPODArray<UInt16> RangeDictionaryBlockInputStream<DictionaryType, Key>::makeDateKey(
    const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const
{
    const size_t rows = start_dates.size();
    PaddedPODArray<UInt16> key(rows);

    for (size_t row = 0; row < rows; ++row)
    {
        const UInt16 range_start = start_dates[row];

        /// end_dates is only read for rows whose start date is rejected.
        key[row] = RangeHashedDictionary::Range::isCorrectDate(range_start)
            ? range_start
            : end_dates[row];
    }

    return key;
}
template <typename DictionaryType, typename Key> template <typename DictionaryType, typename Key>
Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock( Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
const PaddedPODArray<Key>& ids, const PaddedPODArray<Key>& ids,
@ -158,6 +179,8 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
addSpecialColumn(structure.range_min, std::make_shared<DataTypeDate>(), "Range Start", names, start_dates, columns); addSpecialColumn(structure.range_min, std::make_shared<DataTypeDate>(), "Range Start", names, start_dates, columns);
addSpecialColumn(structure.range_max, std::make_shared<DataTypeDate>(), "Range End", names, end_dates, columns); addSpecialColumn(structure.range_max, std::make_shared<DataTypeDate>(), "Range End", names, end_dates, columns);
auto date_key = makeDateKey(start_dates, end_dates);
for (const auto idx : ext::range(0, structure.attributes.size())) for (const auto idx : ext::range(0, structure.attributes.size()))
{ {
const DictionaryAttribute& attribute = structure.attributes[idx]; const DictionaryAttribute& attribute = structure.attributes[idx];
@ -165,7 +188,7 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
{ {
ColumnPtr column; ColumnPtr column;
#define GET_COLUMN_FORM_ATTRIBUTE(TYPE)\ #define GET_COLUMN_FORM_ATTRIBUTE(TYPE)\
column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids, start_dates, attribute, *dictionary) column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids, date_key, attribute, *dictionary)
switch (attribute.underlying_type) switch (attribute.underlying_type)
{ {
case AttributeUnderlyingType::UInt8: case AttributeUnderlyingType::UInt8:
@ -202,7 +225,7 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
GET_COLUMN_FORM_ATTRIBUTE(Float64); GET_COLUMN_FORM_ATTRIBUTE(Float64);
break; break;
case AttributeUnderlyingType::String: case AttributeUnderlyingType::String:
column = getColumnFromAttributeString(ids, start_dates, attribute, *dictionary); column = getColumnFromAttributeString(ids, date_key, attribute, *dictionary);
break; break;
} }

View File

@ -391,10 +391,11 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute& attribute, PaddedPOD
start_dates.reserve(attr.size()); start_dates.reserve(attr.size());
end_dates.reserve(attr.size()); end_dates.reserve(attr.size());
for (const auto & key : attr) { for (const auto & key : attr)
ids.push_back(key.first); {
for (const auto & value : key.second) for (const auto & value : key.second)
{ {
ids.push_back(key.first);
start_dates.push_back(value.range.first); start_dates.push_back(value.range.first);
end_dates.push_back(value.range.second); end_dates.push_back(value.range.second);
} }

View File

@ -82,11 +82,12 @@ public:
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
struct Range : std::pair<UInt16, UInt16> struct Range : std::pair<UInt16, UInt16>
{ {
using std::pair<UInt16, UInt16>::pair; using std::pair<UInt16, UInt16>::pair;
static bool isCorrectDate(const UInt16 date) { return 0 < date && date <= DATE_LUT_MAX_DAY_NUM; }
bool contains(const UInt16 date) const bool contains(const UInt16 date) const
{ {
const auto & left = first; const auto & left = first;
@ -95,8 +96,8 @@ private:
if (left <= date && date <= right) if (left <= date && date <= right)
return true; return true;
const auto has_left_bound = 0 < left && left <= DATE_LUT_MAX_DAY_NUM; const auto has_left_bound = isCorrectDate(left);
const auto has_right_bound = 0 < right && right <= DATE_LUT_MAX_DAY_NUM; const auto has_right_bound = isCorrectDate(right);
if ((!has_left_bound || left <= date) && (!has_right_bound || date <= right)) if ((!has_left_bound || left <= date) && (!has_right_bound || date <= right))
return true; return true;
@ -105,6 +106,7 @@ private:
} }
}; };
private:
template <typename T> template <typename T>
struct Value final struct Value final
{ {

View File

@ -1,6 +1,7 @@
#include <sstream> #include <sstream>
#include <Parsers/ASTCreateQuery.h> #include <Parsers/ASTCreateQuery.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeDate.h>
#include <Dictionaries/IDictionarySource.h> #include <Dictionaries/IDictionarySource.h>
#include <Dictionaries/DictionaryStructure.h> #include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/CacheDictionary.h> #include <Dictionaries/CacheDictionary.h>
@ -87,9 +88,9 @@ NamesAndTypesList StorageDictionary::getNamesAndTypes(const DictionaryStructure
if (dictionary_structure.id) if (dictionary_structure.id)
dictionary_names_and_types.emplace_back(dictionary_structure.id->name, std::make_shared<DataTypeUInt64>()); dictionary_names_and_types.emplace_back(dictionary_structure.id->name, std::make_shared<DataTypeUInt64>());
if (dictionary_structure.range_min) if (dictionary_structure.range_min)
dictionary_names_and_types.emplace_back(dictionary_structure.range_min->name, std::make_shared<DataTypeUInt16>()); dictionary_names_and_types.emplace_back(dictionary_structure.range_min->name, std::make_shared<DataTypeDate>());
if (dictionary_structure.range_max) if (dictionary_structure.range_max)
dictionary_names_and_types.emplace_back(dictionary_structure.range_max->name, std::make_shared<DataTypeUInt16>()); dictionary_names_and_types.emplace_back(dictionary_structure.range_max->name, std::make_shared<DataTypeDate>());
if (dictionary_structure.key) if (dictionary_structure.key)
for (const auto & attribute : *dictionary_structure.key) for (const auto & attribute : *dictionary_structure.key)
dictionary_names_and_types.emplace_back(attribute.name, attribute.type); dictionary_names_and_types.emplace_back(attribute.name, attribute.type);

View File

@ -1,4 +1,4 @@
0 0 0 0 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0 0 0 0 0 0000-00-00 0000-00-00 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0
1 1 1 1 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0 1 1 1 1 1970-01-02 0000-00-00 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0
2 2 2 2 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1 2 2 2 2 0000-00-00 1970-01-02 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1
3 3 3 3 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2 3 3 3 3 1990-01-03 1990-01-03 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2
1 0 0 0 0 0000-00-00 0000-00-00 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0
2 1 1 1 1 1970-01-02 0000-00-00 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0
3 2 2 2 2 0000-00-00 1970-01-02 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1
4 3 3 3 3 1990-01-03 1990-01-03 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2

View File

@ -43,6 +43,9 @@ def generate_structure():
# Complex key dictionaries with (String, UInt8) key # Complex key dictionaries with (String, UInt8) key
['clickhouse_complex_mixed_key_hashed', 2, False], ['clickhouse_complex_mixed_key_hashed', 2, False],
['clickhouse_complex_mixed_key_cache', 2, False], ['clickhouse_complex_mixed_key_cache', 2, False],
# Range hashed dictionary
['clickhouse_range_hashed', 3, False],
] ]
@ -102,6 +105,7 @@ def generate_dictionaries(path, structure):
layout_cache = '<cache><size_in_cells>128</size_in_cells></cache>' layout_cache = '<cache><size_in_cells>128</size_in_cells></cache>'
layout_complex_key_hashed = '<complex_key_hashed />' layout_complex_key_hashed = '<complex_key_hashed />'
layout_complex_key_cache = '<complex_key_cache><size_in_cells>128</size_in_cells></complex_key_cache>' layout_complex_key_cache = '<complex_key_cache><size_in_cells>128</size_in_cells></complex_key_cache>'
layout_range_hashed = '<range_hashed />'
key_simple = ''' key_simple = '''
<id> <id>
@ -135,7 +139,19 @@ def generate_dictionaries(path, structure):
</key> </key>
''' '''
keys = [key_simple, key_complex_integers, key_complex_mixed] key_range_hashed = '''
<id>
<name>id</name>
</id>
<range_min>
<name>StartDate</name>
</range_min>
<range_max>
<name>EndDate</name>
</range_max>
'''
keys = [key_simple, key_complex_integers, key_complex_mixed, key_range_hashed]
parent_attribute = ''' parent_attribute = '''
<attribute> <attribute>
@ -158,6 +174,9 @@ def generate_dictionaries(path, structure):
# Complex key dictionaries with (String, UInt8) key # Complex key dictionaries with (String, UInt8) key
[source_clickhouse, layout_complex_key_hashed], [source_clickhouse, layout_complex_key_hashed],
[source_clickhouse, layout_complex_key_cache], [source_clickhouse, layout_complex_key_cache],
# Range hashed dictionary
[source_clickhouse, layout_range_hashed],
] ]
file_names = [] file_names = []
@ -176,6 +195,7 @@ def generate_dictionaries(path, structure):
class DictionaryTestTable: class DictionaryTestTable:
def __init__(self, source_file_name): def __init__(self, source_file_name):
self.structure = '''id UInt64, key0 UInt8, key0_str String, key1 UInt8, self.structure = '''id UInt64, key0 UInt8, key0_str String, key1 UInt8,
StartDate Date, EndDate Date,
UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64, UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64,
Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64, Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64,
Float32_ Float32, Float64_ Float64, Float32_ Float32, Float64_ Float64,
@ -183,8 +203,8 @@ class DictionaryTestTable:
Date_ Date, DateTime_ DateTime, Parent UInt64''' Date_ Date, DateTime_ DateTime, Parent UInt64'''
self.names_and_types = map(str.split, self.structure.split(',')) self.names_and_types = map(str.split, self.structure.split(','))
self.keys_names_and_types = self.names_and_types[:4] self.keys_names_and_types = self.names_and_types[:6]
self.values_names_and_types = self.names_and_types[4:] self.values_names_and_types = self.names_and_types[6:]
self.source_file_name = source_file_name self.source_file_name = source_file_name
self.rows = None self.rows = None
@ -213,6 +233,7 @@ class DictionaryTestTable:
return '(' + ','.join(map(wrap_value, zip(row, types))) + ')' return '(' + ','.join(map(wrap_value, zip(row, types))) + ')'
values = ','.join(map(make_tuple, lines)) values = ','.join(map(make_tuple, lines))
print query % (self.structure, values)
instance.query(query % (self.structure, values)) instance.query(query % (self.structure, values))
def get_structure_for_keys(self, keys, enable_parent=True): def get_structure_for_keys(self, keys, enable_parent=True):

View File

@ -45,10 +45,12 @@ def started_cluster():
('clickhouse_flat', ('id',), True), ('clickhouse_flat', ('id',), True),
('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False), ('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False),
('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False), ('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False),
('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False),
], ],
ids=['clickhouse_hashed', 'clickhouse_flat', ids=['clickhouse_hashed', 'clickhouse_flat',
'clickhouse_complex_integers_key_hashed', 'clickhouse_complex_integers_key_hashed',
'clickhouse_complex_mixed_key_hashed'] 'clickhouse_complex_mixed_key_hashed',
'clickhouse_range_hashed']
) )
def dictionary_structure(started_cluster, request): def dictionary_structure(started_cluster, request):
return request.param return request.param