Merge pull request #1702 from yandex/CLICKHOUSE-3521

Dictionary(range_hashed) fix
This commit is contained in:
alexey-milovidov 2017-12-26 20:38:05 +03:00 committed by GitHub
commit 1d36d61f8b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 71 additions and 21 deletions

View File

@ -54,7 +54,7 @@ Block createSampleBlock(const DictionaryStructure & dict_struct)
if (dict_struct.id)
block.insert(ColumnWithTypeAndName{
ColumnUInt64::create(1), std::make_shared<DataTypeUInt64>(), dict_struct.id->name});
ColumnUInt64::create(1, 0), std::make_shared<DataTypeUInt64>(), dict_struct.id->name});
if (dict_struct.key)
{
@ -70,7 +70,7 @@ Block createSampleBlock(const DictionaryStructure & dict_struct)
if (dict_struct.range_min)
for (const auto & attribute : { dict_struct.range_min, dict_struct.range_max })
block.insert(ColumnWithTypeAndName{
ColumnUInt16::create(1), std::make_shared<DataTypeDate>(), attribute->name});
ColumnUInt16::create(1, 0), std::make_shared<DataTypeDate>(), attribute->name});
for (const auto & attribute : dict_struct.attributes)
{

View File

@ -8,6 +8,7 @@
#include <Dictionaries/DictionaryBlockInputStreamBase.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/IDictionary.h>
#include <Dictionaries/RangeHashedDictionary.h>
#include <ext/range.h>
namespace DB
@ -57,6 +58,9 @@ private:
Block fillBlock(const PaddedPODArray<Key> & ids,
const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const;
PaddedPODArray<UInt16> makeDateKey(
const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const;
DictionatyPtr dictionary;
Names column_names;
PaddedPODArray<Key> ids;
@ -88,8 +92,8 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::getBlock(size_t star
for (auto idx : ext::range(start, start + length))
{
block_ids.push_back(ids[idx]);
block_start_dates.push_back(block_start_dates[idx]);
block_end_dates.push_back(block_end_dates[idx]);
block_start_dates.push_back(start_dates[idx]);
block_end_dates.push_back(end_dates[idx]);
}
return fillBlock(block_ids, block_start_dates, block_end_dates);
@ -144,6 +148,23 @@ void RangeDictionaryBlockInputStream<DictionaryType, Key>::addSpecialColumn(
}
}
/// Produces one date per row: the row's start date when
/// RangeHashedDictionary::Range::isCorrectDate accepts it, otherwise the row's end date.
/// The result has as many elements as start_dates; end_dates is indexed with the same
/// row numbers, so it is expected to be at least as long.
template <typename DictionaryType, typename Key>
PaddedPODArray<UInt16> RangeDictionaryBlockInputStream<DictionaryType, Key>::makeDateKey(
    const PaddedPODArray<UInt16> & start_dates, const PaddedPODArray<UInt16> & end_dates) const
{
    const size_t rows = start_dates.size();
    PaddedPODArray<UInt16> date_key(rows);

    for (size_t row = 0; row < rows; ++row)
    {
        /// Fall back to the end date only when the start date is rejected by isCorrectDate.
        const bool start_is_usable = RangeHashedDictionary::Range::isCorrectDate(start_dates[row]);
        date_key[row] = start_is_usable ? start_dates[row] : end_dates[row];
    }

    return date_key;
}
template <typename DictionaryType, typename Key>
Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
const PaddedPODArray<Key>& ids,
@ -158,6 +179,8 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
addSpecialColumn(structure.range_min, std::make_shared<DataTypeDate>(), "Range Start", names, start_dates, columns);
addSpecialColumn(structure.range_max, std::make_shared<DataTypeDate>(), "Range End", names, end_dates, columns);
auto date_key = makeDateKey(start_dates, end_dates);
for (const auto idx : ext::range(0, structure.attributes.size()))
{
const DictionaryAttribute& attribute = structure.attributes[idx];
@ -165,7 +188,7 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
{
ColumnPtr column;
#define GET_COLUMN_FORM_ATTRIBUTE(TYPE)\
column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids, start_dates, attribute, *dictionary)
column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids, date_key, attribute, *dictionary)
switch (attribute.underlying_type)
{
case AttributeUnderlyingType::UInt8:
@ -202,7 +225,7 @@ Block RangeDictionaryBlockInputStream<DictionaryType, Key>::fillBlock(
GET_COLUMN_FORM_ATTRIBUTE(Float64);
break;
case AttributeUnderlyingType::String:
column = getColumnFromAttributeString(ids, start_dates, attribute, *dictionary);
column = getColumnFromAttributeString(ids, date_key, attribute, *dictionary);
break;
}

View File

@ -391,10 +391,11 @@ void RangeHashedDictionary::getIdsAndDates(const Attribute& attribute, PaddedPOD
start_dates.reserve(attr.size());
end_dates.reserve(attr.size());
for (const auto & key : attr) {
ids.push_back(key.first);
for (const auto & key : attr)
{
for (const auto & value : key.second)
{
ids.push_back(key.first);
start_dates.push_back(value.range.first);
end_dates.push_back(value.range.second);
}

View File

@ -82,11 +82,12 @@ public:
BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override;
private:
struct Range : std::pair<UInt16, UInt16>
{
using std::pair<UInt16, UInt16>::pair;
static bool isCorrectDate(const UInt16 date) { return 0 < date && date <= DATE_LUT_MAX_DAY_NUM; }
bool contains(const UInt16 date) const
{
const auto & left = first;
@ -95,8 +96,8 @@ private:
if (left <= date && date <= right)
return true;
const auto has_left_bound = 0 < left && left <= DATE_LUT_MAX_DAY_NUM;
const auto has_right_bound = 0 < right && right <= DATE_LUT_MAX_DAY_NUM;
const auto has_left_bound = isCorrectDate(left);
const auto has_right_bound = isCorrectDate(right);
if ((!has_left_bound || left <= date) && (!has_right_bound || date <= right))
return true;
@ -105,6 +106,7 @@ private:
}
};
private:
template <typename T>
struct Value final
{

View File

@ -1,6 +1,7 @@
#include <sstream>
#include <Parsers/ASTCreateQuery.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeDate.h>
#include <Dictionaries/IDictionarySource.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/CacheDictionary.h>
@ -87,9 +88,9 @@ NamesAndTypesList StorageDictionary::getNamesAndTypes(const DictionaryStructure
if (dictionary_structure.id)
dictionary_names_and_types.emplace_back(dictionary_structure.id->name, std::make_shared<DataTypeUInt64>());
if (dictionary_structure.range_min)
dictionary_names_and_types.emplace_back(dictionary_structure.range_min->name, std::make_shared<DataTypeUInt16>());
dictionary_names_and_types.emplace_back(dictionary_structure.range_min->name, std::make_shared<DataTypeDate>());
if (dictionary_structure.range_max)
dictionary_names_and_types.emplace_back(dictionary_structure.range_max->name, std::make_shared<DataTypeUInt16>());
dictionary_names_and_types.emplace_back(dictionary_structure.range_max->name, std::make_shared<DataTypeDate>());
if (dictionary_structure.key)
for (const auto & attribute : *dictionary_structure.key)
dictionary_names_and_types.emplace_back(attribute.name, attribute.type);

View File

@ -1,4 +1,4 @@
0 0 0 0 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0
1 1 1 1 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0
2 2 2 2 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1
3 3 3 3 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2
0 0 0 0 0000-00-00 0000-00-00 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0
1 1 1 1 1970-01-02 0000-00-00 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0
2 2 2 2 0000-00-00 1970-01-02 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1
3 3 3 3 1990-01-03 1990-01-03 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2
1 0 0 0 0 0000-00-00 0000-00-00 50 13874 980694578 980694579 50 13874 980694578 980694579 0 0 4761183170873013810 2007-12-27 1970-01-02 06:51:14 0
2 1 1 1 1 1970-01-02 0000-00-00 48 57392 4083802160 4083802161 48 -8144 -211165136 -211165135 1.5 1.5 10577349846663553072 2037-06-02 1970-01-02 09:50:24 0
3 2 2 2 2 0000-00-00 1970-01-02 69 35909 1447922757 1447922758 69 -29627 1447922757 1447922758 3 3 18198135717204167749 1978-08-08 1970-01-02 03:52:21 1
4 3 3 3 3 1990-01-03 1990-01-03 250 1274 1029309690 1029309691 -6 1274 1029309690 1029309691 4.5 4.5 9624464864560415994 1973-06-28 1970-01-02 03:21:14 2

View File

@ -43,6 +43,9 @@ def generate_structure():
# Complex key dictionaries with (String, UInt8) key
['clickhouse_complex_mixed_key_hashed', 2, False],
['clickhouse_complex_mixed_key_cache', 2, False],
# Range hashed dictionary
['clickhouse_range_hashed', 3, False],
]
@ -102,6 +105,7 @@ def generate_dictionaries(path, structure):
layout_cache = '<cache><size_in_cells>128</size_in_cells></cache>'
layout_complex_key_hashed = '<complex_key_hashed />'
layout_complex_key_cache = '<complex_key_cache><size_in_cells>128</size_in_cells></complex_key_cache>'
layout_range_hashed = '<range_hashed />'
key_simple = '''
<id>
@ -135,7 +139,19 @@ def generate_dictionaries(path, structure):
</key>
'''
keys = [key_simple, key_complex_integers, key_complex_mixed]
key_range_hashed = '''
<id>
<name>id</name>
</id>
<range_min>
<name>StartDate</name>
</range_min>
<range_max>
<name>EndDate</name>
</range_max>
'''
keys = [key_simple, key_complex_integers, key_complex_mixed, key_range_hashed]
parent_attribute = '''
<attribute>
@ -158,6 +174,9 @@ def generate_dictionaries(path, structure):
# Complex key dictionaries with (String, UInt8) key
[source_clickhouse, layout_complex_key_hashed],
[source_clickhouse, layout_complex_key_cache],
# Range hashed dictionary
[source_clickhouse, layout_range_hashed],
]
file_names = []
@ -176,6 +195,7 @@ def generate_dictionaries(path, structure):
class DictionaryTestTable:
def __init__(self, source_file_name):
self.structure = '''id UInt64, key0 UInt8, key0_str String, key1 UInt8,
StartDate Date, EndDate Date,
UInt8_ UInt8, UInt16_ UInt16, UInt32_ UInt32, UInt64_ UInt64,
Int8_ Int8, Int16_ Int16, Int32_ Int32, Int64_ Int64,
Float32_ Float32, Float64_ Float64,
@ -183,8 +203,8 @@ class DictionaryTestTable:
Date_ Date, DateTime_ DateTime, Parent UInt64'''
self.names_and_types = map(str.split, self.structure.split(','))
self.keys_names_and_types = self.names_and_types[:4]
self.values_names_and_types = self.names_and_types[4:]
self.keys_names_and_types = self.names_and_types[:6]
self.values_names_and_types = self.names_and_types[6:]
self.source_file_name = source_file_name
self.rows = None
@ -213,6 +233,7 @@ class DictionaryTestTable:
return '(' + ','.join(map(wrap_value, zip(row, types))) + ')'
values = ','.join(map(make_tuple, lines))
print query % (self.structure, values)
instance.query(query % (self.structure, values))
def get_structure_for_keys(self, keys, enable_parent=True):

View File

@ -45,10 +45,12 @@ def started_cluster():
('clickhouse_flat', ('id',), True),
('clickhouse_complex_integers_key_hashed', ('key0', 'key1'), False),
('clickhouse_complex_mixed_key_hashed', ('key0_str', 'key1'), False),
('clickhouse_range_hashed', ('id', 'StartDate', 'EndDate'), False),
],
ids=['clickhouse_hashed', 'clickhouse_flat',
'clickhouse_complex_integers_key_hashed',
'clickhouse_complex_mixed_key_hashed']
'clickhouse_complex_mixed_key_hashed',
'clickhouse_range_hashed']
)
def dictionary_structure(started_cluster, request):
return request.param