DirectDictionary: improve performance of dictHas with duplicate keys

Maksim Kita 2022-07-21 12:12:04 +02:00
parent 9e9969cea7
commit 6443116e80
3 changed files with 127 additions and 6 deletions

src/Dictionaries/DirectDictionary.cpp

@@ -171,13 +171,13 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(
     auto requested_keys = requested_keys_extractor.extractAllKeys();
     size_t requested_keys_size = requested_keys.size();

-    HashMap<KeyType, size_t> requested_key_to_index;
+    HashMap<KeyType, PaddedPODArray<size_t>> requested_key_to_index;
     requested_key_to_index.reserve(requested_keys_size);

     for (size_t i = 0; i < requested_keys.size(); ++i)
     {
         auto requested_key = requested_keys[i];
-        requested_key_to_index[requested_key] = i;
+        requested_key_to_index[requested_key].push_back(i);
     }

     auto result = ColumnUInt8::create(requested_keys_size, false);
@@ -208,10 +208,13 @@ ColumnUInt8::Ptr DirectDictionary<dictionary_key_type>::hasKeys(
             const auto * it = requested_key_to_index.find(block_key);
             assert(it);

-            size_t result_data_found_index = it->getMapped();
-            /// block_keys_size cannot be used, due to duplicates.
-            keys_found += !result_data[result_data_found_index];
-            result_data[result_data_found_index] = true;
+            auto & result_data_found_indexes = it->getMapped();
+            for (size_t result_data_found_index : result_data_found_indexes)
+            {
+                /// block_keys_size cannot be used, due to duplicates.
+                keys_found += !result_data[result_data_found_index];
+                result_data[result_data_found_index] = true;
+            }

             block_keys_extractor.rollbackCurrentKey();
         }
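
The fix in a nutshell: hasKeys used to map each requested key to a single result position, so when one block of arguments contained the same key more than once, only the position stored last was marked and the other duplicates stayed false. Mapping each key to the list of all its positions lets a single lookup fan out to every duplicate. Below is a minimal standalone sketch of the idea; std::unordered_map and std::vector stand in for ClickHouse's HashMap and PaddedPODArray.

    #include <cstdint>
    #include <cstdio>
    #include <unordered_map>
    #include <vector>

    int main()
    {
        // Requested keys as they arrive in one block; key 7 is duplicated.
        std::vector<uint64_t> requested_keys{7, 3, 7, 9};
        std::vector<uint8_t> result(requested_keys.size(), 0);

        // Key -> every position that asked for it (the fix). The old code
        // kept only one position per key, so duplicates were silently lost.
        std::unordered_map<uint64_t, std::vector<size_t>> key_to_indexes;
        key_to_indexes.reserve(requested_keys.size());
        for (size_t i = 0; i < requested_keys.size(); ++i)
            key_to_indexes[requested_keys[i]].push_back(i);

        // Keys actually present in the dictionary source.
        std::vector<uint64_t> found_keys{7, 9};

        size_t keys_found = 0;
        for (uint64_t key : found_keys)
        {
            auto it = key_to_indexes.find(key);
            if (it == key_to_indexes.end())
                continue;
            // Fan the hit out to every duplicate position.
            for (size_t index : it->second)
            {
                keys_found += !result[index];
                result[index] = 1;
            }
        }

        // Prints "1 0 1 1": both occurrences of key 7 are marked.
        for (uint8_t r : result)
            printf("%d ", r);
        printf("\nkeys_found = %zu\n", keys_found);
    }

keys_found is counted per result position, exactly as in the patched loop, which is why the original comment about block_keys_size not being usable still applies.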

New test reference file (expected output)

@@ -0,0 +1,62 @@
0
0
0
1
0
1
0
2
1
0
0 0 1
1 0 1
2 0 1
3 1 0
4 0 1
5 1 0
6 0 1
7 2 0
8 1 0
9 0 1
1
1
1
0
1
0
1
0
0
1
1
1
1
0
1
0
1
0
0
1
1
1
1
0
1
0
1
0
0
1
value_0
value_0
value_0
UNKNOWN
value_0
UNKNOWN
value_0
UNKNOWN
UNKNOWN
value_0
4 0
6 1

New test SQL file

@@ -0,0 +1,56 @@
-- Tags: no-backward-compatibility-check
DROP DATABASE IF EXISTS 02366_dictionary_db;
CREATE DATABASE 02366_dictionary_db;
CREATE TABLE 02366_dictionary_db.dict_data
(
    id UInt64,
    val String
)
ENGINE = Memory;
CREATE TABLE 02366_dictionary_db.lookup_data
(
    id UInt64,
    lookup_key UInt64
)
ENGINE = Memory;
INSERT INTO 02366_dictionary_db.dict_data VALUES(0, 'value_0');
INSERT INTO 02366_dictionary_db.lookup_data VALUES(0, 0);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(1, 0);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(2, 0);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(3, 1);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(4, 0);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(5, 1);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(6, 0);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(7, 2);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(8, 1);
INSERT INTO 02366_dictionary_db.lookup_data VALUES(9, 0);
CREATE DICTIONARY 02366_dictionary_db.dict0
(
    id UInt64,
    val String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'dict_data'))
LAYOUT(DIRECT());
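-- The lookup table deliberately repeats the keys 0, 1 and 2, so a single block of dictHas arguments contains duplicates.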
SELECT lookup_key FROM 02366_dictionary_db.lookup_data ORDER BY id ASC;
SELECT id, lookup_key, dictHas('02366_dictionary_db.dict0', lookup_key) FROM 02366_dictionary_db.lookup_data ORDER BY id ASC;
-- Nesting this way seems to make all the lookups arrive as a single block, although even then it isn't guaranteed
SELECT dictHas('02366_dictionary_db.dict0', lk) FROM (SELECT any(lookup_key) AS lk FROM 02366_dictionary_db.lookup_data GROUP BY id ORDER BY id ASC);
-- Same with this GROUP BY
SELECT dictHas('02366_dictionary_db.dict0', any(lookup_key)) FROM 02366_dictionary_db.lookup_data GROUP BY id ORDER BY id ASC;
SELECT dictHas('02366_dictionary_db.dict0', lookup_key) FROM 02366_dictionary_db.lookup_data ORDER BY id ASC;
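-- dictGetOrDefault must likewise produce a value for every duplicated key.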
SELECT dictGetOrDefault('02366_dictionary_db.dict0', 'val', lookup_key, 'UNKNOWN') FROM 02366_dictionary_db.lookup_data ORDER BY id ASC;
SELECT count(), has FROM 02366_dictionary_db.lookup_data GROUP BY dictHas('02366_dictionary_db.dict0', lookup_key) AS has ORDER BY has ASC;
DROP DICTIONARY 02366_dictionary_db.dict0;
DROP TABLE 02366_dictionary_db.lookup_data;
DROP TABLE 02366_dictionary_db.dict_data;