mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-30 11:32:03 +00:00
345c422e28
Right now dictionaries (here I will talk only about HASHED/SPARSE_HASHED/COMPLEX_KEY_HASHED/COMPLEX_KEY_SPARSE_HASHED) can load data only in one thread, since they use one hash table that cannot be filled from multiple threads. And in case you have a very big dictionary (e.g. 10e9 elements), it can take a while to load them, especially for SPARSE_HASHED variants (and if you have such an amount of elements there, you are likely to use SPARSE_HASHED, since it requires less memory); in my env it takes ~4 hours, which is an enormous amount of time. So this patch adds support for shards in dictionaries: the number of shards determines how many hash tables the dictionary will use and, more importantly, how many threads it can use to load the data. And with 16 threads this works 2x faster, not perfect though, see the follow-up patches in this series. v0: PARTITION BY; v1: SHARDS 1; v2: SHARDS(1); v3: tried optimized mod - logical and, but it does not gain even 10%; v4: tried squashing more (max_block_size * shards), but it does not gain even 10% either; v5: move SHARDS into layout parameters (unknown simply ignored); v6: tune params for perf tests (to avoid too long queries). Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
83 lines
2.7 KiB
XML
83 lines
2.7 KiB
XML
<test>
|
|
<!-- Named value lists; the queries in this test reference them as {name} placeholders. -->
<substitutions>
    <!-- Layout name used in the dictionary name and in its LAYOUT(...) clause. -->
    <substitution>
        <name>dictionary_layout</name>
        <values><value>hashed</value><value>hashed_array</value></values>
    </substitution>

    <!-- Value passed as the SHARDS layout parameter. -->
    <substitution>
        <name>dictionary_shards</name>
        <values><value>1</value><value>16</value></values>
    </substitution>

    <!-- Hierarchy function invoked by the measured SELECT queries. -->
    <substitution>
        <name>func</name>
        <values><value>dictGetHierarchy</value><value>dictGetDescendants</value></values>
    </substitution>
</substitutions>
|
|
|
|
<!-- Table named as the SOURCE of the dictionaries in this test; stores (id, parent_id) pairs. -->
<create_query>
    CREATE TABLE hierarchical_dictionary_source_table
    (
        id UInt64,
        parent_id UInt64
    )
    ENGINE = MergeTree
    ORDER BY id;
</create_query>
|
|
|
|
<!-- Dictionary over the source table, parameterised by {dictionary_layout} and {dictionary_shards};
     parent_id is declared as the HIERARCHICAL attribute. -->
<create_query>
    CREATE DICTIONARY hierarchical_{dictionary_layout}_shards{dictionary_shards}_dictionary
    (
        id UInt64,
        parent_id UInt64 HIERARCHICAL
    )
    PRIMARY KEY id
    SOURCE(CLICKHOUSE(DB 'default' TABLE 'hierarchical_dictionary_source_table'))
    LAYOUT({dictionary_layout}(SHARDS {dictionary_shards}))
    LIFETIME(0);
</create_query>
|
|
|
|
<!-- FLAT-layout dictionary over the same source table; max_array_size 1000001 covers
     the ids 1..1000000 inserted by the fill query. -->
<create_query>
    CREATE DICTIONARY hierarchical_flat_dictionary
    (
        id UInt64,
        parent_id UInt64 HIERARCHICAL
    )
    PRIMARY KEY id
    SOURCE(CLICKHOUSE(DB 'default' TABLE 'hierarchical_dictionary_source_table'))
    LAYOUT(FLAT(max_array_size 1000001))
    LIFETIME(0);
</create_query>
|
|
|
|
<!-- Inserts ids 1..1000000 with a three-tier parent assignment:
       id > 50000         -> parent in 5001..50000
       5000 < id <= 50000 -> parent in 1..5000
       id <= 5000         -> parent 0 -->
<fill_query>
    INSERT INTO hierarchical_dictionary_source_table
    WITH
        5000 AS first_level_start,
        50000 AS second_level_start
    SELECT
        number + 1 AS id,
        multiIf(
            id > second_level_start, id % (second_level_start - first_level_start) + first_level_start + 1,
            id > first_level_start, id % first_level_start + 1,
            0
        ) AS parent_id
    FROM system.numbers
    LIMIT 1000000;
</fill_query>
|
|
|
|
<!-- Calls {func} on the FLAT dictionary for keys 1..1000000; FORMAT Null suppresses result output. -->
<query>
    SELECT {func}('hierarchical_flat_dictionary', number + 1)
    FROM numbers(1000000)
    FORMAT Null;
</query>
|
|
<!-- Same lookup against the {dictionary_layout} dictionary created with SHARDS {dictionary_shards}. -->
<query>
    SELECT {func}('hierarchical_{dictionary_layout}_shards{dictionary_shards}_dictionary', number + 1)
    FROM numbers(1000000)
    FORMAT Null;
</query>
|
|
|
|
<!-- Cleanup: the dictionaries are dropped first, then the table they read from. -->
<drop_query>
    DROP DICTIONARY IF EXISTS hierarchical_{dictionary_layout}_shards{dictionary_shards}_dictionary;
</drop_query>
<drop_query>
    DROP DICTIONARY IF EXISTS hierarchical_flat_dictionary;
</drop_query>
<drop_query>
    DROP TABLE IF EXISTS hierarchical_dictionary_source_table;
</drop_query>
|
|
</test>
|