Add ability to load hashed dictionaries using multiple threads
Right now dictionaries (here I will talk only about
HASHED/SPARSE_HASHED/COMPLEX_KEY_HASHED/COMPLEX_KEY_SPARSE_HASHED)
can load data in only one thread, since each uses a single hash table
that cannot be filled from multiple threads.
And if you have a very big dictionary (i.e. 10e9 elements), it can
take a while to load, especially for the SPARSE_HASHED variants (and
if you have that many elements, you are likely using SPARSE_HASHED,
since it requires less memory); in my environment it takes ~4 hours,
which is an enormous amount of time.
So this patch adds support for shards in dictionaries. The number of
shards determines how many hash tables the dictionary will use and,
more importantly, how many threads it can use to load the data.
With 16 threads this works 2x faster; not perfect though, see the
follow-up patches in this series.
v0: PARTITION BY
v1: SHARDS 1
v2: SHARDS(1)
v3: tried replacing mod with a logical AND, but it does not gain even 10%
v4: tried squashing more (max_block_size * shards), but it does not gain even 10% either
v5: move SHARDS into layout parameters (unknown parameters are simply ignored)
v6: tune params for perf tests (to avoid too long queries)
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-08-07 15:48:54 +00:00
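For illustration, a minimal sketch of the v5 syntax, with SHARDS as a layout parameter (the dictionary and table names here are hypothetical, not from the patch): each key is routed to one shard, each shard owns its own hash table, and each shard is filled by its own loader thread.

```sql
-- Hypothetical example: a sharded sparse_hashed dictionary.
-- With SHARDS 16, keys are distributed across 16 independent hash
-- tables, so up to 16 threads can fill the dictionary in parallel.
CREATE DICTIONARY huge_dict
(
    key UInt64,
    value UInt16
)
PRIMARY KEY key
SOURCE(CLICKHOUSE(TABLE 'huge_data'))
LAYOUT(SPARSE_HASHED(SHARDS 16))
LIFETIME(0);
```

The tests below exercise this parameter with `shards 10` on both simple and complex-key sparse_hashed layouts.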
drop dictionary if exists dict;
drop dictionary if exists dict_10;
drop dictionary if exists dict_10_uint8;
drop dictionary if exists dict_10_string;
2023-01-03 16:09:22 +00:00
drop dictionary if exists dict_10_incremental;
drop dictionary if exists complex_dict_10;
drop table if exists data;
drop table if exists data_string;
drop table if exists complex_data;
create table data (key UInt64, value UInt16) engine=Memory() as select number, number from numbers(1e5);
create table data_string (key String, value UInt16) engine=Memory() as select 'foo' || number::String, number from numbers(1e5);
create table complex_data (k1 UInt64, k2 UInt64, value UInt16) engine=Memory() as select number, number, number from numbers(1e5);
-- { echoOn }
create dictionary dict (key UInt64, value UInt16) primary key key source(clickhouse(table data)) layout(sparse_hashed()) lifetime(0);
show create dict;
system reload dictionary dict;
select element_count from system.dictionaries where database = currentDatabase() and name = 'dict';
select count() from data where dictGetUInt16('dict', 'value', key) != value;
create dictionary dict_10 (key UInt64, value UInt16) primary key key source(clickhouse(table data)) layout(sparse_hashed(shards 10)) lifetime(0);
show create dict_10;
system reload dictionary dict_10;
select element_count from system.dictionaries where database = currentDatabase() and name = 'dict_10';
select count() from data where dictGetUInt16('dict_10', 'value', key) != value;
create dictionary dict_10_uint8 (key UInt8, value UInt16) primary key key source(clickhouse(table data)) layout(sparse_hashed(shards 10)) lifetime(0);
show create dict_10_uint8;
system reload dictionary dict_10_uint8;
select element_count from system.dictionaries where database = currentDatabase() and name = 'dict_10_uint8';
select count() from data where dictGetUInt16('dict_10_uint8', 'value', key) != value;
create dictionary dict_10_string (key String, value UInt16) primary key key source(clickhouse(table data_string)) layout(sparse_hashed(shards 10)) lifetime(0);
show create dict_10_string;
system reload dictionary dict_10_string; -- { serverError CANNOT_PARSE_TEXT }
2023-01-03 20:14:30 +00:00
create dictionary dict_10_incremental (key UInt64, value UInt16) primary key key source(clickhouse(table data_last_access update_field last_access)) layout(sparse_hashed(shards 10)) lifetime(0);
system reload dictionary dict_10_incremental; -- { serverError BAD_ARGUMENTS }
create dictionary complex_dict_10 (k1 UInt64, k2 UInt64, value UInt16) primary key k1, k2 source(clickhouse(table complex_data)) layout(complex_key_sparse_hashed(shards 10)) lifetime(0);
system reload dictionary complex_dict_10;
select element_count from system.dictionaries where database = currentDatabase() and name = 'complex_dict_10';
select count() from complex_data where dictGetUInt16('complex_dict_10', 'value', (k1, k2)) != value;