diff --git a/.gitmodules b/.gitmodules index ecccf0633e2..519ba082304 100644 --- a/.gitmodules +++ b/.gitmodules @@ -184,7 +184,7 @@ url = https://github.com/ClickHouse-Extras/krb5 [submodule "contrib/cyrus-sasl"] path = contrib/cyrus-sasl - url = https://github.com/cyrusimap/cyrus-sasl + url = https://github.com/ClickHouse-Extras/cyrus-sasl branch = cyrus-sasl-2.1 [submodule "contrib/croaring"] path = contrib/croaring diff --git a/base/common/defines.h b/base/common/defines.h index 39df4698b88..845a53179ef 100644 --- a/base/common/defines.h +++ b/base/common/defines.h @@ -84,10 +84,12 @@ # define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) # define NO_SANITIZE_ADDRESS __attribute__((__no_sanitize__("address"))) # define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) +# define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED __attribute__((__always_inline__, __no_sanitize__("undefined"))) #else /// It does not work in GCC. GCC 7 cannot recognize this attribute and GCC 8 simply ignores it. # define NO_SANITIZE_UNDEFINED # define NO_SANITIZE_ADDRESS # define NO_SANITIZE_THREAD +# define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED ALWAYS_INLINE #endif /// A template function for suppressing warnings about unused variables or function results. 
diff --git a/base/glibc-compatibility/musl/sched_getcpu.c b/base/glibc-compatibility/musl/sched_getcpu.c index 57b8b416043..f290f01d153 100644 --- a/base/glibc-compatibility/musl/sched_getcpu.c +++ b/base/glibc-compatibility/musl/sched_getcpu.c @@ -31,7 +31,7 @@ static void *volatile vdso_func = (void *)getcpu_init; int sched_getcpu(void) { int r; - unsigned cpu; + unsigned cpu = 0; #ifdef VDSO_GETCPU_SYM getcpu_f f = (getcpu_f)vdso_func; diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 9b51891ccf5..f353931f0a0 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -62,6 +62,7 @@ RUN python3 -m pip install \ avro \ cassandra-driver \ confluent-kafka \ + dict2xml \ dicttoxml \ docker \ docker-compose==1.22.0 \ diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index fb510a87fcd..575be721a54 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -53,10 +53,12 @@ function run_tests() if [ "$NUM_TRIES" -gt "1" ]; then ADDITIONAL_OPTIONS+=('--skip') ADDITIONAL_OPTIONS+=('00000_no_tests_to_skip') + ADDITIONAL_OPTIONS+=('--jobs') + ADDITIONAL_OPTIONS+=('4') fi clickhouse-test --testname --shard --zookeeper --hung-check --print-time \ - --test-runs "$NUM_TRIES" --jobs 4 \ + --test-runs "$NUM_TRIES" \ "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ | tee -a test_output/test_result.txt diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 857e148277c..6e864751cc3 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -7,8 +7,6 @@ toc_title: EmbeddedRocksDB This engine allows integrating ClickHouse with [rocksdb](http://rocksdb.org/). 
-`EmbeddedRocksDB` lets you: - ## Creating a Table {#table_engine-EmbeddedRocksDB-creating-a-table} ``` sql @@ -23,6 +21,9 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] Required parameters: - `primary_key_name` – any column name in the column list. +- `primary key` must be specified, it supports only one column in the primary key. The primary key will be serialized in binary as a `rocksdb key`. +- columns other than the primary key will be serialized in binary as `rocksdb` value in corresponding order. +- queries with key `equals` or `in` filtering will be optimized to multi keys lookup from `rocksdb`. Example: @@ -38,8 +39,4 @@ ENGINE = EmbeddedRocksDB PRIMARY KEY key ``` -## Description {#description} - -- `primary key` must be specified, it only supports one column in primary key. The primary key will serialized in binary as rocksdb key. -- columns other than the primary key will be serialized in binary as rocksdb value in corresponding order. -- queries with key `equals` or `in` filtering will be optimized to multi keys lookup from rocksdb. +[Original article](https://clickhouse.tech/docs/en/operations/table_engines/embedded-rocksdb/) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 85a3b8bd941..99d6a0aad5f 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -428,7 +428,7 @@ Possible values: - `'basic'` — Use basic parser. - ClickHouse can parse only the basic `YYYY-MM-DD HH:MM:SS` or `YYYY-MM-DD` format. For example, `'2019-08-20 10:18:56'` or `2019-08-20`. + ClickHouse can parse only the basic `YYYY-MM-DD HH:MM:SS` or `YYYY-MM-DD` format. For example, `2019-08-20 10:18:56` or `2019-08-20`. Default value: `'basic'`. @@ -443,19 +443,19 @@ Allows choosing different output formats of the text representation of date and Possible values: -- `'simple'` - Simple output format. +- `simple` - Simple output format. 
- Clickhouse output date and time `YYYY-MM-DD hh:mm:ss` format. For example, `'2019-08-20 10:18:56'`. Calculation is performed according to the data type's time zone (if present) or server time zone. + Clickhouse output date and time `YYYY-MM-DD hh:mm:ss` format. For example, `2019-08-20 10:18:56`. The calculation is performed according to the data type's time zone (if present) or server time zone. -- `'iso'` - ISO output format. +- `iso` - ISO output format. - Clickhouse output date and time in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DDThh:mm:ssZ` format. For example, `'2019-08-20T10:18:56Z'`. Note that output is in UTC (`Z` means UTC). + Clickhouse output date and time in [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DDThh:mm:ssZ` format. For example, `2019-08-20T10:18:56Z`. Note that output is in UTC (`Z` means UTC). -- `'unix_timestamp'` - Unix timestamp output format. +- `unix_timestamp` - Unix timestamp output format. - Clickhouse output date and time in [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time) format. For example `'1566285536'`. + Clickhouse output date and time in [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time) format. For example `1566285536`. -Default value: `'simple'`. +Default value: `simple`. See also: diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 3b02e145ff4..4b3bf12aa8c 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -241,7 +241,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) **Parameters** -- `window` — Length of the sliding window in seconds. +- `window` — Length of the sliding window. The unit of `window` depends on the timestamp itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`. 
- `mode` - It is an optional argument. - `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values. - `timestamp` — Name of the column containing the timestamp. Data types supported: [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, it’s value can’t exceed the Int64 maximum, which is 2^63 - 1). diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index bef2f8137d0..9e360abfe26 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -11,7 +11,7 @@ Key length depends on encryption mode. It is 16, 24, and 32 bytes long for `-128 Initialization vector length is always 16 bytes (bytes in excess of 16 are ignored). -Note that these functions work slowly. +Note that these functions work slowly until ClickHouse 21.1. ## encrypt {#encrypt} @@ -41,7 +41,7 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad]) **Returned value** -- Ciphered String. [String](../../sql-reference/data-types/string.md#string). +- Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string). 
**Examples** @@ -52,57 +52,38 @@ Query: ``` sql CREATE TABLE encryption_test ( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; + `comment` String, + `secret` String +) +ENGINE = Memory ``` -Insert this data: +Insert some data (please avoid storing the keys/ivs in the database as this undermines the whole concept of encryption), also storing 'hints' is unsafe too and used only for illustrative purposes: Query: ``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); +INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\ +('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\ +('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\ +('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212')); ``` -Example without `iv`: - Query: ``` sql -SELECT 'aes-128-ecb' AS mode, hex(encrypt(mode, input, key16)) FROM encryption_test; +SELECT comment, hex(secret) FROM encryption_test; ``` Result: ``` text -┌─mode────────┬─hex(encrypt('aes-128-ecb', input, key16))────────────────────────┐ -│ aes-128-ecb │ 4603E6862B0D94BBEC68E0B0DF51D60F │ -│ aes-128-ecb │ 3004851B86D3F3950672DE7085D27C03 │ -│ aes-128-ecb │ E807F8C8D40A11F65076361AFC7D8B68D8658C5FAA6457985CAA380F16B3F7E4 │ -└─────────────┴──────────────────────────────────────────────────────────────────┘ -``` - -Example with `iv`: - -Query: - -``` sql -SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) 
FROM encryption_test; -``` - -Result: - -``` text -┌─mode────────┬─hex(encrypt('aes-256-ctr', input, key32, iv))─┐ -│ aes-256-ctr │ │ -│ aes-256-ctr │ 7FB039F7 │ -│ aes-256-ctr │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2B325949 │ -└─────────────┴───────────────────────────────────────────────┘ +┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-cfb128 no IV │ B4972BDC4459 │ +│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ +│ aes-256-cfb128 with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└─────────────────────────────────────┴──────────────────────────────────┘ ``` Example with `-gcm`: @@ -110,40 +91,26 @@ Example with `-gcm`: Query: ``` sql -SELECT 'aes-256-gcm' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test; +INSERT INTO encryption_test VALUES('aes-256-gcm', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')), \ +('aes-256-gcm with AAD', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv', 'aad')); + +SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%'; ``` Result: ``` text -┌─mode────────┬─hex(encrypt('aes-256-gcm', input, key32, iv))──────────────────────────┐ -│ aes-256-gcm │ E99DBEBC01F021758352D7FBD9039EFA │ -│ aes-256-gcm │ 8742CE3A7B0595B281C712600D274CA881F47414 │ -│ aes-256-gcm │ A44FD73ACEB1A64BDE2D03808A2576EDBB60764CC6982DB9AF2C33C893D91B00C60DC5 │ -└─────────────┴────────────────────────────────────────────────────────────────────────┘ -``` - -Example with `-gcm` mode and with `aad`: - -Query: - -``` sql -SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM encryption_test; -``` - -Result: - -``` text -┌─mode────────┬─hex(encrypt('aes-192-gcm', input, key24, iv, 'AAD'))───────────────────┐ -│ aes-192-gcm │ 04C13E4B1D62481ED22B3644595CB5DB │ -│ aes-192-gcm │ 9A6CF0FD2B329B04EAD18301818F016DF8F77447 │ -│ aes-192-gcm │ 
B961E9FD9B940EBAD7ADDA75C9F198A40797A5EA1722D542890CC976E21113BBB8A7AA │ -└─────────────┴────────────────────────────────────────────────────────────────────────┘ +┌─comment──────────────┬─hex(secret)──────────────────────────────────┐ +│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ +│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ +└──────────────────────┴──────────────────────────────────────────────┘ ``` ## aes_encrypt_mysql {#aes_encrypt_mysql} -Compatible with mysql encryption and can be decrypted with [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt) function. +Compatible with mysql encryption and resulting ciphertext can be decrypted with [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt) function. + +Will produce same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `IV`. Supported encryption modes: @@ -156,7 +123,7 @@ Supported encryption modes: **Syntax** -```sql +``` sql aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) ``` @@ -164,78 +131,98 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) - `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). - `plaintext` — Text that needs to be encrypted. [String](../../sql-reference/data-types/string.md#string). -- `key` — Encryption key. [String](../../sql-reference/data-types/string.md#string). -- `iv` — Initialization vector. Optinal. [String](../../sql-reference/data-types/string.md#string). +- `key` — Encryption key. If key is longer than required by mode, MySQL-specific key folding is performed. [String](../../sql-reference/data-types/string.md#string). +- `iv` — Initialization vector. 
Optional. Only the first 16 bytes are taken into account. [String](../../sql-reference/data-types/string.md#string). **Returned value** -- Ciphered String. [String](../../sql-reference/data-types/string.md#string). +- Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string). + **Examples** -Create this table: +Given equal input `encrypt` and `aes_encrypt_mysql` produce the same ciphertext: Query: ``` sql -CREATE TABLE encryption_test -( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; +SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal; ``` -Insert this data: +Result: -Query: - -``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); +``` +┌─ciphertexts_equal─┐ +│ 1 │ +└───────────────────┘ ``` -Example without `iv`: + +But `encrypt` fails when `key` or `iv` is longer than expected: Query: ``` sql -SELECT 'aes-128-cbc' AS mode, hex(aes_encrypt_mysql(mode, input, key32)) FROM encryption_test; +SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'); ``` Result: ``` text -┌─mode────────┬─hex(aes_encrypt_mysql('aes-128-cbc', input, key32))──────────────┐ -│ aes-128-cbc │ FEA8CFDE6EE2C6E7A2CC6ADDC9F62C83 │ -│ aes-128-cbc │ 78B16CD4BE107660156124C5FEE6454A │ -│ aes-128-cbc │ 67C0B119D96F18E2823968D42871B3D179221B1E7EE642D628341C2B29BA2E18 │ -└─────────────┴──────────────────────────────────────────────────────────────────┘ +Received exception from server 
(version 21.1.2): +Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). ``` -Example with `iv`: +While `aes_encrypt_mysql` produces MySQL-compatible output: Query: ``` sql -SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv)) FROM encryption_test; +SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext; +``` + +Result: + +```text +┌─ciphertext───┐ +│ 24E9E4966469 │ +└──────────────┘ +``` + +Notice how supplying an even longer `IV` produces the same result: + +Query: + +``` sql +SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext ``` Result: ``` text -┌─mode───────────┬─hex(aes_encrypt_mysql('aes-256-cfb128', input, key32, iv))─┐ -│ aes-256-cfb128 │ │ -│ aes-256-cfb128 │ 7FB039F7 │ -│ aes-256-cfb128 │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2BB5174F │ -└────────────────┴────────────────────────────────────────────────────────────┘ +┌─ciphertext───┐ +│ 24E9E4966469 │ +└──────────────┘ +``` + +Which is binary equal to what MySQL produces on same inputs: + +``` sql +mysql> SET block_encryption_mode='aes-256-cfb128'; +Query OK, 0 rows affected (0.00 sec) + +mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; ++------------------------+ +| ciphertext | ++------------------------+ +| 0x24E9E4966469 | ++------------------------+ +1 row in set (0.00 sec) ``` ## decrypt {#decrypt} -This function decrypts data using these modes: +This function decrypts ciphertext into a plaintext using these modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc @@ -247,7 +234,7 @@ This function decrypts data using these modes: **Syntax** -```sql +``` sql 
decrypt('mode', 'ciphertext', 'key' [, iv, aad]) ``` @@ -265,51 +252,56 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad]) **Examples** -Create this table: +Re-using table from [encrypt](./encryption-functions.md#encrypt). Query: ``` sql -CREATE TABLE encryption_test -( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; -``` - -Insert this data: - -Query: - -``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); -``` - -Query: - -``` sql - -SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16) FROM encryption_test; +SELECT comment, hex(secret) FROM encryption_test; ``` Result: -```text -┌─mode────────┬─decrypt('aes-128-ecb', encrypt('aes-128-ecb', input, key16), key16)─┐ -│ aes-128-ecb │ │ -│ aes-128-ecb │ text │ -│ aes-128-ecb │ What Is ClickHouse? │ -└─────────────┴─────────────────────────────────────────────────────────────────────┘ +``` text +┌─comment──────────────┬─hex(secret)──────────────────────────────────┐ +│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │ +│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │ +└──────────────────────┴──────────────────────────────────────────────┘ +┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐ +│ aes-256-cfb128 no IV │ B4972BDC4459 │ +│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │ +│ aes-256-cfb128 with IV │ 5E6CB398F653 │ +│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │ +└─────────────────────────────────────┴──────────────────────────────────┘ ``` +Now let's try to decrypt all that data. 
+ +Query: + +``` sql +SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test +``` + +Result: +``` text +┌─comment─────────────────────────────┬─plaintext─┐ +│ aes-256-cfb128 no IV │ Secret │ +│ aes-256-cfb128 no IV, different key │ �4� + � │ +│ aes-256-cfb128 with IV │ ���6�~ │ + │aes-256-cbc no IV │ �2*4�h3c�4w��@ +└─────────────────────────────────────┴───────────┘ +``` + +Notice how only portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption. + ## aes_decrypt_mysql {#aes_decrypt_mysql} Compatible with mysql encryption and decrypts data encrypted with [AES_ENCRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-encrypt) function. +Will produce same plaintext as `decrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_decrypt_mysql` will stick to what MySQL's `aes_decrypt` does: 'fold' `key` and ignore excess bits of `IV`. 
+ Supported decryption modes: - aes-128-ecb, aes-192-ecb, aes-256-ecb @@ -321,7 +313,7 @@ Supported decryption modes: **Syntax** -```sql +``` sql aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) ``` @@ -338,44 +330,30 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) **Examples** -Create this table: - -Query: - +Let's decrypt data we've previously encrypted with MySQL: ``` sql -CREATE TABLE encryption_test -( - input String, - key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'), - iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'), - key32 String DEFAULT substring(key, 1, 32), - key24 String DEFAULT substring(key, 1, 24), - key16 String DEFAULT substring(key, 1, 16) -) Engine = Memory; -``` +mysql> SET block_encryption_mode='aes-256-cfb128'; +Query OK, 0 rows affected (0.00 sec) -Insert this data: - -Query: - -``` sql -INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?'); +mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext; ++------------------------+ +| ciphertext | ++------------------------+ +| 0x24E9E4966469 | ++------------------------+ +1 row in set (0.00 sec) ``` Query: - ``` sql -SELECT 'aes-128-cbc' AS mode, aes_decrypt_mysql(mode, aes_encrypt_mysql(mode, input, key), key) FROM encryption_test; +SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext ``` Result: - ``` text -┌─mode────────┬─aes_decrypt_mysql('aes-128-cbc', aes_encrypt_mysql('aes-128-cbc', input, key), key)─┐ -│ aes-128-cbc │ │ -│ aes-128-cbc │ text │ -│ aes-128-cbc │ What Is ClickHouse? 
│ -└─────────────┴─────────────────────────────────────────────────────────────────────────────────────┘ +┌─plaintext─┐ +│ Secret │ +└───────────┘ ``` [Original article](https://clickhouse.tech/docs/en/sql-reference/functions/encryption_functions/) diff --git a/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md new file mode 100644 index 00000000000..9b68bcfc770 --- /dev/null +++ b/docs/ru/engines/table-engines/integrations/embedded-rocksdb.md @@ -0,0 +1,44 @@ +--- +toc_priority: 6 +toc_title: EmbeddedRocksDB +--- + +# Движок EmbeddedRocksDB {#EmbeddedRocksDB-engine} + +Этот движок позволяет интегрировать ClickHouse с [rocksdb](http://rocksdb.org/). + +## Создание таблицы {#table_engine-EmbeddedRocksDB-creating-a-table} + +``` sql +CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] +( + name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], + name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], + ... +) ENGINE = EmbeddedRocksDB +PRIMARY KEY(primary_key_name); +``` + +Обязательные параметры: + +- `primary_key_name` может быть любое имя столбца из списка столбцов. +- Указание первичного ключа `primary key` является обязательным. Он будет сериализован в двоичном формате как ключ `rocksdb`. +- Поддерживается только один столбец в первичном ключе. +- Столбцы, которые отличаются от первичного ключа, будут сериализованы в двоичном формате как значение `rocksdb` в соответствующем порядке. +- Запросы с фильтрацией по ключу `equals` или `in` оптимизируются для поиска по нескольким ключам из `rocksdb`. 
+ +Пример: + +``` sql +CREATE TABLE test +( + `key` String, + `v1` UInt32, + `v2` String, + `v3` Float32, +) +ENGINE = EmbeddedRocksDB +PRIMARY KEY key; +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/table_engines/embedded-rocksdb/) \ No newline at end of file diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 73dc0b9d944..bacc97dfd14 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -406,21 +406,46 @@ INSERT INTO table_with_enum_column_for_tsv_insert FORMAT TSV 102 2; Возможные значения: -- `'best_effort'` — включает расширенный парсинг. +- `best_effort` — включает расширенный парсинг. -ClickHouse может парсить базовый формат `YYYY-MM-DD HH:MM:SS` и все форматы [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). Например, `'2018-06-08T01:02:03.000Z'`. +ClickHouse может парсить базовый формат `YYYY-MM-DD HH:MM:SS` и все форматы [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601). Например, `2018-06-08T01:02:03.000Z`. -- `'basic'` — используется базовый парсер. +- `basic` — используется базовый парсер. -ClickHouse может парсить только базовый формат `YYYY-MM-DD HH:MM:SS` или `YYYY-MM-DD`. Например, `'2019-08-20 10:18:56'` или `2019-08-20`. +ClickHouse может парсить только базовый формат `YYYY-MM-DD HH:MM:SS` или `YYYY-MM-DD`. Например, `2019-08-20 10:18:56` или `2019-08-20`. -Значение по умолчанию: `'basic'`. +Значение по умолчанию: `basic`. См. также: - [Тип данных DateTime.](../../sql-reference/data-types/datetime.md) - [Функции для работы с датой и временем.](../../sql-reference/functions/date-time-functions.md) +## date_time_output_format {#settings-date_time_output_format} + +Позволяет выбрать разные выходные форматы текстового представления даты и времени. + +Возможные значения: + +- `simple` - простой выходной формат. + + Выходные дата и время Clickhouse в формате `YYYY-MM-DD hh:mm:ss`. Например, `2019-08-20 10:18:56`. 
Расчет выполняется в соответствии с часовым поясом типа данных (если он есть) или часовым поясом сервера. + +- `iso` - выходной формат ISO. + + Выходные дата и время Clickhouse в формате [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) `YYYY-MM-DDThh:mm:ssZ`. Например, `2019-08-20T10:18:56Z`. Обратите внимание, что выходные данные отображаются в формате UTC (`Z` означает UTC). + +- `unix_timestamp` - выходной формат Unix. + + Выходные дата и время в формате [Unix](https://en.wikipedia.org/wiki/Unix_time). Например `1566285536`. + +Значение по умолчанию: `simple`. + +См. также: + +- [Тип данных DateTime](../../sql-reference/data-types/datetime.md) +- [Функции для работы с датой и временем](../../sql-reference/functions/date-time-functions.md) + ## join_default_strictness {#settings-join_default_strictness} Устанавливает строгость по умолчанию для [JOIN](../../sql-reference/statements/select/join.md#select-join). diff --git a/docs/ru/sql-reference/data-types/datetime.md b/docs/ru/sql-reference/data-types/datetime.md index 9894fa2802b..ffdf83e5bd0 100644 --- a/docs/ru/sql-reference/data-types/datetime.md +++ b/docs/ru/sql-reference/data-types/datetime.md @@ -27,7 +27,7 @@ DateTime([timezone]) Консольный клиент ClickHouse по умолчанию использует часовой пояс сервера, если для значения `DateTime` часовой пояс не был задан в явном виде при инициализации типа данных. Чтобы использовать часовой пояс клиента, запустите [clickhouse-client](../../interfaces/cli.md) с параметром `--use_client_time_zone`. -ClickHouse отображает значения типа `DateTime` в формате `YYYY-MM-DD hh:mm:ss`. Отображение можно поменять с помощью функции [formatDateTime](../../sql-reference/data-types/datetime.md#formatdatetime). +ClickHouse отображает значения в зависимости от значения параметра [date\_time\_output\_format](../../operations/settings/settings.md#settings-date_time_output_format). Текстовый формат по умолчанию `YYYY-MM-DD hh:mm:ss`. 
Кроме того, вы можете поменять отображение с помощью функции [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime). При вставке данных в ClickHouse, можно использовать различные форматы даты и времени в зависимости от значения настройки [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format). diff --git a/docs/zh/development/style.md b/docs/zh/development/style.md index 8f104e3a7d8..c8e883920dd 100644 --- a/docs/zh/development/style.md +++ b/docs/zh/development/style.md @@ -118,7 +118,7 @@ for (auto & stream : streams) stream.second->finalize(); ``` -**18.** 行的某尾不应该包含空格。 +**18.** 行的末尾不应该包含空格。 **19.** 源文件应该用 UTF-8 编码。 diff --git a/docs/zh/engines/table-engines/mergetree-family/mergetree.md b/docs/zh/engines/table-engines/mergetree-family/mergetree.md index 2fffcbe7ef3..353dd5f5bc8 100644 --- a/docs/zh/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/mergetree.md @@ -401,7 +401,7 @@ TTL date_time + INTERVAL 15 HOUR ### 列 TTL {#mergetree-column-ttl} -当列中的值过期时, ClickHouse会将它们替换成该列数据类型的默认值。如果数据片段中列的所有值均已过期,则ClickHouse 会从文件系统中的数据片段中此列。 +当列中的值过期时, ClickHouse会将它们替换成该列数据类型的默认值。如果数据片段中列的所有值均已过期,则ClickHouse 会从文件系统中的数据片段中删除此列。 `TTL`子句不能被用于主键字段。 diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 06bd7d84526..9a8b580407a 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -932,6 +932,10 @@ private: std::cerr << "Received exception from server (version " << server_version << "):" << std::endl << "Code: " << server_exception->code() << ". 
" << text << std::endl; + if (is_interactive) + { + std::cerr << std::endl; + } } if (client_exception) @@ -939,6 +943,10 @@ private: fmt::print(stderr, "Error on processing query '{}':\n{}\n", full_query, client_exception->message()); + if (is_interactive) + { + fmt::print(stderr, "\n"); + } } // A debug check -- at least some exception must be set, if the error diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4207d371c09..dba9385fe27 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -190,6 +190,7 @@ add_object_library(clickhouse_processors_sources Processors/Sources) add_object_library(clickhouse_processors_merges Processors/Merges) add_object_library(clickhouse_processors_merges_algorithms Processors/Merges/Algorithms) add_object_library(clickhouse_processors_queryplan Processors/QueryPlan) +add_object_library(clickhouse_processors_queryplan_optimizations Processors/QueryPlan/Optimizations) set (DBMS_COMMON_LIBRARIES) # libgcc_s does not provide an implementation of an atomics library. Instead, diff --git a/src/Client/tests/CMakeLists.txt b/src/Client/tests/CMakeLists.txt index e69de29bb2d..d952c006bb5 100644 --- a/src/Client/tests/CMakeLists.txt +++ b/src/Client/tests/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(test-connect test_connect.cpp) +target_link_libraries (test-connect PRIVATE dbms) diff --git a/src/Client/tests/test_connect.cpp b/src/Client/tests/test_connect.cpp new file mode 100644 index 00000000000..1259980f9a6 --- /dev/null +++ b/src/Client/tests/test_connect.cpp @@ -0,0 +1,99 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +/** In a loop it connects to the server and immediately breaks the connection. + * Using the SO_LINGER option, we ensure that the connection is terminated by sending a RST packet (not FIN). + * Long time ago this behavior caused a bug in the TCPServer implementation in the Poco library. 
+ */ +int main(int argc, char ** argv) +try +{ + size_t num_iterations = 1; + size_t num_threads = 1; + std::string host = "localhost"; + uint16_t port = 9000; + + if (argc >= 2) + num_iterations = DB::parse(argv[1]); + + if (argc >= 3) + num_threads = DB::parse(argv[2]); + + if (argc >= 4) + host = argv[3]; + + if (argc >= 5) + port = DB::parse(argv[4]); + + std::atomic_bool cancel{false}; + std::vector threads(num_threads); + for (auto & thread : threads) + { + thread = std::thread([&] + { + for (size_t i = 0; i < num_iterations && !cancel.load(std::memory_order_relaxed); ++i) + { + std::cerr << "."; + + Poco::Net::SocketAddress address(host, port); + + int fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + + if (fd < 0) + DB::throwFromErrno("Cannot create socket", 0); + + linger linger_value; + linger_value.l_onoff = 1; + linger_value.l_linger = 0; + + if (0 != setsockopt(fd, SOL_SOCKET, SO_LINGER, &linger_value, sizeof(linger_value))) + DB::throwFromErrno("Cannot set linger", 0); + + try + { + Stopwatch watch; + + int res = connect(fd, address.addr(), address.length()); + + if (res != 0 && errno != EINPROGRESS && errno != EWOULDBLOCK) + { + close(fd); + DB::throwFromErrno("Cannot connect", 0); + } + + close(fd); + + if (watch.elapsedSeconds() > 0.1) + { + std::cerr << watch.elapsedSeconds() << "\n"; + cancel = true; + break; + } + } + catch (const Poco::Exception & e) + { + std::cerr << e.displayText() << "\n"; + } + } + }); + } + + for (auto & thread : threads) + thread.join(); + + std::cerr << "\n"; +} +catch (const Poco::Exception & e) +{ + std::cerr << e.displayText() << "\n"; +} diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index a20e5d3ca0d..55e387ff2ee 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -289,7 +289,8 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result while (filt_pos < filt_end_sse) { - int mask = 
_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos)), zero16)); + UInt16 mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos)), zero16)); + mask = ~mask; if (0 == mask) { diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index fcbcc63731a..a075c10a8a9 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -356,7 +356,8 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter & filt, ssize_t result_s while (filt_pos < filt_end_sse) { - int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos)), zero16)); + UInt16 mask = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_loadu_si128(reinterpret_cast(filt_pos)), zero16)); + mask = ~mask; if (0 == mask) { diff --git a/src/Columns/ColumnsCommon.cpp b/src/Columns/ColumnsCommon.cpp index f3f10a25df3..3c356afa4da 100644 --- a/src/Columns/ColumnsCommon.cpp +++ b/src/Columns/ColumnsCommon.cpp @@ -17,13 +17,17 @@ namespace DB static UInt64 toBits64(const Int8 * bytes64) { static const __m128i zero16 = _mm_setzero_si128(); - return static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(bytes64)), zero16))) - | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(bytes64 + 16)), zero16))) - << 16) - | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(bytes64 + 32)), zero16))) - << 32) - | (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast(bytes64 + 48)), zero16))) - << 48); + UInt64 res = + static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( + _mm_loadu_si128(reinterpret_cast(bytes64)), zero16))) + | (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( + _mm_loadu_si128(reinterpret_cast(bytes64 + 16)), zero16))) << 16) + | (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( + _mm_loadu_si128(reinterpret_cast(bytes64 + 32)), zero16))) << 32) + | (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi8( + 
_mm_loadu_si128(reinterpret_cast(bytes64 + 48)), zero16))) << 48); + + return ~res; } #endif @@ -49,7 +53,7 @@ size_t countBytesInFilter(const UInt8 * filt, size_t sz) #endif for (; pos < end; ++pos) - count += *pos > 0; + count += *pos != 0; return count; } @@ -82,7 +86,7 @@ size_t countBytesInFilterWithNull(const IColumn::Filter & filt, const UInt8 * nu #endif for (; pos < end; ++pos) - count += (*pos & ~*pos2) > 0; + count += (*pos & ~*pos2) != 0; return count; } @@ -232,9 +236,10 @@ namespace while (filt_pos < filt_end_aligned) { - const auto mask = _mm_movemask_epi8(_mm_cmpgt_epi8( + UInt16 mask = _mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(filt_pos)), zero_vec)); + mask = ~mask; if (mask == 0) { diff --git a/src/Common/memcmpSmall.h b/src/Common/memcmpSmall.h index bafc08a9cbe..db8641cb44d 100644 --- a/src/Common/memcmpSmall.h +++ b/src/Common/memcmpSmall.h @@ -120,9 +120,10 @@ inline int memcmpSmallLikeZeroPaddedAllowOverflow15(const Char * a, size_t a_siz for (size_t offset = min_size; offset < max_size; offset += 16) { - uint16_t mask = _mm_movemask_epi8(_mm_cmpgt_epi8( + uint16_t mask = _mm_movemask_epi8(_mm_cmpeq_epi8( _mm_loadu_si128(reinterpret_cast(longest + offset)), zero16)); + mask = ~mask; if (mask) { diff --git a/src/Core/Block.h b/src/Core/Block.h index eef3c27363b..f588373aaed 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -163,6 +163,7 @@ private: friend class ActionsDAG; }; +using BlockPtr = std::shared_ptr; using Blocks = std::vector; using BlocksList = std::list; using BlocksPtr = std::shared_ptr; diff --git a/src/Core/MySQL/MySQLClient.cpp b/src/Core/MySQL/MySQLClient.cpp index f65fbe62274..e41b4128738 100644 --- a/src/Core/MySQL/MySQLClient.cpp +++ b/src/Core/MySQL/MySQLClient.cpp @@ -6,8 +6,10 @@ #include #include #include +#include #include + namespace DB { using namespace Generic; diff --git a/src/Core/MySQL/MySQLClient.h b/src/Core/MySQL/MySQLClient.h index 5835e980149..e503c985584 100644 --- 
a/src/Core/MySQL/MySQLClient.h +++ b/src/Core/MySQL/MySQLClient.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a0cef8867a7..e3783746eec 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -139,6 +139,7 @@ class IColumn; \ M(UInt64, min_bytes_to_use_direct_io, 0, "The minimum number of bytes for reading the data with O_DIRECT option during SELECT queries execution. 0 - disabled.", 0) \ M(UInt64, min_bytes_to_use_mmap_io, 0, "The minimum number of bytes for reading the data with mmap option during SELECT queries execution. 0 - disabled.", 0) \ + M(Bool, checksum_on_read, true, "Validate checksums on reading. It is enabled by default and should be always enabled in production. Please do not expect any benefits in disabling this setting. It may only be used for experiments and benchmarks. The setting only applicable for tables of MergeTree family. Checksums are always validated for other table engines and when receiving data over network.", 0) \ \ M(Bool, force_index_by_date, 0, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \ M(Bool, force_primary_key, 0, "Throw an exception if there is primary key in a table, and it is not used.", 0) \ diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index a433d39c561..9614c150c7d 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -885,15 +885,17 @@ MutableColumnUniquePtr DataTypeLowCardinality::createColumnUniqueImpl(const IDat if (const auto * nullable_type = typeid_cast(&keys_type)) type = nullable_type->getNestedType().get(); - if (isString(type)) + WhichDataType which(type); + + if (which.isString()) return creator(static_cast(nullptr)); - if (isFixedString(type)) + else if (which.isFixedString()) return creator(static_cast(nullptr)); - if (typeid_cast(type)) + else if (which.isDate()) 
return creator(static_cast *>(nullptr)); - if (typeid_cast(type)) + else if (which.isDateTime()) return creator(static_cast *>(nullptr)); - if (isColumnedAsNumber(type)) + else if (which.isInt() || which.isUInt() || which.isFloat()) { MutableColumnUniquePtr column; TypeListNativeNumbers::forEach(CreateColumnVector(column, *type, creator)); diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index e9f1d22325b..6290d05cc3b 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -31,6 +31,7 @@ public: bool canBeUsedInBitOperations() const override { return true; } bool canBeInsideNullable() const override { return true; } + bool canBeInsideLowCardinality() const override { return false; } bool canBePromoted() const override { return false; } }; diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 4beb2caa1f1..ad98d69fdf9 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -13,11 +13,13 @@ #include #include #include +#include +#include #include -#include "CacheDictionary.inc.h" +#include #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" - +#include namespace ProfileEvents { @@ -130,8 +132,8 @@ const IDictionarySource * CacheDictionary::getSource() const void CacheDictionary::toParent(const PaddedPODArray & ids, PaddedPODArray & out) const { const auto null_value = std::get(hierarchical_attribute->null_value); - - getItemsNumberImpl(*hierarchical_attribute, ids, out, [&](const size_t) { return null_value; }); + DictionaryDefaultValueExtractor default_value_extractor(null_value); + getItemsNumberImpl(*hierarchical_attribute, ids, out, default_value_extractor); } @@ -249,34 +251,384 @@ void CacheDictionary::isInConstantVector(const Key child_id, const PaddedPODArra out[i] = std::find(ancestors.begin(), ancestors.end(), ancestor_ids[i]) != ancestors.end(); } -void CacheDictionary::getString(const std::string & 
attribute_name, const PaddedPODArray & ids, ColumnString * out) const +ColumnPtr CacheDictionary::getColumn( + const std::string & attribute_name, + const DataTypePtr & result_type, + const Columns & key_columns, + const DataTypes &, + const ColumnPtr default_values_column) const { + ColumnPtr result; + + PaddedPODArray backup_storage; + const auto & keys = getColumnVectorData(this, key_columns.front(), backup_storage); + auto keys_size = keys.size(); + auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - const auto null_value = StringRef{std::get(attribute.null_value)}; + auto type_call = [&](const auto &dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ColumnProvider = DictionaryAttributeColumnProvider; - getItemsString(attribute, ids, out, [&](const size_t) { return null_value; }); + const auto & null_value = std::get(attribute.null_value); + DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); + + auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + + if constexpr (std::is_same_v) + { + getItemsString(attribute, keys, column.get(), default_value_extractor); + } + else + { + auto & out = column->getData(); + getItemsNumberImpl(attribute, keys, out, default_value_extractor); + } + + result = std::move(column); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + return result; } -void CacheDictionary::getString( - const std::string & attribute_name, const PaddedPODArray & ids, const ColumnString * const def, ColumnString * const out) const +template +void CacheDictionary::getItemsNumberImpl( + Attribute & attribute, + const PaddedPODArray & ids, + ResultArrayType & out, + DefaultValueExtractor & default_value_extractor) const { - 
auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + /// First fill everything with default values + const auto rows = ext::size(ids); + for (const auto row : ext::range(0, rows)) + out[row] = default_value_extractor[row]; - getItemsString(attribute, ids, out, [&](const size_t row) { return def->getDataAt(row); }); + /// Maybe there are duplicate keys, so we remember their indices. + std::unordered_map> cache_expired_or_not_found_ids; + + auto & attribute_array = std::get>(attribute.arrays); + + size_t cache_hit = 0; + size_t cache_not_found_count = 0; + size_t cache_expired_cound = 0; + + { + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + + const auto now = std::chrono::system_clock::now(); + + auto insert_to_answer_routine = [&](size_t row, size_t idx) + { + auto & cell = cells[idx]; + if (!cell.isDefault()) + out[row] = static_cast(attribute_array[idx]); + }; + + /// fetch up-to-date values, decide which ones require update + for (const auto row : ext::range(0, rows)) + { + const auto id = ids[row]; + + /** cell should be updated if either: + * 1. ids do not match, + * 2. cell has expired, + * 3. explicit defaults were specified and cell was set default. 
*/ + + const auto [cell_idx, state] = findCellIdxForGet(id, now); + + if (state == ResultState::FoundAndValid) + { + ++cache_hit; + insert_to_answer_routine(row, cell_idx); + } + else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) + { + ++cache_not_found_count; + cache_expired_or_not_found_ids[id].push_back(row); + } + else if (state == ResultState::FoundButExpired) + { + cache_expired_cound++; + cache_expired_or_not_found_ids[id].push_back(row); + + if (allow_read_expired_keys) + insert_to_answer_routine(row, cell_idx); + } + } + } + + ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_cound); + ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); + + query_count.fetch_add(rows, std::memory_order_relaxed); + hit_count.fetch_add(rows - cache_not_found_count - cache_expired_cound, std::memory_order_release); + + if (!cache_not_found_count) + { + /// Nothing to update - return + if (!cache_expired_cound) + return; + + /// Update async only if allow_read_expired_keys_is_enabledadd condvar usage and better code + if (allow_read_expired_keys) + { + std::vector required_expired_ids; + required_expired_ids.reserve(cache_expired_cound); + std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), + std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); + + /// request new values + auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); + + tryPushToUpdateQueueOrThrow(update_unit_ptr); + + /// Nothing to do - return + return; + } + } + + /// From this point we have to update all keys sync. + /// Maybe allow_read_expired_keys_from_cache_dictionary is disabled + /// and there no cache_not_found_ids but some cache_expired. 
+ + std::vector required_ids; + required_ids.reserve(cache_not_found_count + cache_expired_cound); + std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), + std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); + + /// Request new values + auto update_unit_ptr = std::make_shared(std::move(required_ids)); + + tryPushToUpdateQueueOrThrow(update_unit_ptr); + waitForCurrentUpdateFinish(update_unit_ptr); + + /// Add updated keys to answer. + + const size_t attribute_index = getAttributeIndex(attribute.name); + + for (auto & [key, value] : update_unit_ptr->found_ids) + { + if (value.found) + { + for (const size_t row : cache_expired_or_not_found_ids[key]) + out[row] = std::get(value.values[attribute_index]); + } + } } -void CacheDictionary::getString( - const std::string & attribute_name, const PaddedPODArray & ids, const String & def, ColumnString * const out) const +void CacheDictionary::getItemsString( + Attribute & attribute, + const PaddedPODArray & ids, + ColumnString * out, + DictionaryDefaultValueExtractor & default_value_extractor) const { - auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + const auto rows = ext::size(ids); - getItemsString(attribute, ids, out, [&](const size_t) { return StringRef{def}; }); + /// Save on some allocations. + out->getOffsets().reserve(rows); + + auto & attribute_array = std::get>(attribute.arrays); + + auto found_outdated_values = false; + + /// Perform optimistic version, fallback to pessimistic if failed. + { + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + + const auto now = std::chrono::system_clock::now(); + + /// Fetch up-to-date values, discard on fail. 
+ for (const auto row : ext::range(0, rows)) + { + const auto id = ids[row]; + const auto [cell_idx, state] = findCellIdxForGet(id, now); + + if (state == ResultState::FoundAndValid) + { + auto & cell = cells[cell_idx]; + const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx]; + out->insertData(string_ref.data, string_ref.size); + } + else + { + found_outdated_values = true; + break; + } + } + } + + /// Optimistic code completed successfully. + if (!found_outdated_values) + { + query_count.fetch_add(rows, std::memory_order_relaxed); + hit_count.fetch_add(rows, std::memory_order_release); + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, ids.size()); + return; + } + + /// Now onto the pessimistic one, discard possible partial results from the optimistic path. + out->getChars().resize_assume_reserved(0); + out->getOffsets().resize_assume_reserved(0); + + /// Mapping: -> { all indices `i` of `ids` such that `ids[i]` = } + std::unordered_map> cache_expired_or_not_found_ids; + /// we are going to store every string separately + std::unordered_map local_cache; + + size_t cache_not_found_count = 0; + size_t cache_expired_count = 0; + + size_t total_length = 0; + size_t cache_hit = 0; + { + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + + const auto now = std::chrono::system_clock::now(); + + auto insert_value_routine = [&](size_t row, size_t id, size_t cell_idx) + { + const auto & cell = cells[cell_idx]; + const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx]; + + /// Do not store default, but count it in total length. 
+ if (!cell.isDefault()) + local_cache[id] = String{string_ref}; + + total_length += string_ref.size + 1; + }; + + for (const auto row : ext::range(0, ids.size())) + { + const auto id = ids[row]; + const auto [cell_idx, state] = findCellIdxForGet(id, now); + + if (state == ResultState::FoundAndValid) + { + ++cache_hit; + insert_value_routine(row, id, cell_idx); + } + else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) + { + ++cache_not_found_count; + cache_expired_or_not_found_ids[id].push_back(row); + } + else if (state == ResultState::FoundButExpired) + { + ++cache_expired_count; + cache_expired_or_not_found_ids[id].push_back(row); + + if (allow_read_expired_keys) + insert_value_routine(row, id, cell_idx); + } + } + } + + ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count); + ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); + + query_count.fetch_add(rows, std::memory_order_relaxed); + hit_count.fetch_add(rows - cache_expired_count - cache_not_found_count, std::memory_order_release); + + /// Async update of expired keys. + if (!cache_not_found_count) + { + if (allow_read_expired_keys && cache_expired_count) + { + std::vector required_expired_ids; + required_expired_ids.reserve(cache_expired_count); + std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), + std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); + + auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); + + tryPushToUpdateQueueOrThrow(update_unit_ptr); + + /// Insert all found keys and defaults to output array. + out->getChars().reserve(total_length); + + for (const auto row : ext::range(0, ext::size(ids))) + { + const auto id = ids[row]; + StringRef value; + + /// Previously we stored found keys in map. 
+ const auto it = local_cache.find(id); + if (it != local_cache.end()) + value = StringRef(it->second); + else + value = default_value_extractor[row]; + + out->insertData(value.data, value.size); + } + + /// Nothing to do else. + return; + } + } + + /// We will request both cache_not_found_ids and cache_expired_ids sync. + std::vector required_ids; + required_ids.reserve(cache_not_found_count + cache_expired_count); + std::transform( + std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), + std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); + + auto update_unit_ptr = std::make_shared(std::move(required_ids)); + + tryPushToUpdateQueueOrThrow(update_unit_ptr); + waitForCurrentUpdateFinish(update_unit_ptr); + + const size_t attribute_index = getAttributeIndex(attribute.name); + + /// Only calculate the total length. + for (auto & [key, value] : update_unit_ptr->found_ids) + { + if (value.found) + { + const auto found_value_ref = std::get(value.values[attribute_index]); + total_length += (found_value_ref.size() + 1) * cache_expired_or_not_found_ids[key].size(); + } + else + { + for (const auto row : cache_expired_or_not_found_ids[key]) + total_length += default_value_extractor[row].size + 1; + } + } + + out->getChars().reserve(total_length); + + for (const auto row : ext::range(0, ext::size(ids))) + { + const auto id = ids[row]; + StringRef value; + + /// We have two maps: found in cache and found in source. + const auto local_it = local_cache.find(id); + if (local_it != local_cache.end()) + value = StringRef(local_it->second); + else + { + const auto found_it = update_unit_ptr->found_ids.find(id); + + /// Previously we didn't store defaults in local cache. 
+ if (found_it != update_unit_ptr->found_ids.end() && found_it->second.found) + value = std::get(found_it->second.values[attribute_index]); + else + value = default_value_extractor[row]; + } + + out->insertData(value.data, value.size); + } } + template struct Overloaded : Ts... {using Ts::operator()...;}; @@ -375,8 +727,14 @@ size_t CacheDictionary::findCellIdxForSet(const Key & id) const return oldest_id; } -void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray & out) const +ColumnUInt8::Ptr CacheDictionary::hasKeys(const Columns & key_columns, const DataTypes &) const { + PaddedPODArray backup_storage; + const auto& ids = getColumnVectorData(this, key_columns.front(), backup_storage); + + auto result = ColumnUInt8::create(ext::size(ids)); + auto& out = result->getData(); + /// There are three types of ids. /// - Valid ids. These ids are presented in local cache and their lifetime is not expired. /// - CacheExpired ids. Ids that are in local cache, but their values are rotted (lifetime is expired). @@ -444,7 +802,7 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray { /// Nothing to update - return; if (!cache_expired_count) - return; + return result; if (allow_read_expired_keys) { @@ -458,7 +816,7 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray tryPushToUpdateQueueOrThrow(update_unit_ptr); /// Update is async - no need to wait. 
- return; + return result; } } @@ -483,6 +841,8 @@ void CacheDictionary::has(const PaddedPODArray & ids, PaddedPODArray for (const auto row : cache_expired_or_not_found_ids[key]) out[row] = true; } + + return result; } @@ -707,7 +1067,7 @@ PaddedPODArray CacheDictionary::getCachedIds() const BlockInputStreamPtr CacheDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; + using BlockInputStreamType = DictionaryBlockInputStream; return std::make_shared(shared_from_this(), max_block_size, getCachedIds(), column_names); } diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index b9bd0b7623b..35d38f03cbe 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -21,6 +21,7 @@ #include "DictionaryStructure.h" #include "IDictionary.h" #include "IDictionarySource.h" +#include "DictionaryHelpers.h" namespace CurrentMetrics { @@ -119,77 +120,20 @@ public: std::exception_ptr getLastException() const override; + DictionaryKeyType getKeyType() const override { return DictionaryKeyType::simple; } + + ColumnPtr getColumn( + const std::string& attribute_name, + const DataTypePtr & result_type, + const Columns & key_columns, + const DataTypes & key_types, + const ColumnPtr default_values_column) const override; + + ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; + template using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; -#define DECLARE(TYPE) \ - void get##TYPE(const std::string & attribute_name, const PaddedPODArray & ids, ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void 
getString(const std::string & attribute_name, const PaddedPODArray & ids, ColumnString * out) const; - -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, \ - const PaddedPODArray & ids, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void - getString(const std::string & attribute_name, const PaddedPODArray & ids, const ColumnString * const def, ColumnString * const out) - const; - -#define DECLARE(TYPE) \ - void get##TYPE(const std::string & attribute_name, const PaddedPODArray & ids, const TYPE def, ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString(const std::string & attribute_name, const PaddedPODArray & ids, const String & def, ColumnString * const out) const; - - void has(const PaddedPODArray & ids, PaddedPODArray & out) const override; - BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; private: @@ -260,12 +204,18 @@ private: /* NOLINTNEXTLINE(readability-convert-member-functions-to-static) */ Attribute createAttributeWithTypeAndName(const AttributeUnderlyingType type, const String & name, const Field & null_value); - template + template void getItemsNumberImpl( - Attribute & attribute, const PaddedPODArray & ids, ResultArrayType & out, DefaultGetter && get_default) const; + Attribute & attribute, + const PaddedPODArray & ids, + ResultArrayType & out, + DefaultValueExtractor & default_value_extractor) const; - 
template - void getItemsString(Attribute & attribute, const PaddedPODArray & ids, ColumnString * out, DefaultGetter && get_default) const; + void getItemsString( + Attribute & attribute, + const PaddedPODArray & ids, + ColumnString * out, + DictionaryDefaultValueExtractor & default_value_extractor) const; PaddedPODArray getCachedIds() const; @@ -456,5 +406,6 @@ private: mutable std::condition_variable is_update_finished; std::atomic finished{false}; - }; +}; + } diff --git a/src/Dictionaries/CacheDictionary.inc.h b/src/Dictionaries/CacheDictionary.inc.h deleted file mode 100644 index 803b3b2566f..00000000000 --- a/src/Dictionaries/CacheDictionary.inc.h +++ /dev/null @@ -1,368 +0,0 @@ -#pragma once - -#include - -#include "CacheDictionary.h" -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace ProfileEvents -{ -extern const Event DictCacheKeysRequested; -extern const Event DictCacheKeysRequestedMiss; -extern const Event DictCacheKeysRequestedFound; -extern const Event DictCacheKeysExpired; -extern const Event DictCacheKeysNotFound; -extern const Event DictCacheKeysHit; -extern const Event DictCacheRequestTimeNs; -extern const Event DictCacheRequests; -extern const Event DictCacheLockWriteNs; -extern const Event DictCacheLockReadNs; -} - -namespace CurrentMetrics -{ -extern const Metric DictCacheRequests; -} - -namespace DB -{ -namespace ErrorCodes -{ -} - -template -void CacheDictionary::getItemsNumberImpl( - Attribute & attribute, const PaddedPODArray & ids, ResultArrayType & out, DefaultGetter && get_default) const -{ - /// First fill everything with default values - const auto rows = ext::size(ids); - for (const auto row : ext::range(0, rows)) - out[row] = get_default(row); - - /// Maybe there are duplicate keys, so we remember their indices. 
- std::unordered_map> cache_expired_or_not_found_ids; - - auto & attribute_array = std::get>(attribute.arrays); - - size_t cache_hit = 0; - size_t cache_not_found_count = 0; - size_t cache_expired_cound = 0; - - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - - auto insert_to_answer_routine = [&](size_t row, size_t idx) - { - auto & cell = cells[idx]; - if (!cell.isDefault()) - out[row] = static_cast(attribute_array[idx]); - }; - - /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows)) - { - const auto id = ids[row]; - - /** cell should be updated if either: - * 1. ids do not match, - * 2. cell has expired, - * 3. explicit defaults were specified and cell was set default. */ - - const auto [cell_idx, state] = findCellIdxForGet(id, now); - - if (state == ResultState::FoundAndValid) - { - ++cache_hit; - insert_to_answer_routine(row, cell_idx); - } - else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) - { - ++cache_not_found_count; - cache_expired_or_not_found_ids[id].push_back(row); - } - else if (state == ResultState::FoundButExpired) - { - cache_expired_cound++; - cache_expired_or_not_found_ids[id].push_back(row); - - if (allow_read_expired_keys) - insert_to_answer_routine(row, cell_idx); - } - } - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_cound); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - cache_not_found_count - cache_expired_cound, std::memory_order_release); - - if (!cache_not_found_count) - { - /// Nothing to update - return - if (!cache_expired_cound) - return; - - /// Update async only if allow_read_expired_keys_is_enabledadd 
condvar usage and better code - if (allow_read_expired_keys) - { - std::vector required_expired_ids; - required_expired_ids.reserve(cache_expired_cound); - std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); - - /// request new values - auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - - /// Nothing to do - return - return; - } - } - - /// From this point we have to update all keys sync. - /// Maybe allow_read_expired_keys_from_cache_dictionary is disabled - /// and there no cache_not_found_ids but some cache_expired. - - std::vector required_ids; - required_ids.reserve(cache_not_found_count + cache_expired_cound); - std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); - - /// Request new values - auto update_unit_ptr = std::make_shared(std::move(required_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - waitForCurrentUpdateFinish(update_unit_ptr); - - /// Add updated keys to answer. - - const size_t attribute_index = getAttributeIndex(attribute.name); - - for (auto & [key, value] : update_unit_ptr->found_ids) - { - if (value.found) - { - for (const size_t row : cache_expired_or_not_found_ids[key]) - out[row] = std::get(value.values[attribute_index]); - } - } -} - -template -void CacheDictionary::getItemsString( - Attribute & attribute, const PaddedPODArray & ids, ColumnString * out, DefaultGetter && get_default) const -{ - const auto rows = ext::size(ids); - - /// Save on some allocations. - out->getOffsets().reserve(rows); - - auto & attribute_array = std::get>(attribute.arrays); - - auto found_outdated_values = false; - - /// Perform optimistic version, fallback to pessimistic if failed. 
- { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - - /// Fetch up-to-date values, discard on fail. - for (const auto row : ext::range(0, rows)) - { - const auto id = ids[row]; - const auto [cell_idx, state] = findCellIdxForGet(id, now); - - if (state == ResultState::FoundAndValid) - { - auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx]; - out->insertData(string_ref.data, string_ref.size); - } - else - { - found_outdated_values = true; - break; - } - } - } - - /// Optimistic code completed successfully. - if (!found_outdated_values) - { - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows, std::memory_order_release); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, ids.size()); - return; - } - - /// Now onto the pessimistic one, discard possible partial results from the optimistic path. - out->getChars().resize_assume_reserved(0); - out->getOffsets().resize_assume_reserved(0); - - /// Mapping: -> { all indices `i` of `ids` such that `ids[i]` = } - std::unordered_map> cache_expired_or_not_found_ids; - /// we are going to store every string separately - std::unordered_map local_cache; - - size_t cache_not_found_count = 0; - size_t cache_expired_count = 0; - - size_t total_length = 0; - size_t cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - - auto insert_value_routine = [&](size_t row, size_t id, size_t cell_idx) - { - const auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx]; - - /// Do not store default, but count it in total length. 
- if (!cell.isDefault()) - local_cache[id] = String{string_ref}; - - total_length += string_ref.size + 1; - }; - - for (const auto row : ext::range(0, ids.size())) - { - const auto id = ids[row]; - const auto [cell_idx, state] = findCellIdxForGet(id, now); - - if (state == ResultState::FoundAndValid) - { - ++cache_hit; - insert_value_routine(row, id, cell_idx); - } - else if (state == ResultState::NotFound || state == ResultState::FoundButExpiredPermanently) - { - ++cache_not_found_count; - cache_expired_or_not_found_ids[id].push_back(row); - } - else if (state == ResultState::FoundButExpired) - { - ++cache_expired_count; - cache_expired_or_not_found_ids[id].push_back(row); - - if (allow_read_expired_keys) - insert_value_routine(row, id, cell_idx); - } - } - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found_count); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows, std::memory_order_relaxed); - hit_count.fetch_add(rows - cache_expired_count - cache_not_found_count, std::memory_order_release); - - /// Async update of expired keys. - if (!cache_not_found_count) - { - if (allow_read_expired_keys && cache_expired_count) - { - std::vector required_expired_ids; - required_expired_ids.reserve(cache_expired_count); - std::transform(std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_expired_ids), [](auto & pair) { return pair.first; }); - - auto update_unit_ptr = std::make_shared(std::move(required_expired_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - - /// Insert all found keys and defaults to output array. - out->getChars().reserve(total_length); - - for (const auto row : ext::range(0, ext::size(ids))) - { - const auto id = ids[row]; - StringRef value; - - /// Previously we stored found keys in map. 
- const auto it = local_cache.find(id); - if (it != local_cache.end()) - value = StringRef(it->second); - else - value = get_default(row); - - out->insertData(value.data, value.size); - } - - /// Nothing to do else. - return; - } - } - - /// We will request both cache_not_found_ids and cache_expired_ids sync. - std::vector required_ids; - required_ids.reserve(cache_not_found_count + cache_expired_count); - std::transform( - std::begin(cache_expired_or_not_found_ids), std::end(cache_expired_or_not_found_ids), - std::back_inserter(required_ids), [](auto & pair) { return pair.first; }); - - auto update_unit_ptr = std::make_shared(std::move(required_ids)); - - tryPushToUpdateQueueOrThrow(update_unit_ptr); - waitForCurrentUpdateFinish(update_unit_ptr); - - const size_t attribute_index = getAttributeIndex(attribute.name); - - /// Only calculate the total length. - for (auto & [key, value] : update_unit_ptr->found_ids) - { - if (value.found) - { - const auto found_value_ref = std::get(value.values[attribute_index]); - total_length += (found_value_ref.size() + 1) * cache_expired_or_not_found_ids[key].size(); - } - else - { - for (const auto row : cache_expired_or_not_found_ids[key]) - total_length += get_default(row).size + 1; - } - } - - out->getChars().reserve(total_length); - - for (const auto row : ext::range(0, ext::size(ids))) - { - const auto id = ids[row]; - StringRef value; - - /// We have two maps: found in cache and found in source. - const auto local_it = local_cache.find(id); - if (local_it != local_cache.end()) - value = StringRef(local_it->second); - else - { - const auto found_it = update_unit_ptr->found_ids.find(id); - - /// Previously we didn't store defaults in local cache. 
- if (found_it != update_unit_ptr->found_ids.end() && found_it->second.found) - value = std::get(found_it->second.values[attribute_index]); - else - value = get_default(row); - } - - out->insertData(value.data, value.size); - } -} - -} diff --git a/src/Dictionaries/CacheDictionary_generate1.cpp b/src/Dictionaries/CacheDictionary_generate1.cpp deleted file mode 100644 index 2c6742b3a8c..00000000000 --- a/src/Dictionaries/CacheDictionary_generate1.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include -#include - -namespace DB -{ -#define DEFINE(TYPE) \ - void CacheDictionary::get##TYPE(const std::string & attribute_name, const PaddedPODArray & ids, ResultArrayType & out) \ - const \ - { \ - auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ - const auto null_value = std::get(attribute.null_value); \ - getItemsNumberImpl(attribute, ids, out, [&](const size_t) { return null_value; }); \ - } - -DEFINE(UInt8) -DEFINE(UInt16) -DEFINE(UInt32) -DEFINE(UInt64) -DEFINE(UInt128) -DEFINE(Int8) -DEFINE(Int16) -DEFINE(Int32) -DEFINE(Int64) -DEFINE(Float32) -DEFINE(Float64) -DEFINE(Decimal32) -DEFINE(Decimal64) -DEFINE(Decimal128) - -#undef DEFINE -} diff --git a/src/Dictionaries/CacheDictionary_generate2.cpp b/src/Dictionaries/CacheDictionary_generate2.cpp deleted file mode 100644 index be28a6302c2..00000000000 --- a/src/Dictionaries/CacheDictionary_generate2.cpp +++ /dev/null @@ -1,34 +0,0 @@ -#include -#include - -namespace DB -{ -#define DEFINE(TYPE) \ - void CacheDictionary::get##TYPE( \ - const std::string & attribute_name, \ - const PaddedPODArray & ids, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const \ - { \ - auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ - getItemsNumberImpl(attribute, ids, out, [&](const size_t row) { return def[row]; }); \ - } - -DEFINE(UInt8) 
-DEFINE(UInt16) -DEFINE(UInt32) -DEFINE(UInt64) -DEFINE(UInt128) -DEFINE(Int8) -DEFINE(Int16) -DEFINE(Int32) -DEFINE(Int64) -DEFINE(Float32) -DEFINE(Float64) -DEFINE(Decimal32) -DEFINE(Decimal64) -DEFINE(Decimal128) - -#undef DEFINE -} diff --git a/src/Dictionaries/CacheDictionary_generate3.cpp b/src/Dictionaries/CacheDictionary_generate3.cpp deleted file mode 100644 index 36195f166db..00000000000 --- a/src/Dictionaries/CacheDictionary_generate3.cpp +++ /dev/null @@ -1,31 +0,0 @@ -#include -#include - -namespace DB -{ -#define DEFINE(TYPE) \ - void CacheDictionary::get##TYPE( \ - const std::string & attribute_name, const PaddedPODArray & ids, const TYPE def, ResultArrayType & out) const \ - { \ - auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ - getItemsNumberImpl(attribute, ids, out, [&](const size_t) { return def; }); \ - } - -DEFINE(UInt8) -DEFINE(UInt16) -DEFINE(UInt32) -DEFINE(UInt64) -DEFINE(UInt128) -DEFINE(Int8) -DEFINE(Int16) -DEFINE(Int32) -DEFINE(Int64) -DEFINE(Float32) -DEFINE(Float64) -DEFINE(Decimal32) -DEFINE(Decimal64) -DEFINE(Decimal128) - -#undef DEFINE -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary.cpp b/src/Dictionaries/ComplexKeyCacheDictionary.cpp index 0c517699272..cbb57f81793 100644 --- a/src/Dictionaries/ComplexKeyCacheDictionary.cpp +++ b/src/Dictionaries/ComplexKeyCacheDictionary.cpp @@ -10,7 +10,8 @@ #include #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" - +#include +#include namespace ProfileEvents { @@ -70,48 +71,50 @@ ComplexKeyCacheDictionary::ComplexKeyCacheDictionary( createAttributes(); } - -void ComplexKeyCacheDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const -{ - dict_struct.validateKeyTypes(key_types); - - auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, 
attribute.type, AttributeUnderlyingType::utString); - - const auto null_value = StringRef{std::get(attribute.null_values)}; - - getItemsString(attribute, key_columns, out, [&](const size_t) { return null_value; }); -} - -void ComplexKeyCacheDictionary::getString( +ColumnPtr ComplexKeyCacheDictionary::getColumn( const std::string & attribute_name, + const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnString * const def, - ColumnString * const out) const + const ColumnPtr default_values_column) const { dict_struct.validateKeyTypes(key_types); - auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); - - getItemsString(attribute, key_columns, out, [&](const size_t row) { return def->getDataAt(row); }); -} - -void ComplexKeyCacheDictionary::getString( - const std::string & attribute_name, - const Columns & key_columns, - const DataTypes & key_types, - const String & def, - ColumnString * const out) const -{ - dict_struct.validateKeyTypes(key_types); + ColumnPtr result; auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - getItemsString(attribute, key_columns, out, [&](const size_t) { return StringRef{def}; }); + auto keys_size = key_columns.front()->size(); + + auto type_call = [&](const auto &dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ColumnProvider = DictionaryAttributeColumnProvider; + + const auto & null_value = std::get(attribute.null_values); + DictionaryDefaultValueExtractor default_value_extractor(null_value, default_values_column); + + auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + + if constexpr (std::is_same_v) + { + auto * out = 
column.get(); + getItemsString(attribute, key_columns, out, default_value_extractor); + } + else + { + auto & out = column->getData(); + getItemsNumberImpl(attribute, key_columns, out, default_value_extractor); + } + + result = std::move(column); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + return result; } /// returns cell_idx (always valid for replacing), 'cell is valid' flag, 'cell is outdated' flag, @@ -158,15 +161,21 @@ ComplexKeyCacheDictionary::findCellIdx(const StringRef & key, const CellMetadata return {oldest_id, false, false}; } -void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const +ColumnUInt8::Ptr ComplexKeyCacheDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const { dict_struct.validateKeyTypes(key_types); + const auto rows_num = key_columns.front()->size(); + + auto result = ColumnUInt8::create(rows_num); + auto& out = result->getData(); + + for (const auto row : ext::range(0, rows_num)) + out[row] = false; + /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } MapType> outdated_keys; - - const auto rows_num = key_columns.front()->size(); const auto keys_size = dict_struct.key->size(); StringRefs keys(keys_size); Arena temporary_keys_pool; @@ -212,7 +221,7 @@ void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); if (outdated_keys.empty()) - return; + return result; std::vector required_rows(outdated_keys.size()); std::transform( @@ -233,8 +242,395 @@ void ComplexKeyCacheDictionary::has(const Columns & key_columns, const DataTypes for (const auto out_idx : outdated_keys[key]) out[out_idx] = false; }); + + return result; } + +template +void ComplexKeyCacheDictionary::getItemsNumberImpl( + Attribute & attribute, + const Columns & key_columns, + PaddedPODArray & out, + DefaultValueExtractor & 
default_value_extractor) const +{ + /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } + MapType> outdated_keys; + auto & attribute_array = std::get>(attribute.arrays); + + const auto rows_num = key_columns.front()->size(); + const auto keys_size = dict_struct.key->size(); + StringRefs keys(keys_size); + Arena temporary_keys_pool; + PODArray keys_array(rows_num); + + size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; + { + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + + const auto now = std::chrono::system_clock::now(); + /// fetch up-to-date values, decide which ones require update + for (const auto row : ext::range(0, rows_num)) + { + const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); + keys_array[row] = key; + const auto find_result = findCellIdx(key, now); + + /** cell should be updated if either: + * 1. keys (or hash) do not match, + * 2. cell has expired, + * 3. explicit defaults were specified and cell was set default. */ + + if (!find_result.valid) + { + outdated_keys[key].push_back(row); + if (find_result.outdated) + ++cache_expired; + else + ++cache_not_found; + } + else + { + ++cache_hit; + const auto & cell_idx = find_result.cell_idx; + const auto & cell = cells[cell_idx]; + out[row] = cell.isDefault() ? 
default_value_extractor[row] : static_cast(attribute_array[cell_idx]); + } + } + } + ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); + ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); + + if (outdated_keys.empty()) + return; + + std::vector required_rows(outdated_keys.size()); + std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) + { + return pair.getMapped().front(); + }); + + /// request new values + update( + key_columns, + keys_array, + required_rows, + [&](const StringRef key, const size_t cell_idx) + { + for (const auto row : outdated_keys[key]) + out[row] = static_cast(attribute_array[cell_idx]); + }, + [&](const StringRef key, const size_t) + { + for (const auto row : outdated_keys[key]) + out[row] = default_value_extractor[row]; + }); +} + +void ComplexKeyCacheDictionary::getItemsString( + Attribute & attribute, + const Columns & key_columns, + ColumnString * out, + DictionaryDefaultValueExtractor & default_value_extractor) const +{ + const auto rows_num = key_columns.front()->size(); + /// save on some allocations + out->getOffsets().reserve(rows_num); + + const auto keys_size = dict_struct.key->size(); + StringRefs keys(keys_size); + Arena temporary_keys_pool; + + auto & attribute_array = std::get>(attribute.arrays); + + auto found_outdated_values = false; + + /// perform optimistic version, fallback to pessimistic if failed + { + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + + const auto now = std::chrono::system_clock::now(); + /// fetch up-to-date values, discard on fail + for (const auto row : ext::range(0, rows_num)) + { + const StringRef key = placeKeysInPool(row, 
key_columns, keys, *dict_struct.key, temporary_keys_pool); + SCOPE_EXIT(temporary_keys_pool.rollback(key.size)); + const auto find_result = findCellIdx(key, now); + + if (!find_result.valid) + { + found_outdated_values = true; + break; + } + else + { + const auto & cell_idx = find_result.cell_idx; + const auto & cell = cells[cell_idx]; + const auto string_ref = cell.isDefault() ? default_value_extractor[row] : attribute_array[cell_idx]; + out->insertData(string_ref.data, string_ref.size); + } + } + } + + /// optimistic code completed successfully + if (!found_outdated_values) + { + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num, std::memory_order_release); + return; + } + + /// now onto the pessimistic one, discard possible partial results from the optimistic path + out->getChars().resize_assume_reserved(0); + out->getOffsets().resize_assume_reserved(0); + + /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } + MapType> outdated_keys; + /// we are going to store every string separately + MapType map; + PODArray keys_array(rows_num); + + size_t total_length = 0; + size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; + { + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; + + const auto now = std::chrono::system_clock::now(); + for (const auto row : ext::range(0, rows_num)) + { + const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); + keys_array[row] = key; + const auto find_result = findCellIdx(key, now); + + if (!find_result.valid) + { + outdated_keys[key].push_back(row); + if (find_result.outdated) + ++cache_expired; + else + ++cache_not_found; + } + else + { + ++cache_hit; + const auto & cell_idx = find_result.cell_idx; + const auto & cell = cells[cell_idx]; + const auto string_ref = cell.isDefault() ? 
default_value_extractor[row] : attribute_array[cell_idx]; + + if (!cell.isDefault()) + map[key] = copyIntoArena(string_ref, temporary_keys_pool); + + total_length += string_ref.size + 1; + } + } + } + ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); + ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); + ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); + + query_count.fetch_add(rows_num, std::memory_order_relaxed); + hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); + + /// request new values + if (!outdated_keys.empty()) + { + std::vector required_rows(outdated_keys.size()); + std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) + { + return pair.getMapped().front(); + }); + + update( + key_columns, + keys_array, + required_rows, + [&](const StringRef key, const size_t cell_idx) + { + const StringRef attribute_value = attribute_array[cell_idx]; + + /// We must copy key and value to own memory, because it may be replaced with another + /// in next iterations of inner loop of update. + const StringRef copied_key = copyIntoArena(key, temporary_keys_pool); + const StringRef copied_value = copyIntoArena(attribute_value, temporary_keys_pool); + + map[copied_key] = copied_value; + total_length += (attribute_value.size + 1) * outdated_keys[key].size(); + }, + [&](const StringRef key, const size_t) + { + for (const auto row : outdated_keys[key]) + total_length += default_value_extractor[row].size + 1; + }); + } + + out->getChars().reserve(total_length); + + for (const auto row : ext::range(0, ext::size(keys_array))) + { + const StringRef key = keys_array[row]; + auto * const it = map.find(key); + const auto string_ref = it ? 
it->getMapped() : default_value_extractor[row]; + out->insertData(string_ref.data, string_ref.size); + } +} + +template +void ComplexKeyCacheDictionary::update( + const Columns & in_key_columns, + const PODArray & in_keys, + const std::vector & in_requested_rows, + PresentKeyHandler && on_cell_updated, + AbsentKeyHandler && on_key_not_found) const +{ + MapType remaining_keys{in_requested_rows.size()}; + for (const auto row : in_requested_rows) + remaining_keys.insert({in_keys[row], false}); + + std::uniform_int_distribution distribution(dict_lifetime.min_sec, dict_lifetime.max_sec); + + const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + { + Stopwatch watch; + auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows); + stream->readPrefix(); + + const auto keys_size = dict_struct.key->size(); + StringRefs keys(keys_size); + + const auto attributes_size = attributes.size(); + const auto now = std::chrono::system_clock::now(); + + while (const auto block = stream->read()) + { + /// cache column pointers + const auto key_columns = ext::map( + ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); + + const auto attribute_columns = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) + { + return block.safeGetByPosition(keys_size + attribute_idx).column; + }); + + const auto rows_num = block.rows(); + + for (const auto row : ext::range(0, rows_num)) + { + auto key = allocKey(row, key_columns, keys); + const auto hash = StringRefHash{}(key); + const auto find_result = findCellIdx(key, now, hash); + const auto & cell_idx = find_result.cell_idx; + auto & cell = cells[cell_idx]; + + for (const auto attribute_idx : ext::range(0, attributes.size())) + { + const auto & attribute_column = *attribute_columns[attribute_idx]; + auto & attribute = attributes[attribute_idx]; + + setAttributeValue(attribute, cell_idx, attribute_column[row]); + } + + 
/// if cell id is zero and zero does not map to this cell, then the cell is unused + if (cell.key == StringRef{} && cell_idx != zero_cell_idx) + element_count.fetch_add(1, std::memory_order_relaxed); + + /// handle memory allocated for old key + if (key == cell.key) + { + freeKey(key); + key = cell.key; + } + else + { + /// new key is different from the old one + if (cell.key.data) + freeKey(cell.key); + + cell.key = key; + } + + cell.hash = hash; + + if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) + cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}); + else + cell.setExpiresAt(std::chrono::time_point::max()); + + /// inform caller + on_cell_updated(key, cell_idx); + /// mark corresponding id as found + remaining_keys[key] = true; + } + } + + stream->readSuffix(); + + ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, in_requested_rows.size()); + ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); + } + + size_t found_num = 0; + size_t not_found_num = 0; + + const auto now = std::chrono::system_clock::now(); + + /// Check which ids have not been found and require setting null_value + for (const auto & key_found_pair : remaining_keys) + { + if (key_found_pair.getMapped()) + { + ++found_num; + continue; + } + + ++not_found_num; + + auto key = key_found_pair.getKey(); + const auto hash = StringRefHash{}(key); + const auto find_result = findCellIdx(key, now, hash); + const auto & cell_idx = find_result.cell_idx; + auto & cell = cells[cell_idx]; + + /// Set null_value for each attribute + for (auto & attribute : attributes) + setDefaultAttributeValue(attribute, cell_idx); + + /// Check if cell had not been occupied before and increment element counter if it hadn't + if (cell.key == StringRef{} && cell_idx != zero_cell_idx) + element_count.fetch_add(1, std::memory_order_relaxed); + + if (key == cell.key) + key = cell.key; + else + { + if (cell.key.data) + 
freeKey(cell.key); + + /// copy key from temporary pool + key = copyKey(key); + cell.key = key; + } + + cell.hash = hash; + + if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) + cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}); + else + cell.setExpiresAt(std::chrono::time_point::max()); + + cell.setDefault(); + + /// inform caller that the cell has not been found + on_key_not_found(key, cell_idx); + } + + ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); + ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); +} + + void ComplexKeyCacheDictionary::createAttributes() { const auto attributes_size = dict_struct.attributes.size(); @@ -263,6 +659,102 @@ ComplexKeyCacheDictionary::Attribute & ComplexKeyCacheDictionary::getAttribute(c return attributes[it->second]; } +void ComplexKeyCacheDictionary::setDefaultAttributeValue(Attribute & attribute, const size_t idx) const +{ + auto type_call = [&](const auto &dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + + if constexpr (std::is_same_v) + { + const auto & null_value_ref = std::get(attribute.null_values); + auto & string_ref = std::get>(attribute.arrays)[idx]; + + if (string_ref.data != null_value_ref.data()) + { + if (string_ref.data) + string_arena->free(const_cast(string_ref.data), string_ref.size); + + string_ref = StringRef{null_value_ref}; + } + } + else + { + std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); +} + +ComplexKeyCacheDictionary::Attribute +ComplexKeyCacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) +{ + Attribute attr{type, {}, {}}; + + auto type_call = [&](const auto &dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename 
Type::AttributeType; + + if constexpr (std::is_same_v) + { + attr.null_values = null_value.get(); + attr.arrays = std::make_unique>(size); + bytes_allocated += size * sizeof(StringRef); + if (!string_arena) + string_arena = std::make_unique(); + } + else + { + attr.null_values = AttributeType(null_value.get>()); /* NOLINT */ + attr.arrays = std::make_unique>(size); /* NOLINT */ + bytes_allocated += size * sizeof(AttributeType); + } + }; + + callOnDictionaryAttributeType(type, type_call); + + return attr; +} + +void ComplexKeyCacheDictionary::setAttributeValue(Attribute & attribute, const size_t idx, const Field & value) const +{ + auto type_call = [&](const auto &dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + + if constexpr (std::is_same_v) + { + const auto & string = value.get(); + auto & string_ref = std::get>(attribute.arrays)[idx]; + const auto & null_value_ref = std::get(attribute.null_values); + + /// free memory unless it points to a null_value + if (string_ref.data && string_ref.data != null_value_ref.data()) + string_arena->free(const_cast(string_ref.data), string_ref.size); + + const auto str_size = string.size(); + if (str_size != 0) + { + auto * str_ptr = string_arena->alloc(str_size); + std::copy(string.data(), string.data() + str_size, str_ptr); + string_ref = StringRef{str_ptr, str_size}; + } + else + string_ref = {}; + } + else + { + std::get>(attribute.arrays)[idx] = value.get>(); + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); +} + StringRef ComplexKeyCacheDictionary::allocKey(const size_t row, const Columns & key_columns, StringRefs & keys) const { if (key_size_is_fixed) @@ -388,7 +880,7 @@ BlockInputStreamPtr ComplexKeyCacheDictionary::getBlockInputStream(const Names & keys.push_back(cells[idx].key); } - using BlockInputStreamType = DictionaryBlockInputStream; + using BlockInputStreamType = DictionaryBlockInputStream; return 
std::make_shared(shared_from_this(), max_block_size, keys, column_names); } diff --git a/src/Dictionaries/ComplexKeyCacheDictionary.h b/src/Dictionaries/ComplexKeyCacheDictionary.h index 2663fee266d..f5643fc799c 100644 --- a/src/Dictionaries/ComplexKeyCacheDictionary.h +++ b/src/Dictionaries/ComplexKeyCacheDictionary.h @@ -23,7 +23,7 @@ #include "IDictionary.h" #include "IDictionarySource.h" #include - +#include "DictionaryHelpers.h" namespace ProfileEvents { @@ -89,93 +89,16 @@ public: return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; } - template - using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; + DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } -/// In all functions below, key_columns must be full (non-constant) columns. -/// See the requirement in IDataType.h for text-serialization functions. -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString(const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const; - -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE 
- - void getString( - const std::string & attribute_name, + ColumnPtr getColumn( + const std::string& attribute_name, + const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnString * const def, - ColumnString * const out) const; + const ColumnPtr default_values_column) const override; -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const TYPE def, \ - ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString( - const std::string & attribute_name, - const Columns & key_columns, - const DataTypes & key_types, - const String & def, - ColumnString * const out) const; - - void has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const; + ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; @@ -252,227 +175,18 @@ private: Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value); - template + template void getItemsNumberImpl( - Attribute & attribute, const Columns & key_columns, PaddedPODArray & out, DefaultGetter && get_default) const - { - /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } - MapType> outdated_keys; - auto & attribute_array = std::get>(attribute.arrays); + Attribute & attribute, + const Columns & key_columns, + PaddedPODArray & out, + DefaultValueExtractor & default_value_extractor) const; - const auto rows_num = key_columns.front()->size(); - const auto keys_size = dict_struct.key->size(); 
- StringRefs keys(keys_size); - Arena temporary_keys_pool; - PODArray keys_array(rows_num); - - size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - /// fetch up-to-date values, decide which ones require update - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - keys_array[row] = key; - const auto find_result = findCellIdx(key, now); - - /** cell should be updated if either: - * 1. keys (or hash) do not match, - * 2. cell has expired, - * 3. explicit defaults were specified and cell was set default. */ - - if (!find_result.valid) - { - outdated_keys[key].push_back(row); - if (find_result.outdated) - ++cache_expired; - else - ++cache_not_found; - } - else - { - ++cache_hit; - const auto & cell_idx = find_result.cell_idx; - const auto & cell = cells[cell_idx]; - out[row] = cell.isDefault() ? 
get_default(row) : static_cast(attribute_array[cell_idx]); - } - } - } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); - - if (outdated_keys.empty()) - return; - - std::vector required_rows(outdated_keys.size()); - std::transform( - std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) { return pair.getMapped().front(); }); - - /// request new values - update( - key_columns, - keys_array, - required_rows, - [&](const StringRef key, const size_t cell_idx) - { - for (const auto row : outdated_keys[key]) - out[row] = static_cast(attribute_array[cell_idx]); - }, - [&](const StringRef key, const size_t) - { - for (const auto row : outdated_keys[key]) - out[row] = get_default(row); - }); - } - - template - void getItemsString(Attribute & attribute, const Columns & key_columns, ColumnString * out, DefaultGetter && get_default) const - { - const auto rows_num = key_columns.front()->size(); - /// save on some allocations - out->getOffsets().reserve(rows_num); - - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - Arena temporary_keys_pool; - - auto & attribute_array = std::get>(attribute.arrays); - - auto found_outdated_values = false; - - /// perform optimistic version, fallback to pessimistic if failed - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - /// fetch up-to-date values, discard on fail - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - 
SCOPE_EXIT(temporary_keys_pool.rollback(key.size)); - const auto find_result = findCellIdx(key, now); - - if (!find_result.valid) - { - found_outdated_values = true; - break; - } - else - { - const auto & cell_idx = find_result.cell_idx; - const auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? get_default(row) : attribute_array[cell_idx]; - out->insertData(string_ref.data, string_ref.size); - } - } - } - - /// optimistic code completed successfully - if (!found_outdated_values) - { - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num, std::memory_order_release); - return; - } - - /// now onto the pessimistic one, discard possible partial results from the optimistic path - out->getChars().resize_assume_reserved(0); - out->getOffsets().resize_assume_reserved(0); - - /// Mapping: -> { all indices `i` of `key_columns` such that `key_columns[i]` = } - MapType> outdated_keys; - /// we are going to store every string separately - MapType map; - PODArray keys_array(rows_num); - - size_t total_length = 0; - size_t cache_expired = 0, cache_not_found = 0, cache_hit = 0; - { - const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - - const auto now = std::chrono::system_clock::now(); - for (const auto row : ext::range(0, rows_num)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys, *dict_struct.key, temporary_keys_pool); - keys_array[row] = key; - const auto find_result = findCellIdx(key, now); - - if (!find_result.valid) - { - outdated_keys[key].push_back(row); - if (find_result.outdated) - ++cache_expired; - else - ++cache_not_found; - } - else - { - ++cache_hit; - const auto & cell_idx = find_result.cell_idx; - const auto & cell = cells[cell_idx]; - const auto string_ref = cell.isDefault() ? 
get_default(row) : attribute_array[cell_idx]; - - if (!cell.isDefault()) - map[key] = copyIntoArena(string_ref, temporary_keys_pool); - - total_length += string_ref.size + 1; - } - } - } - ProfileEvents::increment(ProfileEvents::DictCacheKeysExpired, cache_expired); - ProfileEvents::increment(ProfileEvents::DictCacheKeysNotFound, cache_not_found); - ProfileEvents::increment(ProfileEvents::DictCacheKeysHit, cache_hit); - - query_count.fetch_add(rows_num, std::memory_order_relaxed); - hit_count.fetch_add(rows_num - outdated_keys.size(), std::memory_order_release); - - /// request new values - if (!outdated_keys.empty()) - { - std::vector required_rows(outdated_keys.size()); - std::transform(std::begin(outdated_keys), std::end(outdated_keys), std::begin(required_rows), [](auto & pair) - { - return pair.getMapped().front(); - }); - - update( - key_columns, - keys_array, - required_rows, - [&](const StringRef key, const size_t cell_idx) - { - const StringRef attribute_value = attribute_array[cell_idx]; - - /// We must copy key and value to own memory, because it may be replaced with another - /// in next iterations of inner loop of update. - const StringRef copied_key = copyIntoArena(key, temporary_keys_pool); - const StringRef copied_value = copyIntoArena(attribute_value, temporary_keys_pool); - - map[copied_key] = copied_value; - total_length += (attribute_value.size + 1) * outdated_keys[key].size(); - }, - [&](const StringRef key, const size_t) - { - for (const auto row : outdated_keys[key]) - total_length += get_default(row).size + 1; - }); - } - - out->getChars().reserve(total_length); - - for (const auto row : ext::range(0, ext::size(keys_array))) - { - const StringRef key = keys_array[row]; - const auto it = map.find(key); - const auto string_ref = it ? 
it->getMapped() : get_default(row); - out->insertData(string_ref.data, string_ref.size); - } - } + void getItemsString( + Attribute & attribute, + const Columns & key_columns, + ColumnString * out, + DictionaryDefaultValueExtractor & default_value_extractor) const; template void update( @@ -480,152 +194,7 @@ private: const PODArray & in_keys, const std::vector & in_requested_rows, PresentKeyHandler && on_cell_updated, - AbsentKeyHandler && on_key_not_found) const - { - MapType remaining_keys{in_requested_rows.size()}; - for (const auto row : in_requested_rows) - remaining_keys.insert({in_keys[row], false}); - - std::uniform_int_distribution distribution(dict_lifetime.min_sec, dict_lifetime.max_sec); - - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - { - Stopwatch watch; - auto stream = source_ptr->loadKeys(in_key_columns, in_requested_rows); - stream->readPrefix(); - - const auto keys_size = dict_struct.key->size(); - StringRefs keys(keys_size); - - const auto attributes_size = attributes.size(); - const auto now = std::chrono::system_clock::now(); - - while (const auto block = stream->read()) - { - /// cache column pointers - const auto key_columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_columns = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - - const auto rows_num = block.rows(); - - for (const auto row : ext::range(0, rows_num)) - { - auto key = allocKey(row, key_columns, keys); - const auto hash = StringRefHash{}(key); - const auto find_result = findCellIdx(key, now, hash); - const auto & cell_idx = find_result.cell_idx; - auto & cell = cells[cell_idx]; - - for (const auto attribute_idx : ext::range(0, attributes.size())) - { - const auto & attribute_column = *attribute_columns[attribute_idx]; - 
auto & attribute = attributes[attribute_idx]; - - setAttributeValue(attribute, cell_idx, attribute_column[row]); - } - - /// if cell id is zero and zero does not map to this cell, then the cell is unused - if (cell.key == StringRef{} && cell_idx != zero_cell_idx) - element_count.fetch_add(1, std::memory_order_relaxed); - - /// handle memory allocated for old key - if (key == cell.key) - { - freeKey(key); - key = cell.key; - } - else - { - /// new key is different from the old one - if (cell.key.data) - freeKey(cell.key); - - cell.key = key; - } - - cell.hash = hash; - - if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) - cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}); - else - cell.setExpiresAt(std::chrono::time_point::max()); - - /// inform caller - on_cell_updated(key, cell_idx); - /// mark corresponding id as found - remaining_keys[key] = true; - } - } - - stream->readSuffix(); - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequested, in_requested_rows.size()); - ProfileEvents::increment(ProfileEvents::DictCacheRequestTimeNs, watch.elapsed()); - } - - size_t found_num = 0; - size_t not_found_num = 0; - - const auto now = std::chrono::system_clock::now(); - - /// Check which ids have not been found and require setting null_value - for (const auto & key_found_pair : remaining_keys) - { - if (key_found_pair.getMapped()) - { - ++found_num; - continue; - } - - ++not_found_num; - - auto key = key_found_pair.getKey(); - const auto hash = StringRefHash{}(key); - const auto find_result = findCellIdx(key, now, hash); - const auto & cell_idx = find_result.cell_idx; - auto & cell = cells[cell_idx]; - - /// Set null_value for each attribute - for (auto & attribute : attributes) - setDefaultAttributeValue(attribute, cell_idx); - - /// Check if cell had not been occupied before and increment element counter if it hadn't - if (cell.key == StringRef{} && cell_idx != zero_cell_idx) - 
element_count.fetch_add(1, std::memory_order_relaxed); - - if (key == cell.key) - key = cell.key; - else - { - if (cell.key.data) - freeKey(cell.key); - - /// copy key from temporary pool - key = copyKey(key); - cell.key = key; - } - - cell.hash = hash; - - if (dict_lifetime.min_sec != 0 && dict_lifetime.max_sec != 0) - cell.setExpiresAt(std::chrono::system_clock::now() + std::chrono::seconds{distribution(rnd_engine)}); - else - cell.setExpiresAt(std::chrono::time_point::max()); - - cell.setDefault(); - - /// inform caller that the cell has not been found - on_key_not_found(key, cell_idx); - } - - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedFound, found_num); - ProfileEvents::increment(ProfileEvents::DictCacheKeysRequestedMiss, not_found_num); - } + AbsentKeyHandler && on_key_not_found) const; UInt64 getCellIdx(const StringRef key) const; diff --git a/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp b/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp deleted file mode 100644 index ba9f8d014fd..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary_createAttributeWithType.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "ComplexKeyCacheDictionary.h" - -namespace DB -{ -ComplexKeyCacheDictionary::Attribute -ComplexKeyCacheDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) -{ - Attribute attr{type, {}, {}}; - - switch (type) - { -#define DISPATCH(TYPE) \ - case AttributeUnderlyingType::ut##TYPE: \ - attr.null_values = TYPE(null_value.get>()); /* NOLINT */ \ - attr.arrays = std::make_unique>(size); /* NOLINT */ \ - bytes_allocated += size * sizeof(TYPE); \ - break; - DISPATCH(UInt8) - DISPATCH(UInt16) - DISPATCH(UInt32) - DISPATCH(UInt64) - DISPATCH(UInt128) - DISPATCH(Int8) - DISPATCH(Int16) - DISPATCH(Int32) - DISPATCH(Int64) - DISPATCH(Decimal32) - DISPATCH(Decimal64) - DISPATCH(Decimal128) - DISPATCH(Float32) - DISPATCH(Float64) -#undef DISPATCH - case 
AttributeUnderlyingType::utString: - attr.null_values = null_value.get(); - attr.arrays = std::make_unique>(size); - bytes_allocated += size * sizeof(StringRef); - if (!string_arena) - string_arena = std::make_unique(); - break; - } - - return attr; -} - -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary_generate1.cpp b/src/Dictionaries/ComplexKeyCacheDictionary_generate1.cpp deleted file mode 100644 index 01d39722d33..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary_generate1.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include - -namespace DB -{ -#define DEFINE(TYPE) \ - void ComplexKeyCacheDictionary::get##TYPE( \ - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ - auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ - const auto null_value = std::get(attribute.null_values); \ - getItemsNumberImpl(attribute, key_columns, out, [&](const size_t) { return null_value; }); \ - } - -DEFINE(UInt8) -DEFINE(UInt16) -DEFINE(UInt32) -DEFINE(UInt64) -DEFINE(UInt128) -DEFINE(Int8) -DEFINE(Int16) -DEFINE(Int32) -DEFINE(Int64) -DEFINE(Float32) -DEFINE(Float64) -DEFINE(Decimal32) -DEFINE(Decimal64) -DEFINE(Decimal128) - -#undef DEFINE -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary_generate2.cpp b/src/Dictionaries/ComplexKeyCacheDictionary_generate2.cpp deleted file mode 100644 index deb34706f54..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary_generate2.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include - -namespace DB -{ -#define DEFINE(TYPE) \ - void ComplexKeyCacheDictionary::get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ - auto & attribute = 
getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ - getItemsNumberImpl(attribute, key_columns, out, [&](const size_t row) { return def[row]; }); \ - } - -DEFINE(UInt8) -DEFINE(UInt16) -DEFINE(UInt32) -DEFINE(UInt64) -DEFINE(UInt128) -DEFINE(Int8) -DEFINE(Int16) -DEFINE(Int32) -DEFINE(Int64) -DEFINE(Float32) -DEFINE(Float64) -DEFINE(Decimal32) -DEFINE(Decimal64) -DEFINE(Decimal128) - -#undef DEFINE -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary_generate3.cpp b/src/Dictionaries/ComplexKeyCacheDictionary_generate3.cpp deleted file mode 100644 index 2a84fdc89f6..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary_generate3.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include - -namespace DB -{ -#define DEFINE(TYPE) \ - void ComplexKeyCacheDictionary::get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const TYPE def, \ - ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ - auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ - getItemsNumberImpl(attribute, key_columns, out, [&](const size_t) { return def; }); \ - } - -DEFINE(UInt8) -DEFINE(UInt16) -DEFINE(UInt32) -DEFINE(UInt64) -DEFINE(UInt128) -DEFINE(Int8) -DEFINE(Int16) -DEFINE(Int32) -DEFINE(Int64) -DEFINE(Float32) -DEFINE(Float64) -DEFINE(Decimal32) -DEFINE(Decimal64) -DEFINE(Decimal128) - -#undef DEFINE -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp b/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp deleted file mode 100644 index 2df8f95bc0c..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary_setAttributeValue.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "ComplexKeyCacheDictionary.h" - -namespace DB -{ -void ComplexKeyCacheDictionary::setAttributeValue(Attribute & 
attribute, const size_t idx, const Field & value) const -{ - switch (attribute.type) - { - case AttributeUnderlyingType::utUInt8: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt16: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utUInt128: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt8: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt16: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utInt64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utFloat32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utFloat64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - - case AttributeUnderlyingType::utDecimal32: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utDecimal64: - std::get>(attribute.arrays)[idx] = value.get(); - break; - case AttributeUnderlyingType::utDecimal128: - std::get>(attribute.arrays)[idx] = value.get(); - break; - - case AttributeUnderlyingType::utString: - { - const auto & string = value.get(); - auto & string_ref = std::get>(attribute.arrays)[idx]; - const auto & null_value_ref = std::get(attribute.null_values); - - /// free memory unless it points to a null_value - if (string_ref.data && string_ref.data != null_value_ref.data()) - string_arena->free(const_cast(string_ref.data), string_ref.size); - - const auto str_size = string.size(); - if (str_size != 0) - { - auto * str_ptr = 
string_arena->alloc(str_size); - std::copy(string.data(), string.data() + str_size, str_ptr); - string_ref = StringRef{str_ptr, str_size}; - } - else - string_ref = {}; - - break; - } - } -} - -} diff --git a/src/Dictionaries/ComplexKeyCacheDictionary_setDefaultAttributeValue.cpp b/src/Dictionaries/ComplexKeyCacheDictionary_setDefaultAttributeValue.cpp deleted file mode 100644 index aa03cc88038..00000000000 --- a/src/Dictionaries/ComplexKeyCacheDictionary_setDefaultAttributeValue.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include "ComplexKeyCacheDictionary.h" - -namespace DB -{ -void ComplexKeyCacheDictionary::setDefaultAttributeValue(Attribute & attribute, const size_t idx) const -{ - switch (attribute.type) - { - case AttributeUnderlyingType::utUInt8: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utUInt16: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utUInt32: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utUInt64: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utUInt128: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utInt8: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utInt16: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utInt32: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utInt64: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utFloat32: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utFloat64: - 
std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - - case AttributeUnderlyingType::utDecimal32: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utDecimal64: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - case AttributeUnderlyingType::utDecimal128: - std::get>(attribute.arrays)[idx] = std::get(attribute.null_values); - break; - - case AttributeUnderlyingType::utString: - { - const auto & null_value_ref = std::get(attribute.null_values); - auto & string_ref = std::get>(attribute.arrays)[idx]; - - if (string_ref.data != null_value_ref.data()) - { - if (string_ref.data) - string_arena->free(const_cast(string_ref.data), string_ref.size); - - string_ref = StringRef{null_value_ref}; - } - - break; - } - } -} - -} diff --git a/src/Dictionaries/ComplexKeyDirectDictionary.cpp b/src/Dictionaries/ComplexKeyDirectDictionary.cpp index c4b8678672c..391b5c47980 100644 --- a/src/Dictionaries/ComplexKeyDirectDictionary.cpp +++ b/src/Dictionaries/ComplexKeyDirectDictionary.cpp @@ -3,6 +3,9 @@ #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" #include +#include +#include +#include namespace DB { @@ -31,194 +34,151 @@ ComplexKeyDirectDictionary::ComplexKeyDirectDictionary( createAttributes(); } -#define DECLARE(TYPE) \ - void ComplexKeyDirectDictionary::get##TYPE(const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ - const auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ -\ - const auto null_value = std::get(attribute.null_values); \ -\ - getItemsImpl( \ - attribute, key_columns, [&](const size_t row, const auto value) { out[row] = value; }, [&](const size_t) { return null_value; }); \ - } -DECLARE(UInt8) -DECLARE(UInt16) 
-DECLARE(UInt32) -DECLARE(UInt64) -DECLARE(UInt128) -DECLARE(Int8) -DECLARE(Int16) -DECLARE(Int32) -DECLARE(Int64) -DECLARE(Float32) -DECLARE(Float64) -DECLARE(Decimal32) -DECLARE(Decimal64) -DECLARE(Decimal128) -#undef DECLARE - -void ComplexKeyDirectDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const +ColumnPtr ComplexKeyDirectDictionary::getColumn( + const std::string & attribute_name, + const DataTypePtr & result_type, + const Columns & key_columns, + const DataTypes & key_types, + const ColumnPtr default_values_column) const { dict_struct.validateKeyTypes(key_types); - const auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); - const auto & null_value = std::get(attribute.null_values); - getItemsStringImpl( - attribute, - key_columns, - [&](const size_t, const String value) { const auto ref = StringRef{value}; out->insertData(ref.data, ref.size); }, - [&](const size_t) { return String(null_value.data, null_value.size); }); -} - -#define DECLARE(TYPE) \ - void ComplexKeyDirectDictionary::get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ - const auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ -\ - getItemsImpl( \ - attribute, key_columns, [&](const size_t row, const auto value) { out[row] = value; }, [&](const size_t row) { return def[row]; }); \ - } -DECLARE(UInt8) -DECLARE(UInt16) -DECLARE(UInt32) -DECLARE(UInt64) -DECLARE(UInt128) -DECLARE(Int8) -DECLARE(Int16) -DECLARE(Int32) -DECLARE(Int64) -DECLARE(Float32) -DECLARE(Float64) -DECLARE(Decimal32) -DECLARE(Decimal64) -DECLARE(Decimal128) -#undef 
DECLARE - -void ComplexKeyDirectDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, const ColumnString * const def, ColumnString * const out) const -{ - dict_struct.validateKeyTypes(key_types); + ColumnPtr result; const auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - getItemsStringImpl( - attribute, - key_columns, - [&](const size_t, const String value) { const auto ref = StringRef{value}; out->insertData(ref.data, ref.size); }, - [&](const size_t row) { const auto ref = def->getDataAt(row); return String(ref.data, ref.size); }); -} + auto keys_size = key_columns.front()->size(); -#define DECLARE(TYPE) \ - void ComplexKeyDirectDictionary::get##TYPE( \ - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, const TYPE def, ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ - const auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ -\ - getItemsImpl( \ - attribute, key_columns, [&](const size_t row, const auto value) { out[row] = value; }, [&](const size_t) { return def; }); \ - } -DECLARE(UInt8) -DECLARE(UInt16) -DECLARE(UInt32) -DECLARE(UInt64) -DECLARE(UInt128) -DECLARE(Int8) -DECLARE(Int16) -DECLARE(Int32) -DECLARE(Int64) -DECLARE(Float32) -DECLARE(Float64) -DECLARE(Decimal32) -DECLARE(Decimal64) -DECLARE(Decimal128) -#undef DECLARE - -void ComplexKeyDirectDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, const String & def, ColumnString * const out) const -{ - dict_struct.validateKeyTypes(key_types); - - const auto & attribute = getAttribute(attribute_name); - 
checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); - - ComplexKeyDirectDictionary::getItemsStringImpl( - attribute, - key_columns, - [&](const size_t, const String value) { const auto ref = StringRef{value}; out->insertData(ref.data, ref.size); }, - [&](const size_t) { return def; }); -} - - -void ComplexKeyDirectDictionary::has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const -{ - dict_struct.validateKeyTypes(key_types); - const auto & attribute = attributes.front(); - - switch (attribute.type) + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (attribute.is_nullable) { - case AttributeUnderlyingType::utUInt8: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt16: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt128: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt8: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt16: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utFloat32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utFloat64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utString: - has(attribute, key_columns, out); - break; - - case AttributeUnderlyingType::utDecimal32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utDecimal64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utDecimal128: - has(attribute, 
key_columns, out); - break; + col_null_map_to = ColumnUInt8::create(keys_size, false); + vec_null_map_to = &col_null_map_to->getData(); } + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnProvider = DictionaryAttributeColumnProvider; + + const auto attribute_null_value = std::get(attribute.null_values); + AttributeType null_value = static_cast(attribute_null_value); + DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); + + auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + + if constexpr (std::is_same_v) + { + auto * out = column.get(); + + getItemsImpl( + attribute, + key_columns, + [&](const size_t row, const String value, bool is_null) + { + if (attribute.is_nullable) + (*vec_null_map_to)[row] = is_null; + + const auto ref = StringRef{value}; + out->insertData(ref.data, ref.size); + }, + default_value_extractor); + } + else + { + auto & out = column->getData(); + + getItemsImpl( + attribute, + key_columns, + [&](const size_t row, const auto value, bool is_null) + { + if (attribute.is_nullable) + (*vec_null_map_to)[row] = is_null; + + out[row] = value; + }, + default_value_extractor); + } + + + result = std::move(column); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + if (attribute.is_nullable) + { + result = ColumnNullable::create(result, std::move(col_null_map_to)); + } + + return result; } +ColumnUInt8::Ptr ComplexKeyDirectDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const +{ + dict_struct.validateKeyTypes(key_types); + + auto size = key_columns.front()->size(); + auto result = ColumnUInt8::create(size); + auto& out = result->getData(); + + const auto rows = key_columns.front()->size(); + const auto keys_size = dict_struct.key->size(); + StringRefs keys_array(keys_size); + 
MapType has_key; + Arena temporary_keys_pool; + std::vector to_load(rows); + PODArray keys(rows); + + for (const auto row : ext::range(0, rows)) + { + const StringRef key = placeKeysInPool(row, key_columns, keys_array, *dict_struct.key, temporary_keys_pool); + keys[row] = key; + has_key[key] = 0; + to_load[row] = row; + } + + auto stream = source_ptr->loadKeys(key_columns, to_load); + + stream->readPrefix(); + + while (const auto block = stream->read()) + { + const auto columns = ext::map( + ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); + + Arena pool; + + StringRefs keys_temp(keys_size); + + const auto columns_size = columns.front()->size(); + + for (const auto row_idx : ext::range(0, columns_size)) + { + const StringRef key = placeKeysInPool(row_idx, columns, keys_temp, *dict_struct.key, pool); + if (has_key.has(key)) + { + has_key[key] = 1; + } + } + } + + stream->readSuffix(); + + for (const auto row : ext::range(0, rows)) + { + out[row] = has_key[keys[row]]; + } + + query_count.fetch_add(rows, std::memory_order_relaxed); + + return result; +} void ComplexKeyDirectDictionary::createAttributes() { @@ -229,7 +189,7 @@ void ComplexKeyDirectDictionary::createAttributes() { attribute_index_by_name.emplace(attribute.name, attributes.size()); attribute_name_by_index.emplace(attributes.size(), attribute.name); - attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value, attribute.name)); + attributes.push_back(createAttribute(attribute, attribute.null_value, attribute.name)); if (attribute.hierarchical) throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), @@ -237,7 +197,6 @@ void ComplexKeyDirectDictionary::createAttributes() } } - template void ComplexKeyDirectDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) { @@ -254,59 +213,19 @@ void 
ComplexKeyDirectDictionary::createAttributeImpl(Attribute & attribu } -ComplexKeyDirectDictionary::Attribute ComplexKeyDirectDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value, const std::string & attr_name) +ComplexKeyDirectDictionary::Attribute ComplexKeyDirectDictionary::createAttribute( + const DictionaryAttribute & attribute, const Field & null_value, const std::string & attr_name) { - Attribute attr{type, {}, {}, attr_name}; + Attribute attr{attribute.underlying_type, attribute.is_nullable, {}, {}, attr_name}; - switch (type) + auto type_call = [&](const auto &dictionary_attribute_type) { - case AttributeUnderlyingType::utUInt8: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt16: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt128: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt8: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt16: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utFloat32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utFloat64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utString: - createAttributeImpl(attr, null_value); - break; + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + createAttributeImpl(attr, null_value); + }; - case AttributeUnderlyingType::utDecimal32: - createAttributeImpl(attr, null_value); - break; - case 
AttributeUnderlyingType::utDecimal64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utDecimal128: - createAttributeImpl(attr, null_value); - break; - } + callOnDictionaryAttributeType(attribute.underlying_type, type_call); return attr; } @@ -356,14 +275,18 @@ StringRef ComplexKeyDirectDictionary::placeKeysInPool( } -template +template void ComplexKeyDirectDictionary::getItemsImpl( - const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const + const Attribute & attribute, + const Columns & key_columns, + ValueSetter && set_value, + DefaultValueExtractor & default_value_extractor) const { const auto rows = key_columns.front()->size(); const auto keys_size = dict_struct.key->size(); StringRefs keys_array(keys_size); MapType value_by_key; + HashMapWithSavedHash value_is_null; Arena temporary_keys_pool; std::vector to_load(rows); PODArray keys(rows); @@ -372,8 +295,9 @@ void ComplexKeyDirectDictionary::getItemsImpl( { const StringRef key = placeKeysInPool(row, key_columns, keys_array, *dict_struct.key, temporary_keys_pool); keys[row] = key; - value_by_key[key] = get_default(row); + value_by_key[key] = static_cast(default_value_extractor[row]); to_load[row] = row; + value_is_null[key] = false; } auto stream = source_ptr->loadKeys(key_columns, to_load); @@ -392,6 +316,11 @@ void ComplexKeyDirectDictionary::getItemsImpl( }); for (const size_t attribute_idx : ext::range(0, attributes.size())) { + if (attribute.name != attribute_name_by_index.at(attribute_idx)) + { + continue; + } + const IColumn & attribute_column = *attribute_columns[attribute_idx]; Arena pool; @@ -402,17 +331,15 @@ void ComplexKeyDirectDictionary::getItemsImpl( for (const auto row_idx : ext::range(0, columns_size)) { const StringRef key = placeKeysInPool(row_idx, columns, keys_temp, *dict_struct.key, pool); - if (value_by_key.has(key) && attribute.name == attribute_name_by_index.at(attribute_idx)) + + if 
(value_by_key.has(key)) { - if (attribute.type == AttributeUnderlyingType::utFloat32) - { - value_by_key[key] = static_cast(attribute_column[row_idx].template get()); - } + auto value = attribute_column[row_idx]; + + if (value.isNull()) + value_is_null[key] = true; else - { - value_by_key[key] = static_cast(attribute_column[row_idx].template get()); - } - + value_by_key[key] = static_cast(value.template get>()); } } } @@ -422,78 +349,13 @@ void ComplexKeyDirectDictionary::getItemsImpl( for (const auto row : ext::range(0, rows)) { - set_value(row, value_by_key[keys[row]]); + auto key = keys[row]; + set_value(row, value_by_key[key], value_is_null[key]); } query_count.fetch_add(rows, std::memory_order_relaxed); } -template -void ComplexKeyDirectDictionary::getItemsStringImpl( - const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const -{ - const auto rows = key_columns.front()->size(); - const auto keys_size = dict_struct.key->size(); - StringRefs keys_array(keys_size); - MapType value_by_key; - Arena temporary_keys_pool; - std::vector to_load(rows); - PODArray keys(rows); - - for (const auto row : ext::range(0, rows)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys_array, *dict_struct.key, temporary_keys_pool); - keys[row] = key; - value_by_key[key] = get_default(row); - to_load[row] = row; - } - - auto stream = source_ptr->loadKeys(key_columns, to_load); - const auto attributes_size = attributes.size(); - - stream->readPrefix(); - - while (const auto block = stream->read()) - { - const auto columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - const auto attribute_columns = ext::map(ext::range(0, attributes_size), [&](const size_t attribute_idx) - { - return block.safeGetByPosition(keys_size + attribute_idx).column; - }); - for (const size_t attribute_idx : ext::range(0, attributes.size())) - { - 
const IColumn & attribute_column = *attribute_columns[attribute_idx]; - Arena pool; - - StringRefs keys_temp(keys_size); - - const auto columns_size = columns.front()->size(); - - for (const auto row_idx : ext::range(0, columns_size)) - { - const StringRef key = placeKeysInPool(row_idx, columns, keys_temp, *dict_struct.key, pool); - if (value_by_key.has(key) && attribute.name == attribute_name_by_index.at(attribute_idx)) - { - const String from_source = attribute_column[row_idx].template get(); - value_by_key[key] = from_source; - } - } - } - } - - stream->readSuffix(); - - for (const auto row : ext::range(0, rows)) - { - set_value(row, value_by_key[keys[row]]); - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - const ComplexKeyDirectDictionary::Attribute & ComplexKeyDirectDictionary::getAttribute(const std::string & attribute_name) const { const auto it = attribute_index_by_name.find(attribute_name); @@ -503,65 +365,6 @@ const ComplexKeyDirectDictionary::Attribute & ComplexKeyDirectDictionary::getAtt return attributes[it->second]; } - -template -void ComplexKeyDirectDictionary::has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const -{ - const auto rows = key_columns.front()->size(); - const auto keys_size = dict_struct.key->size(); - StringRefs keys_array(keys_size); - MapType has_key; - Arena temporary_keys_pool; - std::vector to_load(rows); - PODArray keys(rows); - - for (const auto row : ext::range(0, rows)) - { - const StringRef key = placeKeysInPool(row, key_columns, keys_array, *dict_struct.key, temporary_keys_pool); - keys[row] = key; - has_key[key] = 0; - to_load[row] = row; - } - - auto stream = source_ptr->loadKeys(key_columns, to_load); - - stream->readPrefix(); - - while (const auto block = stream->read()) - { - const auto columns = ext::map( - ext::range(0, keys_size), [&](const size_t attribute_idx) { return block.safeGetByPosition(attribute_idx).column; }); - - for (const size_t attribute_idx : 
ext::range(0, attributes.size())) - { - Arena pool; - - StringRefs keys_temp(keys_size); - - const auto columns_size = columns.front()->size(); - - for (const auto row_idx : ext::range(0, columns_size)) - { - const StringRef key = placeKeysInPool(row_idx, columns, keys_temp, *dict_struct.key, pool); - if (has_key.has(key) && attribute.name == attribute_name_by_index.at(attribute_idx)) - { - has_key[key] = 1; - } - } - } - } - - stream->readSuffix(); - - for (const auto row : ext::range(0, rows)) - { - out[row] = has_key[keys[row]]; - } - - query_count.fetch_add(rows, std::memory_order_relaxed); -} - - BlockInputStreamPtr ComplexKeyDirectDictionary::getBlockInputStream(const Names & /* column_names */, size_t /* max_block_size */) const { return source_ptr->loadAll(); diff --git a/src/Dictionaries/ComplexKeyDirectDictionary.h b/src/Dictionaries/ComplexKeyDirectDictionary.h index dc602be103f..0e191321daa 100644 --- a/src/Dictionaries/ComplexKeyDirectDictionary.h +++ b/src/Dictionaries/ComplexKeyDirectDictionary.h @@ -12,14 +12,13 @@ #include #include #include -#include "DictionaryStructure.h" #include "IDictionary.h" #include "IDictionarySource.h" - +#include "DictionaryStructure.h" +#include "DictionaryHelpers.h" namespace DB { -using BlockPtr = std::shared_ptr; class ComplexKeyDirectDictionary final : public IDictionaryBase { @@ -60,78 +59,16 @@ public: return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; } - template - using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; + DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } -#define DECLARE(TYPE) \ - void get##TYPE(const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - 
DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE + ColumnPtr getColumn( + const std::string& attribute_name, + const DataTypePtr & result_type, + const Columns & key_columns, + const DataTypes & key_types, + const ColumnPtr default_values_column) const override; - void getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const; - -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, const ColumnString * const def, ColumnString * const out) const; - -#define DECLARE(TYPE) \ - void get##TYPE(const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, const TYPE def, ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, const String & def, ColumnString * const out) const; - - void has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const; + ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; BlockInputStreamPtr getBlockInputStream(const 
Names & column_names, size_t max_block_size) const override; @@ -142,6 +79,8 @@ private: struct Attribute final { AttributeUnderlyingType type; + bool is_nullable; + std::variant< UInt8, UInt16, @@ -168,27 +107,21 @@ private: template void addAttributeSize(const Attribute & attribute); - void calculateBytesAllocated(); - template - void createAttributeImpl(Attribute & attribute, const Field & null_value); + static void createAttributeImpl(Attribute & attribute, const Field & null_value); - Attribute createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value, const std::string & name); + static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value, const std::string & name); template StringRef placeKeysInPool( const size_t row, const Columns & key_columns, StringRefs & keys, const std::vector & key_attributes, Pool & pool) const; - template - void getItemsStringImpl( - const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const; - - template + template void getItemsImpl( - const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const; - - template - void resize(Attribute & attribute, const Key id); + const Attribute & attribute, + const Columns & key_columns, + ValueSetter && set_value, + DefaultValueExtractor & default_value_extractor) const; template void setAttributeValueImpl(Attribute & attribute, const Key id, const T & value); @@ -197,9 +130,6 @@ private: const Attribute & getAttribute(const std::string & attribute_name) const; - template - void has(const Attribute & attribute, const Columns & key_columns, PaddedPODArray & out) const; - const DictionaryStructure dict_struct; const DictionarySourcePtr source_ptr; const DictionaryLifetime dict_lifetime; diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.cpp b/src/Dictionaries/ComplexKeyHashedDictionary.cpp index 
676196fabd2..a0784b5a417 100644 --- a/src/Dictionaries/ComplexKeyHashedDictionary.cpp +++ b/src/Dictionaries/ComplexKeyHashedDictionary.cpp @@ -1,6 +1,10 @@ #include "ComplexKeyHashedDictionary.h" #include #include +#include +#include +#include +#include #include "DictionaryBlockInputStream.h" #include "DictionaryFactory.h" @@ -32,216 +36,111 @@ ComplexKeyHashedDictionary::ComplexKeyHashedDictionary( calculateBytesAllocated(); } -#define DECLARE(TYPE) \ - void ComplexKeyHashedDictionary::get##TYPE( \ - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ -\ - const auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ -\ - const auto null_value = std::get(attribute.null_values); \ -\ - getItemsImpl( \ - attribute, \ - key_columns, \ - [&](const size_t row, const auto value) { out[row] = value; }, \ - [&](const size_t) { return null_value; }); \ - } -DECLARE(UInt8) -DECLARE(UInt16) -DECLARE(UInt32) -DECLARE(UInt64) -DECLARE(UInt128) -DECLARE(Int8) -DECLARE(Int16) -DECLARE(Int32) -DECLARE(Int64) -DECLARE(Float32) -DECLARE(Float64) -DECLARE(Decimal32) -DECLARE(Decimal64) -DECLARE(Decimal128) -#undef DECLARE - -void ComplexKeyHashedDictionary::getString( - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const -{ - dict_struct.validateKeyTypes(key_types); - - const auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); - - const auto & null_value = StringRef{std::get(attribute.null_values)}; - - getItemsImpl( - attribute, - key_columns, - [&](const size_t, const StringRef value) { out->insertData(value.data, value.size); }, - [&](const size_t) { return null_value; }); -} - -#define DECLARE(TYPE) \ - void 
ComplexKeyHashedDictionary::get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const \ - { \ - dict_struct.validateKeyTypes(key_types); \ -\ - const auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ -\ - getItemsImpl( \ - attribute, \ - key_columns, \ - [&](const size_t row, const auto value) { out[row] = value; }, \ - [&](const size_t row) { return def[row]; }); \ - } -DECLARE(UInt8) -DECLARE(UInt16) -DECLARE(UInt32) -DECLARE(UInt64) -DECLARE(UInt128) -DECLARE(Int8) -DECLARE(Int16) -DECLARE(Int32) -DECLARE(Int64) -DECLARE(Float32) -DECLARE(Float64) -DECLARE(Decimal32) -DECLARE(Decimal64) -DECLARE(Decimal128) -#undef DECLARE - -void ComplexKeyHashedDictionary::getString( +ColumnPtr ComplexKeyHashedDictionary::getColumn( const std::string & attribute_name, + const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnString * const def, - ColumnString * const out) const + const ColumnPtr default_values_column) const { dict_struct.validateKeyTypes(key_types); + ColumnPtr result; + const auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type); - getItemsImpl( - attribute, - key_columns, - [&](const size_t, const StringRef value) { out->insertData(value.data, value.size); }, - [&](const size_t row) { return def->getDataAt(row); }); -} + auto keys_size = key_columns.front()->size(); -#define DECLARE(TYPE) \ - void ComplexKeyHashedDictionary::get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const TYPE def, \ - ResultArrayType & out) const \ - { \ - 
dict_struct.validateKeyTypes(key_types); \ -\ - const auto & attribute = getAttribute(attribute_name); \ - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::ut##TYPE); \ -\ - getItemsImpl( \ - attribute, key_columns, [&](const size_t row, const auto value) { out[row] = value; }, [&](const size_t) { return def; }); \ + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to = nullptr; + if (attribute.is_nullable) + { + col_null_map_to = ColumnUInt8::create(keys_size, false); + vec_null_map_to = &col_null_map_to->getData(); } -DECLARE(UInt8) -DECLARE(UInt16) -DECLARE(UInt32) -DECLARE(UInt64) -DECLARE(UInt128) -DECLARE(Int8) -DECLARE(Int16) -DECLARE(Int32) -DECLARE(Int64) -DECLARE(Float32) -DECLARE(Float64) -DECLARE(Decimal32) -DECLARE(Decimal64) -DECLARE(Decimal128) -#undef DECLARE -void ComplexKeyHashedDictionary::getString( - const std::string & attribute_name, - const Columns & key_columns, - const DataTypes & key_types, - const String & def, - ColumnString * const out) const -{ - dict_struct.validateKeyTypes(key_types); + auto type_call = [&](const auto &dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnProvider = DictionaryAttributeColumnProvider; - const auto & attribute = getAttribute(attribute_name); - checkAttributeType(this, attribute_name, attribute.type, AttributeUnderlyingType::utString); + const auto attribute_null_value = std::get(attribute.null_values); + AttributeType null_value = static_cast(attribute_null_value); + DictionaryDefaultValueExtractor default_value_extractor(std::move(null_value), default_values_column); - getItemsImpl( - attribute, - key_columns, - [&](const size_t, const StringRef value) { out->insertData(value.data, value.size); }, - [&](const size_t) { return StringRef{def}; }); + auto column = ColumnProvider::getColumn(dictionary_attribute, keys_size); + + 
if constexpr (std::is_same_v) + { + auto * out = column.get(); + + getItemsImpl( + attribute, + key_columns, + [&](const size_t row, const StringRef value, bool is_null) + { + if (attribute.is_nullable) + (*vec_null_map_to)[row] = is_null; + + out->insertData(value.data, value.size); + }, + default_value_extractor); + } + else + { + auto & out = column->getData(); + + getItemsImpl( + attribute, + key_columns, + [&](const size_t row, const auto value, bool is_null) + { + if (attribute.is_nullable) + (*vec_null_map_to)[row] = is_null; + + out[row] = value; + }, + default_value_extractor); + } + + result = std::move(column); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + if (attribute.is_nullable) + { + result = ColumnNullable::create(result, std::move(col_null_map_to)); + } + + return result; } -void ComplexKeyHashedDictionary::has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const +ColumnUInt8::Ptr ComplexKeyHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const { dict_struct.validateKeyTypes(key_types); + auto size = key_columns.front()->size(); + auto result = ColumnUInt8::create(size); + auto& out = result->getData(); + const auto & attribute = attributes.front(); - switch (attribute.type) + auto type_call = [&](const auto & dictionary_attribute_type) { - case AttributeUnderlyingType::utUInt8: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt16: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utUInt128: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt8: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt16: - has(attribute, key_columns, out); - break; - case 
AttributeUnderlyingType::utInt32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utInt64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utFloat32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utFloat64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utString: - has(attribute, key_columns, out); - break; + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; - case AttributeUnderlyingType::utDecimal32: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utDecimal64: - has(attribute, key_columns, out); - break; - case AttributeUnderlyingType::utDecimal128: - has(attribute, key_columns, out); - break; - } + has(attribute, key_columns, out); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + return result; } void ComplexKeyHashedDictionary::createAttributes() @@ -252,7 +151,7 @@ void ComplexKeyHashedDictionary::createAttributes() for (const auto & attribute : dict_struct.attributes) { attribute_index_by_name.emplace(attribute.name, attributes.size()); - attributes.push_back(createAttributeWithType(attribute.underlying_type, attribute.null_value)); + attributes.push_back(createAttribute(attribute, attribute.null_value)); if (attribute.hierarchical) throw Exception{full_name + ": hierarchical attributes not supported for dictionary of type " + getTypeName(), @@ -407,66 +306,30 @@ void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) bucket_count = map_ref.getBufferSizeInCells(); } +template <> +void ComplexKeyHashedDictionary::addAttributeSize(const Attribute & attribute) +{ + const auto & map_ref = std::get>(attribute.maps); + bytes_allocated += sizeof(ContainerType) + map_ref.getBufferSizeInBytes(); + bucket_count = map_ref.getBufferSizeInCells(); + bytes_allocated += sizeof(Arena) + 
attribute.string_arena->size(); +} + void ComplexKeyHashedDictionary::calculateBytesAllocated() { bytes_allocated += attributes.size() * sizeof(attributes.front()); for (const auto & attribute : attributes) { - switch (attribute.type) + auto type_call = [&](const auto & dictionary_attribute_type) { - case AttributeUnderlyingType::utUInt8: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utUInt16: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utUInt32: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utUInt64: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utUInt128: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utInt8: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utInt16: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utInt32: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utInt64: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utFloat32: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utFloat64: - addAttributeSize(attribute); - break; + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; - case AttributeUnderlyingType::utDecimal32: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utDecimal64: - addAttributeSize(attribute); - break; - case AttributeUnderlyingType::utDecimal128: - addAttributeSize(attribute); - break; + addAttributeSize(attribute); + }; - case AttributeUnderlyingType::utString: - { - addAttributeSize(attribute); - bytes_allocated += sizeof(Arena) + attribute.string_arena->size(); - - break; - } - } + callOnDictionaryAttributeType(attribute.type, type_call); } bytes_allocated += keys_pool.size(); @@ -479,73 +342,41 @@ void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, cons attribute.maps.emplace>(); } 
-ComplexKeyHashedDictionary::Attribute -ComplexKeyHashedDictionary::createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value) +template <> +void ComplexKeyHashedDictionary::createAttributeImpl(Attribute & attribute, const Field & null_value) { - Attribute attr{type, {}, {}, {}}; + attribute.string_arena = std::make_unique(); + const String & string = null_value.get(); + const char * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); + attribute.null_values.emplace(string_in_arena, string.size()); + attribute.maps.emplace>(); +} - switch (type) +ComplexKeyHashedDictionary::Attribute +ComplexKeyHashedDictionary::createAttribute(const DictionaryAttribute & attribute, const Field & null_value) +{ + auto nullable_set = attribute.is_nullable ? std::make_unique() : nullptr; + Attribute attr{attribute.underlying_type, attribute.is_nullable, std::move(nullable_set), {}, {}, {}}; + + auto type_call = [&](const auto &dictionary_attribute_type) { - case AttributeUnderlyingType::utUInt8: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt16: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utUInt128: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt8: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt16: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utInt64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utFloat32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utFloat64: - createAttributeImpl(attr, 
null_value); - break; + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + createAttributeImpl(attr, null_value); + }; - case AttributeUnderlyingType::utDecimal32: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utDecimal64: - createAttributeImpl(attr, null_value); - break; - case AttributeUnderlyingType::utDecimal128: - createAttributeImpl(attr, null_value); - break; - - case AttributeUnderlyingType::utString: - { - attr.null_values = null_value.get(); - attr.maps.emplace>(); - attr.string_arena = std::make_unique(); - break; - } - } + callOnDictionaryAttributeType(attribute.underlying_type, type_call); return attr; } -template +template void ComplexKeyHashedDictionary::getItemsImpl( - const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const + const Attribute & attribute, + const Columns & key_columns, + ValueSetter && set_value, + DefaultValueExtractor & default_value_extractor) const { const auto & attr = std::get>(attribute.maps); @@ -560,7 +391,18 @@ void ComplexKeyHashedDictionary::getItemsImpl( const auto key = placeKeysInPool(i, key_columns, keys, temporary_keys_pool); const auto it = attr.find(key); - set_value(i, it ? 
static_cast(it->getMapped()) : get_default(i)); + + if (it) + { + set_value(i, static_cast(it->getMapped()), false); + } + else + { + if (attribute.is_nullable && attribute.nullable_set->find(key) != nullptr) + set_value(i, default_value_extractor[i], true); + else + set_value(i, default_value_extractor[i], false); + } /// free memory allocated for the key temporary_keys_pool.rollback(key.size); @@ -578,51 +420,42 @@ bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, co return pair.second; } +template <> +bool ComplexKeyHashedDictionary::setAttributeValueImpl(Attribute & attribute, const StringRef key, const String value) +{ + const auto * string_in_arena = attribute.string_arena->insert(value.data(), value.size()); + return setAttributeValueImpl(attribute, key, StringRef{string_in_arena, value.size()}); +} + bool ComplexKeyHashedDictionary::setAttributeValue(Attribute & attribute, const StringRef key, const Field & value) { - switch (attribute.type) + bool result = false; + + auto type_call = [&](const auto &dictionary_attribute_type) { - case AttributeUnderlyingType::utUInt8: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utUInt16: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utUInt32: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utUInt64: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utUInt128: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utInt8: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utInt16: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utInt32: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utInt64: - return setAttributeValueImpl(attribute, key, 
value.get()); - case AttributeUnderlyingType::utFloat32: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utFloat64: - return setAttributeValueImpl(attribute, key, value.get()); + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; - case AttributeUnderlyingType::utDecimal32: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utDecimal64: - return setAttributeValueImpl(attribute, key, value.get()); - case AttributeUnderlyingType::utDecimal128: - return setAttributeValueImpl(attribute, key, value.get()); - - case AttributeUnderlyingType::utString: + if (attribute.is_nullable) { - auto & map = std::get>(attribute.maps); - const auto & string = value.get(); - const auto * string_in_arena = attribute.string_arena->insert(string.data(), string.size()); - const auto pair = map.insert({key, StringRef{string_in_arena, string.size()}}); - return pair.second; + if (value.isNull()) + { + attribute.nullable_set->insert(key); + result = true; + return; + } + else + { + attribute.nullable_set->erase(key); + } } - } - return {}; + result = setAttributeValueImpl(attribute, key, value.get>()); + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + return result; } const ComplexKeyHashedDictionary::Attribute & ComplexKeyHashedDictionary::getAttribute(const std::string & attribute_name) const @@ -673,6 +506,9 @@ void ComplexKeyHashedDictionary::has(const Attribute & attribute, const Columns const auto it = attr.find(key); out[i] = static_cast(it); + if (attribute.is_nullable && !out[i]) + out[i] = attribute.nullable_set->find(key) != nullptr; + /// free memory allocated for the key temporary_keys_pool.rollback(key.size); } @@ -684,41 +520,26 @@ std::vector ComplexKeyHashedDictionary::getKeys() const { const Attribute & attribute = attributes.front(); - switch (attribute.type) - { - case AttributeUnderlyingType::utUInt8: - return getKeys(attribute); - case 
AttributeUnderlyingType::utUInt16: - return getKeys(attribute); - case AttributeUnderlyingType::utUInt32: - return getKeys(attribute); - case AttributeUnderlyingType::utUInt64: - return getKeys(attribute); - case AttributeUnderlyingType::utUInt128: - return getKeys(attribute); - case AttributeUnderlyingType::utInt8: - return getKeys(attribute); - case AttributeUnderlyingType::utInt16: - return getKeys(attribute); - case AttributeUnderlyingType::utInt32: - return getKeys(attribute); - case AttributeUnderlyingType::utInt64: - return getKeys(attribute); - case AttributeUnderlyingType::utFloat32: - return getKeys(attribute); - case AttributeUnderlyingType::utFloat64: - return getKeys(attribute); - case AttributeUnderlyingType::utString: - return getKeys(attribute); + std::vector result; - case AttributeUnderlyingType::utDecimal32: - return getKeys(attribute); - case AttributeUnderlyingType::utDecimal64: - return getKeys(attribute); - case AttributeUnderlyingType::utDecimal128: - return getKeys(attribute); - } - return {}; + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + + if constexpr (std::is_same_v) + { + result = getKeys(attribute); + } + else + { + result = getKeys(attribute); + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + + return result; } template @@ -730,12 +551,18 @@ std::vector ComplexKeyHashedDictionary::getKeys(const Attribute & att for (const auto & key : attr) keys.push_back(key.getKey()); + if (attribute.is_nullable) + { + for (const auto & key: *attribute.nullable_set) + keys.push_back(key.getKey()); + } + return keys; } BlockInputStreamPtr ComplexKeyHashedDictionary::getBlockInputStream(const Names & column_names, size_t max_block_size) const { - using BlockInputStreamType = DictionaryBlockInputStream; + using BlockInputStreamType = DictionaryBlockInputStream; return std::make_shared(shared_from_this(), max_block_size, 
getKeys(), column_names); } diff --git a/src/Dictionaries/ComplexKeyHashedDictionary.h b/src/Dictionaries/ComplexKeyHashedDictionary.h index baf6628eebd..ecc720ca0b0 100644 --- a/src/Dictionaries/ComplexKeyHashedDictionary.h +++ b/src/Dictionaries/ComplexKeyHashedDictionary.h @@ -7,17 +7,17 @@ #include #include #include +#include #include #include #include -#include "DictionaryStructure.h" #include "IDictionary.h" #include "IDictionarySource.h" - +#include "DictionaryStructure.h" +#include "DictionaryHelpers.h" namespace DB { -using BlockPtr = std::shared_ptr; class ComplexKeyHashedDictionary final : public IDictionaryBase { @@ -60,91 +60,16 @@ public: return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective; } - template - using ResultArrayType = std::conditional_t, DecimalPaddedPODArray, PaddedPODArray>; + DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; } -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString(const std::string & attribute_name, const Columns & key_columns, const DataTypes & key_types, ColumnString * out) const; - -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const PaddedPODArray & def, \ - ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - 
DECLARE(Decimal128) -#undef DECLARE - - void getString( - const std::string & attribute_name, + ColumnPtr getColumn( + const std::string& attribute_name, + const DataTypePtr & result_type, const Columns & key_columns, const DataTypes & key_types, - const ColumnString * const def, - ColumnString * const out) const; + const ColumnPtr default_values_column) const override; -#define DECLARE(TYPE) \ - void get##TYPE( \ - const std::string & attribute_name, \ - const Columns & key_columns, \ - const DataTypes & key_types, \ - const TYPE def, \ - ResultArrayType & out) const; - DECLARE(UInt8) - DECLARE(UInt16) - DECLARE(UInt32) - DECLARE(UInt64) - DECLARE(UInt128) - DECLARE(Int8) - DECLARE(Int16) - DECLARE(Int32) - DECLARE(Int64) - DECLARE(Float32) - DECLARE(Float64) - DECLARE(Decimal32) - DECLARE(Decimal64) - DECLARE(Decimal128) -#undef DECLARE - - void getString( - const std::string & attribute_name, - const Columns & key_columns, - const DataTypes & key_types, - const String & def, - ColumnString * const out) const; - - void has(const Columns & key_columns, const DataTypes & key_types, PaddedPODArray & out) const; + ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override; BlockInputStreamPtr getBlockInputStream(const Names & column_names, size_t max_block_size) const override; @@ -152,9 +77,14 @@ private: template using ContainerType = HashMapWithSavedHash; + using NullableSet = HashSetWithSavedHash; + struct Attribute final { AttributeUnderlyingType type; + bool is_nullable; + std::unique_ptr nullable_set; + std::variant< UInt8, UInt16, @@ -170,7 +100,7 @@ private: Decimal128, Float32, Float64, - String> + StringRef> null_values; std::variant< ContainerType, @@ -206,18 +136,21 @@ private: void calculateBytesAllocated(); template - void createAttributeImpl(Attribute & attribute, const Field & null_value); + static void createAttributeImpl(Attribute & attribute, const Field & null_value); - Attribute 
createAttributeWithType(const AttributeUnderlyingType type, const Field & null_value); + static Attribute createAttribute(const DictionaryAttribute & attribute, const Field & null_value); - template - void - getItemsImpl(const Attribute & attribute, const Columns & key_columns, ValueSetter && set_value, DefaultGetter && get_default) const; + template + void getItemsImpl( + const Attribute & attribute, + const Columns & key_columns, + ValueSetter && set_value, + DefaultValueExtractor & default_value_extractor) const; template - bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value); + static bool setAttributeValueImpl(Attribute & attribute, const StringRef key, const T value); - bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value); + static bool setAttributeValue(Attribute & attribute, const StringRef key, const Field & value); const Attribute & getAttribute(const std::string & attribute_name) const; diff --git a/src/Dictionaries/DictionaryBlockInputStream.h b/src/Dictionaries/DictionaryBlockInputStream.h index c683ef0e9cc..f045d47c2c2 100644 --- a/src/Dictionaries/DictionaryBlockInputStream.h +++ b/src/Dictionaries/DictionaryBlockInputStream.h @@ -25,12 +25,10 @@ namespace ErrorCodes /* BlockInputStream implementation for external dictionaries * read() returns blocks consisting of the in-memory contents of the dictionaries */ -template +template class DictionaryBlockInputStream : public DictionaryBlockInputStreamBase { public: - using DictionaryPtr = std::shared_ptr; - DictionaryBlockInputStream( std::shared_ptr dictionary, UInt64 max_block_size, PaddedPODArray && ids, const Names & column_names); @@ -60,111 +58,9 @@ protected: Block getBlock(size_t start, size_t size) const override; private: - // pointer types to getXXX functions - // for single key dictionaries - template - using DictionaryGetter = void (DictionaryType::*)(const std::string &, const PaddedPODArray &, PaddedPODArray &) const; - - 
template - using DictionaryDecimalGetter - = void (DictionaryType::*)(const std::string &, const PaddedPODArray &, DecimalPaddedPODArray &) const; - - using DictionaryStringGetter = void (DictionaryType::*)(const std::string &, const PaddedPODArray &, ColumnString *) const; - - // for complex complex key dictionaries - template - using GetterByKey = void (DictionaryType::*)(const std::string &, const Columns &, const DataTypes &, PaddedPODArray & out) const; - - template - using DecimalGetterByKey - = void (DictionaryType::*)(const std::string &, const Columns &, const DataTypes &, DecimalPaddedPODArray & out) const; - - using StringGetterByKey = void (DictionaryType::*)(const std::string &, const Columns &, const DataTypes &, ColumnString * out) const; - - // call getXXX - // for single key dictionaries - template - void callGetter( - DictionaryGetter getter, - const PaddedPODArray & ids_to_fill, - const Columns & keys, - const DataTypes & data_types, - Container & container, - const DictionaryAttribute & attribute, - const DictionaryType & dictionary) const; - - template - void callGetter( - DictionaryDecimalGetter getter, - const PaddedPODArray & ids_to_fill, - const Columns & keys, - const DataTypes & data_types, - Container & container, - const DictionaryAttribute & attribute, - const DictionaryType & dictionary) const; - - template - void callGetter( - DictionaryStringGetter getter, - const PaddedPODArray & ids_to_fill, - const Columns & keys, - const DataTypes & data_types, - Container & container, - const DictionaryAttribute & attribute, - const DictionaryType & dictionary) const; - - // for complex complex key dictionaries - template - void callGetter( - GetterByKey getter, - const PaddedPODArray & ids_to_fill, - const Columns & keys, - const DataTypes & data_types, - Container & container, - const DictionaryAttribute & attribute, - const DictionaryType & dictionary) const; - - template - void callGetter( - DecimalGetterByKey getter, - const PaddedPODArray 
& ids_to_fill, - const Columns & keys, - const DataTypes & data_types, - Container & container, - const DictionaryAttribute & attribute, - const DictionaryType & dictionary) const; - - template - void callGetter( - StringGetterByKey getter, - const PaddedPODArray & ids_to_fill, - const Columns & keys, - const DataTypes & data_types, - Container & container, - const DictionaryAttribute & attribute, - const DictionaryType & dictionary) const; - - template