Merge branch 'master' into ncb/server_uuid

This commit is contained in:
Alexander Tokmakov 2021-08-17 18:07:09 +03:00
commit df1a4e90a8
402 changed files with 5407 additions and 1075 deletions

View File

@ -9,10 +9,6 @@ if (GLIBC_COMPATIBILITY)
check_include_file("sys/random.h" HAVE_SYS_RANDOM_H)
if(COMPILER_CLANG)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-builtin-requires-header")
endif()
add_headers_and_sources(glibc_compatibility .)
add_headers_and_sources(glibc_compatibility musl)
if (ARCH_AARCH64)
@ -35,11 +31,9 @@ if (GLIBC_COMPATIBILITY)
add_library(glibc-compatibility STATIC ${glibc_compatibility_sources})
if (COMPILER_CLANG)
target_compile_options(glibc-compatibility PRIVATE -Wno-unused-command-line-argument)
elseif (COMPILER_GCC)
target_compile_options(glibc-compatibility PRIVATE -Wno-unused-but-set-variable)
endif ()
target_no_warning(glibc-compatibility unused-command-line-argument)
target_no_warning(glibc-compatibility unused-but-set-variable)
target_no_warning(glibc-compatibility builtin-requires-header)
target_include_directories(glibc-compatibility PRIVATE libcxxabi ${musl_arch_include_dir})

View File

@ -27,3 +27,22 @@ endmacro ()
# Disables the warning `flag` by forwarding the negated name ("no-<flag>") to add_warning.
#   flag - warning name without the "-W" / "-Wno-" prefix.
macro (no_warning flag)
add_warning(no-${flag})
endmacro ()
# The same as add_warning, but the option is applied only to the specified target.
# Arguments:
#   target - target that receives the option; it is added with PRIVATE visibility.
#   flag   - warning name without the leading "-W".
# If the compiler check fails, a configure-time WARNING is printed and the target is left unchanged.
# This is a macro, so `underscored_flag` is set in the caller's scope.
# NOTE(review): the probe uses check_cxx_compiler_flag, i.e. the C++ compiler, even when the
# target may consist of C sources - confirm this is acceptable for such targets.
macro (target_add_warning target flag)
# Turn the flag into a valid variable-name suffix: '-' becomes '_', '+' becomes 'x'.
string (REPLACE "-" "_" underscored_flag ${flag})
string (REPLACE "+" "x" underscored_flag ${underscored_flag})
# The check result is stored per flag in SUPPORTS_CXXFLAG_<suffix>.
check_cxx_compiler_flag("-W${flag}" SUPPORTS_CXXFLAG_${underscored_flag})
if (SUPPORTS_CXXFLAG_${underscored_flag})
target_compile_options (${target} PRIVATE "-W${flag}")
else ()
message (WARNING "Flag -W${flag} is unsupported")
endif ()
endmacro ()
# Disables the warning `flag` for a single target by passing "no-<flag>" to target_add_warning.
#   target - target that receives the option.
#   flag   - warning name without the "-W" / "-Wno-" prefix.
macro (target_no_warning target flag)
target_add_warning(${target} no-${flag})
endmacro ()

View File

@ -119,12 +119,9 @@ set(ORC_SRCS
"${ORC_SOURCE_SRC_DIR}/ColumnWriter.cc"
"${ORC_SOURCE_SRC_DIR}/Common.cc"
"${ORC_SOURCE_SRC_DIR}/Compression.cc"
"${ORC_SOURCE_SRC_DIR}/Exceptions.cc"
"${ORC_SOURCE_SRC_DIR}/Int128.cc"
"${ORC_SOURCE_SRC_DIR}/LzoDecompressor.cc"
"${ORC_SOURCE_SRC_DIR}/MemoryPool.cc"
"${ORC_SOURCE_SRC_DIR}/OrcFile.cc"
"${ORC_SOURCE_SRC_DIR}/Reader.cc"
"${ORC_SOURCE_SRC_DIR}/RLE.cc"
"${ORC_SOURCE_SRC_DIR}/RLEv1.cc"
"${ORC_SOURCE_SRC_DIR}/RLEv2.cc"

View File

@ -27,16 +27,12 @@ target_include_directories(roaring SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/cpp")
# We redirect malloc/free family of functions to different functions that will track memory in ClickHouse.
# Also note that we exploit implicit function declarations.
# Also it is disabled on Mac OS because it fails.
target_compile_definitions(roaring PRIVATE
-Dmalloc=clickhouse_malloc
-Dcalloc=clickhouse_calloc
-Drealloc=clickhouse_realloc
-Dreallocarray=clickhouse_reallocarray
-Dfree=clickhouse_free
-Dposix_memalign=clickhouse_posix_memalign)
if (NOT OS_DARWIN)
target_compile_definitions(roaring PRIVATE
-Dmalloc=clickhouse_malloc
-Dcalloc=clickhouse_calloc
-Drealloc=clickhouse_realloc
-Dreallocarray=clickhouse_reallocarray
-Dfree=clickhouse_free
-Dposix_memalign=clickhouse_posix_memalign)
target_link_libraries(roaring PUBLIC clickhouse_common_io)
endif ()
target_link_libraries(roaring PUBLIC clickhouse_common_io)

View File

@ -1,10 +1,9 @@
# Disabled under OSX until https://github.com/ClickHouse/ClickHouse/issues/27568 is fixed
if (SANITIZE OR NOT (
((OS_LINUX OR OS_FREEBSD) AND (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE)) OR
(OS_DARWIN AND (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_BUILD_TYPE STREQUAL "Debug"))
))
((OS_LINUX OR OS_FREEBSD) AND (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE))))
if (ENABLE_JEMALLOC)
message (${RECONFIGURE_MESSAGE_LEVEL}
"jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64, or ppc64le Linux or FreeBSD builds and RelWithDebInfo macOS builds.")
"jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64, or ppc64le Linux or FreeBSD builds")
endif ()
set (ENABLE_JEMALLOC OFF)
else ()

View File

@ -226,7 +226,7 @@ continue
task_exit_code=$fuzzer_exit_code
echo "failure" > status.txt
{ grep --text -o "Found error:.*" fuzzer.log \
|| grep --text -o "Exception.*" fuzzer.log \
|| grep --text -ao "Exception:.*" fuzzer.log \
|| echo "Fuzzer failed ($fuzzer_exit_code). See the logs." ; } \
| tail -1 > description.txt
fi

View File

@ -1,3 +1,8 @@
---
toc_priority: 36
toc_title: Replicated
---
# [experimental] Replicated {#replicated}
The engine is based on the [Atomic](../../engines/database-engines/atomic.md) engine. It supports replication of metadata via DDL log being written to ZooKeeper and executed on all of the replicas for a given database.

View File

@ -844,44 +844,3 @@ S3 disk can be configured as `main` or `cold` storage:
```
With the `cold` option, data can be moved to S3 when the free space on the local disk becomes smaller than `move_factor * disk_size`, or by a TTL move rule.
## Using HDFS for Data Storage {#table_engine-mergetree-hdfs}
[HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html) is a distributed file system for remote data storage.
`MergeTree` family table engines can store data to HDFS using a disk with type `HDFS`.
Configuration markup:
``` xml
<yandex>
<storage_configuration>
<disks>
<hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
</hdfs>
</disks>
<policies>
<hdfs>
<volumes>
<main>
<disk>hdfs</disk>
</main>
</volumes>
</hdfs>
</policies>
</storage_configuration>
<merge_tree>
<min_bytes_for_wide_part>0</min_bytes_for_wide_part>
</merge_tree>
</yandex>
```
Required parameters:
- `endpoint` — HDFS endpoint URL in `path` format. Endpoint URL should contain a root path to store data.
Optional parameters:
- `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1 Mb`.

View File

@ -5,10 +5,111 @@ toc_title: External Disks for Storing Data
# External Disks for Storing Data {#external-disks}
Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon s3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)).
Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be quite expensive. To avoid that you can store the data remotely — on [Amazon S3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)).
To work with data stored on `Amazon s3` disks use [s3](../engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](../engines/table-engines/integrations/hdfs.md) table engine.
To work with data stored on `Amazon S3` disks use [S3](../engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](../engines/table-engines/integrations/hdfs.md) table engine.
## Zero-copy Replication {#zero-copy}
ClickHouse supports zero-copy replication for `s3` and `HDFS` disks, which means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.
ClickHouse supports zero-copy replication for `S3` and `HDFS` disks, which means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.
## Configuring HDFS {#configuring-hdfs}
[MergeTree](../engines/table-engines/mergetree-family/mergetree.md) and [Log](../engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`.
Configuration markup:
``` xml
<yandex>
<storage_configuration>
<disks>
<hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
</hdfs>
</disks>
<policies>
<hdfs>
<volumes>
<main>
<disk>hdfs</disk>
</main>
</volumes>
</hdfs>
</policies>
</storage_configuration>
<merge_tree>
<min_bytes_for_wide_part>0</min_bytes_for_wide_part>
</merge_tree>
</yandex>
```
Required parameters:
- `endpoint` — HDFS endpoint URL in `path` format. Endpoint URL should contain a root path to store data.
Optional parameters:
- `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1 Mb`.
## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system}
You can encrypt the data stored on [S3](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one.
Example of disk configuration:
``` xml
<disks>
<disk1>
<type>local</type>
<path>/path1/</path>
</disk1>
<disk2>
<type>encrypted</type>
<disk>disk1</disk>
<path>path2/</path>
<key>_16_ascii_chars_</key>
</disk2>
</disks>
```
For example, when ClickHouse writes data from some table to a file `store/all_1_1_0/data.bin` to `disk1`, then in fact this file will be written to the physical disk along the path `/path1/store/all_1_1_0/data.bin`.
When writing the same file to `disk2`, it will actually be written to the physical disk at the path `/path1/path2/store/all_1_1_0/data.bin` in encrypted mode.
Required parameters:
- `type` — `encrypted`. Otherwise the encrypted disk is not created.
- `disk` — Type of disk for data storage.
- `key` — The key for encryption and decryption. Type: [UInt64](../sql-reference/data-types/int-uint.md). You can use the `key_hex` parameter to specify the key in hexadecimal form.
You can specify multiple keys using the `id` attribute (see example below).
Optional parameters:
- `path` — Path to the location on the disk where the data will be saved. If not specified, the data will be saved in the root directory.
- `current_key_id` — The key used for encryption. All the specified keys can be used for decryption, and you can always switch to another key while maintaining access to previously encrypted data.
- `algorithm` — [Algorithm](../sql-reference/statements/create/table.md#create-query-encryption-codecs) for encryption. Possible values: `AES_128_CTR`, `AES_192_CTR` or `AES_256_CTR`. Default value: `AES_128_CTR`. The key length depends on the algorithm: `AES_128_CTR` — 16 bytes, `AES_192_CTR` — 24 bytes, `AES_256_CTR` — 32 bytes.
Example of disk configuration:
``` xml
<yandex>
<storage_configuration>
<disks>
<disk_s3>
<type>s3</type>
<endpoint>...
</disk_s3>
<disk_s3_encrypted>
<type>encrypted</type>
<disk>disk_s3</disk>
<algorithm>AES_128_CTR</algorithm>
<key_hex id="0">00112233445566778899aabbccddeeff</key_hex>
<key_hex id="1">ffeeddccbbaa99887766554433221100</key_hex>
<current_key_id>1</current_key_id>
</disk_s3_encrypted>
</disks>
</storage_configuration>
</yandex>
```

View File

@ -952,7 +952,7 @@ SELECT arrayEnumerateDense([10, 20, 10, 30])
## arrayIntersect(arr) {#array-functions-arrayintersect}
Takes multiple arrays, returns an array with elements that are present in all source arrays. Elements order in the resulting array is the same as in the first array.
Takes multiple arrays, returns an array with elements that are present in all source arrays.
Example:

View File

@ -384,5 +384,32 @@ ExpressionTransform
(ReadFromStorage)
NumbersMt × 2 0 → 1
```
### EXPLAIN ESTIMATE {#explain-estimate}
Shows the estimated number of rows, marks and parts to be read from the tables while processing the query. Works with tables in the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) family.
**Example**
Creating a table:
```sql
CREATE TABLE ttt (i Int64) ENGINE = MergeTree() ORDER BY i SETTINGS index_granularity = 16, write_final_mark = 0;
INSERT INTO ttt SELECT number FROM numbers(128);
OPTIMIZE TABLE ttt;
```
Query:
```sql
EXPLAIN ESTIMATE SELECT * FROM ttt;
```
Result:
```text
┌─database─┬─table─┬─parts─┬─rows─┬─marks─┐
│ default │ ttt │ 1 │ 128 │ 8 │
└──────────┴───────┴───────┴──────┴───────┘
```
[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/explain/) <!--hide-->

View File

@ -1,3 +1,7 @@
---
toc_priority: 36
toc_title: Replicated
---
# [экспериментальный] Replicated {#replicated}

View File

@ -827,44 +827,3 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd'
```
Если диск сконфигурирован как `cold`, данные будут переноситься в S3 при срабатывании правил TTL или когда свободное место на локальном диске станет меньше порогового значения, которое определяется как `move_factor * disk_size`.
## Использование сервиса HDFS для хранения данных {#table_engine-mergetree-hdfs}
[HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html) — это распределенная файловая система для удаленного хранения данных.
Таблицы семейства `MergeTree` могут хранить данные в сервисе HDFS при использовании диска типа `HDFS`.
Пример конфигурации:
``` xml
<yandex>
<storage_configuration>
<disks>
<hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
</hdfs>
</disks>
<policies>
<hdfs>
<volumes>
<main>
<disk>hdfs</disk>
</main>
</volumes>
</hdfs>
</policies>
</storage_configuration>
<merge_tree>
<min_bytes_for_wide_part>0</min_bytes_for_wide_part>
</merge_tree>
</yandex>
```
Обязательные параметры:
- `endpoint` — URL точки приема запроса на стороне HDFS в формате `path`. URL точки должен содержать путь к корневой директории на сервере, где хранятся данные.
Необязательные параметры:
- `min_bytes_for_seek` — минимальное количество байтов, которые используются для операций поиска вместо последовательного чтения. Значение по умолчанию: 1 МБайт.

View File

@ -5,10 +5,110 @@ toc_title: "Хранение данных на внешних дисках"
# Хранение данных на внешних дисках {#external-disks}
Данные, которые обрабатываются в ClickHouse, обычно хранятся в файловой системе локально, где развернут сервер ClickHouse. При этом для хранения данных требуются диски большого объема, которые могут быть довольно дорогостоящими. Решением проблемы может стать хранение данных отдельно от сервера — в распределенных файловых системах — [Amazon s3](https://aws.amazon.com/s3/) или Hadoop ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)).
Данные, которые обрабатываются в ClickHouse, обычно хранятся в файловой системе локально, где развернут сервер ClickHouse. При этом для хранения данных требуются диски большого объема, которые могут быть довольно дорогостоящими. Решением проблемы может стать хранение данных отдельно от сервера — в распределенных файловых системах — [Amazon S3](https://aws.amazon.com/s3/) или Hadoop ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)).
Для работы с данными, хранящимися в файловой системе `Amazon s3`, используйте движок [s3](../engines/table-engines/integrations/s3.md), а для работы с данными в файловой системе Hadoop — движок [HDFS](../engines/table-engines/integrations/hdfs.md).
Для работы с данными, хранящимися в файловой системе `Amazon S3`, используйте движок [S3](../engines/table-engines/integrations/s3.md), а для работы с данными в файловой системе Hadoop — движок [HDFS](../engines/table-engines/integrations/hdfs.md).
## Репликация без копирования данных {#zero-copy}
Для дисков `s3` и `HDFS` в ClickHouse поддерживается репликация без копирования данных (zero-copy): если данные хранятся на нескольких репликах, то при синхронизации пересылаются только метаданные (пути к кускам данных), а сами данные не копируются.
Для дисков `S3` и `HDFS` в ClickHouse поддерживается репликация без копирования данных (zero-copy): если данные хранятся на нескольких репликах, то при синхронизации пересылаются только метаданные (пути к кускам данных), а сами данные не копируются.
## Использование сервиса HDFS для хранения данных {#table_engine-mergetree-hdfs}
Таблицы семейств [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) и [Log](../engines/table-engines/log-family/log.md) могут хранить данные в сервисе HDFS при использовании диска типа `HDFS`.
Пример конфигурации:
``` xml
<yandex>
<storage_configuration>
<disks>
<hdfs>
<type>hdfs</type>
<endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
</hdfs>
</disks>
<policies>
<hdfs>
<volumes>
<main>
<disk>hdfs</disk>
</main>
</volumes>
</hdfs>
</policies>
</storage_configuration>
<merge_tree>
<min_bytes_for_wide_part>0</min_bytes_for_wide_part>
</merge_tree>
</yandex>
```
Обязательные параметры:
- `endpoint` — URL точки приема запроса на стороне HDFS в формате `path`. URL точки должен содержать путь к корневой директории на сервере, где хранятся данные.
Необязательные параметры:
- `min_bytes_for_seek` — минимальное количество байтов, которые используются для операций поиска вместо последовательного чтения. Значение по умолчанию: `1 МБайт`.
## Использование виртуальной файловой системы для шифрования данных {#encrypted-virtual-file-system}
Вы можете зашифровать данные, сохраненные на внешних дисках [S3](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3) или [HDFS](#table_engine-mergetree-hdfs) или на локальном диске. Чтобы включить режим шифрования, в конфигурационном файле вы должны указать диск с типом `encrypted` и тип диска, на котором будут сохранены данные. Диск типа `encrypted` шифрует данные "на лету", то есть при чтении файлов с этого диска расшифровка происходит автоматически. Таким образом, вы можете работать с диском типа `encrypted` как с обычным.
Пример конфигурации:
``` xml
<disks>
<disk1>
<type>local</type>
<path>/path1/</path>
</disk1>
<disk2>
<type>encrypted</type>
<disk>disk1</disk>
<path>path2/</path>
<key>_16_ascii_chars_</key>
</disk2>
</disks>
```
Например, когда ClickHouse записывает данные из какой-либо таблицы в файл `store/all_1_1_0/data.bin` на `disk1`, то на самом деле этот файл будет записан на физический диск по пути `/path1/store/all_1_1_0/data.bin`.
При записи того же файла на диск `disk2` он будет записан на физический диск в зашифрованном виде по пути `/path1/path2/store/all_1_1_0/data.bin`.
Обязательные параметры:
- `type` — `encrypted`. Иначе зашифрованный диск создан не будет.
- `disk` — тип диска для хранения данных.
- `key` — ключ для шифрования и расшифровки. Тип: [UInt64](../sql-reference/data-types/int-uint.md). Вы можете использовать параметр `key_hex` для шифрования в шестнадцатеричной форме.
Вы можете указать несколько ключей, используя атрибут `id` (смотрите пример ниже).
Необязательные параметры:
- `path` — путь к месту на диске, где будут сохранены данные. Если не указан, данные будут сохранены в корневом каталоге.
- `current_key_id` — ключ, используемый для шифрования. Все указанные ключи могут быть использованы для расшифровки, и вы всегда можете переключиться на другой ключ, сохраняя доступ к ранее зашифрованным данным.
- `algorithm` — [алгоритм](../sql-reference/statements/create/table.md#create-query-encryption-codecs) шифрования данных. Возможные значения: `AES_128_CTR`, `AES_192_CTR` или `AES_256_CTR`. Значение по умолчанию: `AES_128_CTR`. Длина ключа зависит от алгоритма: `AES_128_CTR` — 16 байт, `AES_192_CTR` — 24 байта, `AES_256_CTR` — 32 байта.
Пример конфигурации:
``` xml
<yandex>
<storage_configuration>
<disks>
<disk_s3>
<type>s3</type>
<endpoint>...
</disk_s3>
<disk_s3_encrypted>
<type>encrypted</type>
<disk>disk_s3</disk>
<algorithm>AES_128_CTR</algorithm>
<key_hex id="0">00112233445566778899aabbccddeeff</key_hex>
<key_hex id="1">ffeeddccbbaa99887766554433221100</key_hex>
<current_key_id>1</current_key_id>
</disk_s3_encrypted>
</disks>
</storage_configuration>
</yandex>
```

View File

@ -172,7 +172,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM
## sequenceCount(pattern)(time, cond1, cond2, …) {#function-sequencecount}
Вычисляет количество цепочек событий, соответствующих шаблону. Функция обнаруживает только непересекающиеся цепочки событий. Она начитает искать следующую цепочку только после того, как полностью совпала текущая цепочка событий.
Вычисляет количество цепочек событий, соответствующих шаблону. Функция обнаруживает только непересекающиеся цепочки событий. Она начинает искать следующую цепочку только после того, как полностью совпала текущая цепочка событий.
!!! warning "Предупреждение"
События, произошедшие в одну и ту же секунду, располагаются в последовательности в неопределенном порядке, что может повлиять на результат работы функции.

View File

@ -906,7 +906,7 @@ SELECT arrayEnumerateDense([10, 20, 10, 30])
## arrayIntersect(arr) {#array-functions-arrayintersect}
Принимает несколько массивов, возвращает массив с элементами, присутствующими во всех исходных массивах. Элементы на выходе следуют в порядке следования в первом массиве.
Принимает несколько массивов, возвращает массив с элементами, присутствующими во всех исходных массивах.
Пример:

View File

@ -385,4 +385,32 @@ ExpressionTransform
NumbersMt × 2 0 → 1
```
### EXPLAIN ESTIMATE {#explain-estimate}
Отображает оценки числа строк, засечек и кусков, которые будут прочитаны при выполнении запроса. Применяется для таблиц семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree).
**Пример**
Создадим таблицу:
```sql
CREATE TABLE ttt (i Int64) ENGINE = MergeTree() ORDER BY i SETTINGS index_granularity = 16, write_final_mark = 0;
INSERT INTO ttt SELECT number FROM numbers(128);
OPTIMIZE TABLE ttt;
```
Запрос:
```sql
EXPLAIN ESTIMATE SELECT * FROM ttt;
```
Результат:
```text
┌─database─┬─table─┬─parts─┬─rows─┬─marks─┐
│ default │ ttt │ 1 │ 128 │ 8 │
└──────────┴───────┴───────┴──────┴───────┘
```
[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/explain/) <!--hide-->

View File

@ -3,6 +3,7 @@ set (CLICKHOUSE_CLIENT_SOURCES
ConnectionParameters.cpp
QueryFuzzer.cpp
Suggest.cpp
TestHint.cpp
)
set (CLICKHOUSE_CLIENT_LINK

View File

@ -0,0 +1,105 @@
#include "TestHint.h"
#include <Common/Exception.h>
#include <Common/ErrorCodes.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <Parsers/Lexer.h>
namespace
{
/// Reads an error code from `in`. The code may be written either as a number
/// or as the name of an error code constant; a name is resolved through
/// DB::ErrorCodes::getErrorCodeByName.
int parseErrorCode(DB::ReadBufferFromString & in)
{
    auto * start = in.position();

    int numeric_code = -1;
    tryReadText(numeric_code, in);

    /// The read cursor moved, so something was consumed as a number.
    if (in.position() != start)
        return numeric_code;

    /// Nothing was consumed: interpret the next token as an error code name.
    String name;
    readStringUntilWhitespace(name, in);
    return DB::ErrorCodes::getErrorCodeByName(name);
}
}
namespace DB
{
/// Scans `query_` with the Lexer and feeds the contents of every "{ ... }"
/// found inside a comment token to parse(). Does nothing when `enabled_` is false.
TestHint::TestHint(bool enabled_, const String & query_)
    : query(query_)
{
    if (!enabled_)
        return;

    /// Error hints are not parsed from leading comments, because it feels weird.
    /// A leading 'echo' hint is OK.
    bool is_leading_hint = true;

    Lexer lexer(query.data(), query.data() + query.size());
    for (Token token = lexer.nextToken(); !token.isEnd(); token = lexer.nextToken())
    {
        /// Whitespace neither carries hints nor ends the leading part of the query.
        if (token.type == TokenType::Whitespace)
            continue;

        /// Any other non-comment token means later comments are no longer leading.
        if (token.type != TokenType::Comment)
        {
            is_leading_hint = false;
            continue;
        }

        const String comment(token.begin, token.begin + token.size());

        const size_t brace_open = comment.find('{');
        if (brace_open == String::npos)
            continue;

        const size_t brace_close = comment.find('}', brace_open);
        if (brace_close == String::npos)
            continue;

        /// Only the text strictly between the braces is a hint.
        parse(comment.substr(brace_open + 1, brace_close - brace_open - 1), is_leading_hint);
    }
}
/// Parses the text between the braces of a hint comment.
/// Recognized items: "echo" / "echoOn" / "echoOff", and, unless the hint is a
/// leading one, "serverError <code>" / "clientError <code>".
/// An item is processed only if the buffer is not at EOF right after reading it.
void TestHint::parse(const String & hint, bool is_leading_hint)
{
    ReadBufferFromString in(hint);

    for (String item; !in.eof();)
    {
        readStringUntilWhitespace(item, in);
        if (in.eof())
            break;

        skipWhitespaceIfAny(in);

        if (item == "echo" || item == "echoOn")
        {
            echo.emplace(true);
        }
        else if (item == "echoOff")
        {
            echo.emplace(false);
        }
        else if (!is_leading_hint)
        {
            /// The expected error code follows the keyword.
            if (item == "serverError")
                server_error = parseErrorCode(in);
            else if (item == "clientError")
                client_error = parseErrorCode(in);
        }
    }
}
}

View File

@ -1,11 +1,7 @@
#pragma once
#include <memory>
#include <sstream>
#include <iostream>
#include <optional>
#include <Core/Types.h>
#include <Common/Exception.h>
#include <Parsers/Lexer.h>
namespace DB
@ -19,6 +15,10 @@ namespace DB
///
/// - "-- { clientError 20 }" -- in case you are expecting a client error.
///
/// - "-- { serverError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO }" -- by error name.
///
/// - "-- { clientError FUNCTION_THROW_IF_VALUE_IS_NON_ZERO }" -- by error name.
///
/// Remember that the client parses the query first (not the server), so for
/// example if you are expecting syntax error, then you should use
/// clientError not serverError.
@ -43,45 +43,7 @@ namespace DB
class TestHint
{
public:
TestHint(bool enabled_, const String & query_) :
query(query_)
{
if (!enabled_)
return;
// Don't parse error hints in leading comments, because it feels weird.
// Leading 'echo' hint is OK.
bool is_leading_hint = true;
Lexer lexer(query.data(), query.data() + query.size());
for (Token token = lexer.nextToken(); !token.isEnd(); token = lexer.nextToken())
{
if (token.type != TokenType::Comment
&& token.type != TokenType::Whitespace)
{
is_leading_hint = false;
}
else if (token.type == TokenType::Comment)
{
String comment(token.begin, token.begin + token.size());
if (!comment.empty())
{
size_t pos_start = comment.find('{', 0);
if (pos_start != String::npos)
{
size_t pos_end = comment.find('}', pos_start);
if (pos_end != String::npos)
{
String hint(comment.begin() + pos_start + 1, comment.begin() + pos_end);
parse(hint, is_leading_hint);
}
}
}
}
}
}
TestHint(bool enabled_, const String & query_);
int serverError() const { return server_error; }
int clientError() const { return client_error; }
@ -93,34 +55,7 @@ private:
int client_error = 0;
std::optional<bool> echo;
void parse(const String & hint, bool is_leading_hint)
{
std::stringstream ss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
ss << hint;
String item;
while (!ss.eof())
{
ss >> item;
if (ss.eof())
break;
if (!is_leading_hint)
{
if (item == "serverError")
ss >> server_error;
else if (item == "clientError")
ss >> client_error;
}
if (item == "echo")
echo.emplace(true);
if (item == "echoOn")
echo.emplace(true);
if (item == "echoOff")
echo.emplace(false);
}
}
void parse(const String & hint, bool is_leading_hint);
bool allErrorsExpected(int actual_server_error, int actual_client_error) const
{

View File

@ -1,5 +1,7 @@
<?xml version="1.0"?>
<yandex>
<!-- See also the files in users.d directory where the settings can be overridden. -->
<!-- Profiles of settings. -->
<profiles>
<!-- Default settings. -->
@ -28,7 +30,9 @@
<users>
<!-- If user name was not specified, 'default' user is used. -->
<default>
<!-- Password could be specified in plaintext or in SHA256 (in hex format).
<!-- See also the files in users.d directory where the password can be overridden.
Password could be specified in plaintext or in SHA256 (in hex format).
If you want to specify password in plaintext (not recommended), place it in 'password' element.
Example: <password>qwerty</password>.
@ -49,7 +53,7 @@
place 'kerberos' element instead of 'password' (and similar) elements.
The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
whose initiator's realm matches it.
whose initiator's realm matches it.
Example: <kerberos />
Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>

View File

@ -122,6 +122,9 @@ struct AccessRightsElement
class AccessRightsElements : public std::vector<AccessRightsElement>
{
public:
using Base = std::vector<AccessRightsElement>;
using Base::Base;
bool empty() const { return std::all_of(begin(), end(), [](const AccessRightsElement & e) { return e.empty(); }); }
bool sameDatabaseAndTable() const

View File

@ -1,5 +1,6 @@
#include <Columns/ColumnAggregateFunction.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/MaskOperations.h>
#include <Common/assert_cast.h>
#include <DataStreams/ColumnGathererStream.h>
#include <IO/WriteBufferFromArena.h>
@ -308,6 +309,10 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_
return res;
}
/// Expands this column in place according to `mask` (optionally inverted) by
/// delegating to expandDataByMask on the underlying `data` container.
/// NOTE(review): what value the inserted rows get is decided by expandDataByMask,
/// which is not visible here - confirm it is safe for aggregate state pointers.
void ColumnAggregateFunction::expand(const Filter & mask, bool inverted)
{
expandDataByMask<char *>(data, mask, inverted);
}
ColumnPtr ColumnAggregateFunction::permute(const Permutation & perm, size_t limit) const
{

View File

@ -177,6 +177,8 @@ public:
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;

View File

@ -8,6 +8,7 @@
#include <Columns/ColumnConst.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/MaskOperations.h>
#include <common/unaligned.h>
#include <common/sort.h>
@ -551,6 +552,34 @@ ColumnPtr ColumnArray::filter(const Filter & filt, ssize_t result_size_hint) con
return filterGeneric(filt, result_size_hint);
}
/// Expands the column in place so that it has one row per entry of `mask`.
/// Rows are assigned from the back: a position selected by the mask (non-zero
/// byte, or zero byte when `inverted` is true) takes the next existing offset,
/// any other position repeats the offset of the preceding row.
/// Only the offsets are rewritten; the nested data is left untouched.
/// Throws LOGICAL_ERROR if the mask is shorter than the column, or if the number
/// of selected positions does not match the current number of rows.
void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
{
    auto & offsets_data = getOffsets();
    if (mask.size() < offsets_data.size())
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);

    /// Both cursors count down to -1, so they must be signed; ssize_t (unlike int)
    /// does not truncate large sizes.
    ssize_t index = mask.size() - 1;
    ssize_t from = offsets_data.size() - 1;
    offsets_data.resize(mask.size());

    /// NOTE(review): for an empty column this reads offsets_data[-1]; presumably the
    /// padded array guarantees that element is readable - confirm.
    UInt64 last_offset = offsets_data[from];
    while (index >= 0)
    {
        offsets_data[index] = last_offset;

        /// Normalize the mask byte to a bool before applying `inverted`: a raw XOR
        /// treats any non-zero byte other than 1 as selected even when inverted.
        const bool selected = (mask[index] != 0) != inverted;
        if (selected)
        {
            if (from < 0)
                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);

            --from;
            last_offset = offsets_data[from];
        }

        --index;
    }

    if (from != -1)
        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}
template <typename T>
ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
{

View File

@ -71,6 +71,7 @@ public:
void insertDefault() override;
void popBack(size_t n) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
template <typename Type> ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;

View File

@ -90,6 +90,7 @@ public:
void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); }
void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); }
ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeDecompressed(); }
void expand(const Filter &, bool) override { throwMustBeDecompressed(); }
ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeDecompressed(); }
ColumnPtr index(const IColumn &, size_t) const override { throwMustBeDecompressed(); }
int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeDecompressed(); }

View File

@ -59,9 +59,28 @@ ColumnPtr ColumnConst::filter(const Filter & filt, ssize_t /*result_size_hint*/)
throw Exception("Size of filter (" + toString(filt.size()) + ") doesn't match size of column (" + toString(s) + ")",
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
return ColumnConst::create(data, countBytesInFilter(filt));
size_t new_size = countBytesInFilter(filt);
return ColumnConst::create(data, new_size);
}
/// Expands the constant column in place: only the stored row count changes.
/// The number of selected mask entries (ones, or zeros when `inverted` is set)
/// must be exactly the current size; otherwise a LOGICAL_ERROR is thrown.
void ColumnConst::expand(const Filter & mask, bool inverted)
{
    const size_t expanded_size = mask.size();
    if (expanded_size < s)
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);

    /// Count the mask entries that correspond to existing rows.
    size_t selected_rows = countBytesInFilter(mask);
    if (inverted)
        selected_rows = expanded_size - selected_rows;

    if (selected_rows < s)
        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
    if (selected_rows > s)
        throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);

    s = expanded_size;
}
ColumnPtr ColumnConst::replicate(const Offsets & offsets) const
{
if (s != offsets.size())

View File

@ -181,6 +181,8 @@ public:
}
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr replicate(const Offsets & offsets) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;

View File

@ -15,6 +15,7 @@
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/MaskOperations.h>
#include <DataStreams/ColumnGathererStream.h>
@ -320,6 +321,12 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter & filt, ssize_t result_
return res;
}
/// Expands the column in place by mask: delegates to expandDataByMask
/// on the underlying data array, forwarding the mask and the `inverted` flag.
template <typename T>
void ColumnDecimal<T>::expand(const IColumn::Filter & mask, bool inverted)
{
expandDataByMask<T>(data, mask, inverted);
}
template <typename T>
ColumnPtr ColumnDecimal<T>::index(const IColumn & indexes, size_t limit) const
{

View File

@ -151,6 +151,8 @@ public:
bool isDefaultAt(size_t n) const override { return data[n].value == 0; }
ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;
void expand(const IColumn::Filter & mask, bool inverted) override;
ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;

View File

@ -344,6 +344,32 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
return res;
}
/// Expands the column in place so that it has mask.size() rows.
/// Existing rows are moved, back to front, to the positions where the mask
/// (or its inversion when `inverted` is set) is non-zero. Rows at the other
/// positions are not reset: bytes appended by the resize are zero, but older
/// bytes may still hold values of rows that were moved away.
/// Throws LOGICAL_ERROR if the mask is shorter than the column or the number of
/// selected positions differs from the current number of rows.
void ColumnFixedString::expand(const IColumn::Filter & mask, bool inverted)
{
    if (mask.size() < size())
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);

    /// Both counters run down to -1, so they must be signed; ssize_t (unlike int)
    /// does not truncate for columns with more than INT_MAX rows.
    ssize_t index = mask.size() - 1;
    ssize_t from = size() - 1;

    /// `from` is taken before the resize: size() is queried on the old data.
    chars.resize_fill(mask.size() * n, 0);

    while (index >= 0)
    {
        if (mask[index] ^ inverted)
        {
            if (from < 0)
                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);

            /// Rows only move to the right (index >= from), so for index > from the two
            /// n-byte ranges are disjoint. When the row is already in place the copy is
            /// skipped: memcpy must not be called with overlapping ranges.
            if (index != from)
                memcpy(&chars[index * n], &chars[from * n], n);
            --from;
        }

        --index;
    }

    if (from != -1)
        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}
ColumnPtr ColumnFixedString::permute(const Permutation & perm, size_t limit) const
{
size_t col_size = size();

View File

@ -147,6 +147,8 @@ public:
ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;
void expand(const IColumn::Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;

View File

@ -2,9 +2,15 @@
#include <Columns/ColumnFunction.h>
#include <Columns/ColumnsCommon.h>
#include <Common/PODArray.h>
#include <Common/ProfileEvents.h>
#include <IO/WriteHelpers.h>
#include <Functions/IFunction.h>
namespace ProfileEvents
{
extern const Event FunctionExecute;
extern const Event CompiledFunctionExecute;
}
namespace DB
{
@ -15,8 +21,8 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
ColumnFunction::ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture)
: size_(size), function(function_)
ColumnFunction::ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_, bool is_function_compiled_)
: size_(size), function(function_), is_short_circuit_argument(is_short_circuit_argument_), is_function_compiled(is_function_compiled_)
{
appendArguments(columns_to_capture);
}
@ -27,7 +33,7 @@ MutableColumnPtr ColumnFunction::cloneResized(size_t size) const
for (auto & column : capture)
column.column = column.column->cloneResized(size);
return ColumnFunction::create(size, function, capture);
return ColumnFunction::create(size, function, capture, is_short_circuit_argument, is_function_compiled);
}
ColumnPtr ColumnFunction::replicate(const Offsets & offsets) const
@ -41,7 +47,7 @@ ColumnPtr ColumnFunction::replicate(const Offsets & offsets) const
column.column = column.column->replicate(offsets);
size_t replicated_size = 0 == size_ ? 0 : offsets.back();
return ColumnFunction::create(replicated_size, function, capture);
return ColumnFunction::create(replicated_size, function, capture, is_short_circuit_argument, is_function_compiled);
}
ColumnPtr ColumnFunction::cut(size_t start, size_t length) const
@ -50,7 +56,7 @@ ColumnPtr ColumnFunction::cut(size_t start, size_t length) const
for (auto & column : capture)
column.column = column.column->cut(start, length);
return ColumnFunction::create(length, function, capture);
return ColumnFunction::create(length, function, capture, is_short_circuit_argument, is_function_compiled);
}
ColumnPtr ColumnFunction::filter(const Filter & filt, ssize_t result_size_hint) const
@ -65,11 +71,24 @@ ColumnPtr ColumnFunction::filter(const Filter & filt, ssize_t result_size_hint)
size_t filtered_size = 0;
if (capture.empty())
{
filtered_size = countBytesInFilter(filt);
}
else
filtered_size = capture.front().column->size();
return ColumnFunction::create(filtered_size, function, capture);
return ColumnFunction::create(filtered_size, function, capture, is_short_circuit_argument, is_function_compiled);
}
/// Expands every captured column by the mask and records mask.size() as the new size.
void ColumnFunction::expand(const Filter & mask, bool inverted)
{
    for (auto & captured : captured_columns)
    {
        auto & captured_column = captured.column;

        /// Replace the captured column with a same-sized clone first, then expand the clone in place.
        captured_column = captured_column->cloneResized(captured_column->size());
        captured_column->assumeMutable()->expand(mask, inverted);
    }

    size_ = mask.size();
}
ColumnPtr ColumnFunction::permute(const Permutation & perm, size_t limit) const
@ -87,7 +106,7 @@ ColumnPtr ColumnFunction::permute(const Permutation & perm, size_t limit) const
for (auto & column : capture)
column.column = column.column->permute(perm, limit);
return ColumnFunction::create(limit, function, capture);
return ColumnFunction::create(limit, function, capture, is_short_circuit_argument, is_function_compiled);
}
ColumnPtr ColumnFunction::index(const IColumn & indexes, size_t limit) const
@ -96,7 +115,7 @@ ColumnPtr ColumnFunction::index(const IColumn & indexes, size_t limit) const
for (auto & column : capture)
column.column = column.column->index(indexes, limit);
return ColumnFunction::create(limit, function, capture);
return ColumnFunction::create(limit, function, capture, is_short_circuit_argument, is_function_compiled);
}
std::vector<MutableColumnPtr> ColumnFunction::scatter(IColumn::ColumnIndex num_columns,
@ -125,7 +144,7 @@ std::vector<MutableColumnPtr> ColumnFunction::scatter(IColumn::ColumnIndex num_c
{
auto & capture = captures[part];
size_t capture_size = capture.empty() ? counts[part] : capture.front().column->size();
columns.emplace_back(ColumnFunction::create(capture_size, function, std::move(capture)));
columns.emplace_back(ColumnFunction::create(capture_size, function, std::move(capture), is_short_circuit_argument));
}
return columns;
@ -179,7 +198,7 @@ void ColumnFunction::appendArgument(const ColumnWithTypeAndName & column)
const auto & argumnet_types = function->getArgumentTypes();
auto index = captured_columns.size();
if (!column.type->equals(*argumnet_types[index]))
if (!is_short_circuit_argument && !column.type->equals(*argumnet_types[index]))
throw Exception("Cannot capture column " + std::to_string(argumnet_types.size()) +
" because it has incompatible type: got " + column.type->getName() +
", but " + argumnet_types[index]->getName() + " is expected.", ErrorCodes::LOGICAL_ERROR);
@ -187,6 +206,11 @@ void ColumnFunction::appendArgument(const ColumnWithTypeAndName & column)
captured_columns.push_back(column);
}
/// Returns the result type reported by the wrapped function.
DataTypePtr ColumnFunction::getResultType() const
{
return function->getResultType();
}
ColumnWithTypeAndName ColumnFunction::reduce() const
{
auto args = function->getArgumentTypes().size();
@ -196,11 +220,33 @@ ColumnWithTypeAndName ColumnFunction::reduce() const
throw Exception("Cannot call function " + function->getName() + " because is has " + toString(args) +
"arguments but " + toString(captured) + " columns were captured.", ErrorCodes::LOGICAL_ERROR);
auto columns = captured_columns;
ColumnsWithTypeAndName columns = captured_columns;
if (is_short_circuit_argument)
{
/// Arguments of lazy executed function can also be lazy executed.
for (auto & col : columns)
{
if (const ColumnFunction * arg = checkAndGetShortCircuitArgument(col.column))
col = arg->reduce();
}
}
ColumnWithTypeAndName res{nullptr, function->getResultType(), ""};
ProfileEvents::increment(ProfileEvents::FunctionExecute);
if (is_function_compiled)
ProfileEvents::increment(ProfileEvents::CompiledFunctionExecute);
res.column = function->execute(columns, res.type, size_);
return res;
}
/// Returns the column as a ColumnFunction if it is one and is marked as a
/// short-circuit (lazily executed) argument; returns nullptr otherwise.
const ColumnFunction * checkAndGetShortCircuitArgument(const ColumnPtr & column)
{
    const auto * as_function = typeid_cast<const ColumnFunction *>(column.get());
    if (as_function && as_function->isShortCircuitArgument())
        return as_function;

    return nullptr;
}
}

View File

@ -5,9 +5,6 @@
#include <Core/ColumnsWithTypeAndName.h>
#include <Columns/IColumn.h>
class IFunctionBase;
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
namespace DB
{
@ -16,6 +13,8 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
class IFunctionBase;
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
/** A column containing a lambda expression.
* Behaves like a constant-column. Contains an expression, but not input or output data.
@ -25,7 +24,7 @@ class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
private:
friend class COWHelper<IColumn, ColumnFunction>;
ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture);
ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_ = false, bool is_function_compiled_ = false);
public:
const char * getFamilyName() const override { return "Function"; }
@ -38,6 +37,7 @@ public:
ColumnPtr cut(size_t start, size_t length) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
@ -153,12 +153,29 @@ public:
throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
bool isShortCircuitArgument() const { return is_short_circuit_argument; }
DataTypePtr getResultType() const;
private:
size_t size_;
FunctionBasePtr function;
ColumnsWithTypeAndName captured_columns;
/// Determine if it's used as a lazy executed argument for short-circuit function.
/// It's needed to distinguish between lazy executed argument and
/// argument with ColumnFunction column (some functions can return it)
/// See ExpressionActions.cpp for details.
bool is_short_circuit_argument;
/// Determine if passed function is compiled. Used for profiling.
bool is_function_compiled;
void appendArgument(const ColumnWithTypeAndName & column);
void addOffsetsForReplication(const IColumn::Offsets & offsets);
};
const ColumnFunction * checkAndGetShortCircuitArgument(const ColumnPtr & column);
}

View File

@ -110,6 +110,11 @@ public:
return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().filter(filt, result_size_hint));
}
/// Expands the column in place by expanding only the positions column of the index;
/// the dictionary is not touched here.
void expand(const Filter & mask, bool inverted) override
{
idx.getPositionsPtr()->expand(mask, inverted);
}
ColumnPtr permute(const Permutation & perm, size_t limit) const override
{
return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().permute(perm, limit));

View File

@ -149,6 +149,11 @@ ColumnPtr ColumnMap::filter(const Filter & filt, ssize_t result_size_hint) const
return ColumnMap::create(filtered);
}
/// Expands the map column in place by forwarding the call to the nested column.
void ColumnMap::expand(const IColumn::Filter & mask, bool inverted)
{
nested->expand(mask, inverted);
}
ColumnPtr ColumnMap::permute(const Permutation & perm, size_t limit) const
{
auto permuted = nested->permute(perm, limit);

View File

@ -64,6 +64,7 @@ public:
void updateHashFast(SipHash & hash) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;

View File

@ -221,6 +221,12 @@ ColumnPtr ColumnNullable::filter(const Filter & filt, ssize_t result_size_hint)
return ColumnNullable::create(filtered_data, filtered_null_map);
}
/// Expands the column in place: the nested column and the null map are both
/// expanded with the same mask and `inverted` flag, nested column first.
void ColumnNullable::expand(const IColumn::Filter & mask, bool inverted)
{
nested_column->expand(mask, inverted);
null_map->expand(mask, inverted);
}
ColumnPtr ColumnNullable::permute(const Permutation & perm, size_t limit) const
{
ColumnPtr permuted_data = getNestedColumn().permute(perm, limit);

View File

@ -88,6 +88,7 @@ public:
void popBack(size_t n) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;

View File

@ -3,6 +3,7 @@
#include <Columns/Collator.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/MaskOperations.h>
#include <DataStreams/ColumnGathererStream.h>
#include <Common/Arena.h>
#include <Common/HashTable/Hash.h>
@ -157,6 +158,53 @@ ColumnPtr ColumnString::filter(const Filter & filt, ssize_t result_size_hint) co
return res;
}
/// Expands the column in place so that it has mask.size() rows.
/// Existing strings are moved, back to front, to the positions where the mask
/// (or its inversion when `inverted` is set) is non-zero; every other position
/// becomes a string consisting of a single zero byte.
/// Throws LOGICAL_ERROR if the mask is shorter than the column or the number of
/// selected positions differs from the current number of rows.
void ColumnString::expand(const IColumn::Filter & mask, bool inverted)
{
    auto & offsets_data = getOffsets();
    auto & chars_data = getChars();
    if (mask.size() < offsets_data.size())
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);

    /// We cannot change only offsets, because each string should end with terminating zero byte.
    /// So, we will insert one zero byte when mask value is zero.

    /// Both counters run down to -1, so they must be signed; ssize_t (unlike int)
    /// does not truncate for columns with more than INT_MAX rows.
    ssize_t index = mask.size() - 1;
    ssize_t from = offsets_data.size() - 1;

    /// mask.size() - offsets_data.size() should be equal to the number of zeros in mask
    /// (if not, one of the exceptions below will be thrown), so the resulting chars size is known in advance.
    /// NOTE(review): offsets_data[from] with from == -1 (empty column) and offsets_data[from - 1] with from == 0
    /// presumably rely on the offsets array returning 0 at index -1 - confirm against PaddedPODArray.
    UInt64 last_offset = offsets_data[from] + (mask.size() - offsets_data.size());
    offsets_data.resize(mask.size());
    chars_data.resize_fill(last_offset, 0);

    while (index >= 0)
    {
        offsets_data[index] = last_offset;
        if (mask[index] ^ inverted)
        {
            if (from < 0)
                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);

            size_t len = offsets_data[from] - offsets_data[from - 1];

            /// Copy only if the string is not already in place. It is important to copy backward,
            /// because the ranges can overlap, and the destination is never to the left of the source.
            if (last_offset - len != offsets_data[from - 1])
                std::copy_backward(&chars_data[offsets_data[from - 1]], &chars_data[offsets_data[from]], &chars_data[last_offset]);
            last_offset -= len;
            --from;
        }
        else
        {
            /// Position without a source row: a string holding only the terminating zero byte.
            chars_data[last_offset - 1] = 0;
            --last_offset;
        }

        --index;
    }

    if (from != -1)
        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}
ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const
{

View File

@ -212,6 +212,8 @@ public:
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;

View File

@ -232,6 +232,12 @@ ColumnPtr ColumnTuple::filter(const Filter & filt, ssize_t result_size_hint) con
return ColumnTuple::create(new_columns);
}
/// Expands the tuple in place by expanding each element column with the same mask and `inverted` flag.
void ColumnTuple::expand(const Filter & mask, bool inverted)
{
for (auto & column : columns)
column->expand(mask, inverted);
}
ColumnPtr ColumnTuple::permute(const Permutation & perm, size_t limit) const
{
const size_t tuple_size = columns.size();

View File

@ -67,6 +67,7 @@ public:
void updateHashFast(SipHash & hash) const override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;

View File

@ -3,6 +3,7 @@
#include <pdqsort.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnCompressed.h>
#include <Columns/MaskOperations.h>
#include <DataStreams/ColumnGathererStream.h>
#include <IO/WriteHelpers.h>
#include <Common/Arena.h>
@ -408,6 +409,12 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
return res;
}
/// Expands the column in place by mask: delegates to expandDataByMask
/// on the underlying data array, forwarding the mask and the `inverted` flag.
template <typename T>
void ColumnVector<T>::expand(const IColumn::Filter & mask, bool inverted)
{
expandDataByMask<T>(data, mask, inverted);
}
template <typename T>
void ColumnVector<T>::applyZeroMap(const IColumn::Filter & filt, bool inverted)
{

View File

@ -239,6 +239,7 @@ public:
return data[n];
}
void get(size_t n, Field & res) const override
{
res = (*this)[n];
@ -284,6 +285,8 @@ public:
ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;
void expand(const IColumn::Filter & mask, bool inverted) override;
ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;

View File

@ -3,7 +3,6 @@
#include <Columns/IColumn.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnArray.h>
#include <Core/Field.h>

View File

@ -230,12 +230,20 @@ public:
/** Removes elements that don't match the filter.
* Is used in WHERE and HAVING operations.
* If result_size_hint > 0, then makes advance reserve(result_size_hint) for the result column;
* if 0, then don't makes reserve(),
* otherwise (i.e. < 0), makes reserve() using size of source column.
* if 0, then don't makes reserve(),
* otherwise (i.e. < 0), makes reserve() using size of source column.
*/
using Filter = PaddedPODArray<UInt8>;
virtual Ptr filter(const Filter & filt, ssize_t result_size_hint) const = 0;
/** Expand column by mask inplace. After expanding column will
* satisfy the following: if we filter it by given mask, we will
* get initial column. Values with indexes i: mask[i] = 0
* shouldn't be used after expanding.
* If inverted is true, inverted mask will be used.
*/
virtual void expand(const Filter & /*mask*/, bool /*inverted*/) = 0;
/// Permutes elements using specified permutation. Is used in sorting.
/// limit - if it isn't 0, puts only first limit elements in the result.
using Permutation = PaddedPODArray<size_t>;

View File

@ -100,7 +100,16 @@ public:
ColumnPtr filter(const Filter & filt, ssize_t /*result_size_hint*/) const override
{
return cloneDummy(countBytesInFilter(filt));
size_t bytes = countBytesInFilter(filt);
return cloneDummy(bytes);
}
/// Expands the dummy column in place. The column holds no data, only a row count,
/// so expanding means adopting the size of the mask: after expand() the column must
/// have one row per mask entry. (Previously the number of selected mask entries was
/// stored, which left the size effectively unchanged.)
/// The `inverted` flag does not influence the resulting size and is therefore unused.
void expand(const IColumn::Filter & mask, bool /*inverted*/) override
{
    s = mask.size();
}
ColumnPtr permute(const Permutation & perm, size_t limit) const override

View File

@ -139,6 +139,11 @@ public:
throw Exception("Method filter is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
/// Not supported for ColumnUnique: always throws NOT_IMPLEMENTED.
void expand(const IColumn::Filter &, bool) override
{
throw Exception("Method expand is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
ColumnPtr permute(const IColumn::Permutation &, size_t) const override
{
throw Exception("Method permute is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);

View File

@ -0,0 +1,316 @@
#include <Columns/MaskOperations.h>
#include <Columns/ColumnFunction.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnNothing.h>
#include <Columns/ColumnsCommon.h>
#include <Columns/ColumnConst.h>
#include <algorithm>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
}
/// Expands `data` in place to mask.size() elements.
/// Existing elements are moved, back to front, to the positions where the mask
/// (or its inversion when `inverted` is set) is non-zero; every other position
/// is set to a value-initialized T.
/// Throws LOGICAL_ERROR if the mask is shorter than the data or the number of
/// selected positions differs from the original number of elements.
template <typename T>
void expandDataByMask(PaddedPODArray<T> & data, const PaddedPODArray<UInt8> & mask, bool inverted)
{
    if (mask.size() < data.size())
        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);

    /// Both counters run down to -1, so they must be signed; ssize_t (unlike int)
    /// does not truncate for arrays with more than INT_MAX elements.
    ssize_t from = data.size() - 1;
    ssize_t index = mask.size() - 1;
    data.resize(mask.size());

    while (index >= 0)
    {
        if (mask[index] ^ inverted)
        {
            if (from < 0)
                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);

            /// Copy only if the element is not already in place.
            if (index != from)
                data[index] = data[from];
            --from;
        }
        else
            data[index] = T();

        --index;
    }

    if (from != -1)
        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
}
/// Explicit instantiations - not to place the implementation of the function above in the header file.
#define INSTANTIATE(TYPE) \
template void expandDataByMask<TYPE>(PaddedPODArray<TYPE> &, const PaddedPODArray<UInt8> &, bool);
INSTANTIATE(UInt8)
INSTANTIATE(UInt16)
INSTANTIATE(UInt32)
INSTANTIATE(UInt64)
INSTANTIATE(UInt128)
INSTANTIATE(UInt256)
INSTANTIATE(Int8)
INSTANTIATE(Int16)
INSTANTIATE(Int32)
INSTANTIATE(Int64)
INSTANTIATE(Int128)
INSTANTIATE(Int256)
INSTANTIATE(Float32)
INSTANTIATE(Float64)
INSTANTIATE(Decimal32)
INSTANTIATE(Decimal64)
INSTANTIATE(Decimal128)
INSTANTIATE(Decimal256)
INSTANTIATE(DateTime64)
INSTANTIATE(char *)
INSTANTIATE(UUID)
#undef INSTANTIATE
/// Rewrites the non-zero entries of `mask` with the boolean value of the corresponding
/// element of `data`, and returns how many entries are non-zero afterwards among those visited.
/// Entries that are already zero are left untouched.
///  - column_is_short: `data` has one element per non-zero mask entry (consumed in order);
///    otherwise `data` is indexed by the mask position itself.
///  - null_bytemap (optional): where it is non-zero for the source element, `null_value`
///    is used instead of the data and, if `nulls` is given, nulls[mask position] is set to 1.
///  - inverted: the resulting value is negated before being stored.
template <bool inverted, bool column_is_short, typename Container>
size_t extractMaskNumericImpl(
    PaddedPODArray<UInt8> & mask,
    const Container & data,
    UInt8 null_value,
    const PaddedPODArray<UInt8> * null_bytemap,
    PaddedPODArray<UInt8> * nulls)
{
    size_t set_rows = 0;
    size_t next_short_row = 0;

    for (size_t row = 0; row != mask.size(); ++row)
    {
        /// Only positions that are currently set can change.
        if (mask[row])
        {
            size_t source_row = row;
            if constexpr (column_is_short)
            {
                source_row = next_short_row;
                ++next_short_row;
            }

            UInt8 bit;
            if (null_bytemap && (*null_bytemap)[source_row])
            {
                bit = null_value;
                if (nulls)
                    (*nulls)[row] = 1;
            }
            else
            {
                bit = !!data[source_row];
            }

            if constexpr (inverted)
                bit = !bit;

            if (bit)
                ++set_rows;

            mask[row] = bit;
        }
    }

    return set_rows;
}
/// Tries to treat `column` as ColumnVector<NumericType> and, on success, updates `mask`
/// from its values and fills `mask_info`. Returns false (leaving everything untouched)
/// if the column has a different type.
template <bool inverted, typename NumericType>
bool extractMaskNumeric(
    PaddedPODArray<UInt8> & mask,
    const ColumnPtr & column,
    UInt8 null_value,
    const PaddedPODArray<UInt8> * null_bytemap,
    PaddedPODArray<UInt8> * nulls,
    MaskInfo & mask_info)
{
    const auto * typed_column = checkAndGetColumn<ColumnVector<NumericType>>(column.get());
    if (typed_column == nullptr)
        return false;

    const auto & values = typed_column->getData();

    /// A column shorter than the mask is read sequentially, one value per set mask entry.
    const bool column_is_short = column->size() < mask.size();

    size_t set_rows;
    if (column_is_short)
        set_rows = extractMaskNumericImpl<inverted, true>(mask, values, null_value, null_bytemap, nulls);
    else
        set_rows = extractMaskNumericImpl<inverted, false>(mask, values, null_value, null_bytemap, nulls);

    mask_info.has_ones = set_rows > 0;
    mask_info.has_zeros = set_rows != mask.size();
    return true;
}
/// Updates `mask` from a column that has a single logical value.
/// For an only-null column the value is `null_value` (and `nulls`, if given, is filled with 1);
/// otherwise it is column->getBool(0). With `inverted` the value is negated.
/// A true value leaves the mask unchanged; a false value zeroes the whole mask.
template <bool inverted>
MaskInfo extractMaskFromConstOrNull(
    PaddedPODArray<UInt8> & mask,
    const ColumnPtr & column,
    UInt8 null_value,
    PaddedPODArray<UInt8> * nulls = nullptr)
{
    UInt8 constant;
    if (!column->onlyNull())
    {
        constant = column->getBool(0);
    }
    else
    {
        constant = null_value;
        if (nulls)
            std::fill(nulls->begin(), nulls->end(), 1);
    }

    if constexpr (inverted)
        constant = !constant;

    size_t set_rows = 0;
    if (!constant)
        std::fill(mask.begin(), mask.end(), 0);
    else
        set_rows = countBytesInFilter(mask);

    const bool any_ones = set_rows > 0;
    const bool any_zeros = set_rows != mask.size();
    return {.has_ones = any_ones, .has_zeros = any_zeros};
}
/// Dispatches mask extraction by column kind:
///  - only-null and ColumnConst columns go to extractMaskFromConstOrNull;
///  - ColumnNullable is unwrapped and its null map is passed down with the nested column;
///  - otherwise the column must be a numeric ColumnVector of one of the listed types,
///    or ILLEGAL_COLUMN is thrown.
template <bool inverted>
MaskInfo extractMaskImpl(
    PaddedPODArray<UInt8> & mask,
    const ColumnPtr & column,
    UInt8 null_value,
    const PaddedPODArray<UInt8> * null_bytemap,
    PaddedPODArray<UInt8> * nulls = nullptr)
{
    /// Special implementation for Null and Const columns.
    if (column->onlyNull() || checkAndGetColumn<ColumnConst>(*column))
        return extractMaskFromConstOrNull<inverted>(mask, column, null_value, nulls);

    if (const auto * nullable_column = checkAndGetColumn<ColumnNullable>(*column))
    {
        const PaddedPODArray<UInt8> & nested_null_map = nullable_column->getNullMapData();
        return extractMaskImpl<inverted>(mask, nullable_column->getNestedColumnPtr(), null_value, &nested_null_map, nulls);
    }

    MaskInfo mask_info;

    /// The first matching numeric type performs the extraction; the rest are not tried.
    const bool extracted =
        extractMaskNumeric<inverted, UInt8>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, UInt16>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, UInt32>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, UInt64>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, Int8>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, Int16>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, Int32>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, Int64>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, Float32>(mask, column, null_value, null_bytemap, nulls, mask_info)
        || extractMaskNumeric<inverted, Float64>(mask, column, null_value, null_bytemap, nulls, mask_info);

    if (!extracted)
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot convert column {} to mask.", column->getName());

    return mask_info;
}
/// Updates `mask` from `column` using the non-inverted column values; no null bytemap
/// is supplied by the caller and no `nulls` output is requested.
MaskInfo extractMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
UInt8 null_value)
{
return extractMaskImpl<false>(mask, column, null_value, nullptr);
}
/// Updates `mask` from `column` using inverted column values; no null bytemap
/// is supplied by the caller and no `nulls` output is requested.
MaskInfo extractInvertedMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
UInt8 null_value)
{
return extractMaskImpl<true>(mask, column, null_value, nullptr);
}
/// Same as the three-argument extractMask, but additionally passes `nulls` down
/// so that positions whose column value is Null can be marked in it.
MaskInfo extractMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
PaddedPODArray<UInt8> * nulls,
UInt8 null_value)
{
return extractMaskImpl<false>(mask, column, null_value, nullptr, nulls);
}
/// Same as the three-argument extractInvertedMask, but additionally passes `nulls` down
/// so that positions whose column value is Null can be marked in it.
MaskInfo extractInvertedMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
PaddedPODArray<UInt8> * nulls,
UInt8 null_value)
{
return extractMaskImpl<true>(mask, column, null_value, nullptr, nulls);
}
/// Logically negates every mask entry in place and swaps the has_ones / has_zeros flags accordingly.
void inverseMask(PaddedPODArray<UInt8> & mask, MaskInfo & mask_info)
{
    for (auto & entry : mask)
        entry = !entry;

    std::swap(mask_info.has_ones, mask_info.has_zeros);
}
/// If `column` holds a short-circuit (lazily executed) argument, replaces it with the
/// executed result restricted to the rows selected by `mask`. Other columns are left as is.
void maskedExecute(ColumnWithTypeAndName & column, const PaddedPODArray<UInt8> & mask, const MaskInfo & mask_info)
{
    const auto * lazy_argument = checkAndGetShortCircuitArgument(column.column);
    if (!lazy_argument)
        return;

    /// If mask contains only zeros, we can just create
    /// an empty column with the execution result type.
    if (!mask_info.has_ones)
    {
        auto type_of_result = lazy_argument->getResultType();
        auto empty_result = type_of_result->createColumn();
        column = {std::move(empty_result), type_of_result, ""};
        return;
    }

    /// Mask contains only ones: execute on all rows without filtering.
    if (!mask_info.has_zeros)
    {
        column = lazy_argument->reduce();
        return;
    }

    /// Mixed mask: filter the captured columns first, then execute on the remaining rows.
    auto filtered_argument = lazy_argument->filter(mask, -1);
    column = typeid_cast<const ColumnFunction *>(filtered_argument.get())->reduce();
}
/// If `column` holds a short-circuit (lazily executed) argument, executes it.
/// With `empty` set, the function is not executed: only column.column is replaced
/// by a new empty column of the result type.
void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty)
{
    const auto * lazy_argument = checkAndGetShortCircuitArgument(column.column);
    if (!lazy_argument)
        return;

    if (empty)
    {
        column.column = lazy_argument->getResultType()->createColumn();
        return;
    }

    column = lazy_argument->reduce();
}
int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments)
{
int last_short_circuit_argument_index = -1;
for (size_t i = 0; i != arguments.size(); ++i)
{
if (checkAndGetShortCircuitArgument(arguments[i].column))
last_short_circuit_argument_index = i;
}
return last_short_circuit_argument_index;
}
/// Copies the contents of `from` into `to`. Both masks must already have the same size,
/// otherwise LOGICAL_ERROR is thrown. Nothing is copied for empty masks.
void copyMask(const PaddedPODArray<UInt8> & from, PaddedPODArray<UInt8> & to)
{
    const size_t source_size = from.size();
    if (source_size != to.size())
        throw Exception("Cannot copy mask, because source and destination have different size", ErrorCodes::LOGICAL_ERROR);

    if (!from.empty())
        memcpy(to.data(), from.data(), source_size * sizeof(*from.data()));
}
}

View File

@ -0,0 +1,73 @@
#pragma once
#include <Core/ColumnWithTypeAndName.h>
#include <Core/ColumnsWithTypeAndName.h>
#include <Core/Field.h>
#include <Common/PODArray.h>
namespace DB
{
/// Expand data by mask. After expanding data will satisfy the following: if we filter data
/// by given mask, we get initial data. In places where mask[i] = 0 we insert default value.
/// If inverted is true, we will work with inverted mask. This function is used in implementations of
/// expand() method in IColumn interface.
template <typename T>
void expandDataByMask(PaddedPODArray<T> & data, const PaddedPODArray<UInt8> & mask, bool inverted);
/// Summary of a mask's contents, returned by the mask extraction functions.
/// Note: the members have no default initializers.
struct MaskInfo
{
/// True if at least one mask entry is non-zero.
bool has_ones;
/// True if at least one mask entry is zero.
bool has_zeros;
};
/// The next functions are used to extract UInt8 mask from a column,
/// filtered by some condition (mask). We will use value from a column
/// only when value in condition is 1. Column should satisfy the
/// condition: sum(mask) = column.size() or mask.size() = column.size().
/// You can set flag 'inverted' to use inverted values
/// from a column. You can also determine value that will be used when
/// column value is Null (argument null_value).
MaskInfo extractMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
UInt8 null_value = 0);
MaskInfo extractInvertedMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
UInt8 null_value = 0);
/// The same as extractMask, but fills
/// nulls so that nulls[i] = 1 when column[i] = Null.
MaskInfo extractMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
PaddedPODArray<UInt8> * nulls,
UInt8 null_value = 0);
MaskInfo extractInvertedMask(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
PaddedPODArray<UInt8> * nulls,
UInt8 null_value = 0);
/// Inplace inversion.
void inverseMask(PaddedPODArray<UInt8> & mask, MaskInfo & mask_info);
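/// If given column is lazy executed argument (ColumnFunction with isShortCircuitArgument() = true),
/// filter it by mask and then reduce. If mask_info says the mask has no ones, an empty column of the
/// result type is created instead; if it has no zeros, the column is reduced without filtering.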
void maskedExecute(ColumnWithTypeAndName & column, const PaddedPODArray<UInt8> & mask, const MaskInfo & mask_info);
/// If given column is lazy executed argument, reduce it. If empty is true,
/// create an empty column with the execution result type.
void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty = false);
/// Check if arguments contain a lazy executed argument. If they do, return the index of the last one,
/// otherwise return -1.
int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments);
void copyMask(const PaddedPODArray<UInt8> & from, PaddedPODArray<UInt8> & to);
}

View File

@ -35,6 +35,7 @@ SRCS(
ColumnsCommon.cpp
FilterDescription.cpp
IColumn.cpp
MaskOperations.cpp
getLeastSuperColumn.cpp
)

View File

@ -1,29 +0,0 @@
#pragma once
#include <unordered_map>
/// DenseHashMap is a wrapper for google::dense_hash_map.
/// Some hacks are needed to make it work in "Arcadia".
/// "Arcadia" is a proprietary monorepository in Yandex.
/// It uses slightly changed version of sparsehash with a different set of hash functions (which we don't need).
/// Those defines are needed to make it compile.
#if defined(ARCADIA_BUILD)
#define HASH_FUN_H <unordered_map>
template <typename T>
struct THash;
#endif
#include <sparsehash/dense_hash_map>
#if !defined(ARCADIA_BUILD)
template <class Key, class T, class HashFcn = std::hash<Key>,
class EqualKey = std::equal_to<Key>,
class Alloc = google::libc_allocator_with_realloc<std::pair<const Key, T>>>
using DenseHashMap = google::dense_hash_map<Key, T, HashFcn, EqualKey, Alloc>;
#else
template <class Key, class T, class HashFcn = std::hash<Key>,
class EqualKey = std::equal_to<Key>,
class Alloc = google::sparsehash::libc_allocator_with_realloc<std::pair<const Key, T>>>
using DenseHashMap = google::sparsehash::dense_hash_map<Key, T, HashFcn, EqualKey, Alloc>;
#undef THash
#endif

View File

@ -1,25 +0,0 @@
#pragma once
/// DenseHashSet is a wrapper for google::dense_hash_set.
/// See comment in DenseHashMap.h
#if defined(ARCADIA_BUILD)
#define HASH_FUN_H <unordered_map>
template <typename T>
struct THash;
#endif
#include <sparsehash/dense_hash_set>
#if !defined(ARCADIA_BUILD)
template <class Value, class HashFcn = std::hash<Value>,
class EqualKey = std::equal_to<Value>,
class Alloc = google::libc_allocator_with_realloc<Value>>
using DenseHashSet = google::dense_hash_set<Value, HashFcn, EqualKey, Alloc>;
#else
template <class Value, class HashFcn = std::hash<Value>,
class EqualKey = std::equal_to<Value>,
class Alloc = google::sparsehash::libc_allocator_with_realloc<Value>>
using DenseHashSet = google::sparsehash::dense_hash_set<Value, HashFcn, EqualKey, Alloc>;
#undef THash
#endif

View File

@ -1,4 +1,5 @@
#include <Common/ErrorCodes.h>
#include <Common/Exception.h>
#include <chrono>
/** Previously, these constants were located in one enum.
@ -564,6 +565,7 @@
M(594, BZIP2_STREAM_DECODER_FAILED) \
M(595, BZIP2_STREAM_ENCODER_FAILED) \
M(596, INTERSECT_OR_EXCEPT_RESULT_STRUCTURES_MISMATCH) \
M(597, NO_SUCH_ERROR_CODE) \
\
M(998, POSTGRESQL_CONNECTION_FAILURE) \
M(999, KEEPER_EXCEPTION) \
@ -602,6 +604,21 @@ namespace ErrorCodes
return error_codes_names.names[error_code];
}
/// Resolve an error name to its numeric code.
/// Scans every code in [0, ErrorCodes::end()) and compares the registered name with error_name.
/// Codes without a registered name are skipped, so an empty error_name never matches.
/// Throws Exception with NO_SUCH_ERROR_CODE when no code carries the requested name.
ErrorCode getErrorCodeByName(std::string_view error_name)
{
    const size_t codes_end = ErrorCodes::end();
    for (size_t code = 0; code < codes_end; ++code)
    {
        const std::string_view candidate = ErrorCodes::getName(code);
        if (!candidate.empty() && candidate == error_name)
            return code;
    }

    throw Exception(NO_SUCH_ERROR_CODE, "No error code with name: '{}'", error_name);
}
ErrorCode end() { return END + 1; }
void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace)

View File

@ -25,6 +25,12 @@ namespace ErrorCodes
/// Get name of error_code by identifier.
/// Returns statically allocated string.
std::string_view getName(ErrorCode error_code);
/// Get error code value by name.
///
/// It has O(N) complexity, but this is not a major concern, since it is used
/// only for test hints, and it is not worth keeping another structure for
/// this.
ErrorCode getErrorCodeByName(std::string_view error_name);
struct Error
{

View File

@ -1,7 +1,6 @@
#pragma once
/// SparseHashMap is a wrapper for google::sparse_hash_map.
/// See comment in DenseHashMap.h
#if defined(ARCADIA_BUILD)
#define HASH_FUN_H <unordered_map>
template <typename T>

View File

@ -1,4 +1,5 @@
#include <Core/NamesAndTypes.h>
#include <Common/HashTable/HashMap.h>
#include <DataTypes/DataTypeFactory.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
@ -6,7 +7,6 @@
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <Common/DenseHashMap.h>
namespace DB
@ -163,8 +163,7 @@ NamesAndTypesList NamesAndTypesList::filter(const Names & names) const
NamesAndTypesList NamesAndTypesList::addTypes(const Names & names) const
{
/// NOTE: It's better to make a map in `IStorage` than to create it here every time again.
DenseHashMap<StringRef, const DataTypePtr *, StringRefHash> types;
types.set_empty_key(StringRef());
HashMapWithSavedHash<StringRef, const DataTypePtr *, StringRefHash> types;
for (const auto & column : *this)
types[column.name] = &column.type;
@ -172,10 +171,11 @@ NamesAndTypesList NamesAndTypesList::addTypes(const Names & names) const
NamesAndTypesList res;
for (const String & name : names)
{
auto it = types.find(name);
const auto * it = types.find(name);
if (it == types.end())
throw Exception("No column " + name, ErrorCodes::THERE_IS_NO_COLUMN);
res.emplace_back(name, *it->second);
throw Exception(ErrorCodes::THERE_IS_NO_COLUMN, "No column {}", name);
res.emplace_back(name, *it->getMapped());
}
return res;

View File

@ -492,6 +492,7 @@ class IColumn;
M(UInt64, offset, 0, "Offset on read rows from the most 'end' result for select query", 0) \
\
M(UInt64, function_range_max_elements_in_block, 500000000, "Maximum number of values generated by function 'range' per block of data (sum of array sizes for every row in a block, see also 'max_block_size' and 'min_insert_block_size_rows'). It is a safety threshold.", 0) \
M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, "Setting for short-circuit function evaluation configuration. Possible values: 'enable', 'disable', 'force_enable'", 0) \
\
/** Experimental functions */ \
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \

View File

@ -111,4 +111,9 @@ IMPLEMENT_SETTING_ENUM(DistributedDDLOutputMode, ErrorCodes::BAD_ARGUMENTS,
IMPLEMENT_SETTING_ENUM(HandleKafkaErrorMode, ErrorCodes::BAD_ARGUMENTS,
{{"default", HandleKafkaErrorMode::DEFAULT},
{"stream", HandleKafkaErrorMode::STREAM}})
IMPLEMENT_SETTING_ENUM(ShortCircuitFunctionEvaluation, ErrorCodes::BAD_ARGUMENTS,
{{"enable", ShortCircuitFunctionEvaluation::ENABLE},
{"force_enable", ShortCircuitFunctionEvaluation::FORCE_ENABLE},
{"disable", ShortCircuitFunctionEvaluation::DISABLE}})
}

View File

@ -157,4 +157,14 @@ enum class HandleKafkaErrorMode
};
DECLARE_SETTING_ENUM(HandleKafkaErrorMode)
/// Modes of the `short_circuit_function_evaluation` setting.
/// The textual names accepted for the setting are 'enable', 'force_enable' and 'disable'.
enum class ShortCircuitFunctionEvaluation
{
ENABLE, // Use short-circuit function evaluation for functions that are suitable for it.
FORCE_ENABLE, // Use short-circuit function evaluation for all functions.
DISABLE, // Disable short-circuit function evaluation.
};
/// Declares the setting wrapper for this enum; the name <-> value mapping comes from the matching IMPLEMENT_SETTING_ENUM.
DECLARE_SETTING_ENUM(ShortCircuitFunctionEvaluation)
}

View File

@ -79,7 +79,7 @@ void DataTypeMap::assertKeyType() const
std::string DataTypeMap::doGetName() const
{
WriteBufferFromOwnString s;
s << "Map(" << key_type->getName() << "," << value_type->getName() << ")";
s << "Map(" << key_type->getName() << ", " << value_type->getName() << ")";
return s.str();
}

View File

@ -29,6 +29,13 @@ using DataTypes = std::vector<DataTypePtr>;
struct NameAndTypePair;
class SerializationInfo;
/// A data type paired with a constness flag.
/// NOTE(review): presumably `is_const` tells whether the value described by `type` comes from a constant column -
/// confirm against the callers. The flag has no default initializer, so it must always be set explicitly.
struct DataTypeWithConstInfo
{
DataTypePtr type;
bool is_const;
};
/// Ordered list of types with constness information.
using DataTypesWithConstInfo = std::vector<DataTypeWithConstInfo>;
/** Properties of data type.
*

View File

@ -0,0 +1,48 @@
#include "DictionaryHelpers.h"
namespace DB
{
/// Rebuild key columns from serialized keys with indexes in [start, end).
/// One empty column is created per key attribute of the dictionary structure. Every serialized key is then
/// decoded attribute by attribute: each column consumes its part of the key and returns the position
/// from which the next column continues.
MutableColumns deserializeColumnsFromKeys(
    const DictionaryStructure & dictionary_structure,
    const PaddedPODArray<StringRef> & keys,
    size_t start,
    size_t end)
{
    const auto & key_attributes = *dictionary_structure.key;

    MutableColumns key_columns;
    key_columns.reserve(key_attributes.size());
    for (const auto & key_attribute : key_attributes)
        key_columns.emplace_back(key_attribute.type->createColumn());

    for (size_t row = start; row < end; ++row)
    {
        const auto * position = keys[row].data;
        for (auto & key_column : key_columns)
            position = key_column->deserializeAndInsertFromArena(position);
    }

    return key_columns;
}
/// Same decoding as deserializeColumnsFromKeys for keys with indexes in [start, end), but every resulting
/// column is wrapped together with the type and the name of the key attribute at the same position.
ColumnsWithTypeAndName deserializeColumnsWithTypeAndNameFromKeys(
    const DictionaryStructure & dictionary_structure,
    const PaddedPODArray<StringRef> & keys,
    size_t start,
    size_t end)
{
    MutableColumns key_columns = deserializeColumnsFromKeys(dictionary_structure, keys, start, end);
    const auto & key_attributes = *dictionary_structure.key;

    ColumnsWithTypeAndName result;
    size_t attribute_position = 0;
    for (auto & key_column : key_columns)
    {
        /// Columns were created in key attribute order, so positions match one to one.
        const auto & key_attribute = key_attributes[attribute_position];
        result.emplace_back(ColumnWithTypeAndName{std::move(key_column), key_attribute.type, key_attribute.name});
        ++attribute_position;
    }

    return result;
}
}

View File

@ -497,6 +497,20 @@ private:
Arena * complex_key_arena;
};
/// Deserialize columns from keys array using dictionary structure.
/// Only keys with indexes in the half-open range [start, end) are processed.
/// Returns one column per key attribute of the dictionary structure, in attribute order.
MutableColumns deserializeColumnsFromKeys(
const DictionaryStructure & dictionary_structure,
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t end);
/// Deserialize columns with type and name from keys array using dictionary structure.
/// Only keys with indexes in the half-open range [start, end) are processed.
/// Every returned column carries the type and the name of the key attribute at the same position.
ColumnsWithTypeAndName deserializeColumnsWithTypeAndNameFromKeys(
const DictionaryStructure & dictionary_structure,
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t end);
/** Merge block with blocks from stream. If there are duplicate keys in block they are filtered out.
* In result block_to_update will be merged with blocks from stream.
* Note: readPrefix readImpl readSuffix will be called on stream object during function execution.

View File

@ -29,7 +29,7 @@ DictionarySourceData::DictionarySourceData(
, key_type(DictionaryInputStreamKeyType::ComplexKey)
{
const DictionaryStructure & dictionary_structure = dictionary->getStructure();
fillKeyColumns(keys, 0, keys.size(), dictionary_structure, key_columns);
key_columns = deserializeColumnsWithTypeAndNameFromKeys(dictionary_structure, keys, 0, keys.size());
}
DictionarySourceData::DictionarySourceData(
@ -158,32 +158,4 @@ Block DictionarySourceData::fillBlock(
return Block(block_columns);
}
void DictionarySourceData::fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & result)
{
MutableColumns columns;
columns.reserve(dictionary_structure.key->size());
for (const DictionaryAttribute & attribute : *dictionary_structure.key)
columns.emplace_back(attribute.type->createColumn());
for (size_t index = start; index < size; ++index)
{
const auto & key = keys[index];
const auto *ptr = key.data;
for (auto & column : columns)
ptr = column->deserializeAndInsertFromArena(ptr);
}
for (size_t i = 0, num_columns = columns.size(); i < num_columns; ++i)
{
const auto & dictionary_attribute = (*dictionary_structure.key)[i];
result.emplace_back(ColumnWithTypeAndName{std::move(columns[i]), dictionary_attribute.type, dictionary_attribute.name});
}
}
}

View File

@ -51,13 +51,6 @@ private:
const DataTypes & types,
ColumnsWithTypeAndName && view) const;
static void fillKeyColumns(
const PaddedPODArray<StringRef> & keys,
size_t start,
size_t size,
const DictionaryStructure & dictionary_structure,
ColumnsWithTypeAndName & result);
const size_t num_rows;
std::shared_ptr<const IDictionary> dictionary;
std::unordered_set<std::string> column_names;

View File

@ -134,42 +134,11 @@ DictionaryStructure::DictionaryStructure(const Poco::Util::AbstractConfiguration
if (id->name.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "'id' cannot be empty");
const char * range_default_type = "Date";
if (config.has(structure_prefix + ".range_min"))
range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type));
if (config.has(structure_prefix + ".range_max"))
range_max.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_max", range_default_type));
if (range_min.has_value() != range_max.has_value())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");
}
if (range_min && range_max && !range_min->type->equals(*range_max->type))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure 'range_min' and 'range_max' should have same type, "
"'range_min' type: {},"
"'range_max' type: {}",
range_min->type->getName(),
range_max->type->getName());
}
if (range_min)
{
if (!range_min->type->isValueRepresentedByInteger())
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
" Actual 'range_min' and 'range_max' type is {}",
range_min->type->getName());
}
if (!id->expression.empty() || (range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty()))
if (!id->expression.empty())
has_expressions = true;
}
parseRangeConfiguration(config, structure_prefix);
attributes = getAttributes(config, structure_prefix, /*complex_key_attributes =*/ false);
for (size_t i = 0; i < attributes.size(); ++i)
@ -439,4 +408,42 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
return res_attributes;
}
/// Parse the optional 'range_min' and 'range_max' special attributes located under structure_prefix.
///
/// Both attributes default to the "Date" type when the configuration does not name one.
/// Throws BAD_ARGUMENTS when:
///  - only one of the two bounds is specified;
///  - the bounds have different types;
///  - the bound type is not represented by an integer.
/// Sets has_expressions when any of the bounds is defined through an expression.
void DictionaryStructure::parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix)
{
    const char * range_default_type = "Date";

    if (config.has(structure_prefix + ".range_min"))
        range_min.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_min", range_default_type));

    if (config.has(structure_prefix + ".range_max"))
        range_max.emplace(makeDictionaryTypedSpecialAttribute(config, structure_prefix + ".range_max", range_default_type));

    if (range_min.has_value() != range_max.has_value())
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure should have both 'range_min' and 'range_max' either specified or not.");
    }

    if (range_min && range_max && !range_min->type->equals(*range_max->type))
    {
        /// Adjacent literals are concatenated, so each piece must carry its own separator.
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Dictionary structure 'range_min' and 'range_max' should have same type, "
            "'range_min' type: {}, "
            "'range_max' type: {}",
            range_min->type->getName(),
            range_max->type->getName());
    }

    if (range_min)
    {
        /// Types of both bounds are equal at this point, checking one of them is enough.
        if (!range_min->type->isValueRepresentedByInteger())
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "Dictionary structure type of 'range_min' and 'range_max' should be an integer, Date, DateTime, or Enum."
                " Actual 'range_min' and 'range_max' type is {}",
                range_min->type->getName());
    }

    if ((range_min && !range_min->expression.empty()) || (range_max && !range_max->expression.empty()))
        has_expressions = true;
}
}

View File

@ -67,8 +67,9 @@ using DictionaryLifetime = ExternalLoadableLifetime;
* - null_value, used as a default value for non-existent entries in the dictionary,
* decimal representation for numeric attributes;
* - hierarchical, whether this attribute defines a hierarchy;
* - injective, whether the mapping to parent is injective (can be used for optimization of GROUP BY?)
* - is_object_id, used in mongo dictionary, converts string key to objectid
* - injective, whether the mapping to parent is injective (can be used for optimization of GROUP BY?);
* - is_object_id, used in mongo dictionary, converts string key to objectid;
* - is_nullable, is attribute nullable;
*/
struct DictionaryAttribute final
{
@ -153,6 +154,10 @@ private:
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
bool complex_key_attributes);
/// parse range_min and range_max
void parseRangeConfiguration(const Poco::Util::AbstractConfiguration & config, const std::string & structure_prefix);
};
}

View File

@ -133,6 +133,29 @@ void ExternalQueryBuilder::composeLoadAllQuery(WriteBuffer & out) const
writeQuoted(key.name, out);
}
if (dict_struct.range_min && dict_struct.range_max)
{
writeString(", ", out);
if (!dict_struct.range_min->expression.empty())
{
writeParenthesisedString(dict_struct.range_min->expression, out);
writeString(" AS ", out);
}
writeQuoted(dict_struct.range_min->name, out);
writeString(", ", out);
if (!dict_struct.range_max->expression.empty())
{
writeParenthesisedString(dict_struct.range_max->expression, out);
writeString(" AS ", out);
}
writeQuoted(dict_struct.range_max->name, out);
}
}
for (const auto & attr : dict_struct.attributes)

View File

@ -64,7 +64,7 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::complex; }

View File

@ -14,170 +14,241 @@
namespace DB
{
template <typename RangeType>
enum class RangeDictionaryType
{
simple,
complex
};
template <RangeDictionaryType range_dictionary_type, typename RangeType>
class RangeDictionarySourceData
{
public:
using Key = UInt64;
using KeyType = std::conditional_t<range_dictionary_type == RangeDictionaryType::simple, UInt64, StringRef>;
RangeDictionarySourceData(
std::shared_ptr<const IDictionary> dictionary,
const Names & column_names,
PaddedPODArray<Key> && ids_to_fill,
PaddedPODArray<KeyType> && keys,
PaddedPODArray<RangeType> && start_dates,
PaddedPODArray<RangeType> && end_dates);
Block getBlock(size_t start, size_t length) const;
size_t getNumRows() const { return ids.size(); }
size_t getNumRows() const { return keys.size(); }
private:
Block fillBlock(
const PaddedPODArray<Key> & ids_to_fill,
const PaddedPODArray<KeyType> & keys_to_fill,
const PaddedPODArray<RangeType> & block_start_dates,
const PaddedPODArray<RangeType> & block_end_dates) const;
const PaddedPODArray<RangeType> & block_end_dates,
size_t start,
size_t end) const;
PaddedPODArray<Int64> makeDateKey(
PaddedPODArray<Int64> makeDateKeys(
const PaddedPODArray<RangeType> & block_start_dates,
const PaddedPODArray<RangeType> & block_end_dates) const;
std::shared_ptr<const IDictionary> dictionary;
NameSet column_names;
PaddedPODArray<Key> ids;
PaddedPODArray<KeyType> keys;
PaddedPODArray<RangeType> start_dates;
PaddedPODArray<RangeType> end_dates;
};
template <typename RangeType>
RangeDictionarySourceData<RangeType>::RangeDictionarySourceData(
template <RangeDictionaryType range_dictionary_type, typename RangeType>
RangeDictionarySourceData<range_dictionary_type, RangeType>::RangeDictionarySourceData(
std::shared_ptr<const IDictionary> dictionary_,
const Names & column_names_,
PaddedPODArray<Key> && ids_,
PaddedPODArray<KeyType> && keys,
PaddedPODArray<RangeType> && block_start_dates,
PaddedPODArray<RangeType> && block_end_dates)
: dictionary(dictionary_)
, column_names(column_names_.begin(), column_names_.end())
, ids(std::move(ids_))
, keys(std::move(keys))
, start_dates(std::move(block_start_dates))
, end_dates(std::move(block_end_dates))
{
}
template <typename RangeType>
Block RangeDictionarySourceData<RangeType>::getBlock(size_t start, size_t length) const
template <RangeDictionaryType range_dictionary_type, typename RangeType>
Block RangeDictionarySourceData<range_dictionary_type, RangeType>::getBlock(size_t start, size_t length) const
{
PaddedPODArray<Key> block_ids;
PaddedPODArray<KeyType> block_keys;
PaddedPODArray<RangeType> block_start_dates;
PaddedPODArray<RangeType> block_end_dates;
block_ids.reserve(length);
block_keys.reserve(length);
block_start_dates.reserve(length);
block_end_dates.reserve(length);
for (auto idx : collections::range(start, start + length))
for (size_t index = start; index < start + length; ++index)
{
block_ids.push_back(ids[idx]);
block_start_dates.push_back(start_dates[idx]);
block_end_dates.push_back(end_dates[idx]);
block_keys.push_back(keys[index]);
block_start_dates.push_back(start_dates[index]);
block_end_dates.push_back(end_dates[index]);
}
return fillBlock(block_ids, block_start_dates, block_end_dates);
return fillBlock(block_keys, block_start_dates, block_end_dates, start, start + length);
}
template <typename RangeType>
PaddedPODArray<Int64> RangeDictionarySourceData<RangeType>::makeDateKey(
const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const
{
PaddedPODArray<Int64> key(block_start_dates.size());
for (size_t i = 0; i < key.size(); ++i)
{
if (RangeHashedDictionary::Range::isCorrectDate(block_start_dates[i]))
key[i] = block_start_dates[i];
else
key[i] = block_end_dates[i];
}
return key;
}
template <typename RangeType>
Block RangeDictionarySourceData<RangeType>::fillBlock(
const PaddedPODArray<Key> & ids_to_fill,
/// Build one lookup date per row: the start date when Range::isCorrectDate accepts it,
/// otherwise the end date of the same row.
template <RangeDictionaryType range_dictionary_type, typename RangeType>
PaddedPODArray<Int64> RangeDictionarySourceData<range_dictionary_type, RangeType>::makeDateKeys(
    const PaddedPODArray<RangeType> & block_start_dates,
    const PaddedPODArray<RangeType> & block_end_dates) const
{
    const size_t rows = block_start_dates.size();
    PaddedPODArray<Int64> date_keys(rows);

    for (size_t row = 0; row < rows; ++row)
    {
        const auto & start_date = block_start_dates[row];
        date_keys[row] = Range::isCorrectDate(start_date) ? start_date : block_end_dates[row];
    }

    return date_keys;
}
/// Build a block with the requested columns (those listed in column_names) for one portion of the dictionary.
///
/// keys_to_fill       - keys of the portion; used directly as the UInt64 key column for simple dictionaries.
/// block_start_dates,
/// block_end_dates    - range bounds of the portion, row-aligned with the keys.
/// start, end         - position [start, end) of the portion inside the stored `keys`; used for complex
///                      dictionaries, whose key columns are deserialized from the stored keys.
///
/// Attribute values are fetched from the dictionary with the key columns followed by one Int64 date column
/// produced by makeDateKeys.
template <RangeDictionaryType range_dictionary_type, typename RangeType>
Block RangeDictionarySourceData<range_dictionary_type, RangeType>::fillBlock(
    const PaddedPODArray<KeyType> & keys_to_fill,
    const PaddedPODArray<RangeType> & block_start_dates,
    const PaddedPODArray<RangeType> & block_end_dates,
    size_t start,
    size_t end) const
{
    ColumnsWithTypeAndName columns;
    const DictionaryStructure & dictionary_structure = dictionary->getStructure();

    DataTypes keys_types;
    Columns keys_columns;
    Strings keys_names = dictionary_structure.getKeysNames();

    if constexpr (range_dictionary_type == RangeDictionaryType::simple)
    {
        keys_columns = {getColumnFromPODArray(keys_to_fill)};
        keys_types = {std::make_shared<DataTypeUInt64>()};
    }
    else
    {
        for (const auto & attribute : *dictionary_structure.key)
            keys_types.emplace_back(attribute.type);

        auto deserialized_columns = deserializeColumnsFromKeys(dictionary_structure, keys, start, end);
        for (auto & deserialized_column : deserialized_columns)
            keys_columns.emplace_back(std::move(deserialized_column));
    }

    size_t keys_size = keys_names.size();

    assert(keys_columns.size() == keys_size);
    assert(keys_types.size() == keys_size);

    /// Key columns go to the result only when they were requested.
    for (size_t i = 0; i < keys_size; ++i)
    {
        auto & key_name = keys_names[i];

        if (column_names.find(key_name) != column_names.end())
            columns.emplace_back(keys_columns[i], keys_types[i], key_name);
    }

    /// The date column is appended after the key columns: it is the last lookup argument for getColumn.
    auto date_key = makeDateKeys(block_start_dates, block_end_dates);
    auto date_column = getColumnFromPODArray(date_key);

    keys_columns.emplace_back(std::move(date_column));
    keys_types.emplace_back(std::make_shared<DataTypeInt64>());

    const auto & range_min_column_name = dictionary_structure.range_min->name;
    if (column_names.find(range_min_column_name) != column_names.end())
    {
        auto range_min_column = getColumnFromPODArray(block_start_dates);
        /// The type is taken from range_max for both bounds.
        columns.emplace_back(range_min_column, dictionary_structure.range_max->type, range_min_column_name);
    }

    const auto & range_max_column_name = dictionary_structure.range_max->name;
    if (column_names.find(range_max_column_name) != column_names.end())
    {
        auto range_max_column = getColumnFromPODArray(block_end_dates);
        columns.emplace_back(range_max_column, dictionary_structure.range_max->type, range_max_column_name);
    }

    size_t attributes_size = dictionary_structure.attributes.size();
    for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index)
    {
        const auto & attribute = dictionary_structure.attributes[attribute_index];
        if (column_names.find(attribute.name) == column_names.end())
            continue;

        auto column = dictionary->getColumn(
            attribute.name,
            attribute.type,
            keys_columns,
            keys_types,
            nullptr /* default_values_column*/);

        columns.emplace_back(std::move(column), attribute.type, attribute.name);
    }

    return Block(columns);
}
/*
* BlockInputStream implementation for external dictionaries
* read() returns single block consisting of the in-memory contents of the dictionaries
*/
template <typename RangeType>
template <RangeDictionaryType range_dictionary_type, typename RangeType>
class RangeDictionarySource : public DictionarySourceBase
{
public:
using Key = UInt64;
RangeDictionarySource(RangeDictionarySourceData<RangeType> data_, size_t max_block_size);
RangeDictionarySource(RangeDictionarySourceData<range_dictionary_type, RangeType> data_, size_t max_block_size);
String getName() const override { return "RangeDictionarySource"; }
protected:
Block getBlock(size_t start, size_t length) const override;
RangeDictionarySourceData<RangeType> data;
RangeDictionarySourceData<range_dictionary_type, RangeType> data;
};
template <typename RangeType>
RangeDictionarySource<RangeType>::RangeDictionarySource(RangeDictionarySourceData<RangeType> data_, size_t max_block_size)
template <RangeDictionaryType range_dictionary_type, typename RangeType>
RangeDictionarySource<range_dictionary_type, RangeType>::RangeDictionarySource(RangeDictionarySourceData<range_dictionary_type, RangeType> data_, size_t max_block_size)
: DictionarySourceBase(data_.getBlock(0, 0), data_.getNumRows(), max_block_size)
, data(std::move(data_))
{
}
template <typename RangeType>
Block RangeDictionarySource<RangeType>::getBlock(size_t start, size_t length) const
template <RangeDictionaryType range_dictionary_type, typename RangeType>
Block RangeDictionarySource<range_dictionary_type, RangeType>::getBlock(size_t start, size_t length) const
{
return data.getBlock(start, length);
}

View File

@ -10,7 +10,8 @@
namespace
{
using RangeStorageType = DB::RangeHashedDictionary::RangeStorageType;
using RangeStorageType = DB::RangeStorageType;
// Null values mean that specified boundary, either min or max is not set on range.
// To simplify comparison, null value of min bound should be bigger than any other value,
@ -25,7 +26,7 @@ RangeStorageType getColumnIntValueOrDefault(const DB::IColumn & column, size_t i
return default_value;
const RangeStorageType result = static_cast<RangeStorageType>(column.getInt(index));
if (isDate && !DB::RangeHashedDictionary::Range::isCorrectDate(result))
if (isDate && !DB::Range::isCorrectDate(result))
return default_value;
return result;
@ -50,27 +51,26 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
extern const int BAD_ARGUMENTS;
extern const int DICTIONARY_IS_EMPTY;
extern const int TYPE_MISMATCH;
extern const int UNSUPPORTED_METHOD;
}
bool RangeHashedDictionary::Range::isCorrectDate(const RangeStorageType & date)
bool Range::isCorrectDate(const RangeStorageType & date)
{
return 0 < date && date <= DATE_LUT_MAX_DAY_NUM;
}
bool RangeHashedDictionary::Range::contains(const RangeStorageType & value) const
bool Range::contains(const RangeStorageType & value) const
{
return left <= value && value <= right;
}
static bool operator<(const RangeHashedDictionary::Range & left, const RangeHashedDictionary::Range & right)
static bool operator<(const Range & left, const Range & right)
{
return std::tie(left.left, left.right) < std::tie(right.left, right.right);
}
RangeHashedDictionary::RangeHashedDictionary(
template <DictionaryKeyType dictionary_key_type>
RangeHashedDictionary<dictionary_key_type>::RangeHashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
DictionarySourcePtr source_ptr_,
@ -87,7 +87,8 @@ RangeHashedDictionary::RangeHashedDictionary(
calculateBytesAllocated();
}
ColumnPtr RangeHashedDictionary::getColumn(
template <DictionaryKeyType dictionary_key_type>
ColumnPtr RangeHashedDictionary<dictionary_key_type>::getColumn(
const std::string & attribute_name,
const DataTypePtr & result_type,
const Columns & key_columns,
@ -96,20 +97,18 @@ ColumnPtr RangeHashedDictionary::getColumn(
{
ColumnPtr result;
const auto & attribute = getAttribute(attribute_name);
const auto & dictionary_attribute = dict_struct.getAttribute(attribute_name, result_type);
auto keys_size = key_columns.front()->size();
const size_t attribute_index = dict_struct.attribute_name_to_index.find(attribute_name)->second;
const auto & attribute = attributes[attribute_index];
/// Cast second column to storage type
Columns modified_key_columns = key_columns;
auto range_storage_column = key_columns[1];
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types[1], ""};
auto range_storage_column = key_columns.back();
ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""};
auto range_column_storage_type = std::make_shared<DataTypeInt64>();
modified_key_columns[1] = castColumnAccurate(column_to_cast, range_column_storage_type);
modified_key_columns.back() = castColumnAccurate(column_to_cast, range_column_storage_type);
size_t keys_size = key_columns.front()->size();
bool is_attribute_nullable = attribute.is_nullable;
ColumnUInt8::MutablePtr col_null_map_to;
@ -204,24 +203,26 @@ ColumnPtr RangeHashedDictionary::getColumn(
return result;
}
ColumnUInt8::Ptr RangeHashedDictionary::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
/// For every row tell whether the dictionary has the key with a range that contains the row's date.
///
/// key_columns / key_types hold the key columns followed by the range (date) column as the LAST element.
/// The range column is cast to the Int64 storage type; the remaining columns form the dictionary key.
/// Returns a UInt8 column with 1 for found rows and 0 otherwise, and updates the query / found counters.
template <DictionaryKeyType dictionary_key_type>
ColumnUInt8::Ptr RangeHashedDictionary<dictionary_key_type>::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
{
    auto range_column_storage_type = std::make_shared<DataTypeInt64>();
    auto range_storage_column = key_columns.back();
    /// The type must be taken from the same position as the column: for a complex key with several
    /// attributes index 1 is a key attribute, not the range column.
    ColumnWithTypeAndName column_to_cast = {range_storage_column->convertToFullColumnIfConst(), key_types.back(), ""};
    auto range_column_updated = castColumnAccurate(column_to_cast, range_column_storage_type);
    PaddedPODArray<RangeStorageType> range_backup_storage;
    const PaddedPODArray<RangeStorageType> & dates = getColumnVectorData(this, range_column_updated, range_backup_storage);

    /// Everything except the trailing range column is the key itself.
    auto key_columns_copy = key_columns;
    key_columns_copy.pop_back();
    DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
    DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns_copy, arena_holder.getComplexKeyArena());
    const size_t keys_size = keys_extractor.getKeysSize();

    /// The first attribute's collection is the one searched for the keys.
    const auto & attribute = attributes.front();

    auto result = ColumnUInt8::create(keys_size);
    auto & out = result->getData();
    size_t keys_found = 0;

    auto type_call = [&](const auto & dictionary_attribute_type)
    {
        using Type = std::decay_t<decltype(dictionary_attribute_type)>;
        using AttributeType = typename Type::AttributeType;
        using ValueType = DictionaryValueType<AttributeType>;

        const auto & collection = std::get<CollectionType<ValueType>>(attribute.maps);

        for (size_t key_index = 0; key_index < keys_size; ++key_index)
        {
            const auto key = keys_extractor.extractCurrentKey();
            const auto it = collection.find(key);

            if (it)
            {
                const auto date = dates[key_index];
                const auto & ranges_and_values = it->getMapped();
                const auto val_it = std::find_if(
                    std::begin(ranges_and_values),
                    std::end(ranges_and_values),
                    [date](const Value<ValueType> & v)
                    {
                        return v.range.contains(date);
                    });

                out[key_index] = val_it != std::end(ranges_and_values);
                keys_found += out[key_index];
            }
            else
            {
                out[key_index] = false;
            }

            keys_extractor.rollbackCurrentKey();
        }
    };

    callOnDictionaryAttributeType(attribute.type, type_call);

    query_count.fetch_add(keys_size, std::memory_order_relaxed);
    found_count.fetch_add(keys_found, std::memory_order_relaxed);

    return result;
}
template <typename AttributeType>
ColumnUInt8::Ptr RangeHashedDictionary::hasKeysImpl(
const Attribute & attribute,
const PaddedPODArray<UInt64> & ids,
const PaddedPODArray<RangeStorageType> & dates,
size_t & keys_found) const
{
auto result = ColumnUInt8::create(ids.size());
auto& out = result->getData();
const auto & attr = *std::get<Ptr<AttributeType>>(attribute.maps);
keys_found = 0;
for (const auto row : collections::range(0, ids.size()))
{
const auto it = attr.find(ids[row]);
if (it)
{
const auto date = dates[row];
const auto & ranges_and_values = it->getMapped();
const auto val_it = std::find_if(
std::begin(ranges_and_values),
std::end(ranges_and_values),
[date](const Value<AttributeType> & v)
{
return v.range.contains(date);
});
out[row] = val_it != std::end(ranges_and_values);
keys_found += out[row];
}
else
out[row] = false;
}
return result;
}
void RangeHashedDictionary::createAttributes()
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::createAttributes()
{
const auto size = dict_struct.attributes.size();
attributes.reserve(size);
@ -296,7 +287,8 @@ void RangeHashedDictionary::createAttributes()
}
}
void RangeHashedDictionary::loadData()
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::loadData()
{
QueryPipeline pipeline;
pipeline.init(source_ptr->loadAll());
@ -305,39 +297,60 @@ void RangeHashedDictionary::loadData()
Block block;
while (executor.pull(block))
{
const auto & id_column = *block.safeGetByPosition(0).column;
size_t skip_keys_size_offset = dict_struct.getKeysSize();
Columns key_columns;
key_columns.reserve(skip_keys_size_offset);
/// Split into keys columns and attribute columns
for (size_t i = 0; i < skip_keys_size_offset; ++i)
key_columns.emplace_back(block.safeGetByPosition(i).column);
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, arena_holder.getComplexKeyArena());
const size_t keys_size = keys_extractor.getKeysSize();
element_count += keys_size;
// Support old behaviour, where invalid date means 'open range'.
const bool is_date = isDate(dict_struct.range_min->type);
const auto & min_range_column = unwrapNullableColumn(*block.safeGetByPosition(1).column);
const auto & max_range_column = unwrapNullableColumn(*block.safeGetByPosition(2).column);
const auto & min_range_column = unwrapNullableColumn(*block.safeGetByPosition(skip_keys_size_offset).column);
const auto & max_range_column = unwrapNullableColumn(*block.safeGetByPosition(skip_keys_size_offset + 1).column);
element_count += id_column.size();
skip_keys_size_offset += 2;
for (const auto attribute_idx : collections::range(0, attributes.size()))
for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
{
const auto & attribute_column = *block.safeGetByPosition(attribute_idx + 3).column;
auto & attribute = attributes[attribute_idx];
const auto & attribute_column = *block.safeGetByPosition(attribute_index + skip_keys_size_offset).column;
auto & attribute = attributes[attribute_index];
for (const auto row_idx : collections::range(0, id_column.size()))
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
auto key = keys_extractor.extractCurrentKey();
RangeStorageType lower_bound;
RangeStorageType upper_bound;
if (is_date)
{
lower_bound = getColumnIntValueOrDefault(min_range_column, row_idx, is_date, 0);
upper_bound = getColumnIntValueOrDefault(max_range_column, row_idx, is_date, DATE_LUT_MAX_DAY_NUM + 1);
lower_bound = getColumnIntValueOrDefault(min_range_column, key_index, is_date, 0);
upper_bound = getColumnIntValueOrDefault(max_range_column, key_index, is_date, DATE_LUT_MAX_DAY_NUM + 1);
}
else
{
lower_bound = getColumnIntValueOrDefault(min_range_column, row_idx, is_date, RANGE_MIN_NULL_VALUE);
upper_bound = getColumnIntValueOrDefault(max_range_column, row_idx, is_date, RANGE_MAX_NULL_VALUE);
lower_bound = getColumnIntValueOrDefault(min_range_column, key_index, is_date, RANGE_MIN_NULL_VALUE);
upper_bound = getColumnIntValueOrDefault(max_range_column, key_index, is_date, RANGE_MAX_NULL_VALUE);
}
setAttributeValue(attribute, id_column.getUInt(row_idx), Range{lower_bound, upper_bound}, attribute_column[row_idx]);
if constexpr (std::is_same_v<KeyType, StringRef>)
key = copyKeyInArena(key);
setAttributeValue(attribute, key, Range{lower_bound, upper_bound}, attribute_column[key_index]);
keys_extractor.rollbackCurrentKey();
}
keys_extractor.reset();
}
}
@ -346,22 +359,8 @@ void RangeHashedDictionary::loadData()
"{}: dictionary source is empty and 'require_nonempty' property is set.");
}
template <typename T>
void RangeHashedDictionary::addAttributeSize(const Attribute & attribute)
{
const auto & map_ref = std::get<Ptr<T>>(attribute.maps);
bytes_allocated += sizeof(Collection<T>) + map_ref->getBufferSizeInBytes();
bucket_count = map_ref->getBufferSizeInCells();
}
template <>
void RangeHashedDictionary::addAttributeSize<String>(const Attribute & attribute)
{
addAttributeSize<StringRef>(attribute);
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
}
void RangeHashedDictionary::calculateBytesAllocated()
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::calculateBytesAllocated()
{
bytes_allocated += attributes.size() * sizeof(attributes.front());
@ -371,14 +370,25 @@ void RangeHashedDictionary::calculateBytesAllocated()
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
addAttributeSize<AttributeType>(attribute);
using ValueType = DictionaryValueType<AttributeType>;
const auto & collection = std::get<CollectionType<ValueType>>(attribute.maps);
bytes_allocated += sizeof(CollectionType<ValueType>) + collection.getBufferSizeInBytes();
bucket_count = collection.getBufferSizeInCells();
if constexpr (std::is_same_v<ValueType, StringRef>)
bytes_allocated += sizeof(Arena) + attribute.string_arena->size();
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
if constexpr (dictionary_key_type == DictionaryKeyType::complex)
bytes_allocated += complex_key_arena.size();
}
RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const DictionaryAttribute & dictionary_attribute)
template <DictionaryKeyType dictionary_key_type>
typename RangeHashedDictionary<dictionary_key_type>::Attribute RangeHashedDictionary<dictionary_key_type>::createAttribute(const DictionaryAttribute & dictionary_attribute)
{
Attribute attribute{dictionary_attribute.underlying_type, dictionary_attribute.is_nullable, {}, {}};
@ -391,7 +401,7 @@ RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const Di
if constexpr (std::is_same_v<AttributeType, String>)
attribute.string_arena = std::make_unique<Arena>();
attribute.maps = std::make_unique<Collection<ValueType>>();
attribute.maps = CollectionType<ValueType>();
};
callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call);
@ -399,29 +409,35 @@ RangeHashedDictionary::Attribute RangeHashedDictionary::createAttribute(const Di
return attribute;
}
template <DictionaryKeyType dictionary_key_type>
template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
void RangeHashedDictionary::getItemsImpl(
void RangeHashedDictionary<dictionary_key_type>::getItemsImpl(
const Attribute & attribute,
const Columns & key_columns,
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const
{
PaddedPODArray<UInt64> key_backup_storage;
PaddedPODArray<RangeStorageType> range_backup_storage;
const PaddedPODArray<UInt64> & ids = getColumnVectorData(this, key_columns[0], key_backup_storage);
const PaddedPODArray<RangeStorageType> & dates = getColumnVectorData(this, key_columns[1], range_backup_storage);
const auto & attr = *std::get<Ptr<AttributeType>>(attribute.maps);
const auto & collection = std::get<CollectionType<AttributeType>>(attribute.maps);
size_t keys_found = 0;
for (const auto row : collections::range(0, ids.size()))
PaddedPODArray<RangeStorageType> range_backup_storage;
const auto & dates = getColumnVectorData(this, key_columns.back(), range_backup_storage);
auto key_columns_copy = key_columns;
key_columns_copy.pop_back();
DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns_copy, arena_holder.getComplexKeyArena());
const size_t keys_size = keys_extractor.getKeysSize();
for (size_t key_index = 0; key_index < keys_size; ++key_index)
{
const auto it = attr.find(ids[row]);
auto key = keys_extractor.extractCurrentKey();
const auto it = collection.find(key);
if (it)
{
const auto date = dates[row];
const auto date = dates[key_index];
const auto & ranges_and_values = it->getMapped();
const auto val_it = std::find_if(
std::begin(ranges_and_values),
@ -439,35 +455,38 @@ void RangeHashedDictionary::getItemsImpl(
if constexpr (is_nullable)
{
if (value.has_value())
set_value(row, *value, false);
set_value(key_index, *value, false);
else
set_value(row, default_value_extractor[row], true);
set_value(key_index, default_value_extractor[key_index], true);
}
else
{
set_value(row, *value, false);
set_value(key_index, *value, false);
}
keys_extractor.rollbackCurrentKey();
continue;
}
}
if constexpr (is_nullable)
set_value(row, default_value_extractor[row], default_value_extractor.isNullAt(row));
set_value(key_index, default_value_extractor[key_index], default_value_extractor.isNullAt(key_index));
else
set_value(row, default_value_extractor[row], false);
set_value(key_index, default_value_extractor[key_index], false);
keys_extractor.rollbackCurrentKey();
}
query_count.fetch_add(ids.size(), std::memory_order_relaxed);
query_count.fetch_add(keys_size, std::memory_order_relaxed);
found_count.fetch_add(keys_found, std::memory_order_relaxed);
}
template <DictionaryKeyType dictionary_key_type>
template <typename T>
void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value)
void RangeHashedDictionary<dictionary_key_type>::setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value)
{
using ValueType = std::conditional_t<std::is_same_v<T, String>, StringRef, T>;
auto & map = *std::get<Ptr<ValueType>>(attribute.maps);
auto & collection = std::get<CollectionType<ValueType>>(attribute.maps);
Value<ValueType> value_to_insert;
@ -490,61 +509,47 @@ void RangeHashedDictionary::setAttributeValueImpl(Attribute & attribute, const U
}
}
const auto it = map.find(id);
const auto it = collection.find(key);
if (it)
{
auto & values = it->getMapped();
const auto insert_it
= std::lower_bound(std::begin(values), std::end(values), range, [](const Value<ValueType> & lhs, const Range & rhs_range)
{
return lhs.range < rhs_range;
});
const auto insert_it = std::lower_bound(
std::begin(values),
std::end(values),
range,
[](const Value<ValueType> & lhs, const Range & rhs_range)
{
return lhs.range < rhs_range;
});
values.insert(insert_it, std::move(value_to_insert));
}
else
map.insert({id, Values<ValueType>{std::move(value_to_insert)}});
{
collection.insert({key, Values<ValueType>{std::move(value_to_insert)}});
}
}
void RangeHashedDictionary::setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value)
template <DictionaryKeyType dictionary_key_type>
void RangeHashedDictionary<dictionary_key_type>::setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value)
{
auto type_call = [&](const auto &dictionary_attribute_type)
{
using Type = std::decay_t<decltype(dictionary_attribute_type)>;
using AttributeType = typename Type::AttributeType;
setAttributeValueImpl<AttributeType>(attribute, id, range, value);
setAttributeValueImpl<AttributeType>(attribute, key, range, value);
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
const RangeHashedDictionary::Attribute & RangeHashedDictionary::getAttribute(const std::string & attribute_name) const
{
const auto it = attribute_index_by_name.find(attribute_name);
if (it == std::end(attribute_index_by_name))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "{}: no such attribute '{}'", full_name, attribute_name);
return attributes[it->second];
}
const RangeHashedDictionary::Attribute &
RangeHashedDictionary::getAttributeWithType(const std::string & attribute_name, const AttributeUnderlyingType type) const
{
const auto & attribute = getAttribute(attribute_name);
if (attribute.type != type)
throw Exception(ErrorCodes::TYPE_MISMATCH, "attribute {} has type {}",
attribute_name,
toString(attribute.type));
return attribute;
}
template <DictionaryKeyType dictionary_key_type>
template <typename RangeType>
void RangeHashedDictionary::getIdsAndDates(
PaddedPODArray<UInt64> & ids,
void RangeHashedDictionary<dictionary_key_type>::getKeysAndDates(
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const
{
@ -556,32 +561,33 @@ void RangeHashedDictionary::getIdsAndDates(
using AttributeType = typename Type::AttributeType;
using ValueType = DictionaryValueType<AttributeType>;
getIdsAndDates<ValueType>(attribute, ids, start_dates, end_dates);
getKeysAndDates<ValueType>(attribute, keys, start_dates, end_dates);
};
callOnDictionaryAttributeType(attribute.type, type_call);
}
template <DictionaryKeyType dictionary_key_type>
template <typename T, typename RangeType>
void RangeHashedDictionary::getIdsAndDates(
void RangeHashedDictionary<dictionary_key_type>::getKeysAndDates(
const Attribute & attribute,
PaddedPODArray<UInt64> & ids,
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const
{
const HashMap<UInt64, Values<T>> & attr = *std::get<Ptr<T>>(attribute.maps);
const auto & collection = std::get<CollectionType<T>>(attribute.maps);
ids.reserve(attr.size());
start_dates.reserve(attr.size());
end_dates.reserve(attr.size());
keys.reserve(collection.size());
start_dates.reserve(collection.size());
end_dates.reserve(collection.size());
const bool is_date = isDate(dict_struct.range_min->type);
for (const auto & key : attr)
for (const auto & key : collection)
{
for (const auto & value : key.getMapped())
{
ids.push_back(key.getKey());
keys.push_back(key.getKey());
start_dates.push_back(value.range.left);
end_dates.push_back(value.range.right);
@ -592,33 +598,44 @@ void RangeHashedDictionary::getIdsAndDates(
}
}
template <DictionaryKeyType dictionary_key_type>
template <typename RangeType>
Pipe RangeHashedDictionary::readImpl(const Names & column_names, size_t max_block_size) const
Pipe RangeHashedDictionary<dictionary_key_type>::readImpl(const Names & column_names, size_t max_block_size) const
{
PaddedPODArray<UInt64> ids;
PaddedPODArray<KeyType> keys;
PaddedPODArray<RangeType> start_dates;
PaddedPODArray<RangeType> end_dates;
getIdsAndDates(ids, start_dates, end_dates);
getKeysAndDates(keys, start_dates, end_dates);
using RangeDictionarySourceType = RangeDictionarySource<RangeType>;
static constexpr RangeDictionaryType range_dictionary_type = (dictionary_key_type == DictionaryKeyType::simple) ? RangeDictionaryType::simple : RangeDictionaryType::complex;
using RangeDictionarySourceType = RangeDictionarySource<range_dictionary_type, RangeType>;
auto source = std::make_shared<RangeDictionarySourceType>(
RangeDictionarySourceData<RangeType>(
shared_from_this(),
column_names,
std::move(ids),
std::move(start_dates),
std::move(end_dates)),
max_block_size);
auto source_data = RangeDictionarySourceData<range_dictionary_type, RangeType>(
shared_from_this(),
column_names,
std::move(keys),
std::move(start_dates),
std::move(end_dates));
auto source = std::make_shared<RangeDictionarySourceType>(std::move(source_data), max_block_size);
return Pipe(source);
}
/// Copies the bytes referenced by `key` into `complex_key_arena` and returns
/// a StringRef that points at the copy (same size as the input).
template <DictionaryKeyType dictionary_key_type>
StringRef RangeHashedDictionary<dictionary_key_type>::copyKeyInArena(StringRef key)
{
    const size_t size = key.size;
    char * destination = complex_key_arena.alloc(size);

    /// Pointer-to-void conversions are implicit, no casts are required here.
    memcpy(destination, key.data, size);

    return StringRef{destination, size};
}
template <DictionaryKeyType dictionary_key_type>
struct RangeHashedDictionaryCallGetSourceImpl
{
Pipe pipe;
const RangeHashedDictionary * dict;
const RangeHashedDictionary<dictionary_key_type> * dict;
const Names * column_names;
size_t max_block_size;
@ -627,15 +644,16 @@ struct RangeHashedDictionaryCallGetSourceImpl
{
const auto & type = dict->dict_struct.range_min->type;
if (pipe.empty() && dynamic_cast<const DataTypeNumberBase<RangeType> *>(type.get()))
pipe = dict->readImpl<RangeType>(*column_names, max_block_size);
pipe = dict->template readImpl<RangeType>(*column_names, max_block_size);
}
};
Pipe RangeHashedDictionary::read(const Names & column_names, size_t max_block_size) const
template <DictionaryKeyType dictionary_key_type>
Pipe RangeHashedDictionary<dictionary_key_type>::read(const Names & column_names, size_t max_block_size) const
{
using ListType = TypeList<UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Int128, Float32, Float64>;
RangeHashedDictionaryCallGetSourceImpl callable;
RangeHashedDictionaryCallGetSourceImpl<dictionary_key_type> callable;
callable.dict = this;
callable.column_names = &column_names;
callable.max_block_size = max_block_size;
@ -653,7 +671,7 @@ Pipe RangeHashedDictionary::read(const Names & column_names, size_t max_block_si
void registerDictionaryRangeHashed(DictionaryFactory & factory)
{
auto create_layout = [=](const std::string & full_name,
auto create_layout_simple = [=](const std::string & full_name,
const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
@ -672,9 +690,32 @@ void registerDictionaryRangeHashed(DictionaryFactory & factory)
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
return std::make_unique<RangeHashedDictionary>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
return std::make_unique<RangeHashedDictionary<DictionaryKeyType::simple>>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
};
factory.registerLayout("range_hashed", create_layout, false);
factory.registerLayout("range_hashed", create_layout_simple, false);
/// Factory callback for the 'complex_key_range_hashed' layout.
/// Validates the dictionary structure, reads lifetime and 'require_nonempty'
/// from the configuration and creates RangeHashedDictionary<DictionaryKeyType::complex>.
auto create_layout_complex = [=](const std::string & full_name,
const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
DictionarySourcePtr source_ptr,
ContextPtr /* context */,
bool /*created_from_ddl*/) -> DictionaryPtr
{
/// This layout rejects a structure that declares 'id'.
if (dict_struct.id)
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "'id' is not supported for dictionary of layout 'complex_key_range_hashed'");
/// Both range bounds must be described in the structure.
if (!dict_struct.range_min || !dict_struct.range_max)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"{}: dictionary of layout 'complex_key_range_hashed' requires .structure.range_min and .structure.range_max",
full_name);
const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);
const DictionaryLifetime dict_lifetime{config, config_prefix + ".lifetime"};
/// 'require_nonempty' defaults to false when it is absent from the configuration.
const bool require_nonempty = config.getBool(config_prefix + ".require_nonempty", false);
return std::make_unique<RangeHashedDictionary<DictionaryKeyType::complex>>(dict_id, dict_struct, std::move(source_ptr), dict_lifetime, require_nonempty);
};
/// NOTE(review): the last argument is 'false' for "range_hashed" and 'true' here;
/// presumably it marks the layout as a complex-key one - confirm against DictionaryFactory::registerLayout.
factory.registerLayout("complex_key_range_hashed", create_layout_complex, true);
}
}

View File

@ -16,9 +16,25 @@
namespace DB
{
using RangeStorageType = Int64;
/// A range of RangeStorageType (Int64) values attached to a dictionary entry.
/// NOTE(review): whether the bounds are inclusive is decided by contains(),
/// which is defined elsewhere - confirm there.
struct Range
{
/// Start of the range.
RangeStorageType left;
/// End of the range.
RangeStorageType right;
/// Declared here, defined elsewhere.
/// NOTE(review): presumably tells whether `date` is a valid date value - confirm.
static bool isCorrectDate(const RangeStorageType & date);
/// Tells whether `value` belongs to this range; used for range lookups.
bool contains(const RangeStorageType & value) const;
};
template <DictionaryKeyType dictionary_key_type>
class RangeHashedDictionary final : public IDictionary
{
public:
using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::simple, UInt64, StringRef>;
static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by hashed dictionary");
RangeHashedDictionary(
const StorageID & dict_id_,
const DictionaryStructure & dict_struct_,
@ -59,7 +75,7 @@ public:
bool isInjective(const std::string & attribute_name) const override
{
return dict_struct.attributes[&getAttribute(attribute_name) - attributes.data()].injective;
return dict_struct.getAttribute(attribute_name).injective;
}
DictionaryKeyType getKeyType() const override { return DictionaryKeyType::range; }
@ -73,19 +89,8 @@ public:
ColumnUInt8::Ptr hasKeys(const Columns & key_columns, const DataTypes & key_types) const override;
using RangeStorageType = Int64;
Pipe read(const Names & column_names, size_t max_block_size) const override;
struct Range
{
RangeStorageType left;
RangeStorageType right;
static bool isCorrectDate(const RangeStorageType & date);
bool contains(const RangeStorageType & value) const;
};
private:
template <typename T>
struct Value final
@ -96,10 +101,12 @@ private:
template <typename T>
using Values = std::vector<Value<T>>;
template <typename T>
using Collection = HashMap<UInt64, Values<T>>;
template <typename T>
using Ptr = std::unique_ptr<Collection<T>>;
template <typename Value>
using CollectionType = std::conditional_t<
dictionary_key_type == DictionaryKeyType::simple,
HashMap<UInt64, Values<Value>>,
HashMapWithSavedHash<StringRef, Values<Value>, DefaultHash<StringRef>>>;
struct Attribute final
{
@ -108,27 +115,27 @@ private:
bool is_nullable;
std::variant<
Ptr<UInt8>,
Ptr<UInt16>,
Ptr<UInt32>,
Ptr<UInt64>,
Ptr<UInt128>,
Ptr<UInt256>,
Ptr<Int8>,
Ptr<Int16>,
Ptr<Int32>,
Ptr<Int64>,
Ptr<Int128>,
Ptr<Int256>,
Ptr<Decimal32>,
Ptr<Decimal64>,
Ptr<Decimal128>,
Ptr<Decimal256>,
Ptr<Float32>,
Ptr<Float64>,
Ptr<UUID>,
Ptr<StringRef>,
Ptr<Array>>
CollectionType<UInt8>,
CollectionType<UInt16>,
CollectionType<UInt32>,
CollectionType<UInt64>,
CollectionType<UInt128>,
CollectionType<UInt256>,
CollectionType<Int8>,
CollectionType<Int16>,
CollectionType<Int32>,
CollectionType<Int64>,
CollectionType<Int128>,
CollectionType<Int256>,
CollectionType<Decimal32>,
CollectionType<Decimal64>,
CollectionType<Decimal128>,
CollectionType<Decimal256>,
CollectionType<Float32>,
CollectionType<Float64>,
CollectionType<UUID>,
CollectionType<StringRef>,
CollectionType<Array>>
maps;
std::unique_ptr<Arena> string_arena;
};
@ -137,9 +144,6 @@ private:
void loadData();
template <typename T>
void addAttributeSize(const Attribute & attribute);
void calculateBytesAllocated();
static Attribute createAttribute(const DictionaryAttribute & dictionary_attribute);
@ -151,35 +155,30 @@ private:
ValueSetter && set_value,
DefaultValueExtractor & default_value_extractor) const;
template <typename AttributeType>
ColumnUInt8::Ptr hasKeysImpl(
const Attribute & attribute,
const PaddedPODArray<UInt64> & ids,
const PaddedPODArray<RangeStorageType> & dates,
size_t & keys_found) const;
template <typename T>
static void setAttributeValueImpl(Attribute & attribute, const UInt64 id, const Range & range, const Field & value);
static void setAttributeValueImpl(Attribute & attribute, KeyType key, const Range & range, const Field & value);
static void setAttributeValue(Attribute & attribute, const UInt64 id, const Range & range, const Field & value);
const Attribute & getAttribute(const std::string & attribute_name) const;
const Attribute & getAttributeWithType(const std::string & name, const AttributeUnderlyingType type) const;
static void setAttributeValue(Attribute & attribute, KeyType key, const Range & range, const Field & value);
template <typename RangeType>
void getIdsAndDates(PaddedPODArray<UInt64> & ids, PaddedPODArray<RangeType> & start_dates, PaddedPODArray<RangeType> & end_dates) const;
void getKeysAndDates(
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const;
template <typename T, typename RangeType>
void getIdsAndDates(
void getKeysAndDates(
const Attribute & attribute,
PaddedPODArray<UInt64> & ids,
PaddedPODArray<KeyType> & keys,
PaddedPODArray<RangeType> & start_dates,
PaddedPODArray<RangeType> & end_dates) const;
template <typename RangeType>
Pipe readImpl(const Names & column_names, size_t max_block_size) const;
StringRef copyKeyInArena(StringRef key);
template <DictionaryKeyType>
friend struct RangeHashedDictionaryCallGetSourceImpl;
const DictionaryStructure dict_struct;
@ -189,6 +188,7 @@ private:
std::map<std::string, size_t> attribute_index_by_name;
std::vector<Attribute> attributes;
Arena complex_key_arena;
size_t bytes_allocated = 0;
size_t element_count = 0;

View File

@ -6,6 +6,7 @@
namespace DB
{
using DictionaryConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
/// Convert dictionary AST to Poco::AbstractConfiguration
@ -13,4 +14,5 @@ using DictionaryConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfigurati
/// Can throw exception if query is ill-formed
DictionaryConfigurationPtr
getDictionaryConfigurationFromAST(const ASTCreateQuery & query, ContextPtr context, const std::string & database_ = "");
}

View File

@ -27,6 +27,7 @@ SRCS(
CassandraSource.cpp
ClickHouseDictionarySource.cpp
DictionaryFactory.cpp
DictionaryHelpers.cpp
DictionarySource.cpp
DictionarySourceBase.cpp
DictionarySourceFactory.cpp

View File

@ -76,6 +76,8 @@ public:
return 1;
}
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
bool useDefaultImplementationForConstants() const override
{
return true;

View File

@ -955,6 +955,12 @@ public:
size_t getNumberOfArguments() const override { return 2; }
/// Returns true in two cases:
///  - integer division or modulo whose second argument is not a constant;
///  - floating division where at least one of the two arguments is a Decimal.
/// NOTE(review): presumably these are the operations that are expensive or may throw
/// (e.g. division by zero), so lazy evaluation of their arguments is worthwhile - confirm.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & arguments) const override
{
return ((IsOperation<Op>::div_int || IsOperation<Op>::modulo) && !arguments[1].is_const)
|| (IsOperation<Op>::div_floating && (isDecimal(arguments[0].type) || isDecimal(arguments[1].type)));
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
return getReturnTypeImplStatic(arguments, context);

View File

@ -29,6 +29,7 @@ public:
String getName() const override { return name; }
bool isVariadic() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override

View File

@ -30,6 +30,7 @@ public:
bool isVariadic() const override { return true; }
bool isInjective(const ColumnsWithTypeAndName &) const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override

View File

@ -30,6 +30,7 @@ public:
String getName() const override { return name; }
bool isVariadic() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override

View File

@ -399,6 +399,7 @@ public:
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{

View File

@ -33,6 +33,7 @@ public:
}
bool isVariadic() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override

View File

@ -24,6 +24,8 @@ public:
bool isDeterministic() const override { return false; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
size_t getNumberOfArguments() const override
{
return 0;

View File

@ -33,6 +33,7 @@ public:
size_t getNumberOfArguments() const override { return 1; }
bool isInjective(const ColumnsWithTypeAndName &) const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{

View File

@ -90,11 +90,17 @@ FunctionBasePtr JoinGetOverloadResolver<or_null>::buildImpl(const ColumnsWithTyp
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
auto [storage_join, attr_name] = getJoin(arguments, getContext());
DataTypes data_types(arguments.size() - 2);
for (size_t i = 2; i < arguments.size(); ++i)
data_types[i - 2] = arguments[i].type;
DataTypes argument_types(arguments.size());
for (size_t i = 0; i < arguments.size(); ++i)
{
if (i >= 2)
data_types[i - 2] = arguments[i].type;
argument_types[i] = arguments[i].type;
}
auto return_type = storage_join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null);
auto table_lock = storage_join->lockForShare(getContext()->getInitialQueryId(), getContext()->getSettingsRef().lock_acquire_timeout);
return std::make_unique<FunctionJoinGet<or_null>>(table_lock, storage_join, attr_name, data_types, return_type);
return std::make_unique<FunctionJoinGet<or_null>>(table_lock, storage_join, attr_name, argument_types, return_type);
}
void registerFunctionJoinGet(FunctionFactory & factory)

View File

@ -60,6 +60,8 @@ public:
String getName() const override { return name; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
const DataTypes & getArgumentTypes() const override { return argument_types; }
const DataTypePtr & getResultType() const override { return return_type; }

View File

@ -32,6 +32,7 @@ public:
static_assert(Impl::rows_per_iteration > 0, "Impl must process at least one row per iteration");
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
private:
String getName() const override { return name; }

View File

@ -20,6 +20,8 @@ private:
/// Reports an argument count of 0.
size_t getNumberOfArguments() const override
{
    return 0;
}

/// Unconditionally answers `false`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return false;
}
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
{
return std::make_shared<DataTypeFloat64>();

View File

@ -41,6 +41,8 @@ private:
/// Returns the value of `name`.
String getName() const override
{
    return name;
}

/// Reports an argument count of 1.
size_t getNumberOfArguments() const override
{
    return 1;
}

/// Unconditionally answers `false`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return false;
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
const auto & arg = arguments.front();

View File

@ -38,6 +38,11 @@ public:
return 1;
}
/// Unconditionally answers `false`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isNativeNumber(arguments.front()))

View File

@ -153,6 +153,7 @@ public:
/// Reports an argument count of 0.
size_t getNumberOfArguments() const override
{
    return 0;
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}

/// Returns the single-element list {0}.
/// NOTE(review): presumably these are positions of arguments that must be constant; confirm against the base interface.
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
    return {0};
}

/// Unconditionally answers `true`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return true;
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{

View File

@ -42,6 +42,11 @@ public:
return name;
}
/// Unconditionally answers `true`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
size_t getNumberOfArguments() const override
{
return 2;

View File

@ -21,7 +21,7 @@ namespace ErrorCodes
}
template <typename Impl, typename Name, typename ResultType>
template <typename Impl, typename Name, typename ResultType, bool is_suitable_for_short_circuit_arguments_execution = true>
class FunctionStringOrArrayToT : public IFunction
{
public:
@ -41,6 +41,11 @@ public:
return 1;
}
/// Forwards the `is_suitable_for_short_circuit_arguments_execution` value unchanged; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return is_suitable_for_short_circuit_arguments_execution; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isStringOrFixedString(arguments[0])

View File

@ -29,6 +29,8 @@ public:
/// Reports an argument count of 3.
size_t getNumberOfArguments() const override
{
    return 3;
}

/// Unconditionally answers `true`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return true;
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}

/// Returns the list {1, 2}.
/// NOTE(review): presumably these are positions of arguments that must be constant; confirm against the base interface.
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
    return {1, 2};
}

View File

@ -43,6 +43,11 @@ public:
return is_injective;
}
/// Unconditionally answers `true`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isStringOrFixedString(arguments[0]))

View File

@ -120,6 +120,7 @@ public:
/// Reports an argument count of 1.
size_t getNumberOfArguments() const override
{
    return 1;
}

/// Forwards the `is_injective` value; the argument columns are ignored.
bool isInjective(const ColumnsWithTypeAndName &) const override
{
    return is_injective;
}

/// Unconditionally answers `false`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return false;
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}

View File

@ -35,6 +35,7 @@ public:
/// Returns the value of `name`.
String getName() const override
{
    return name;
}

/// Reports an argument count of 1.
size_t getNumberOfArguments() const override
{
    return 1;
}

/// Unconditionally answers `false`.
bool isVariadic() const override
{
    return false;
}

/// Unconditionally answers `false`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return false;
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
@ -99,6 +100,7 @@ public:
/// Returns the value of `name`.
String getName() const override
{
    return name;
}

/// Reports an argument count of 0.
size_t getNumberOfArguments() const override
{
    return 0;
}

/// Unconditionally answers `true`.
bool isVariadic() const override
{
    return true;
}

/// Unconditionally answers `false`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return false;
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override

View File

@ -148,6 +148,7 @@ private:
/// Returns the value of `name`.
String getName() const override
{
    return name;
}

/// Unconditionally answers `true`.
bool isVariadic() const override
{
    return true;
}

/// Unconditionally answers `true`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return true;
}

/// Reports an argument count of 0.
size_t getNumberOfArguments() const override
{
    return 0;
}

/// Returns the single-element list {0}.
/// NOTE(review): presumably these are positions of arguments that must be constant; confirm against the base interface.
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
    return {0};
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}
@ -423,6 +424,7 @@ private:
/// Returns the value of `name`.
String getName() const override
{
    return name;
}

/// Unconditionally answers `true`.
bool isVariadic() const override
{
    return true;
}

/// Unconditionally answers `true`; the argument descriptions are ignored.
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override
{
    return true;
}

/// Reports an argument count of 0.
size_t getNumberOfArguments() const override
{
    return 0;
}

/// Returns the single-element list {0}.
/// NOTE(review): presumably these are positions of arguments that must be constant; confirm against the base interface.
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
    return {0};
}

/// Unconditionally answers `true`.
bool useDefaultImplementationForConstants() const override
{
    return true;
}

Some files were not shown because too many files have changed in this diff Show More