Merge branch 'master' into ADQM-830

This commit is contained in:
Alexey Gerasimchuk 2023-05-31 15:07:37 +10:00 committed by GitHub
commit 44ba35d2c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
175 changed files with 3589 additions and 1486 deletions

View File

@ -25,6 +25,9 @@ message(STATUS "Intel QPL version: ${QPL_VERSION}")
# Generate 8 library targets: middle_layer_lib, isal, isal_asm, qplcore_px, qplcore_avx512, qplcore_sw_dispatcher, core_iaa, middle_layer_lib.
# Output ch_contrib::qpl by linking with 8 library targets.
# The qpl submodule comes with its own version of isal. It contains code which does not exist in upstream isal. It would be nice to link
# only upstream isal (ch_contrib::isal) but at this point we can't.
include("${QPL_PROJECT_DIR}/cmake/CompileOptions.cmake")
# check nasm compiler
@ -308,7 +311,7 @@ target_include_directories(middle_layer_lib
target_compile_definitions(middle_layer_lib PUBLIC -DQPL_LIB)
# [SUBDIR]c_api
file(GLOB_RECURSE QPL_C_API_SRC
${QPL_SRC_DIR}/c_api/*.c
${QPL_SRC_DIR}/c_api/*.cpp)

View File

@ -1,6 +0,0 @@
# ARM (AArch64) build works on Amazon Graviton, Oracle Cloud, Huawei Cloud ARM machines.
# The support for AArch64 is pre-production ready.
wget 'https://builds.clickhouse.com/master/aarch64/clickhouse'
chmod a+x ./clickhouse
sudo ./clickhouse install

View File

@ -1,3 +0,0 @@
fetch 'https://builds.clickhouse.com/master/freebsd/clickhouse'
chmod a+x ./clickhouse
su -m root -c './clickhouse install'

View File

@ -1,3 +0,0 @@
wget 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse'
chmod a+x ./clickhouse
./clickhouse

View File

@ -1,3 +0,0 @@
wget 'https://builds.clickhouse.com/master/macos/clickhouse'
chmod a+x ./clickhouse
./clickhouse

View File

@ -43,7 +43,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
For other Linux distribution - check the availability of LLVM's [prebuild packages](https://releases.llvm.org/download.html).
As of April 2023, any version of Clang >= 15 will work.
GCC as a compiler is not supported.
To build with a specific Clang version:
:::tip
@ -114,18 +114,3 @@ mkdir build
cmake -S . -B build
cmake --build build
```
## You Don't Have to Build ClickHouse {#you-dont-have-to-build-clickhouse}
ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.
The CI checks build the binaries on each commit to [ClickHouse](https://github.com/clickhouse/clickhouse/). To download them:
1. Open the [commits list](https://github.com/ClickHouse/ClickHouse/commits/master)
1. Choose a **Merge pull request** commit that includes the new feature, or was added after the new feature
1. Click the status symbol (yellow dot, red x, green check) to open the CI check list
1. Scroll through the list until you find **ClickHouse build check x/x artifact groups are OK**
1. Click **Details**
1. Find the type of package for your operating system that you need and download the files.
![build artifact check](images/find-build-artifact.png)

View File

@ -131,14 +131,17 @@ CREATE TABLE table_with_asterisk (name String, value UInt32)
The following settings can be set before query execution or placed into configuration file.
- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `32Mb`.
- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `16Mb`.
- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`.
- `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`.
- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`.
- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`.
- `s3_upload_part_size_multiply_factor` - Multiply `s3_min_upload_part_size` by this factor each time `s3_upload_part_size_multiply_parts_count_threshold` parts were uploaded from a single write to S3. Default value is `2`.
- `s3_upload_part_size_multiply_parts_count_threshold` - Each time this number of parts was uploaded to S3, `s3_min_upload_part_size` is multiplied by `s3_upload_part_size_multiply_factor`. Default value is `500`.
- `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object. The value `0` means unlimited. Default value is `20`. Each in-flight part holds a buffer of size `s3_min_upload_part_size` for the first `s3_upload_part_size_multiply_parts_count_threshold` parts, and a larger one once the file is big enough (see `s3_upload_part_size_multiply_factor`). With default settings, one uploaded file consumes not more than `320Mb` if it is smaller than `8G`; the consumption is greater for a larger file (see the sketch below).
Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration.
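To make the `320Mb` estimate above concrete, here is a minimal, hypothetical C++ sketch (not ClickHouse code; the setting names appear only in comments) that derives the peak in-flight buffer memory from the default values, under the assumption that each in-flight part buffers the current part size:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Rough estimate of peak upload-buffer memory for one S3 multipart upload.
int main()
{
    const uint64_t MiB = 1024 * 1024;
    const uint64_t min_upload_part_size = 16 * MiB; // s3_min_upload_part_size
    const uint64_t multiply_factor = 2;             // s3_upload_part_size_multiply_factor
    const uint64_t parts_threshold = 500;           // s3_upload_part_size_multiply_parts_count_threshold
    const uint64_t max_inflight_parts = 20;         // s3_max_inflight_parts_for_one_file

    const uint64_t file_size = 7ULL * 1024 * MiB;   // a 7 GiB file, i.e. below the ~8G boundary

    uint64_t part_size = min_upload_part_size;
    uint64_t uploaded = 0, parts_in_tier = 0, peak = 0;
    while (uploaded < file_size)
    {
        peak = std::max(peak, max_inflight_parts * part_size); // buffers held by in-flight parts
        uploaded += part_size;
        if (++parts_in_tier == parts_threshold)
        {
            part_size *= multiply_factor; // part size grows after each threshold
            parts_in_tier = 0;
        }
    }
    std::cout << "peak buffer memory ~" << peak / MiB << " MiB\n"; // prints 320 for files under ~8G
}
```

For larger files the part size doubles every 500 parts, so the bound grows accordingly.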

View File

@ -1219,11 +1219,12 @@ Authentication parameters (the disk will try all available methods **and** Manag
* `account_name` and `account_key` - For authentication using Shared Key.
Limit parameters (mainly for internal usage):
* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
Other parameters:
* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.

View File

@ -36,8 +36,8 @@ The data is in CSV files but uses a semi-colon for the delimiter. The rows look
│ 7389 │ BMP180 │ 3735 │ 50.136 │ 11.062 │ 2019-06-01T00:00:06 │ 98905 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 12.1 │
│ 13199 │ BMP180 │ 6664 │ 52.514 │ 13.44 │ 2019-06-01T00:00:07 │ 101855.54 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.74 │
│ 12753 │ BMP180 │ 6440 │ 44.616 │ 2.032 │ 2019-06-01T00:00:07 │ 99475 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17 │
│ 16956 │ BMP180 │ 8594 │ 52.052 │ 8.354 │ 2019-06-01T00:00:08 │ 101322 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17.2 │
└───────────┴─────────────┴──────────┴────────┴───────┴─────────────────────┴──────────┴──────────┴───────────────────┴─────────────┘
```
2. We will use the following `MergeTree` table to store the data in ClickHouse:

View File

@ -28,23 +28,25 @@ The quickest and easiest way to get up and running with ClickHouse is to create
For production installs of a specific release version see the [installation options](#available-installation-options) down below.
:::
On Linux, macOS and FreeBSD:
1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server, clickhouse-client, clickhouse-local, ClickHouse Keeper, and other tools:
```bash
curl https://clickhouse.com/ | sh
```
1. Run the following command to start the ClickHouse server:
```bash
./clickhouse server
```
The first time you run this script, the necessary files and folders are created in the current directory, then the server starts.
1. Open a new terminal and use the **./clickhouse client** to connect to your service:
```bash
./clickhouse client
@ -330,7 +332,9 @@ For production environments, it's recommended to use the latest `stable`-versi
To run ClickHouse inside Docker follow the guide on [Docker Hub](https://hub.docker.com/r/clickhouse/clickhouse-server/). Those images use official `deb` packages inside.
## Non-Production Deployments (Advanced)
### Compile From Source {#from-sources}
To manually compile ClickHouse, follow the instructions for [Linux](/docs/en/development/build.md) or [macOS](/docs/en/development/build-osx.md).
@ -346,8 +350,33 @@ You'll need to create data and metadata folders manually and `chown` them for
On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources.
### Install a CI-generated Binary
ClickHouse binaries are built for each [commit](/docs/en/development/build.md#you-dont-have-to-build-clickhouse).
ClickHouse's continuous integration (CI) infrastructure produces specialized builds for each commit in the [ClickHouse
repository](https://github.com/clickhouse/clickhouse/), e.g. [sanitized](https://github.com/google/sanitizers) builds, unoptimized (Debug)
builds, cross-compiled builds etc. While such builds are normally only useful during development, they can in certain situations also be
interesting for users.
:::note
Since ClickHouse's CI is evolving over time, the exact steps to download CI-generated builds may vary.
Also, CI may delete too old build artifacts, making them unavailable for download.
:::
For example, to download an aarch64 binary for ClickHouse v23.4, follow these steps:
- Find the GitHub pull request for release v23.4: [Release pull request for branch 23.4](https://github.com/ClickHouse/ClickHouse/pull/49238)
- Click "Commits", then click a commit similar to "Update autogenerated version to 23.4.2.1 and contributors" for the particular version you like to install.
- Click the green check / yellow dot / red cross to open the list of CI checks.
- Click "Details" next to "ClickHouse Build Check" in the list, it will open a page similar to [this page](https://s3.amazonaws.com/clickhouse-test-reports/46793/b460eb70bf29b19eadd19a1f959b15d186705394/clickhouse_build_check/report.html)
- Find the rows with compiler = "clang-*-aarch64" - there are multiple rows.
- Download the artifacts for these builds.
To download binaries for very old x86-64 systems without [SSE3](https://en.wikipedia.org/wiki/SSE3) support or old ARM systems without
[ARMv8.1-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.1-A) support, open a [pull
request](https://github.com/ClickHouse/ClickHouse/commits/master) and find CI check "BuilderBinAmd64Compat", respectively
"BuilderBinAarch64V80Compat". Then click "Details", open the "Build" fold, scroll to the end, find message "Notice: Build URLs
https://s3.amazonaws.com/clickhouse/builds/PRs/.../.../binary_aarch64_v80compat/clickhouse". You can then click the link to download the
build.
## Launch {#launch}

File diff suppressed because it is too large

View File

@ -577,7 +577,7 @@ Default value: 20
**Usage**
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
## max_part_loading_threads {#max-part-loading-threads}
@ -840,4 +840,4 @@ Possible values:
- `Always` or `Never`.
Default value: `Never`

View File

@ -1187,6 +1187,36 @@ Disable limit on kafka_num_consumers that depends on the number of available CPU
Default value: false.
## postgresql_connection_pool_size {#postgresql-connection-pool-size}
Connection pool size for PostgreSQL table engine and database engine.
Default value: 16
## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout}
Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on empty pool.
Default value: 5000
## postgresql_connection_pool_auto_close_connection {#postgresql-connection-pool-auto-close-connection}
Close connection before returning connection to the pool.
Default value: true.
## odbc_bridge_connection_pool_size {#odbc-bridge-connection-pool-size}
Connection pool size for each connection settings string in ODBC bridge.
Default value: 16
## odbc_bridge_use_connection_pooling {#odbc-bridge-use-connection-pooling}
Use connection pooling in ODBC bridge. If set to false, a new connection is created every time.
Default value: true
## use_uncompressed_cache {#setting-use_uncompressed_cache}
Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled).
@ -3563,7 +3593,7 @@ SETTINGS index_granularity = 8192 │
## external_table_functions_use_nulls {#external-table-functions-use-nulls}
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns.
Possible values:

View File

@ -29,7 +29,7 @@ select first_value(b) from test_data
### example2
The NULL value is ignored.
```sql
select first_value(b) ignore nulls from test_data
```
```text

View File

@ -2234,7 +2234,7 @@ Result:
## Regular Expression Tree Dictionary {#regexp-tree-dictionary}
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of [user agent](https://en.wikipedia.org/wiki/User_agent) strings, which can be expressed elegantly with regexp tree dictionaries.
### Use Regular Expression Tree Dictionary in ClickHouse Open-Source
@ -2280,7 +2280,7 @@ This config consists of a list of regular expression tree nodes. Each node has t
- The value of an attribute may contain **back references**, referring to capture groups of the matched regular expression. In the example, the value of attribute `version` in the first node consists of a back-reference `\1` to capture group `(\d+[\.\d]*)` in the regular expression. Back-reference numbers range from 1 to 9 and are written as `$1` or `\1` (for number 1). The back reference is replaced by the matched capture group during query execution.
- **child nodes**: a list of children of a regexp tree node, each of which has its own attributes and (potentially) children nodes. String matching proceeds in a depth-first fashion. If a string matches a regexp node, the dictionary checks if it also matches the nodes' child nodes. If that is the case, the attributes of the deepest matching node are assigned. Attributes of a child node overwrite equally named attributes of parent nodes. The name of child nodes in YAML files can be arbitrary, e.g. `versions` in above example.
Regexp tree dictionaries only allow access using the functions `dictGet` and `dictGetOrDefault`.
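As a general aside on how such back references behave, here is a self-contained C++ illustration with `std::regex` (not ClickHouse's implementation): a capture group like `(\d+[\.\d]*)` is what `\1`/`$1` resolves to.

```cpp
#include <iostream>
#include <regex>
#include <string>

// Standalone illustration of capture groups and back references, in the same
// spirit as the `version` attribute described above.
int main()
{
    std::string user_agent = "Mozilla/5.0 ... Chrome/112.0.5615.138 Safari/537.36";
    std::regex re(R"(Chrome/(\d+[\.\d]*))");
    std::smatch match;
    if (std::regex_search(user_agent, match, re))
        std::cout << "version = " << match[1] << '\n'; // match[1] is the first capture group, i.e. what \1 refers to
}
```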
Example:

View File

@ -34,7 +34,7 @@ For the `SAMPLE` clause the following syntax is supported:
| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) |
## SAMPLE K {#select-sample-k}
Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`.
@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000
In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10.
## SAMPLE N {#select-sample-n}
Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`.
@ -90,7 +90,7 @@ FROM visits
SAMPLE 10000000
```
## SAMPLE K OFFSET M {#select-sample-offset}
Here `k` and `m` are numbers from 0 to 1. Examples are shown below.

View File

@ -1137,6 +1137,16 @@
<ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
-->
<!--
ORDER BY expr: https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#order_by
Example:
event_date, event_time
event_date, type, query_id
event_date, event_time, initial_query_id
<order_by>event_date, event_time, initial_query_id</order_by>
-->
<!-- Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
Example: <engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
-->

View File

@ -152,6 +152,13 @@ public:
nested_func->merge(place, rhs, arena);
}
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{
nested_func->merge(place, rhs, thread_pool, arena);
}
void mergeBatch(
size_t row_begin,
size_t row_end,

View File

@ -59,16 +59,31 @@ UInt64 BackupEntryFromImmutableFile::getSize() const
UInt128 BackupEntryFromImmutableFile::getChecksum() const
{
{
std::lock_guard lock{size_and_checksum_mutex};
if (checksum_adjusted)
return *checksum;
if (checksum)
{
if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum_adjusted = true;
return *checksum;
}
}
auto calculated_checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
{
std::lock_guard lock{size_and_checksum_mutex};
if (!checksum_adjusted)
{
checksum = calculated_checksum;
checksum_adjusted = true;
}
return *checksum;
}
}
std::optional<UInt128> BackupEntryFromImmutableFile::getPartialChecksum(size_t prefix_length) const
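The restructured getChecksum() above follows a compute-outside-the-lock pattern; a generic, illustrative sketch of that pattern (not ClickHouse code, all names here are made up):

```cpp
#include <cstdint>
#include <mutex>
#include <optional>

// Check the cache under the lock, run the expensive computation without the
// lock, then install the result under the lock unless another thread won the race.
template <typename Compute>
uint64_t getOrCompute(std::mutex & mutex, std::optional<uint64_t> & cached, Compute compute)
{
    {
        std::lock_guard lock{mutex};
        if (cached)
            return *cached;
    }

    uint64_t value = compute(); // potentially slow; no lock held here

    std::lock_guard lock{mutex};
    if (!cached)
        cached = value;
    return *cached;
}
```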

View File

@ -44,7 +44,7 @@ private:
const DataSourceDescription data_source_description;
const bool copy_encrypted;
mutable std::optional<UInt64> file_size;
mutable std::optional<UInt128> checksum;
mutable bool file_size_adjusted = false;
mutable bool checksum_adjusted = false;
mutable std::mutex size_and_checksum_mutex;

View File

@ -8,15 +8,32 @@ namespace DB
template <typename Base>
UInt128 BackupEntryWithChecksumCalculation<Base>::getChecksum() const
{
{
std::lock_guard lock{checksum_calculation_mutex};
if (calculated_checksum)
return *calculated_checksum;
}
size_t size = this->getSize();
{
std::lock_guard lock{checksum_calculation_mutex};
if (!calculated_checksum)
{
if (size == 0)
{
calculated_checksum = 0;
}
else
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(size));
HashingReadBuffer hashing_read_buffer(*read_buffer);
hashing_read_buffer.ignoreAll();
calculated_checksum = hashing_read_buffer.getHash();
}
}
return *calculated_checksum;
}
}
template <typename Base>

View File

@ -0,0 +1,350 @@
#include <gtest/gtest.h>
#include <Backups/BackupEntryFromAppendOnlyFile.h>
#include <Backups/BackupEntryFromImmutableFile.h>
#include <Backups/BackupEntryFromSmallFile.h>
#include <Disks/IDisk.h>
#include <Disks/DiskLocal.h>
#include <Disks/DiskEncrypted.h>
#include <IO/FileEncryptionCommon.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/TemporaryFile.h>
using namespace DB;
class BackupEntriesTest : public ::testing::Test
{
protected:
void SetUp() override
{
/// Make local disk.
temp_dir = std::make_unique<Poco::TemporaryFile>();
temp_dir->createDirectories();
local_disk = std::make_shared<DiskLocal>("local_disk", temp_dir->path() + "/", 0);
/// Make encrypted disk.
auto settings = std::make_unique<DiskEncryptedSettings>();
settings->wrapped_disk = local_disk;
settings->current_algorithm = FileEncryption::Algorithm::AES_128_CTR;
settings->keys[0] = "1234567890123456";
settings->current_key_id = 0;
settings->disk_path = "encrypted/";
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings), true);
}
void TearDown() override
{
encrypted_disk.reset();
local_disk.reset();
}
static void writeFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
writeString(std::string_view{"Some text"}, *buf);
buf->finalize();
}
static void writeEmptyFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
buf->finalize();
}
static void appendFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append, {});
writeString(std::string_view{"Appended"}, *buf);
buf->finalize();
}
static String getChecksum(const BackupEntryPtr & backup_entry)
{
return getHexUIntUppercase(backup_entry->getChecksum());
}
static const constexpr std::string_view NO_CHECKSUM = "no checksum";
static String getPartialChecksum(const BackupEntryPtr & backup_entry, size_t prefix_length)
{
auto partial_checksum = backup_entry->getPartialChecksum(prefix_length);
if (!partial_checksum)
return String{NO_CHECKSUM};
return getHexUIntUppercase(*partial_checksum);
}
static String readAll(const BackupEntryPtr & backup_entry)
{
auto in = backup_entry->getReadBuffer({});
String str;
readStringUntilEOF(str, *in);
return str;
}
std::unique_ptr<Poco::TemporaryFile> temp_dir;
std::shared_ptr<DiskLocal> local_disk;
std::shared_ptr<DiskEncrypted> encrypted_disk;
};
static const constexpr std::string_view ZERO_CHECKSUM = "00000000000000000000000000000000";
static const constexpr std::string_view SOME_TEXT_CHECKSUM = "28B5529750AC210952FFD366774363ED";
static const constexpr std::string_view S_CHECKSUM = "C27395C39AFB5557BFE47661CC9EB86C";
static const constexpr std::string_view SOME_TEX_CHECKSUM = "D00D9BE8D87919A165F14EDD31088A0E";
static const constexpr std::string_view SOME_TEXT_APPENDED_CHECKSUM = "5A1F10F638DC7A226231F3FD927D1726";
static const constexpr std::string_view PRECALCULATED_CHECKSUM = "1122334455667788AABBCCDDAABBCCDD";
static const constexpr UInt128 PRECALCULATED_CHECKSUM_UINT128 = (UInt128(0x1122334455667788) << 64) | 0xAABBCCDDAABBCCDD;
static const size_t PRECALCULATED_SIZE = 123;
TEST_F(BackupEntriesTest, BackupEntryFromImmutableFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
writeEmptyFile(local_disk, "empty.txt");
auto empty_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "empty.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE - 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromAppendOnlyFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
appendFile(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
auto appended_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(appended_entry->getSize(), 17);
EXPECT_EQ(getChecksum(appended_entry), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 22), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1000), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(readAll(appended_entry), "Some textAppended");
writeEmptyFile(local_disk, "empty_appended.txt");
auto empty_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
appendFile(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
}
TEST_F(BackupEntriesTest, PartialChecksumBeforeFullChecksum)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromSmallFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromSmallFile>(local_disk, "a.txt");
local_disk->removeFile("a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, DecryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
std::pair<BackupEntryPtr, bool /* partial_checksum_allowed */> test_cases[]
= {{std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt"), false},
{std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt"), true},
{std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt"), true}};
for (const auto & [entry, partial_checksum_allowed] : test_cases)
{
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), partial_checksum_allowed ? S_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), partial_checksum_allowed ? SOME_TEX_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt")};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
}
TEST_F(BackupEntriesTest, EncryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true)};
auto encrypted_checksum = getChecksum(entries[0]);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
auto partial_checksum = getPartialChecksum(entries[1], 9);
EXPECT_NE(partial_checksum, NO_CHECKSUM);
EXPECT_NE(partial_checksum, ZERO_CHECKSUM);
EXPECT_NE(partial_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(partial_checksum, encrypted_checksum);
auto encrypted_data = readAll(entries[0]);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 9 + FileEncryption::Header::kSize);
EXPECT_EQ(getChecksum(entry), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
auto encrypted_checksum_9 = getPartialChecksum(entry, 9);
EXPECT_TRUE(encrypted_checksum_9 == NO_CHECKSUM || encrypted_checksum_9 == partial_checksum);
EXPECT_EQ(getPartialChecksum(entry, 9 + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 1000), encrypted_checksum);
EXPECT_EQ(readAll(entry), encrypted_data);
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true)};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE + FileEncryption::Header::kSize);
auto encrypted_checksum = getChecksum(precalculated_entry);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(encrypted_checksum, PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), encrypted_checksum);
auto encrypted_data = readAll(precalculated_entry);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
}
}

View File

@ -121,7 +121,7 @@ ConnectionEstablisherAsync::ConnectionEstablisherAsync(
epoll.add(timeout_descriptor.getDescriptor());
}
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, SuspendCallback)
{
connection_establisher_async.reset();
connection_establisher_async.connection_establisher.setAsyncCallback(async_callback);

View File

@ -91,7 +91,7 @@ private:
ConnectionEstablisherAsync & connection_establisher_async;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
void cancelAfter() override;

View File

@ -57,7 +57,7 @@ bool PacketReceiver::checkTimeout()
return true;
}
void PacketReceiver::Task::run(AsyncCallback async_callback, SuspendCallback suspend_callback)
{
while (true)
{

View File

@ -57,7 +57,7 @@ private:
PacketReceiver & receiver;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
/// When epoll file descriptor is ready, check if it's an expired timeout.

View File

@ -433,7 +433,7 @@ const String & AsyncLoader::getPoolName(size_t pool) const
return pools[pool].name; // NOTE: lock is not needed because `name` is const and `pools` are immutable
}
Priority AsyncLoader::getPoolPriority(size_t pool) const
{
return pools[pool].priority; // NOTE: lock is not needed because `priority` is const and `pools` are immutable
}
@ -576,7 +576,7 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::un
{
Pool & old_pool = pools[job->pool_id];
Pool & new_pool = pools[new_pool_id];
if (old_pool.priority <= new_pool.priority)
return; // Never lower priority or change pool leaving the same priority
// Update priority and push job forward through ready queue if needed
@ -590,7 +590,7 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::un
spawn(new_pool, lock);
}
// Set user-facing pool (may affect executing jobs)
job->pool_id.store(new_pool_id);
// Recurse into dependencies
@ -621,7 +621,7 @@ bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
return is_running
&& !pool.ready_queue.empty()
&& pool.workers < pool.max_threads
&& (!current_priority || *current_priority >= pool.priority);
}
bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
@ -629,17 +629,17 @@ bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
return is_running
&& !pool.ready_queue.empty()
&& pool.workers <= pool.max_threads
&& (!current_priority || *current_priority >= pool.priority);
}
void AsyncLoader::updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> & lock)
{
// Find current priority.
// NOTE: We assume low number of pools, so O(N) scans are fine.
std::optional<Priority> priority;
for (Pool & pool : pools)
{
if (pool.isActive() && (!priority || *priority > pool.priority))
priority = pool.priority;
}
current_priority = priority;
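A note on the flipped comparisons above: the `Priority` type is assumed to order pools so that a smaller value means a more urgent pool. A minimal sketch of such a type (an illustrative assumption, not the actual `Common/Priority.h`):

```cpp
#include <compare>
#include <cstdint>

// Hypothetical "lower value wins" priority. Under this ordering,
// `*current_priority >= pool.priority` reads as "this pool is at least as
// urgent as the most urgent active pool", which is why the comparison
// operators changed direction relative to the old ssize_t-based code.
struct Priority
{
    int64_t value = 0;
    auto operator<=>(const Priority &) const = default; // Priority{0} < Priority{1}
};
```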

View File

@ -11,6 +11,7 @@
#include <boost/noncopyable.hpp>
#include <base/types.h>
#include <Common/CurrentMetrics.h>
#include <Common/Priority.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool_fwd.h>
@ -268,10 +269,10 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks job dependencies and priorities.
// Basic usage example:
// // Start async_loader with two thread pools (0=fg, 1=bg):
// AsyncLoader async_loader({
// {"FgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 2, .priority{0}}
// {"BgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 1, .priority{1}}
// });
//
// // Create and schedule a task consisting of three jobs. Job1 has no dependencies and is run first.
@ -279,19 +280,19 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// auto job_func = [&] (const LoadJobPtr & self) {
// LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, async_loader->getPoolName(self->pool()));
// };
// auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 1, job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 1, job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", /* pool_id = */ 1, job_func);
// auto task = makeLoadTask(async_loader, { job1, job2, job3 });
// task.schedule();
//
// // Another thread may prioritize a job by changing its pool and wait for it:
// async_loader->prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
// job3->wait(); // Blocks until job completion or cancellation and rethrow an exception (if any)
//
// Every job has a pool associated with it. AsyncLoader starts every job in its thread pool.
// Each pool has a constant priority and a mutable maximum number of threads.
// Higher priority (lower `pool.priority` value) jobs are run first.
// No job with lower priority is started while there is at least one higher priority job ready or running.
//
// Job priority can be elevated (but cannot be lowered)
@ -301,7 +302,8 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// this also leads to a priority inheritance for all the dependencies.
// Value stored in load job `pool_id` field is atomic and can be changed even during job execution.
// Job is, of course, not moved from its initial thread pool, but it should use `self->pool()` for
// all new jobs it create to avoid priority inversion. To obtain pool in which job is being executed
// call `self->execution_pool()` instead.
//
// === IMPLEMENTATION DETAILS ===
// All possible states and statuses of a job:
@ -335,7 +337,7 @@ private:
struct Pool
{
const String name;
const Priority priority;
std::unique_ptr<ThreadPool> thread_pool; // NOTE: we avoid using a `ThreadPool` queue to be able to move jobs between pools.
std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
size_t max_threads; // Max number of workers to be spawn
@ -367,7 +369,7 @@ public:
Metric metric_threads;
Metric metric_active_threads;
size_t max_threads;
Priority priority;
};
AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_);
@ -412,7 +414,7 @@ public:
size_t getMaxThreads(size_t pool) const;
const String & getPoolName(size_t pool) const;
Priority getPoolPriority(size_t pool) const;
size_t getScheduledJobCount() const;
@ -451,7 +453,7 @@ private:
mutable std::mutex mutex; // Guards all the fields below.
bool is_running = true;
std::optional<Priority> current_priority; // highest priority among active pools
UInt64 last_ready_seqno = 0; // Increasing counter for ready queue keys.
std::unordered_map<LoadJobPtr, Info> scheduled_jobs; // Full set of scheduled pending jobs along with scheduling info.
std::vector<Pool> pools; // Thread pools for job execution and ready queues

View File

@ -3,18 +3,11 @@
namespace DB
{
thread_local FiberInfo current_fiber_info;
AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr<AsyncTask> task_) : task(std::move(task_))
{
createFiber();
}
FiberInfo AsyncTaskExecutor::getCurrentFiberInfo()
{
return current_fiber_info;
}
void AsyncTaskExecutor::resume()
{
if (routine_is_finished)
@ -38,10 +31,7 @@ void AsyncTaskExecutor::resume()
void AsyncTaskExecutor::resumeUnlocked()
{
fiber.resume();
}
void AsyncTaskExecutor::cancel()
@ -69,30 +59,19 @@ struct AsyncTaskExecutor::Routine
struct AsyncCallback struct AsyncCallback
{ {
AsyncTaskExecutor & executor; AsyncTaskExecutor & executor;
Fiber & fiber; SuspendCallback suspend_callback;
void operator()(int fd, Poco::Timespan timeout, AsyncEventTimeoutType type, const std::string & desc, uint32_t events) void operator()(int fd, Poco::Timespan timeout, AsyncEventTimeoutType type, const std::string & desc, uint32_t events)
{ {
executor.processAsyncEvent(fd, timeout, type, desc, events); executor.processAsyncEvent(fd, timeout, type, desc, events);
fiber = std::move(fiber).resume(); suspend_callback();
executor.clearAsyncEvent(); executor.clearAsyncEvent();
} }
}; };
struct ResumeCallback void operator()(SuspendCallback suspend_callback)
{ {
Fiber & fiber; auto async_callback = AsyncCallback{executor, suspend_callback};
void operator()()
{
fiber = std::move(fiber).resume();
}
};
Fiber operator()(Fiber && sink)
{
auto async_callback = AsyncCallback{executor, sink};
auto suspend_callback = ResumeCallback{sink};
try try
{ {
executor.task->run(async_callback, suspend_callback); executor.task->run(async_callback, suspend_callback);
@ -110,18 +89,17 @@ struct AsyncTaskExecutor::Routine
} }
executor.routine_is_finished = true; executor.routine_is_finished = true;
return std::move(sink);
} }
}; };
void AsyncTaskExecutor::createFiber() void AsyncTaskExecutor::createFiber()
{ {
fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); fiber = Fiber(fiber_stack, Routine{*this});
} }
void AsyncTaskExecutor::destroyFiber() void AsyncTaskExecutor::destroyFiber()
{ {
boost::context::fiber to_destroy = std::move(fiber); Fiber to_destroy = std::move(fiber);
} }
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description) String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description)

View File

@ -22,7 +22,7 @@ enum class AsyncEventTimeoutType
}; };
using AsyncCallback = std::function<void(int, Poco::Timespan, AsyncEventTimeoutType, const std::string &, uint32_t)>; using AsyncCallback = std::function<void(int, Poco::Timespan, AsyncEventTimeoutType, const std::string &, uint32_t)>;
using ResumeCallback = std::function<void()>; using SuspendCallback = std::function<void()>;
struct FiberInfo struct FiberInfo
{ {
@ -38,7 +38,7 @@ struct FiberInfo
struct AsyncTask struct AsyncTask
{ {
public: public:
virtual void run(AsyncCallback async_callback, ResumeCallback suspend_callback) = 0; virtual void run(AsyncCallback async_callback, SuspendCallback suspend_callback) = 0;
virtual ~AsyncTask() = default; virtual ~AsyncTask() = default;
}; };
@ -80,7 +80,6 @@ public:
}; };
#endif #endif
static FiberInfo getCurrentFiberInfo();
protected: protected:
/// Method that is called in resume() before actual fiber resuming. /// Method that is called in resume() before actual fiber resuming.
/// If it returns false, resume() will return immediately without actual fiber resuming. /// If it returns false, resume() will return immediately without actual fiber resuming.
@ -124,48 +123,6 @@ private:
std::unique_ptr<AsyncTask> task; std::unique_ptr<AsyncTask> task;
}; };
/// Simple implementation for fiber local variable.
template <typename T>
struct FiberLocal
{
public:
FiberLocal()
{
/// Initialize main instance for this thread. Instances for fibers will inherit it,
/// (it's needed because main instance could be changed before creating fibers
/// and changes should be visible in fibers).
data[nullptr] = T();
}
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
T & get()
{
return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo());
}
T & getInstanceForFiber(FiberInfo info)
{
auto it = data.find(info.fiber);
/// If it's the first request, we need to initialize instance for the fiber
/// using instance from parent fiber or main thread that created fiber.
if (it == data.end())
it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first;
return it->second;
}
std::unordered_map<const Fiber *, T> data;
};
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description);
} }
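For reference, here is a self-contained sketch of the suspend-callback pattern that `AsyncTask::run(async_callback, suspend_callback)` relies on, written directly against `boost::context::fiber`; the toy task and all names are illustrative, not the real executor code.

```cpp
#include <boost/context/fiber.hpp>
#include <functional>
#include <iostream>

namespace ctx = boost::context;
using SuspendCallback = std::function<void()>;

int main()
{
    // The task body only sees a SuspendCallback, like AsyncTask::run above:
    // calling it yields control back to the code that resumed the fiber.
    auto task = [](SuspendCallback suspend)
    {
        std::cout << "step 1\n";
        suspend();
        std::cout << "step 2\n";
    };

    ctx::fiber routine([&](ctx::fiber && sink)
    {
        SuspendCallback suspend = [&] { sink = std::move(sink).resume(); };
        task(suspend);
        return std::move(sink);
    });

    routine = std::move(routine).resume(); // prints "step 1", then the task suspends
    std::cout << "back in caller\n";
    routine = std::move(routine).resume(); // prints "step 2", the task finishes
}
```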

View File

@ -3,5 +3,147 @@
/// BOOST_USE_ASAN, BOOST_USE_TSAN and BOOST_USE_UCONTEXT should be correctly defined for sanitizers. /// BOOST_USE_ASAN, BOOST_USE_TSAN and BOOST_USE_UCONTEXT should be correctly defined for sanitizers.
#include <base/defines.h> #include <base/defines.h>
#include <boost/context/fiber.hpp> #include <boost/context/fiber.hpp>
#include <map>
/// Class wrapper for boost::context::fiber.
/// It tracks the currently executing fiber for the thread and
/// supports storing fiber-specific data
/// that will be destroyed in the Fiber destructor.
class Fiber
{
private:
using Impl = boost::context::fiber;
using FiberPtr = Fiber *;
template <typename T> friend class FiberLocal;
public:
template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn)))
{
}
Fiber() = default;
Fiber(Fiber && other) = default;
Fiber & operator=(Fiber && other) = default;
Fiber(const Fiber &) = delete;
Fiber & operator =(const Fiber &) = delete;
explicit operator bool() const
{
return impl.operator bool();
}
void resume()
{
/// Update information about the currently executing fiber.
FiberPtr & current_fiber = getCurrentFiber();
FiberPtr parent_fiber = current_fiber;
current_fiber = this;
impl = std::move(impl).resume();
/// Restore parent fiber.
current_fiber = parent_fiber;
}
private:
template <typename Fn>
struct RoutineImpl
{
struct SuspendCallback
{
Impl & impl;
void operator()()
{
impl = std::move(impl).resume();
}
};
explicit RoutineImpl(Fn && fn_) : fn(std::move(fn_))
{
}
Impl operator()(Impl && sink)
{
SuspendCallback suspend_callback{sink};
fn(suspend_callback);
return std::move(sink);
}
Fn fn;
};
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
/// Special wrapper to store data in a unique_ptr.
struct DataWrapper
{
virtual ~DataWrapper() = default;
};
using DataPtr = std::unique_ptr<DataWrapper>;
/// Get reference to fiber-specific data by key
/// (the pointer to the structure that uses this data).
DataPtr & getLocalData(void * key)
{
return local_data[key];
}
Impl && release()
{
return std::move(impl);
}
Impl impl;
std::map<void *, DataPtr> local_data;
};
/// Implementation for fiber local variable.
/// If we are in a fiber, it returns fiber-local data,
/// otherwise it returns its single field.
/// Fiber local data is destroyed in the Fiber destructor.
/// The implementation is similar to boost::fiber::fiber_specific_ptr
/// (we cannot use it because we don't use the boost::fiber API).
template <typename T>
class FiberLocal
{
public:
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
struct DataWrapperImpl : public Fiber::DataWrapper
{
T impl;
};
T & get()
{
Fiber * current_fiber = Fiber::getCurrentFiber();
if (!current_fiber)
return main_instance;
Fiber::DataPtr & ptr = current_fiber->getLocalData(this);
/// Initialize instance on first request.
if (!ptr)
ptr = std::make_unique<DataWrapperImpl>();
return dynamic_cast<DataWrapperImpl *>(ptr.get())->impl;
}
T main_instance;
};
using Fiber = boost::context::fiber;
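A simplified, standalone model of the lookup that `FiberLocal::get()` performs: the real class stores type-erased `DataWrapper` objects inside the `Fiber` and uses the thread-local current-fiber pointer above, while this sketch fakes the fiber so the example stays runnable.

```cpp
#include <iostream>
#include <map>
#include <memory>

// Toy stand-in for a fiber that owns per-fiber storage keyed by the address of
// the FiberLocal-like variable (the real code keeps this map inside class Fiber).
struct FakeFiber
{
    std::map<void *, std::shared_ptr<void>> local_data;
};

thread_local FakeFiber * current_fiber = nullptr;

template <typename T>
class FiberLocalSketch
{
public:
    T & get()
    {
        if (!current_fiber)
            return main_instance;                      // not inside a fiber
        auto & slot = current_fiber->local_data[this]; // per-fiber, per-variable slot
        if (!slot)
            slot = std::make_shared<T>();              // lazily created, dies with the fiber
        return *std::static_pointer_cast<T>(slot);
    }

private:
    T main_instance;
};

int main()
{
    FiberLocalSketch<int> counter;
    counter.get() = 1;            // main-thread instance

    FakeFiber fiber;
    current_fiber = &fiber;       // pretend we resumed a fiber
    counter.get() = 42;           // fiber-local instance
    current_fiber = nullptr;      // pretend the fiber suspended

    std::cout << counter.get() << '\n'; // prints 1: the main instance is untouched
}
```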

View File

@ -15,9 +15,8 @@ namespace DB
namespace OpenTelemetry namespace OpenTelemetry
{ {
///// This code can be executed inside several fibers in one thread, /// This code can be executed inside fibers, we should use fiber local tracing context.
///// we should use fiber local tracing context. thread_local FiberLocal<TracingContextOnThread> current_trace_context;
thread_local FiberLocal<TracingContextOnThread> current_fiber_trace_context;
bool Span::addAttribute(std::string_view name, UInt64 value) noexcept bool Span::addAttribute(std::string_view name, UInt64 value) noexcept
{ {
@ -109,7 +108,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc
SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
{ {
if (!current_fiber_trace_context->isTraceEnabled()) if (!current_trace_context->isTraceEnabled())
{ {
return; return;
} }
@ -117,8 +116,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
/// Use try-catch to make sure the ctor is exception safe. /// Use try-catch to make sure the ctor is exception safe.
try try
{ {
this->trace_id = current_fiber_trace_context->trace_id; this->trace_id = current_trace_context->trace_id;
this->parent_span_id = current_fiber_trace_context->span_id; this->parent_span_id = current_trace_context->span_id;
this->span_id = thread_local_rng(); // create a new id for this span this->span_id = thread_local_rng(); // create a new id for this span
this->operation_name = _operation_name; this->operation_name = _operation_name;
this->kind = _kind; this->kind = _kind;
@ -137,7 +136,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
} }
/// Set current span as parent of other spans created later on this thread. /// Set current span as parent of other spans created later on this thread.
current_fiber_trace_context->span_id = this->span_id; current_trace_context->span_id = this->span_id;
} }
void SpanHolder::finish() noexcept void SpanHolder::finish() noexcept
@ -146,12 +145,12 @@ void SpanHolder::finish() noexcept
return; return;
// First of all, restore old value of current span. // First of all, restore old value of current span.
assert(current_fiber_trace_context->span_id == span_id); assert(current_trace_context->span_id == span_id);
current_fiber_trace_context->span_id = parent_span_id; current_trace_context->span_id = parent_span_id;
try try
{ {
auto log = current_fiber_trace_context->span_log.lock(); auto log = current_trace_context->span_log.lock();
/// The log might be disabled, check it before use /// The log might be disabled, check it before use
if (log) if (log)
@ -274,7 +273,7 @@ void TracingContext::serialize(WriteBuffer & buf) const
const TracingContextOnThread & CurrentContext() const TracingContextOnThread & CurrentContext()
{ {
return *current_fiber_trace_context; return *current_trace_context;
} }
void TracingContextOnThread::reset() noexcept void TracingContextOnThread::reset() noexcept
@ -296,7 +295,7 @@ TracingContextHolder::TracingContextHolder(
/// If any exception is raised during the construction, the tracing is not enabled on current thread. /// If any exception is raised during the construction, the tracing is not enabled on current thread.
try try
{ {
if (current_fiber_trace_context->isTraceEnabled()) if (current_trace_context->isTraceEnabled())
{ {
/// ///
/// This is not the normal case, /// This is not the normal case,
@ -309,15 +308,15 @@ TracingContextHolder::TracingContextHolder(
/// So this branch ensures this class can be instantiated multiple times on one same thread safely. /// So this branch ensures this class can be instantiated multiple times on one same thread safely.
/// ///
this->is_context_owner = false; this->is_context_owner = false;
this->root_span.trace_id = current_fiber_trace_context->trace_id; this->root_span.trace_id = current_trace_context->trace_id;
this->root_span.parent_span_id = current_fiber_trace_context->span_id; this->root_span.parent_span_id = current_trace_context->span_id;
this->root_span.span_id = thread_local_rng(); this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name; this->root_span.operation_name = _operation_name;
this->root_span.start_time_us this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count(); = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
/// Set the root span as parent of other spans created on current thread /// Set the root span as parent of other spans created on current thread
current_fiber_trace_context->span_id = this->root_span.span_id; current_trace_context->span_id = this->root_span.span_id;
return; return;
} }
@ -361,10 +360,10 @@ TracingContextHolder::TracingContextHolder(
} }
/// Set up trace context on current thread only when the root span is successfully initialized. /// Set up trace context on current thread only when the root span is successfully initialized.
*current_fiber_trace_context = _parent_trace_context; *current_trace_context = _parent_trace_context;
current_fiber_trace_context->span_id = this->root_span.span_id; current_trace_context->span_id = this->root_span.span_id;
current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; current_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_fiber_trace_context->span_log = _span_log; current_trace_context->span_log = _span_log;
} }
TracingContextHolder::~TracingContextHolder() TracingContextHolder::~TracingContextHolder()
@ -376,7 +375,7 @@ TracingContextHolder::~TracingContextHolder()
try try
{ {
auto shared_span_log = current_fiber_trace_context->span_log.lock(); auto shared_span_log = current_trace_context->span_log.lock();
if (shared_span_log) if (shared_span_log)
{ {
try try
@ -407,11 +406,11 @@ TracingContextHolder::~TracingContextHolder()
if (this->is_context_owner) if (this->is_context_owner)
{ {
/// Clear the context on current thread /// Clear the context on current thread
current_fiber_trace_context->reset(); current_trace_context->reset();
} }
else else
{ {
current_fiber_trace_context->span_id = this->root_span.parent_span_id; current_trace_context->span_id = this->root_span.parent_span_id;
} }
} }
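To illustrate the save/restore of `span_id` that `SpanHolder` performs on the fiber-local context, here is a toy, standalone model (plain `thread_local` state instead of `FiberLocal`, simplified names, no span log):

```cpp
#include <cstdint>
#include <iostream>
#include <random>

struct TraceContext
{
    uint64_t trace_id = 0;
    uint64_t span_id = 0;
};

thread_local TraceContext current_trace_context;

struct SpanSketch
{
    uint64_t span_id;
    uint64_t parent_span_id;

    SpanSketch()
    {
        static std::mt19937_64 rng{42};
        parent_span_id = current_trace_context.span_id;
        span_id = rng();
        current_trace_context.span_id = span_id; // spans created later attach to this span
    }

    ~SpanSketch()
    {
        current_trace_context.span_id = parent_span_id; // restore the parent on finish
    }
};

int main()
{
    current_trace_context = TraceContext{1, 100};
    {
        SpanSketch outer;
        {
            SpanSketch inner;
            std::cout << "inner parent: " << inner.parent_span_id << '\n'; // outer.span_id
        }
        std::cout << "restored to outer: " << current_trace_context.span_id << '\n';
    }
    std::cout << "restored to root: " << current_trace_context.span_id << '\n'; // 100
}
```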

View File

@ -8,6 +8,9 @@
M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \ M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \
M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(SelectQuery, "Same as Query, but only for SELECT queries.") \
M(InsertQuery, "Same as Query, but only for INSERT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \
M(QueriesWithSubqueries, "Count queries with all subqueries") \
M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \
M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \
M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \ M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \
M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \ M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \
M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.") \ M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.") \
@ -366,7 +369,7 @@ The server successfully detected this situation and will download merged part fr
M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \ M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \
M(WriteBufferFromS3Bytes, "Bytes written to S3.") \ M(WriteBufferFromS3Bytes, "Bytes written to S3.") \
M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.") \ M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.") \
\ M(WriteBufferFromS3WaitInflightLimitMicroseconds, "Time spent on waiting while some of the current requests are done when its number reached the limit defined by s3_max_inflight_parts_for_one_file.") \
M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \ M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \
\ \
M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)") \ M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)") \

View File

@ -92,7 +92,7 @@ public:
String getName() const override { return LogElement::name(); } String getName() const override { return LogElement::name(); }
static const char * getDefaultOrderBy() { return "(event_date, event_time)"; } static const char * getDefaultOrderBy() { return "event_date, event_time"; }
protected: protected:
Poco::Logger * log; Poco::Logger * log;

View File

@ -32,7 +32,7 @@ namespace DB::ErrorCodes
struct Initializer { struct Initializer {
size_t max_threads = 1; size_t max_threads = 1;
ssize_t priority = 0; Priority priority;
}; };
struct AsyncLoaderTest struct AsyncLoaderTest
@ -144,11 +144,11 @@ struct AsyncLoaderTest
TEST(AsyncLoader, Smoke) TEST(AsyncLoader, Smoke)
{ {
AsyncLoaderTest t({ AsyncLoaderTest t({
{.max_threads = 2, .priority = 0}, {.max_threads = 2, .priority = Priority{0}},
{.max_threads = 2, .priority = -1}, {.max_threads = 2, .priority = Priority{1}},
}); });
static constexpr ssize_t low_priority_pool = 1; static constexpr size_t low_priority_pool = 1;
std::atomic<size_t> jobs_done{0}; std::atomic<size_t> jobs_done{0};
std::atomic<size_t> low_priority_jobs_done{0}; std::atomic<size_t> low_priority_jobs_done{0};
@ -419,6 +419,8 @@ TEST(AsyncLoader, CancelExecutingTask)
} }
} }
// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` function
// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482
TEST(AsyncLoader, DISABLED_JobFailure) TEST(AsyncLoader, DISABLED_JobFailure)
{ {
AsyncLoaderTest t; AsyncLoaderTest t;
@ -595,16 +597,16 @@ TEST(AsyncLoader, TestOverload)
TEST(AsyncLoader, StaticPriorities) TEST(AsyncLoader, StaticPriorities)
{ {
AsyncLoaderTest t({ AsyncLoaderTest t({
{.max_threads = 1, .priority = 0}, {.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority = 1}, {.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority = 2}, {.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority = 3}, {.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority = 4}, {.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority = 5}, {.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority = 6}, {.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority = 7}, {.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority = 8}, {.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority = 9}, {.max_threads = 1, .priority{-9}},
}); });
std::string schedule; std::string schedule;
@ -614,6 +616,15 @@ TEST(AsyncLoader, StaticPriorities)
schedule += fmt::format("{}{}", self->name, self->pool()); schedule += fmt::format("{}{}", self->name, self->pool());
}; };
// Job DAG with priorities. After priority inheritance from H9, jobs D9 and E9 can be
// executed in undefined order (Tested further in DynamicPriorities)
// A0(9) -+-> B3
// |
// `-> C4
// |
// `-> D1(9) -.
// | +-> F0(9) --> G0(9) --> H9
// `-> E2(9) -'
std::vector<LoadJobPtr> jobs; std::vector<LoadJobPtr> jobs;
jobs.push_back(makeLoadJob({}, 0, "A", job_func)); // 0 jobs.push_back(makeLoadJob({}, 0, "A", job_func)); // 0
jobs.push_back(makeLoadJob({ jobs[0] }, 3, "B", job_func)); // 1 jobs.push_back(makeLoadJob({ jobs[0] }, 3, "B", job_func)); // 1
@ -627,16 +638,15 @@ TEST(AsyncLoader, StaticPriorities)
t.loader.start(); t.loader.start();
t.loader.wait(); t.loader.wait();
ASSERT_TRUE(schedule == "A9E9D9F9G9H9C4B3" || schedule == "A9D9E9F9G9H9C4B3");
ASSERT_EQ(schedule, "A9E9D9F9G9H9C4B3");
} }
TEST(AsyncLoader, SimplePrioritization) TEST(AsyncLoader, SimplePrioritization)
{ {
AsyncLoaderTest t({ AsyncLoaderTest t({
{.max_threads = 1, .priority = 0}, {.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority = 1}, {.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority = 2}, {.max_threads = 1, .priority{-2}},
}); });
t.loader.start(); t.loader.start();
@ -674,16 +684,16 @@ TEST(AsyncLoader, SimplePrioritization)
TEST(AsyncLoader, DynamicPriorities) TEST(AsyncLoader, DynamicPriorities)
{ {
AsyncLoaderTest t({ AsyncLoaderTest t({
{.max_threads = 1, .priority = 0}, {.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority = 1}, {.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority = 2}, {.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority = 3}, {.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority = 4}, {.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority = 5}, {.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority = 6}, {.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority = 7}, {.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority = 8}, {.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority = 9}, {.max_threads = 1, .priority{-9}},
}); });
for (bool prioritize : {false, true}) for (bool prioritize : {false, true})
@ -890,8 +900,8 @@ TEST(AsyncLoader, DynamicPools)
const size_t max_threads[] { 2, 10 }; const size_t max_threads[] { 2, 10 };
const int jobs_in_chain = 16; const int jobs_in_chain = 16;
AsyncLoaderTest t({ AsyncLoaderTest t({
{.max_threads = max_threads[0], .priority = 0}, {.max_threads = max_threads[0], .priority{0}},
{.max_threads = max_threads[1], .priority = 1}, {.max_threads = max_threads[1], .priority{-1}},
}); });
t.loader.start(); t.loader.start();

View File

@ -47,7 +47,8 @@ struct Settings;
M(Bool, compress_snapshots_with_zstd_format, true, "Write compressed snapshots in ZSTD format (instead of custom LZ4)", 0) \ M(Bool, compress_snapshots_with_zstd_format, true, "Write compressed snapshots in ZSTD format (instead of custom LZ4)", 0) \
M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) \ M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) \
M(UInt64, max_log_file_size, 50 * 1024 * 1024, "Max size of the Raft log file. If possible, each created log file will preallocate this amount of bytes on disk. Set to 0 to disable the limit", 0) \ M(UInt64, max_log_file_size, 50 * 1024 * 1024, "Max size of the Raft log file. If possible, each created log file will preallocate this amount of bytes on disk. Set to 0 to disable the limit", 0) \
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \
M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -471,17 +471,6 @@ void KeeperServer::shutdown()
namespace namespace
{ {
// Serialize the request with all the necessary information for the leader
// we don't know ZXID and digest yet so we don't serialize it
nuraft::ptr<nuraft::buffer> getZooKeeperRequestMessage(const KeeperStorage::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
return write_buf.getBuffer();
}
// Serialize the request for the log entry // Serialize the request for the log entry
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session) nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session)
{ {
@ -489,12 +478,11 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestFor
DB::writeIntBinary(request_for_session.session_id, write_buf); DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf); request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf); DB::writeIntBinary(request_for_session.time, write_buf);
DB::writeIntBinary(request_for_session.zxid, write_buf); /// we fill with dummy values to eliminate unnecessary copy later on when we will write correct values
assert(request_for_session.digest); DB::writeIntBinary(static_cast<int64_t>(0), write_buf); /// zxid
DB::writeIntBinary(request_for_session.digest->version, write_buf); DB::writeIntBinary(KeeperStorage::DigestVersion::NO_DIGEST, write_buf); /// digest version or NO_DIGEST flag
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST) DB::writeIntBinary(static_cast<uint64_t>(0), write_buf); /// digest value
DB::writeIntBinary(request_for_session.digest->value, write_buf); /// if new fields are added, update KeeperStateMachine::ZooKeeperLogSerializationVersion along with parseRequest function and PreAppendLog callback handler
return write_buf.getBuffer(); return write_buf.getBuffer();
} }
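The dummy zxid/digest values above reserve a fixed-size trailer so the buffer never has to be reallocated when the real values are filled in by the PreAppendLog callback. A standalone sketch of that "reserve a trailer, patch it in place" technique, using a plain `std::vector` instead of nuraft buffers and a simplified field layout:

```cpp
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Append a raw integer to the buffer (stand-in for writeIntBinary).
template <typename T>
void appendInt(std::vector<char> & buf, T value)
{
    const auto * p = reinterpret_cast<const char *>(&value);
    buf.insert(buf.end(), p, p + sizeof(T));
}

int main()
{
    std::vector<char> entry;
    appendInt<int64_t>(entry, 123);    // session_id
    // ... the serialized request payload would go here ...

    // Reserve the trailer with dummy values, like the zxid/digest placeholders above.
    const size_t trailer_offset = entry.size();
    appendInt<int64_t>(entry, 0);      // zxid placeholder
    appendInt<uint64_t>(entry, 0);     // digest placeholder

    // Later (e.g. in a pre-append hook) patch the trailer in place, no reallocation needed.
    int64_t zxid = 42;
    uint64_t digest = 0xabcdef;
    std::memcpy(entry.data() + trailer_offset, &zxid, sizeof(zxid));
    std::memcpy(entry.data() + trailer_offset + sizeof(zxid), &digest, sizeof(digest));

    std::cout << "entry size: " << entry.size() << " bytes\n";
}
```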
@ -512,9 +500,7 @@ RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForS
{ {
std::vector<nuraft::ptr<nuraft::buffer>> entries; std::vector<nuraft::ptr<nuraft::buffer>> entries;
for (const auto & request_for_session : requests_for_sessions) for (const auto & request_for_session : requests_for_sessions)
{ entries.push_back(getZooKeeperLogEntry(request_for_session));
entries.push_back(getZooKeeperRequestMessage(request_for_session));
}
std::lock_guard lock{server_write_mutex}; std::lock_guard lock{server_write_mutex};
if (is_recovering) if (is_recovering)
@ -635,14 +621,50 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log); assert(entry->get_val_type() == nuraft::app_log);
auto next_zxid = state_machine->getNextZxid(); auto next_zxid = state_machine->getNextZxid();
auto & entry_buf = entry->get_buf(); auto entry_buf = entry->get_buf_ptr();
auto request_for_session = state_machine->parseRequest(entry_buf);
request_for_session.zxid = next_zxid; KeeperStateMachine::ZooKeeperLogSerializationVersion serialization_version;
if (!state_machine->preprocess(request_for_session)) auto request_for_session = state_machine->parseRequest(*entry_buf, /*final=*/false, &serialization_version);
request_for_session->zxid = next_zxid;
if (!state_machine->preprocess(*request_for_session))
return nuraft::cb_func::ReturnCode::ReturnNull; return nuraft::cb_func::ReturnCode::ReturnNull;
request_for_session.digest = state_machine->getNodesDigest(); request_for_session->digest = state_machine->getNodesDigest();
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), getZooKeeperLogEntry(request_for_session), entry->get_val_type());
/// older versions of Keeper can send logs that are missing some fields
size_t bytes_missing = 0;
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
bytes_missing += sizeof(request_for_session->time);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_ZXID_DIGEST)
bytes_missing += sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (bytes_missing != 0)
{
auto new_buffer = nuraft::buffer::alloc(entry_buf->size() + bytes_missing);
memcpy(new_buffer->data_begin(), entry_buf->data_begin(), entry_buf->size());
entry_buf = std::move(new_buffer);
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), entry_buf, entry->get_val_type());
}
size_t write_buffer_header_size
= sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
write_buffer_header_size += sizeof(request_for_session->time);
auto * buffer_start = reinterpret_cast<BufferBase::Position>(entry_buf->data_begin() + entry_buf->size() - write_buffer_header_size);
WriteBuffer write_buf(buffer_start, write_buffer_header_size);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
writeIntBinary(request_for_session->time, write_buf);
writeIntBinary(request_for_session->zxid, write_buf);
writeIntBinary(request_for_session->digest->version, write_buf);
if (request_for_session->digest->version != KeeperStorage::NO_DIGEST)
writeIntBinary(request_for_session->digest->value, write_buf);
break; break;
} }
case nuraft::cb_func::AppendLogFailed: case nuraft::cb_func::AppendLogFailed:
@ -654,8 +676,8 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log); assert(entry->get_val_type() == nuraft::app_log);
auto & entry_buf = entry->get_buf(); auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf); auto request_for_session = state_machine->parseRequest(entry_buf, true);
state_machine->rollbackRequest(request_for_session, true); state_machine->rollbackRequest(*request_for_session, true);
break; break;
} }
default: default:

View File

@ -1,16 +1,16 @@
#include <cerrno> #include <cerrno>
#include <base/errnoToString.h>
#include <base/defines.h>
#include <future> #include <future>
#include <Coordination/KeeperSnapshotManager.h> #include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStateMachine.h> #include <Coordination/KeeperStateMachine.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h> #include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h> #include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h> #include <IO/ReadHelpers.h>
#include <base/defines.h>
#include <base/errnoToString.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <Common/ProfileEvents.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h> #include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h> #include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/ProfileEvents.h>
#include <Common/logger_useful.h> #include <Common/logger_useful.h>
#include "Coordination/KeeperStorage.h" #include "Coordination/KeeperStorage.h"
@ -60,6 +60,7 @@ KeeperStateMachine::KeeperStateMachine(
coordination_settings->dead_session_check_period_ms.totalMilliseconds()) coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_) , responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_) , snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(coordination_settings_->min_request_size_for_cache)
, last_committed_idx(0) , last_committed_idx(0)
, log(&Poco::Logger::get("KeeperStateMachine")) , log(&Poco::Logger::get("KeeperStateMachine"))
, superdigest(superdigest_) , superdigest(superdigest_)
@ -149,19 +150,19 @@ void assertDigest(
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data) nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{ {
auto request_for_session = parseRequest(data); auto request_for_session = parseRequest(data, /*final=*/false);
if (!request_for_session.zxid) if (!request_for_session->zxid)
request_for_session.zxid = log_idx; request_for_session->zxid = log_idx;
preprocess(request_for_session); preprocess(*request_for_session);
return nullptr; return nullptr;
} }
KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer & data) std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version)
{ {
ReadBufferFromNuraftBuffer buffer(data); ReadBufferFromNuraftBuffer buffer(data);
KeeperStorage::RequestForSession request_for_session; auto request_for_session = std::make_shared<KeeperStorage::RequestForSession>();
readIntBinary(request_for_session.session_id, buffer); readIntBinary(request_for_session->session_id, buffer);
int32_t length; int32_t length;
Coordination::read(length, buffer); Coordination::read(length, buffer);
@ -169,29 +170,81 @@ KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer
int32_t xid; int32_t xid;
Coordination::read(xid, buffer); Coordination::read(xid, buffer);
static constexpr std::array non_cacheable_xids{
Coordination::WATCH_XID,
Coordination::PING_XID,
Coordination::AUTH_XID,
Coordination::CLOSE_XID,
};
const bool should_cache
= min_request_size_to_cache != 0 && request_for_session->session_id != -1 && data.size() >= min_request_size_to_cache
&& std::all_of(
non_cacheable_xids.begin(), non_cacheable_xids.end(), [&](const auto non_cacheable_xid) { return xid != non_cacheable_xid; });
if (should_cache)
{
std::lock_guard lock(request_cache_mutex);
if (auto xid_to_request_it = parsed_request_cache.find(request_for_session->session_id);
xid_to_request_it != parsed_request_cache.end())
{
auto & xid_to_request = xid_to_request_it->second;
if (auto request_it = xid_to_request.find(xid); request_it != xid_to_request.end())
{
if (final)
{
auto request = std::move(request_it->second);
xid_to_request.erase(request_it);
return request;
}
else
return request_it->second;
}
}
}
Coordination::OpNum opnum; Coordination::OpNum opnum;
Coordination::read(opnum, buffer); Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum); request_for_session->request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session.request->xid = xid; request_for_session->request->xid = xid;
request_for_session.request->readImpl(buffer); request_for_session->request->readImpl(buffer);
if (!buffer.eof()) using enum ZooKeeperLogSerializationVersion;
readIntBinary(request_for_session.time, buffer); ZooKeeperLogSerializationVersion version = INITIAL;
else /// backward compatibility
request_for_session.time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
readIntBinary(request_for_session.zxid, buffer);
if (!buffer.eof()) if (!buffer.eof())
{ {
request_for_session.digest.emplace(); version = WITH_TIME;
readIntBinary(request_for_session.digest->version, buffer); readIntBinary(request_for_session->time, buffer);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST) }
readIntBinary(request_for_session.digest->value, buffer); else
request_for_session->time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
{
version = WITH_ZXID_DIGEST;
readIntBinary(request_for_session->zxid, buffer);
chassert(!buffer.eof());
request_for_session->digest.emplace();
readIntBinary(request_for_session->digest->version, buffer);
if (request_for_session->digest->version != KeeperStorage::DigestVersion::NO_DIGEST || !buffer.eof())
readIntBinary(request_for_session->digest->value, buffer);
}
if (serialization_version)
*serialization_version = version;
if (should_cache && !final)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache[request_for_session->session_id].emplace(xid, request_for_session);
} }
return request_for_session; return request_for_session;
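A standalone sketch of the parsed-request cache used above (simplified types, illustrative only): requests are keyed by session id and then by xid, a `final` lookup erases the entry because commit/rollback is the last time it is needed, and the whole session is dropped on Close.

```cpp
#include <cstdint>
#include <iostream>
#include <memory>
#include <mutex>
#include <unordered_map>

struct ParsedRequest { int64_t xid = 0; /* parsed fields would live here */ };
using ParsedRequestPtr = std::shared_ptr<ParsedRequest>;

class RequestCacheSketch
{
public:
    ParsedRequestPtr find(int64_t session_id, int64_t xid, bool final)
    {
        std::lock_guard lock(mutex);
        auto session_it = cache.find(session_id);
        if (session_it == cache.end())
            return nullptr;
        auto request_it = session_it->second.find(xid);
        if (request_it == session_it->second.end())
            return nullptr;
        auto request = request_it->second;
        if (final)
            session_it->second.erase(request_it); // last use: drop it to keep memory low
        return request;
    }

    void insert(int64_t session_id, int64_t xid, ParsedRequestPtr request)
    {
        std::lock_guard lock(mutex);
        cache[session_id].emplace(xid, std::move(request));
    }

    void eraseSession(int64_t session_id) // mirrors the cleanup on Close above
    {
        std::lock_guard lock(mutex);
        cache.erase(session_id);
    }

private:
    std::mutex mutex; // protects only the maps; a request is handled by one thread at a time
    std::unordered_map<int64_t, std::unordered_map<int64_t, ParsedRequestPtr>> cache;
};

int main()
{
    RequestCacheSketch cache;
    cache.insert(/*session*/ 7, /*xid*/ 1, std::make_shared<ParsedRequest>(ParsedRequest{1}));
    auto a = cache.find(7, 1, /*final=*/false); // pre-commit: keep it cached
    auto b = cache.find(7, 1, /*final=*/true);  // commit: fetch and erase
    auto c = cache.find(7, 1, /*final=*/false); // now gone
    std::cout << (a != nullptr) << (b != nullptr) << (c == nullptr) << '\n'; // prints 111
}
```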
@ -231,15 +284,15 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data) nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{ {
auto request_for_session = parseRequest(data); auto request_for_session = parseRequest(data, true);
if (!request_for_session.zxid) if (!request_for_session->zxid)
request_for_session.zxid = log_idx; request_for_session->zxid = log_idx;
/// Special processing of session_id request /// Special processing of session_id request
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID) if (request_for_session->request->getOpNum() == Coordination::OpNum::SessionID)
{ {
const Coordination::ZooKeeperSessionIDRequest & session_id_request const Coordination::ZooKeeperSessionIDRequest & session_id_request
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request); = dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session->request);
int64_t session_id; int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>(); std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id; response->internal_id = session_id_request.internal_id;
@ -261,25 +314,34 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
} }
else else
{ {
if (request_for_session->request->getOpNum() == Coordination::OpNum::Close)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache.erase(request_for_session->session_id);
}
std::lock_guard lock(storage_and_responses_lock); std::lock_guard lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest( KeeperStorage::ResponsesForSessions responses_for_sessions
request_for_session.request, request_for_session.session_id, request_for_session.zxid); = storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions) for (auto & response_for_session : responses_for_sessions)
if (!responses_queue.push(response_for_session)) if (!responses_queue.push(response_for_session))
{ {
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed); ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response_for_session.session_id); LOG_WARNING(
log,
"Failed to push response with session id {} to the queue, probably because of shutdown",
response_for_session.session_id);
} }
if (keeper_context->digest_enabled && request_for_session.digest) if (keeper_context->digest_enabled && request_for_session->digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true); assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true);
} }
ProfileEvents::increment(ProfileEvents::KeeperCommits); ProfileEvents::increment(ProfileEvents::KeeperCommits);
last_committed_idx = log_idx; last_committed_idx = log_idx;
if (commit_callback) if (commit_callback)
commit_callback(request_for_session); commit_callback(*request_for_session);
return nullptr; return nullptr;
} }
@ -330,14 +392,14 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data) void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
{ {
auto request_for_session = parseRequest(data); auto request_for_session = parseRequest(data, true);
// If we received a log from an older node, use the log_idx as the zxid // If we received a log from an older node, use the log_idx as the zxid
// log_idx will always be larger or equal to the zxid so we can safely do this // log_idx will always be larger or equal to the zxid so we can safely do this
// (log_idx is increased for all logs, while zxid is only increased for requests) // (log_idx is increased for all logs, while zxid is only increased for requests)
if (!request_for_session.zxid) if (!request_for_session->zxid)
request_for_session.zxid = log_idx; request_for_session->zxid = log_idx;
rollbackRequest(request_for_session, false); rollbackRequest(*request_for_session, false);
} }
void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing) void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing)
@ -541,11 +603,7 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
/// Pure local request, just process it with storage /// Pure local request, just process it with storage
std::lock_guard lock(storage_and_responses_lock); std::lock_guard lock(storage_and_responses_lock);
auto responses = storage->processRequest( auto responses = storage->processRequest(
request_for_session.request, request_for_session.request, request_for_session.session_id, std::nullopt, true /*check_acl*/, true /*is_local*/);
request_for_session.session_id,
std::nullopt,
true /*check_acl*/,
true /*is_local*/);
for (const auto & response : responses) for (const auto & response : responses)
if (!responses_queue.push(response)) if (!responses_queue.push(response))
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response.session_id); LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response.session_id);

View File

@ -36,7 +36,22 @@ public:
/// Read state from the latest snapshot /// Read state from the latest snapshot
void init(); void init();
static KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data); enum ZooKeeperLogSerializationVersion
{
INITIAL = 0,
WITH_TIME = 1,
WITH_ZXID_DIGEST = 2,
};
/// lifetime of a parsed request is:
/// [preprocess/PreAppendLog -> commit]
/// [preprocess/PreAppendLog -> rollback]
/// on events like commit and rollback we can remove the parsed request to keep memory usage at a minimum
/// the request cache is also cleaned on session close in case something strange happened
///
/// final - whether it's the final time we will fetch the request so we can safely remove it from cache
/// serialization_version - information about which fields were parsed from the buffer so we can modify the buffer accordingly
std::shared_ptr<KeeperStorage::RequestForSession> parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version = nullptr);
bool preprocess(const KeeperStorage::RequestForSession & request_for_session); bool preprocess(const KeeperStorage::RequestForSession & request_for_session);
@ -138,6 +153,13 @@ private:
/// for request. /// for request.
mutable std::mutex storage_and_responses_lock; mutable std::mutex storage_and_responses_lock;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorage::RequestForSession>>> parsed_request_cache;
uint64_t min_request_size_to_cache{0};
/// we only need to protect the access to the map itself
/// requests can be modified from anywhere without lock because a single request
/// can be processed only in 1 thread at any point
std::mutex request_cache_mutex;
/// Last committed Raft log number. /// Last committed Raft log number.
std::atomic<uint64_t> last_committed_idx; std::atomic<uint64_t> last_committed_idx;

View File

@ -110,7 +110,7 @@ public:
struct RequestForSession struct RequestForSession
{ {
int64_t session_id; int64_t session_id;
int64_t time; int64_t time{0};
Coordination::ZooKeeperRequestPtr request; Coordination::ZooKeeperRequestPtr request;
int64_t zxid{0}; int64_t zxid{0};
std::optional<Digest> digest; std::optional<Digest> digest;

View File

@ -1,5 +1,4 @@
#include <Coordination/WriteBufferFromNuraftBuffer.h> #include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Common/logger_useful.h>
namespace DB namespace DB
{ {

View File

@ -78,6 +78,7 @@ class IColumn;
M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3 s3_min_upload_part_size multiplied by s3_upload_part_size_multiply_factor.", 0) \ M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3 s3_min_upload_part_size multiplied by s3_upload_part_size_multiply_factor.", 0) \
M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of concurrently loaded parts in a multipart upload request. 0 means unlimited.", 0) \
M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \
M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \
@ -93,6 +94,7 @@ class IColumn;
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(UInt64, s3_retry_attempts, 10, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \ M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \ M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \ M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \

View File

@ -188,12 +188,12 @@ try
try try
{ {
file->write(payload.data(), payload.size()); file->write(payload.data(), payload.size());
file->finalize();
} }
catch (...) catch (...)
{ {
/// Log current exception, because finalize() can throw a different exception. /// Log current exception, because finalize() can throw a different exception.
tryLogCurrentException(__PRETTY_FUNCTION__); tryLogCurrentException(__PRETTY_FUNCTION__);
file->finalize();
throw; throw;
} }
} }

View File

@ -146,7 +146,8 @@ std::unique_ptr<S3::Client> getClient(
S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config); S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config);
client_configuration.retryStrategy client_configuration.retryStrategy
= std::make_shared<Aws::Client::DefaultRetryStrategy>(config.getUInt(config_prefix + ".retry_attempts", 10)); = std::make_shared<Aws::Client::DefaultRetryStrategy>(
config.getUInt64(config_prefix + ".retry_attempts", settings.request_settings.retry_attempts));
return S3::ClientFactory::instance().create( return S3::ClientFactory::instance().create(
client_configuration, client_configuration,

View File

@ -1230,8 +1230,11 @@ public:
/// The case when arguments are the same (tautological comparison). Return constant. /// The case when arguments are the same (tautological comparison). Return constant.
/// NOTE: Nullable types are special case. /// NOTE: Nullable types are special case.
/// (BTW, this function uses the default implementation for Nullable, so Nullable types cannot be here. Check just in case.) /// (BTW, this function uses the default implementation for Nullable, so Nullable types cannot be here. Check just in case.)
/// NOTE: We consider NaN comparison to be implementation specific (and in our implementation NaNs are sometimes equal sometimes not). if (left_type->equals(*right_type) &&
if (left_type->equals(*right_type) && !left_type->isNullable() && !isTuple(left_type) && col_left_untyped == col_right_untyped) !left_type->isNullable() &&
!isTuple(left_type) &&
!WhichDataType(left_type).isFloat() &&
col_left_untyped == col_right_untyped)
{ {
ColumnPtr result_column; ColumnPtr result_column;
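The extra `!WhichDataType(left_type).isFloat()` check is needed because self-comparison is not a tautology for floating-point columns that may contain NaN:

```cpp
#include <iostream>
#include <limits>

int main()
{
    double nan = std::numeric_limits<double>::quiet_NaN();
    // A column compared with itself is NOT a tautology for floating-point types:
    // IEEE 754 NaN is unordered, so x == x is false and x != x is true when x is NaN.
    std::cout << std::boolalpha
              << (nan == nan) << '\n'   // false
              << (nan != nan) << '\n';  // true
}
```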

View File

@ -2,6 +2,7 @@
#include <Common/ErrorCodes.h> #include <Common/ErrorCodes.h>
#include <Common/Exception.h> #include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/ResourceRequest.h> #include <IO/ResourceRequest.h>
#include <Poco/Util/AbstractConfiguration.h> #include <Poco/Util/AbstractConfiguration.h>
@ -37,7 +38,7 @@ inline const Poco::Util::AbstractConfiguration & emptyConfig()
struct SchedulerNodeInfo struct SchedulerNodeInfo
{ {
double weight = 1.0; /// Weight of this node among its siblings double weight = 1.0; /// Weight of this node among its siblings
Int64 priority = 0; /// Priority of this node among its siblings (higher value means higher priority) Priority priority; /// Priority of this node among its siblings (lower value means higher priority)
/// Arbitrary data accessed/stored by parent /// Arbitrary data accessed/stored by parent
union { union {
@ -65,7 +66,7 @@ struct SchedulerNodeInfo
void setPriority(Int64 value) void setPriority(Int64 value)
{ {
priority = value; priority.value = value;
} }
}; };

View File

@ -12,7 +12,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
{ {
if (whence == SEEK_SET) if (whence == SEEK_SET)
{ {
if (offset >= 0 && internal_buffer.begin() + offset < internal_buffer.end()) if (offset >= 0 && internal_buffer.begin() + offset <= internal_buffer.end())
{ {
pos = internal_buffer.begin() + offset; pos = internal_buffer.begin() + offset;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek(). working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
@ -25,7 +25,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
else if (whence == SEEK_CUR) else if (whence == SEEK_CUR)
{ {
Position new_pos = pos + offset; Position new_pos = pos + offset;
if (new_pos >= internal_buffer.begin() && new_pos < internal_buffer.end()) if (new_pos >= internal_buffer.begin() && new_pos <= internal_buffer.end())
{ {
pos = new_pos; pos = new_pos;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek(). working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
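The change relaxes the boundary check from `<` to `<=` so that seeking exactly to the end of the buffer (the EOF position) succeeds. A standalone illustration of the boundary condition:

```cpp
#include <cstddef>
#include <iostream>

// Seeking to offset == size must be accepted and leaves the position at end-of-buffer.
int main()
{
    const char buffer[] = {'a', 'b', 'c'};
    const char * begin = buffer;
    const char * end = buffer + sizeof(buffer);

    std::size_t offset = sizeof(buffer); // seek exactly to the end (EOF position)

    bool old_check = begin + offset < end;   // false: the old code rejected this seek
    bool new_check = begin + offset <= end;  // true: the fixed code allows it
    std::cout << std::boolalpha << old_check << ' ' << new_check << '\n'; // false true
}
```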

View File

@ -26,12 +26,12 @@ class PriorityPolicy : public ISchedulerNode
struct Item struct Item
{ {
ISchedulerNode * child = nullptr; ISchedulerNode * child = nullptr;
Int64 priority = 0; // higher value means higher priority Priority priority; // lower value means higher priority
/// For max-heap by priority /// For max-heap by priority
bool operator<(const Item& rhs) const noexcept bool operator<(const Item& rhs) const noexcept
{ {
return priority < rhs.priority; return priority > rhs.priority; // Reversed for heap top to yield highest priority (lowest value) child first
} }
}; };
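Since max-heaps pop the largest element under `operator<`, inverting the comparison makes the child with the lowest priority value (i.e. the highest priority) surface first. A small self-contained demonstration using `std::priority_queue`:

```cpp
#include <cstdint>
#include <iostream>
#include <queue>

struct Item
{
    const char * name;
    int64_t priority; // lower value means higher priority

    bool operator<(const Item & rhs) const noexcept
    {
        return priority > rhs.priority; // reversed on purpose for the max-heap
    }
};

int main()
{
    std::priority_queue<Item> items;
    items.push({"C", 1});
    items.push({"A", 3});
    items.push({"B", 2});
    std::cout << items.top().name << '\n'; // C: priority value 1 is served first
}
```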

View File

@ -22,9 +22,9 @@ TEST(IOResourcePriorityPolicy, Priorities)
ResourceTest t; ResourceTest t;
t.add<PriorityPolicy>("/"); t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>"); t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>"); t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>"); t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10}); t.enqueue("/A", {10, 10, 10});
t.enqueue("/B", {10, 10, 10}); t.enqueue("/B", {10, 10, 10});
@ -56,9 +56,9 @@ TEST(IOResourcePriorityPolicy, Activation)
ResourceTest t; ResourceTest t;
t.add<PriorityPolicy>("/"); t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>"); t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>"); t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>"); t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10, 10, 10, 10}); t.enqueue("/A", {10, 10, 10, 10, 10, 10});
t.enqueue("/B", {10}); t.enqueue("/B", {10});

View File

@ -49,7 +49,7 @@ TEST(IOResourceStaticResourceManager, Prioritization)
{ {
// Lock is not required here because this is called during request execution and we have max_requests = 1 // Lock is not required here because this is called during request execution and we have max_requests = 1
if (last_priority) if (last_priority)
EXPECT_TRUE(priority <= *last_priority); // Should be true if every queue arrived at the same time at busy period start EXPECT_TRUE(priority >= *last_priority); // Should be true if every queue arrived at the same time at busy period start
last_priority = priority; last_priority = priority;
}; };
@ -63,8 +63,8 @@ TEST(IOResourceStaticResourceManager, Prioritization)
<res1> <res1>
<node path="/"> <type>inflight_limit</type><max_requests>1</max_requests></node> <node path="/"> <type>inflight_limit</type><max_requests>1</max_requests></node>
<node path="/prio"> <type>priority</type></node> <node path="/prio"> <type>priority</type></node>
<node path="/prio/A"> <priority>-1</priority></node> <node path="/prio/A"> <priority>1</priority></node>
<node path="/prio/B"> <priority>1</priority></node> <node path="/prio/B"> <priority>-1</priority></node>
<node path="/prio/C"> </node> <node path="/prio/C"> </node>
<node path="/prio/D"> </node> <node path="/prio/D"> </node>
<node path="/prio/leader"></node> <node path="/prio/leader"></node>

View File

@ -92,8 +92,11 @@ WriteBufferFromS3::WriteBufferFromS3(
, write_settings(write_settings_) , write_settings(write_settings_)
, client_ptr(std::move(client_ptr_)) , client_ptr(std::move(client_ptr_))
, object_metadata(std::move(object_metadata_)) , object_metadata(std::move(object_metadata_))
, buffer_allocation_policy(ChooseBufferPolicy(request_settings_.getUploadSettings())) , buffer_allocation_policy(ChooseBufferPolicy(upload_settings))
, task_tracker(std::make_unique<WriteBufferFromS3::TaskTracker>(std::move(schedule_))) , task_tracker(
std::make_unique<WriteBufferFromS3::TaskTracker>(
std::move(schedule_),
upload_settings.max_inflight_parts_for_one_file))
{ {
LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails()); LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails());
@ -109,8 +112,11 @@ void WriteBufferFromS3::nextImpl()
ErrorCodes::LOGICAL_ERROR, ErrorCodes::LOGICAL_ERROR,
"Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest"); "Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest");
/// Make sense to call to before adding new async task to check if there is an exception /// It makes sense to call waitIfAny before adding a new async task, to check if there is an exception
task_tracker->waitReady(); /// The sooner the exception is propagated, the less time is spent on cancellation
/// Despite the fact that `task_tracker->add()` collects task statuses and propagates their exceptions,
/// that call is necessary for the case when there is no in-flight limitation and therefore `task_tracker->add()` doesn't wait for anything
task_tracker->waitIfAny();
hidePartialData(); hidePartialData();
@ -134,7 +140,8 @@ void WriteBufferFromS3::preFinalize()
LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails()); LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails());
task_tracker->waitReady(); /// This function should not be run again if an exception has occurred
is_prefinalized = true;
hidePartialData(); hidePartialData();
@ -166,8 +173,6 @@ void WriteBufferFromS3::preFinalize()
{ {
writeMultipartUpload(); writeMultipartUpload();
} }
is_prefinalized = true;
} }
void WriteBufferFromS3::finalizeImpl() void WriteBufferFromS3::finalizeImpl()
@ -212,8 +217,8 @@ String WriteBufferFromS3::getLogDetails() const
multipart_upload_details = fmt::format(", upload id {}, upload has finished {}" multipart_upload_details = fmt::format(", upload id {}, upload has finished {}"
, multipart_upload_id, multipart_upload_finished); , multipart_upload_id, multipart_upload_finished);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, finalized {}{}", return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, prefinalized {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), finalized, multipart_upload_details); bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), is_prefinalized, finalized, multipart_upload_details);
} }
void WriteBufferFromS3::tryToAbortMultipartUpload() void WriteBufferFromS3::tryToAbortMultipartUpload()
@ -234,7 +239,7 @@ WriteBufferFromS3::~WriteBufferFromS3()
{ {
LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails()); LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails());
// That descructor could be call with finalized=false in case of exceptions // That destructor could be called with finalized=false in case of exceptions
if (!finalized) if (!finalized)
{ {
LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails()); LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails());


@ -4,12 +4,18 @@
#include <IO/WriteBufferFromS3TaskTracker.h> #include <IO/WriteBufferFromS3TaskTracker.h>
namespace ProfileEvents
{
extern const Event WriteBufferFromS3WaitInflightLimitMicroseconds;
}
namespace DB namespace DB
{ {
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_) WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_)
: is_async(bool(scheduler_)) : is_async(bool(scheduler_))
, scheduler(scheduler_ ? std::move(scheduler_) : syncRunner()) , scheduler(scheduler_ ? std::move(scheduler_) : syncRunner())
, max_tasks_inflight(max_tasks_inflight_)
{} {}
WriteBufferFromS3::TaskTracker::~TaskTracker() WriteBufferFromS3::TaskTracker::~TaskTracker()
@ -28,36 +34,6 @@ ThreadPoolCallbackRunner<void> WriteBufferFromS3::TaskTracker::syncRunner()
}; };
} }
void WriteBufferFromS3::TaskTracker::waitReady()
{
LOG_TEST(log, "waitReady, in queue {}", futures.size());
/// Exceptions are propagated
auto it = futures.begin();
while (it != futures.end())
{
chassert(it->valid());
if (it->wait_for(std::chrono::seconds(0)) != std::future_status::ready)
{
++it;
continue;
}
try
{
it->get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
it = futures.erase(it);
}
LOG_TEST(log, "waitReady ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::waitAll() void WriteBufferFromS3::TaskTracker::waitAll()
{ {
LOG_TEST(log, "waitAll, in queue {}", futures.size()); LOG_TEST(log, "waitAll, in queue {}", futures.size());
@ -65,66 +41,145 @@ void WriteBufferFromS3::TaskTracker::waitAll()
/// Exceptions are propagated /// Exceptions are propagated
for (auto & future : futures) for (auto & future : futures)
{ {
try future.get();
{
future.get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
} }
futures.clear(); futures.clear();
std::lock_guard lock(mutex);
finished_futures.clear();
} }
void WriteBufferFromS3::TaskTracker::safeWaitAll() void WriteBufferFromS3::TaskTracker::safeWaitAll()
{ {
LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size()); LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size());
/// Exceptions are not propagated
for (auto & future : futures)
{
LOG_TEST(log, "safeWaitAll, wait future");
if (future.valid())
future.wait();
}
LOG_TEST(log, "safeWaitAll, get in queue {}", futures.size());
for (auto & future : futures) for (auto & future : futures)
{ {
if (future.valid()) if (future.valid())
{ {
try try
{ {
/// Exceptions are not propagated
future.get(); future.get();
} catch (...) } catch (...)
{ {
/// But at least they are printed
tryLogCurrentException(__PRETTY_FUNCTION__); tryLogCurrentException(__PRETTY_FUNCTION__);
} }
} }
} }
futures.clear(); futures.clear();
LOG_TEST(log, "safeWaitAll ended, get in queue {}", futures.size());
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::waitIfAny()
{
LOG_TEST(log, "waitIfAny, in queue {}", futures.size());
if (futures.empty())
return;
Stopwatch watch;
{
std::lock_guard lock(mutex);
for (auto & it : finished_futures)
{
/// actually that call might block this thread until the future is finally set
/// however that won't block us for long, since the task is about to finish when its iterator appears in `finished_futures`
it->get();
/// in case of an exception in `it->get()`
/// it is not necessary to remove `it` from the list `futures`:
/// `TaskTracker` has to be destroyed after any exception occurs, for this `safeWaitAll` is called.
/// `safeWaitAll` handles invalid futures in the list `futures`
futures.erase(it);
}
finished_futures.clear();
}
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size());
} }
void WriteBufferFromS3::TaskTracker::add(Callback && func) void WriteBufferFromS3::TaskTracker::add(Callback && func)
{ {
LOG_TEST(log, "add, in queue {}", futures.size()); /// All this fuzz is about 2 things. This is the most critical place of TaskTracker.
/// The first is not to fail insertion in the list `futures`.
/// In order to face it, the element is allocated at the end of the list `futures` in advance.
/// The second is not to fail the notification of the task.
/// In order to face it, the list element, which would be inserted to the list `finished_futures`,
/// is allocated in advance as an other list `pre_allocated_finished` with one element inside.
auto future = scheduler(std::move(func), Priority{}); /// preallocation for the first issue
auto exit_scope = scope_guard( futures.emplace_back();
[&future]() auto future_placeholder = std::prev(futures.end());
/// preallocation for the second issue
FinishedList pre_allocated_finished {future_placeholder};
Callback func_with_notification = [&, func=std::move(func), pre_allocated_finished=std::move(pre_allocated_finished)] () mutable
{
SCOPE_EXIT({
DENY_ALLOCATIONS_IN_SCOPE;
std::lock_guard lock(mutex);
finished_futures.splice(finished_futures.end(), pre_allocated_finished);
has_finished.notify_one();
});
func();
};
/// this move is nothrow
*future_placeholder = scheduler(std::move(func_with_notification), Priority{});
LOG_TEST(log, "add ended, in queue {}, limit {}", futures.size(), max_tasks_inflight);
waitTilInflightShrink();
}
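The preallocation scheme described in the comments above boils down to: reserve the list node in `futures` up front, and pre-build the node destined for `finished_futures` as a one-element list, so the completion callback only needs a nothrow, allocation-free `splice` under the lock. Here is a reduced sketch of that pattern under those assumptions, using plain std::async and hypothetical names, not the ClickHouse TaskTracker itself.

```cpp
#include <condition_variable>
#include <future>
#include <iostream>
#include <iterator>
#include <list>
#include <mutex>

int main()
{
    using FutureList = std::list<std::future<void>>;
    using FinishedList = std::list<FutureList::iterator>;

    std::mutex mutex;
    std::condition_variable has_finished;
    FutureList futures;
    FinishedList finished;

    // 1. Pre-allocate the slot in `futures`, so nothing below can fail to insert.
    futures.emplace_back();
    auto placeholder = std::prev(futures.end());

    // 2. Pre-allocate the node that will later be moved into `finished`.
    FinishedList pre_allocated{placeholder};

    auto task = [&, pre_allocated = std::move(pre_allocated)]() mutable
    {
        // ... the real work would happen here ...
        std::lock_guard lock(mutex);
        // splice() only relinks the pre-allocated node: no allocation, cannot throw.
        finished.splice(finished.end(), pre_allocated);
        has_finished.notify_one();
    };

    *placeholder = std::async(std::launch::async, std::move(task));

    std::unique_lock lock(mutex);
    has_finished.wait(lock, [&] { return !finished.empty(); });
    for (auto it : finished)
    {
        it->get();          // propagates an exception from the task, if there was one
        futures.erase(it);  // safe: std::list::erase does not invalidate other iterators
    }
    finished.clear();

    std::cout << "tasks left in queue: " << futures.size() << '\n';
}
```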
void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()
{
if (!max_tasks_inflight)
return;
LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size());
Stopwatch watch;
/// An alternative approach is to wait until at least futures.size() - max_tasks_inflight elements are finished
/// However, the sooner a finished task is collected, the sooner CH checks whether there is an exception
/// The sooner an exception is propagated, the less time is spent on cancellation
while (futures.size() >= max_tasks_inflight)
{
std::unique_lock lock(mutex);
has_finished.wait(lock, [this] () TSA_REQUIRES(mutex) { return !finished_futures.empty(); });
for (auto & it : finished_futures)
{ {
future.wait(); SCOPE_EXIT({
/// According to basic exception safety, TaskTracker has to be destroyed after an exception
/// If that were guaranteed, this SCOPE_EXIT would be superfluous
/// However WriteBufferWithFinalizeCallback and WriteBufferFromFileDecorator do call finalize in the d-tor
/// TaskTracker has to cope with this until the issue with finalizing in the d-tor is addressed in #50274
futures.erase(it);
});
it->get();
} }
);
futures.push_back(std::move(future)); finished_futures.clear();
}
exit_scope.release(); watch.stop();
LOG_TEST(log, "add ended, in queue {}", futures.size()); ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size());
} }
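Stripped of the profiling and exception plumbing, waitTilInflightShrink is a producer that blocks until the in-flight count drops below the configured limit. The following is a minimal sketch of just that throttling idea, with a plain counter, std::thread, and hypothetical names; it is not the real TaskTracker code.

```cpp
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

int main()
{
    const size_t max_tasks_inflight = 3;

    std::mutex mutex;
    std::condition_variable has_finished;
    size_t inflight = 0;

    std::vector<std::thread> workers;

    for (int task = 0; task < 10; ++task)
    {
        {
            // The producer blocks here while the in-flight count is at the limit.
            std::unique_lock lock(mutex);
            has_finished.wait(lock, [&] { return inflight < max_tasks_inflight; });
            ++inflight;
        }

        workers.emplace_back([&]
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(50)); // simulated part upload
            {
                std::lock_guard lock(mutex);
                --inflight;
            }
            has_finished.notify_one();
        });
    }

    for (auto & worker : workers)
        worker.join();

    std::cout << "all " << workers.size() << " tasks finished\n";
}
```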
bool WriteBufferFromS3::TaskTracker::isAsync() const bool WriteBufferFromS3::TaskTracker::isAsync() const


@ -6,36 +6,61 @@
#include "WriteBufferFromS3.h" #include "WriteBufferFromS3.h"
#include <list>
namespace DB namespace DB
{ {
/// That class is used only in WriteBufferFromS3 for now. /// That class is used only in WriteBufferFromS3 for now.
/// Therefore it declared as a part of WriteBufferFromS3. /// Therefore it declared as a part of WriteBufferFromS3.
/// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool. /// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool.
/// TaskTracker brings the methods waitReady, waitAll/safeWaitAll /// TaskTracker brings the methods waitIfAny, waitAll/safeWaitAll
/// to help with coordination of the running tasks. /// to help with coordination of the running tasks.
/// Basic exception safety is provided. If an exception occurs, the object has to be destroyed.
/// No thread safety is provided. Do not use this object concurrently.
class WriteBufferFromS3::TaskTracker class WriteBufferFromS3::TaskTracker
{ {
public: public:
using Callback = std::function<void()>; using Callback = std::function<void()>;
explicit TaskTracker(ThreadPoolCallbackRunner<void> scheduler_); TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_);
~TaskTracker(); ~TaskTracker();
static ThreadPoolCallbackRunner<void> syncRunner(); static ThreadPoolCallbackRunner<void> syncRunner();
bool isAsync() const; bool isAsync() const;
void waitReady();
/// waitIfAny collects statuses from already finished tasks
/// There could be no finished tasks yet, so waitIfAny does nothing useful in that case
/// The first exception is rethrown if any task has failed
void waitIfAny();
/// waitAll waits for all the tasks to finish and collects their statuses
void waitAll(); void waitAll();
/// safeWaitAll does the same as waitAll but mutes the exceptions
void safeWaitAll(); void safeWaitAll();
void add(Callback && func); void add(Callback && func);
private: private:
bool is_async; /// waitTilInflightShrink waits until the number of in-flight tasks drops below the limit `max_tasks_inflight`.
void waitTilInflightShrink() TSA_NO_THREAD_SAFETY_ANALYSIS;
const bool is_async;
ThreadPoolCallbackRunner<void> scheduler; ThreadPoolCallbackRunner<void> scheduler;
std::list<std::future<void>> futures; const size_t max_tasks_inflight;
using FutureList = std::list<std::future<void>>;
FutureList futures;
Poco::Logger * log = &Poco::Logger::get("TaskTracker"); Poco::Logger * log = &Poco::Logger::get("TaskTracker");
std::mutex mutex;
std::condition_variable has_finished TSA_GUARDED_BY(mutex);
using FinishedList = std::list<FutureList::iterator>;
FinishedList finished_futures TSA_GUARDED_BY(mutex);
}; };
} }


@ -2041,7 +2041,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
*/ */
if (data.hasNullKeyData()) if (data.hasNullKeyData())
{ {
has_null_key_data = Method::one_key_nullable_optimization; has_null_key_data = true;
out_cols->key_columns[0]->insertDefault(); out_cols->key_columns[0]->insertDefault();
insertAggregatesIntoColumns(data.getNullKeyData(), out_cols->final_aggregate_columns, arena); insertAggregatesIntoColumns(data.getNullKeyData(), out_cols->final_aggregate_columns, arena);
data.hasNullKeyData() = false; data.hasNullKeyData() = false;
@ -2076,6 +2076,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data)); res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data));
places.clear(); places.clear();
out_cols.reset(); out_cols.reset();
has_null_key_data = false;
} }
} }
}); });


@ -45,7 +45,7 @@ public:
using SystemLog<AsynchronousInsertLogElement>::SystemLog; using SystemLog<AsynchronousInsertLogElement>::SystemLog;
/// This table is usually queried for fixed table name. /// This table is usually queried for fixed table name.
static const char * getDefaultOrderBy() { return "(database, table, event_date, event_time)"; } static const char * getDefaultOrderBy() { return "database, table, event_date, event_time"; }
}; };
} }


@ -49,7 +49,7 @@ public:
void addValues(const AsynchronousMetricValues &); void addValues(const AsynchronousMetricValues &);
/// This table is usually queried for fixed metric name. /// This table is usually queried for fixed metric name.
static const char * getDefaultOrderBy() { return "(metric, event_date, event_time)"; } static const char * getDefaultOrderBy() { return "metric, event_date, event_time"; }
}; };
} }


@ -125,10 +125,12 @@ ClusterDiscovery::ClusterDiscovery(
ClusterInfo( ClusterInfo(
/* name_= */ key, /* name_= */ key,
/* zk_root_= */ config.getString(prefix + ".path"), /* zk_root_= */ config.getString(prefix + ".path"),
/* host_name= */ config.getString(prefix + ".my_hostname", getFQDNOrHostName()),
/* port= */ context->getTCPPort(), /* port= */ context->getTCPPort(),
/* secure= */ config.getBool(prefix + ".secure", false), /* secure= */ config.getBool(prefix + ".secure", false),
/* shard_id= */ config.getUInt(prefix + ".shard", 0), /* shard_id= */ config.getUInt(prefix + ".shard", 0),
/* observer_mode= */ ConfigHelper::getBool(config, prefix + ".observer") /* observer_mode= */ ConfigHelper::getBool(config, prefix + ".observer"),
/* invisible= */ ConfigHelper::getBool(config, prefix + ".invisible")
) )
); );
} }
@ -294,6 +296,12 @@ bool ClusterDiscovery::updateCluster(ClusterInfo & cluster_info)
return false; return false;
} }
if (cluster_info.current_cluster_is_invisible)
{
LOG_DEBUG(log, "cluster '{}' is invisible!", cluster_info.name);
return true;
}
if (!needUpdate(node_uuids, nodes_info)) if (!needUpdate(node_uuids, nodes_info))
{ {
LOG_DEBUG(log, "No update required for cluster '{}'", cluster_info.name); LOG_DEBUG(log, "No update required for cluster '{}'", cluster_info.name);


@ -3,7 +3,6 @@
#include <Common/ConcurrentBoundedQueue.h> #include <Common/ConcurrentBoundedQueue.h>
#include <Common/ThreadPool.h> #include <Common/ThreadPool.h>
#include <Common/ZooKeeper/Common.h> #include <Common/ZooKeeper/Common.h>
#include <base/getFQDNOrHostName.h>
#include <Interpreters/Cluster.h> #include <Interpreters/Cluster.h>
#include <Poco/Logger.h> #include <Poco/Logger.h>
@ -78,16 +77,24 @@ private:
/// Current node may not belong to cluster, to be just an observer. /// Current node may not belong to cluster, to be just an observer.
bool current_node_is_observer = false; bool current_node_is_observer = false;
/// For internal management needs.
/// It is designed so that when multiple compute groups are deployed,
/// they are invisible to each other.
bool current_cluster_is_invisible = false;
explicit ClusterInfo(const String & name_, explicit ClusterInfo(const String & name_,
const String & zk_root_, const String & zk_root_,
const String & host_name,
UInt16 port, UInt16 port,
bool secure, bool secure,
size_t shard_id, size_t shard_id,
bool observer_mode) bool observer_mode,
bool invisible)
: name(name_) : name(name_)
, zk_root(zk_root_) , zk_root(zk_root_)
, current_node(getFQDNOrHostName() + ":" + toString(port), secure, shard_id) , current_node(host_name + ":" + toString(port), secure, shard_id)
, current_node_is_observer(observer_mode) , current_node_is_observer(observer_mode)
, current_cluster_is_invisible(invisible)
{ {
} }
}; };


@ -3555,9 +3555,9 @@ void Context::checkPartitionCanBeDropped(const String & database, const String &
} }
InputFormatPtr Context::getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size, const std::optional<FormatSettings> & format_settings) const InputFormatPtr Context::getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size, const std::optional<FormatSettings> & format_settings, const std::optional<size_t> max_parsing_threads) const
{ {
return FormatFactory::instance().getInput(name, buf, sample, shared_from_this(), max_block_size, format_settings); return FormatFactory::instance().getInput(name, buf, sample, shared_from_this(), max_block_size, format_settings, max_parsing_threads);
} }
OutputFormatPtr Context::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const OutputFormatPtr Context::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const


@ -738,7 +738,8 @@ public:
BackupsWorker & getBackupsWorker() const; BackupsWorker & getBackupsWorker() const;
/// I/O formats. /// I/O formats.
InputFormatPtr getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size, const std::optional<FormatSettings> & format_settings = std::nullopt) const; InputFormatPtr getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size,
const std::optional<FormatSettings> & format_settings = std::nullopt, const std::optional<size_t> max_parsing_threads = std::nullopt) const;
OutputFormatPtr getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const; OutputFormatPtr getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const;
OutputFormatPtr getOutputFormatParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const; OutputFormatPtr getOutputFormatParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const;


@ -19,6 +19,8 @@
#include <Parsers/queryToString.h> #include <Parsers/queryToString.h>
#include <Processors/Executors/PullingAsyncPipelineExecutor.h> #include <Processors/Executors/PullingAsyncPipelineExecutor.h>
#include <Common/ProfileEvents.h> #include <Common/ProfileEvents.h>
#include <Common/FieldVisitorToString.h>
#include <IO/WriteBufferFromString.h>
namespace ProfileEvents namespace ProfileEvents
{ {
@ -68,17 +70,6 @@ void ExecuteScalarSubqueriesMatcher::visit(ASTPtr & ast, Data & data)
visit(*t, ast, data); visit(*t, ast, data);
} }
/// Converting to literal values might take a fair amount of overhead when the value is large, (e.g.
/// Array, BitMap, etc.), This conversion is required for constant folding, index lookup, branch
/// elimination. However, these optimizations should never be related to large values, thus we
/// blacklist them here.
static bool worthConvertingToLiteral(const Block & scalar)
{
const auto * scalar_type_name = scalar.safeGetByPosition(0).type->getFamilyName();
static const std::set<std::string_view> useless_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"};
return !useless_literal_types.contains(scalar_type_name);
}
static auto getQueryInterpreter(const ASTSubquery & subquery, ExecuteScalarSubqueriesMatcher::Data & data) static auto getQueryInterpreter(const ASTSubquery & subquery, ExecuteScalarSubqueriesMatcher::Data & data)
{ {
auto subquery_context = Context::createCopy(data.getContext()); auto subquery_context = Context::createCopy(data.getContext());
@ -255,7 +246,9 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
const Settings & settings = data.getContext()->getSettingsRef(); const Settings & settings = data.getContext()->getSettingsRef();
// Always convert to literals when there is no query context. // Always convert to literals when there is no query context.
if (data.only_analyze || !settings.enable_scalar_subquery_optimization || worthConvertingToLiteral(scalar) if (data.only_analyze
|| !settings.enable_scalar_subquery_optimization
|| worthConvertingScalarToLiteral(scalar, data.max_literal_size)
|| !data.getContext()->hasQueryContext()) || !data.getContext()->hasQueryContext())
{ {
/// subquery and ast can be the same object and ast will be moved. /// subquery and ast can be the same object and ast will be moved.
@ -278,7 +271,7 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
ast = std::move(func); ast = std::move(func);
} }
} }
else else if (!data.replace_only_to_literals)
{ {
auto func = makeASTFunction("__getScalar", std::make_shared<ASTLiteral>(scalar_query_hash_str)); auto func = makeASTFunction("__getScalar", std::make_shared<ASTLiteral>(scalar_query_hash_str));
func->alias = subquery.alias; func->alias = subquery.alias;
@ -318,4 +311,31 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTFunction & func, ASTPtr & as
Visitor(data).visit(*add_node); Visitor(data).visit(*add_node);
} }
static size_t getSizeOfSerializedLiteral(const Field & field)
{
auto field_str = applyVisitor(FieldVisitorToString(), field);
return field_str.size();
}
bool worthConvertingScalarToLiteral(const Block & scalar, std::optional<size_t> max_literal_size)
{
/// Converting to literal values might take a fair amount of overhead when the value is large (e.g.
/// Array, BitMap, etc.). This conversion is required for constant folding, index lookup, branch
/// elimination. However, these optimizations should never involve large values, thus we blacklist them here.
const auto * scalar_type_name = scalar.safeGetByPosition(0).type->getFamilyName();
static const std::set<std::string_view> maybe_large_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"};
if (!maybe_large_literal_types.contains(scalar_type_name))
return true;
if (!max_literal_size)
return false;
/// The size of the serialized literal cannot be less than the in-memory size in bytes.
if (scalar.bytes() > *max_literal_size)
return false;
return getSizeOfSerializedLiteral((*scalar.safeGetByPosition(0).column)[0]) <= *max_literal_size;
}
} }
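The two-step gate above (a cheap byte check first, serialization only if that passes) can be shown in isolation. Below is a small sketch under the same assumption the comment makes, namely that the serialized text is at least as large as the in-memory size, which holds for string data like this example; the function and type names are illustrative, not the ClickHouse API.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Cheap check: raw in-memory size of the values.
static size_t rawBytes(const std::vector<std::string> & values)
{
    size_t bytes = 0;
    for (const auto & value : values)
        bytes += value.size();
    return bytes;
}

// Expensive check: actually render the literal text.
static std::string serializeLiteral(const std::vector<std::string> & values)
{
    std::string out = "[";
    for (size_t i = 0; i < values.size(); ++i)
    {
        if (i)
            out += ",";
        out += "'" + values[i] + "'";
    }
    out += "]";
    return out;
}

static bool worthConvertingToLiteral(const std::vector<std::string> & values, size_t max_literal_size)
{
    if (rawBytes(values) > max_literal_size)
        return false; // serialized form would be even larger, skip formatting entirely
    return serializeLiteral(values).size() <= max_literal_size;
}

int main()
{
    std::cout << std::boolalpha
              << worthConvertingToLiteral({"a", "b"}, 64) << '\n'                                 // true
              << worthConvertingToLiteral(std::vector<std::string>(10000, "x"), 64) << '\n';      // false, never serialized
}
```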


@ -37,6 +37,8 @@ public:
Scalars & local_scalars; Scalars & local_scalars;
bool only_analyze; bool only_analyze;
bool is_create_parameterized_view; bool is_create_parameterized_view;
bool replace_only_to_literals;
std::optional<size_t> max_literal_size;
}; };
static bool needChildVisit(ASTPtr & node, const ASTPtr &); static bool needChildVisit(ASTPtr & node, const ASTPtr &);
@ -49,4 +51,6 @@ private:
using ExecuteScalarSubqueriesVisitor = ExecuteScalarSubqueriesMatcher::Visitor; using ExecuteScalarSubqueriesVisitor = ExecuteScalarSubqueriesMatcher::Visitor;
bool worthConvertingScalarToLiteral(const Block & scalar, std::optional<size_t> max_literal_size);
} }


@ -8,12 +8,14 @@
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
#include <Interpreters/FunctionNameNormalizer.h> #include <Interpreters/FunctionNameNormalizer.h>
#include <Interpreters/MutationsInterpreter.h> #include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/MutationsNonDeterministicHelpers.h>
#include <Interpreters/QueryLog.h> #include <Interpreters/QueryLog.h>
#include <Interpreters/executeDDLQueryOnCluster.h> #include <Interpreters/executeDDLQueryOnCluster.h>
#include <Parsers/ASTAlterQuery.h> #include <Parsers/ASTAlterQuery.h>
#include <Parsers/ASTAssignment.h> #include <Parsers/ASTAssignment.h>
#include <Parsers/ASTIdentifier_fwd.h> #include <Parsers/ASTIdentifier_fwd.h>
#include <Parsers/ASTColumnDeclaration.h> #include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/queryToString.h>
#include <Storages/AlterCommands.h> #include <Storages/AlterCommands.h>
#include <Storages/IStorage.h> #include <Storages/IStorage.h>
#include <Storages/LiveView/LiveViewCommands.h> #include <Storages/LiveView/LiveViewCommands.h>
@ -67,7 +69,6 @@ BlockIO InterpreterAlterQuery::execute()
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown alter object type"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown alter object type");
} }
BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter)
{ {
BlockIO res; BlockIO res;
@ -156,7 +157,8 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter)
if (mutation_commands.hasNonEmptyMutationCommands()) if (mutation_commands.hasNonEmptyMutationCommands())
{ {
table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); MutationsInterpreter::Settings settings(false);
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), settings).validate();
table->mutate(mutation_commands, getContext()); table->mutate(mutation_commands, getContext());
} }
@ -236,6 +238,7 @@ BlockIO InterpreterAlterQuery::executeToDatabase(const ASTAlterQuery & alter)
return res; return res;
} }
AccessRightsElements InterpreterAlterQuery::getRequiredAccess() const AccessRightsElements InterpreterAlterQuery::getRequiredAccess() const
{ {
AccessRightsElements required_access; AccessRightsElements required_access;


@ -72,7 +72,8 @@ BlockIO InterpreterDeleteQuery::execute()
mutation_commands.emplace_back(mut_command); mutation_commands.emplace_back(mut_command);
table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); MutationsInterpreter::Settings settings(false);
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), settings).validate();
table->mutate(mutation_commands, getContext()); table->mutate(mutation_commands, getContext());
return {}; return {};
} }


@ -114,6 +114,7 @@
namespace ProfileEvents namespace ProfileEvents
{ {
extern const Event Query; extern const Event Query;
extern const Event QueriesWithSubqueries;
extern const Event SelectQuery; extern const Event SelectQuery;
extern const Event InsertQuery; extern const Event InsertQuery;
} }
@ -131,6 +132,15 @@ std::unique_ptr<IInterpreter> InterpreterFactory::get(ASTPtr & query, ContextMut
{ {
ProfileEvents::increment(ProfileEvents::Query); ProfileEvents::increment(ProfileEvents::Query);
/// SELECT and INSERT queries will handle QueriesWithSubqueries on their own.
if (!(query->as<ASTSelectQuery>() ||
query->as<ASTSelectWithUnionQuery>() ||
query->as<ASTSelectIntersectExceptQuery>() ||
query->as<ASTInsertQuery>()))
{
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
}
if (query->as<ASTSelectQuery>()) if (query->as<ASTSelectQuery>())
{ {
if (context->getSettingsRef().allow_experimental_analyzer) if (context->getSettingsRef().allow_experimental_analyzer)


@ -34,8 +34,15 @@
#include <TableFunctions/TableFunctionFactory.h> #include <TableFunctions/TableFunctionFactory.h>
#include <Common/ThreadStatus.h> #include <Common/ThreadStatus.h>
#include <Common/checkStackSize.h> #include <Common/checkStackSize.h>
#include <Common/ProfileEvents.h>
namespace ProfileEvents
{
extern const Event InsertQueriesWithSubqueries;
extern const Event QueriesWithSubqueries;
}
namespace DB namespace DB
{ {
@ -234,6 +241,9 @@ Chain InterpreterInsertQuery::buildChain(
ThreadStatusesHolderPtr thread_status_holder, ThreadStatusesHolderPtr thread_status_holder,
std::atomic_uint64_t * elapsed_counter_ms) std::atomic_uint64_t * elapsed_counter_ms)
{ {
ProfileEvents::increment(ProfileEvents::InsertQueriesWithSubqueries);
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
ThreadGroupPtr running_group; ThreadGroupPtr running_group;
if (current_thread) if (current_thread)
running_group = current_thread->getThreadGroup(); running_group = current_thread->getThreadGroup();


@ -13,6 +13,7 @@
#include <Parsers/ASTTablesInSelectQuery.h> #include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ExpressionListParsers.h> #include <Parsers/ExpressionListParsers.h>
#include <Parsers/parseQuery.h> #include <Parsers/parseQuery.h>
#include <Parsers/FunctionParameterValuesVisitor.h>
#include <Access/Common/AccessFlags.h> #include <Access/Common/AccessFlags.h>
#include <Access/ContextAccess.h> #include <Access/ContextAccess.h>
@ -93,11 +94,17 @@
#include <Common/FieldVisitorsAccurateComparison.h> #include <Common/FieldVisitorsAccurateComparison.h>
#include <Common/checkStackSize.h> #include <Common/checkStackSize.h>
#include <Common/scope_guard_safe.h> #include <Common/scope_guard_safe.h>
#include <Parsers/FunctionParameterValuesVisitor.h>
#include <Common/typeid_cast.h> #include <Common/typeid_cast.h>
#include <Common/ProfileEvents.h>
#include "config_version.h" #include "config_version.h"
namespace ProfileEvents
{
extern const Event SelectQueriesWithSubqueries;
extern const Event QueriesWithSubqueries;
}
namespace DB namespace DB
{ {
@ -437,7 +444,10 @@ InterpreterSelectQuery::InterpreterSelectQuery(
if (!metadata_snapshot) if (!metadata_snapshot)
metadata_snapshot = storage->getInMemoryMetadataPtr(); metadata_snapshot = storage->getInMemoryMetadataPtr();
storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr, context); if (options.only_analyze)
storage_snapshot = storage->getStorageSnapshotWithoutData(metadata_snapshot, context);
else
storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr, context);
} }
if (has_input || !joined_tables.resolveTables()) if (has_input || !joined_tables.resolveTables())
@ -1329,6 +1339,9 @@ static bool hasWithTotalsInAnySubqueryInFromClause(const ASTSelectQuery & query)
void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<Pipe> prepared_pipe) void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<Pipe> prepared_pipe)
{ {
ProfileEvents::increment(ProfileEvents::SelectQueriesWithSubqueries);
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
/** Streams of data. When the query is executed in parallel, we have several data streams. /** Streams of data. When the query is executed in parallel, we have several data streams.
* If there is no GROUP BY, then perform all operations before ORDER BY and LIMIT in parallel, then * If there is no GROUP BY, then perform all operations before ORDER BY and LIMIT in parallel, then
* if there is an ORDER BY, then glue the streams using ResizeProcessor, and then MergeSorting transforms, * if there is an ORDER BY, then glue the streams using ResizeProcessor, and then MergeSorting transforms,


@ -1,9 +1,9 @@
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h> #include <Functions/IFunction.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Interpreters/InterpreterSelectQuery.h> #include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/MutationsInterpreter.h> #include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/TreeRewriter.h> #include <Interpreters/TreeRewriter.h>
#include <Interpreters/MutationsNonDeterministicHelpers.h>
#include <Storages/MergeTree/MergeTreeData.h> #include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/StorageFromMergeTreeDataPart.h> #include <Storages/MergeTree/StorageFromMergeTreeDataPart.h>
#include <Storages/StorageMergeTree.h> #include <Storages/StorageMergeTree.h>
@ -31,7 +31,6 @@
#include <Interpreters/PreparedSets.h> #include <Interpreters/PreparedSets.h>
#include <Storages/LightweightDeleteDescription.h> #include <Storages/LightweightDeleteDescription.h>
#include <Storages/MergeTree/MergeTreeSequentialSource.h> #include <Storages/MergeTree/MergeTreeSequentialSource.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Processors/Sources/ThrowingExceptionSource.h> #include <Processors/Sources/ThrowingExceptionSource.h>
#include <Analyzer/QueryTreeBuilder.h> #include <Analyzer/QueryTreeBuilder.h>
#include <Analyzer/QueryTreePassManager.h> #include <Analyzer/QueryTreePassManager.h>
@ -53,90 +52,12 @@ namespace ErrorCodes
extern const int NO_SUCH_COLUMN_IN_TABLE; extern const int NO_SUCH_COLUMN_IN_TABLE;
extern const int CANNOT_UPDATE_COLUMN; extern const int CANNOT_UPDATE_COLUMN;
extern const int UNEXPECTED_EXPRESSION; extern const int UNEXPECTED_EXPRESSION;
extern const int THERE_IS_NO_COLUMN;
} }
namespace namespace
{ {
/// Helps to detect situations, where non-deterministic functions may be used in mutations of Replicated*MergeTree.
class FirstNonDeterministicFunctionMatcher
{
public:
struct Data
{
ContextPtr context;
std::optional<String> nondeterministic_function_name;
bool subquery = false;
};
static bool needChildVisit(const ASTPtr & /*node*/, const ASTPtr & /*child*/)
{
return true;
}
static void visit(const ASTPtr & node, Data & data)
{
if (data.nondeterministic_function_name || data.subquery)
return;
if (node->as<ASTSelectQuery>())
{
/// We cannot determine if subquery is deterministic or not,
/// so we do not allow to use subqueries in mutation without allow_nondeterministic_mutations=1
data.subquery = true;
}
else if (const auto * function = typeid_cast<const ASTFunction *>(node.get()))
{
/// Property of being deterministic for lambda expression is completely determined
/// by the contents of its definition, so we just proceed to it.
if (function->name != "lambda")
{
/// NOTE It may be an aggregate function, so get(...) may throw.
/// However, an aggregate function can be used only in subquery and we do not go into subquery.
const auto func = FunctionFactory::instance().get(function->name, data.context);
if (!func->isDeterministic())
data.nondeterministic_function_name = func->getName();
}
}
}
};
using FirstNonDeterministicFunctionFinder = InDepthNodeVisitor<FirstNonDeterministicFunctionMatcher, true>;
using FirstNonDeterministicFunctionData = FirstNonDeterministicFunctionMatcher::Data;
FirstNonDeterministicFunctionData findFirstNonDeterministicFunctionName(const MutationCommand & command, ContextPtr context)
{
FirstNonDeterministicFunctionMatcher::Data finder_data{context, std::nullopt, false};
switch (command.type)
{
case MutationCommand::UPDATE:
{
auto update_assignments_ast = command.ast->as<const ASTAlterCommand &>().update_assignments->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(update_assignments_ast);
if (finder_data.nondeterministic_function_name)
return finder_data;
/// Currently UPDATE and DELETE both always have predicates so we can use fallthrough
[[fallthrough]];
}
case MutationCommand::DELETE:
{
auto predicate_ast = command.predicate->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(predicate_ast);
return finder_data;
}
default:
break;
}
return {};
}
ASTPtr prepareQueryAffectedAST(const std::vector<MutationCommand> & commands, const StoragePtr & storage, ContextPtr context) ASTPtr prepareQueryAffectedAST(const std::vector<MutationCommand> & commands, const StoragePtr & storage, ContextPtr context)
{ {
/// Execute `SELECT count() FROM storage WHERE predicate1 OR predicate2 OR ...` query. /// Execute `SELECT count() FROM storage WHERE predicate1 OR predicate2 OR ...` query.
@ -326,10 +247,10 @@ MutationsInterpreter::Source::Source(MergeTreeData & storage_, MergeTreeData::Da
StorageSnapshotPtr MutationsInterpreter::Source::getStorageSnapshot(const StorageMetadataPtr & snapshot_, const ContextPtr & context_) const StorageSnapshotPtr MutationsInterpreter::Source::getStorageSnapshot(const StorageMetadataPtr & snapshot_, const ContextPtr & context_) const
{ {
if (data) if (const auto * merge_tree = getMergeTreeData())
return data->getStorageSnapshot(snapshot_, context_); return merge_tree->getStorageSnapshotWithoutData(snapshot_, context_);
return storage->getStorageSnapshot(snapshot_, context_); return storage->getStorageSnapshotWithoutData(snapshot_, context_);
} }
StoragePtr MutationsInterpreter::Source::getStorage() const StoragePtr MutationsInterpreter::Source::getStorage() const
@ -367,20 +288,27 @@ bool MutationsInterpreter::Source::materializeTTLRecalculateOnly() const
return data && data->getSettings()->materialize_ttl_recalculate_only; return data && data->getSettings()->materialize_ttl_recalculate_only;
} }
static Names getAvailableColumnsWithVirtuals(StorageMetadataPtr metadata_snapshot, const IStorage & storage)
{
auto all_columns = metadata_snapshot->getColumns().getNamesOfPhysical();
for (const auto & column : storage.getVirtuals())
all_columns.push_back(column.name);
return all_columns;
}
MutationsInterpreter::MutationsInterpreter( MutationsInterpreter::MutationsInterpreter(
StoragePtr storage_, StoragePtr storage_,
const StorageMetadataPtr & metadata_snapshot_, StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_, MutationCommands commands_,
ContextPtr context_, ContextPtr context_,
bool can_execute_, Settings settings_)
bool return_all_columns_,
bool return_mutated_rows_)
: MutationsInterpreter( : MutationsInterpreter(
Source(std::move(storage_)), Source(storage_),
metadata_snapshot_, std::move(commands_), std::move(context_), metadata_snapshot_, std::move(commands_),
can_execute_, return_all_columns_, return_mutated_rows_) getAvailableColumnsWithVirtuals(metadata_snapshot_, *storage_),
std::move(context_), std::move(settings_))
{ {
if (can_execute_ && dynamic_cast<const MergeTreeData *>(source.getStorage().get())) if (settings.can_execute && dynamic_cast<const MergeTreeData *>(source.getStorage().get()))
{ {
throw Exception( throw Exception(
ErrorCodes::LOGICAL_ERROR, ErrorCodes::LOGICAL_ERROR,
@ -392,37 +320,34 @@ MutationsInterpreter::MutationsInterpreter(
MutationsInterpreter::MutationsInterpreter( MutationsInterpreter::MutationsInterpreter(
MergeTreeData & storage_, MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_, MergeTreeData::DataPartPtr source_part_,
const StorageMetadataPtr & metadata_snapshot_, StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_, MutationCommands commands_,
Names available_columns_,
ContextPtr context_, ContextPtr context_,
bool can_execute_, Settings settings_)
bool return_all_columns_,
bool return_mutated_rows_)
: MutationsInterpreter( : MutationsInterpreter(
Source(storage_, std::move(source_part_)), Source(storage_, std::move(source_part_)),
metadata_snapshot_, std::move(commands_), std::move(context_), std::move(metadata_snapshot_), std::move(commands_),
can_execute_, return_all_columns_, return_mutated_rows_) std::move(available_columns_), std::move(context_), std::move(settings_))
{ {
} }
MutationsInterpreter::MutationsInterpreter( MutationsInterpreter::MutationsInterpreter(
Source source_, Source source_,
const StorageMetadataPtr & metadata_snapshot_, StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_, MutationCommands commands_,
Names available_columns_,
ContextPtr context_, ContextPtr context_,
bool can_execute_, Settings settings_)
bool return_all_columns_,
bool return_mutated_rows_)
: source(std::move(source_)) : source(std::move(source_))
, metadata_snapshot(metadata_snapshot_) , metadata_snapshot(metadata_snapshot_)
, commands(std::move(commands_)) , commands(std::move(commands_))
, available_columns(std::move(available_columns_))
, context(Context::createCopy(context_)) , context(Context::createCopy(context_))
, can_execute(can_execute_) , settings(std::move(settings_))
, select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections()) , select_limits(SelectQueryOptions().analyze(!settings.can_execute).ignoreLimits().ignoreProjections())
, return_all_columns(return_all_columns_)
, return_mutated_rows(return_mutated_rows_)
{ {
prepare(!can_execute); prepare(!settings.can_execute);
} }
static NameSet getKeyColumns(const MutationsInterpreter::Source & source, const StorageMetadataPtr & metadata_snapshot) static NameSet getKeyColumns(const MutationsInterpreter::Source & source, const StorageMetadataPtr & metadata_snapshot)
@ -546,16 +471,18 @@ void MutationsInterpreter::prepare(bool dry_run)
const ColumnsDescription & columns_desc = metadata_snapshot->getColumns(); const ColumnsDescription & columns_desc = metadata_snapshot->getColumns();
const IndicesDescription & indices_desc = metadata_snapshot->getSecondaryIndices(); const IndicesDescription & indices_desc = metadata_snapshot->getSecondaryIndices();
const ProjectionsDescription & projections_desc = metadata_snapshot->getProjections(); const ProjectionsDescription & projections_desc = metadata_snapshot->getProjections();
NamesAndTypesList all_columns = columns_desc.getAllPhysical();
auto storage_snapshot = std::make_shared<StorageSnapshot>(*source.getStorage(), metadata_snapshot);
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withVirtuals();
auto all_columns = storage_snapshot->getColumnsByNames(options, available_columns);
NameSet available_columns_set(available_columns.begin(), available_columns.end());
/// Add _row_exists column if it is physically present in the part /// Add _row_exists column if it is physically present in the part
if (source.hasLightweightDeleteMask()) if (source.hasLightweightDeleteMask())
all_columns.push_back({LightweightDeleteDescription::FILTER_COLUMN});
if (return_all_columns)
{ {
for (const auto & column : source.getStorage()->getVirtuals()) all_columns.push_back({LightweightDeleteDescription::FILTER_COLUMN});
all_columns.push_back(column); available_columns_set.insert(LightweightDeleteDescription::FILTER_COLUMN.name);
} }
NameSet updated_columns; NameSet updated_columns;
@ -567,9 +494,13 @@ void MutationsInterpreter::prepare(bool dry_run)
|| command.type == MutationCommand::Type::DELETE) || command.type == MutationCommand::Type::DELETE)
materialize_ttl_recalculate_only = false; materialize_ttl_recalculate_only = false;
for (const auto & kv : command.column_to_update_expression) for (const auto & [name, _] : command.column_to_update_expression)
{ {
updated_columns.insert(kv.first); if (!available_columns_set.contains(name) && name != LightweightDeleteDescription::FILTER_COLUMN.name)
throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
"Column {} is updated but not requested to read", name);
updated_columns.insert(name);
} }
} }
@ -580,29 +511,28 @@ void MutationsInterpreter::prepare(bool dry_run)
{ {
for (const auto & column : columns_desc) for (const auto & column : columns_desc)
{ {
if (column.default_desc.kind == ColumnDefaultKind::Materialized) if (column.default_desc.kind == ColumnDefaultKind::Materialized && available_columns_set.contains(column.name))
{ {
auto query = column.default_desc.expression->clone(); auto query = column.default_desc.expression->clone();
auto syntax_result = TreeRewriter(context).analyze(query, all_columns); auto syntax_result = TreeRewriter(context).analyze(query, all_columns);
for (const String & dependency : syntax_result->requiredSourceColumns()) for (const auto & dependency : syntax_result->requiredSourceColumns())
{
if (updated_columns.contains(dependency)) if (updated_columns.contains(dependency))
column_to_affected_materialized[dependency].push_back(column.name); column_to_affected_materialized[dependency].push_back(column.name);
}
} }
} }
validateUpdateColumns(source, metadata_snapshot, updated_columns, column_to_affected_materialized); validateUpdateColumns(source, metadata_snapshot, updated_columns, column_to_affected_materialized);
} }
dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns); if (settings.recalculate_dependencies_of_updated_columns)
dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns);
std::vector<String> read_columns; std::vector<String> read_columns;
/// First, break a sequence of commands into stages. /// First, break a sequence of commands into stages.
for (auto & command : commands) for (auto & command : commands)
{ {
// we can return deleted rows only if it's the only present command // we can return deleted rows only if it's the only present command
assert(command.type == MutationCommand::DELETE || command.type == MutationCommand::UPDATE || !return_mutated_rows); assert(command.type == MutationCommand::DELETE || command.type == MutationCommand::UPDATE || !settings.return_mutated_rows);
if (command.type == MutationCommand::DELETE) if (command.type == MutationCommand::DELETE)
{ {
@ -612,7 +542,7 @@ void MutationsInterpreter::prepare(bool dry_run)
auto predicate = getPartitionAndPredicateExpressionForMutationCommand(command); auto predicate = getPartitionAndPredicateExpressionForMutationCommand(command);
if (!return_mutated_rows) if (!settings.return_mutated_rows)
predicate = makeASTFunction("isZeroOrNull", predicate); predicate = makeASTFunction("isZeroOrNull", predicate);
stages.back().filters.push_back(predicate); stages.back().filters.push_back(predicate);
@ -700,7 +630,7 @@ void MutationsInterpreter::prepare(bool dry_run)
stages.back().column_to_updated.emplace(column, updated_column); stages.back().column_to_updated.emplace(column, updated_column);
if (condition && return_mutated_rows) if (condition && settings.return_mutated_rows)
stages.back().filters.push_back(condition); stages.back().filters.push_back(condition);
} }
@ -909,17 +839,15 @@ void MutationsInterpreter::prepare(bool dry_run)
} }
is_prepared = true; is_prepared = true;
prepareMutationStages(stages, dry_run); prepareMutationStages(stages, dry_run);
} }
void MutationsInterpreter::prepareMutationStages(std::vector<Stage> & prepared_stages, bool dry_run) void MutationsInterpreter::prepareMutationStages(std::vector<Stage> & prepared_stages, bool dry_run)
{ {
auto storage_snapshot = source.getStorageSnapshot(metadata_snapshot, context); auto storage_snapshot = source.getStorageSnapshot(metadata_snapshot, context);
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withVirtuals();
if (return_all_columns)
options.withVirtuals(); auto all_columns = storage_snapshot->getColumnsByNames(options, available_columns);
auto all_columns = storage_snapshot->getColumns(options);
/// Add _row_exists column if it is present in the part /// Add _row_exists column if it is present in the part
if (source.hasLightweightDeleteMask()) if (source.hasLightweightDeleteMask())
@ -928,7 +856,7 @@ void MutationsInterpreter::prepareMutationStages(std::vector<Stage> & prepared_s
/// Next, for each stage calculate columns changed by this and previous stages. /// Next, for each stage calculate columns changed by this and previous stages.
for (size_t i = 0; i < prepared_stages.size(); ++i) for (size_t i = 0; i < prepared_stages.size(); ++i)
{ {
if (return_all_columns || !prepared_stages[i].filters.empty()) if (settings.return_all_columns || !prepared_stages[i].filters.empty())
{ {
for (const auto & column : all_columns) for (const auto & column : all_columns)
prepared_stages[i].output_columns.insert(column.name); prepared_stages[i].output_columns.insert(column.name);
@ -1054,8 +982,7 @@ struct VirtualColumns
{ {
if (columns_to_read[i] == LightweightDeleteDescription::FILTER_COLUMN.name) if (columns_to_read[i] == LightweightDeleteDescription::FILTER_COLUMN.name)
{ {
LoadedMergeTreeDataPartInfoForReader part_info_reader(part); if (!part->getColumns().contains(LightweightDeleteDescription::FILTER_COLUMN.name))
if (!part_info_reader.getColumns().contains(LightweightDeleteDescription::FILTER_COLUMN.name))
{ {
ColumnWithTypeAndName mask_column; ColumnWithTypeAndName mask_column;
mask_column.type = LightweightDeleteDescription::FILTER_COLUMN.type; mask_column.type = LightweightDeleteDescription::FILTER_COLUMN.type;
@ -1144,7 +1071,6 @@ void MutationsInterpreter::Source::read(
ActionsDAGPtr filter; ActionsDAGPtr filter;
if (!first_stage.filter_column_names.empty()) if (!first_stage.filter_column_names.empty())
{ {
ActionsDAG::NodeRawConstPtrs nodes(num_filters); ActionsDAG::NodeRawConstPtrs nodes(num_filters);
for (size_t i = 0; i < num_filters; ++i) for (size_t i = 0; i < num_filters; ++i)
nodes[i] = &steps[i]->actions()->findInOutputs(names[i]); nodes[i] = &steps[i]->actions()->findInOutputs(names[i]);
@ -1155,7 +1081,9 @@ void MutationsInterpreter::Source::read(
VirtualColumns virtual_columns(std::move(required_columns), part); VirtualColumns virtual_columns(std::move(required_columns), part);
createMergeTreeSequentialSource( createMergeTreeSequentialSource(
plan, *data, storage_snapshot, part, std::move(virtual_columns.columns_to_read), apply_deleted_mask_, filter, context_, plan, *data, storage_snapshot, part,
std::move(virtual_columns.columns_to_read),
apply_deleted_mask_, filter, context_,
&Poco::Logger::get("MutationsInterpreter")); &Poco::Logger::get("MutationsInterpreter"));
virtual_columns.addVirtuals(plan); virtual_columns.addVirtuals(plan);
@ -1208,7 +1136,7 @@ void MutationsInterpreter::Source::read(
void MutationsInterpreter::initQueryPlan(Stage & first_stage, QueryPlan & plan) void MutationsInterpreter::initQueryPlan(Stage & first_stage, QueryPlan & plan)
{ {
source.read(first_stage, plan, metadata_snapshot, context, apply_deleted_mask, can_execute); source.read(first_stage, plan, metadata_snapshot, context, settings.apply_deleted_mask, settings.can_execute);
addCreatingSetsStep(plan, first_stage.analyzer->getPreparedSets(), context); addCreatingSetsStep(plan, first_stage.analyzer->getPreparedSets(), context);
} }
@ -1221,6 +1149,7 @@ QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::v
const auto & step = stage.expressions_chain.steps[i]; const auto & step = stage.expressions_chain.steps[i];
if (step->actions()->hasArrayJoin()) if (step->actions()->hasArrayJoin())
throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "arrayJoin is not allowed in mutations"); throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "arrayJoin is not allowed in mutations");
if (i < stage.filter_column_names.size()) if (i < stage.filter_column_names.size())
{ {
/// Execute DELETEs. /// Execute DELETEs.
@ -1253,15 +1182,13 @@ QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::v
void MutationsInterpreter::validate() void MutationsInterpreter::validate()
{ {
const Settings & settings = context->getSettingsRef();
/// For Replicated* storages mutations cannot employ non-deterministic functions /// For Replicated* storages mutations cannot employ non-deterministic functions
/// because that produces inconsistencies between replicas /// because that produces inconsistencies between replicas
if (startsWith(source.getStorage()->getName(), "Replicated") && !settings.allow_nondeterministic_mutations) if (startsWith(source.getStorage()->getName(), "Replicated") && !context->getSettingsRef().allow_nondeterministic_mutations)
{ {
for (const auto & command : commands) for (const auto & command : commands)
{ {
const auto nondeterministic_func_data = findFirstNonDeterministicFunctionName(command, context); const auto nondeterministic_func_data = findFirstNonDeterministicFunction(command, context);
if (nondeterministic_func_data.subquery) if (nondeterministic_func_data.subquery)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "ALTER UPDATE/ALTER DELETE statement with subquery may be nondeterministic, " throw Exception(ErrorCodes::BAD_ARGUMENTS, "ALTER UPDATE/ALTER DELETE statement with subquery may be nondeterministic, "
"see allow_nondeterministic_mutations setting"); "see allow_nondeterministic_mutations setting");
@ -1281,7 +1208,7 @@ void MutationsInterpreter::validate()
QueryPipelineBuilder MutationsInterpreter::execute() QueryPipelineBuilder MutationsInterpreter::execute()
{ {
if (!can_execute) if (!settings.can_execute)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot execute mutations interpreter because can_execute flag set to false"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot execute mutations interpreter because can_execute flag set to false");
QueryPlan plan; QueryPlan plan;


@ -36,30 +36,44 @@ ASTPtr getPartitionAndPredicateExpressionForMutationCommand(
/// to this data. /// to this data.
class MutationsInterpreter class MutationsInterpreter
{ {
private:
struct Stage; struct Stage;
public: public:
struct Settings
{
explicit Settings(bool can_execute_) : can_execute(can_execute_) {}
/// If false only analyze mutation expressions.
bool can_execute = false;
/// Whether all columns should be returned, not just updated
bool return_all_columns = false;
/// Whether we should return mutated or all existing rows
bool return_mutated_rows = false;
/// Whether we should filter out rows deleted by lightweight DELETE.
bool apply_deleted_mask = true;
/// Whether we should recalculate skip indexes, TTL expressions, etc. that depend on updated columns.
bool recalculate_dependencies_of_updated_columns = true;
};
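For context, the point of the new Settings struct is that call sites name exactly the flags they override instead of passing a row of positional bools. A tiny, self-contained illustration of that design choice follows, with a hypothetical Interpreter type standing in for the real MutationsInterpreter.

```cpp
#include <iostream>

// Sketch of the refactoring applied above: a settings struct replaces a trailing
// list of bool parameters, so call sites spell out exactly what they override.
struct Settings
{
    explicit Settings(bool can_execute_) : can_execute(can_execute_) {}

    bool can_execute = false;
    bool return_all_columns = false;
    bool return_mutated_rows = false;
    bool apply_deleted_mask = true;
    bool recalculate_dependencies_of_updated_columns = true;
};

struct Interpreter
{
    explicit Interpreter(Settings settings_) : settings(settings_) {}

    void validate() const
    {
        std::cout << std::boolalpha
                  << "can_execute=" << settings.can_execute
                  << ", return_all_columns=" << settings.return_all_columns << '\n';
    }

    Settings settings;
};

int main()
{
    // Compare with the old style: Interpreter(storage, snapshot, commands, ctx, false, true, false)
    Settings settings(/* can_execute_ = */ false);
    settings.return_all_columns = true;

    Interpreter(settings).validate();
}
```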
/// Storage to mutate, array of mutations commands and context. If you really want to execute mutation /// Storage to mutate, array of mutations commands and context. If you really want to execute mutation
/// use can_execute = true, in other cases (validation, amount of commands) it can be false /// use can_execute = true, in other cases (validation, amount of commands) it can be false
MutationsInterpreter( MutationsInterpreter(
StoragePtr storage_, StoragePtr storage_,
const StorageMetadataPtr & metadata_snapshot_, StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_, MutationCommands commands_,
ContextPtr context_, ContextPtr context_,
bool can_execute_, Settings settings_);
bool return_all_columns_ = false,
bool return_mutated_rows_ = false);
/// Special case for *MergeTree /// Special case for *MergeTree
MutationsInterpreter( MutationsInterpreter(
MergeTreeData & storage_, MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_, MergeTreeData::DataPartPtr source_part_,
const StorageMetadataPtr & metadata_snapshot_, StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_, MutationCommands commands_,
Names available_columns_,
ContextPtr context_, ContextPtr context_,
bool can_execute_, Settings settings_);
bool return_all_columns_ = false,
bool return_mutated_rows_ = false);
void validate(); void validate();
size_t evaluateCommandsSize(); size_t evaluateCommandsSize();
@ -93,8 +107,6 @@ public:
MutationKind::MutationKindEnum getMutationKind() const { return mutation_kind.mutation_kind; } MutationKind::MutationKindEnum getMutationKind() const { return mutation_kind.mutation_kind; }
void setApplyDeletedMask(bool apply) { apply_deleted_mask = apply; }
/// Internal class which represents a data part for MergeTree /// Internal class which represents a data part for MergeTree
/// or just storage for other storages. /// or just storage for other storages.
/// The main idea is to create a dedicated reading from MergeTree part. /// The main idea is to create a dedicated reading from MergeTree part.
@ -131,12 +143,11 @@ public:
private: private:
MutationsInterpreter( MutationsInterpreter(
Source source_, Source source_,
const StorageMetadataPtr & metadata_snapshot_, StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_, MutationCommands commands_,
Names available_columns_,
ContextPtr context_, ContextPtr context_,
bool can_execute_, Settings settings_);
bool return_all_columns_,
bool return_mutated_rows_);
void prepare(bool dry_run); void prepare(bool dry_run);
@ -151,12 +162,11 @@ private:
Source source; Source source;
StorageMetadataPtr metadata_snapshot; StorageMetadataPtr metadata_snapshot;
MutationCommands commands; MutationCommands commands;
Names available_columns;
ContextPtr context; ContextPtr context;
bool can_execute; Settings settings;
SelectQueryOptions select_limits; SelectQueryOptions select_limits;
bool apply_deleted_mask = true;
/// A sequence of mutation commands is executed as a sequence of stages. Each stage consists of several /// A sequence of mutation commands is executed as a sequence of stages. Each stage consists of several
/// filters, followed by updating values of some columns. Commands can reuse expressions calculated by the /// filters, followed by updating values of some columns. Commands can reuse expressions calculated by the
/// previous commands in the same stage, but at the end of each stage intermediate columns are thrown away /// previous commands in the same stage, but at the end of each stage intermediate columns are thrown away
@ -206,12 +216,6 @@ private:
/// Columns, that we need to read for calculation of skip indices, projections or TTL expressions. /// Columns, that we need to read for calculation of skip indices, projections or TTL expressions.
ColumnDependencies dependencies; ColumnDependencies dependencies;
// whether all columns should be returned, not just updated
bool return_all_columns;
// whether we should return mutated or all existing rows
bool return_mutated_rows;
}; };
} }
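The constructor refactoring above folds a trail of positional boolean arguments into a single `MutationsInterpreter::Settings` struct, so call sites name the flags they set. A rough, standalone illustration of that pattern follows; `InterpreterSettings` and `runMutation` are simplified stand-ins invented for the sketch, not the actual ClickHouse classes.

```cpp
#include <iostream>

// Simplified stand-in for MutationsInterpreter::Settings from the diff above.
struct InterpreterSettings
{
    explicit InterpreterSettings(bool can_execute_) : can_execute(can_execute_) {}

    bool can_execute = false;         // if false, only analyze mutation expressions
    bool return_all_columns = false;  // return all columns, not just updated ones
    bool return_mutated_rows = false; // return only mutated rows instead of all rows
    bool apply_deleted_mask = true;   // filter out rows deleted by lightweight DELETE
};

// Hypothetical consumer that previously would have taken four separate bools.
void runMutation(const InterpreterSettings & settings)
{
    if (!settings.can_execute)
    {
        std::cout << "dry run: validation only\n";
        return;
    }
    std::cout << "executing mutation, apply_deleted_mask=" << settings.apply_deleted_mask << "\n";
}

int main()
{
    InterpreterSettings settings(/*can_execute_=*/ true);
    settings.return_all_columns = true; // flags are named at the call site instead of positional bools
    runMutation(settings);
}
```

The design choice is the same as in the diff: adding a new option later means adding one field with a default, instead of growing every constructor signature and call site.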

View File

@ -0,0 +1,100 @@
#include "Parsers/IAST_fwd.h"
#include <Interpreters/MutationsNonDeterministicHelpers.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTAlterQuery.h>
#include <Storages/MutationCommands.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/ExecuteScalarSubqueriesVisitor.h>
#include <Interpreters/addTypeConversionToAST.h>
#include <Interpreters/Context.h>
#include <Functions/FunctionFactory.h>
namespace DB
{
namespace
{
/// Helps to detect situations where non-deterministic functions may be used in mutations.
class FirstNonDeterministicFunctionMatcher
{
public:
struct Data
{
ContextPtr context;
FirstNonDeterministicFunctionResult result;
};
static bool needChildVisit(const ASTPtr & /*node*/, const ASTPtr & /*child*/)
{
return true;
}
static void visit(const ASTPtr & node, Data & data)
{
if (data.result.nondeterministic_function_name || data.result.subquery)
return;
if (node->as<ASTSelectQuery>())
{
/// We cannot determine whether a subquery is deterministic or not,
/// so we do not allow subqueries in mutations without allow_nondeterministic_mutations=1
data.result.subquery = true;
}
else if (const auto * function = typeid_cast<const ASTFunction *>(node.get()))
{
/// Whether a lambda expression is deterministic is completely determined
/// by the contents of its definition, so we just descend into it.
if (function->name != "lambda")
{
/// NOTE It may be an aggregate function, so get(...) may throw.
/// However, an aggregate function can be used only in a subquery, and we do not descend into subqueries.
const auto func = FunctionFactory::instance().get(function->name, data.context);
if (!func->isDeterministic())
data.result.nondeterministic_function_name = func->getName();
}
}
}
};
using FirstNonDeterministicFunctionFinder = InDepthNodeVisitor<FirstNonDeterministicFunctionMatcher, true>;
using FirstNonDeterministicFunctionData = FirstNonDeterministicFunctionMatcher::Data;
}
FirstNonDeterministicFunctionResult findFirstNonDeterministicFunction(const MutationCommand & command, ContextPtr context)
{
FirstNonDeterministicFunctionMatcher::Data finder_data{context, {}};
switch (command.type)
{
case MutationCommand::UPDATE:
{
auto update_assignments_ast = command.ast->as<const ASTAlterCommand &>().update_assignments->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(update_assignments_ast);
if (finder_data.result.nondeterministic_function_name)
return finder_data.result;
/// Currently UPDATE and DELETE both always have predicates so we can use fallthrough
[[fallthrough]];
}
case MutationCommand::DELETE:
{
auto predicate_ast = command.predicate->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(predicate_ast);
return finder_data.result;
}
default:
break;
}
return {};
}
}

View File

@ -0,0 +1,21 @@
#pragma once
#include <Interpreters/Context_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Core/Types.h>
namespace DB
{
struct MutationCommand;
struct FirstNonDeterministicFunctionResult
{
std::optional<String> nondeterministic_function_name;
bool subquery = false;
};
/// Searches the expressions of a mutation command for non-deterministic functions
/// and for subqueries, which may also be non-deterministic.
FirstNonDeterministicFunctionResult findFirstNonDeterministicFunction(const MutationCommand & command, ContextPtr context);
}
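The new helper visits the AST of an ALTER UPDATE/DELETE command and reports the first non-deterministic function it meets, or flags any subquery. A simplified, self-contained analogue of that traversal is sketched below; the toy `Node` tree, the hard-coded `non_deterministic` name set and the plain recursion stand in for ClickHouse's AST, FunctionFactory and InDepthNodeVisitor, so this only illustrates the walk, not the real API.

```cpp
#include <iostream>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <vector>

// Toy AST node: either a function call with children or a subquery marker.
struct Node
{
    std::string function_name; // empty for non-function nodes
    bool is_subquery = false;
    std::vector<std::shared_ptr<Node>> children;
};

struct Result
{
    std::optional<std::string> nondeterministic_function_name;
    bool subquery = false;
};

// Names assumed non-deterministic for this sketch; the real code asks FunctionFactory.
static const std::set<std::string> non_deterministic = {"now", "rand", "generateUUIDv4"};

void visit(const std::shared_ptr<Node> & node, Result & result)
{
    if (!node || result.nondeterministic_function_name || result.subquery)
        return;

    if (node->is_subquery)
    {
        // We cannot prove a subquery is deterministic, so just flag it.
        result.subquery = true;
        return;
    }

    if (!node->function_name.empty() && node->function_name != "lambda"
        && non_deterministic.count(node->function_name))
    {
        result.nondeterministic_function_name = node->function_name;
        return;
    }

    for (const auto & child : node->children)
        visit(child, result);
}

int main()
{
    // UPDATE x = plus(y, now()) -- contains a non-deterministic call.
    auto expr = std::make_shared<Node>(Node{"plus", false, {
        std::make_shared<Node>(Node{"", false, {}}),
        std::make_shared<Node>(Node{"now", false, {}})}});

    Result result;
    visit(expr, result);
    if (result.nondeterministic_function_name)
        std::cout << "first non-deterministic function: " << *result.nondeterministic_function_name << "\n";
}
```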

View File

@ -143,28 +143,58 @@ std::shared_ptr<TSystemLog> createSystemLog(
"If 'engine' is specified for system table, PARTITION BY parameters should " "If 'engine' is specified for system table, PARTITION BY parameters should "
"be specified directly inside 'engine' and 'partition_by' setting doesn't make sense"); "be specified directly inside 'engine' and 'partition_by' setting doesn't make sense");
if (config.has(config_prefix + ".ttl")) if (config.has(config_prefix + ".ttl"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "If 'engine' is specified for system table, " throw Exception(ErrorCodes::BAD_ARGUMENTS,
"TTL parameters should be specified directly inside 'engine' and 'ttl' setting doesn't make sense"); "If 'engine' is specified for system table, TTL parameters should "
"be specified directly inside 'engine' and 'ttl' setting doesn't make sense");
if (config.has(config_prefix + ".order_by"))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"If 'engine' is specified for system table, ORDER BY parameters should "
"be specified directly inside 'engine' and 'order_by' setting doesn't make sense");
if (config.has(config_prefix + ".storage_policy")) if (config.has(config_prefix + ".storage_policy"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "If 'engine' is specified for system table, SETTINGS storage_policy = '...' " throw Exception(ErrorCodes::BAD_ARGUMENTS,
"should be specified directly inside 'engine' and 'storage_policy' setting doesn't make sense"); "If 'engine' is specified for system table, SETTINGS storage_policy = '...' should "
"be specified directly inside 'engine' and 'storage_policy' setting doesn't make sense");
if (config.has(config_prefix + ".settings"))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"If 'engine' is specified for system table, SETTINGS parameters should "
"be specified directly inside 'engine' and 'settings' setting doesn't make sense");
engine = config.getString(config_prefix + ".engine"); engine = config.getString(config_prefix + ".engine");
} }
else else
{ {
String partition_by = config.getString(config_prefix + ".partition_by", "toYYYYMM(event_date)"); /// ENGINE expr is necessary.
engine = "ENGINE = MergeTree"; engine = "ENGINE = MergeTree";
/// PARTITION expr is not necessary.
String partition_by = config.getString(config_prefix + ".partition_by", "toYYYYMM(event_date)");
if (!partition_by.empty()) if (!partition_by.empty())
engine += " PARTITION BY (" + partition_by + ")"; engine += " PARTITION BY (" + partition_by + ")";
/// TTL expr is not necessary.
String ttl = config.getString(config_prefix + ".ttl", ""); String ttl = config.getString(config_prefix + ".ttl", "");
if (!ttl.empty()) if (!ttl.empty())
engine += " TTL " + ttl; engine += " TTL " + ttl;
engine += " ORDER BY "; /// ORDER BY expr is necessary.
engine += TSystemLog::getDefaultOrderBy(); String order_by = config.getString(config_prefix + ".order_by", TSystemLog::getDefaultOrderBy());
engine += " ORDER BY (" + order_by + ")";
/// SETTINGS expr is not necessary.
/// https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#settings
///
/// STORAGE POLICY expr is retained for backward compatibility.
String storage_policy = config.getString(config_prefix + ".storage_policy", ""); String storage_policy = config.getString(config_prefix + ".storage_policy", "");
if (!storage_policy.empty()) String settings = config.getString(config_prefix + ".settings", "");
engine += " SETTINGS storage_policy = " + quoteString(storage_policy); if (!storage_policy.empty() || !settings.empty())
{
engine += " SETTINGS";
/// If 'storage_policy' is specified in both places, the value from 'settings' is preferred.
if (!storage_policy.empty())
engine += " storage_policy = " + quoteString(storage_policy);
if (!settings.empty())
engine += (storage_policy.empty() ? " " : ", ") + settings;
}
} }
/// Validate engine definition syntax to prevent some configuration errors. /// Validate engine definition syntax to prevent some configuration errors.
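The rewritten default branch assembles the ENGINE clause piece by piece from separate config keys, with ORDER BY now configurable and SETTINGS merged from both 'storage_policy' and 'settings'. Below is a standalone sketch of the same string assembly with the Poco config lookups replaced by plain parameters; `buildEngineClause` is an invented name for illustration, not the SystemLog code itself.

```cpp
#include <iostream>
#include <string>

// Mirrors the default-engine branch above: everything except ENGINE and ORDER BY is optional.
std::string buildEngineClause(
    const std::string & partition_by,
    const std::string & ttl,
    const std::string & order_by,
    const std::string & storage_policy,
    const std::string & settings)
{
    std::string engine = "ENGINE = MergeTree";

    if (!partition_by.empty())
        engine += " PARTITION BY (" + partition_by + ")";

    if (!ttl.empty())
        engine += " TTL " + ttl;

    engine += " ORDER BY (" + order_by + ")";

    if (!storage_policy.empty() || !settings.empty())
    {
        engine += " SETTINGS";
        if (!storage_policy.empty())
            engine += " storage_policy = '" + storage_policy + "'";
        if (!settings.empty())
            engine += (storage_policy.empty() ? " " : ", ") + settings;
    }

    return engine;
}

int main()
{
    std::cout << buildEngineClause(
        "toYYYYMM(event_date)",
        "event_date + INTERVAL 30 DAY",
        "event_date, event_time",
        "policy_for_logs",
        "ttl_only_drop_parts = 1") << "\n";
    // ENGINE = MergeTree PARTITION BY (toYYYYMM(event_date)) TTL event_date + INTERVAL 30 DAY
    //   ORDER BY (event_date, event_time) SETTINGS storage_policy = 'policy_for_logs', ttl_only_drop_parts = 1
}
```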

View File

@ -455,7 +455,11 @@ void executeScalarSubqueries(
ASTPtr & query, ContextPtr context, size_t subquery_depth, Scalars & scalars, Scalars & local_scalars, bool only_analyze, bool is_create_parameterized_view) ASTPtr & query, ContextPtr context, size_t subquery_depth, Scalars & scalars, Scalars & local_scalars, bool only_analyze, bool is_create_parameterized_view)
{ {
LogAST log; LogAST log;
ExecuteScalarSubqueriesVisitor::Data visitor_data{WithContext{context}, subquery_depth, scalars, local_scalars, only_analyze, is_create_parameterized_view}; ExecuteScalarSubqueriesVisitor::Data visitor_data{
WithContext{context}, subquery_depth, scalars,
local_scalars, only_analyze, is_create_parameterized_view,
/*replace_only_to_literals=*/ false, /*max_literal_size=*/ std::nullopt};
ExecuteScalarSubqueriesVisitor(visitor_data, log.stream()).visit(query); ExecuteScalarSubqueriesVisitor(visitor_data, log.stream()).visit(query);
} }

View File

@ -666,9 +666,13 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal)); interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal));
if (context->getCurrentTransaction() && !interpreter->supportsTransactions() && const auto & query_settings = context->getSettingsRef();
context->getSettingsRef().throw_on_unsupported_query_inside_transaction) if (context->getCurrentTransaction() && query_settings.throw_on_unsupported_query_inside_transaction)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for this type of query ({})", ast->getID()); {
if (!interpreter->supportsTransactions())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for this type of query ({})", ast->getID());
}
if (!interpreter->ignoreQuota() && !quota_checked) if (!interpreter->ignoreQuota() && !quota_checked)
{ {

View File

@ -2,6 +2,7 @@
#include <Core/ProtocolDefines.h> #include <Core/ProtocolDefines.h>
#include <Common/logger_useful.h> #include <Common/logger_useful.h>
#include <Common/ProfileEvents.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
@ -73,6 +74,12 @@
#include <Planner/CollectColumnIdentifiers.h> #include <Planner/CollectColumnIdentifiers.h>
#include <Planner/PlannerQueryProcessingInfo.h> #include <Planner/PlannerQueryProcessingInfo.h>
namespace ProfileEvents
{
extern const Event SelectQueriesWithSubqueries;
extern const Event QueriesWithSubqueries;
}
namespace DB namespace DB
{ {
@ -1155,6 +1162,9 @@ void Planner::buildPlanForUnionNode()
void Planner::buildPlanForQueryNode() void Planner::buildPlanForQueryNode()
{ {
ProfileEvents::increment(ProfileEvents::SelectQueriesWithSubqueries);
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
auto & query_node = query_tree->as<QueryNode &>(); auto & query_node = query_tree->as<QueryNode &>();
const auto & query_context = planner_context->getQueryContext(); const auto & query_context = planner_context->getQueryContext();
@ -1192,13 +1202,14 @@ void Planner::buildPlanForQueryNode()
const auto & settings = query_context->getSettingsRef(); const auto & settings = query_context->getSettingsRef();
if (planner_context->getTableExpressionNodeToData().size() > 1 /// Check support for JOIN for parallel replicas with custom key
&& (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas > 0)) if (planner_context->getTableExpressionNodeToData().size() > 1)
{ {
if (settings.allow_experimental_parallel_reading_from_replicas == 1) if (settings.allow_experimental_parallel_reading_from_replicas == 1 || !settings.parallel_replicas_custom_key.value.empty())
{ {
LOG_WARNING( LOG_WARNING(
&Poco::Logger::get("Planner"), "JOINs are not supported with parallel replicas. Query will be executed without using them."); &Poco::Logger::get("Planner"),
"JOINs are not supported with parallel replicas. Query will be executed without using them.");
auto & mutable_context = planner_context->getMutableQueryContext(); auto & mutable_context = planner_context->getMutableQueryContext();
mutable_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); mutable_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));

View File

@ -75,7 +75,7 @@ void ArrowBlockOutputFormat::finalizeImpl()
{ {
if (!writer) if (!writer)
{ {
const Block & header = getPort(PortKind::Main).getHeader(); Block header = materializeBlock(getPort(PortKind::Main).getHeader());
consume(Chunk(header.getColumns(), 0)); consume(Chunk(header.getColumns(), 0));
} }

View File

@ -76,7 +76,7 @@ public:
bool checkEndOfRow(); bool checkEndOfRow();
bool checkForSuffixImpl(bool check_eof); bool checkForSuffixImpl(bool check_eof);
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); } inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf, true); }
EscapingRule getEscapingRule() const override { return format_settings.custom.escaping_rule; } EscapingRule getEscapingRule() const override { return format_settings.custom.escaping_rule; }

View File

@ -105,7 +105,7 @@ void ParquetBlockOutputFormat::finalizeImpl()
if (!file_writer) if (!file_writer)
{ {
const Block & header = getPort(PortKind::Main).getHeader(); Block header = materializeBlock(getPort(PortKind::Main).getHeader());
write(Chunk(header.getColumns(), 0), 1); write(Chunk(header.getColumns(), 0), 1);
} }

View File

@ -582,7 +582,7 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes &
ContextPtr context = reading->getContext(); ContextPtr context = reading->getContext();
MergeTreeDataSelectExecutor reader(reading->getMergeTreeData()); MergeTreeDataSelectExecutor reader(reading->getMergeTreeData());
auto ordinary_reading_select_result = reading->selectRangesToRead(parts); auto ordinary_reading_select_result = reading->selectRangesToRead(parts, /* alter_conversions = */ {});
size_t ordinary_reading_marks = ordinary_reading_select_result->marks(); size_t ordinary_reading_marks = ordinary_reading_select_result->marks();
/// Selecting best candidate. /// Selecting best candidate.
@ -640,7 +640,8 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes &
query_info_copy.prewhere_info = nullptr; query_info_copy.prewhere_info = nullptr;
projection_reading = reader.readFromParts( projection_reading = reader.readFromParts(
{}, /* parts = */ {},
/* alter_conversions = */ {},
best_candidate->dag->getRequiredColumnsNames(), best_candidate->dag->getRequiredColumnsNames(),
proj_snapshot, proj_snapshot,
query_info_copy, query_info_copy,

View File

@ -110,10 +110,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
return false; return false;
if (query.dag) if (query.dag)
{
query.dag->removeUnusedActions(); query.dag->removeUnusedActions();
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Query DAG: {}", query.dag->dumpDAG());
}
} }
std::list<NormalProjectionCandidate> candidates; std::list<NormalProjectionCandidate> candidates;
@ -125,12 +122,9 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
ContextPtr context = reading->getContext(); ContextPtr context = reading->getContext();
MergeTreeDataSelectExecutor reader(reading->getMergeTreeData()); MergeTreeDataSelectExecutor reader(reading->getMergeTreeData());
auto ordinary_reading_select_result = reading->selectRangesToRead(parts); auto ordinary_reading_select_result = reading->selectRangesToRead(parts, /* alter_conversions = */ {});
size_t ordinary_reading_marks = ordinary_reading_select_result->marks(); size_t ordinary_reading_marks = ordinary_reading_select_result->marks();
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"),
// "Marks for ordinary reading {}", ordinary_reading_marks);
std::shared_ptr<PartitionIdToMaxBlock> max_added_blocks = getMaxAddedBlocks(reading); std::shared_ptr<PartitionIdToMaxBlock> max_added_blocks = getMaxAddedBlocks(reading);
for (const auto * projection : normal_projections) for (const auto * projection : normal_projections)
@ -152,9 +146,6 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
if (!analyzed) if (!analyzed)
continue; continue;
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"),
// "Marks for projection {} {}", projection->name ,candidate.sum_marks);
if (candidate.sum_marks >= ordinary_reading_marks) if (candidate.sum_marks >= ordinary_reading_marks)
continue; continue;
@ -173,14 +164,12 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
storage_snapshot->storage, storage_snapshot->metadata, storage_snapshot->object_columns); //, storage_snapshot->data); storage_snapshot->storage, storage_snapshot->metadata, storage_snapshot->object_columns); //, storage_snapshot->data);
proj_snapshot->addProjection(best_candidate->projection); proj_snapshot->addProjection(best_candidate->projection);
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Proj snapshot {}",
// proj_snapshot->getColumns(GetColumnsOptions::Kind::All).toString());
auto query_info_copy = query_info; auto query_info_copy = query_info;
query_info_copy.prewhere_info = nullptr; query_info_copy.prewhere_info = nullptr;
auto projection_reading = reader.readFromParts( auto projection_reading = reader.readFromParts(
{}, /*parts=*/ {},
/*alter_conversions=*/ {},
required_columns, required_columns,
proj_snapshot, proj_snapshot,
query_info_copy, query_info_copy,
@ -201,9 +190,6 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
if (has_ordinary_parts) if (has_ordinary_parts)
reading->setAnalyzedResult(std::move(best_candidate->merge_tree_ordinary_select_result_ptr)); reading->setAnalyzedResult(std::move(best_candidate->merge_tree_ordinary_select_result_ptr));
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Projection reading header {}",
// projection_reading->getOutputStream().header.dumpStructure());
projection_reading->setStepDescription(best_candidate->projection->name); projection_reading->setStepDescription(best_candidate->projection->name);
auto & projection_reading_node = nodes.emplace_back(QueryPlan::Node{.step = std::move(projection_reading)}); auto & projection_reading_node = nodes.emplace_back(QueryPlan::Node{.step = std::move(projection_reading)});

View File

@ -248,7 +248,7 @@ bool analyzeProjectionCandidate(
if (!normal_parts.empty()) if (!normal_parts.empty())
{ {
auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts)); auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts), /* alter_conversions = */ {});
if (normal_result_ptr->error()) if (normal_result_ptr->error())
return false; return false;

View File

@ -142,8 +142,10 @@ std::pair<std::vector<Values>, std::vector<RangesInDataParts>> split(RangesInDat
{ {
result_layers.back().emplace_back( result_layers.back().emplace_back(
parts[part_idx].data_part, parts[part_idx].data_part,
parts[part_idx].alter_conversions,
parts[part_idx].part_index_in_query, parts[part_idx].part_index_in_query,
MarkRanges{{current_part_range_begin[part_idx], current.range.end}}); MarkRanges{{current_part_range_begin[part_idx], current.range.end}});
current_part_range_begin.erase(part_idx); current_part_range_begin.erase(part_idx);
current_part_range_end.erase(part_idx); current_part_range_end.erase(part_idx);
continue; continue;
@ -170,8 +172,10 @@ std::pair<std::vector<Values>, std::vector<RangesInDataParts>> split(RangesInDat
{ {
result_layers.back().emplace_back( result_layers.back().emplace_back(
parts[part_idx].data_part, parts[part_idx].data_part,
parts[part_idx].alter_conversions,
parts[part_idx].part_index_in_query, parts[part_idx].part_index_in_query,
MarkRanges{{current_part_range_begin[part_idx], last_mark + 1}}); MarkRanges{{current_part_range_begin[part_idx], last_mark + 1}});
current_part_range_begin[part_idx] = current_part_range_end[part_idx]; current_part_range_begin[part_idx] = current_part_range_end[part_idx];
} }
} }

View File

@ -171,6 +171,7 @@ void ReadFromMergeTree::AnalysisResult::checkLimits(const Settings & settings, c
ReadFromMergeTree::ReadFromMergeTree( ReadFromMergeTree::ReadFromMergeTree(
MergeTreeData::DataPartsVector parts_, MergeTreeData::DataPartsVector parts_,
std::vector<AlterConversionsPtr> alter_conversions_,
Names real_column_names_, Names real_column_names_,
Names virt_column_names_, Names virt_column_names_,
const MergeTreeData & data_, const MergeTreeData & data_,
@ -191,6 +192,7 @@ ReadFromMergeTree::ReadFromMergeTree(
virt_column_names_)}) virt_column_names_)})
, reader_settings(getMergeTreeReaderSettings(context_, query_info_)) , reader_settings(getMergeTreeReaderSettings(context_, query_info_))
, prepared_parts(std::move(parts_)) , prepared_parts(std::move(parts_))
, alter_conversions_for_parts(std::move(alter_conversions_))
, real_column_names(std::move(real_column_names_)) , real_column_names(std::move(real_column_names_))
, virt_column_names(std::move(virt_column_names_)) , virt_column_names(std::move(virt_column_names_))
, data(data_) , data(data_)
@ -307,8 +309,7 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
reader_settings, reader_settings,
required_columns, required_columns,
virt_column_names, virt_column_names,
min_marks_for_concurrent_read min_marks_for_concurrent_read);
);
Pipes pipes; Pipes pipes;
const auto & settings = context->getSettingsRef(); const auto & settings = context->getSettingsRef();
@ -459,7 +460,7 @@ ProcessorPtr ReadFromMergeTree::createSource(
bool set_rows_approx = !is_parallel_reading_from_replicas && !reader_settings.read_in_order; bool set_rows_approx = !is_parallel_reading_from_replicas && !reader_settings.read_in_order;
auto algorithm = std::make_unique<Algorithm>( auto algorithm = std::make_unique<Algorithm>(
data, storage_snapshot, part.data_part, max_block_size, preferred_block_size_bytes, data, storage_snapshot, part.data_part, part.alter_conversions, max_block_size, preferred_block_size_bytes,
preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info, preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info,
actions_settings, reader_settings, pool, virt_column_names, part.part_index_in_query, has_limit_below_one_block); actions_settings, reader_settings, pool, virt_column_names, part.part_index_in_query, has_limit_below_one_block);
@ -808,7 +809,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder(
} }
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction); ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
new_parts.emplace_back(part.data_part, part.part_index_in_query, std::move(ranges_to_get_from_part)); new_parts.emplace_back(part.data_part, part.alter_conversions, part.part_index_in_query, std::move(ranges_to_get_from_part));
} }
splitted_parts_and_ranges.emplace_back(std::move(new_parts)); splitted_parts_and_ranges.emplace_back(std::move(new_parts));
@ -1000,7 +1001,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
{ {
for (auto part_it = parts_to_merge_ranges[range_index]; part_it != parts_to_merge_ranges[range_index + 1]; ++part_it) for (auto part_it = parts_to_merge_ranges[range_index]; part_it != parts_to_merge_ranges[range_index + 1]; ++part_it)
{ {
new_parts.emplace_back(part_it->data_part, part_it->part_index_in_query, part_it->ranges); new_parts.emplace_back(part_it->data_part, part_it->alter_conversions, part_it->part_index_in_query, part_it->ranges);
} }
} }
@ -1111,10 +1112,13 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
return Pipe::unitePipes(std::move(partition_pipes)); return Pipe::unitePipes(std::move(partition_pipes));
} }
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(MergeTreeData::DataPartsVector parts) const MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions) const
{ {
return selectRangesToRead( return selectRangesToRead(
std::move(parts), std::move(parts),
std::move(alter_conversions),
prewhere_info, prewhere_info,
filter_nodes, filter_nodes,
storage_snapshot->metadata, storage_snapshot->metadata,
@ -1131,6 +1135,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(Merge
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeData::DataPartsVector parts, MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const PrewhereInfoPtr & prewhere_info, const PrewhereInfoPtr & prewhere_info,
const ActionDAGNodes & added_filter_nodes, const ActionDAGNodes & added_filter_nodes,
const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot_base,
@ -1182,7 +1187,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
updated_query_info_with_filter_dag.filter_actions_dag = ActionsDAG::buildFilterActionsDAG(nodes, node_name_to_input_node_column, context); updated_query_info_with_filter_dag.filter_actions_dag = ActionsDAG::buildFilterActionsDAG(nodes, node_name_to_input_node_column, context);
return selectRangesToReadImpl( return selectRangesToReadImpl(
parts, std::move(parts),
std::move(alter_conversions),
metadata_snapshot_base, metadata_snapshot_base,
metadata_snapshot, metadata_snapshot,
updated_query_info_with_filter_dag, updated_query_info_with_filter_dag,
@ -1196,7 +1202,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
} }
return selectRangesToReadImpl( return selectRangesToReadImpl(
parts, std::move(parts),
std::move(alter_conversions),
metadata_snapshot_base, metadata_snapshot_base,
metadata_snapshot, metadata_snapshot,
query_info, query_info,
@ -1211,6 +1218,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl( MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
MergeTreeData::DataPartsVector parts, MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot_base,
const StorageMetadataPtr & metadata_snapshot, const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info, const SelectQueryInfo & query_info,
@ -1284,6 +1292,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
{ {
MergeTreeDataSelectExecutor::filterPartsByPartition( MergeTreeDataSelectExecutor::filterPartsByPartition(
parts, parts,
alter_conversions,
part_values, part_values,
metadata_snapshot_base, metadata_snapshot_base,
data, data,
@ -1321,6 +1330,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
result.parts_with_ranges = MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipIndexes( result.parts_with_ranges = MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipIndexes(
std::move(parts), std::move(parts),
std::move(alter_conversions),
metadata_snapshot, metadata_snapshot,
query_info, query_info,
context, context,
@ -1491,7 +1501,7 @@ bool ReadFromMergeTree::requestOutputEachPartitionThroughSeparatePort()
ReadFromMergeTree::AnalysisResult ReadFromMergeTree::getAnalysisResult() const ReadFromMergeTree::AnalysisResult ReadFromMergeTree::getAnalysisResult() const
{ {
auto result_ptr = analyzed_result_ptr ? analyzed_result_ptr : selectRangesToRead(prepared_parts); auto result_ptr = analyzed_result_ptr ? analyzed_result_ptr : selectRangesToRead(prepared_parts, alter_conversions_for_parts);
if (std::holds_alternative<std::exception_ptr>(result_ptr->result)) if (std::holds_alternative<std::exception_ptr>(result_ptr->result))
std::rethrow_exception(std::get<std::exception_ptr>(result_ptr->result)); std::rethrow_exception(std::get<std::exception_ptr>(result_ptr->result));
@ -1720,7 +1730,6 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons
for (const auto & processor : pipe.getProcessors()) for (const auto & processor : pipe.getProcessors())
processors.emplace_back(processor); processors.emplace_back(processor);
pipeline.init(std::move(pipe)); pipeline.init(std::move(pipe));
// Attach QueryIdHolder if needed // Attach QueryIdHolder if needed
if (query_id_holder) if (query_id_holder)

View File

@ -5,6 +5,7 @@
#include <Storages/SelectQueryInfo.h> #include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MergeTreeData.h> #include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeReadPool.h> #include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/AlterConversions.h>
namespace DB namespace DB
{ {
@ -97,6 +98,7 @@ public:
ReadFromMergeTree( ReadFromMergeTree(
MergeTreeData::DataPartsVector parts_, MergeTreeData::DataPartsVector parts_,
std::vector<AlterConversionsPtr> alter_conversions_,
Names real_column_names_, Names real_column_names_,
Names virt_column_names_, Names virt_column_names_,
const MergeTreeData & data_, const MergeTreeData & data_,
@ -134,6 +136,7 @@ public:
static MergeTreeDataSelectAnalysisResultPtr selectRangesToRead( static MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts, MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const PrewhereInfoPtr & prewhere_info, const PrewhereInfoPtr & prewhere_info,
const ActionDAGNodes & added_filter_nodes, const ActionDAGNodes & added_filter_nodes,
const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot_base,
@ -147,7 +150,9 @@ public:
bool sample_factor_column_queried, bool sample_factor_column_queried,
Poco::Logger * log); Poco::Logger * log);
MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(MergeTreeData::DataPartsVector parts) const; MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions) const;
ContextPtr getContext() const { return context; } ContextPtr getContext() const { return context; }
const SelectQueryInfo & getQueryInfo() const { return query_info; } const SelectQueryInfo & getQueryInfo() const { return query_info; }
@ -168,7 +173,12 @@ public:
bool hasAnalyzedResult() const { return analyzed_result_ptr != nullptr; } bool hasAnalyzedResult() const { return analyzed_result_ptr != nullptr; }
void setAnalyzedResult(MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); } void setAnalyzedResult(MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); }
void resetParts(MergeTreeData::DataPartsVector parts) { prepared_parts = std::move(parts); }
void resetParts(MergeTreeData::DataPartsVector parts)
{
prepared_parts = std::move(parts);
alter_conversions_for_parts = {};
}
const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; } const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; }
const MergeTreeData & getMergeTreeData() const { return data; } const MergeTreeData & getMergeTreeData() const { return data; }
@ -179,6 +189,7 @@ public:
private: private:
static MergeTreeDataSelectAnalysisResultPtr selectRangesToReadImpl( static MergeTreeDataSelectAnalysisResultPtr selectRangesToReadImpl(
MergeTreeData::DataPartsVector parts, MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const StorageMetadataPtr & metadata_snapshot_base, const StorageMetadataPtr & metadata_snapshot_base,
const StorageMetadataPtr & metadata_snapshot, const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info, const SelectQueryInfo & query_info,
@ -202,6 +213,8 @@ private:
MergeTreeReaderSettings reader_settings; MergeTreeReaderSettings reader_settings;
MergeTreeData::DataPartsVector prepared_parts; MergeTreeData::DataPartsVector prepared_parts;
std::vector<AlterConversionsPtr> alter_conversions_for_parts;
Names real_column_names; Names real_column_names;
Names virt_column_names; Names virt_column_names;

View File

@ -34,7 +34,7 @@ bool RemoteQueryExecutorReadContext::checkBeforeTaskResume()
} }
void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, ResumeCallback suspend_callback) void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, SuspendCallback suspend_callback)
{ {
read_context.executor.sendQueryUnlocked(ClientInfo::QueryKind::SECONDARY_QUERY, async_callback); read_context.executor.sendQueryUnlocked(ClientInfo::QueryKind::SECONDARY_QUERY, async_callback);
read_context.is_query_sent = true; read_context.is_query_sent = true;

View File

@ -58,7 +58,7 @@ private:
RemoteQueryExecutorReadContext & read_context; RemoteQueryExecutorReadContext & read_context;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override; void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
}; };
std::atomic_bool is_in_progress = false; std::atomic_bool is_in_progress = false;

View File

@ -244,7 +244,13 @@ struct IcebergMetadataParser<Configuration, MetadataReadHelper>::Impl
const auto * str_col = assert_cast<const ColumnString *>(col_str.get()); const auto * str_col = assert_cast<const ColumnString *>(col_str.get());
for (size_t i = 0; i < str_col->size(); ++i) for (size_t i = 0; i < str_col->size(); ++i)
keys.emplace_back(str_col->getDataAt(i).toView()); {
const auto data_path = std::string(str_col->getDataAt(i).toView());
const auto pos = data_path.find(configuration.url.key);
if (pos == std::string::npos)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", configuration.url.key, data_path);
keys.emplace_back(data_path.substr(pos));
}
} }
return keys; return keys;

View File

@ -661,6 +661,12 @@ public:
return getStorageSnapshot(metadata_snapshot, query_context); return getStorageSnapshot(metadata_snapshot, query_context);
} }
/// Creates a storage snapshot but without holding data specific to the storage.
virtual StorageSnapshotPtr getStorageSnapshotWithoutData(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const
{
return getStorageSnapshot(metadata_snapshot, query_context);
}
/// A helper to implement read() /// A helper to implement read()
static void readFromPipe( static void readFromPipe(
QueryPlan & query_plan, QueryPlan & query_plan,

View File

@ -9,6 +9,13 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR; extern const int LOGICAL_ERROR;
} }
void AlterConversions::addMutationCommand(const MutationCommand & command)
{
/// Currently only RENAME_COLUMN is applied on the fly.
if (command.type == MutationCommand::Type::RENAME_COLUMN)
rename_map.emplace_back(RenamePair{command.rename_to, command.column_name});
}
bool AlterConversions::columnHasNewName(const std::string & old_name) const bool AlterConversions::columnHasNewName(const std::string & old_name) const
{ {
for (const auto & [new_name, prev_name] : rename_map) for (const auto & [new_name, prev_name] : rename_map)
@ -31,7 +38,6 @@ std::string AlterConversions::getColumnNewName(const std::string & old_name) con
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column {} was not renamed", old_name); throw Exception(ErrorCodes::LOGICAL_ERROR, "Column {} was not renamed", old_name);
} }
bool AlterConversions::isColumnRenamed(const std::string & new_name) const bool AlterConversions::isColumnRenamed(const std::string & new_name) const
{ {
for (const auto & [name_to, name_from] : rename_map) for (const auto & [name_to, name_from] : rename_map)
@ -41,6 +47,7 @@ bool AlterConversions::isColumnRenamed(const std::string & new_name) const
} }
return false; return false;
} }
/// Get column old name before rename (lookup by key in rename_map) /// Get column old name before rename (lookup by key in rename_map)
std::string AlterConversions::getColumnOldName(const std::string & new_name) const std::string AlterConversions::getColumnOldName(const std::string & new_name) const
{ {

View File

@ -1,5 +1,6 @@
#pragma once #pragma once
#include <Storages/MutationCommands.h>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
@ -7,20 +8,23 @@
namespace DB namespace DB
{ {
/// Alter conversions which should be applied on-fly for part. Build from of
/// the most recent mutation commands for part. Now we have only rename_map /// Alter conversions which should be applied on the fly for a part.
/// here (from ALTER_RENAME) command, because for all other type of alters /// Built from the most recent mutation commands for the part.
/// we can deduce conversions for part from difference between /// Now only ALTER RENAME COLUMN is applied.
/// part->getColumns() and storage->getColumns(). class AlterConversions : private boost::noncopyable
struct AlterConversions
{ {
public:
AlterConversions() = default;
struct RenamePair struct RenamePair
{ {
std::string rename_to; std::string rename_to;
std::string rename_from; std::string rename_from;
}; };
/// Rename map new_name -> old_name
std::vector<RenamePair> rename_map; void addMutationCommand(const MutationCommand & command);
const std::vector<RenamePair> & getRenameMap() const { return rename_map; }
/// Column was renamed (lookup by value in rename_map) /// Column was renamed (lookup by value in rename_map)
bool columnHasNewName(const std::string & old_name) const; bool columnHasNewName(const std::string & old_name) const;
@ -30,6 +34,12 @@ struct AlterConversions
bool isColumnRenamed(const std::string & new_name) const; bool isColumnRenamed(const std::string & new_name) const;
/// Get column old name before rename (lookup by key in rename_map) /// Get column old name before rename (lookup by key in rename_map)
std::string getColumnOldName(const std::string & new_name) const; std::string getColumnOldName(const std::string & new_name) const;
private:
/// Rename map new_name -> old_name.
std::vector<RenamePair> rename_map;
}; };
using AlterConversionsPtr = std::shared_ptr<const AlterConversions>;
} }
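`AlterConversions` is now a class built from mutation commands, shared as `std::shared_ptr<const AlterConversions>`, that records only RENAME COLUMN pairs and answers lookups in both directions. A simplified standalone analogue of that rename map is sketched below; `RenameCommand` and `RenameConversions` are reduced stand-ins invented for the sketch, not the ClickHouse types.

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Reduced stand-in for MutationCommand: only what the rename map needs.
struct RenameCommand
{
    std::string column_name; // old name
    std::string rename_to;   // new name
};

class RenameConversions
{
public:
    void addMutationCommand(const RenameCommand & command)
    {
        // Only RENAME COLUMN is applied on the fly, mirroring AlterConversions above.
        rename_map.push_back({command.rename_to, command.column_name});
    }

    // Lookup by new name: what was this column called before the rename?
    std::optional<std::string> getColumnOldName(const std::string & new_name) const
    {
        for (const auto & [to, from] : rename_map)
            if (to == new_name)
                return from;
        return std::nullopt;
    }

    // Lookup by old name: does the column have a newer name?
    bool columnHasNewName(const std::string & old_name) const
    {
        for (const auto & [to, from] : rename_map)
            if (from == old_name)
                return true;
        return false;
    }

private:
    struct RenamePair { std::string rename_to; std::string rename_from; };
    std::vector<RenamePair> rename_map; // new_name -> old_name
};

int main()
{
    RenameConversions conversions;
    conversions.addMutationCommand({"user_id", "uid"}); // ALTER TABLE ... RENAME COLUMN user_id TO uid

    std::cout << std::boolalpha << conversions.columnHasNewName("user_id") << "\n"; // true
    if (auto old_name = conversions.getColumnOldName("uid"))
        std::cout << *old_name << "\n"; // user_id
}
```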

View File

@ -6,6 +6,7 @@
#include <Core/NamesAndTypes.h> #include <Core/NamesAndTypes.h>
#include <Storages/IStorage.h> #include <Storages/IStorage.h>
#include <Storages/LightweightDeleteDescription.h> #include <Storages/LightweightDeleteDescription.h>
#include <Storages/MergeTree/AlterConversions.h>
#include <Storages/MergeTree/IDataPartStorage.h> #include <Storages/MergeTree/IDataPartStorage.h>
#include <Storages/MergeTree/MergeTreeDataPartState.h> #include <Storages/MergeTree/MergeTreeDataPartState.h>
#include <Storages/MergeTree/MergeTreeIndexGranularity.h> #include <Storages/MergeTree/MergeTreeIndexGranularity.h>
@ -92,6 +93,7 @@ public:
const MarkRanges & mark_ranges, const MarkRanges & mark_ranges,
UncompressedCache * uncompressed_cache, UncompressedCache * uncompressed_cache,
MarkCache * mark_cache, MarkCache * mark_cache,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReaderSettings & reader_settings_, const MergeTreeReaderSettings & reader_settings_,
const ValueSizeMap & avg_value_size_hints_, const ValueSizeMap & avg_value_size_hints_,
const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0; const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0;

View File

@ -52,7 +52,7 @@ public:
virtual const MergeTreeDataPartChecksums & getChecksums() const = 0; virtual const MergeTreeDataPartChecksums & getChecksums() const = 0;
virtual AlterConversions getAlterConversions() const = 0; virtual AlterConversionsPtr getAlterConversions() const = 0;
virtual size_t getMarksCount() const = 0; virtual size_t getMarksCount() const = 0;

View File

@ -1,3 +1,4 @@
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <DataTypes/NestedUtils.h> #include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeArray.h> #include <DataTypes/DataTypeArray.h>
#include <Common/escapeForFileName.h> #include <Common/escapeForFileName.h>
@ -5,8 +6,6 @@
#include <Columns/ColumnArray.h> #include <Columns/ColumnArray.h>
#include <Interpreters/inplaceBlockConversions.h> #include <Interpreters/inplaceBlockConversions.h>
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Common/typeid_cast.h>
namespace DB namespace DB
@ -133,9 +132,9 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
String IMergeTreeReader::getColumnNameInPart(const NameAndTypePair & required_column) const String IMergeTreeReader::getColumnNameInPart(const NameAndTypePair & required_column) const
{ {
auto name_in_storage = required_column.getNameInStorage(); auto name_in_storage = required_column.getNameInStorage();
if (alter_conversions.isColumnRenamed(name_in_storage)) if (alter_conversions->isColumnRenamed(name_in_storage))
{ {
name_in_storage = alter_conversions.getColumnOldName(name_in_storage); name_in_storage = alter_conversions->getColumnOldName(name_in_storage);
return Nested::concatenateName(name_in_storage, required_column.getSubcolumnName()); return Nested::concatenateName(name_in_storage, required_column.getSubcolumnName());
} }

View File

@ -50,8 +50,8 @@ public:
/// Evaluate defaulted columns if necessary. /// Evaluate defaulted columns if necessary.
void evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const; void evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const;
/// If part metadata is not equal to storage metadata, than /// If part metadata is not equal to storage metadata,
/// try to perform conversions of columns. /// then try to perform conversions of columns.
void performRequiredConversions(Columns & res_columns) const; void performRequiredConversions(Columns & res_columns) const;
const NamesAndTypesList & getColumns() const { return requested_columns; } const NamesAndTypesList & getColumns() const { return requested_columns; }
@ -104,7 +104,7 @@ protected:
private: private:
/// Alter conversions, which must be applied on fly if required /// Alter conversions, which must be applied on fly if required
AlterConversions alter_conversions; AlterConversionsPtr alter_conversions;
/// Columns that are requested to read. /// Columns that are requested to read.
NamesAndTypesList requested_columns; NamesAndTypesList requested_columns;

View File

@ -1,4 +1,5 @@
#pragma once #pragma once
#include <Storages/MergeTree/AlterConversions.h>
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h> #include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MergeTreeData.h> #include <Storages/MergeTree/MergeTreeData.h>
@ -9,9 +10,11 @@ namespace DB
class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader
{ {
public: public:
explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_) LoadedMergeTreeDataPartInfoForReader(
MergeTreeData::DataPartPtr data_part_, AlterConversionsPtr alter_conversions_)
: IMergeTreeDataPartInfoForReader(data_part_->storage.getContext()) : IMergeTreeDataPartInfoForReader(data_part_->storage.getContext())
, data_part(data_part_) , data_part(std::move(data_part_))
, alter_conversions(std::move(alter_conversions_))
{ {
} }
@ -33,7 +36,7 @@ public:
std::optional<size_t> getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); } std::optional<size_t> getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); }
AlterConversions getAlterConversions() const override { return data_part->storage.getAlterConversionsForPart(data_part); } AlterConversionsPtr getAlterConversions() const override { return alter_conversions; }
String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); } String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); }
@ -53,8 +56,11 @@ public:
SerializationPtr getSerialization(const NameAndTypePair & column) const override { return data_part->getSerialization(column.name); } SerializationPtr getSerialization(const NameAndTypePair & column) const override { return data_part->getSerialization(column.name); }
MergeTreeData::DataPartPtr getDataPart() const { return data_part; }
private: private:
MergeTreeData::DataPartPtr data_part; MergeTreeData::DataPartPtr data_part;
AlterConversionsPtr alter_conversions;
}; };
} }

View File

@ -193,8 +193,9 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical(); global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical();
auto object_columns = MergeTreeData::getConcreteObjectColumns(global_ctx->future_part->parts, global_ctx->metadata_snapshot->getColumns()); auto object_columns = MergeTreeData::getConcreteObjectColumns(global_ctx->future_part->parts, global_ctx->metadata_snapshot->getColumns());
global_ctx->storage_snapshot = std::make_shared<StorageSnapshot>(*global_ctx->data, global_ctx->metadata_snapshot, object_columns);
extendObjectColumns(global_ctx->storage_columns, object_columns, false); extendObjectColumns(global_ctx->storage_columns, object_columns, false);
global_ctx->storage_snapshot = std::make_shared<StorageSnapshot>(*global_ctx->data, global_ctx->metadata_snapshot, std::move(object_columns));
extractMergingAndGatheringColumns( extractMergingAndGatheringColumns(
global_ctx->storage_columns, global_ctx->storage_columns,
@ -544,8 +545,8 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const
global_ctx->future_part->parts[part_num], global_ctx->future_part->parts[part_num],
column_names, column_names,
ctx->read_with_direct_io, ctx->read_with_direct_io,
true, /*take_column_types_from_storage=*/ true,
false, /*quiet=*/ false,
global_ctx->input_rows_filtered); global_ctx->input_rows_filtered);
pipes.emplace_back(std::move(pipe)); pipes.emplace_back(std::move(pipe));
@ -896,8 +897,8 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
part, part,
global_ctx->merging_column_names, global_ctx->merging_column_names,
ctx->read_with_direct_io, ctx->read_with_direct_io,
true, /*take_column_types_from_storage=*/ true,
false, /*quiet=*/ false,
global_ctx->input_rows_filtered); global_ctx->input_rows_filtered);
if (global_ctx->metadata_snapshot->hasSortingKey()) if (global_ctx->metadata_snapshot->hasSortingKey())

View File

@ -80,49 +80,51 @@ IMergeTreeSelectAlgorithm::IMergeTreeSelectAlgorithm(
result_header = header_without_const_virtual_columns; result_header = header_without_const_virtual_columns;
injectPartConstVirtualColumns(0, result_header, nullptr, partition_value_type, virt_column_names); injectPartConstVirtualColumns(0, result_header, nullptr, partition_value_type, virt_column_names);
if (prewhere_actions) if (!prewhere_actions.steps.empty())
LOG_TRACE(log, "PREWHERE condition was split into {} steps: {}", prewhere_actions->steps.size(), prewhere_actions->dumpConditions()); LOG_TRACE(log, "PREWHERE condition was split into {} steps: {}", prewhere_actions.steps.size(), prewhere_actions.dumpConditions());
if (prewhere_info) if (prewhere_info)
LOG_TEST(log, "Original PREWHERE DAG:\n{}\nPREWHERE actions:\n{}", LOG_TEST(log, "Original PREWHERE DAG:\n{}\nPREWHERE actions:\n{}",
(prewhere_info->prewhere_actions ? prewhere_info->prewhere_actions->dumpDAG(): std::string("<nullptr>")), (prewhere_info->prewhere_actions ? prewhere_info->prewhere_actions->dumpDAG(): std::string("<nullptr>")),
(prewhere_actions ? prewhere_actions->dump() : std::string("<nullptr>"))); (!prewhere_actions.steps.empty() ? prewhere_actions.dump() : std::string("<nullptr>")));
} }
bool tryBuildPrewhereSteps(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, PrewhereExprInfo & prewhere); bool tryBuildPrewhereSteps(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, PrewhereExprInfo & prewhere);
std::unique_ptr<PrewhereExprInfo> IMergeTreeSelectAlgorithm::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps) PrewhereExprInfo IMergeTreeSelectAlgorithm::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps)
{ {
std::unique_ptr<PrewhereExprInfo> prewhere_actions; PrewhereExprInfo prewhere_actions;
if (prewhere_info) if (prewhere_info)
{ {
prewhere_actions = std::make_unique<PrewhereExprInfo>();
if (prewhere_info->row_level_filter) if (prewhere_info->row_level_filter)
{ {
PrewhereExprStep row_level_filter_step PrewhereExprStep row_level_filter_step
{ {
.type = PrewhereExprStep::Filter,
.actions = std::make_shared<ExpressionActions>(prewhere_info->row_level_filter, actions_settings), .actions = std::make_shared<ExpressionActions>(prewhere_info->row_level_filter, actions_settings),
.column_name = prewhere_info->row_level_column_name, .filter_column_name = prewhere_info->row_level_column_name,
.remove_column = true, .remove_filter_column = true,
.need_filter = true .need_filter = true,
.perform_alter_conversions = true,
}; };
prewhere_actions->steps.emplace_back(std::move(row_level_filter_step)); prewhere_actions.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(row_level_filter_step)));
} }
if (!enable_multiple_prewhere_read_steps || if (!enable_multiple_prewhere_read_steps ||
!tryBuildPrewhereSteps(prewhere_info, actions_settings, *prewhere_actions)) !tryBuildPrewhereSteps(prewhere_info, actions_settings, prewhere_actions))
{ {
PrewhereExprStep prewhere_step PrewhereExprStep prewhere_step
{ {
.type = PrewhereExprStep::Filter,
.actions = std::make_shared<ExpressionActions>(prewhere_info->prewhere_actions, actions_settings), .actions = std::make_shared<ExpressionActions>(prewhere_info->prewhere_actions, actions_settings),
.column_name = prewhere_info->prewhere_column_name, .filter_column_name = prewhere_info->prewhere_column_name,
.remove_column = prewhere_info->remove_prewhere_column, .remove_filter_column = prewhere_info->remove_prewhere_column,
.need_filter = prewhere_info->need_filter .need_filter = prewhere_info->need_filter,
.perform_alter_conversions = true,
}; };
prewhere_actions->steps.emplace_back(std::move(prewhere_step)); prewhere_actions.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(prewhere_step)));
} }
} }
@ -213,7 +215,7 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForCurrentTask(
reader = task->data_part->getReader( reader = task->data_part->getReader(
task->task_columns.columns, metadata_snapshot, task->mark_ranges, task->task_columns.columns, metadata_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(), owned_uncompressed_cache.get(), owned_mark_cache.get(),
reader_settings, value_size_map, profile_callback); task->alter_conversions, reader_settings, value_size_map, profile_callback);
} }
if (!task->pre_reader_for_step.empty()) if (!task->pre_reader_for_step.empty())
@ -226,13 +228,15 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForCurrentTask(
else else
     {
         initializeMergeTreePreReadersForPart(
-            task->data_part, task->task_columns, metadata_snapshot,
+            task->data_part, task->alter_conversions,
+            task->task_columns, metadata_snapshot,
             task->mark_ranges, value_size_map, profile_callback);
     }
 }

 void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForPart(
-    MergeTreeData::DataPartPtr & data_part,
+    const MergeTreeData::DataPartPtr & data_part,
+    const AlterConversionsPtr & alter_conversions,
     const MergeTreeReadTaskColumns & task_columns,
     const StorageMetadataPtr & metadata_snapshot,
     const MarkRanges & mark_ranges,
@@ -242,15 +246,16 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForPart(
     reader = data_part->getReader(
         task_columns.columns, metadata_snapshot, mark_ranges,
         owned_uncompressed_cache.get(), owned_mark_cache.get(),
-        reader_settings, value_size_map, profile_callback);
+        alter_conversions, reader_settings, value_size_map, profile_callback);

     initializeMergeTreePreReadersForPart(
-        data_part, task_columns, metadata_snapshot,
+        data_part, alter_conversions, task_columns, metadata_snapshot,
         mark_ranges, value_size_map, profile_callback);
 }

 void IMergeTreeSelectAlgorithm::initializeMergeTreePreReadersForPart(
-    MergeTreeData::DataPartPtr & data_part,
+    const MergeTreeData::DataPartPtr & data_part,
+    const AlterConversionsPtr & alter_conversions,
     const MergeTreeReadTaskColumns & task_columns,
     const StorageMetadataPtr & metadata_snapshot,
     const MarkRanges & mark_ranges,
@@ -266,36 +271,37 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreePreReadersForPart(
         data_part->getReader(
             {LightweightDeleteDescription::FILTER_COLUMN}, metadata_snapshot,
             mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(),
-            reader_settings, value_size_map, profile_callback));
+            alter_conversions, reader_settings, value_size_map, profile_callback));
     }

-    if (prewhere_info)
+    for (const auto & pre_columns_per_step : task_columns.pre_columns)
     {
-        for (const auto & pre_columns_per_step : task_columns.pre_columns)
-        {
-            pre_reader_for_step.push_back(
-                data_part->getReader(
-                    pre_columns_per_step, metadata_snapshot, mark_ranges,
-                    owned_uncompressed_cache.get(), owned_mark_cache.get(),
-                    reader_settings, value_size_map, profile_callback));
-        }
+        pre_reader_for_step.push_back(
+            data_part->getReader(
+                pre_columns_per_step, metadata_snapshot, mark_ranges,
+                owned_uncompressed_cache.get(), owned_mark_cache.get(),
+                alter_conversions, reader_settings, value_size_map, profile_callback));
     }
 }

 void IMergeTreeSelectAlgorithm::initializeRangeReaders(MergeTreeReadTask & current_task)
 {
     return initializeRangeReadersImpl(
-        current_task.range_reader, current_task.pre_range_readers, prewhere_info, prewhere_actions.get(),
+        current_task.range_reader, current_task.pre_range_readers, prewhere_actions,
         reader.get(), current_task.data_part->hasLightweightDelete(), reader_settings,
         pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names);
 }

 void IMergeTreeSelectAlgorithm::initializeRangeReadersImpl(
-    MergeTreeRangeReader & range_reader, std::deque<MergeTreeRangeReader> & pre_range_readers,
-    PrewhereInfoPtr prewhere_info, const PrewhereExprInfo * prewhere_actions,
-    IMergeTreeReader * reader, bool has_lightweight_delete, const MergeTreeReaderSettings & reader_settings,
+    MergeTreeRangeReader & range_reader,
+    std::deque<MergeTreeRangeReader> & pre_range_readers,
+    const PrewhereExprInfo & prewhere_actions,
+    IMergeTreeReader * reader,
+    bool has_lightweight_delete,
+    const MergeTreeReaderSettings & reader_settings,
     const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
-    const PrewhereExprStep & lightweight_delete_filter_step, const Names & non_const_virtual_column_names)
+    const PrewhereExprStep & lightweight_delete_filter_step,
+    const Names & non_const_virtual_column_names)
 {
     MergeTreeRangeReader * prev_reader = nullptr;
     bool last_reader = false;
@@ -310,25 +316,25 @@ void IMergeTreeSelectAlgorithm::initializeRangeReadersImpl(
         pre_readers_shift++;
     }

-    if (prewhere_info)
+    if (prewhere_actions.steps.size() + pre_readers_shift != pre_reader_for_step.size())
     {
-        if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size())
-        {
-            throw Exception(
-                ErrorCodes::LOGICAL_ERROR,
-                "PREWHERE steps count mismatch, actions: {}, readers: {}",
-                prewhere_actions->steps.size(), pre_reader_for_step.size());
-        }
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR,
+            "PREWHERE steps count mismatch, actions: {}, readers: {}",
+            prewhere_actions.steps.size(), pre_reader_for_step.size());
+    }

-        for (size_t i = 0; i < prewhere_actions->steps.size(); ++i)
-        {
-            last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size());
-            MergeTreeRangeReader current_reader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names);
+    for (size_t i = 0; i < prewhere_actions.steps.size(); ++i)
+    {
+        last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions.steps.size());
+        MergeTreeRangeReader current_reader(
+            pre_reader_for_step[i + pre_readers_shift].get(),
+            prev_reader, prewhere_actions.steps[i].get(),
+            last_reader, non_const_virtual_column_names);

         pre_range_readers.push_back(std::move(current_reader));
         prev_reader = &pre_range_readers.back();
-        }
     }

     if (!last_reader)
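
The hunks above thread an AlterConversionsPtr through every getReader() call and replace the optional prewhere_actions pointer with a by-value PrewhereExprInfo, so the step/reader consistency check now runs unconditionally. Below is a minimal standalone sketch of that pattern; all types and names in it are simplified stand-ins for illustration, not the ClickHouse API.

// Sketch only: validates one pre-reader per PREWHERE step (plus an optional shift for a
// lightweight-delete filter reader) and chains range readers, mirroring the loop above.
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct Step { std::string filter_column; };                    // stand-in for PrewhereExprStep
struct Reader { std::string name; };                           // stand-in for IMergeTreeReader
struct RangeReader { const Reader * reader; const Step * step; const RangeReader * prev; };

std::vector<RangeReader> chainPreRangeReaders(
    const std::vector<std::shared_ptr<Step>> & steps,
    const std::vector<std::unique_ptr<Reader>> & pre_readers,
    size_t pre_readers_shift)                                  // 1 if a delete-filter reader was prepended
{
    if (steps.size() + pre_readers_shift != pre_readers.size())
        throw std::logic_error("PREWHERE steps count mismatch");

    std::vector<RangeReader> chain;
    chain.reserve(steps.size());                               // keeps the 'prev' pointers stable
    const RangeReader * prev = nullptr;
    for (size_t i = 0; i < steps.size(); ++i)
    {
        chain.push_back({pre_readers[i + pre_readers_shift].get(), steps[i].get(), prev});
        prev = &chain.back();
    }
    return chain;
}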

View File

@@ -74,7 +74,7 @@ public:
     virtual std::string getName() const = 0;

-    static std::unique_ptr<PrewhereExprInfo> getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps);
+    static PrewhereExprInfo getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps);

 protected:
     /// This struct allow to return block with no columns but with non-zero number of rows similar to Chunk
@@ -110,8 +110,7 @@ protected:
     static void initializeRangeReadersImpl(
         MergeTreeRangeReader & range_reader,
         std::deque<MergeTreeRangeReader> & pre_range_readers,
-        PrewhereInfoPtr prewhere_info,
-        const PrewhereExprInfo * prewhere_actions,
+        const PrewhereExprInfo & prewhere_actions,
         IMergeTreeReader * reader,
         bool has_lightweight_delete,
         const MergeTreeReaderSettings & reader_settings,
@@ -126,7 +125,8 @@ protected:
         const ReadBufferFromFileBase::ProfileCallback & profile_callback);

     void initializeMergeTreeReadersForPart(
-        MergeTreeData::DataPartPtr & data_part,
+        const MergeTreeData::DataPartPtr & data_part,
+        const AlterConversionsPtr & alter_conversions,
         const MergeTreeReadTaskColumns & task_columns,
         const StorageMetadataPtr & metadata_snapshot,
         const MarkRanges & mark_ranges,
@@ -140,10 +140,19 @@ protected:
     StorageSnapshotPtr storage_snapshot;

     /// This step is added when the part has lightweight delete mask
-    const PrewhereExprStep lightweight_delete_filter_step { nullptr, LightweightDeleteDescription::FILTER_COLUMN.name, true, true };
+    const PrewhereExprStep lightweight_delete_filter_step
+    {
+        .type = PrewhereExprStep::Filter,
+        .actions = nullptr,
+        .filter_column_name = LightweightDeleteDescription::FILTER_COLUMN.name,
+        .remove_filter_column = true,
+        .need_filter = true,
+        .perform_alter_conversions = true,
+    };
+
     PrewhereInfoPtr prewhere_info;
     ExpressionActionsSettings actions_settings;
-    std::unique_ptr<PrewhereExprInfo> prewhere_actions;
+    PrewhereExprInfo prewhere_actions;

     UInt64 max_block_size_rows;
     UInt64 preferred_block_size_bytes;
@@ -195,7 +204,8 @@ private:
     /// Initialize pre readers.
     void initializeMergeTreePreReadersForPart(
-        MergeTreeData::DataPartPtr & data_part,
+        const MergeTreeData::DataPartPtr & data_part,
+        const AlterConversionsPtr & alter_conversions,
         const MergeTreeReadTaskColumns & task_columns,
         const StorageMetadataPtr & metadata_snapshot,
         const MarkRanges & mark_ranges,
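
The header change above swaps the positional brace-init of lightweight_delete_filter_step for designated initializers, which makes the individual flags readable at the use site. A small self-contained sketch of the same idiom follows, using a simplified stand-in struct rather than the real PrewhereExprStep; names and defaults here are assumptions for illustration only.

// Sketch only (requires C++20 for designated initializers).
#include <memory>
#include <string>

struct ExprActions;                                            // opaque, hypothetical

struct FilterStep
{
    enum Type { Filter, Expression } type = Filter;
    std::shared_ptr<ExprActions> actions;
    std::string filter_column_name;
    bool remove_filter_column = false;
    bool need_filter = false;
    bool perform_alter_conversions = false;
};

// Positional init: hard to tell which 'true' means what.
const FilterStep legacy_step{FilterStep::Filter, nullptr, "_row_exists", true, true, true};

// Designated init: each flag is named where it is set.
const FilterStep delete_filter_step
{
    .type = FilterStep::Filter,
    .actions = nullptr,
    .filter_column_name = "_row_exists",
    .remove_filter_column = true,
    .need_filter = true,
    .perform_alter_conversions = true,
};

int main() { return delete_filter_step.need_filter ? 0 : 1; }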

View File

@@ -1,6 +1,7 @@
 #include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
+#include <Storages/MergeTree/MergeTreeRangeReader.h>
 #include <DataTypes/NestedUtils.h>
 #include <Core/NamesAndTypes.h>
 #include <Common/checkStackSize.h>
@@ -30,7 +31,7 @@ namespace
 bool injectRequiredColumnsRecursively(
     const String & column_name,
     const StorageSnapshotPtr & storage_snapshot,
-    const AlterConversions & alter_conversions,
+    const AlterConversionsPtr & alter_conversions,
     const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
     const GetColumnsOptions & options,
     Names & columns,
@@ -46,8 +47,8 @@ bool injectRequiredColumnsRecursively(
     if (column_in_storage)
     {
         auto column_name_in_part = column_in_storage->getNameInStorage();
-        if (alter_conversions.isColumnRenamed(column_name_in_part))
-            column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);
+        if (alter_conversions && alter_conversions->isColumnRenamed(column_name_in_part))
+            column_name_in_part = alter_conversions->getColumnOldName(column_name_in_part);

         auto column_in_part = data_part_info_for_reader.getColumns().tryGetByName(column_name_in_part);
@@ -98,13 +99,14 @@ NameSet injectRequiredColumns(
     NameSet injected_columns;

     bool have_at_least_one_physical_column = false;
-    AlterConversions alter_conversions;
+    AlterConversionsPtr alter_conversions;
     if (!data_part_info_for_reader.isProjectionPart())
         alter_conversions = data_part_info_for_reader.getAlterConversions();

     auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
         .withExtendedObjects()
         .withSystemColumns();

     if (with_subcolumns)
         options.withSubcolumns();
@@ -137,6 +139,7 @@ NameSet injectRequiredColumns(
 MergeTreeReadTask::MergeTreeReadTask(
     const DataPartPtr & data_part_,
+    const AlterConversionsPtr & alter_conversions_,
     const MarkRanges & mark_ranges_,
     size_t part_index_in_query_,
     const NameSet & column_name_set_,
@@ -146,6 +149,7 @@ MergeTreeReadTask::MergeTreeReadTask(
     std::future<MergeTreeReaderPtr> reader_,
     std::vector<std::future<MergeTreeReaderPtr>> && pre_reader_for_step_)
     : data_part{data_part_}
+    , alter_conversions{alter_conversions_}
     , mark_ranges{mark_ranges_}
     , part_index_in_query{part_index_in_query_}
     , column_name_set{column_name_set_}
@@ -306,10 +310,8 @@ MergeTreeReadTaskColumns getReadTaskColumns(
     /// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part
     for (const auto & name : system_columns)
-    {
         if (data_part_info_for_reader.getColumns().contains(name))
             column_to_read_after_prewhere.push_back(name);
-    }

     /// Inject columns required for defaults evaluation
     injectRequiredColumns(
@@ -319,44 +321,50 @@ MergeTreeReadTaskColumns getReadTaskColumns(
     auto options = GetColumnsOptions(GetColumnsOptions::All)
         .withExtendedObjects()
         .withSystemColumns();

     if (with_subcolumns)
         options.withSubcolumns();

+    NameSet columns_from_previous_steps;
+
+    auto add_step = [&](const PrewhereExprStep & step)
+    {
+        Names step_column_names = step.actions->getActionsDAG().getRequiredColumnsNames();
+
+        injectRequiredColumns(
+            data_part_info_for_reader, storage_snapshot, with_subcolumns, step_column_names);
+
+        Names columns_to_read_in_step;
+        for (const auto & name : step_column_names)
+        {
+            if (columns_from_previous_steps.contains(name))
+                continue;
+
+            columns_to_read_in_step.push_back(name);
+            columns_from_previous_steps.insert(name);
+        }
+
+        result.pre_columns.push_back(storage_snapshot->getColumnsByNames(options, columns_to_read_in_step));
+    };
+
     if (prewhere_info)
     {
         auto prewhere_actions = IMergeTreeSelectAlgorithm::getPrewhereActions(
-            prewhere_info, actions_settings, reader_settings.enable_multiple_prewhere_read_steps);
+            prewhere_info,
+            actions_settings,
+            reader_settings.enable_multiple_prewhere_read_steps);

-        NameSet columns_from_previous_steps;
-
-        for (const auto & step : prewhere_actions->steps)
-        {
-            Names step_column_names = step.actions->getActionsDAG().getRequiredColumnsNames();
-
-            injectRequiredColumns(
-                data_part_info_for_reader, storage_snapshot, with_subcolumns, step_column_names);
-
-            Names columns_to_read_in_step;
-            for (const auto & name : step_column_names)
-            {
-                if (columns_from_previous_steps.contains(name))
-                    continue;
-
-                columns_to_read_in_step.push_back(name);
-                columns_from_previous_steps.insert(name);
-            }
-
-            result.pre_columns.push_back(storage_snapshot->getColumnsByNames(options, columns_to_read_in_step));
-        }
-
-        /// Remove columns read in prewehere from the list of columns to read
-        Names post_column_names;
-        for (const auto & name : column_to_read_after_prewhere)
-            if (!columns_from_previous_steps.contains(name))
-                post_column_names.push_back(name);
-
-        column_to_read_after_prewhere = std::move(post_column_names);
+        for (const auto & step : prewhere_actions.steps)
+            add_step(*step);
     }

+    /// Remove columns read in prewehere from the list of columns to read
+    Names post_column_names;
+    for (const auto & name : column_to_read_after_prewhere)
+        if (!columns_from_previous_steps.contains(name))
+            post_column_names.push_back(name);
+
+    column_to_read_after_prewhere = std::move(post_column_names);
+
     /// Rest of the requested columns
     result.columns = storage_snapshot->getColumnsByNames(options, column_to_read_after_prewhere);
     return result;
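
The getReadTaskColumns() refactor above pulls the per-step column collection into an add_step lambda: each PREWHERE step reads only columns that no earlier step has already produced, and whatever the steps covered is dropped from the post-PREWHERE read. The following standalone sketch illustrates that deduplication with plain standard-library types and made-up column names; it is not the ClickHouse code itself.

// Sketch only: dedup columns across steps, then compute what is left for the main read.
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

using Names = std::vector<std::string>;

int main()
{
    std::vector<Names> step_columns = {{"id", "flag"}, {"flag", "value"}};   // per-step requirements
    Names after_prewhere = {"id", "value", "payload"};                        // requested by the query

    std::unordered_set<std::string> seen;
    std::vector<Names> pre_columns;
    for (const auto & step : step_columns)
    {
        Names to_read;
        for (const auto & name : step)
            if (seen.insert(name).second)          // true only for columns no earlier step reads
                to_read.push_back(name);
        pre_columns.push_back(std::move(to_read));
    }

    Names post_columns;                            // columns left for the read after PREWHERE
    for (const auto & name : after_prewhere)
        if (!seen.contains(name))
            post_columns.push_back(name);

    for (const auto & name : post_columns)
        std::cout << name << '\n';                 // prints only "payload"
}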

View File

@@ -6,6 +6,7 @@
 #include <Storages/MergeTree/RangesInDataPart.h>
 #include <Storages/MergeTree/MergeTreeRangeReader.h>
 #include <Storages/MergeTree/IMergeTreeReader.h>
+#include <Storages/MergeTree/AlterConversions.h>

 namespace DB
@@ -35,7 +36,6 @@ NameSet injectRequiredColumns(
     bool with_subcolumns,
     Names & columns);

 struct MergeTreeReadTaskColumns
 {
     /// column names to read during WHERE
@@ -49,8 +49,10 @@ struct MergeTreeReadTaskColumns
 /// A batch of work for MergeTreeThreadSelectProcessor
 struct MergeTreeReadTask
 {
-    /// data part which should be read while performing this task
+    /// Data part which should be read while performing this task
     DataPartPtr data_part;
+    /// Alter conversions that should be applied on-fly for part.
+    AlterConversionsPtr alter_conversions;
     /// Ranges to read from `data_part`.
     MarkRanges mark_ranges;
     /// for virtual `part_index` virtual column
@@ -77,6 +79,7 @@ struct MergeTreeReadTask
     MergeTreeReadTask(
         const DataPartPtr & data_part_,
+        const AlterConversionsPtr & alter_conversions_,
         const MarkRanges & mark_ranges_,
         size_t part_index_in_query_,
         const NameSet & column_name_set_,
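
With the new alter_conversions member, a read task can resolve a column's on-disk name when the part was written before a RENAME COLUMN. The sketch below shows the null-safe lookup pattern used in injectRequiredColumnsRecursively above; AlterConversions here is a deliberately simplified stand-in for the real class, and the column names are invented for the example.

// Sketch only: resolve the name stored in the part through an optional conversions mapping.
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct AlterConversions
{
    std::map<std::string, std::string> new_to_old;   // current name -> name stored in the part

    bool isColumnRenamed(const std::string & name) const { return new_to_old.count(name) > 0; }
    std::string getColumnOldName(const std::string & name) const { return new_to_old.at(name); }
};

using AlterConversionsPtr = std::shared_ptr<const AlterConversions>;

// Falls back to the current name when there are no conversions or no rename for this column.
std::string resolveNameInPart(const std::string & name, const AlterConversionsPtr & conversions)
{
    if (conversions && conversions->isColumnRenamed(name))
        return conversions->getColumnOldName(name);
    return name;
}

int main()
{
    auto conv = std::make_shared<AlterConversions>();
    conv->new_to_old["user_id"] = "uid";
    AlterConversionsPtr conversions = conv;          // shared_ptr<T> converts to shared_ptr<const T>

    std::cout << resolveNameInPart("user_id", conversions) << '\n';   // prints "uid"
    std::cout << resolveNameInPart("user_id", nullptr) << '\n';       // prints "user_id"
}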

Some files were not shown because too many files have changed in this diff.