Merge branch 'master' into 46229-repl-clickhouse-keeper

This commit is contained in:
Nikita Mikhaylov 2023-05-31 02:29:08 +02:00 committed by GitHub
commit 31829f7cfc
254 changed files with 4845 additions and 1553 deletions

2
contrib/aws vendored

@ -1 +1 @@
Subproject commit ecccfc026a42b30023289410a67024d561f4bf3e
Subproject commit ca02358dcc7ce3ab733dd4cbcc32734eecfa4ee3

2
contrib/aws-c-auth vendored

@ -1 +1 @@
Subproject commit 30df6c407e2df43bd244e2c34c9b4a4b87372bfb
Subproject commit 97133a2b5dbca1ccdf88cd6f44f39d0531d27d12

@ -1 +1 @@
Subproject commit 324fd1d973ccb25c813aa747bf1759cfde5121c5
Subproject commit 45dcb2849c891dba2100b270b4676765c92949ff

@ -1 +1 @@
Subproject commit 39bfa94a14b7126bf0c1330286ef8db452d87e66
Subproject commit 2f9b60c42f90840ec11822acda3d8cdfa97a773d

2
contrib/aws-c-http vendored

@ -1 +1 @@
Subproject commit 2c5a2a7d5556600b9782ffa6c9d7e09964df1abc
Subproject commit dd34461987947672444d0bc872c5a733dfdb9711

2
contrib/aws-c-io vendored

@ -1 +1 @@
Subproject commit 5d32c453560d0823df521a686bf7fbacde7f9be3
Subproject commit d58ed4f272b1cb4f89ac9196526ceebe5f2b0d89

2
contrib/aws-c-mqtt vendored

@ -1 +1 @@
Subproject commit 882c689561a3db1466330ccfe3b63637e0a575d3
Subproject commit 33c3455cec82b16feb940e12006cefd7b3ef4194

2
contrib/aws-c-s3 vendored

@ -1 +1 @@
Subproject commit a41255ece72a7c887bba7f9d998ca3e14f4c8a1b
Subproject commit d7bfe602d6925948f1fff95784e3613cca6a3900

@ -1 +1 @@
Subproject commit 25bf5cf225f977c3accc6a05a0a7a181ef2a4a30
Subproject commit 208a701fa01e99c7c8cc3dcebc8317da71362972

@ -1 +1 @@
Subproject commit 48e7c0e01479232f225c8044d76c84e74192889d
Subproject commit ad53be196a25bbefa3700a01187fdce573a7d2d0

View File

@ -52,8 +52,8 @@ endif()
# Directories.
SET(AWS_SDK_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws")
SET(AWS_SDK_CORE_DIR "${AWS_SDK_DIR}/aws-cpp-sdk-core")
SET(AWS_SDK_S3_DIR "${AWS_SDK_DIR}/aws-cpp-sdk-s3")
SET(AWS_SDK_CORE_DIR "${AWS_SDK_DIR}/src/aws-cpp-sdk-core")
SET(AWS_SDK_S3_DIR "${AWS_SDK_DIR}/generated/src/aws-cpp-sdk-s3")
SET(AWS_AUTH_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws-c-auth")
SET(AWS_CAL_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws-c-cal")

2
contrib/aws-crt-cpp vendored

@ -1 +1 @@
Subproject commit ec0bea288f451d884c0d80d534bc5c66241c39a4
Subproject commit 8a301b7e842f1daed478090c869207300972379f

2
contrib/aws-s2n-tls vendored

@ -1 +1 @@
Subproject commit 0f1ba9e5c4a67cb3898de0c0b4f911d4194dc8de
Subproject commit 71f4794b7580cf780eb4aca77d69eded5d3c7bb4

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit 8fe7b3326ef482ee6ecdf5a4f698f2b8c2780f98
Subproject commit aec12eea7fc762721ae16943d1361340c66c9c17

View File

@ -25,6 +25,9 @@ message(STATUS "Intel QPL version: ${QPL_VERSION}")
# Generate 8 library targets: middle_layer_lib, isal, isal_asm, qplcore_px, qplcore_avx512, qplcore_sw_dispatcher, core_iaa, middle_layer_lib.
# Output ch_contrib::qpl by linking with 8 library targets.
# The qpl submodule comes with its own version of isal. It contains code which does not exist in upstream isal. It would be nice to link
# only upstream isal (ch_contrib::isal) but at this point we can't.
include("${QPL_PROJECT_DIR}/cmake/CompileOptions.cmake")
# check nasm compiler

View File

@ -5,8 +5,8 @@ echo "Using sparse checkout for aws"
FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout
echo '/*' > $FILES_TO_CHECKOUT
echo '!/*/*' >> $FILES_TO_CHECKOUT
echo '/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT
echo '/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT
echo '/src/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT
echo '/generated/src/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT
git config core.sparsecheckout true
git checkout $1

View File

@ -131,14 +131,17 @@ CREATE TABLE table_with_asterisk (name String, value UInt32)
The following settings can be set before query execution or placed into configuration file.
- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`.
- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`.
- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `32Mb`.
- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `16Mb`.
- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`.
- `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`.
- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`.
- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`.
- `s3_upload_part_size_multiply_factor` - Multiply `s3_min_upload_part_size` by this factor each time `s3_multiply_parts_count_threshold` parts were uploaded from a single write to S3. Default value is `2`.
- `s3_upload_part_size_multiply_parts_count_threshold` - Each time this number of parts was uploaded to S3, `s3_min_upload_part_size` is multiplied by `s3_upload_part_size_multiply_factor`. Default value is `500`.
- `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object. The value `0` means unlimited. Default value is `20`. Each in-flight part holds a buffer of size `s3_min_upload_part_size` for the first `s3_upload_part_size_multiply_parts_count_threshold` parts, and a larger one when the file is big enough (see `s3_upload_part_size_multiply_factor`). With default settings, one uploaded file consumes at most `320Mb` while it is smaller than `8G` (see the worked check below); consumption is greater for larger files.
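A worked check of the `320Mb` figure, using the defaults quoted above (the part size has not yet been multiplied while fewer than `500` parts, i.e. about $500 \times 16\,\mathrm{Mb} \approx 8\,\mathrm{G}$, have been uploaded):

$$
20 \text{ in-flight parts} \times 16\,\mathrm{Mb} \text{ per buffer} = 320\,\mathrm{Mb}.
$$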
Security consideration: if a malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; alternatively, `remote_host_filter` must be specified in the server configuration.

View File

@ -1219,11 +1219,12 @@ Authentication parameters (the disk will try all available methods **and** Manag
* `account_name` and `account_key` - For authentication using Shared Key.
Limit parameters (mainly for internal usage):
* `max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
Other parameters:
* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.

View File

@ -258,4 +258,4 @@ Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](.
- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) description
- [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting
- [shardNum()](../../../sql-reference/functions/other-functions.md#shard-num) and [shardCount()](../../../sql-reference/functions/other-functions.md#shard-count) functions
- [shardNum()](../../../sql-reference/functions/other-functions.md#shardnum) and [shardCount()](../../../sql-reference/functions/other-functions.md#shardcount) functions

View File

@ -37,7 +37,7 @@ The data is in CSV files but uses a semi-colon for the delimiter. The rows look
│ 13199 │ BMP180 │ 6664 │ 52.514 │ 13.44 │ 2019-06-01T00:00:07 │ 101855.54 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.74 │
│ 12753 │ BMP180 │ 6440 │ 44.616 │ 2.032 │ 2019-06-01T00:00:07 │ 99475 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17 │
│ 16956 │ BMP180 │ 8594 │ 52.052 │ 8.354 │ 2019-06-01T00:00:08 │ 101322 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17.2 │
└───────────┴─────────────┴──────────┴────────┴───────┴─────────────────────┴──────────┴──────────┴───────────────────┴─────────────┘
└───────────┴─────────────┴──────────┴────────┴───────┴─────────────────────┴──────────┴──────────┴───────────────────┴─────────────┘
```
2. We will use the following `MergeTree` table to store the data in ClickHouse:

View File

@ -167,9 +167,9 @@ user = 'myuser',
password = 'mypass',
host = '127.0.0.1',
port = 3306,
database = 'test'
connection_pool_size = 8
on_duplicate_clause = 1
database = 'test',
connection_pool_size = 8,
on_duplicate_clause = 1,
replace_query = 1
```

View File

@ -917,9 +917,9 @@ We recommend using this option in macOS since the `getrlimit()` function returns
Restriction on deleting tables.
If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can't delete it using a DROP query.
If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can't delete it using a [DROP](../../sql-reference/statements/drop.md) query or [TRUNCATE](../../sql-reference/statements/truncate.md) query.
If you still need to delete the table without restarting the ClickHouse server, create the `<clickhouse-path>/flags/force_drop_table` file and run the DROP query.
This setting does not require a restart of the ClickHouse server to apply. Another way to disable the restriction is to create the `<clickhouse-path>/flags/force_drop_table` file.
Default value: 50 GB.
@ -931,6 +931,28 @@ The value 0 means that you can delete all tables without any restrictions.
<max_table_size_to_drop>0</max_table_size_to_drop>
```
## max_partition_size_to_drop {#max-partition-size-to-drop}
Restriction on dropping partitions.
If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_partition_size_to_drop` (in bytes), you can't drop a partition using a [DROP PARTITION](../../sql-reference/statements/alter/partition.md#drop-partitionpart) query.
This setting does not require a restart of the ClickHouse server to apply. Another way to disable the restriction is to create the `<clickhouse-path>/flags/force_drop_table` file.
Default value: 50 GB.
The value 0 means that you can drop partitions without any restrictions.
:::note
This limitation does not restrict `DROP TABLE` and `TRUNCATE TABLE`, see [max_table_size_to_drop](#max-table-size-to-drop).
:::
**Example**
``` xml
<max_partition_size_to_drop>0</max_partition_size_to_drop>
```
## max_thread_pool_size {#max-thread-pool-size}
ClickHouse uses threads from the Global Thread pool to process queries. If there is no idle thread to process a query, then a new thread is created in the pool. `max_thread_pool_size` limits the maximum number of threads in the pool.
@ -1319,12 +1341,14 @@ Queries are logged in the [system.part_log](../../operations/system-tables/part_
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
**Example**
@ -1395,12 +1419,14 @@ Queries are logged in the [system.query_log](../../operations/system-tables/quer
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table the queries will be logged in.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table the queries will be logged in.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
If the table does not exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.
@ -1451,12 +1477,14 @@ Queries are logged in the [system.query_thread_log](../../operations/system-tabl
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table the queries will be logged in.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table the queries will be logged in.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
If the table does not exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.
@ -1479,12 +1507,14 @@ Queries are logged in the [system.query_views_log](../../operations/system-table
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table the queries will be logged in.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table the queries will be logged in.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
If the table does not exist, ClickHouse will create it. If the structure of the query views log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.
@ -1505,13 +1535,15 @@ Settings for the [text_log](../../operations/system-tables/text_log.md#system_ta
Parameters:
- `level` — Maximum Message Level (by default `Trace`) which will be stored in a table.
- `database` — Database name.
- `table` — Table name.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `level` - Maximum Message Level (by default `Trace`) which will be stored in a table.
- `database` - Database name.
- `table` - Table name.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
**Example**
```xml
@ -1534,12 +1566,14 @@ Settings for the [trace_log](../../operations/system-tables/trace_log.md#system_
Parameters:
- `database` — Database for storing a table.
- `table` — Table name.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Database for storing a table.
- `table` - Table name.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
The default server configuration file `config.xml` contains the following settings section:

View File

@ -577,7 +577,7 @@ Default value: 20
**Usage**
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
## max_part_loading_threads {#max-part-loading-threads}

View File

@ -1187,6 +1187,36 @@ Disable limit on kafka_num_consumers that depends on the number of available CPU
Default value: false.
## postgresql_connection_pool_size {#postgresql-connection-pool-size}
Connection pool size for PostgreSQL table engine and database engine.
Default value: 16
## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout}
Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on an empty pool.
Default value: 5000
## postgresql_connection_pool_auto_close_connection {#postgresql-connection-pool-auto-close-connection}
Close connection before returning connection to the pool.
Default value: true.
## odbc_bridge_connection_pool_size {#odbc-bridge-connection-pool-size}
Connection pool size for each connection settings string in ODBC bridge.
Default value: 16
## odbc_bridge_use_connection_pooling {#odbc-bridge-use-connection-pooling}
Use connection pooling in ODBC bridge. If set to false, a new connection is created every time.
Default value: true
## use_uncompressed_cache {#setting-use_uncompressed_cache}
Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled).
@ -3563,7 +3593,7 @@ SETTINGS index_granularity = 8192 │
## external_table_functions_use_nulls {#external-table-functions-use-nulls}
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md)] table functions use Nullable columns.
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns.
Possible values:

View File

@ -0,0 +1,27 @@
---
slug: /en/operations/system-tables/build_options
---
# build_options
Contains information about the ClickHouse server's build options.
Columns:
- `name` (String) — Name of the build option, e.g. `USE_ODBC`
- `value` (String) — Value of the build option, e.g. `1`
**Example**
``` sql
SELECT * FROM system.build_options LIMIT 5
```
``` text
┌─name─────────────┬─value─┐
│ USE_BROTLI │ 1 │
│ USE_BZIP2 │ 1 │
│ USE_CAPNP │ 1 │
│ USE_CASSANDRA │ 1 │
│ USE_DATASKETCHES │ 1 │
└──────────────────┴───────┘
```

View File

@ -29,7 +29,7 @@ select first_value(b) from test_data
### example2
The NULL value is ignored.
```sql
select first_value(b) ignore nulls sfrom test_data
select first_value(b) ignore nulls from test_data
```
```text

View File

@ -2234,7 +2234,7 @@ Result:
## Regular Expression Tree Dictionary {#regexp-tree-dictionary}
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of (user agent)[https://en.wikipedia.org/wiki/User_agent] strings, which can be expressed elegantly with regexp tree dictionaries.
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of [user agent](https://en.wikipedia.org/wiki/User_agent) strings, which can be expressed elegantly with regexp tree dictionaries.
### Use Regular Expression Tree Dictionary in ClickHouse Open-Source
@ -2280,7 +2280,7 @@ This config consists of a list of regular expression tree nodes. Each node has t
- The value of an attribute may contain **back references**, referring to capture groups of the matched regular expression. In the example, the value of attribute `version` in the first node consists of a back-reference `\1` to capture group `(\d+[\.\d]*)` in the regular expression. Back-reference numbers range from 1 to 9 and are written as `$1` or `\1` (for number 1). The back reference is replaced by the matched capture group during query execution.
- **child nodes**: a list of children of a regexp tree node, each of which has its own attributes and (potentially) child nodes. String matching proceeds in a depth-first fashion. If a string matches a regexp node, the dictionary checks if it also matches the nodes' child nodes. If that is the case, the attributes of the deepest matching node are assigned. Attributes of a child node overwrite equally named attributes of parent nodes. The name of child nodes in YAML files can be arbitrary, e.g. `versions` in the above example.
Regexp tree dictionaries only allow access using functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull`.
Regexp tree dictionaries only allow access using the functions `dictGet` and `dictGetOrDefault`.
Example:

View File

@ -323,11 +323,11 @@ Alias: `REPEAT`
**Arguments**
- `s` — The string to repeat. [String](../../sql-reference/data-types/string.md).
- `n` — The number of times to repeat the string. [UInt or Int](../../sql-reference/data-types/int-uint.md).
- `n` — The number of times to repeat the string. [UInt* or Int*](../../sql-reference/data-types/int-uint.md).
**Returned value**
The single string containing string `s` repeated `n` times. If `n` \< 1, the function returns empty string.
A string containing string `s` repeated `n` times. If `n` <= 0, the function returns the empty string.
Type: `String`.
@ -345,6 +345,44 @@ Result:
└────────────────────────────────┘
```
## space
Concatenates the space character (` `) with itself as many times as specified.
**Syntax**
``` sql
space(n)
```
Alias: `SPACE`.
**Arguments**
- `n` — The number of times to repeat the space. [UInt* or Int*](../../sql-reference/data-types/int-uint.md).
**Returned value**
A string containing the space character ` ` repeated `n` times. If `n` <= 0, the function returns the empty string.
Type: `String`.
**Example**
Query:
``` sql
SELECT space(3);
```
Result:
``` text
┌─space(3)─────┐
│ │
└──────────────┘
```
## reverse
Reverses the sequence of bytes in a string.

View File

@ -544,10 +544,10 @@ Result:
└─────┴──────────┴───────┘
```
##Filling grouped by sorting prefix
## Filling grouped by sorting prefix
It can be useful to fill rows which have the same values in particular columns independently; a good example is filling missing values in time series.
Assume there is the following time series table
Assume there is the following time series table:
``` sql
CREATE TABLE timeseries
(
@ -567,7 +567,7 @@ SELECT * FROM timeseries;
└───────────┴─────────────────────────┴───────┘
```
And we'd like to fill missing values for each sensor independently with a 1 second interval.
The way to achieve it is to use `sensor_id` column as sorting prefix for filling column `timestamp`
The way to achieve it is to use `sensor_id` column as sorting prefix for filling column `timestamp`:
```
SELECT *
FROM timeseries
@ -589,7 +589,7 @@ INTERPOLATE ( value AS 9999 )
│ 432 │ 2021-12-01 00:00:05.000 │ 5 │
└───────────┴─────────────────────────┴───────┘
```
Here, the `value` column was interpolated with `9999` just to make filled rows more noticeable
Here, the `value` column was interpolated with `9999` just to make filled rows more noticeable.
This behavior is controlled by the setting `use_with_fill_by_sorting_prefix` (enabled by default).
## Related content

View File

@ -34,7 +34,7 @@ For the `SAMPLE` clause the following syntax is supported:
| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) |
## SAMPLE K
## SAMPLE K {#select-sample-k}
Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`.
@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000
In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10.
## SAMPLE N
## SAMPLE N {#select-sample-n}
Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`.
@ -90,7 +90,7 @@ FROM visits
SAMPLE 10000000
```
## SAMPLE K OFFSET M
## SAMPLE K OFFSET M {#select-sample-offset}
Here `k` and `m` are numbers from 0 to 1. Examples are shown below.

View File

@ -1137,6 +1137,16 @@
<ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
-->
<!--
ORDER BY expr: https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#order_by
Example:
event_date, event_time
event_date, type, query_id
event_date, event_time, initial_query_id
<order_by>event_date, event_time, initial_query_id</order_by>
-->
<!-- Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
Example: <engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
-->

View File

@ -152,6 +152,13 @@ public:
nested_func->merge(place, rhs, arena);
}
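/// Forwarding both isAbleToParallelizeMerge() and the ThreadPool overload of
/// merge() preserves the nested aggregate function's parallel merge support
/// in this combinator.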
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{
nested_func->merge(place, rhs, thread_pool, arena);
}
void mergeBatch(
size_t row_begin,
size_t row_end,

View File

@ -59,16 +59,31 @@ UInt64 BackupEntryFromImmutableFile::getSize() const
UInt128 BackupEntryFromImmutableFile::getChecksum() const
{
{
std::lock_guard lock{size_and_checksum_mutex};
if (checksum_adjusted)
return *checksum;
if (checksum)
{
if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum_adjusted = true;
return *checksum;
}
}
auto calculated_checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
{
std::lock_guard lock{size_and_checksum_mutex};
if (!checksum_adjusted)
{
if (!checksum)
checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
else if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum = calculated_checksum;
checksum_adjusted = true;
}
return *checksum;
}
}
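The restructuring above follows a compute-outside-the-lock caching pattern: `getChecksum()` now takes the mutex only to inspect or publish the cached value, and performs the expensive hashing in between, unlocked, so concurrent callers are not serialized behind I/O. A minimal standalone sketch of the same pattern, with purely illustrative names (not the ClickHouse API):

```cpp
#include <cstdint>
#include <mutex>
#include <optional>

/// Illustrative cache: hold the mutex only to read or publish the cached
/// value; run the expensive computation without holding the lock.
class CachedChecksum
{
public:
    uint64_t get()
    {
        {
            std::lock_guard lock{mutex};
            if (cached)
                return *cached;   // Fast path: already computed and published.
        }

        /// Slow path: compute without the lock, so concurrent callers are not
        /// blocked behind a potentially I/O-bound hash.
        uint64_t computed = computeExpensiveChecksum();

        {
            std::lock_guard lock{mutex};
            if (!cached)
                cached = computed;   // Re-check: another thread may have won the race.
            return *cached;          // Either our result or the earlier winner's.
        }
    }

private:
    static uint64_t computeExpensiveChecksum() { return 42; /* stand-in for hashing file contents */ }

    std::mutex mutex;
    std::optional<uint64_t> cached;
};
```

The price is that two racing threads may both compute the value; the loser's result is simply discarded under the lock, which is the same tradeoff the diff accepts.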
std::optional<UInt128> BackupEntryFromImmutableFile::getPartialChecksum(size_t prefix_length) const

View File

@ -44,7 +44,7 @@ private:
const DataSourceDescription data_source_description;
const bool copy_encrypted;
mutable std::optional<UInt64> file_size;
mutable std::optional<UInt64> checksum;
mutable std::optional<UInt128> checksum;
mutable bool file_size_adjusted = false;
mutable bool checksum_adjusted = false;
mutable std::mutex size_and_checksum_mutex;

View File

@ -8,15 +8,32 @@ namespace DB
template <typename Base>
UInt128 BackupEntryWithChecksumCalculation<Base>::getChecksum() const
{
{
std::lock_guard lock{checksum_calculation_mutex};
if (calculated_checksum)
return *calculated_checksum;
}
size_t size = this->getSize();
{
std::lock_guard lock{checksum_calculation_mutex};
if (!calculated_checksum)
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(this->getSize()));
if (size == 0)
{
calculated_checksum = 0;
}
else
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(size));
HashingReadBuffer hashing_read_buffer(*read_buffer);
hashing_read_buffer.ignoreAll();
calculated_checksum = hashing_read_buffer.getHash();
}
}
return *calculated_checksum;
}
}
template <typename Base>

View File

@ -0,0 +1,350 @@
#include <gtest/gtest.h>
#include <Backups/BackupEntryFromAppendOnlyFile.h>
#include <Backups/BackupEntryFromImmutableFile.h>
#include <Backups/BackupEntryFromSmallFile.h>
#include <Disks/IDisk.h>
#include <Disks/DiskLocal.h>
#include <Disks/DiskEncrypted.h>
#include <IO/FileEncryptionCommon.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/TemporaryFile.h>
using namespace DB;
class BackupEntriesTest : public ::testing::Test
{
protected:
void SetUp() override
{
/// Make local disk.
temp_dir = std::make_unique<Poco::TemporaryFile>();
temp_dir->createDirectories();
local_disk = std::make_shared<DiskLocal>("local_disk", temp_dir->path() + "/", 0);
/// Make encrypted disk.
auto settings = std::make_unique<DiskEncryptedSettings>();
settings->wrapped_disk = local_disk;
settings->current_algorithm = FileEncryption::Algorithm::AES_128_CTR;
settings->keys[0] = "1234567890123456";
settings->current_key_id = 0;
settings->disk_path = "encrypted/";
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings), true);
}
void TearDown() override
{
encrypted_disk.reset();
local_disk.reset();
}
static void writeFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
writeString(std::string_view{"Some text"}, *buf);
buf->finalize();
}
static void writeEmptyFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
buf->finalize();
}
static void appendFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append, {});
writeString(std::string_view{"Appended"}, *buf);
buf->finalize();
}
static String getChecksum(const BackupEntryPtr & backup_entry)
{
return getHexUIntUppercase(backup_entry->getChecksum());
}
static const constexpr std::string_view NO_CHECKSUM = "no checksum";
static String getPartialChecksum(const BackupEntryPtr & backup_entry, size_t prefix_length)
{
auto partial_checksum = backup_entry->getPartialChecksum(prefix_length);
if (!partial_checksum)
return String{NO_CHECKSUM};
return getHexUIntUppercase(*partial_checksum);
}
static String readAll(const BackupEntryPtr & backup_entry)
{
auto in = backup_entry->getReadBuffer({});
String str;
readStringUntilEOF(str, *in);
return str;
}
std::unique_ptr<Poco::TemporaryFile> temp_dir;
std::shared_ptr<DiskLocal> local_disk;
std::shared_ptr<DiskEncrypted> encrypted_disk;
};
static const constexpr std::string_view ZERO_CHECKSUM = "00000000000000000000000000000000";
static const constexpr std::string_view SOME_TEXT_CHECKSUM = "28B5529750AC210952FFD366774363ED";
static const constexpr std::string_view S_CHECKSUM = "C27395C39AFB5557BFE47661CC9EB86C";
static const constexpr std::string_view SOME_TEX_CHECKSUM = "D00D9BE8D87919A165F14EDD31088A0E";
static const constexpr std::string_view SOME_TEXT_APPENDED_CHECKSUM = "5A1F10F638DC7A226231F3FD927D1726";
static const constexpr std::string_view PRECALCULATED_CHECKSUM = "1122334455667788AABBCCDDAABBCCDD";
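/// The UInt128 below is the binary form of PRECALCULATED_CHECKSUM above:
/// high 64 bits 0x1122334455667788, low 64 bits 0xAABBCCDDAABBCCDD.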
static const constexpr UInt128 PRECALCULATED_CHECKSUM_UINT128 = (UInt128(0x1122334455667788) << 64) | 0xAABBCCDDAABBCCDD;
static const size_t PRECALCULATED_SIZE = 123;
TEST_F(BackupEntriesTest, BackupEntryFromImmutableFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
writeEmptyFile(local_disk, "empty.txt");
auto empty_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "empty.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE - 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromAppendOnlyFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
appendFile(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
auto appended_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(appended_entry->getSize(), 17);
EXPECT_EQ(getChecksum(appended_entry), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 22), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1000), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(readAll(appended_entry), "Some textAppended");
writeEmptyFile(local_disk, "empty_appended.txt");
auto empty_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
appendFile(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
}
TEST_F(BackupEntriesTest, PartialChecksumBeforeFullChecksum)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromSmallFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromSmallFile>(local_disk, "a.txt");
local_disk->removeFile("a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, DecryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
std::pair<BackupEntryPtr, bool /* partial_checksum_allowed */> test_cases[]
= {{std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt"), false},
{std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt"), true},
{std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt"), true}};
for (const auto & [entry, partial_checksum_allowed] : test_cases)
{
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), partial_checksum_allowed ? S_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), partial_checksum_allowed ? SOME_TEX_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt")};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
}
TEST_F(BackupEntriesTest, EncryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true)};
auto encrypted_checksum = getChecksum(entries[0]);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
auto partial_checksum = getPartialChecksum(entries[1], 9);
EXPECT_NE(partial_checksum, NO_CHECKSUM);
EXPECT_NE(partial_checksum, ZERO_CHECKSUM);
EXPECT_NE(partial_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(partial_checksum, encrypted_checksum);
auto encrypted_data = readAll(entries[0]);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 9 + FileEncryption::Header::kSize);
EXPECT_EQ(getChecksum(entry), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
auto encrypted_checksum_9 = getPartialChecksum(entry, 9);
EXPECT_TRUE(encrypted_checksum_9 == NO_CHECKSUM || encrypted_checksum_9 == partial_checksum);
EXPECT_EQ(getPartialChecksum(entry, 9 + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 1000), encrypted_checksum);
EXPECT_EQ(readAll(entry), encrypted_data);
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true)};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE + FileEncryption::Header::kSize);
auto encrypted_checksum = getChecksum(precalculated_entry);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(encrypted_checksum, PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), encrypted_checksum);
auto encrypted_data = readAll(precalculated_entry);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
}
}

View File

@ -121,7 +121,7 @@ ConnectionEstablisherAsync::ConnectionEstablisherAsync(
epoll.add(timeout_descriptor.getDescriptor());
}
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, ResumeCallback)
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, SuspendCallback)
{
connection_establisher_async.reset();
connection_establisher_async.connection_establisher.setAsyncCallback(async_callback);

View File

@ -91,7 +91,7 @@ private:
ConnectionEstablisherAsync & connection_establisher_async;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
void cancelAfter() override;

View File

@ -57,7 +57,7 @@ bool PacketReceiver::checkTimeout()
return true;
}
void PacketReceiver::Task::run(AsyncCallback async_callback, ResumeCallback suspend_callback)
void PacketReceiver::Task::run(AsyncCallback async_callback, SuspendCallback suspend_callback)
{
while (true)
{

View File

@ -57,7 +57,7 @@ private:
PacketReceiver & receiver;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
/// When epoll file descriptor is ready, check if it's an expired timeout.

View File

@ -6,6 +6,7 @@
#include <Common/noexcept_scope.h>
#include <Common/setThreadName.h>
#include <Common/logger_useful.h>
#include <Common/ThreadPool.h>
namespace DB
{
@ -41,9 +42,14 @@ std::exception_ptr LoadJob::exception() const
return load_exception;
}
ssize_t LoadJob::priority() const
size_t LoadJob::executionPool() const
{
return load_priority;
return execution_pool_id;
}
size_t LoadJob::pool() const
{
return pool_id;
}
void LoadJob::wait() const
@ -112,8 +118,9 @@ void LoadJob::enqueued()
enqueue_time = std::chrono::system_clock::now();
}
void LoadJob::execute(const LoadJobPtr & self)
void LoadJob::execute(size_t pool, const LoadJobPtr & self)
{
execution_pool_id = pool;
start_time = std::chrono::system_clock::now();
func(self);
}
@ -148,22 +155,35 @@ void LoadTask::remove()
{
loader.remove(jobs);
jobs.clear();
goal_jobs.clear();
}
}
void LoadTask::detach()
{
jobs.clear();
goal_jobs.clear();
}
AsyncLoader::AsyncLoader(Metric metric_threads, Metric metric_active_threads, size_t max_threads_, bool log_failures_, bool log_progress_)
AsyncLoader::AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_)
: log_failures(log_failures_)
, log_progress(log_progress_)
, log(&Poco::Logger::get("AsyncLoader"))
, max_threads(max_threads_)
, pool(metric_threads, metric_active_threads, max_threads)
{
pools.reserve(pool_initializers.size());
for (auto && init : pool_initializers)
pools.push_back({
.name = init.name,
.priority = init.priority,
.thread_pool = std::make_unique<ThreadPool>(
init.metric_threads,
init.metric_active_threads,
init.max_threads,
/* max_free_threads = */ 0,
init.max_threads),
.max_threads = init.max_threads
});
}
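Based on the initializer fields visible in this constructor (`name`, `priority`, `metric_threads`, `metric_active_threads`, `max_threads`), building a loader with several pools presumably looks like the sketch below. The metric identifiers, priority values, and field order are assumptions for illustration, not taken from the ClickHouse source:

```cpp
// Hypothetical setup: a high-priority foreground pool and a low-priority
// background pool. Field names follow the diff above; everything else
// (metric constants, Priority semantics, field order) is assumed.
AsyncLoader loader(
    {
        {
            .name = "ForegroundLoad",
            .priority = Priority{0},                                  // assumed: lower value = runs first
            .metric_threads = CurrentMetrics::ForegroundLoadThreads,  // hypothetical metric
            .metric_active_threads = CurrentMetrics::ForegroundLoadThreadsActive,
            .max_threads = 16,
        },
        {
            .name = "BackgroundLoad",
            .priority = Priority{1},
            .metric_threads = CurrentMetrics::BackgroundLoadThreads,
            .metric_active_threads = CurrentMetrics::BackgroundLoadThreadsActive,
            .max_threads = 4,
        },
    },
    /* log_failures = */ true,
    /* log_progress = */ false);
```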
AsyncLoader::~AsyncLoader()
@ -175,13 +195,20 @@ void AsyncLoader::start()
{
std::unique_lock lock{mutex};
is_running = true;
for (size_t i = 0; workers < max_threads && i < ready_queue.size(); i++)
spawn(lock);
updateCurrentPriorityAndSpawn(lock);
}
void AsyncLoader::wait()
{
pool.wait();
// Because a job can create new jobs in other pools, we have to recheck in a loop
std::unique_lock lock{mutex};
while (!scheduled_jobs.empty())
{
lock.unlock();
for (auto & p : pools)
p.thread_pool->wait();
lock.lock();
}
}
void AsyncLoader::stop()
@ -191,7 +218,7 @@ void AsyncLoader::stop()
is_running = false;
// NOTE: there is no need to notify because workers never wait
}
pool.wait();
wait();
}
void AsyncLoader::schedule(LoadTask & task)
@ -229,9 +256,9 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
old_jobs = finished_jobs.size();
}
// Make set of jobs to schedule:
// Pass 1. Make set of jobs to schedule:
// 1) exclude already scheduled or finished jobs
// 2) include pending dependencies, that are not yet scheduled
// 2) include assigned job dependencies (that are not yet scheduled)
LoadJobSet jobs;
for (const auto & job : input_jobs)
gatherNotScheduled(job, jobs, lock);
@ -242,17 +269,18 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
// We do not want any exception to be thrown after this point, because the following code is not exception-safe
DENY_ALLOCATIONS_IN_SCOPE;
// Schedule all incoming jobs
// Pass 2. Schedule all incoming jobs
for (const auto & job : jobs)
{
chassert(job->pool() < pools.size());
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
scheduled_jobs.emplace(job, Info{.initial_priority = job->load_priority, .priority = job->load_priority});
scheduled_jobs.try_emplace(job);
job->scheduled();
});
}
// Process dependencies on scheduled pending jobs
// Pass 3. Process dependencies on scheduled jobs, priority inheritance
for (const auto & job : jobs)
{
Info & info = scheduled_jobs.find(job)->second;
@ -267,17 +295,18 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
});
info.dependencies_left++;
// Priority inheritance: prioritize deps to have at least given `priority` to avoid priority inversion
prioritize(dep, info.priority, lock);
// Priority inheritance: prioritize deps to have at least given `pool.priority` to avoid priority inversion
prioritize(dep, job->pool_id, lock);
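// (Without this hoisting, a job in a higher-priority pool could stall behind a
// dependency scheduled in a lower-priority pool - the classic priority inversion.)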
}
}
// Enqueue non-blocked jobs (w/o dependencies) to ready queue
if (!info.is_blocked())
if (!info.isBlocked())
enqueue(info, job, lock);
}
// Process dependencies on other jobs. It is done in a separate pass to facilitate propagation of cancel signals (if any).
// Pass 4: Process dependencies on other jobs.
// It is done in a separate pass to facilitate cancelling due to already failed dependencies.
for (const auto & job : jobs)
{
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
@ -285,12 +314,12 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
for (const auto & dep : job->dependencies)
{
if (scheduled_jobs.contains(dep))
continue; // Skip dependencies on scheduled pending jobs (already processed)
continue; // Skip dependencies on scheduled jobs (already processed in pass 3)
LoadStatus dep_status = dep->status();
if (dep_status == LoadStatus::OK)
continue; // Dependency on already successfully finished job -- it's okay.
// Dependency on not scheduled pending job -- it's bad.
// Dependency on assigned job -- it's bad.
// Probably, there is an error in `jobs` set, `gatherNotScheduled()` should have fixed it.
chassert(dep_status != LoadStatus::PENDING);
@ -305,7 +334,7 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
job->name,
getExceptionMessage(dep->exception(), /* with_stacktrace = */ false)));
});
finish(lock, job, LoadStatus::CANCELED, e);
finish(job, LoadStatus::CANCELED, e, lock);
break; // This job is now finished, stop its dependencies processing
}
}
@ -327,13 +356,14 @@ void AsyncLoader::gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs,
}
}
void AsyncLoader::prioritize(const LoadJobPtr & job, ssize_t new_priority)
void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool)
{
if (!job)
return;
chassert(new_pool < pools.size());
DENY_ALLOCATIONS_IN_SCOPE;
std::unique_lock lock{mutex};
prioritize(job, new_priority, lock);
prioritize(job, new_pool, lock);
}
void AsyncLoader::remove(const LoadJobSet & jobs)
@ -347,14 +377,14 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
{
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
if (info->second.is_executing())
if (info->second.isExecuting())
continue; // Skip executing jobs on the first pass
std::exception_ptr e;
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
e = std::make_exception_ptr(Exception(ErrorCodes::ASYNC_LOAD_CANCELED, "Load job '{}' canceled", job->name));
});
finish(lock, job, LoadStatus::CANCELED, e);
finish(job, LoadStatus::CANCELED, e, lock);
}
}
// On the second pass wait for executing jobs to finish
@ -363,7 +393,7 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
// Job is currently executing
chassert(info->second.is_executing());
chassert(info->second.isExecuting());
lock.unlock();
job->waitNoThrow(); // Wait for job to finish
lock.lock();
@ -379,25 +409,36 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
}
}
void AsyncLoader::setMaxThreads(size_t value)
void AsyncLoader::setMaxThreads(size_t pool, size_t value)
{
std::unique_lock lock{mutex};
pool.setMaxThreads(value);
pool.setMaxFreeThreads(value);
pool.setQueueSize(value);
max_threads = value;
auto & p = pools[pool];
p.thread_pool->setMaxThreads(value);
p.thread_pool->setQueueSize(value); // Keep the queue size equal to the max threads count to avoid blocking during spawning
p.max_threads = value;
if (!is_running)
return;
for (size_t i = 0; workers < max_threads && i < ready_queue.size(); i++)
spawn(lock);
for (size_t i = 0; canSpawnWorker(p, lock) && i < p.ready_queue.size(); i++)
spawn(p, lock);
}
size_t AsyncLoader::getMaxThreads() const
size_t AsyncLoader::getMaxThreads(size_t pool) const
{
std::unique_lock lock{mutex};
return max_threads;
return pools[pool].max_threads;
}
const String & AsyncLoader::getPoolName(size_t pool) const
{
return pools[pool].name; // NOTE: lock is not needed because `name` is const and `pools` are immutable
}
Priority AsyncLoader::getPoolPriority(size_t pool) const
{
return pools[pool].priority; // NOTE: lock is not needed because `priority` is const and `pools` are immutable
}
size_t AsyncLoader::getScheduledJobCount() const
{
std::unique_lock lock{mutex};
@ -412,11 +453,10 @@ std::vector<AsyncLoader::JobState> AsyncLoader::getJobStates() const
states.emplace(job->name, JobState{
.job = job,
.dependencies_left = info.dependencies_left,
.is_executing = info.is_executing(),
.is_blocked = info.is_blocked(),
.is_ready = info.is_ready(),
.initial_priority = info.initial_priority,
.ready_seqno = last_ready_seqno
.ready_seqno = info.ready_seqno,
.is_blocked = info.isBlocked(),
.is_ready = info.isReady(),
.is_executing = info.isExecuting()
});
for (const auto & job : finished_jobs)
states.emplace(job->name, JobState{.job = job});
@ -462,21 +502,21 @@ String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, Lo
return {};
}
void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job)
void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock)
{
chassert(scheduled_jobs.contains(job)); // Job was pending
if (status == LoadStatus::OK)
{
// Notify waiters
job->ok();
// Update dependent jobs and enqueue if ready
chassert(scheduled_jobs.contains(job)); // Job was pending
for (const auto & dep : scheduled_jobs[job].dependent_jobs)
{
chassert(scheduled_jobs.contains(dep)); // All depended jobs must be pending
Info & dep_info = scheduled_jobs[dep];
dep_info.dependencies_left--;
if (!dep_info.is_blocked())
if (!dep_info.isBlocked())
enqueue(dep_info, dep, lock);
}
}
@ -488,11 +528,10 @@ void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr &
else if (status == LoadStatus::CANCELED)
job->canceled(exception_from_job);
chassert(scheduled_jobs.contains(job)); // Job was pending
Info & info = scheduled_jobs[job];
if (info.is_ready())
if (info.isReady())
{
ready_queue.erase(info.key());
pools[job->pool_id].ready_queue.erase(info.ready_seqno);
info.ready_seqno = 0;
}
@ -512,7 +551,7 @@ void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr &
dep->name,
getExceptionMessage(exception_from_job, /* with_stacktrace = */ false)));
});
finish(lock, dep, LoadStatus::CANCELED, e);
finish(dep, LoadStatus::CANCELED, e, lock);
}
// Clean dependency graph edges pointing to canceled jobs
@ -531,87 +570,130 @@ void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr &
});
}
void AsyncLoader::prioritize(const LoadJobPtr & job, ssize_t new_priority, std::unique_lock<std::mutex> & lock)
void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock)
{
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
if (info->second.priority >= new_priority)
return; // Never lower priority
Pool & old_pool = pools[job->pool_id];
Pool & new_pool = pools[new_pool_id];
if (old_pool.priority <= new_pool.priority)
return; // Never lower priority or change pool leaving the same priority
// Update priority and push job forward through ready queue if needed
if (info->second.ready_seqno)
ready_queue.erase(info->second.key());
info->second.priority = new_priority;
job->load_priority.store(new_priority); // Set user-facing priority (may affect executing jobs)
if (info->second.ready_seqno)
UInt64 ready_seqno = info->second.ready_seqno;
// Requeue job into the new pool queue without allocations
if (ready_seqno)
{
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
ready_queue.emplace(info->second.key(), job);
});
new_pool.ready_queue.insert(old_pool.ready_queue.extract(ready_seqno));
if (canSpawnWorker(new_pool, lock))
spawn(new_pool, lock);
}
// Set user-facing pool (may affect executing jobs)
job->pool_id.store(new_pool_id);
// Recurse into dependencies
for (const auto & dep : job->dependencies)
prioritize(dep, new_priority, lock);
prioritize(dep, new_pool_id, lock);
}
}
void AsyncLoader::enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock)
{
chassert(!info.is_blocked());
chassert(!info.isBlocked());
chassert(info.ready_seqno == 0);
info.ready_seqno = ++last_ready_seqno;
Pool & pool = pools[job->pool_id];
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
ready_queue.emplace(info.key(), job);
pool.ready_queue.emplace(info.ready_seqno, job);
});
job->enqueued();
if (is_running && workers < max_threads)
spawn(lock);
if (canSpawnWorker(pool, lock))
spawn(pool, lock);
}
void AsyncLoader::spawn(std::unique_lock<std::mutex> &)
bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
{
workers++;
return is_running
&& !pool.ready_queue.empty()
&& pool.workers < pool.max_threads
&& (!current_priority || *current_priority >= pool.priority);
}
bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
{
return is_running
&& !pool.ready_queue.empty()
&& pool.workers <= pool.max_threads
&& (!current_priority || *current_priority >= pool.priority);
}
void AsyncLoader::updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> & lock)
{
// Find current priority.
// NOTE: We assume low number of pools, so O(N) scans are fine.
std::optional<Priority> priority;
for (Pool & pool : pools)
{
if (pool.isActive() && (!priority || *priority > pool.priority))
priority = pool.priority;
}
current_priority = priority;
// Spawn workers in all pools with current priority
for (Pool & pool : pools)
{
for (size_t i = 0; canSpawnWorker(pool, lock) && i < pool.ready_queue.size(); i++)
spawn(pool, lock);
}
}
void AsyncLoader::spawn(Pool & pool, std::unique_lock<std::mutex> &)
{
pool.workers++;
current_priority = pool.priority; // canSpawnWorker() ensures this would not decrease current_priority
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
pool.scheduleOrThrowOnError([this] { worker(); });
pool.thread_pool->scheduleOrThrowOnError([this, &pool] { worker(pool); });
});
}
void AsyncLoader::worker()
void AsyncLoader::worker(Pool & pool)
{
DENY_ALLOCATIONS_IN_SCOPE;
size_t pool_id = &pool - &*pools.begin();
LoadJobPtr job;
std::exception_ptr exception_from_job;
while (true)
{
// This is inside the loop to also reset previous thread names set inside the jobs
setThreadName("AsyncLoader");
setThreadName(pool.name.c_str());
{
std::unique_lock lock{mutex};
// Handle just executed job
if (exception_from_job)
finish(lock, job, LoadStatus::FAILED, exception_from_job);
finish(job, LoadStatus::FAILED, exception_from_job, lock);
else if (job)
finish(lock, job, LoadStatus::OK);
finish(job, LoadStatus::OK, {}, lock);
if (!is_running || ready_queue.empty() || workers > max_threads)
if (!canWorkerLive(pool, lock))
{
workers--;
if (--pool.workers == 0)
updateCurrentPriorityAndSpawn(lock); // It will spawn lower priority workers if needed
return;
}
// Take next job to be executed from the ready queue
auto it = ready_queue.begin();
auto it = pool.ready_queue.begin();
job = it->second;
ready_queue.erase(it);
pool.ready_queue.erase(it);
scheduled_jobs.find(job)->second.ready_seqno = 0; // This job is no longer in the ready queue
}
@ -619,7 +701,7 @@ void AsyncLoader::worker()
try
{
job->execute(job);
job->execute(pool_id, job);
exception_from_job = {};
}
catch (...)


@ -11,8 +11,9 @@
#include <boost/noncopyable.hpp>
#include <base/types.h>
#include <Common/CurrentMetrics.h>
#include <Common/Priority.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool.h>
#include <Common/ThreadPool_fwd.h>
namespace Poco { class Logger; }
@ -46,22 +47,28 @@ class LoadJob : private boost::noncopyable
{
public:
template <class Func, class LoadJobSetType>
LoadJob(LoadJobSetType && dependencies_, String name_, Func && func_, ssize_t priority_ = 0)
LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, Func && func_)
: dependencies(std::forward<LoadJobSetType>(dependencies_))
, name(std::move(name_))
, pool_id(pool_id_)
, func(std::forward<Func>(func_))
, load_priority(priority_)
{}
// Current job status.
LoadStatus status() const;
std::exception_ptr exception() const;
// Returns current value of a priority of the job. May differ from initial priority.
ssize_t priority() const;
// Returns the pool in which the job is executing (or was executed). May differ from the initial and the current pool.
// The value is only valid (and constant) after execution has started.
size_t executionPool() const;
// Returns the current pool of the job. May differ from the initial and the execution pool.
// This value is intended for creating new jobs during this job's execution.
// The value may be changed by `prioritize()` during job execution.
size_t pool() const;
// Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
// Throws if job is FAILED or CANCELED. Returns or throws immediately on non-pending job.
// Throws if job is FAILED or CANCELED. Returns or throws immediately if called on non-pending job.
void wait() const;
// Wait for a job to reach any non PENDING status.
@ -90,10 +97,11 @@ private:
void scheduled();
void enqueued();
void execute(const LoadJobPtr & self);
void execute(size_t pool, const LoadJobPtr & self);
std::atomic<size_t> execution_pool_id;
std::atomic<size_t> pool_id;
std::function<void(const LoadJobPtr & self)> func;
std::atomic<ssize_t> load_priority;
mutable std::mutex mutex;
mutable std::condition_variable finished;
@ -115,25 +123,25 @@ struct EmptyJobFunc
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), std::forward<Func>(func));
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), 0, std::forward<Func>(func));
}
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(dependencies, std::move(name), std::forward<Func>(func));
return std::make_shared<LoadJob>(dependencies, std::move(name), 0, std::forward<Func>(func));
}
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(LoadJobSet && dependencies, ssize_t priority, String name, Func && func = EmptyJobFunc())
LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), std::forward<Func>(func), priority);
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), pool_id, std::forward<Func>(func));
}
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, ssize_t priority, String name, Func && func = EmptyJobFunc())
LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(dependencies, std::move(name), std::forward<Func>(func), priority);
return std::make_shared<LoadJob>(dependencies, std::move(name), pool_id, std::forward<Func>(func));
}
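// A hedged usage sketch of the overloads above (job names are illustrative): a job constructed without
// an explicit `pool_id` goes to pool 0; the `pool_id` overloads pin it to another pool.
//
//     auto quick = makeLoadJob({}, "quick", [] (const LoadJobPtr &) {}); // pool 0 by default
//     auto slow = makeLoadJob({ quick }, /* pool_id = */ 1, "slow", [] (const LoadJobPtr &) {});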
// Represents a logically connected set of LoadJobs required to achieve some goals (final LoadJob in the set).
@ -185,7 +193,7 @@ inline void scheduleLoad(const LoadTaskPtrs & tasks)
}
template <class... Args>
inline void scheduleLoad(Args && ... args)
inline void scheduleLoadAll(Args && ... args)
{
(scheduleLoad(std::forward<Args>(args)), ...);
}
@ -208,16 +216,16 @@ inline void waitLoad(const LoadTaskPtrs & tasks)
}
template <class... Args>
inline void waitLoad(Args && ... args)
inline void waitLoadAll(Args && ... args)
{
(waitLoad(std::forward<Args>(args)), ...);
}
template <class... Args>
inline void scheduleAndWaitLoad(Args && ... args)
inline void scheduleAndWaitLoadAll(Args && ... args)
{
scheduleLoad(std::forward<Args>(args)...);
waitLoad(std::forward<Args>(args)...);
scheduleLoadAll(std::forward<Args>(args)...);
waitLoadAll(std::forward<Args>(args)...);
}
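// For example (a sketch; the task names are hypothetical), the "All" helpers fold over their
// arguments, so a vector of tasks can be scheduled and awaited in one call:
//
//     LoadTaskPtrs tasks{table_task, dictionary_task};
//     scheduleAndWaitLoadAll(tasks); // scheduleLoadAll(tasks), then waitLoadAll(tasks)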
inline LoadJobSet getGoals(const LoadTaskPtrs & tasks)
@ -228,6 +236,14 @@ inline LoadJobSet getGoals(const LoadTaskPtrs & tasks)
return result;
}
inline LoadJobSet getGoalsOr(const LoadTaskPtrs & tasks, const LoadJobSet & alternative)
{
LoadJobSet result;
for (const auto & task : tasks)
result.insert(task->goals().begin(), task->goals().end());
return result.empty() ? alternative : result;
}
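// A hedged usage sketch (names are hypothetical): when a stage may produce no tasks at all,
// getGoalsOr() provides a fallback dependency set instead of an empty one:
//
//     LoadJobSet deps = getGoalsOr(startup_tasks, { init_job }); // depend on init_job if there are no startup tasks
//     auto job = makeLoadJob(std::move(deps), "after_startup", job_func);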
inline LoadJobSet joinJobs(const LoadJobSet & jobs1, const LoadJobSet & jobs2)
{
LoadJobSet result;
@ -251,100 +267,118 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
return result;
}
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks dependencies and priorities of jobs.
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks job dependencies and priorities.
// Basic usage example:
// // Start async_loader with two thread pools (0=fg, 1=bg):
// AsyncLoader async_loader({
// {"FgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 2, .priority{0}},
// {"BgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 1, .priority{1}}
// });
//
// // Create and schedule a task consisting of three jobs. Job1 has no dependencies and is run first.
// // Job2 and job3 depend on job1 and are run only after job1 completion.
// auto job_func = [&] (const LoadJobPtr & self) {
// LOG_TRACE(log, "Executing load job '{}' with priority '{}'", self->name, self->priority());
// LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, async_loader->getPoolName(self->pool()));
// };
// auto job1 = makeLoadJob({}, "job1", job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", job_func);
// auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 1, job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 1, job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", /* pool_id = */ 1, job_func);
// auto task = makeLoadTask(async_loader, { job1, job2, job3 });
// task.schedule();
// Here we have created and scheduled a task consisting of three jobs. Job1 has no dependencies and is run first.
// Job2 and job3 depend on job1 and are run only after job1 completion. Another thread may prioritize a job and wait for it:
// async_loader->prioritize(job3, /* priority = */ 1); // higher priority jobs are run first, default priority is zero.
// job3->wait(); // blocks until job completion or cancellation and rethrow an exception (if any)
//
// AsyncLoader tracks state of all scheduled jobs. Job lifecycle is the following:
// 1) Job is constructed with PENDING status and initial priority. The job is placed into a task.
// 2) The task is scheduled with all its jobs and their dependencies. A scheduled job may be ready (i.e. have all its dependencies finished) or blocked.
// 3a) When all dependencies are successfully executed, the job became ready. A ready job is enqueued into the ready queue.
// // Another thread may prioritize a job by changing its pool and wait for it:
// async_loader->prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
// job3->wait(); // Blocks until job completion or cancellation and rethrow an exception (if any)
//
// Every job has a pool associated with it. AsyncLoader starts every job in its thread pool.
// Each pool has a constant priority and a mutable maximum number of threads.
// Higher priority (lower `pool.priority` value) jobs are run first.
// No job with lower priority is started while there is at least one higher priority job ready or running.
//
// Job priority can be elevated (but never lowered):
// (a) either if it has a dependent job with higher priority:
//     in this case the priority and the pool of the dependent job are inherited during the `schedule()` call;
// (b) or if the job was explicitly prioritized by a `prioritize(job, higher_priority_pool)` call:
//     this also leads to priority inheritance for all the dependencies.
// The value stored in the load job `pool_id` field is atomic and can be changed even during job execution.
// The job is, of course, not moved from its initial thread pool, but it should use `self->pool()` for
// all new jobs it creates to avoid priority inversion. To obtain the pool in which the job is being
// executed, call `self->executionPool()` instead.
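// A minimal sketch of pool-aware job creation from inside a running job (assuming a `job_func`
// like in the example above):
//
// auto parent_func = [&] (const LoadJobPtr & self) {
//     // Use self->pool() rather than a hard-coded pool id, so nested jobs follow prioritization:
//     auto nested = makeLoadJob({}, self->pool(), "nested", job_func);
// };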
//
// === IMPLEMENTATION DETAILS ===
// All possible states and statuses of a job:
// .---------- scheduled ----------.
// ctor --> assigned --> blocked --> ready --> executing --> finished ------> removed --> dtor
// STATUS: '------------------ PENDING -----------------' '-- OK|FAILED|CANCELED --'
//
// AsyncLoader tracks state of all scheduled and finished jobs. Job lifecycle is the following:
// 1) A job is constructed with PENDING status and assigned to a pool. The job is placed into a task.
// 2) The task is scheduled with all its jobs and their dependencies. A scheduled job may be ready, blocked (and later executing).
// 3a) When all dependencies are successfully finished, the job became ready. A ready job is enqueued into the ready queue of its pool.
// 3b) If at least one of the job dependencies fails or is canceled, then this job is canceled (along with all its dependent jobs).
// On cancellation an ASYNC_LOAD_CANCELED exception is generated and saved inside LoadJob object. The job status is changed to CANCELED.
// Exception is rethrown by any existing or new `wait()` call. The job is moved to the set of the finished jobs.
// 4) The scheduled pending ready job starts execution by a worker. The job is dequeued. Callback `job_func` is called.
// Status of an executing job is PENDING. And it is still considered as a scheduled job by AsyncLoader.
// Note that `job_func` of a CANCELED job is never executed.
// 4) The ready job starts execution by a worker. The job is dequeued. Callback `job_func` is called.
// Status of an executing job is PENDING. Note that `job_func` of a CANCELED job is never executed.
// 5a) On successful execution the job status is changed to OK and all existing and new `wait()` calls finish w/o exceptions.
// 5b) Any exception thrown out of `job_func` is wrapped into an ASYNC_LOAD_FAILED exception and saved inside LoadJob.
// The job status is changed to FAILED. All the dependent jobs are canceled. The exception is rethrown from all existing and new `wait()` calls.
// 6) The job is no longer considered as scheduled and is instead moved to the finished jobs set. This is just for introspection of the finished jobs.
// 7) The task containing this job is destructed or `remove()` is explicitly called. The job is removed from the finished job set.
// 8) The job is destructed.
//
// Every job has a priority associated with it. AsyncLoader runs higher priority (greater `priority` value) jobs first. Job priority can be elevated
// (a) if either it has a dependent job with higher priority (in this case priority of a dependent job is inherited);
// (b) or job was explicitly prioritized by `prioritize(job, higher_priority)` call (this also leads to a priority inheritance for all the dependencies).
// Note that to avoid priority inversion `job_func` should use `self->priority()` to schedule new jobs in AsyncLoader or any other pool.
// Value stored in load job priority field is atomic and can be increased even during job execution.
//
// When a task is scheduled it can contain dependencies on previously scheduled jobs. These jobs can have any status. If job A being scheduled depends on
// another job B that is not yet scheduled, then job B will also be scheduled (even if the task does not contain it).
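// For example, a waiter observes the terminal status through `wait()` (a sketch; the error codes are
// the ones used in this file):
//
// try
// {
//     job->wait(); // returns on OK; rethrows ASYNC_LOAD_FAILED or ASYNC_LOAD_CANCELED otherwise
// }
// catch (...)
// {
//     // All jobs dependent on a FAILED/CANCELED job have been canceled as described above.
// }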
class AsyncLoader : private boost::noncopyable
{
private:
// Key of a pending job in the ready queue.
struct ReadyKey
// Thread pool for job execution.
// Pools control the following aspects of job execution:
// 1) Concurrency: the number of concurrently executing jobs in a pool is limited by `max_threads`.
// 2) Priority: as long as there is an executing worker with higher priority, workers with lower priorities are not started
//    (although they may finish the last job they started before the higher-priority jobs appeared).
struct Pool
{
ssize_t priority; // Ascending order
ssize_t initial_priority; // Ascending order
UInt64 ready_seqno; // Descending order
const String name;
const Priority priority;
std::unique_ptr<ThreadPool> thread_pool; // NOTE: we avoid using a `ThreadPool` queue to be able to move jobs between pools.
std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
size_t max_threads; // Max number of workers to be spawned
size_t workers = 0; // Number of currently executing workers
bool operator<(const ReadyKey & rhs) const
{
if (priority > rhs.priority)
return true;
if (priority < rhs.priority)
return false;
if (initial_priority > rhs.initial_priority)
return true;
if (initial_priority < rhs.initial_priority)
return false;
return ready_seqno < rhs.ready_seqno;
}
bool isActive() const { return workers > 0 || !ready_queue.empty(); }
};
// Scheduling information for a pending job.
struct Info
{
ssize_t initial_priority = 0; // Initial priority passed into schedule().
ssize_t priority = 0; // Elevated priority, due to priority inheritance or prioritize().
size_t dependencies_left = 0; // Current number of dependencies on pending jobs.
UInt64 ready_seqno = 0; // Zero means that job is not in ready queue.
LoadJobSet dependent_jobs; // Set of jobs dependent on this job.
// Three independent states of a non-finished job.
bool is_blocked() const { return dependencies_left > 0; }
bool is_ready() const { return dependencies_left == 0 && ready_seqno > 0; }
bool is_executing() const { return dependencies_left == 0 && ready_seqno == 0; }
// Get key of a ready job
ReadyKey key() const
{
return {.priority = priority, .initial_priority = initial_priority, .ready_seqno = ready_seqno};
}
// Three independent states of a scheduled job.
bool isBlocked() const { return dependencies_left > 0; }
bool isReady() const { return dependencies_left == 0 && ready_seqno > 0; }
bool isExecuting() const { return dependencies_left == 0 && ready_seqno == 0; }
};
public:
using Metric = CurrentMetrics::Metric;
AsyncLoader(Metric metric_threads, Metric metric_active_threads, size_t max_threads_, bool log_failures_, bool log_progress_);
// Helper struct for AsyncLoader construction
struct PoolInitializer
{
String name;
Metric metric_threads;
Metric metric_active_threads;
size_t max_threads;
Priority priority;
};
AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_);
// Stops AsyncLoader before destruction
// WARNING: all tasks instances should be destructed before associated AsyncLoader.
~AsyncLoader();
// Start workers to execute scheduled load jobs.
// Start workers to execute scheduled load jobs. Note that AsyncLoader is constructed as already started.
void start();
// Wait for all load jobs to finish, including all new jobs. So, first, take care to stop adding new jobs.
@ -356,28 +390,32 @@ public:
// - or canceled using ~Task() or remove() later.
void stop();
// Schedule all jobs of given `task` and their dependencies (if any, not scheduled yet).
// Higher priority jobs (with greater `job->priority()` value) are executed earlier.
// All dependencies of a scheduled job inherit its priority if it is higher. This way higher priority job
// never wait for (blocked by) lower priority jobs. No priority inversion is possible.
// Schedule all jobs of given `task` and their dependencies (even if they are not in task).
// All dependencies of a scheduled job inherit its pool if it has higher priority. This way higher priority job
// never waits for (blocked by) lower priority jobs. No priority inversion is possible.
// Idempotent: multiple schedule() calls for the same job are no-op.
// Note that `task` destructor ensures that all its jobs are finished (OK, FAILED or CANCELED)
// and are removed from AsyncLoader, so it is thread-safe to destroy them.
void schedule(LoadTask & task);
void schedule(const LoadTaskPtr & task);
// Schedule all tasks atomically, to ensure that only the highest-priority jobs among all tasks are run first.
void schedule(const std::vector<LoadTaskPtr> & tasks);
void schedule(const LoadTaskPtrs & tasks);
// Increase priority of a job and all its dependencies recursively.
void prioritize(const LoadJobPtr & job, ssize_t new_priority);
// Jobs from higher (than `new_pool`) priority pools are not changed.
void prioritize(const LoadJobPtr & job, size_t new_pool);
// Remove finished jobs, cancel scheduled jobs, wait for executing jobs to finish and remove them.
void remove(const LoadJobSet & jobs);
// Increase or decrease maximum number of simultaneously executing jobs.
void setMaxThreads(size_t value);
// Increase or decrease maximum number of simultaneously executing jobs in `pool`.
void setMaxThreads(size_t pool, size_t value);
size_t getMaxThreads(size_t pool) const;
const String & getPoolName(size_t pool) const;
Priority getPoolPriority(size_t pool) const;
size_t getMaxThreads() const;
size_t getScheduledJobCount() const;
// Helper class for introspection
@ -385,11 +423,10 @@ public:
{
LoadJobPtr job;
size_t dependencies_left = 0;
bool is_executing = false;
UInt64 ready_seqno = 0;
bool is_blocked = false;
bool is_ready = false;
std::optional<ssize_t> initial_priority;
std::optional<UInt64> ready_seqno;
bool is_executing = false;
};
// For introspection and debug only, see `system.async_loader` table
@ -398,42 +435,32 @@ public:
private:
void checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
String checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock);
void finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job = {});
void finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock);
void scheduleImpl(const LoadJobSet & input_jobs);
void gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
void prioritize(const LoadJobPtr & job, ssize_t new_priority, std::unique_lock<std::mutex> & lock);
void prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock);
void enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock);
void spawn(std::unique_lock<std::mutex> &);
void worker();
bool canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &);
bool canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &);
void updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> &);
void spawn(Pool & pool, std::unique_lock<std::mutex> &);
void worker(Pool & pool);
// Logging
const bool log_failures; // Worker should log all exceptions caught from job functions.
const bool log_progress; // Periodically log total progress
Poco::Logger * log;
std::chrono::system_clock::time_point busy_period_start_time;
AtomicStopwatch stopwatch;
size_t old_jobs = 0; // Number of jobs that were finished in previous busy period (for correct progress indication)
mutable std::mutex mutex; // Guards all the fields below.
bool is_running = false;
// Full set of scheduled pending jobs along with scheduling info.
std::unordered_map<LoadJobPtr, Info> scheduled_jobs;
// Subset of scheduled pending non-blocked jobs (waiting for a worker to be executed).
// Represent a queue of jobs in order of decreasing priority and FIFO for jobs with equal priorities.
std::map<ReadyKey, LoadJobPtr> ready_queue;
// Set of finished jobs (for introspection only, until jobs are removed).
LoadJobSet finished_jobs;
// Increasing counter for `ReadyKey` assignment (to preserve FIFO order of the jobs with equal priorities).
UInt64 last_ready_seqno = 0;
// For executing jobs. Note that we avoid using an internal queue of the pool to be able to prioritize jobs.
size_t max_threads;
size_t workers = 0;
ThreadPool pool;
bool is_running = true;
std::optional<Priority> current_priority; // highest priority among active pools
UInt64 last_ready_seqno = 0; // Increasing counter for ready queue keys.
std::unordered_map<LoadJobPtr, Info> scheduled_jobs; // Full set of scheduled pending jobs along with scheduling info.
std::vector<Pool> pools; // Thread pools for job execution and ready queues
LoadJobSet finished_jobs; // Set of finished jobs (for introspection only, until jobs are removed).
AtomicStopwatch stopwatch; // For progress indication
size_t old_jobs = 0; // Number of jobs that were finished in previous busy period (for correct progress indication)
std::chrono::system_clock::time_point busy_period_start_time;
};
}


@ -3,18 +3,11 @@
namespace DB
{
thread_local FiberInfo current_fiber_info;
AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr<AsyncTask> task_) : task(std::move(task_))
{
createFiber();
}
FiberInfo AsyncTaskExecutor::getCurrentFiberInfo()
{
return current_fiber_info;
}
void AsyncTaskExecutor::resume()
{
if (routine_is_finished)
@ -38,10 +31,7 @@ void AsyncTaskExecutor::resume()
void AsyncTaskExecutor::resumeUnlocked()
{
auto parent_fiber_info = current_fiber_info;
current_fiber_info = FiberInfo{&fiber, &parent_fiber_info};
fiber = std::move(fiber).resume();
current_fiber_info = parent_fiber_info;
fiber.resume();
}
void AsyncTaskExecutor::cancel()
@ -69,30 +59,19 @@ struct AsyncTaskExecutor::Routine
struct AsyncCallback
{
AsyncTaskExecutor & executor;
Fiber & fiber;
SuspendCallback suspend_callback;
void operator()(int fd, Poco::Timespan timeout, AsyncEventTimeoutType type, const std::string & desc, uint32_t events)
{
executor.processAsyncEvent(fd, timeout, type, desc, events);
fiber = std::move(fiber).resume();
suspend_callback();
executor.clearAsyncEvent();
}
};
struct ResumeCallback
void operator()(SuspendCallback suspend_callback)
{
Fiber & fiber;
void operator()()
{
fiber = std::move(fiber).resume();
}
};
Fiber operator()(Fiber && sink)
{
auto async_callback = AsyncCallback{executor, sink};
auto suspend_callback = ResumeCallback{sink};
auto async_callback = AsyncCallback{executor, suspend_callback};
try
{
executor.task->run(async_callback, suspend_callback);
@ -110,18 +89,17 @@ struct AsyncTaskExecutor::Routine
}
executor.routine_is_finished = true;
return std::move(sink);
}
};
void AsyncTaskExecutor::createFiber()
{
fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
fiber = Fiber(fiber_stack, Routine{*this});
}
void AsyncTaskExecutor::destroyFiber()
{
boost::context::fiber to_destroy = std::move(fiber);
Fiber to_destroy = std::move(fiber);
}
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description)


@ -22,7 +22,7 @@ enum class AsyncEventTimeoutType
};
using AsyncCallback = std::function<void(int, Poco::Timespan, AsyncEventTimeoutType, const std::string &, uint32_t)>;
using ResumeCallback = std::function<void()>;
using SuspendCallback = std::function<void()>;
struct FiberInfo
{
@ -38,7 +38,7 @@ struct FiberInfo
struct AsyncTask
{
public:
virtual void run(AsyncCallback async_callback, ResumeCallback suspend_callback) = 0;
virtual void run(AsyncCallback async_callback, SuspendCallback suspend_callback) = 0;
virtual ~AsyncTask() = default;
};
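/// A hedged sketch of a concrete task (the type and its members are hypothetical): `run()` is executed
/// inside a fiber and suspends itself through the passed callbacks until the executor resumes it.
///
/// struct ReadTask : public AsyncTask
/// {
///     void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override
///     {
///         async_callback(fd, timeout, timeout_type, description, events); // report an fd to poll; this suspends the fiber
///         suspend_callback(); // or suspend unconditionally without registering an event
///     }
///     int fd = -1;
/// };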
@ -80,7 +80,6 @@ public:
};
#endif
static FiberInfo getCurrentFiberInfo();
protected:
/// Method that is called in resume() before actual fiber resuming.
/// If it returns false, resume() will return immediately without actual fiber resuming.
@ -124,48 +123,6 @@ private:
std::unique_ptr<AsyncTask> task;
};
/// Simple implementation for fiber local variable.
template <typename T>
struct FiberLocal
{
public:
FiberLocal()
{
/// Initialize main instance for this thread. Instances for fibers will inherit it,
/// (it's needed because main instance could be changed before creating fibers
/// and changes should be visible in fibers).
data[nullptr] = T();
}
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
T & get()
{
return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo());
}
T & getInstanceForFiber(FiberInfo info)
{
auto it = data.find(info.fiber);
/// If it's the first request, we need to initialize instance for the fiber
/// using instance from parent fiber or main thread that created fiber.
if (it == data.end())
it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first;
return it->second;
}
std::unordered_map<const Fiber *, T> data;
};
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description);
}


@ -1041,18 +1041,16 @@ void AsynchronousMetrics::update(TimePoint update_time)
// It doesn't read the EOL itself.
++cpuinfo->position();
if (s.rfind("processor", 0) == 0)
static constexpr std::string_view PROCESSOR = "processor";
if (s.starts_with(PROCESSOR))
{
/// s390x example: processor 0: version = FF, identification = 039C88, machine = 3906
/// non s390x example: processor : 0
if (auto colon = s.find_first_of(':'))
{
#ifdef __s390x__
core_id = std::stoi(s.substr(10)); /// 10: length of "processor" plus 1
#else
core_id = std::stoi(s.substr(colon + 2));
#endif
}
auto core_id_start = std::ssize(PROCESSOR);
while (core_id_start < std::ssize(s) && !std::isdigit(s[core_id_start]))
++core_id_start;
core_id = std::stoi(s.substr(core_id_start));
}
else if (s.rfind("cpu MHz", 0) == 0)
{


@ -3,5 +3,147 @@
/// BOOST_USE_ASAN, BOOST_USE_TSAN and BOOST_USE_UCONTEXT should be correctly defined for sanitizers.
#include <base/defines.h>
#include <boost/context/fiber.hpp>
#include <map>
/// Class wrapper for boost::context::fiber.
/// It tracks the currently executing fiber for the thread and
/// supports storing fiber-specific data
/// that is destroyed in the fiber destructor.
class Fiber
{
private:
using Impl = boost::context::fiber;
using FiberPtr = Fiber *;
template <typename T> friend class FiberLocal;
public:
template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn)))
{
}
Fiber() = default;
Fiber(Fiber && other) = default;
Fiber & operator=(Fiber && other) = default;
Fiber(const Fiber &) = delete;
Fiber & operator =(const Fiber &) = delete;
explicit operator bool() const
{
return impl.operator bool();
}
void resume()
{
/// Update information about current executing fiber.
FiberPtr & current_fiber = getCurrentFiber();
FiberPtr parent_fiber = current_fiber;
current_fiber = this;
impl = std::move(impl).resume();
/// Restore parent fiber.
current_fiber = parent_fiber;
}
private:
template <typename Fn>
struct RoutineImpl
{
struct SuspendCallback
{
Impl & impl;
void operator()()
{
impl = std::move(impl).resume();
}
};
explicit RoutineImpl(Fn && fn_) : fn(std::move(fn_))
{
}
Impl operator()(Impl && sink)
{
SuspendCallback suspend_callback{sink};
fn(suspend_callback);
return std::move(sink);
}
Fn fn;
};
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
/// Special wrapper to store data in a unique_ptr.
struct DataWrapper
{
virtual ~DataWrapper() = default;
};
using DataPtr = std::unique_ptr<DataWrapper>;
/// Get reference to fiber-specific data by key
/// (the pointer to the structure that uses this data).
DataPtr & getLocalData(void * key)
{
return local_data[key];
}
Impl && release()
{
return std::move(impl);
}
Impl impl;
std::map<void *, DataPtr> local_data;
};
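/// A minimal usage sketch (assuming a boost::context stack allocator such as
/// boost::context::fixedsize_stack): the routine receives a suspend callback and the owner
/// drives it with resume().
///
/// boost::context::fixedsize_stack stack;
/// Fiber fiber(stack, [] (auto suspend)
/// {
///     /* part 1 */
///     suspend(); // return control to the caller of resume()
///     /* part 2 */
/// });
/// fiber.resume(); // runs part 1
/// fiber.resume(); // runs part 2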
/// Implementation of a fiber-local variable.
/// If we are in a fiber, it returns the fiber-local data,
/// otherwise it returns its single field.
/// Fiber-local data is destroyed in the Fiber destructor.
/// The implementation is similar to boost::fiber::fiber_specific_ptr
/// (we cannot use it because we don't use the boost::fiber API).
template <typename T>
class FiberLocal
{
public:
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
struct DataWrapperImpl : public Fiber::DataWrapper
{
T impl;
};
T & get()
{
Fiber * current_fiber = Fiber::getCurrentFiber();
if (!current_fiber)
return main_instance;
Fiber::DataPtr & ptr = current_fiber->getLocalData(this);
/// Initialize instance on first request.
if (!ptr)
ptr = std::make_unique<DataWrapperImpl>();
return dynamic_cast<DataWrapperImpl *>(ptr.get())->impl;
}
T main_instance;
};
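/// A hedged usage sketch: a FiberLocal is declared like a global or thread-local variable, and each
/// fiber sees its own default-initialized copy on first access (outside fibers, `main_instance` is used):
///
/// FiberLocal<int> fiber_counter;
/// void count() { ++*fiber_counter; } // increments the instance of the current fiber, if any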
using Fiber = boost::context::fiber;


@ -15,9 +15,8 @@ namespace DB
namespace OpenTelemetry
{
///// This code can be executed inside several fibers in one thread,
///// we should use fiber local tracing context.
thread_local FiberLocal<TracingContextOnThread> current_fiber_trace_context;
/// This code can be executed inside fibers, so we should use a fiber-local tracing context.
thread_local FiberLocal<TracingContextOnThread> current_trace_context;
bool Span::addAttribute(std::string_view name, UInt64 value) noexcept
{
@ -109,7 +108,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc
SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
{
if (!current_fiber_trace_context->isTraceEnabled())
if (!current_trace_context->isTraceEnabled())
{
return;
}
@ -117,8 +116,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
/// Use try-catch to make sure the ctor is exception safe.
try
{
this->trace_id = current_fiber_trace_context->trace_id;
this->parent_span_id = current_fiber_trace_context->span_id;
this->trace_id = current_trace_context->trace_id;
this->parent_span_id = current_trace_context->span_id;
this->span_id = thread_local_rng(); // create a new id for this span
this->operation_name = _operation_name;
this->kind = _kind;
@ -137,7 +136,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
}
/// Set current span as parent of other spans created later on this thread.
current_fiber_trace_context->span_id = this->span_id;
current_trace_context->span_id = this->span_id;
}
void SpanHolder::finish() noexcept
@ -146,12 +145,12 @@ void SpanHolder::finish() noexcept
return;
// First of all, restore old value of current span.
assert(current_fiber_trace_context->span_id == span_id);
current_fiber_trace_context->span_id = parent_span_id;
assert(current_trace_context->span_id == span_id);
current_trace_context->span_id = parent_span_id;
try
{
auto log = current_fiber_trace_context->span_log.lock();
auto log = current_trace_context->span_log.lock();
/// The log might be disabled, check it before use
if (log)
@ -274,7 +273,7 @@ void TracingContext::serialize(WriteBuffer & buf) const
const TracingContextOnThread & CurrentContext()
{
return *current_fiber_trace_context;
return *current_trace_context;
}
void TracingContextOnThread::reset() noexcept
@ -296,7 +295,7 @@ TracingContextHolder::TracingContextHolder(
/// If any exception is raised during the construction, the tracing is not enabled on current thread.
try
{
if (current_fiber_trace_context->isTraceEnabled())
if (current_trace_context->isTraceEnabled())
{
///
/// This is not the normal case,
@ -309,15 +308,15 @@ TracingContextHolder::TracingContextHolder(
/// So this branch ensures this class can be instantiated multiple times on one same thread safely.
///
this->is_context_owner = false;
this->root_span.trace_id = current_fiber_trace_context->trace_id;
this->root_span.parent_span_id = current_fiber_trace_context->span_id;
this->root_span.trace_id = current_trace_context->trace_id;
this->root_span.parent_span_id = current_trace_context->span_id;
this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name;
this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
/// Set the root span as parent of other spans created on current thread
current_fiber_trace_context->span_id = this->root_span.span_id;
current_trace_context->span_id = this->root_span.span_id;
return;
}
@ -361,10 +360,10 @@ TracingContextHolder::TracingContextHolder(
}
/// Set up trace context on current thread only when the root span is successfully initialized.
*current_fiber_trace_context = _parent_trace_context;
current_fiber_trace_context->span_id = this->root_span.span_id;
current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_fiber_trace_context->span_log = _span_log;
*current_trace_context = _parent_trace_context;
current_trace_context->span_id = this->root_span.span_id;
current_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_trace_context->span_log = _span_log;
}
TracingContextHolder::~TracingContextHolder()
@ -376,7 +375,7 @@ TracingContextHolder::~TracingContextHolder()
try
{
auto shared_span_log = current_fiber_trace_context->span_log.lock();
auto shared_span_log = current_trace_context->span_log.lock();
if (shared_span_log)
{
try
@ -407,11 +406,11 @@ TracingContextHolder::~TracingContextHolder()
if (this->is_context_owner)
{
/// Clear the context on current thread
current_fiber_trace_context->reset();
current_trace_context->reset();
}
else
{
current_fiber_trace_context->span_id = this->root_span.parent_span_id;
current_trace_context->span_id = this->root_span.parent_span_id;
}
}

11
src/Common/Priority.h Normal file

@ -0,0 +1,11 @@
#pragma once
#include <base/types.h>
/// Common type for priority values.
/// A separate type (rather than `Int64`) is used just to avoid implicit conversion errors and to default-initialize the value.
struct Priority
{
Int64 value = 0; /// Note that lower value means higher priority.
constexpr operator Int64() const { return value; } /// NOLINT
};
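/// For example, comparisons go through the implicit conversion to Int64, so the higher priority
/// compares as the smaller value:
///
/// static_assert(Priority{-1} < Priority{0});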


@ -8,6 +8,9 @@
M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \
M(SelectQuery, "Same as Query, but only for SELECT queries.") \
M(InsertQuery, "Same as Query, but only for INSERT queries.") \
M(QueriesWithSubqueries, "Count queries with all subqueries") \
M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \
M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \
M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \
M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \
M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.") \
@ -366,7 +369,7 @@ The server successfully detected this situation and will download merged part fr
M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \
M(WriteBufferFromS3Bytes, "Bytes written to S3.") \
M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.") \
\
M(WriteBufferFromS3WaitInflightLimitMicroseconds, "Time spent on waiting while some of the current requests are done when its number reached the limit defined by s3_max_inflight_parts_for_one_file.") \
M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \
\
M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)") \


@ -92,7 +92,7 @@ public:
String getName() const override { return LogElement::name(); }
static const char * getDefaultOrderBy() { return "(event_date, event_time)"; }
static const char * getDefaultOrderBy() { return "event_date, event_time"; }
protected:
Poco::Logger * log;


@ -123,7 +123,7 @@ void ThreadPoolImpl<Thread>::setQueueSize(size_t value)
template <typename Thread>
template <typename ReturnType>
ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, ssize_t priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context)
ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context)
{
auto on_error = [&](const std::string & reason)
{
@ -231,19 +231,19 @@ void ThreadPoolImpl<Thread>::startNewThreadsNoLock()
}
template <typename Thread>
void ThreadPoolImpl<Thread>::scheduleOrThrowOnError(Job job, ssize_t priority)
void ThreadPoolImpl<Thread>::scheduleOrThrowOnError(Job job, Priority priority)
{
scheduleImpl<void>(std::move(job), priority, std::nullopt);
}
template <typename Thread>
bool ThreadPoolImpl<Thread>::trySchedule(Job job, ssize_t priority, uint64_t wait_microseconds) noexcept
bool ThreadPoolImpl<Thread>::trySchedule(Job job, Priority priority, uint64_t wait_microseconds) noexcept
{
return scheduleImpl<bool>(std::move(job), priority, wait_microseconds);
}
template <typename Thread>
void ThreadPoolImpl<Thread>::scheduleOrThrow(Job job, ssize_t priority, uint64_t wait_microseconds, bool propagate_opentelemetry_tracing_context)
void ThreadPoolImpl<Thread>::scheduleOrThrow(Job job, Priority priority, uint64_t wait_microseconds, bool propagate_opentelemetry_tracing_context)
{
scheduleImpl<void>(std::move(job), priority, wait_microseconds, propagate_opentelemetry_tracing_context);
}


@ -18,6 +18,7 @@
#include <Common/OpenTelemetryTraceContext.h>
#include <Common/CurrentMetrics.h>
#include <Common/ThreadPool_fwd.h>
#include <Common/Priority.h>
#include <base/scope_guard.h>
/** Very simple thread pool similar to boost::threadpool.
@ -59,17 +60,17 @@ public:
/// If any thread was throw an exception, first exception will be rethrown from this method,
/// and exception will be cleared.
/// Also throws an exception if cannot create thread.
/// Priority: greater is higher.
/// Priority: lower is higher.
/// NOTE: Probably you should call wait() if an exception was thrown. If some previously scheduled jobs are using some objects
/// located on the stack of the current thread, the stack must not be unwound until all jobs are finished. However,
/// if ThreadPool is a local object, it will wait for all scheduled jobs in its own destructor.
void scheduleOrThrowOnError(Job job, ssize_t priority = 0);
void scheduleOrThrowOnError(Job job, Priority priority = {});
/// Similar to scheduleOrThrowOnError(...). Wait for specified amount of time and schedule a job or return false.
bool trySchedule(Job job, ssize_t priority = 0, uint64_t wait_microseconds = 0) noexcept;
bool trySchedule(Job job, Priority priority = {}, uint64_t wait_microseconds = 0) noexcept;
/// Similar to scheduleOrThrowOnError(...). Wait for specified amount of time and schedule a job or throw an exception.
void scheduleOrThrow(Job job, ssize_t priority = 0, uint64_t wait_microseconds = 0, bool propagate_opentelemetry_tracing_context = true);
void scheduleOrThrow(Job job, Priority priority = {}, uint64_t wait_microseconds = 0, bool propagate_opentelemetry_tracing_context = true);
/// Wait for all currently active jobs to be done.
/// You may call schedule and wait many times in arbitrary order.
@ -123,15 +124,15 @@ private:
struct JobWithPriority
{
Job job;
ssize_t priority;
Priority priority;
DB::OpenTelemetry::TracingContextOnThread thread_trace_context;
JobWithPriority(Job job_, ssize_t priority_, const DB::OpenTelemetry::TracingContextOnThread& thread_trace_context_)
JobWithPriority(Job job_, Priority priority_, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_)
: job(job_), priority(priority_), thread_trace_context(thread_trace_context_) {}
bool operator< (const JobWithPriority & rhs) const
bool operator<(const JobWithPriority & rhs) const
{
return priority < rhs.priority;
return priority > rhs.priority; // Reversed for `priority_queue` max-heap to yield minimum value (i.e. highest priority) first
}
};
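// A hedged illustration of the reversed comparator (a standalone sketch): std::priority_queue is a
// max-heap over operator<, so inverting the comparison makes the numerically smallest `priority`
// (i.e. the highest priority) surface first:
//
//     std::priority_queue<JobWithPriority> queue;
//     // ... push jobs with Priority{1}, Priority{0}, Priority{2} ...
//     queue.top(); // yields the job with Priority{0}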
@ -141,7 +142,7 @@ private:
std::stack<OnDestroyCallback> on_destroy_callbacks;
template <typename ReturnType>
ReturnType scheduleImpl(Job job, ssize_t priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context = true);
ReturnType scheduleImpl(Job job, Priority priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context = true);
void worker(typename std::list<Thread>::iterator thread_it);
@ -227,7 +228,7 @@ public:
DB::ThreadStatus thread_status;
std::apply(function, arguments);
},
0, // default priority
{}, // default priority
0, // default wait_microseconds
propagate_opentelemetry_context
);


@ -30,6 +30,11 @@ namespace DB::ErrorCodes
extern const int ASYNC_LOAD_CANCELED;
}
struct Initializer {
size_t max_threads = 1;
Priority priority;
};
struct AsyncLoaderTest
{
AsyncLoader loader;
@ -37,10 +42,34 @@ struct AsyncLoaderTest
std::mutex rng_mutex;
pcg64 rng{randomSeed()};
explicit AsyncLoaderTest(std::vector<Initializer> initializers)
: loader(getPoolInitializers(initializers), /* log_failures = */ false, /* log_progress = */ false)
{
loader.stop(); // All tests call `start()` manually to better control ordering
}
explicit AsyncLoaderTest(size_t max_threads = 1)
: loader(CurrentMetrics::TablesLoaderThreads, CurrentMetrics::TablesLoaderThreadsActive, max_threads, /* log_failures = */ false, /* log_progress = */ false)
: AsyncLoaderTest({{.max_threads = max_threads}})
{}
std::vector<AsyncLoader::PoolInitializer> getPoolInitializers(std::vector<Initializer> initializers)
{
std::vector<AsyncLoader::PoolInitializer> result;
size_t pool_id = 0;
for (auto & desc : initializers)
{
result.push_back({
.name = fmt::format("Pool{}", pool_id),
.metric_threads = CurrentMetrics::TablesLoaderThreads,
.metric_active_threads = CurrentMetrics::TablesLoaderThreadsActive,
.max_threads = desc.max_threads,
.priority = desc.priority
});
pool_id++;
}
return result;
}
template <typename T>
T randomInt(T from, T to)
{
@ -114,16 +143,19 @@ struct AsyncLoaderTest
TEST(AsyncLoader, Smoke)
{
AsyncLoaderTest t(2);
AsyncLoaderTest t({
{.max_threads = 2, .priority = Priority{0}},
{.max_threads = 2, .priority = Priority{1}},
});
static constexpr ssize_t low_priority = -1;
static constexpr size_t low_priority_pool = 1;
std::atomic<size_t> jobs_done{0};
std::atomic<size_t> low_priority_jobs_done{0};
auto job_func = [&] (const LoadJobPtr & self) {
jobs_done++;
if (self->priority() == low_priority)
if (self->pool() == low_priority_pool)
low_priority_jobs_done++;
};
@ -135,7 +167,7 @@ TEST(AsyncLoader, Smoke)
auto job3 = makeLoadJob({ job2 }, "job3", job_func);
auto job4 = makeLoadJob({ job2 }, "job4", job_func);
auto task2 = t.schedule({ job3, job4 });
auto job5 = makeLoadJob({ job3, job4 }, low_priority, "job5", job_func);
auto job5 = makeLoadJob({ job3, job4 }, low_priority_pool, "job5", job_func);
task2->merge(t.schedule({ job5 }));
std::thread waiter_thread([=] { job5->wait(); });
@ -387,6 +419,8 @@ TEST(AsyncLoader, CancelExecutingTask)
}
}
// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` function
// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482
TEST(AsyncLoader, DISABLED_JobFailure)
{
AsyncLoaderTest t;
@ -536,7 +570,7 @@ TEST(AsyncLoader, TestOverload)
AsyncLoaderTest t(3);
t.loader.start();
size_t max_threads = t.loader.getMaxThreads();
size_t max_threads = t.loader.getMaxThreads(/* pool = */ 0);
std::atomic<int> executing{0};
for (int concurrency = 4; concurrency <= 8; concurrency++)
@ -562,15 +596,35 @@ TEST(AsyncLoader, TestOverload)
TEST(AsyncLoader, StaticPriorities)
{
AsyncLoaderTest t(1);
AsyncLoaderTest t({
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority{-9}},
});
std::string schedule;
auto job_func = [&] (const LoadJobPtr & self)
{
schedule += fmt::format("{}{}", self->name, self->priority());
schedule += fmt::format("{}{}", self->name, self->pool());
};
// Job DAG with priorities. After priority inheritance from H9, jobs D9 and E9 can be
// executed in undefined order (Tested further in DynamicPriorities)
// A0(9) -+-> B3
// |
// `-> C4
// |
// `-> D1(9) -.
// | +-> F0(9) --> G0(9) --> H9
// `-> E2(9) -'
std::vector<LoadJobPtr> jobs;
jobs.push_back(makeLoadJob({}, 0, "A", job_func)); // 0
jobs.push_back(makeLoadJob({ jobs[0] }, 3, "B", job_func)); // 1
@ -584,25 +638,113 @@ TEST(AsyncLoader, StaticPriorities)
t.loader.start();
t.loader.wait();
ASSERT_TRUE(schedule == "A9E9D9F9G9H9C4B3" || schedule == "A9D9E9F9G9H9C4B3");
}
ASSERT_EQ(schedule, "A9E9D9F9G9H9C4B3");
TEST(AsyncLoader, SimplePrioritization)
{
AsyncLoaderTest t({
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
});
t.loader.start();
std::atomic<int> executed{0}; // Number of previously executed jobs (to test execution order)
LoadJobPtr job_to_prioritize;
auto job_func_A_booster = [&] (const LoadJobPtr &)
{
ASSERT_EQ(executed++, 0);
t.loader.prioritize(job_to_prioritize, 2);
};
auto job_func_B_tester = [&] (const LoadJobPtr &)
{
ASSERT_EQ(executed++, 2);
};
auto job_func_C_boosted = [&] (const LoadJobPtr &)
{
ASSERT_EQ(executed++, 1);
};
std::vector<LoadJobPtr> jobs;
jobs.push_back(makeLoadJob({}, 1, "A", job_func_A_booster)); // 0
jobs.push_back(makeLoadJob({jobs[0]}, 1, "B", job_func_B_tester)); // 1
jobs.push_back(makeLoadJob({}, 0, "C", job_func_C_boosted)); // 2
auto task = makeLoadTask(t.loader, { jobs.begin(), jobs.end() });
job_to_prioritize = jobs[2]; // C
scheduleAndWaitLoadAll(task);
}
TEST(AsyncLoader, DynamicPriorities)
{
AsyncLoaderTest t(1);
AsyncLoaderTest t({
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority{-9}},
});
for (bool prioritize : {false, true})
{
// Although all pools have max_threads=1, workers from different pools can run simultaneously just after `prioritize()` call
std::barrier sync(2);
bool wait_sync = prioritize;
std::mutex schedule_mutex;
std::string schedule;
LoadJobPtr job_to_prioritize;
// The order of execution of jobs D and E after prioritization is undefined, because it depends on `ready_seqno`
// (which depends on the initial `schedule()` order, which in turn depends on `std::unordered_map` order).
// So we have to obtain `ready_seqno` to be sure.
UInt64 ready_seqno_D = 0;
UInt64 ready_seqno_E = 0;
auto job_func = [&] (const LoadJobPtr & self)
{
{
std::unique_lock lock{schedule_mutex};
schedule += fmt::format("{}{}", self->name, self->executionPool());
}
if (prioritize && self->name == "C")
t.loader.prioritize(job_to_prioritize, 9); // dynamic prioritization
schedule += fmt::format("{}{}", self->name, self->priority());
{
for (const auto & state : t.loader.getJobStates())
{
if (state.job->name == "D")
ready_seqno_D = state.ready_seqno;
if (state.job->name == "E")
ready_seqno_E = state.ready_seqno;
}
// Jobs D and E should be enqueued at this point
ASSERT_LT(0, ready_seqno_D);
ASSERT_LT(0, ready_seqno_E);
// Dynamic prioritization G0 -> G9
// Note that it will spawn a concurrent worker in a higher-priority pool
t.loader.prioritize(job_to_prioritize, 9);
sync.arrive_and_wait(); // (A) wait for the higher-priority worker (B) to show that they can run concurrently
}
if (wait_sync && (self->name == "D" || self->name == "E"))
{
wait_sync = false;
sync.arrive_and_wait(); // (B)
}
};
// Job DAG with initial priorities. During execution of C4, the priority of job G0 is raised to G9, postponing execution of job B3.
@ -624,14 +766,19 @@ TEST(AsyncLoader, DynamicPriorities)
jobs.push_back(makeLoadJob({ jobs[6] }, 0, "H", job_func)); // 7
auto task = t.schedule({ jobs.begin(), jobs.end() });
job_to_prioritize = jobs[6];
job_to_prioritize = jobs[6]; // G
t.loader.start();
t.loader.wait();
t.loader.stop();
if (prioritize)
{
if (ready_seqno_D < ready_seqno_E)
ASSERT_EQ(schedule, "A4C4D9E9F9G9B3H0");
else
ASSERT_EQ(schedule, "A4C4E9D9F9G9B3H0");
}
else
ASSERT_EQ(schedule, "A4C4B3E2D1F0G0H0");
}
@ -742,8 +889,64 @@ TEST(AsyncLoader, SetMaxThreads)
syncs[idx]->arrive_and_wait(); // (A)
sync_index++;
if (sync_index < syncs.size())
t.loader.setMaxThreads(max_threads_values[sync_index]);
t.loader.setMaxThreads(/* pool = */ 0, max_threads_values[sync_index]);
syncs[idx]->arrive_and_wait(); // (B) this sync point is required to allow `executing` value to go back down to zero after we change number of workers
}
t.loader.wait();
}
TEST(AsyncLoader, DynamicPools)
{
const size_t max_threads[] { 2, 10 };
const int jobs_in_chain = 16;
AsyncLoaderTest t({
{.max_threads = max_threads[0], .priority{0}},
{.max_threads = max_threads[1], .priority{-1}},
});
t.loader.start();
std::atomic<size_t> executing[2] { 0, 0 }; // Number of currently executing jobs per pool
for (int concurrency = 1; concurrency <= 12; concurrency++)
{
std::atomic<bool> boosted{false}; // Visible concurrency was increased
std::atomic<int> left{concurrency * jobs_in_chain / 2}; // Number of jobs to start before `prioritize()` call
LoadJobSet jobs_to_prioritize;
auto job_func = [&] (const LoadJobPtr & self)
{
auto pool_id = self->executionPool();
executing[pool_id]++;
if (executing[pool_id] > max_threads[0])
boosted = true;
ASSERT_LE(executing[pool_id], max_threads[pool_id]);
// Dynamic prioritization
if (--left == 0)
{
for (const auto & job : jobs_to_prioritize)
t.loader.prioritize(job, 1);
}
t.randomSleepUs(100, 200, 100);
ASSERT_LE(executing[pool_id], max_threads[pool_id]);
executing[pool_id]--;
};
std::vector<LoadTaskPtr> tasks;
tasks.reserve(concurrency);
for (int i = 0; i < concurrency; i++)
tasks.push_back(makeLoadTask(t.loader, t.chainJobSet(jobs_in_chain, job_func)));
jobs_to_prioritize = getGoals(tasks); // All jobs
scheduleAndWaitLoadAll(tasks);
ASSERT_EQ(executing[0], 0);
ASSERT_EQ(executing[1], 0);
ASSERT_EQ(boosted, concurrency > 2);
boosted = false;
}
}
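
These tests exercise the reworked pool-based API: each pool pairs a thread limit with a priority, jobs record which pool they run in, and `prioritize()` moves a job into a more urgent pool. A minimal standalone sketch of the pool-ordering idea, using hypothetical toy types rather than the real `AsyncLoader` interface:

#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// Toy model: each pool has a priority value (smaller = more urgent, as in
// the tests above), and prioritize() moves a job into a more urgent pool.
struct Job
{
    std::string name;
    size_t pool = 0; // index into the pool table
};

int main()
{
    // Pool priorities: pool 2 is the most urgent (priority -2).
    const std::vector<int> pool_priority{0, -1, -2};

    std::vector<Job> jobs{{"A", 0}, {"B", 0}, {"C", 0}};

    // prioritize(): move job "C" into pool 2; a real scheduler would also
    // propagate the boost to C's not-yet-finished dependencies.
    for (auto & job : jobs)
        if (job.name == "C")
            job.pool = 2;

    // Execute in pool-priority order (stable within a pool).
    std::stable_sort(jobs.begin(), jobs.end(), [&](const Job & l, const Job & r)
    {
        return pool_priority[l.pool] < pool_priority[r.pool];
    });

    std::string schedule;
    for (const auto & job : jobs)
        schedule += job.name;
    assert(schedule == "CAB"); // the boosted job runs first
    std::cout << schedule << '\n';
}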
View File
@ -28,7 +28,7 @@ void CachedCompressedReadBuffer::initInput()
}
void CachedCompressedReadBuffer::prefetch(int64_t priority)
void CachedCompressedReadBuffer::prefetch(Priority priority)
{
initInput();
file_in->prefetch(priority);
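
This and the following prefetch signatures swap a raw `int64_t` for a dedicated `Priority` type. Judging from the usage later in this diff (`Priority{read_settings.priority.value + priority.value}`), it behaves like a strong typedef over an integer; a sketch of such a wrapper, an assumption rather than the repository's actual definition:

#include <cstdint>

/// Hypothetical strong-typedef sketch: wrapping the raw integer stops an
/// unrelated int64_t (a size, an offset, ...) from being passed where a
/// priority is expected, while the value stays explicitly reachable.
struct Priority
{
    int64_t value = 0; /// smaller value = higher priority, as in the tests above
};

/// Mirrors the combining pattern visible in AsynchronousBoundedReadBuffer below.
inline Priority combine(Priority base, Priority boost)
{
    return Priority{base.value + boost.value};
}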
View File
@ -36,7 +36,7 @@ private:
bool nextImpl() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
/// Passed into file_in.
ReadBufferFromFileBase::ProfileCallback profile_callback;
View File
@ -51,7 +51,7 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr<ReadB
}
void CompressedReadBufferFromFile::prefetch(int64_t priority)
void CompressedReadBufferFromFile::prefetch(Priority priority)
{
file_in.prefetch(priority);
}
View File
@ -43,7 +43,7 @@ private:
bool nextImpl() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
public:
explicit CompressedReadBufferFromFile(std::unique_ptr<ReadBufferFromFileBase> buf, bool allow_different_codecs_ = false);
View File
@ -47,7 +47,8 @@ struct Settings;
M(Bool, compress_snapshots_with_zstd_format, true, "Write compressed snapshots in ZSTD format (instead of custom LZ4)", 0) \
M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) \
M(UInt64, max_log_file_size, 50 * 1024 * 1024, "Max size of the Raft log file. If possible, each created log file will preallocate this amount of bytes on disk. Set to 0 to disable the limit", 0) \
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0)
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \
M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)
View File
@ -471,17 +471,6 @@ void KeeperServer::shutdown()
namespace
{
// Serialize the request with all the necessary information for the leader
// we don't know ZXID and digest yet so we don't serialize it
nuraft::ptr<nuraft::buffer> getZooKeeperRequestMessage(const KeeperStorage::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
return write_buf.getBuffer();
}
// Serialize the request for the log entry
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session)
{
@ -489,12 +478,11 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestFor
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
DB::writeIntBinary(request_for_session.zxid, write_buf);
assert(request_for_session.digest);
DB::writeIntBinary(request_for_session.digest->version, write_buf);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
DB::writeIntBinary(request_for_session.digest->value, write_buf);
/// we fill with dummy values to eliminate an unnecessary copy later on when we write the correct values
DB::writeIntBinary(static_cast<int64_t>(0), write_buf); /// zxid
DB::writeIntBinary(KeeperStorage::DigestVersion::NO_DIGEST, write_buf); /// digest version or NO_DIGEST flag
DB::writeIntBinary(static_cast<uint64_t>(0), write_buf); /// digest value
/// if new fields are added, update KeeperStateMachine::ZooKeeperLogSerializationVersion along with parseRequest function and PreAppendLog callback handler
return write_buf.getBuffer();
}
@ -512,9 +500,7 @@ RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForS
{
std::vector<nuraft::ptr<nuraft::buffer>> entries;
for (const auto & request_for_session : requests_for_sessions)
{
entries.push_back(getZooKeeperRequestMessage(request_for_session));
}
entries.push_back(getZooKeeperLogEntry(request_for_session));
std::lock_guard lock{server_write_mutex};
if (is_recovering)
@ -635,14 +621,50 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log);
auto next_zxid = state_machine->getNextZxid();
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
request_for_session.zxid = next_zxid;
if (!state_machine->preprocess(request_for_session))
auto entry_buf = entry->get_buf_ptr();
KeeperStateMachine::ZooKeeperLogSerializationVersion serialization_version;
auto request_for_session = state_machine->parseRequest(*entry_buf, /*final=*/false, &serialization_version);
request_for_session->zxid = next_zxid;
if (!state_machine->preprocess(*request_for_session))
return nuraft::cb_func::ReturnCode::ReturnNull;
request_for_session.digest = state_machine->getNodesDigest();
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), getZooKeeperLogEntry(request_for_session), entry->get_val_type());
request_for_session->digest = state_machine->getNodesDigest();
/// older versions of Keeper can send logs that are missing some fields
size_t bytes_missing = 0;
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
bytes_missing += sizeof(request_for_session->time);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_ZXID_DIGEST)
bytes_missing += sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (bytes_missing != 0)
{
auto new_buffer = nuraft::buffer::alloc(entry_buf->size() + bytes_missing);
memcpy(new_buffer->data_begin(), entry_buf->data_begin(), entry_buf->size());
entry_buf = std::move(new_buffer);
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), entry_buf, entry->get_val_type());
}
size_t write_buffer_header_size
= sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
write_buffer_header_size += sizeof(request_for_session->time);
auto * buffer_start = reinterpret_cast<BufferBase::Position>(entry_buf->data_begin() + entry_buf->size() - write_buffer_header_size);
WriteBuffer write_buf(buffer_start, write_buffer_header_size);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
writeIntBinary(request_for_session->time, write_buf);
writeIntBinary(request_for_session->zxid, write_buf);
writeIntBinary(request_for_session->digest->version, write_buf);
if (request_for_session->digest->version != KeeperStorage::NO_DIGEST)
writeIntBinary(request_for_session->digest->value, write_buf);
break;
}
case nuraft::cb_func::AppendLogFailed:
@ -654,8 +676,8 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log);
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
state_machine->rollbackRequest(request_for_session, true);
auto request_for_session = state_machine->parseRequest(entry_buf, true);
state_machine->rollbackRequest(*request_for_session, true);
break;
}
default:
View File
@ -1,16 +1,16 @@
#include <cerrno>
#include <base/errnoToString.h>
#include <base/defines.h>
#include <future>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStateMachine.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <base/defines.h>
#include <base/errnoToString.h>
#include <sys/mman.h>
#include <Common/ProfileEvents.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/ProfileEvents.h>
#include <Common/logger_useful.h>
#include "Coordination/KeeperStorage.h"
@ -60,6 +60,7 @@ KeeperStateMachine::KeeperStateMachine(
coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(coordination_settings_->min_request_size_for_cache)
, last_committed_idx(0)
, log(&Poco::Logger::get("KeeperStateMachine"))
, superdigest(superdigest_)
@ -149,19 +150,19 @@ void assertDigest(
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
auto request_for_session = parseRequest(data, /*final=*/false);
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
preprocess(request_for_session);
preprocess(*request_for_session);
return nullptr;
}
KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer & data)
std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version)
{
ReadBufferFromNuraftBuffer buffer(data);
KeeperStorage::RequestForSession request_for_session;
readIntBinary(request_for_session.session_id, buffer);
auto request_for_session = std::make_shared<KeeperStorage::RequestForSession>();
readIntBinary(request_for_session->session_id, buffer);
int32_t length;
Coordination::read(length, buffer);
@ -169,29 +170,81 @@ KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer
int32_t xid;
Coordination::read(xid, buffer);
static constexpr std::array non_cacheable_xids{
Coordination::WATCH_XID,
Coordination::PING_XID,
Coordination::AUTH_XID,
Coordination::CLOSE_XID,
};
const bool should_cache
= min_request_size_to_cache != 0 && request_for_session->session_id != -1 && data.size() >= min_request_size_to_cache
&& std::all_of(
non_cacheable_xids.begin(), non_cacheable_xids.end(), [&](const auto non_cacheable_xid) { return xid != non_cacheable_xid; });
if (should_cache)
{
std::lock_guard lock(request_cache_mutex);
if (auto xid_to_request_it = parsed_request_cache.find(request_for_session->session_id);
xid_to_request_it != parsed_request_cache.end())
{
auto & xid_to_request = xid_to_request_it->second;
if (auto request_it = xid_to_request.find(xid); request_it != xid_to_request.end())
{
if (final)
{
auto request = std::move(request_it->second);
xid_to_request.erase(request_it);
return request;
}
else
return request_it->second;
}
}
}
Coordination::OpNum opnum;
Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session.request->xid = xid;
request_for_session.request->readImpl(buffer);
request_for_session->request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session->request->xid = xid;
request_for_session->request->readImpl(buffer);
if (!buffer.eof())
readIntBinary(request_for_session.time, buffer);
else /// backward compatibility
request_for_session.time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
readIntBinary(request_for_session.zxid, buffer);
using enum ZooKeeperLogSerializationVersion;
ZooKeeperLogSerializationVersion version = INITIAL;
if (!buffer.eof())
{
request_for_session.digest.emplace();
readIntBinary(request_for_session.digest->version, buffer);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
readIntBinary(request_for_session.digest->value, buffer);
version = WITH_TIME;
readIntBinary(request_for_session->time, buffer);
}
else
request_for_session->time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
{
version = WITH_ZXID_DIGEST;
readIntBinary(request_for_session->zxid, buffer);
chassert(!buffer.eof());
request_for_session->digest.emplace();
readIntBinary(request_for_session->digest->version, buffer);
if (request_for_session->digest->version != KeeperStorage::DigestVersion::NO_DIGEST || !buffer.eof())
readIntBinary(request_for_session->digest->value, buffer);
}
if (serialization_version)
*serialization_version = version;
if (should_cache && !final)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache[request_for_session->session_id].emplace(xid, request_for_session);
}
return request_for_session;
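
parseRequest above distinguishes serialization versions purely by whether the buffer is exhausted; the log-entry layout it implies (reconstructed from the reads above, a summary rather than authoritative documentation):

/// Log entry layout as read by parseRequest(), per serialization version:
///
///   INITIAL:          session_id, length, xid, opnum, request body
///   WITH_TIME:        everything above, then time
///   WITH_ZXID_DIGEST: everything above, then zxid, digest version,
///                     digest value (present unless version == NO_DIGEST)
///
/// Each version only appends fields, so eof() checks are enough to tell the
/// versions apart, and PreAppendLog can upgrade an old entry by allocating
/// `bytes_missing` extra bytes at the tail and rewriting the trailing
/// fields in place.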
@ -231,15 +284,15 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
auto request_for_session = parseRequest(data, true);
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
/// Special processing of session_id request
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
if (request_for_session->request->getOpNum() == Coordination::OpNum::SessionID)
{
const Coordination::ZooKeeperSessionIDRequest & session_id_request
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request);
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session->request);
int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
@ -261,25 +314,34 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
}
else
{
if (request_for_session->request->getOpNum() == Coordination::OpNum::Close)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache.erase(request_for_session->session_id);
}
std::lock_guard lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest(
request_for_session.request, request_for_session.session_id, request_for_session.zxid);
KeeperStorage::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
if (!responses_queue.push(response_for_session))
{
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response_for_session.session_id);
LOG_WARNING(
log,
"Failed to push response with session id {} to the queue, probably because of shutdown",
response_for_session.session_id);
}
if (keeper_context->digest_enabled && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true);
if (keeper_context->digest_enabled && request_for_session->digest)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true);
}
ProfileEvents::increment(ProfileEvents::KeeperCommits);
last_committed_idx = log_idx;
if (commit_callback)
commit_callback(request_for_session);
commit_callback(*request_for_session);
return nullptr;
}
@ -330,14 +392,14 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
auto request_for_session = parseRequest(data, true);
// If we received a log from an older node, use the log_idx as the zxid
// log_idx will always be larger or equal to the zxid so we can safely do this
// (log_idx is increased for all logs, while zxid is only increased for requests)
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
rollbackRequest(request_for_session, false);
rollbackRequest(*request_for_session, false);
}
void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing)
@ -541,11 +603,7 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
/// Pure local request, just process it with storage
std::lock_guard lock(storage_and_responses_lock);
auto responses = storage->processRequest(
request_for_session.request,
request_for_session.session_id,
std::nullopt,
true /*check_acl*/,
true /*is_local*/);
request_for_session.request, request_for_session.session_id, std::nullopt, true /*check_acl*/, true /*is_local*/);
for (const auto & response : responses)
if (!responses_queue.push(response))
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response.session_id);
View File
@ -36,7 +36,22 @@ public:
/// Read state from the latest snapshot
void init();
static KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data);
enum ZooKeeperLogSerializationVersion
{
INITIAL = 0,
WITH_TIME = 1,
WITH_ZXID_DIGEST = 2,
};
/// The lifetime of a parsed request is:
/// [preprocess/PreAppendLog -> commit]
/// [preprocess/PreAppendLog -> rollback]
/// On commit and rollback we can remove the parsed request to keep memory usage at a minimum.
/// The request cache is also cleaned on session close in case something strange happened.
///
/// final - whether this is the final time we will fetch the request, so we can safely remove it from the cache
/// serialization_version - information about which fields were parsed from the buffer, so we can modify the buffer accordingly
std::shared_ptr<KeeperStorage::RequestForSession> parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version = nullptr);
bool preprocess(const KeeperStorage::RequestForSession & request_for_session);
@ -138,6 +153,13 @@ private:
/// for request.
mutable std::mutex storage_and_responses_lock;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorage::RequestForSession>>> parsed_request_cache;
uint64_t min_request_size_to_cache{0};
/// We only need to protect access to the map itself;
/// requests can be modified from anywhere without a lock because a single request
/// can be processed by only one thread at any point in time
std::mutex request_cache_mutex;
/// Last committed Raft log number.
std::atomic<uint64_t> last_committed_idx;
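
The comments above pin down the cache discipline: insert on parse, hand out and erase on the final fetch, wipe on session close. A toy sketch of that discipline under the same locking rule, with hypothetical simplified types rather than the real KeeperStateMachine members:

#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct Request { /* parsed ZooKeeper request */ };
using RequestPtr = std::shared_ptr<Request>;

class RequestCache
{
public:
    // parse time: remember the parsed request for this (session, xid)
    void put(int64_t session_id, int32_t xid, RequestPtr request)
    {
        std::lock_guard lock(mutex);
        cache[session_id].emplace(xid, std::move(request));
    }

    // commit/rollback ("final" fetch): hand the request out and drop it
    RequestPtr take(int64_t session_id, int32_t xid)
    {
        std::lock_guard lock(mutex);
        auto session_it = cache.find(session_id);
        if (session_it == cache.end())
            return nullptr;
        auto request_it = session_it->second.find(xid);
        if (request_it == session_it->second.end())
            return nullptr;
        auto request = std::move(request_it->second);
        session_it->second.erase(request_it);
        return request;
    }

    // session close: drop everything the session left behind
    void dropSession(int64_t session_id)
    {
        std::lock_guard lock(mutex);
        cache.erase(session_id);
    }

private:
    // Only the map itself needs the lock; a single request is never
    // processed by two threads at once (same reasoning as above).
    std::mutex mutex;
    std::unordered_map<int64_t, std::unordered_map<int32_t, RequestPtr>> cache;
};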
View File
@ -110,7 +110,7 @@ public:
struct RequestForSession
{
int64_t session_id;
int64_t time;
int64_t time{0};
Coordination::ZooKeeperRequestPtr request;
int64_t zxid{0};
std::optional<Digest> digest;
View File
@ -1,5 +1,4 @@
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Common/logger_useful.h>
namespace DB
{
View File
@ -63,7 +63,7 @@ namespace DB
\
M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \
M(Int32, dns_cache_update_period, 15, "Internal DNS cache update period in seconds.", 0) \
M(UInt32, dns_max_consecutive_failures, 1024, "Max connection failures before dropping host from ClickHouse DNS cache.", 0) \
M(UInt32, dns_max_consecutive_failures, 1024, "Max DNS resolution failures for a hostname before dropping the hostname from the ClickHouse DNS cache.", 0) \
\
M(UInt64, max_table_size_to_drop, 50000000000lu, "If the size of a table is greater than this value (in bytes), then the table cannot be dropped with any DROP query.", 0) \
M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \
View File
@ -78,6 +78,7 @@ class IColumn;
M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3 s3_min_upload_part_size multiplied by s3_upload_part_size_multiply_factor.", 0) \
M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of concurrently loaded parts in a multipart upload request. 0 means unlimited.", 0) \
M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \
M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \
@ -93,6 +94,7 @@ class IColumn;
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(UInt64, s3_retry_attempts, 10, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \
View File
@ -138,19 +138,6 @@ namespace
}
}
String getCurrentKey(const String & path, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(settings.current_key_id);
if (it == settings.keys.end())
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR,
"Not found a key with the current ID {} required to cipher file {}",
settings.current_key_id,
quoteString(path));
return it->second;
}
String getKey(const String & path, const FileEncryption::Header & header, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(header.key_id);
@ -203,18 +190,19 @@ private:
};
DiskEncrypted::DiskEncrypted(
const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_)
: DiskEncrypted(name_, parseDiskEncryptedSettings(name_, config_, config_prefix_, map_))
const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_, bool use_fake_transaction_)
: DiskEncrypted(name_, parseDiskEncryptedSettings(name_, config_, config_prefix_, map_), use_fake_transaction_)
{
}
DiskEncrypted::DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_)
DiskEncrypted::DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_, bool use_fake_transaction_)
: IDisk(name_)
, delegate(settings_->wrapped_disk)
, encrypted_name(name_)
, disk_path(settings_->disk_path)
, disk_absolute_path(settings_->wrapped_disk->getPath() + settings_->disk_path)
, current_settings(std::move(settings_))
, use_fake_transaction(use_fake_transaction_)
{
delegate->createDirectories(disk_path);
}
@ -309,38 +297,6 @@ std::unique_ptr<ReadBufferFromFileBase> DiskEncrypted::readFile(
return std::make_unique<ReadBufferFromEncryptedFile>(settings.local_fs_buffer_size, std::move(buffer), key, header);
}
std::unique_ptr<WriteBufferFromFileBase> DiskEncrypted::writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings &)
{
auto wrapped_path = wrappedPath(path);
FileEncryption::Header header;
String key;
UInt64 old_file_size = 0;
auto settings = current_settings.get();
if (mode == WriteMode::Append && exists(path))
{
old_file_size = getFileSize(path);
if (old_file_size)
{
/// Append mode: we continue to use the same header.
auto read_buffer = delegate->readFile(wrapped_path, ReadSettings().adjustBufferSize(FileEncryption::Header::kSize));
header = readHeader(*read_buffer);
key = getKey(path, header, *settings);
}
}
if (!old_file_size)
{
/// Rewrite mode: we generate a new header.
key = getCurrentKey(path, *settings);
header.algorithm = settings->current_algorithm;
header.key_id = settings->current_key_id;
header.key_hash = calculateKeyHash(key);
header.init_vector = InitVector::random();
}
auto buffer = delegate->writeFile(wrapped_path, buf_size, mode);
return std::make_unique<WriteBufferFromEncryptedFile>(buf_size, std::move(buffer), key, header, old_file_size);
}
size_t DiskEncrypted::getFileSize(const String & path) const
{
auto wrapped_path = wrappedPath(path);
@ -416,7 +372,7 @@ void registerDiskEncrypted(DiskFactory & factory, bool global_skip_access_check)
const DisksMap & map) -> DiskPtr
{
bool skip_access_check = global_skip_access_check || config.getBool(config_prefix + ".skip_access_check", false);
DiskPtr disk = std::make_shared<DiskEncrypted>(name, config, config_prefix, map);
DiskPtr disk = std::make_shared<DiskEncrypted>(name, config, config_prefix, map, config.getBool(config_prefix + ".use_fake_transaction", true));
disk->startup(context, skip_access_check);
return disk;
};
View File
@ -6,22 +6,14 @@
#include <Disks/IDisk.h>
#include <Common/MultiVersion.h>
#include <Disks/FakeDiskTransaction.h>
#include <Disks/DiskEncryptedTransaction.h>
namespace DB
{
class ReadBufferFromFileBase;
class WriteBufferFromFileBase;
namespace FileEncryption { enum class Algorithm; }
struct DiskEncryptedSettings
{
DiskPtr wrapped_disk;
String disk_path;
std::unordered_map<UInt64, String> keys;
UInt64 current_key_id;
FileEncryption::Algorithm current_algorithm;
};
/// Encrypted disk ciphers all written files on the fly and writes the encrypted files to an underlying (normal) disk.
/// And when we read files from an encrypted disk it deciphers them automatically,
@ -29,8 +21,8 @@ struct DiskEncryptedSettings
class DiskEncrypted : public IDisk
{
public:
DiskEncrypted(const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_);
DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_);
DiskEncrypted(const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_, bool use_fake_transaction_);
DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_, bool use_fake_transaction_);
const String & getName() const override { return encrypted_name; }
const String & getPath() const override { return disk_absolute_path; }
@ -59,28 +51,30 @@ public:
void createDirectory(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->createDirectory(wrapped_path);
auto tx = createEncryptedTransaction();
tx->createDirectory(path);
tx->commit();
}
void createDirectories(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->createDirectories(wrapped_path);
auto tx = createEncryptedTransaction();
tx->createDirectories(path);
tx->commit();
}
void clearDirectory(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->clearDirectory(wrapped_path);
auto tx = createEncryptedTransaction();
tx->clearDirectory(path);
tx->commit();
}
void moveDirectory(const String & from_path, const String & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate->moveDirectory(wrapped_from_path, wrapped_to_path);
auto tx = createEncryptedTransaction();
tx->moveDirectory(from_path, to_path);
tx->commit();
}
DirectoryIteratorPtr iterateDirectory(const String & path) const override
@ -91,22 +85,23 @@ public:
void createFile(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->createFile(wrapped_path);
auto tx = createEncryptedTransaction();
tx->createFile(path);
tx->commit();
}
void moveFile(const String & from_path, const String & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate->moveFile(wrapped_from_path, wrapped_to_path);
auto tx = createEncryptedTransaction();
tx->moveFile(from_path, to_path);
tx->commit();
}
void replaceFile(const String & from_path, const String & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate->replaceFile(wrapped_from_path, wrapped_to_path);
auto tx = createEncryptedTransaction();
tx->replaceFile(from_path, to_path);
tx->commit();
}
void listFiles(const String & path, std::vector<String> & file_names) const override
@ -129,61 +124,67 @@ public:
const String & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings) override;
const WriteSettings & settings) override
{
auto tx = createEncryptedTransaction();
auto result = tx->writeFile(path, buf_size, mode, settings);
return result;
}
void removeFile(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeFile(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeFile(path);
tx->commit();
}
void removeFileIfExists(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeFileIfExists(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeFileIfExists(path);
tx->commit();
}
void removeDirectory(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeDirectory(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeDirectory(path);
tx->commit();
}
void removeRecursive(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeRecursive(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeRecursive(path);
tx->commit();
}
void removeSharedFile(const String & path, bool flag) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedFile(wrapped_path, flag);
auto tx = createEncryptedTransaction();
tx->removeSharedFile(path, flag);
tx->commit();
}
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedRecursive(wrapped_path, keep_all_batch_data, file_names_remove_metadata_only);
auto tx = createEncryptedTransaction();
tx->removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only);
tx->commit();
}
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
for (const auto & file : files)
{
auto wrapped_path = wrappedPath(file.path);
bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
delegate->removeSharedFileIfExists(wrapped_path, keep);
else
delegate->removeSharedFile(wrapped_path, keep);
}
auto tx = createEncryptedTransaction();
tx->removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only);
tx->commit();
}
void removeSharedFileIfExists(const String & path, bool flag) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedFileIfExists(wrapped_path, flag);
auto tx = createEncryptedTransaction();
tx->removeSharedFileIfExists(path, flag);
tx->commit();
}
Strings getBlobPath(const String & path) const override
@ -194,8 +195,9 @@ public:
void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override
{
auto wrapped_path = wrappedPath(path);
delegate->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function));
auto tx = createEncryptedTransaction();
tx->writeFileUsingBlobWritingFunction(path, mode, std::move(write_blob_function));
tx->commit();
}
std::unique_ptr<ReadBufferFromFileBase> readEncryptedFile(const String & path, const ReadSettings & settings) const override
@ -210,8 +212,9 @@ public:
WriteMode mode,
const WriteSettings & settings) const override
{
auto wrapped_path = wrappedPath(path);
return delegate->writeFile(wrapped_path, buf_size, mode, settings);
auto tx = createEncryptedTransaction();
auto buf = tx->writeEncryptedFile(path, buf_size, mode, settings);
return buf;
}
size_t getEncryptedFileSize(const String & path) const override
@ -228,8 +231,9 @@ public:
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override
{
auto wrapped_path = wrappedPath(path);
delegate->setLastModified(wrapped_path, timestamp);
auto tx = createEncryptedTransaction();
tx->setLastModified(path, timestamp);
tx->commit();
}
Poco::Timestamp getLastModified(const String & path) const override
@ -246,15 +250,16 @@ public:
void setReadOnly(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->setReadOnly(wrapped_path);
auto tx = createEncryptedTransaction();
tx->setReadOnly(path);
tx->commit();
}
void createHardLink(const String & src_path, const String & dst_path) override
{
auto wrapped_src_path = wrappedPath(src_path);
auto wrapped_dst_path = wrappedPath(dst_path);
delegate->createHardLink(wrapped_src_path, wrapped_dst_path);
auto tx = createEncryptedTransaction();
tx->createHardLink(src_path, dst_path);
tx->commit();
}
void truncateFile(const String & path, size_t size) override;
@ -289,12 +294,23 @@ public:
SyncGuardPtr getDirectorySyncGuard(const String & path) const override;
std::shared_ptr<DiskEncryptedTransaction> createEncryptedTransaction() const
{
auto delegate_transaction = delegate->createTransaction();
return std::make_shared<DiskEncryptedTransaction>(delegate_transaction, disk_path, *current_settings.get(), delegate.get());
}
DiskTransactionPtr createTransaction() override
{
/// Need to override explicitly because this disk changes
/// a lot of "delegate" methods.
if (use_fake_transaction)
{
return std::make_shared<FakeDiskTransaction>(*this);
}
else
{
return createEncryptedTransaction();
}
}
UInt64 getTotalSpace() const override
{
@ -331,10 +347,7 @@ public:
private:
String wrappedPath(const String & path) const
{
// if path starts_with disk_path -> got already wrapped path
if (!disk_path.empty() && path.starts_with(disk_path))
return path;
return disk_path + path;
return DiskEncryptedTransaction::wrappedPath(disk_path, path);
}
DiskPtr delegate;
@ -342,6 +355,7 @@ private:
const String disk_path;
const String disk_absolute_path;
MultiVersion<DiskEncryptedSettings> current_settings;
bool use_fake_transaction;
};
}
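
Every mutating IDisk method above now follows the same three steps: create an encrypted transaction, replay the call on it, commit. A minimal standalone sketch of that delegation pattern, with illustrative stand-in types rather than the real ClickHouse interfaces:

#include <iostream>
#include <string>

// Stand-in minimal types: every mutating disk call becomes
// "create transaction, replay call, commit".
struct FakeTransaction
{
    std::string disk_path; // prefix added by the encrypted layer

    void removeFile(const std::string & path)
    {
        std::cout << "queued: remove " << disk_path + path << '\n';
    }
    void commit() { std::cout << "commit\n"; }
};

struct EncryptedDisk
{
    std::string disk_path = "encrypted/";

    FakeTransaction createEncryptedTransaction() const { return FakeTransaction{disk_path}; }

    void removeFile(const std::string & path)
    {
        auto tx = createEncryptedTransaction();
        tx.removeFile(path); // the path is wrapped with disk_path inside the tx
        tx.commit();
    }
};

int main()
{
    EncryptedDisk disk;
    disk.removeFile("data.bin"); // queued: remove encrypted/data.bin, then commit
}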
View File
@ -0,0 +1,120 @@
#include <Disks/DiskEncryptedTransaction.h>
#if USE_SSL
#include <IO/FileEncryptionCommon.h>
#include <Common/Exception.h>
#include <boost/algorithm/hex.hpp>
#include <IO/ReadBufferFromEncryptedFile.h>
#include <IO/ReadBufferFromFileDecorator.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromEncryptedFile.h>
#include <Common/quoteString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int DATA_ENCRYPTION_ERROR;
}
namespace
{
FileEncryption::Header readHeader(ReadBufferFromFileBase & read_buffer)
{
try
{
FileEncryption::Header header;
header.read(read_buffer);
return header;
}
catch (Exception & e)
{
e.addMessage("While reading the header of encrypted file " + quoteString(read_buffer.getFileName()));
throw;
}
}
String getCurrentKey(const String & path, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(settings.current_key_id);
if (it == settings.keys.end())
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR,
"Not found a key with the current ID {} required to cipher file {}",
settings.current_key_id,
quoteString(path));
return it->second;
}
String getKey(const String & path, const FileEncryption::Header & header, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(header.key_id);
if (it == settings.keys.end())
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR,
"Not found a key with ID {} required to decipher file {}",
header.key_id,
quoteString(path));
String key = it->second;
if (FileEncryption::calculateKeyHash(key) != header.key_hash)
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR, "Wrong key with ID {}, could not decipher file {}", header.key_id, quoteString(path));
return key;
}
}
void DiskEncryptedTransaction::copyFile(const std::string & from_file_path, const std::string & to_file_path)
{
auto wrapped_from_path = wrappedPath(from_file_path);
auto wrapped_to_path = wrappedPath(to_file_path);
delegate_transaction->copyFile(wrapped_from_path, wrapped_to_path);
}
std::unique_ptr<WriteBufferFromFileBase> DiskEncryptedTransaction::writeFile( // NOLINT
const std::string & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings,
bool autocommit)
{
auto wrapped_path = wrappedPath(path);
FileEncryption::Header header;
String key;
UInt64 old_file_size = 0;
if (mode == WriteMode::Append && delegate_disk->exists(wrapped_path))
{
size_t size = delegate_disk->getFileSize(wrapped_path);
old_file_size = size > FileEncryption::Header::kSize ? (size - FileEncryption::Header::kSize) : 0;
if (old_file_size)
{
/// Append mode: we continue to use the same header.
auto read_buffer = delegate_disk->readFile(wrapped_path, ReadSettings().adjustBufferSize(FileEncryption::Header::kSize));
header = readHeader(*read_buffer);
key = getKey(path, header, current_settings);
}
}
if (!old_file_size)
{
/// Rewrite mode: we generate a new header.
key = getCurrentKey(path, current_settings);
header.algorithm = current_settings.current_algorithm;
header.key_id = current_settings.current_key_id;
header.key_hash = FileEncryption::calculateKeyHash(key);
header.init_vector = FileEncryption::InitVector::random();
}
auto buffer = delegate_transaction->writeFile(wrapped_path, buf_size, mode, settings, autocommit);
return std::make_unique<WriteBufferFromEncryptedFile>(buf_size, std::move(buffer), key, header, old_file_size);
}
}
#endif
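
The append branch above works because an encrypted file starts with a fixed-size header followed by the ciphertext; appending must reuse that header, while a rewrite generates a fresh one. A schematic of the layout as implied by this code (field list taken from the header assignments above):

/// Encrypted file layout implied by writeFile() above:
///
///   [ FileEncryption::Header | ciphertext ... ]
///     ^ fixed Header::kSize bytes: algorithm, key_id, key_hash, init_vector
///
/// Append: the existing header is read back, the same key is looked up, and
/// the stream continues at old_file_size = file_size - Header::kSize
/// plaintext bytes.
/// Rewrite: the current key is chosen and a fresh random init vector is
/// generated, producing a new header.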
View File
@ -0,0 +1,259 @@
#pragma once
#include "config.h"
#if USE_SSL
#include <Disks/IDiskTransaction.h>
#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/WriteBufferFromFile.h>
namespace DB
{
namespace FileEncryption { enum class Algorithm; }
struct DiskEncryptedSettings
{
DiskPtr wrapped_disk;
String disk_path;
std::unordered_map<UInt64, String> keys;
UInt64 current_key_id;
FileEncryption::Algorithm current_algorithm;
};
class DiskEncryptedTransaction : public IDiskTransaction
{
public:
static String wrappedPath(const String disk_path, const String & path)
{
// if path starts with disk_path, it is already a wrapped path
if (!disk_path.empty() && path.starts_with(disk_path))
return path;
return disk_path + path;
}
DiskEncryptedTransaction(DiskTransactionPtr delegate_transaction_, const std::string & disk_path_, DiskEncryptedSettings current_settings_, IDisk * delegate_disk_)
: delegate_transaction(delegate_transaction_)
, disk_path(disk_path_)
, current_settings(current_settings_)
, delegate_disk(delegate_disk_)
{}
/// Tries to commit all accumulated operations simultaneously.
/// If something fails rollback and throw exception.
void commit() override // NOLINT
{
delegate_transaction->commit();
}
void undo() override
{
delegate_transaction->undo();
}
~DiskEncryptedTransaction() override = default;
/// Create directory.
void createDirectory(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->createDirectory(wrapped_path);
}
/// Create directory and all parent directories if necessary.
void createDirectories(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->createDirectories(wrapped_path);
}
/// Remove all files from the directory. Directories are not removed.
void clearDirectory(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->clearDirectory(wrapped_path);
}
/// Move directory from `from_path` to `to_path`.
void moveDirectory(const std::string & from_path, const std::string & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate_transaction->moveDirectory(wrapped_from_path, wrapped_to_path);
}
void moveFile(const std::string & from_path, const std::string & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate_transaction->moveFile(wrapped_from_path, wrapped_to_path);
}
void createFile(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->createFile(wrapped_path);
}
/// Move the file from `from_path` to `to_path`.
/// If a file with `to_path` path already exists, it will be replaced.
void replaceFile(const std::string & from_path, const std::string & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate_transaction->replaceFile(wrapped_from_path, wrapped_to_path);
}
/// Only copying of individual files is supported for now. The disk interface supports copying to another disk,
/// but it's impossible to implement correctly in transactions because the other disk can
/// use a different metadata storage.
/// TODO: maybe remove it altogether, we don't want copies
void copyFile(const std::string & from_file_path, const std::string & to_file_path) override;
/// Open the file for write and return WriteBufferFromFileBase object.
std::unique_ptr<WriteBufferFromFileBase> writeFile( /// NOLINT
const std::string & path,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
WriteMode mode = WriteMode::Rewrite,
const WriteSettings & settings = {},
bool autocommit = true) override;
/// Remove file. Throws exception if file doesn't exists or it's a directory.
void removeFile(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeFile(wrapped_path);
}
/// Remove file if it exists.
void removeFileIfExists(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeFileIfExists(wrapped_path);
}
/// Remove directory. Throws exception if it's not a directory or if directory is not empty.
void removeDirectory(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeDirectory(wrapped_path);
}
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
void removeRecursive(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeRecursive(wrapped_path);
}
/// Remove file. Throws exception if file doesn't exists or if directory is not empty.
/// Differs from removeFile for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
void removeSharedFile(const std::string & path, bool keep_shared_data) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeSharedFile(wrapped_path, keep_shared_data);
}
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
/// Second bool param is a flag to remove (false) or keep (true) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
void removeSharedRecursive(const std::string & path, bool keep_all_shared_data, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeSharedRecursive(wrapped_path, keep_all_shared_data, file_names_remove_metadata_only);
}
/// Remove file or directory if it exists.
/// Differs from removeFileIfExists for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
void removeSharedFileIfExists(const std::string & path, bool keep_shared_data) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeSharedFileIfExists(wrapped_path, keep_shared_data);
}
/// Batch request to remove multiple files.
/// May be much faster for blob storage.
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
for (const auto & file : files)
{
auto wrapped_path = wrappedPath(file.path);
bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
delegate_transaction->removeSharedFileIfExists(wrapped_path, keep);
else
delegate_transaction->removeSharedFile(wrapped_path, keep);
}
}
/// Set last modified time to file or directory at `path`.
void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->setLastModified(wrapped_path, timestamp);
}
/// Just chmod.
void chmod(const String & path, mode_t mode) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->chmod(wrapped_path, mode);
}
/// Set file at `path` as read-only.
void setReadOnly(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->setReadOnly(wrapped_path);
}
/// Create hardlink from `src_path` to `dst_path`.
void createHardLink(const std::string & src_path, const std::string & dst_path) override
{
auto wrapped_src_path = wrappedPath(src_path);
auto wrapped_dst_path = wrappedPath(dst_path);
delegate_transaction->createHardLink(wrapped_src_path, wrapped_dst_path);
}
void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function));
}
std::unique_ptr<WriteBufferFromFileBase> writeEncryptedFile(
const String & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings) const
{
auto wrapped_path = wrappedPath(path);
return delegate_transaction->writeFile(wrapped_path, buf_size, mode, settings);
}
private:
String wrappedPath(const String & path) const
{
return wrappedPath(disk_path, path);
}
DiskTransactionPtr delegate_transaction;
std::string disk_path;
DiskEncryptedSettings current_settings;
IDisk * delegate_disk;
};
}
#endif
View File
@ -188,12 +188,12 @@ try
try
{
file->write(payload.data(), payload.size());
file->finalize();
}
catch (...)
{
/// Log current exception, because finalize() can throw a different exception.
tryLogCurrentException(__PRETTY_FUNCTION__);
file->finalize();
throw;
}
}
View File
@ -83,19 +83,19 @@ bool AsynchronousBoundedReadBuffer::hasPendingDataToRead()
}
std::future<IAsynchronousReader::Result>
AsynchronousBoundedReadBuffer::asyncReadInto(char * data, size_t size, int64_t priority)
AsynchronousBoundedReadBuffer::asyncReadInto(char * data, size_t size, Priority priority)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<RemoteFSFileDescriptor>(*impl, async_read_counters);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
request.priority = read_settings.priority + priority;
request.priority = Priority{read_settings.priority.value + priority.value};
request.ignore = bytes_to_ignore;
return reader.submit(request);
}
void AsynchronousBoundedReadBuffer::prefetch(int64_t priority)
void AsynchronousBoundedReadBuffer::prefetch(Priority priority)
{
if (prefetch_future.valid())
return;
View File
@ -39,7 +39,7 @@ public:
off_t seek(off_t offset_, int whence) override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
void setReadUntilPosition(size_t position) override; /// [..., position).
@ -72,7 +72,7 @@ private:
struct LastPrefetchInfo
{
UInt64 submit_time = 0;
size_t priority = 0;
Priority priority;
};
LastPrefetchInfo last_prefetch_info;
@ -87,7 +87,7 @@ private:
int64_t size,
const std::unique_ptr<Stopwatch> & execution_watch);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, int64_t priority);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, Priority priority);
void resetPrefetch(FilesystemPrefetchState state);
View File
@ -146,7 +146,8 @@ std::unique_ptr<S3::Client> getClient(
S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config);
client_configuration.retryStrategy
= std::make_shared<Aws::Client::DefaultRetryStrategy>(config.getUInt(config_prefix + ".retry_attempts", 10));
= std::make_shared<Aws::Client::DefaultRetryStrategy>(
config.getUInt64(config_prefix + ".retry_attempts", settings.request_settings.retry_attempts));
return S3::ClientFactory::instance().create(
client_configuration,
View File
@ -40,7 +40,7 @@ protected:
settings->keys[0] = key;
settings->current_key_id = 0;
settings->disk_path = path;
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings));
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings), true);
}
String getFileNames()
View File
@ -1230,8 +1230,11 @@ public:
/// The case when arguments are the same (tautological comparison). Return constant.
/// NOTE: Nullable types are a special case.
/// (BTW, this function uses the default implementation for Nullable, so Nullable types cannot appear here. Check just in case.)
/// NOTE: We consider NaN comparison to be implementation-specific (and in our implementation NaNs are sometimes equal and sometimes not).
if (left_type->equals(*right_type) && !left_type->isNullable() && !isTuple(left_type) && col_left_untyped == col_right_untyped)
if (left_type->equals(*right_type) &&
!left_type->isNullable() &&
!isTuple(left_type) &&
!WhichDataType(left_type).isFloat() &&
col_left_untyped == col_right_untyped)
{
ColumnPtr result_column;
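
The added `!WhichDataType(left_type).isFloat()` check exists because the tautological-comparison shortcut is unsound for floating-point columns: NaN is not equal to itself, so `x = x` must not be constant-folded to 1. A self-contained illustration of the underlying IEEE 754 behavior:

#include <cassert>
#include <cmath>

int main()
{
    double x = std::nan("");
    // Tautological comparisons are not tautologies for floats:
    assert(!(x == x)); // NaN != NaN
    assert(x != x);
    // so a column containing NaN cannot have "col = col" folded to a constant 1.
}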
View File
@ -13,7 +13,6 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_STRING_SIZE;
}
@ -25,18 +24,16 @@ struct RepeatImpl
/// Safety threshold against DoS.
static inline void checkRepeatTime(UInt64 repeat_time)
{
static constexpr UInt64 max_repeat_times = 1000000;
static constexpr UInt64 max_repeat_times = 1'000'000;
if (repeat_time > max_repeat_times)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}",
std::to_string(repeat_time), std::to_string(max_repeat_times));
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}", repeat_time, max_repeat_times);
}
static inline void checkStringSize(UInt64 size)
{
static constexpr UInt64 max_string_size = 1 << 30;
if (size > max_string_size)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size ({}) in function repeat, maximum is: {}",
size, max_string_size);
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size ({}) in function repeat, maximum is: {}", size, max_string_size);
}
template <typename T>
@ -186,36 +183,37 @@ public:
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
arguments[0]->getName(), getName());
if (!isInteger(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
arguments[1]->getName(), getName());
return arguments[0];
FunctionArgumentDescriptors args{
{"s", &isString<IDataType>, nullptr, "String"},
{"n", &isInteger<IDataType>, nullptr, "Integer"},
};
validateFunctionArgumentTypes(*this, arguments, args);
return std::make_shared<DataTypeString>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const auto & strcolumn = arguments[0].column;
const auto & numcolumn = arguments[1].column;
const auto & col_str = arguments[0].column;
const auto & col_num = arguments[1].column;
ColumnPtr res;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(strcolumn.get()))
if (const ColumnString * col = checkAndGetColumn<ColumnString>(col_str.get()))
{
if (const ColumnConst * scale_column_num = checkAndGetColumn<ColumnConst>(numcolumn.get()))
if (const ColumnConst * col_num_const = checkAndGetColumn<ColumnConst>(col_num.get()))
{
auto col_res = ColumnString::create();
castType(arguments[1].type.get(), [&](const auto & type)
{
using DataType = std::decay_t<decltype(type)>;
using T = typename DataType::FieldType;
T repeat_time = scale_column_num->getValue<T>();
RepeatImpl::vectorStrConstRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time);
T times = col_num_const->getValue<T>();
RepeatImpl::vectorStrConstRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), times);
return true;
});
return col_res;
@ -224,9 +222,9 @@ public:
{
using DataType = std::decay_t<decltype(type)>;
using T = typename DataType::FieldType;
const ColumnVector<T> * colnum = checkAndGetColumn<ColumnVector<T>>(numcolumn.get());
const ColumnVector<T> * column = checkAndGetColumn<ColumnVector<T>>(col_num.get());
auto col_res = ColumnString::create();
RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), colnum->getData());
RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), column->getData());
res = std::move(col_res);
return true;
}))
@ -234,7 +232,7 @@ public:
return res;
}
}
else if (const ColumnConst * col_const = checkAndGetColumn<ColumnConst>(strcolumn.get()))
else if (const ColumnConst * col_const = checkAndGetColumn<ColumnConst>(col_str.get()))
{
/// Note that the const-const case is handled by useDefaultImplementationForConstants.
@ -244,9 +242,9 @@ public:
{
using DataType = std::decay_t<decltype(type)>;
using T = typename DataType::FieldType;
const ColumnVector<T> * colnum = checkAndGetColumn<ColumnVector<T>>(numcolumn.get());
const ColumnVector<T> * column = checkAndGetColumn<ColumnVector<T>>(col_num.get());
auto col_res = ColumnString::create();
RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData());
RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), column->getData());
res = std::move(col_res);
return true;
}))

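A self-contained sketch (hypothetical and simplified, using plain std::string instead of ColumnString) of the DoS guards in RepeatImpl above: both the repeat count and the resulting string size are bounded before any allocation happens, with the same limits as in the patch.

#include <cstdint>
#include <stdexcept>
#include <string>

static constexpr uint64_t max_repeat_times = 1'000'000;
static constexpr uint64_t max_string_size = 1 << 30; /// 1 GiB

std::string safe_repeat(const std::string & s, uint64_t n)
{
    if (n > max_repeat_times)
        throw std::runtime_error("Too many times to repeat");
    if (s.size() * n > max_string_size)
        throw std::runtime_error("Too large string size");
    std::string res;
    res.reserve(s.size() * n);
    for (uint64_t i = 0; i < n; ++i)
        res += s; /// pattern copy, as RepeatImpl does with memcpy
    return res;
}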
179
src/Functions/space.cpp Normal file

@ -0,0 +1,179 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <cstring>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int TOO_LARGE_STRING_SIZE;
}
namespace
{
/// Produces a string of n whitespaces. space() could also be pushed down to repeat(), but we chose a standalone
/// implementation because we can use memset(), whereas repeat() has to do memcpy().
class FunctionSpace : public IFunction
{
private:
static constexpr auto space = ' ';
/// Safety threshold against DoS.
static inline void checkRepeatTime(size_t repeat_time)
{
static constexpr auto max_repeat_times = 1'000'000uz;
if (repeat_time > max_repeat_times)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}", repeat_time, max_repeat_times);
}
public:
static constexpr auto name = "space";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSpace>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
FunctionArgumentDescriptors args{
{"n", &isInteger<IDataType>, nullptr, "Integer"}
};
validateFunctionArgumentTypes(*this, arguments, args);
return std::make_shared<DataTypeString>();
}
template <typename DataType>
bool executeConstant(ColumnPtr col_times, ColumnString::Offsets & res_offsets, ColumnString::Chars & res_chars) const
{
const ColumnConst * col_times_const = checkAndGetColumn<ColumnConst>(col_times.get());
const ColumnPtr & col_times_const_internal = col_times_const->getDataColumnPtr();
if (!checkAndGetColumn<typename DataType::ColumnType>(col_times_const_internal.get()))
return false;
using T = typename DataType::FieldType;
T times = col_times_const->getValue<T>();
if (times < 1)
times = 0;
checkRepeatTime(times);
res_offsets.resize(col_times->size());
res_chars.resize(col_times->size() * (times + 1));
size_t pos = 0;
for (size_t i = 0; i < col_times->size(); ++i)
{
memset(res_chars.begin() + pos, space, times);
pos += times;
*(res_chars.begin() + pos) = '\0';
pos += 1;
res_offsets[i] = pos;
}
return true;
}
template <typename DataType>
bool executeVector(ColumnPtr col_times_, ColumnString::Offsets & res_offsets, ColumnString::Chars & res_chars) const
{
auto * col_times = checkAndGetColumn<typename DataType::ColumnType>(col_times_.get());
if (!col_times)
return false;
res_offsets.resize(col_times->size());
res_chars.resize(col_times->size() * 10); /// heuristic: ~10 bytes per row; grown below if a row needs more
const PaddedPODArray<typename DataType::FieldType> & times_data = col_times->getData();
size_t pos = 0;
for (size_t i = 0; i < col_times->size(); ++i)
{
typename DataType::FieldType times = times_data[i];
if (times < 1)
times = 0;
checkRepeatTime(times);
if (pos + times + 1 > res_chars.size())
res_chars.resize(std::max(2 * res_chars.size(), static_cast<size_t>(pos + times + 1)));
memset(res_chars.begin() + pos, space, times);
pos += times;
*(res_chars.begin() + pos) = '\0';
pos += 1;
res_offsets[i] = pos;
}
res_chars.resize(pos);
return true;
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
{
const auto & col_num = arguments[0].column;
auto col_res = ColumnString::create();
ColumnString::Offsets & res_offsets = col_res->getOffsets();
ColumnString::Chars & res_chars = col_res->getChars();
if (const ColumnConst * col_num_const = checkAndGetColumn<ColumnConst>(col_num.get()))
{
if ((executeConstant<DataTypeUInt8>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeUInt16>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeUInt32>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeUInt64>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt8>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt16>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt32>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt64>(col_num, res_offsets, res_chars)))
return col_res;
}
else
{
if ((executeVector<DataTypeUInt8>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeUInt16>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeUInt32>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeUInt64>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt8>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt16>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt32>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt64>(col_num, res_offsets, res_chars)))
return col_res;
}
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName());
}
};
}
REGISTER_FUNCTION(Space)
{
factory.registerFunction<FunctionSpace>({}, FunctionFactory::CaseInsensitive);
}
}

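A minimal sketch (hypothetical, using std::string rather than ColumnString) of the design choice stated in the comment at the top of space.cpp: space() fills with a single byte, so memset() suffices, whereas repeat() has to copy a multi-byte pattern with memcpy().

#include <cstddef>
#include <cstring>
#include <string>

std::string space_like(size_t n)
{
    std::string s(n, '\0');
    memset(s.data(), ' ', n); /// single-byte fill, as FunctionSpace does
    return s;
}

std::string repeat_like(const std::string & pattern, size_t n)
{
    std::string s(pattern.size() * n, '\0');
    for (size_t i = 0; i < n; ++i)
        memcpy(s.data() + i * pattern.size(), pattern.data(), pattern.size()); /// pattern copy, as repeat() does
    return s;
}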

@ -26,7 +26,7 @@ namespace ErrorCodes
AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
const std::string & file_name_,
size_t buf_size,
int flags,
@ -60,7 +60,7 @@ AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int & fd_,
const std::string & original_file_name,
size_t buf_size,


@ -17,7 +17,7 @@ protected:
public:
explicit AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
const std::string & file_name_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
int flags = -1,
@ -28,7 +28,7 @@ public:
/// Use pre-opened file descriptor.
explicit AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object.
const std::string & original_file_name = {},
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
@ -58,7 +58,7 @@ private:
public:
AsynchronousReadBufferFromFileWithDescriptorsCache(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
const std::string & file_name_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
int flags = -1,


@ -40,14 +40,14 @@ std::string AsynchronousReadBufferFromFileDescriptor::getFileName() const
}
std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescriptor::asyncReadInto(char * data, size_t size, int64_t priority)
std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescriptor::asyncReadInto(char * data, size_t size, Priority priority)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<IAsynchronousReader::LocalFileDescriptor>(fd);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
request.priority = base_priority + priority;
request.priority = Priority{base_priority.value + priority.value};
request.ignore = bytes_to_ignore;
bytes_to_ignore = 0;
@ -61,7 +61,7 @@ std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescripto
}
void AsynchronousReadBufferFromFileDescriptor::prefetch(int64_t priority)
void AsynchronousReadBufferFromFileDescriptor::prefetch(Priority priority)
{
if (prefetch_future.valid())
return;
@ -151,7 +151,7 @@ void AsynchronousReadBufferFromFileDescriptor::finalize()
AsynchronousReadBufferFromFileDescriptor::AsynchronousReadBufferFromFileDescriptor(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int fd_,
size_t buf_size,
char * existing_memory,


@ -4,6 +4,7 @@
#include <IO/AsynchronousReader.h>
#include <Interpreters/Context.h>
#include <Common/Throttler_fwd.h>
#include <Common/Priority.h>
#include <optional>
#include <unistd.h>
@ -18,7 +19,7 @@ class AsynchronousReadBufferFromFileDescriptor : public ReadBufferFromFileBase
{
protected:
IAsynchronousReader & reader;
int64_t base_priority;
Priority base_priority;
Memory<> prefetch_buffer;
std::future<IAsynchronousReader::Result> prefetch_future;
@ -39,7 +40,7 @@ protected:
public:
AsynchronousReadBufferFromFileDescriptor(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int fd_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
char * existing_memory = nullptr,
@ -49,7 +50,7 @@ public:
~AsynchronousReadBufferFromFileDescriptor() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
int getFD() const
{
@ -70,7 +71,7 @@ public:
size_t getFileSize() override;
private:
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, int64_t priority);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, Priority priority);
};
}


@ -6,6 +6,7 @@
#include <future>
#include <boost/noncopyable.hpp>
#include <Common/Stopwatch.h>
#include <Common/Priority.h>
namespace DB
@ -47,7 +48,7 @@ public:
size_t offset = 0;
size_t size = 0;
char * buf = nullptr;
int64_t priority = 0;
Priority priority;
size_t ignore = 0;
};


@ -19,7 +19,7 @@ public:
const ReadBuffer & getWrappedReadBuffer() const { return *in; }
ReadBuffer & getWrappedReadBuffer() { return *in; }
void prefetch(int64_t priority) override { in->prefetch(priority); }
void prefetch(Priority priority) override { in->prefetch(priority); }
protected:
std::unique_ptr<ReadBuffer> in;


@ -2,6 +2,7 @@
#include <Common/ErrorCodes.h>
#include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/ResourceRequest.h>
#include <Poco/Util/AbstractConfiguration.h>
@ -37,7 +38,7 @@ inline const Poco::Util::AbstractConfiguration & emptyConfig()
struct SchedulerNodeInfo
{
double weight = 1.0; /// Weight of this node among its siblings
Int64 priority = 0; /// Priority of this node among it's siblings (higher value means higher priority)
Priority priority; /// Priority of this node among its siblings (lower value means higher priority)
/// Arbitrary data accessed/stored by parent
union {
@ -65,7 +66,7 @@ struct SchedulerNodeInfo
void setPriority(Int64 value)
{
priority = value;
priority.value = value;
}
};


@ -87,7 +87,7 @@ bool ParallelReadBuffer::addReaderToPool()
auto worker = read_workers.emplace_back(std::make_shared<ReadWorker>(std::move(reader), range_start, size));
++active_working_reader;
schedule([this, my_worker = std::move(worker)]() mutable { readerThreadFunction(std::move(my_worker)); }, 0);
schedule([this, my_worker = std::move(worker)]() mutable { readerThreadFunction(std::move(my_worker)); }, Priority{});
return true;
}


@ -20,7 +20,7 @@ public:
~PeekableReadBuffer() override;
void prefetch(int64_t priority) override { sub_buf->prefetch(priority); }
void prefetch(Priority priority) override { sub_buf->prefetch(priority); }
/// Sets checkpoint at current position
ALWAYS_INLINE inline void setCheckpoint()


@ -6,6 +6,7 @@
#include <memory>
#include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/BufferBase.h>
#include <IO/AsynchronousReader.h>
@ -20,7 +21,7 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
static constexpr auto DEFAULT_PREFETCH_PRIORITY = 0;
static constexpr auto DEFAULT_PREFETCH_PRIORITY = Priority{0};
/** A simple abstract class for buffered data reading (char sequences) from somewhere.
* Unlike std::istream, it provides access to the internal buffer,
@ -208,10 +209,10 @@ public:
/** Do something to allow faster subsequent call to 'nextImpl' if possible.
* It's used for asynchronous readers with double-buffering.
* `priority` is the Threadpool priority, with which the prefetch task will be schedules.
* Smaller is more priority.
* `priority` is the `ThreadPool` priority, with which the prefetch task will be scheduled.
* Lower value means higher priority.
*/
virtual void prefetch(int64_t /* priority */) {}
virtual void prefetch(Priority) {}
/**
* Set upper bound for read range [..., position).

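A hypothetical reconstruction (an assumption for illustration; the real Common/Priority.h may differ) of the strong typedef these hunks migrate to: an Int64 wrapper where a lower value means a higher priority, so raw integers cannot be passed as priorities by accident.

#include <compare>
#include <cstdint>

struct Priority
{
    int64_t value = 0;
    auto operator<=>(const Priority &) const = default;
};

int main()
{
    Priority base{1};
    Priority request{2};
    /// Combining a base priority with a per-request one, as asyncReadInto() does:
    Priority effective{base.value + request.value};
    return effective < base ? 1 : 0; /// a lower value would mean a higher priority
}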

@ -124,7 +124,7 @@ bool ReadBufferFromFileDescriptor::nextImpl()
}
void ReadBufferFromFileDescriptor::prefetch(int64_t)
void ReadBufferFromFileDescriptor::prefetch(Priority)
{
#if defined(POSIX_FADV_WILLNEED)
/// For direct IO, loading data into page cache is pointless.


@ -25,7 +25,7 @@ protected:
ThrottlerPtr throttler;
bool nextImpl() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
/// Name or some description of file.
std::string getFileName() const override;


@ -12,7 +12,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
{
if (whence == SEEK_SET)
{
if (offset >= 0 && internal_buffer.begin() + offset < internal_buffer.end())
if (offset >= 0 && internal_buffer.begin() + offset <= internal_buffer.end())
{
pos = internal_buffer.begin() + offset;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
@ -25,7 +25,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
else if (whence == SEEK_CUR)
{
Position new_pos = pos + offset;
if (new_pos >= internal_buffer.begin() && new_pos < internal_buffer.end())
if (new_pos >= internal_buffer.begin() && new_pos <= internal_buffer.end())
{
pos = new_pos;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().

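A small standalone illustration (hypothetical) of the boundary fix above: a seek target equal to the buffer size is a valid end-of-buffer position, just as lseek() allows seeking to the end of a file, so the range check must be inclusive (<=) rather than strict (<).

#include <cassert>
#include <cstddef>

bool seek_target_in_range(std::ptrdiff_t offset, std::size_t size)
{
    return offset >= 0 && static_cast<std::size_t>(offset) <= size; /// <= admits seeking exactly to EOF
}

int main()
{
    assert(seek_target_in_range(4, 4));  /// seek to EOF: allowed after the fix
    assert(!seek_target_in_range(5, 4)); /// past EOF: still rejected
}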

@ -5,6 +5,7 @@
#include <Core/Defines.h>
#include <Interpreters/Cache/FileCache_fwd.h>
#include <Common/Throttler_fwd.h>
#include <Common/Priority.h>
#include <IO/ResourceLink.h>
namespace DB
@ -84,8 +85,8 @@ struct ReadSettings
size_t mmap_threshold = 0;
MMappedFileCache * mmap_cache = nullptr;
/// For 'pread_threadpool'/'io_uring' method. Lower is more priority.
size_t priority = 0;
/// For 'pread_threadpool'/'io_uring' method. Lower value means higher priority.
Priority priority;
bool load_marks_asynchronously = true;


@ -26,12 +26,12 @@ class PriorityPolicy : public ISchedulerNode
struct Item
{
ISchedulerNode * child = nullptr;
Int64 priority = 0; // higher value means higher priority
Priority priority; // lower value means higher priority
/// For max-heap by priority
bool operator<(const Item& rhs) const noexcept
{
return priority < rhs.priority;
return priority > rhs.priority; // Reversed for heap top to yield highest priority (lowest value) child first
}
};

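A self-contained check (hypothetical, using std::priority_queue instead of the scheduler's own heap) of the reversed comparator above: with operator< flipped, the max-heap surfaces the child with the lowest priority value, that is, the highest priority, first.

#include <cassert>
#include <queue>

struct Item
{
    long long priority;
    bool operator<(const Item & rhs) const { return priority > rhs.priority; } /// reversed, as in PriorityPolicy
};

int main()
{
    std::priority_queue<Item> heap; /// max-heap with respect to operator<
    heap.push({3});
    heap.push({1});
    heap.push({2});
    assert(heap.top().priority == 1); /// lowest value (= highest priority) comes out first
}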

@ -22,9 +22,9 @@ TEST(IOResourcePriorityPolicy, Priorities)
ResourceTest t;
t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>");
t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>");
t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10});
t.enqueue("/B", {10, 10, 10});
@ -56,9 +56,9 @@ TEST(IOResourcePriorityPolicy, Activation)
ResourceTest t;
t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>");
t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>");
t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10, 10, 10, 10});
t.enqueue("/B", {10});


@ -49,7 +49,7 @@ TEST(IOResourceStaticResourceManager, Prioritization)
{
// Lock is not required here because this is called during request execution and we have max_requests = 1
if (last_priority)
EXPECT_TRUE(priority <= *last_priority); // Should be true if every queue arrived at the same time at busy period start
EXPECT_TRUE(priority >= *last_priority); // Should be true if every queue arrived at the same time at busy period start
last_priority = priority;
};
@ -63,8 +63,8 @@ TEST(IOResourceStaticResourceManager, Prioritization)
<res1>
<node path="/"> <type>inflight_limit</type><max_requests>1</max_requests></node>
<node path="/prio"> <type>priority</type></node>
<node path="/prio/A"> <priority>-1</priority></node>
<node path="/prio/B"> <priority>1</priority></node>
<node path="/prio/A"> <priority>1</priority></node>
<node path="/prio/B"> <priority>-1</priority></node>
<node path="/prio/C"> </node>
<node path="/prio/D"> </node>
<node path="/prio/leader"></node>


@ -361,7 +361,7 @@ namespace
task->exception = std::current_exception();
}
task_finish_notify();
}, 0);
}, Priority{});
}
catch (...)
{


@ -17,7 +17,7 @@ public:
off_t seek(off_t off, int whence) override;
void prefetch(int64_t priority) override { impl->prefetch(priority); }
void prefetch(Priority priority) override { impl->prefetch(priority); }
private:
UInt64 min_bytes_for_seek; /// Minimum positive seek offset which shall be executed using seek operation.


@ -92,8 +92,11 @@ WriteBufferFromS3::WriteBufferFromS3(
, write_settings(write_settings_)
, client_ptr(std::move(client_ptr_))
, object_metadata(std::move(object_metadata_))
, buffer_allocation_policy(ChooseBufferPolicy(request_settings_.getUploadSettings()))
, task_tracker(std::make_unique<WriteBufferFromS3::TaskTracker>(std::move(schedule_)))
, buffer_allocation_policy(ChooseBufferPolicy(upload_settings))
, task_tracker(
std::make_unique<WriteBufferFromS3::TaskTracker>(
std::move(schedule_),
upload_settings.max_inflight_parts_for_one_file))
{
LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails());
@ -109,8 +112,11 @@ void WriteBufferFromS3::nextImpl()
ErrorCodes::LOGICAL_ERROR,
"Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest");
/// Make sense to call to before adding new async task to check if there is an exception
task_tracker->waitReady();
/// It makes sense to call waitIfAny before adding a new async task, to check if an exception has occurred.
/// The faster the exception is propagated, the less time is spent on cancellation.
/// Although `task_tracker->add()` collects task statuses and propagates their exceptions,
/// this call is necessary for the case when there is no in-flight limitation, and therefore `task_tracker->add()` doesn't wait for anything.
task_tracker->waitIfAny();
hidePartialData();
@ -134,7 +140,8 @@ void WriteBufferFromS3::preFinalize()
LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails());
task_tracker->waitReady();
/// This function should not be run again if an exception has occurred
is_prefinalized = true;
hidePartialData();
@ -166,8 +173,6 @@ void WriteBufferFromS3::preFinalize()
{
writeMultipartUpload();
}
is_prefinalized = true;
}
void WriteBufferFromS3::finalizeImpl()
@ -212,8 +217,8 @@ String WriteBufferFromS3::getLogDetails() const
multipart_upload_details = fmt::format(", upload id {}, upload has finished {}"
, multipart_upload_id, multipart_upload_finished);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), finalized, multipart_upload_details);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, prefinalized {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), is_prefinalized, finalized, multipart_upload_details);
}
void WriteBufferFromS3::tryToAbortMultipartUpload()
@ -234,7 +239,7 @@ WriteBufferFromS3::~WriteBufferFromS3()
{
LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails());
// That descructor could be call with finalized=false in case of exceptions
// That destructor could be called with finalized=false in case of exceptions
if (!finalized)
{
LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails());


@ -4,12 +4,18 @@
#include <IO/WriteBufferFromS3TaskTracker.h>
namespace ProfileEvents
{
extern const Event WriteBufferFromS3WaitInflightLimitMicroseconds;
}
namespace DB
{
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_)
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_)
: is_async(bool(scheduler_))
, scheduler(scheduler_ ? std::move(scheduler_) : syncRunner())
, max_tasks_inflight(max_tasks_inflight_)
{}
WriteBufferFromS3::TaskTracker::~TaskTracker()
@ -28,103 +34,152 @@ ThreadPoolCallbackRunner<void> WriteBufferFromS3::TaskTracker::syncRunner()
};
}
void WriteBufferFromS3::TaskTracker::waitReady()
{
LOG_TEST(log, "waitReady, in queue {}", futures.size());
/// Exceptions are propagated
auto it = futures.begin();
while (it != futures.end())
{
chassert(it->valid());
if (it->wait_for(std::chrono::seconds(0)) != std::future_status::ready)
{
++it;
continue;
}
try
{
it->get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
it = futures.erase(it);
}
LOG_TEST(log, "waitReady ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::waitAll()
{
LOG_TEST(log, "waitAll, in queue {}", futures.size());
/// Exceptions are propagated
for (auto & future : futures)
{
try
{
future.get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
}
futures.clear();
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::safeWaitAll()
{
LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size());
/// Exceptions are not propagated
for (auto & future : futures)
{
LOG_TEST(log, "safeWaitAll, wait future");
if (future.valid())
future.wait();
}
LOG_TEST(log, "safeWaitAll, get in queue {}", futures.size());
for (auto & future : futures)
{
if (future.valid())
{
try
{
/// Exceptions are not propagated
future.get();
} catch (...)
{
/// But at least they are printed
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
futures.clear();
LOG_TEST(log, "safeWaitAll ended, get in queue {}", futures.size());
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::waitIfAny()
{
LOG_TEST(log, "waitIfAny, in queue {}", futures.size());
if (futures.empty())
return;
Stopwatch watch;
{
std::lock_guard lock(mutex);
for (auto & it : finished_futures)
{
/// Actually that call might block this thread until the future is finally set.
/// However, it won't block us for long: a task is about to finish when its pointer appears in `finished_futures`.
it->get();
/// In case of an exception in `it->get()`,
/// it is not necessary to remove `it` from the list `futures`:
/// `TaskTracker` has to be destroyed after any exception occurs; for this, `safeWaitAll` is called.
/// `safeWaitAll` handles invalid futures in the list `futures`.
futures.erase(it);
}
finished_futures.clear();
}
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::add(Callback && func)
{
LOG_TEST(log, "add, in queue {}", futures.size());
/// All this fuss is about two things. This is the most critical place of TaskTracker.
/// The first is that insertion into the list `futures` must not fail.
/// To guarantee that, the element is allocated at the end of the list `futures` in advance.
/// The second is that the notification about task completion must not fail.
/// To guarantee that, the list element which will be inserted into the list `finished_futures`
/// is allocated in advance as another one-element list, `pre_allocated_finished`.
auto future = scheduler(std::move(func), 0);
auto exit_scope = scope_guard(
[&future]()
/// preallocation for the first issue
futures.emplace_back();
auto future_placeholder = std::prev(futures.end());
/// preallocation for the second issue
FinishedList pre_allocated_finished {future_placeholder};
Callback func_with_notification = [&, func=std::move(func), pre_allocated_finished=std::move(pre_allocated_finished)] () mutable
{
future.wait();
SCOPE_EXIT({
DENY_ALLOCATIONS_IN_SCOPE;
std::lock_guard lock(mutex);
finished_futures.splice(finished_futures.end(), pre_allocated_finished);
has_finished.notify_one();
});
func();
};
/// this move is nothrow
*future_placeholder = scheduler(std::move(func_with_notification), Priority{});
LOG_TEST(log, "add ended, in queue {}, limit {}", futures.size(), max_tasks_inflight);
waitTilInflightShrink();
}
void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()
{
if (!max_tasks_inflight)
return;
LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size());
Stopwatch watch;
/// An alternative approach is to wait until at least futures.size() - max_tasks_inflight elements are finished.
/// However, the faster a finished task is collected, the faster CH checks if there is an exception.
/// The faster an exception is propagated, the less time is spent on cancellation.
while (futures.size() >= max_tasks_inflight)
{
std::unique_lock lock(mutex);
has_finished.wait(lock, [this] () TSA_REQUIRES(mutex) { return !finished_futures.empty(); });
for (auto & it : finished_futures)
{
SCOPE_EXIT({
/// According to basic exception safety, TaskTracker has to be destroyed after an exception.
/// If that were guaranteed, this SCOPE_EXIT would be superfluous.
/// However, WriteBufferWithFinalizeCallback and WriteBufferFromFileDecorator do call finalize in the d-tor.
/// TaskTracker has to cope with this until finalizing in the d-tor is addressed in #50274.
futures.erase(it);
});
it->get();
}
);
futures.push_back(std::move(future));
finished_futures.clear();
}
exit_scope.release();
LOG_TEST(log, "add ended, in queue {}", futures.size());
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size());
}
bool WriteBufferFromS3::TaskTracker::isAsync() const


@ -6,36 +6,61 @@
#include "WriteBufferFromS3.h"
#include <list>
namespace DB
{
/// That class is used only in WriteBufferFromS3 for now.
/// Therefore it is declared as a part of WriteBufferFromS3.
/// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool.
/// TaskTracker brings the methods waitReady, waitAll/safeWaitAll
/// TaskTracker brings the methods waitIfAny, waitAll/safeWaitAll
/// to help with coordination of the running tasks.
/// Basic exception safety is provided. If an exception occurs, the object has to be destroyed.
/// No thread safety is provided. Use this object with no concurrency.
class WriteBufferFromS3::TaskTracker
{
public:
using Callback = std::function<void()>;
explicit TaskTracker(ThreadPoolCallbackRunner<void> scheduler_);
TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_);
~TaskTracker();
static ThreadPoolCallbackRunner<void> syncRunner();
bool isAsync() const;
void waitReady();
/// waitIfAny collects statuses from already finished tasks.
/// There could be no finished tasks yet, in which case waitIfAny does nothing useful.
/// The first exception is thrown if any task has failed.
void waitIfAny();
/// waitAll waits for all the tasks until they finish and collects their statuses.
void waitAll();
/// safeWaitAll does the same as waitAll but mutes the exceptions
void safeWaitAll();
void add(Callback && func);
private:
bool is_async;
/// waitTilInflightShrink waits until the number of in-flight tasks shrinks below the limit `max_tasks_inflight`.
void waitTilInflightShrink() TSA_NO_THREAD_SAFETY_ANALYSIS;
const bool is_async;
ThreadPoolCallbackRunner<void> scheduler;
std::list<std::future<void>> futures;
const size_t max_tasks_inflight;
using FutureList = std::list<std::future<void>>;
FutureList futures;
Poco::Logger * log = &Poco::Logger::get("TaskTracker");
std::mutex mutex;
std::condition_variable has_finished TSA_GUARDED_BY(mutex);
using FinishedList = std::list<FutureList::iterator>;
FinishedList finished_futures TSA_GUARDED_BY(mutex);
};
}

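A stripped-down sketch (hypothetical: plain std::thread in place of ThreadPoolCallbackRunner, and without the exception bookkeeping or the preallocated list nodes) of the in-flight limiting pattern TaskTracker implements: each task signals its completion under a mutex, and add() blocks on a condition variable while the limit is reached.

#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class InflightLimiter
{
public:
    explicit InflightLimiter(std::size_t max_inflight_) : max_inflight(max_inflight_) {}

    void add(std::function<void()> func)
    {
        std::unique_lock lock(mutex);
        if (max_inflight) /// 0 means no limit, as with max_tasks_inflight
            has_finished.wait(lock, [this] { return inflight < max_inflight; }); /// block while at the limit
        ++inflight;
        workers.emplace_back([this, f = std::move(func)]
        {
            f();
            std::lock_guard guard(mutex);
            --inflight;
            has_finished.notify_one(); /// wake one blocked add(), like waitTilInflightShrink
        });
    }

    ~InflightLimiter()
    {
        for (auto & worker : workers)
            worker.join();
    }

private:
    std::mutex mutex;
    std::condition_variable has_finished;
    std::size_t inflight = 0;
    const std::size_t max_inflight;
    std::vector<std::thread> workers;
};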

@ -2041,7 +2041,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
*/
if (data.hasNullKeyData())
{
has_null_key_data = Method::one_key_nullable_optimization;
has_null_key_data = true;
out_cols->key_columns[0]->insertDefault();
insertAggregatesIntoColumns(data.getNullKeyData(), out_cols->final_aggregate_columns, arena);
data.hasNullKeyData() = false;
@ -2076,6 +2076,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data));
places.clear();
out_cols.reset();
has_null_key_data = false;
}
}
});

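A tiny illustration (hypothetical, outside the Aggregator) of the bug class fixed above: a per-block flag declared outside the loop must be reset after each output block is emitted, otherwise the null-key row is attributed to every subsequent block as well.

#include <cassert>
#include <vector>

int main()
{
    bool has_null_key_data = false;
    std::vector<int> null_rows_per_block;
    for (int block = 0; block < 2; ++block)
    {
        if (block == 0)
            has_null_key_data = true; /// only the first block actually contains the null key
        null_rows_per_block.push_back(has_null_key_data ? 1 : 0);
        has_null_key_data = false; /// the fix: reset before the next block
    }
    assert((null_rows_per_block == std::vector<int>{1, 0}));
}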
Some files were not shown because too many files have changed in this diff.