Merge branch 'master' into ADQM-830

This commit is contained in:
Alexey Gerasimchuk 2023-05-31 15:07:37 +10:00 committed by GitHub
commit 44ba35d2c1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
175 changed files with 3589 additions and 1486 deletions

View File

@ -25,6 +25,9 @@ message(STATUS "Intel QPL version: ${QPL_VERSION}")
# Generate 8 library targets: middle_layer_lib, isal, isal_asm, qplcore_px, qplcore_avx512, qplcore_sw_dispatcher, core_iaa, middle_layer_lib.
# Output ch_contrib::qpl by linking with 8 library targets.
# The qpl submodule comes with its own version of isal. It contains code which does not exist in upstream isal. It would be nice to link
# only upstream isal (ch_contrib::isal) but at this point we can't.
include("${QPL_PROJECT_DIR}/cmake/CompileOptions.cmake")
# check nasm compiler
@ -308,7 +311,7 @@ target_include_directories(middle_layer_lib
target_compile_definitions(middle_layer_lib PUBLIC -DQPL_LIB)
# [SUBDIR]c_api
file(GLOB_RECURSE QPL_C_API_SRC
${QPL_SRC_DIR}/c_api/*.c
${QPL_SRC_DIR}/c_api/*.cpp)

View File

@ -1,6 +0,0 @@
# ARM (AArch64) build works on Amazon Graviton, Oracle Cloud, Huawei Cloud ARM machines.
# The support for AArch64 is pre-production ready.
wget 'https://builds.clickhouse.com/master/aarch64/clickhouse'
chmod a+x ./clickhouse
sudo ./clickhouse install

View File

@ -1,3 +0,0 @@
fetch 'https://builds.clickhouse.com/master/freebsd/clickhouse'
chmod a+x ./clickhouse
su -m root -c './clickhouse install'

View File

@ -1,3 +0,0 @@
wget 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse'
chmod a+x ./clickhouse
./clickhouse

View File

@ -1,3 +0,0 @@
wget 'https://builds.clickhouse.com/master/macos/clickhouse'
chmod a+x ./clickhouse
./clickhouse

View File

@ -43,7 +43,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
For other Linux distributions, check the availability of LLVM's [prebuilt packages](https://releases.llvm.org/download.html).
As of April 2023, any version of Clang >= 15 will work.
GCC as a compiler is not supported
GCC as a compiler is not supported.
To build with a specific Clang version:
:::tip
@ -114,18 +114,3 @@ mkdir build
cmake -S . -B build
cmake --build build
```
## You Don't Have to Build ClickHouse {#you-dont-have-to-build-clickhouse}
ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.
The CI checks build the binaries on each commit to [ClickHouse](https://github.com/clickhouse/clickhouse/). To download them:
1. Open the [commits list](https://github.com/ClickHouse/ClickHouse/commits/master)
1. Choose a **Merge pull request** commit that includes the new feature, or was added after the new feature
1. Click the status symbol (yellow dot, red x, green check) to open the CI check list
1. Scroll through the list until you find **ClickHouse build check x/x artifact groups are OK**
1. Click **Details**
1. Find the type of package for your operating system that you need and download the files.
![build artifact check](images/find-build-artifact.png)

View File

@ -131,14 +131,17 @@ CREATE TABLE table_with_asterisk (name String, value UInt32)
The following settings can be set before query execution or placed into configuration file.
- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`.
- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`.
- `s3_max_single_part_upload_size` — The maximum size of an object to upload using single-part upload to S3. Default value is `32Mb`.
- `s3_min_upload_part_size` — The minimum size of a part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `16Mb`.
- `s3_max_redirects` — Maximum number of S3 redirect hops allowed. Default value is `10`.
- `s3_single_read_retries` — The maximum number of attempts during a single read. Default value is `4`.
- `s3_max_put_rps` — Maximum PUT requests-per-second rate before throttling. Default value is `0` (unlimited).
- `s3_max_put_burst` — Maximum number of requests that can be issued simultaneously before hitting the requests-per-second limit. By default (`0` value) it equals `s3_max_put_rps`.
- `s3_max_get_rps` — Maximum GET requests-per-second rate before throttling. Default value is `0` (unlimited).
- `s3_max_get_burst` — Maximum number of requests that can be issued simultaneously before hitting the requests-per-second limit. By default (`0` value) it equals `s3_max_get_rps`.
- `s3_upload_part_size_multiply_factor` - Multiply `s3_min_upload_part_size` by this factor each time `s3_upload_part_size_multiply_parts_count_threshold` parts have been uploaded from a single write to S3. Default value is `2`.
- `s3_upload_part_size_multiply_parts_count_threshold` - Each time this number of parts has been uploaded to S3, `s3_min_upload_part_size` is multiplied by `s3_upload_part_size_multiply_factor`. Default value is `500`.
- `s3_max_inflight_parts_for_one_file` - Limits the number of PUT requests that can run concurrently for one object. The value `0` means unlimited. Default value is `20`. Each in-flight part has a buffer of size `s3_min_upload_part_size` for the first `s3_upload_part_size_multiply_factor` parts, and a larger buffer once the file is big enough (see `s3_upload_part_size_multiply_factor`). With default settings, one uploaded file consumes at most `320Mb` for a file smaller than `8G`; the consumption is greater for larger files.
Security consideration: if a malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; alternatively, `remote_host_filter` must be specified in the server configuration.
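To make the part-size arithmetic concrete, here is a minimal sketch (plain C++, not ClickHouse code) of how the defaults above bound per-file buffer memory: the part size starts at `s3_min_upload_part_size` and is multiplied by `s3_upload_part_size_multiply_factor` every `s3_upload_part_size_multiply_parts_count_threshold` parts, while at most `s3_max_inflight_parts_for_one_file` buffers are alive at once:
```cpp
#include <cstdint>
#include <iostream>

int main()
{
    // Documented defaults (sizes in MiB).
    const std::uint64_t min_part_size = 16; // s3_min_upload_part_size
    const std::uint64_t factor = 2;         // s3_upload_part_size_multiply_factor
    const std::uint64_t threshold = 500;    // s3_upload_part_size_multiply_parts_count_threshold
    const std::uint64_t inflight = 20;      // s3_max_inflight_parts_for_one_file

    std::uint64_t part_size = min_part_size;
    std::uint64_t uploaded = 0;
    for (std::uint64_t part = 1; part <= threshold; ++part)
    {
        uploaded += part_size;   // 500 parts * 16 MiB ~ 8 GiB before the first growth
        if (part % threshold == 0)
            part_size *= factor; // parts 501..1000 are 32 MiB each, and so on
    }
    // Peak buffer memory is bounded by the in-flight limit times the part size:
    // 20 * 16 MiB = 320 MiB for files under ~8 GiB, matching the figures above.
    std::cout << uploaded << " MiB uploaded, peak buffers " << inflight * min_part_size << " MiB\n";
}
```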

View File

@ -1219,11 +1219,12 @@ Authentication parameters (the disk will try all available methods **and** Manag
* `account_name` and `account_key` - For authentication using Shared Key.
Limit parameters (mainly for internal usage):
* `max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
Other parameters:
* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.

View File

@ -36,8 +36,8 @@ The data is in CSV files but uses a semi-colon for the delimiter. The rows look
│ 7389 │ BMP180 │ 3735 │ 50.136 │ 11.062 │ 2019-06-01T00:00:06 │ 98905 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 12.1 │
│ 13199 │ BMP180 │ 6664 │ 52.514 │ 13.44 │ 2019-06-01T00:00:07 │ 101855.54 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.74 │
│ 12753 │ BMP180 │ 6440 │ 44.616 │ 2.032 │ 2019-06-01T00:00:07 │ 99475 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17 │
│ 16956 │ BMP180 │ 8594 │ 52.052 │ 8.354 │ 2019-06-01T00:00:08 │ 101322 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17.2 │
└───────────┴─────────────┴──────────┴────────┴───────┴─────────────────────┴──────────┴──────────┴───────────────────┴─────────────┘
```
2. We will use the following `MergeTree` table to store the data in ClickHouse:

View File

@ -28,23 +28,25 @@ The quickest and easiest way to get up and running with ClickHouse is to create
For production installs of a specific release version see the [installation options](#available-installation-options) down below.
:::
On Linux and macOS:
On Linux, macOS and FreeBSD:
1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server, clickhouse-client, clickhouse-local,
ClickHouse Keeper, and other tools:
1. If you are just getting started and want to see what ClickHouse can do, the simplest way to download ClickHouse locally is to run the
following command. It downloads a single binary for your operating system that can be used to run the ClickHouse server,
clickhouse-client, clickhouse-local, ClickHouse Keeper, and other tools:
```bash
curl https://clickhouse.com/ | sh
```
1. Run the following command to start the ClickHouse server:
```bash
./clickhouse server
```
The first time you run this script, the necessary files and folders are created in the current directory, then the server starts.
1. Open a new terminal and use the **clickhouse-client** to connect to your service:
1. Open a new terminal and use **./clickhouse client** to connect to your service:
```bash
./clickhouse client
@ -330,7 +332,9 @@ For production environments, it's recommended to use the latest `stable`-versi
To run ClickHouse inside Docker follow the guide on [Docker Hub](https://hub.docker.com/r/clickhouse/clickhouse-server/). Those images use official `deb` packages inside.
### From Sources {#from-sources}
## Non-Production Deployments (Advanced)
### Compile From Source {#from-sources}
To manually compile ClickHouse, follow the instructions for [Linux](/docs/en/development/build.md) or [macOS](/docs/en/development/build-osx.md).
@ -346,8 +350,33 @@ You'll need to create data and metadata folders manually and `chown` them for
On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources.
### From CI checks pre-built binaries
ClickHouse binaries are built for each [commit](/docs/en/development/build.md#you-dont-have-to-build-clickhouse).
### Install a CI-generated Binary
ClickHouse's continuous integration (CI) infrastructure produces specialized builds for each commit in the [ClickHouse
repository](https://github.com/clickhouse/clickhouse/), e.g. [sanitized](https://github.com/google/sanitizers) builds, unoptimized (Debug)
builds, cross-compiled builds, etc. While such builds are normally only useful during development, they can in certain situations also be
useful for users.
:::note
Since ClickHouse's CI is evolving over time, the exact steps to download CI-generated builds may vary.
Also, CI may delete build artifacts that are too old, making them unavailable for download.
:::
For example, to download an aarch64 binary for ClickHouse v23.4, follow these steps:
- Find the GitHub pull request for release v23.4: [Release pull request for branch 23.4](https://github.com/ClickHouse/ClickHouse/pull/49238)
- Click "Commits", then click a commit similar to "Update autogenerated version to 23.4.2.1 and contributors" for the particular version you like to install.
- Click the green check / yellow dot / red cross to open the list of CI checks.
- Click "Details" next to "ClickHouse Build Check" in the list, it will open a page similar to [this page](https://s3.amazonaws.com/clickhouse-test-reports/46793/b460eb70bf29b19eadd19a1f959b15d186705394/clickhouse_build_check/report.html)
- Find the rows with compiler = "clang-*-aarch64" - there are multiple rows.
- Download the artifacts for these builds.
To download binaries for very old x86-64 systems without [SSE3](https://en.wikipedia.org/wiki/SSE3) support or old ARM systems without
[ARMv8.1-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.1-A) support, open a [pull
request](https://github.com/ClickHouse/ClickHouse/commits/master) and find the CI check "BuilderBinAmd64Compat" or
"BuilderBinAarch64V80Compat", respectively. Then click "Details", open the "Build" fold, scroll to the end, and find the message "Notice: Build URLs
https://s3.amazonaws.com/clickhouse/builds/PRs/.../.../binary_aarch64_v80compat/clickhouse". You can then click the link to download the
build.
## Launch {#launch}

File diff suppressed because it is too large

View File

@ -577,7 +577,7 @@ Default value: 20
**Usage**
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
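As an illustration only, assuming the documented defaults `background_pool_size = 16` and `background_merges_mutations_concurrency_ratio = 2` (so the product is `32`), the constraint amounts to a check like this hypothetical sketch (not ClickHouse code):
```cpp
#include <stdexcept>

// Hypothetical helper, not ClickHouse code: validates the documented constraint.
void validateFreeEntriesSetting(unsigned free_entries_to_execute_mutation,
                                unsigned background_pool_size = 16,
                                unsigned merges_mutations_concurrency_ratio = 2)
{
    // The setting must stay below pool size * concurrency ratio (16 * 2 = 32 by default),
    // otherwise ClickHouse throws an exception.
    if (free_entries_to_execute_mutation >= background_pool_size * merges_mutations_concurrency_ratio)
        throw std::invalid_argument("number_of_free_entries_in_pool_to_execute_mutation is too large");
}
```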
## max_part_loading_threads {#max-part-loading-threads}
@ -840,4 +840,4 @@ Possible values:
- `Always` or `Never`.
Default value: `Never`

View File

@ -1187,6 +1187,36 @@ Disable limit on kafka_num_consumers that depends on the number of available CPU
Default value: false.
## postgresql_connection_pool_size {#postgresql-connection-pool-size}
Connection pool size for PostgreSQL table engine and database engine.
Default value: 16
## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout}
Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default, it blocks on an empty pool.
Default value: 5000
## postgresql_connection_pool_auto_close_connection {#postgresql-connection-pool-auto-close-connection}
Close connection before returning connection to the pool.
Default value: true.
## odbc_bridge_connection_pool_size {#odbc-bridge-connection-pool-size}
Connection pool size for each connection settings string in ODBC bridge.
Default value: 16
## odbc_bridge_use_connection_pooling {#odbc-bridge-use-connection-pooling}
Use connection pooling in ODBC bridge. If set to false, a new connection is created every time.
Default value: true
## use_uncompressed_cache {#setting-use_uncompressed_cache}
Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled).
@ -3563,7 +3593,7 @@ SETTINGS index_granularity = 8192 │
## external_table_functions_use_nulls {#external-table-functions-use-nulls}
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md)] table functions use Nullable columns.
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns.
Possible values:

View File

@ -29,7 +29,7 @@ select first_value(b) from test_data
### example2
The NULL value is ignored.
```sql
select first_value(b) ignore nulls sfrom test_data
select first_value(b) ignore nulls from test_data
```
```text

View File

@ -2234,7 +2234,7 @@ Result:
## Regular Expression Tree Dictionary {#regexp-tree-dictionary}
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of (user agent)[https://en.wikipedia.org/wiki/User_agent] strings, which can be expressed elegantly with regexp tree dictionaries.
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of [user agent](https://en.wikipedia.org/wiki/User_agent) strings, which can be expressed elegantly with regexp tree dictionaries.
### Use Regular Expression Tree Dictionary in ClickHouse Open-Source
@ -2280,7 +2280,7 @@ This config consists of a list of regular expression tree nodes. Each node has t
- The value of an attribute may contain **back references**, referring to capture groups of the matched regular expression. In the example, the value of attribute `version` in the first node consists of a back-reference `\1` to capture group `(\d+[\.\d]*)` in the regular expression. Back-reference numbers range from 1 to 9 and are written as `$1` or `\1` (for number 1). The back reference is replaced by the matched capture group during query execution.
- **child nodes**: a list of children of a regexp tree node, each of which has its own attributes and (potentially) children nodes. String matching proceeds in a depth-first fashion. If a string matches a regexp node, the dictionary checks if it also matches the nodes' child nodes. If that is the case, the attributes of the deepest matching node are assigned. Attributes of a child node overwrite equally named attributes of parent nodes. The name of child nodes in YAML files can be arbitrary, e.g. `versions` in the above example.
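To illustrate the depth-first rule just described, here is a toy C++ sketch (illustrative only, not ClickHouse's implementation) in which a matching child overwrites the equally named attributes of its parent:
```cpp
#include <map>
#include <regex>
#include <string>
#include <vector>

struct Node
{
    std::regex re;
    std::map<std::string, std::string> attrs;
    std::vector<Node> children;
};

// Depth-first: apply a node's attributes, then let matching children overwrite them.
void match(const Node & node, const std::string & s, std::map<std::string, std::string> & result)
{
    if (!std::regex_search(s, node.re))
        return;
    for (const auto & [name, value] : node.attrs)
        result[name] = value;
    for (const auto & child : node.children)
        match(child, s, result);
}

int main()
{
    Node root{std::regex("Linux"), {{"os", "linux"}}, {Node{std::regex("Android"), {{"os", "android"}}, {}}}};
    std::map<std::string, std::string> attrs;
    match(root, "Linux; Android 10", attrs); // attrs["os"] == "android": the deepest match wins
}
```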
Regexp tree dictionaries only allow access using functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull`.
Regexp tree dictionaries only allow access using the functions `dictGet` and `dictGetOrDefault`.
Example:

View File

@ -34,7 +34,7 @@ For the `SAMPLE` clause the following syntax is supported:
| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) |
## SAMPLE K
## SAMPLE K {#select-sample-k}
Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`.
@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000
In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10.
## SAMPLE N
## SAMPLE N {#select-sample-n}
Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`.
@ -90,7 +90,7 @@ FROM visits
SAMPLE 10000000
```
## SAMPLE K OFFSET M
## SAMPLE K OFFSET M {#select-sample-offset}
Here `k` and `m` are numbers from 0 to 1. Examples are shown below.

View File

@ -1137,6 +1137,16 @@
<ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
-->
<!--
ORDER BY expr: https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#order_by
Example:
event_date, event_time
event_date, type, query_id
event_date, event_time, initial_query_id
<order_by>event_date, event_time, initial_query_id</order_by>
-->
<!-- Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
Example: <engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
-->

View File

@ -152,6 +152,13 @@ public:
nested_func->merge(place, rhs, arena);
}
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
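/// Propagate parallel-merge support from the nested function; the ThreadPool overload of merge() below simply forwards to it.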
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{
nested_func->merge(place, rhs, thread_pool, arena);
}
void mergeBatch(
size_t row_begin,
size_t row_end,

View File

@ -59,16 +59,31 @@ UInt64 BackupEntryFromImmutableFile::getSize() const
UInt128 BackupEntryFromImmutableFile::getChecksum() const
{
std::lock_guard lock{size_and_checksum_mutex};
if (!checksum_adjusted)
{
if (!checksum)
checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
else if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum_adjusted = true;
std::lock_guard lock{size_and_checksum_mutex};
if (checksum_adjusted)
return *checksum;
if (checksum)
{
if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum_adjusted = true;
return *checksum;
}
}
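/// Calculate the checksum without holding the mutex (it reads the whole file), then publish the result under the lock below.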
auto calculated_checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
{
std::lock_guard lock{size_and_checksum_mutex};
if (!checksum_adjusted)
{
checksum = calculated_checksum;
checksum_adjusted = true;
}
return *checksum;
}
return *checksum;
}
std::optional<UInt128> BackupEntryFromImmutableFile::getPartialChecksum(size_t prefix_length) const

View File

@ -44,7 +44,7 @@ private:
const DataSourceDescription data_source_description;
const bool copy_encrypted;
mutable std::optional<UInt64> file_size;
mutable std::optional<UInt64> checksum;
mutable std::optional<UInt128> checksum;
mutable bool file_size_adjusted = false;
mutable bool checksum_adjusted = false;
mutable std::mutex size_and_checksum_mutex;

View File

@ -8,15 +8,32 @@ namespace DB
template <typename Base>
UInt128 BackupEntryWithChecksumCalculation<Base>::getChecksum() const
{
std::lock_guard lock{checksum_calculation_mutex};
if (!calculated_checksum)
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(this->getSize()));
HashingReadBuffer hashing_read_buffer(*read_buffer);
hashing_read_buffer.ignoreAll();
calculated_checksum = hashing_read_buffer.getHash();
std::lock_guard lock{checksum_calculation_mutex};
if (calculated_checksum)
return *calculated_checksum;
}
size_t size = this->getSize();
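/// Re-check under the lock: another thread may have calculated the checksum while getSize() was running.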
{
std::lock_guard lock{checksum_calculation_mutex};
if (!calculated_checksum)
{
if (size == 0)
{
calculated_checksum = 0;
}
else
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(size));
HashingReadBuffer hashing_read_buffer(*read_buffer);
hashing_read_buffer.ignoreAll();
calculated_checksum = hashing_read_buffer.getHash();
}
}
return *calculated_checksum;
}
return *calculated_checksum;
}
template <typename Base>

View File

@ -0,0 +1,350 @@
#include <gtest/gtest.h>
#include <Backups/BackupEntryFromAppendOnlyFile.h>
#include <Backups/BackupEntryFromImmutableFile.h>
#include <Backups/BackupEntryFromSmallFile.h>
#include <Disks/IDisk.h>
#include <Disks/DiskLocal.h>
#include <Disks/DiskEncrypted.h>
#include <IO/FileEncryptionCommon.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/TemporaryFile.h>
using namespace DB;
class BackupEntriesTest : public ::testing::Test
{
protected:
void SetUp() override
{
/// Make local disk.
temp_dir = std::make_unique<Poco::TemporaryFile>();
temp_dir->createDirectories();
local_disk = std::make_shared<DiskLocal>("local_disk", temp_dir->path() + "/", 0);
/// Make encrypted disk.
auto settings = std::make_unique<DiskEncryptedSettings>();
settings->wrapped_disk = local_disk;
settings->current_algorithm = FileEncryption::Algorithm::AES_128_CTR;
settings->keys[0] = "1234567890123456";
settings->current_key_id = 0;
settings->disk_path = "encrypted/";
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings), true);
}
void TearDown() override
{
encrypted_disk.reset();
local_disk.reset();
}
static void writeFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
writeString(std::string_view{"Some text"}, *buf);
buf->finalize();
}
static void writeEmptyFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
buf->finalize();
}
static void appendFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append, {});
writeString(std::string_view{"Appended"}, *buf);
buf->finalize();
}
static String getChecksum(const BackupEntryPtr & backup_entry)
{
return getHexUIntUppercase(backup_entry->getChecksum());
}
static const constexpr std::string_view NO_CHECKSUM = "no checksum";
static String getPartialChecksum(const BackupEntryPtr & backup_entry, size_t prefix_length)
{
auto partial_checksum = backup_entry->getPartialChecksum(prefix_length);
if (!partial_checksum)
return String{NO_CHECKSUM};
return getHexUIntUppercase(*partial_checksum);
}
static String readAll(const BackupEntryPtr & backup_entry)
{
auto in = backup_entry->getReadBuffer({});
String str;
readStringUntilEOF(str, *in);
return str;
}
std::unique_ptr<Poco::TemporaryFile> temp_dir;
std::shared_ptr<DiskLocal> local_disk;
std::shared_ptr<DiskEncrypted> encrypted_disk;
};
static const constexpr std::string_view ZERO_CHECKSUM = "00000000000000000000000000000000";
static const constexpr std::string_view SOME_TEXT_CHECKSUM = "28B5529750AC210952FFD366774363ED";
static const constexpr std::string_view S_CHECKSUM = "C27395C39AFB5557BFE47661CC9EB86C";
static const constexpr std::string_view SOME_TEX_CHECKSUM = "D00D9BE8D87919A165F14EDD31088A0E";
static const constexpr std::string_view SOME_TEXT_APPENDED_CHECKSUM = "5A1F10F638DC7A226231F3FD927D1726";
static const constexpr std::string_view PRECALCULATED_CHECKSUM = "1122334455667788AABBCCDDAABBCCDD";
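/// The same checksum as the string above, assembled from its high and low 64-bit halves.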
static const constexpr UInt128 PRECALCULATED_CHECKSUM_UINT128 = (UInt128(0x1122334455667788) << 64) | 0xAABBCCDDAABBCCDD;
static const size_t PRECALCULATED_SIZE = 123;
TEST_F(BackupEntriesTest, BackupEntryFromImmutableFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
writeEmptyFile(local_disk, "empty.txt");
auto empty_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "empty.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE - 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromAppendOnlyFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
appendFile(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
auto appended_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(appended_entry->getSize(), 17);
EXPECT_EQ(getChecksum(appended_entry), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 22), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1000), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(readAll(appended_entry), "Some textAppended");
writeEmptyFile(local_disk, "empty_appended.txt");
auto empty_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
appendFile(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
}
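/// Requesting a partial checksum before the full one must not interfere with the full checksum calculated afterwards.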
TEST_F(BackupEntriesTest, PartialChecksumBeforeFullChecksum)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromSmallFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromSmallFile>(local_disk, "a.txt");
local_disk->removeFile("a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, DecryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
std::pair<BackupEntryPtr, bool /* partial_checksum_allowed */> test_cases[]
= {{std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt"), false},
{std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt"), true},
{std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt"), true}};
for (const auto & [entry, partial_checksum_allowed] : test_cases)
{
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), partial_checksum_allowed ? S_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), partial_checksum_allowed ? SOME_TEX_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt")};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
}
TEST_F(BackupEntriesTest, EncryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true)};
auto encrypted_checksum = getChecksum(entries[0]);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
auto partial_checksum = getPartialChecksum(entries[1], 9);
EXPECT_NE(partial_checksum, NO_CHECKSUM);
EXPECT_NE(partial_checksum, ZERO_CHECKSUM);
EXPECT_NE(partial_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(partial_checksum, encrypted_checksum);
auto encrypted_data = readAll(entries[0]);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 9 + FileEncryption::Header::kSize);
EXPECT_EQ(getChecksum(entry), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
auto encrypted_checksum_9 = getPartialChecksum(entry, 9);
EXPECT_TRUE(encrypted_checksum_9 == NO_CHECKSUM || encrypted_checksum_9 == partial_checksum);
EXPECT_EQ(getPartialChecksum(entry, 9 + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 1000), encrypted_checksum);
EXPECT_EQ(readAll(entry), encrypted_data);
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true)};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE + FileEncryption::Header::kSize);
auto encrypted_checksum = getChecksum(precalculated_entry);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(encrypted_checksum, PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), encrypted_checksum);
auto encrypted_data = readAll(precalculated_entry);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
}
}

View File

@ -121,7 +121,7 @@ ConnectionEstablisherAsync::ConnectionEstablisherAsync(
epoll.add(timeout_descriptor.getDescriptor());
}
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, ResumeCallback)
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, SuspendCallback)
{
connection_establisher_async.reset();
connection_establisher_async.connection_establisher.setAsyncCallback(async_callback);

View File

@ -91,7 +91,7 @@ private:
ConnectionEstablisherAsync & connection_establisher_async;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
void cancelAfter() override;

View File

@ -57,7 +57,7 @@ bool PacketReceiver::checkTimeout()
return true;
}
void PacketReceiver::Task::run(AsyncCallback async_callback, ResumeCallback suspend_callback)
void PacketReceiver::Task::run(AsyncCallback async_callback, SuspendCallback suspend_callback)
{
while (true)
{

View File

@ -57,7 +57,7 @@ private:
PacketReceiver & receiver;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
/// When epoll file descriptor is ready, check if it's an expired timeout.

View File

@ -433,7 +433,7 @@ const String & AsyncLoader::getPoolName(size_t pool) const
return pools[pool].name; // NOTE: lock is not needed because `name` is const and `pools` are immutable
}
ssize_t AsyncLoader::getPoolPriority(size_t pool) const
Priority AsyncLoader::getPoolPriority(size_t pool) const
{
return pools[pool].priority; // NOTE: lock is not needed because `priority` is const and `pools` are immutable
}
@ -576,7 +576,7 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::un
{
Pool & old_pool = pools[job->pool_id];
Pool & new_pool = pools[new_pool_id];
if (old_pool.priority >= new_pool.priority)
if (old_pool.priority <= new_pool.priority)
return; // Never lower priority or change pool leaving the same priority
// Update priority and push job forward through ready queue if needed
@ -590,7 +590,7 @@ void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::un
spawn(new_pool, lock);
}
// Set user-facing pool and priority (may affect executing jobs)
// Set user-facing pool (may affect executing jobs)
job->pool_id.store(new_pool_id);
// Recurse into dependencies
@ -621,7 +621,7 @@ bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
return is_running
&& !pool.ready_queue.empty()
&& pool.workers < pool.max_threads
&& (!current_priority || *current_priority <= pool.priority);
&& (!current_priority || *current_priority >= pool.priority);
}
bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
@ -629,17 +629,17 @@ bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
return is_running
&& !pool.ready_queue.empty()
&& pool.workers <= pool.max_threads
&& (!current_priority || *current_priority <= pool.priority);
&& (!current_priority || *current_priority >= pool.priority);
}
void AsyncLoader::updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> & lock)
{
// Find current priority.
// NOTE: We assume low number of pools, so O(N) scans are fine.
std::optional<ssize_t> priority;
std::optional<Priority> priority;
for (Pool & pool : pools)
{
if (pool.isActive() && (!priority || *priority < pool.priority))
if (pool.isActive() && (!priority || *priority > pool.priority))
priority = pool.priority;
}
current_priority = priority;
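/// For reference, a hedged sketch of the `Priority` type from Common/Priority.h (the header is
/// not shown in this diff): an ordered wrapper around a signed value where a LOWER value means
/// a HIGHER priority, which is why the pool comparisons above flipped direction when `ssize_t`
/// was replaced by `Priority`. Assumed shape:
///
///     struct Priority
///     {
///         Int64 value = 0;                              // lower value = higher priority
///         auto operator<=>(const Priority &) const = default;
///     };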

View File

@ -11,6 +11,7 @@
#include <boost/noncopyable.hpp>
#include <base/types.h>
#include <Common/CurrentMetrics.h>
#include <Common/Priority.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool_fwd.h>
@ -268,10 +269,10 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks job dependencies and priorities.
// Basic usage example:
// // Start async_loader with two thread pools (0=bg, 1=fg):
// // Start async_loader with two thread pools (0=fg, 1=bg):
// AsyncLoader async_loader({
// {"BgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 1, .priority = 0}
// {"FgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 2, .priority = 1}
// {"FgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 2, .priority{0}}
// {"BgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 1, .priority{1}}
// });
//
// // Create and schedule a task consisting of three jobs. Job1 has no dependencies and is run first.
@ -279,19 +280,19 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// auto job_func = [&] (const LoadJobPtr & self) {
// LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, async_loader->getPoolName(self->pool()));
// };
// auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 0, job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 0, job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", /* pool_id = */ 0, job_func);
// auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 1, job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 1, job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", /* pool_id = */ 1, job_func);
// auto task = makeLoadTask(async_loader, { job1, job2, job3 });
// task.schedule();
//
// // Another thread may prioritize a job by changing its pool and wait for it:
// async_loader->prioritize(job3, /* pool_id = */ 1); // higher priority jobs are run first, default priority is zero.
// job3->wait(); // blocks until job completion or cancellation and rethrow an exception (if any)
// async_loader->prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
// job3->wait(); // Blocks until job completion or cancellation and rethrow an exception (if any)
//
// Every job has a pool associated with it. AsyncLoader starts every job in its thread pool.
// Each pool has a constant priority and a mutable maximum number of threads.
// Higher priority (greater `pool.priority` value) jobs are run first.
// Higher priority (lower `pool.priority` value) jobs are run first.
// No job with lower priority is started while there is at least one higher priority job ready or running.
//
// Job priority can be elevated (but cannot be lowered)
@ -301,7 +302,8 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
// this also leads to a priority inheritance for all the dependencies.
// Value stored in load job `pool_id` field is atomic and can be changed even during job execution.
// Job is, of course, not moved from its initial thread pool, but it should use `self->pool()` for
// all new jobs it create to avoid priority inversion.
// all new jobs it creates to avoid priority inversion. To obtain the pool in which the job is being executed,
// call `self->execution_pool()` instead.
//
// === IMPLEMENTATION DETAILS ===
// All possible states and statuses of a job:
@ -335,7 +337,7 @@ private:
struct Pool
{
const String name;
const ssize_t priority;
const Priority priority;
std::unique_ptr<ThreadPool> thread_pool; // NOTE: we avoid using a `ThreadPool` queue to be able to move jobs between pools.
std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
size_t max_threads; // Max number of workers to be spawned
@ -367,7 +369,7 @@ public:
Metric metric_threads;
Metric metric_active_threads;
size_t max_threads;
ssize_t priority;
Priority priority;
};
AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_);
@ -412,7 +414,7 @@ public:
size_t getMaxThreads(size_t pool) const;
const String & getPoolName(size_t pool) const;
ssize_t getPoolPriority(size_t pool) const;
Priority getPoolPriority(size_t pool) const;
size_t getScheduledJobCount() const;
@ -451,7 +453,7 @@ private:
mutable std::mutex mutex; // Guards all the fields below.
bool is_running = true;
std::optional<ssize_t> current_priority; // highest priority among active pools
std::optional<Priority> current_priority; // highest priority among active pools
UInt64 last_ready_seqno = 0; // Increasing counter for ready queue keys.
std::unordered_map<LoadJobPtr, Info> scheduled_jobs; // Full set of scheduled pending jobs along with scheduling info.
std::vector<Pool> pools; // Thread pools for job execution and ready queues

View File

@ -3,18 +3,11 @@
namespace DB
{
thread_local FiberInfo current_fiber_info;
AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr<AsyncTask> task_) : task(std::move(task_))
{
createFiber();
}
FiberInfo AsyncTaskExecutor::getCurrentFiberInfo()
{
return current_fiber_info;
}
void AsyncTaskExecutor::resume()
{
if (routine_is_finished)
@ -38,10 +31,7 @@ void AsyncTaskExecutor::resume()
void AsyncTaskExecutor::resumeUnlocked()
{
auto parent_fiber_info = current_fiber_info;
current_fiber_info = FiberInfo{&fiber, &parent_fiber_info};
fiber = std::move(fiber).resume();
current_fiber_info = parent_fiber_info;
fiber.resume();
}
void AsyncTaskExecutor::cancel()
@ -69,30 +59,19 @@ struct AsyncTaskExecutor::Routine
struct AsyncCallback
{
AsyncTaskExecutor & executor;
Fiber & fiber;
SuspendCallback suspend_callback;
void operator()(int fd, Poco::Timespan timeout, AsyncEventTimeoutType type, const std::string & desc, uint32_t events)
{
executor.processAsyncEvent(fd, timeout, type, desc, events);
fiber = std::move(fiber).resume();
suspend_callback();
executor.clearAsyncEvent();
}
};
struct ResumeCallback
void operator()(SuspendCallback suspend_callback)
{
Fiber & fiber;
void operator()()
{
fiber = std::move(fiber).resume();
}
};
Fiber operator()(Fiber && sink)
{
auto async_callback = AsyncCallback{executor, sink};
auto suspend_callback = ResumeCallback{sink};
auto async_callback = AsyncCallback{executor, suspend_callback};
try
{
executor.task->run(async_callback, suspend_callback);
@ -110,18 +89,17 @@ struct AsyncTaskExecutor::Routine
}
executor.routine_is_finished = true;
return std::move(sink);
}
};
void AsyncTaskExecutor::createFiber()
{
fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
fiber = Fiber(fiber_stack, Routine{*this});
}
void AsyncTaskExecutor::destroyFiber()
{
boost::context::fiber to_destroy = std::move(fiber);
Fiber to_destroy = std::move(fiber);
}
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description)

View File

@ -22,7 +22,7 @@ enum class AsyncEventTimeoutType
};
using AsyncCallback = std::function<void(int, Poco::Timespan, AsyncEventTimeoutType, const std::string &, uint32_t)>;
using ResumeCallback = std::function<void()>;
using SuspendCallback = std::function<void()>;
struct FiberInfo
{
@ -38,7 +38,7 @@ struct FiberInfo
struct AsyncTask
{
public:
virtual void run(AsyncCallback async_callback, ResumeCallback suspend_callback) = 0;
virtual void run(AsyncCallback async_callback, SuspendCallback suspend_callback) = 0;
virtual ~AsyncTask() = default;
};
@ -80,7 +80,6 @@ public:
};
#endif
static FiberInfo getCurrentFiberInfo();
protected:
/// Method that is called in resume() before actual fiber resuming.
/// If it returns false, resume() will return immediately without actual fiber resuming.
@ -124,48 +123,6 @@ private:
std::unique_ptr<AsyncTask> task;
};
/// Simple implementation for fiber local variable.
template <typename T>
struct FiberLocal
{
public:
FiberLocal()
{
/// Initialize main instance for this thread. Instances for fibers will inherit it,
/// (it's needed because main instance could be changed before creating fibers
/// and changes should be visible in fibers).
data[nullptr] = T();
}
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
T & get()
{
return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo());
}
T & getInstanceForFiber(FiberInfo info)
{
auto it = data.find(info.fiber);
/// If it's the first request, we need to initialize instance for the fiber
/// using instance from parent fiber or main thread that created fiber.
if (it == data.end())
it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first;
return it->second;
}
std::unordered_map<const Fiber *, T> data;
};
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description);
}

View File

@ -3,5 +3,147 @@
/// BOOST_USE_ASAN, BOOST_USE_TSAN and BOOST_USE_UCONTEXT should be correctly defined for sanitizers.
#include <base/defines.h>
#include <boost/context/fiber.hpp>
#include <map>
/// Class wrapper for boost::context::fiber.
/// It tracks the currently executing fiber for the thread and
/// supports storing fiber-specific data
/// that will be destroyed in the Fiber destructor.
class Fiber
{
private:
using Impl = boost::context::fiber;
using FiberPtr = Fiber *;
template <typename T> friend class FiberLocal;
public:
template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn)))
{
}
Fiber() = default;
Fiber(Fiber && other) = default;
Fiber & operator=(Fiber && other) = default;
Fiber(const Fiber &) = delete;
Fiber & operator =(const Fiber &) = delete;
explicit operator bool() const
{
return impl.operator bool();
}
void resume()
{
/// Update information about current executing fiber.
FiberPtr & current_fiber = getCurrentFiber();
FiberPtr parent_fiber = current_fiber;
current_fiber = this;
impl = std::move(impl).resume();
/// Restore parent fiber.
current_fiber = parent_fiber;
}
private:
template <typename Fn>
struct RoutineImpl
{
struct SuspendCallback
{
Impl & impl;
void operator()()
{
impl = std::move(impl).resume();
}
};
explicit RoutineImpl(Fn && fn_) : fn(std::move(fn_))
{
}
Impl operator()(Impl && sink)
{
SuspendCallback suspend_callback{sink};
fn(suspend_callback);
return std::move(sink);
}
Fn fn;
};
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
/// Special wrapper to store data in a unique_ptr.
struct DataWrapper
{
virtual ~DataWrapper() = default;
};
using DataPtr = std::unique_ptr<DataWrapper>;
/// Get reference to fiber-specific data by key
/// (the pointer to the structure that uses this data).
DataPtr & getLocalData(void * key)
{
return local_data[key];
}
Impl && release()
{
return std::move(impl);
}
Impl impl;
std::map<void *, DataPtr> local_data;
};
/// Implementation for fiber local variable.
/// If we are in a fiber, it returns fiber-local data;
/// otherwise it returns its single field.
/// Fiber local data is destroyed in the Fiber destructor.
/// Implementation is similar to boost::fiber::fiber_specific_ptr
/// (we cannot use it because we don't use the boost::fiber API).
template <typename T>
class FiberLocal
{
public:
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
struct DataWrapperImpl : public Fiber::DataWrapper
{
T impl;
};
T & get()
{
Fiber * current_fiber = Fiber::getCurrentFiber();
if (!current_fiber)
return main_instance;
Fiber::DataPtr & ptr = current_fiber->getLocalData(this);
/// Initialize instance on first request.
if (!ptr)
ptr = std::make_unique<DataWrapperImpl>();
return dynamic_cast<DataWrapperImpl *>(ptr.get())->impl;
}
T main_instance;
};
using Fiber = boost::context::fiber;
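/// A hedged usage sketch for the classes above (illustrative, not part of this diff): each fiber
/// sees its own copy of a FiberLocal variable, while code running outside any fiber sees the
/// main instance.
///
///     static FiberLocal<int> counter;
///
///     void example()
///     {
///         Fiber fiber(boost::context::protected_fixedsize_stack(), [](auto suspend)
///         {
///             *counter = 42;  // stored in this fiber's local_data
///             suspend();      // return control to the caller
///         });
///         fiber.resume();     // runs until suspend(); *counter is still 0 on this thread
///         fiber.resume();     // finishes the fiber; its local data dies with it
///     }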

View File

@ -15,9 +15,8 @@ namespace DB
namespace OpenTelemetry
{
///// This code can be executed inside several fibers in one thread,
///// we should use fiber local tracing context.
thread_local FiberLocal<TracingContextOnThread> current_fiber_trace_context;
/// This code can be executed inside fibers, we should use fiber local tracing context.
thread_local FiberLocal<TracingContextOnThread> current_trace_context;
bool Span::addAttribute(std::string_view name, UInt64 value) noexcept
{
@ -109,7 +108,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc
SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
{
if (!current_fiber_trace_context->isTraceEnabled())
if (!current_trace_context->isTraceEnabled())
{
return;
}
@ -117,8 +116,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
/// Use try-catch to make sure the ctor is exception safe.
try
{
this->trace_id = current_fiber_trace_context->trace_id;
this->parent_span_id = current_fiber_trace_context->span_id;
this->trace_id = current_trace_context->trace_id;
this->parent_span_id = current_trace_context->span_id;
this->span_id = thread_local_rng(); // create a new id for this span
this->operation_name = _operation_name;
this->kind = _kind;
@ -137,7 +136,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
}
/// Set current span as parent of other spans created later on this thread.
current_fiber_trace_context->span_id = this->span_id;
current_trace_context->span_id = this->span_id;
}
void SpanHolder::finish() noexcept
@ -146,12 +145,12 @@ void SpanHolder::finish() noexcept
return;
// First of all, restore old value of current span.
assert(current_fiber_trace_context->span_id == span_id);
current_fiber_trace_context->span_id = parent_span_id;
assert(current_trace_context->span_id == span_id);
current_trace_context->span_id = parent_span_id;
try
{
auto log = current_fiber_trace_context->span_log.lock();
auto log = current_trace_context->span_log.lock();
/// The log might be disabled, check it before use
if (log)
@ -274,7 +273,7 @@ void TracingContext::serialize(WriteBuffer & buf) const
const TracingContextOnThread & CurrentContext()
{
return *current_fiber_trace_context;
return *current_trace_context;
}
void TracingContextOnThread::reset() noexcept
@ -296,7 +295,7 @@ TracingContextHolder::TracingContextHolder(
/// If any exception is raised during the construction, the tracing is not enabled on current thread.
try
{
if (current_fiber_trace_context->isTraceEnabled())
if (current_trace_context->isTraceEnabled())
{
///
/// This is not the normal case,
@ -309,15 +308,15 @@ TracingContextHolder::TracingContextHolder(
/// So this branch ensures this class can be instantiated multiple times on one same thread safely.
///
this->is_context_owner = false;
this->root_span.trace_id = current_fiber_trace_context->trace_id;
this->root_span.parent_span_id = current_fiber_trace_context->span_id;
this->root_span.trace_id = current_trace_context->trace_id;
this->root_span.parent_span_id = current_trace_context->span_id;
this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name;
this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
/// Set the root span as parent of other spans created on current thread
current_fiber_trace_context->span_id = this->root_span.span_id;
current_trace_context->span_id = this->root_span.span_id;
return;
}
@ -361,10 +360,10 @@ TracingContextHolder::TracingContextHolder(
}
/// Set up trace context on current thread only when the root span is successfully initialized.
*current_fiber_trace_context = _parent_trace_context;
current_fiber_trace_context->span_id = this->root_span.span_id;
current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_fiber_trace_context->span_log = _span_log;
*current_trace_context = _parent_trace_context;
current_trace_context->span_id = this->root_span.span_id;
current_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_trace_context->span_log = _span_log;
}
TracingContextHolder::~TracingContextHolder()
@ -376,7 +375,7 @@ TracingContextHolder::~TracingContextHolder()
try
{
auto shared_span_log = current_fiber_trace_context->span_log.lock();
auto shared_span_log = current_trace_context->span_log.lock();
if (shared_span_log)
{
try
@ -407,11 +406,11 @@ TracingContextHolder::~TracingContextHolder()
if (this->is_context_owner)
{
/// Clear the context on current thread
current_fiber_trace_context->reset();
current_trace_context->reset();
}
else
{
current_fiber_trace_context->span_id = this->root_span.parent_span_id;
current_trace_context->span_id = this->root_span.parent_span_id;
}
}
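For context, a minimal sketch of the span nesting that the tracing code above maintains (illustrative only; it assumes SpanHolder's SpanKind parameter has a default, and the names `outer`/`inner` are hypothetical):
{
    DB::OpenTelemetry::SpanHolder outer("outer operation");
    {
        /// Picks up `outer` as its parent via the thread-local trace context.
        DB::OpenTelemetry::SpanHolder inner("inner operation");
    } /// inner finishes: current_trace_context->span_id is restored to outer's span_id
} /// outer finishes: the previous parent span id is restored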

View File

@ -8,6 +8,9 @@
M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \
M(SelectQuery, "Same as Query, but only for SELECT queries.") \
M(InsertQuery, "Same as Query, but only for INSERT queries.") \
M(QueriesWithSubqueries, "Count queries with all subqueries") \
M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \
M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \
M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \
M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \
M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.") \
@ -366,7 +369,7 @@ The server successfully detected this situation and will download merged part fr
M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \
M(WriteBufferFromS3Bytes, "Bytes written to S3.") \
M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.") \
\
M(WriteBufferFromS3WaitInflightLimitMicroseconds, "Time spent waiting while some of the current requests finish when their number has reached the limit defined by s3_max_inflight_parts_for_one_file.") \
M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \
\
M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)") \

View File

@ -92,7 +92,7 @@ public:
String getName() const override { return LogElement::name(); }
static const char * getDefaultOrderBy() { return "(event_date, event_time)"; }
static const char * getDefaultOrderBy() { return "event_date, event_time"; }
protected:
Poco::Logger * log;

View File

@ -32,7 +32,7 @@ namespace DB::ErrorCodes
struct Initializer {
size_t max_threads = 1;
ssize_t priority = 0;
Priority priority;
};
struct AsyncLoaderTest
@ -144,11 +144,11 @@ struct AsyncLoaderTest
TEST(AsyncLoader, Smoke)
{
AsyncLoaderTest t({
{.max_threads = 2, .priority = 0},
{.max_threads = 2, .priority = -1},
{.max_threads = 2, .priority = Priority{0}},
{.max_threads = 2, .priority = Priority{1}},
});
static constexpr ssize_t low_priority_pool = 1;
static constexpr size_t low_priority_pool = 1;
std::atomic<size_t> jobs_done{0};
std::atomic<size_t> low_priority_jobs_done{0};
@ -419,6 +419,8 @@ TEST(AsyncLoader, CancelExecutingTask)
}
}
// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` function
// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482
TEST(AsyncLoader, DISABLED_JobFailure)
{
AsyncLoaderTest t;
@ -595,16 +597,16 @@ TEST(AsyncLoader, TestOverload)
TEST(AsyncLoader, StaticPriorities)
{
AsyncLoaderTest t({
{.max_threads = 1, .priority = 0},
{.max_threads = 1, .priority = 1},
{.max_threads = 1, .priority = 2},
{.max_threads = 1, .priority = 3},
{.max_threads = 1, .priority = 4},
{.max_threads = 1, .priority = 5},
{.max_threads = 1, .priority = 6},
{.max_threads = 1, .priority = 7},
{.max_threads = 1, .priority = 8},
{.max_threads = 1, .priority = 9},
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority{-9}},
});
std::string schedule;
@ -614,6 +616,15 @@ TEST(AsyncLoader, StaticPriorities)
schedule += fmt::format("{}{}", self->name, self->pool());
};
// Job DAG with priorities. After priority inheritance from H9, jobs D9 and E9 can be
// executed in undefined order (Tested further in DynamicPriorities)
// A0(9) -+-> B3
// |
// `-> C4
// |
// `-> D1(9) -.
// | +-> F0(9) --> G0(9) --> H9
// `-> E2(9) -'
std::vector<LoadJobPtr> jobs;
jobs.push_back(makeLoadJob({}, 0, "A", job_func)); // 0
jobs.push_back(makeLoadJob({ jobs[0] }, 3, "B", job_func)); // 1
@ -627,16 +638,15 @@ TEST(AsyncLoader, StaticPriorities)
t.loader.start();
t.loader.wait();
ASSERT_EQ(schedule, "A9E9D9F9G9H9C4B3");
ASSERT_TRUE(schedule == "A9E9D9F9G9H9C4B3" || schedule == "A9D9E9F9G9H9C4B3");
}
TEST(AsyncLoader, SimplePrioritization)
{
AsyncLoaderTest t({
{.max_threads = 1, .priority = 0},
{.max_threads = 1, .priority = 1},
{.max_threads = 1, .priority = 2},
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
});
t.loader.start();
@ -674,16 +684,16 @@ TEST(AsyncLoader, SimplePrioritization)
TEST(AsyncLoader, DynamicPriorities)
{
AsyncLoaderTest t({
{.max_threads = 1, .priority = 0},
{.max_threads = 1, .priority = 1},
{.max_threads = 1, .priority = 2},
{.max_threads = 1, .priority = 3},
{.max_threads = 1, .priority = 4},
{.max_threads = 1, .priority = 5},
{.max_threads = 1, .priority = 6},
{.max_threads = 1, .priority = 7},
{.max_threads = 1, .priority = 8},
{.max_threads = 1, .priority = 9},
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority{-9}},
});
for (bool prioritize : {false, true})
@ -890,8 +900,8 @@ TEST(AsyncLoader, DynamicPools)
const size_t max_threads[] { 2, 10 };
const int jobs_in_chain = 16;
AsyncLoaderTest t({
{.max_threads = max_threads[0], .priority = 0},
{.max_threads = max_threads[1], .priority = 1},
{.max_threads = max_threads[0], .priority{0}},
{.max_threads = max_threads[1], .priority{-1}},
});
t.loader.start();

View File

@ -47,7 +47,8 @@ struct Settings;
M(Bool, compress_snapshots_with_zstd_format, true, "Write compressed snapshots in ZSTD format (instead of custom LZ4)", 0) \
M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) \
M(UInt64, max_log_file_size, 50 * 1024 * 1024, "Max size of the Raft log file. If possible, each created log file will preallocate this amount of bytes on disk. Set to 0 to disable the limit", 0) \
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0)
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \
M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have a negative effect on latency for smaller requests; set to 0 to disable", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -471,17 +471,6 @@ void KeeperServer::shutdown()
namespace
{
// Serialize the request with all the necessary information for the leader
// we don't know ZXID and digest yet so we don't serialize it
nuraft::ptr<nuraft::buffer> getZooKeeperRequestMessage(const KeeperStorage::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
return write_buf.getBuffer();
}
// Serialize the request for the log entry
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session)
{
@ -489,12 +478,11 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestFor
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
DB::writeIntBinary(request_for_session.zxid, write_buf);
assert(request_for_session.digest);
DB::writeIntBinary(request_for_session.digest->version, write_buf);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
DB::writeIntBinary(request_for_session.digest->value, write_buf);
/// we fill with dummy values to eliminate an unnecessary copy later on, when we write the correct values
DB::writeIntBinary(static_cast<int64_t>(0), write_buf); /// zxid
DB::writeIntBinary(KeeperStorage::DigestVersion::NO_DIGEST, write_buf); /// digest version or NO_DIGEST flag
DB::writeIntBinary(static_cast<uint64_t>(0), write_buf); /// digest value
/// if new fields are added, update KeeperStateMachine::ZooKeeperLogSerializationVersion along with parseRequest function and PreAppendLog callback handler
return write_buf.getBuffer();
}
@ -512,9 +500,7 @@ RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForS
{
std::vector<nuraft::ptr<nuraft::buffer>> entries;
for (const auto & request_for_session : requests_for_sessions)
{
entries.push_back(getZooKeeperRequestMessage(request_for_session));
}
entries.push_back(getZooKeeperLogEntry(request_for_session));
std::lock_guard lock{server_write_mutex};
if (is_recovering)
@ -635,14 +621,50 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log);
auto next_zxid = state_machine->getNextZxid();
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
request_for_session.zxid = next_zxid;
if (!state_machine->preprocess(request_for_session))
auto entry_buf = entry->get_buf_ptr();
KeeperStateMachine::ZooKeeperLogSerializationVersion serialization_version;
auto request_for_session = state_machine->parseRequest(*entry_buf, /*final=*/false, &serialization_version);
request_for_session->zxid = next_zxid;
if (!state_machine->preprocess(*request_for_session))
return nuraft::cb_func::ReturnCode::ReturnNull;
request_for_session.digest = state_machine->getNodesDigest();
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), getZooKeeperLogEntry(request_for_session), entry->get_val_type());
request_for_session->digest = state_machine->getNodesDigest();
/// older versions of Keeper can send logs that are missing some fields
size_t bytes_missing = 0;
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
bytes_missing += sizeof(request_for_session->time);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_ZXID_DIGEST)
bytes_missing += sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (bytes_missing != 0)
{
auto new_buffer = nuraft::buffer::alloc(entry_buf->size() + bytes_missing);
memcpy(new_buffer->data_begin(), entry_buf->data_begin(), entry_buf->size());
entry_buf = std::move(new_buffer);
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), entry_buf, entry->get_val_type());
}
size_t write_buffer_header_size
= sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
write_buffer_header_size += sizeof(request_for_session->time);
auto * buffer_start = reinterpret_cast<BufferBase::Position>(entry_buf->data_begin() + entry_buf->size() - write_buffer_header_size);
WriteBuffer write_buf(buffer_start, write_buffer_header_size);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
writeIntBinary(request_for_session->time, write_buf);
writeIntBinary(request_for_session->zxid, write_buf);
writeIntBinary(request_for_session->digest->version, write_buf);
if (request_for_session->digest->version != KeeperStorage::NO_DIGEST)
writeIntBinary(request_for_session->digest->value, write_buf);
break;
}
case nuraft::cb_func::AppendLogFailed:
@ -654,8 +676,8 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log);
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
state_machine->rollbackRequest(request_for_session, true);
auto request_for_session = state_machine->parseRequest(entry_buf, true);
state_machine->rollbackRequest(*request_for_session, true);
break;
}
default:

View File

@ -1,16 +1,16 @@
#include <cerrno>
#include <base/errnoToString.h>
#include <base/defines.h>
#include <future>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStateMachine.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <base/defines.h>
#include <base/errnoToString.h>
#include <sys/mman.h>
#include <Common/ProfileEvents.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/ProfileEvents.h>
#include <Common/logger_useful.h>
#include "Coordination/KeeperStorage.h"
@ -60,6 +60,7 @@ KeeperStateMachine::KeeperStateMachine(
coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(coordination_settings_->min_request_size_for_cache)
, last_committed_idx(0)
, log(&Poco::Logger::get("KeeperStateMachine"))
, superdigest(superdigest_)
@ -149,19 +150,19 @@ void assertDigest(
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
auto request_for_session = parseRequest(data, /*final=*/false);
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
preprocess(request_for_session);
preprocess(*request_for_session);
return nullptr;
}
KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer & data)
std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version)
{
ReadBufferFromNuraftBuffer buffer(data);
KeeperStorage::RequestForSession request_for_session;
readIntBinary(request_for_session.session_id, buffer);
auto request_for_session = std::make_shared<KeeperStorage::RequestForSession>();
readIntBinary(request_for_session->session_id, buffer);
int32_t length;
Coordination::read(length, buffer);
@ -169,29 +170,81 @@ KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer
int32_t xid;
Coordination::read(xid, buffer);
static constexpr std::array non_cacheable_xids{
Coordination::WATCH_XID,
Coordination::PING_XID,
Coordination::AUTH_XID,
Coordination::CLOSE_XID,
};
const bool should_cache
= min_request_size_to_cache != 0 && request_for_session->session_id != -1 && data.size() >= min_request_size_to_cache
&& std::all_of(
non_cacheable_xids.begin(), non_cacheable_xids.end(), [&](const auto non_cacheable_xid) { return xid != non_cacheable_xid; });
if (should_cache)
{
std::lock_guard lock(request_cache_mutex);
if (auto xid_to_request_it = parsed_request_cache.find(request_for_session->session_id);
xid_to_request_it != parsed_request_cache.end())
{
auto & xid_to_request = xid_to_request_it->second;
if (auto request_it = xid_to_request.find(xid); request_it != xid_to_request.end())
{
if (final)
{
auto request = std::move(request_it->second);
xid_to_request.erase(request_it);
return request;
}
else
return request_it->second;
}
}
}
Coordination::OpNum opnum;
Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session.request->xid = xid;
request_for_session.request->readImpl(buffer);
request_for_session->request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session->request->xid = xid;
request_for_session->request->readImpl(buffer);
if (!buffer.eof())
readIntBinary(request_for_session.time, buffer);
else /// backward compatibility
request_for_session.time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
readIntBinary(request_for_session.zxid, buffer);
using enum ZooKeeperLogSerializationVersion;
ZooKeeperLogSerializationVersion version = INITIAL;
if (!buffer.eof())
{
request_for_session.digest.emplace();
readIntBinary(request_for_session.digest->version, buffer);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
readIntBinary(request_for_session.digest->value, buffer);
version = WITH_TIME;
readIntBinary(request_for_session->time, buffer);
}
else
request_for_session->time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
{
version = WITH_ZXID_DIGEST;
readIntBinary(request_for_session->zxid, buffer);
chassert(!buffer.eof());
request_for_session->digest.emplace();
readIntBinary(request_for_session->digest->version, buffer);
if (request_for_session->digest->version != KeeperStorage::DigestVersion::NO_DIGEST || !buffer.eof())
readIntBinary(request_for_session->digest->value, buffer);
}
if (serialization_version)
*serialization_version = version;
if (should_cache && !final)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache[request_for_session->session_id].emplace(xid, request_for_session);
}
return request_for_session;
@ -231,15 +284,15 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
auto request_for_session = parseRequest(data, true);
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
/// Special processing of session_id request
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
if (request_for_session->request->getOpNum() == Coordination::OpNum::SessionID)
{
const Coordination::ZooKeeperSessionIDRequest & session_id_request
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request);
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session->request);
int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
@ -261,25 +314,34 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
}
else
{
if (request_for_session->request->getOpNum() == Coordination::OpNum::Close)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache.erase(request_for_session->session_id);
}
std::lock_guard lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest(
request_for_session.request, request_for_session.session_id, request_for_session.zxid);
KeeperStorage::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
if (!responses_queue.push(response_for_session))
{
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response_for_session.session_id);
LOG_WARNING(
log,
"Failed to push response with session id {} to the queue, probably because of shutdown",
response_for_session.session_id);
}
if (keeper_context->digest_enabled && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true);
if (keeper_context->digest_enabled && request_for_session->digest)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true);
}
ProfileEvents::increment(ProfileEvents::KeeperCommits);
last_committed_idx = log_idx;
if (commit_callback)
commit_callback(request_for_session);
commit_callback(*request_for_session);
return nullptr;
}
@ -330,14 +392,14 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
auto request_for_session = parseRequest(data, true);
// If we received a log from an older node, use the log_idx as the zxid
// log_idx will always be larger than or equal to the zxid, so we can safely do this
// (log_idx is increased for all logs, while zxid is only increased for requests)
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
rollbackRequest(request_for_session, false);
rollbackRequest(*request_for_session, false);
}
void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing)
@ -541,11 +603,7 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
/// Pure local request, just process it with storage
std::lock_guard lock(storage_and_responses_lock);
auto responses = storage->processRequest(
request_for_session.request,
request_for_session.session_id,
std::nullopt,
true /*check_acl*/,
true /*is_local*/);
request_for_session.request, request_for_session.session_id, std::nullopt, true /*check_acl*/, true /*is_local*/);
for (const auto & response : responses)
if (!responses_queue.push(response))
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response.session_id);

View File

@ -36,7 +36,22 @@ public:
/// Read state from the latest snapshot
void init();
static KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data);
enum ZooKeeperLogSerializationVersion
{
INITIAL = 0,
WITH_TIME = 1,
WITH_ZXID_DIGEST = 2,
};
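/// Layout per version (each version appends fields to the previous one),
/// as written by getZooKeeperLogEntry and read back by parseRequest:
///   INITIAL:          session_id, request
///   WITH_TIME:        INITIAL + time
///   WITH_ZXID_DIGEST: WITH_TIME + zxid, digest version [, digest value]
/// parseRequest infers the version from how much of the buffer remains after
/// each field is read, so log entries written by older replicas stay readable.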
/// The lifetime of a parsed request is:
/// [preprocess/PreAppendLog -> commit]
/// [preprocess/PreAppendLog -> rollback]
/// on events like commit and rollback we can remove the parsed request to keep memory usage to a minimum
/// the request cache is also cleaned on session close, in case something strange happened
///
/// final - whether it's the final time we will fetch the request so we can safely remove it from cache
/// serialization_version - information about which fields were parsed from the buffer so we can modify the buffer accordingly
std::shared_ptr<KeeperStorage::RequestForSession> parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version = nullptr);
bool preprocess(const KeeperStorage::RequestForSession & request_for_session);
@ -138,6 +153,13 @@ private:
/// for request.
mutable std::mutex storage_and_responses_lock;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorage::RequestForSession>>> parsed_request_cache;
uint64_t min_request_size_to_cache{0};
/// we only need to protect the access to the map itself
/// requests can be modified from anywhere without a lock because a single request
/// can be processed only in one thread at any point
std::mutex request_cache_mutex;
/// Last committed Raft log number.
std::atomic<uint64_t> last_committed_idx;

View File

@ -110,7 +110,7 @@ public:
struct RequestForSession
{
int64_t session_id;
int64_t time;
int64_t time{0};
Coordination::ZooKeeperRequestPtr request;
int64_t zxid{0};
std::optional<Digest> digest;

View File

@ -1,5 +1,4 @@
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Common/logger_useful.h>
namespace DB
{

View File

@ -78,6 +78,7 @@ class IColumn;
M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3 s3_min_upload_part_size multiplied by s3_upload_part_size_multiply_factor.", 0) \
M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of concurrently loaded parts in a multipart upload request. 0 means unlimited.", 0) \
M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \
M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \
@ -93,6 +94,7 @@ class IColumn;
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(UInt64, s3_retry_attempts, 10, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \

View File

@ -188,12 +188,12 @@ try
try
{
file->write(payload.data(), payload.size());
file->finalize();
}
catch (...)
{
/// Log current exception, because finalize() can throw a different exception.
tryLogCurrentException(__PRETTY_FUNCTION__);
file->finalize();
throw;
}
}

View File

@ -146,7 +146,8 @@ std::unique_ptr<S3::Client> getClient(
S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config);
client_configuration.retryStrategy
= std::make_shared<Aws::Client::DefaultRetryStrategy>(config.getUInt(config_prefix + ".retry_attempts", 10));
= std::make_shared<Aws::Client::DefaultRetryStrategy>(
config.getUInt64(config_prefix + ".retry_attempts", settings.request_settings.retry_attempts));
return S3::ClientFactory::instance().create(
client_configuration,

View File

@ -1230,8 +1230,11 @@ public:
/// The case when arguments are the same (tautological comparison). Return constant.
/// NOTE: Nullable types are special case.
/// (BTW, this function use default implementation for Nullable, so Nullable types cannot be here. Check just in case.)
/// NOTE: We consider NaN comparison to be implementation-specific (and in our implementation NaNs are sometimes equal, sometimes not).
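/// For example, for a Float64 column `c` that contains NaN, folding `c = c` into
/// the constant 1 would be wrong: NaN does not reliably compare equal to itself,
/// which is why the !isFloat() check below skips the shortcut for floating-point types.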
if (left_type->equals(*right_type) && !left_type->isNullable() && !isTuple(left_type) && col_left_untyped == col_right_untyped)
if (left_type->equals(*right_type) &&
!left_type->isNullable() &&
!isTuple(left_type) &&
!WhichDataType(left_type).isFloat() &&
col_left_untyped == col_right_untyped)
{
ColumnPtr result_column;

View File

@ -2,6 +2,7 @@
#include <Common/ErrorCodes.h>
#include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/ResourceRequest.h>
#include <Poco/Util/AbstractConfiguration.h>
@ -37,7 +38,7 @@ inline const Poco::Util::AbstractConfiguration & emptyConfig()
struct SchedulerNodeInfo
{
double weight = 1.0; /// Weight of this node among its siblings
Int64 priority = 0; /// Priority of this node among it's siblings (higher value means higher priority)
Priority priority; /// Priority of this node among its siblings (lower value means higher priority)
/// Arbitrary data accessed/stored by parent
union {
@ -65,7 +66,7 @@ struct SchedulerNodeInfo
void setPriority(Int64 value)
{
priority = value;
priority.value = value;
}
};
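A minimal sketch of the convention this field adopts, assuming `Priority` is the small value wrapper used elsewhere in this commit (an Int64 `value` where lower means higher priority):
struct Priority
{
    Int64 value = 0; /// lower value means higher priority
    bool operator<(const Priority & rhs) const { return value < rhs.value; }
};
/// A node with Priority{-1} is served before a node with Priority{0}.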

View File

@ -12,7 +12,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
{
if (whence == SEEK_SET)
{
if (offset >= 0 && internal_buffer.begin() + offset < internal_buffer.end())
if (offset >= 0 && internal_buffer.begin() + offset <= internal_buffer.end())
{
pos = internal_buffer.begin() + offset;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
@ -25,7 +25,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
else if (whence == SEEK_CUR)
{
Position new_pos = pos + offset;
if (new_pos >= internal_buffer.begin() && new_pos < internal_buffer.end())
if (new_pos >= internal_buffer.begin() && new_pos <= internal_buffer.end())
{
pos = new_pos;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
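The relaxed bounds permit seeking exactly to the end of the buffer, leaving an empty working range instead of failing. A hypothetical boundary check (assuming some `std::string data` and the ReadBufferFromMemory API shown above):
DB::ReadBufferFromMemory buf(data.data(), data.size());
buf.seek(static_cast<off_t>(data.size()), SEEK_SET); /// previously rejected, now positions at EOF
chassert(buf.eof());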

View File

@ -26,12 +26,12 @@ class PriorityPolicy : public ISchedulerNode
struct Item
{
ISchedulerNode * child = nullptr;
Int64 priority = 0; // higher value means higher priority
Priority priority; // lower value means higher priority
/// For max-heap by priority
bool operator<(const Item& rhs) const noexcept
{
return priority < rhs.priority;
return priority > rhs.priority; // Reversed for heap top to yield highest priority (lowest value) child first
}
};
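A self-contained illustration of why the comparator is reversed: std::make_heap builds a max-heap with respect to operator<, so with the reversed comparison the top of the heap is the child with the numerically lowest priority value, i.e. the highest priority:
#include <algorithm>
#include <cassert>
#include <vector>

struct Item
{
    int priority; // lower value means higher priority
    bool operator<(const Item & rhs) const noexcept { return priority > rhs.priority; }
};

int main()
{
    std::vector<Item> items{{2}, {0}, {1}};
    std::make_heap(items.begin(), items.end());
    assert(items.front().priority == 0); // the highest-priority (lowest value) child is on top
}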

View File

@ -22,9 +22,9 @@ TEST(IOResourcePriorityPolicy, Priorities)
ResourceTest t;
t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>");
t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>");
t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10});
t.enqueue("/B", {10, 10, 10});
@ -56,9 +56,9 @@ TEST(IOResourcePriorityPolicy, Activation)
ResourceTest t;
t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>");
t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>");
t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10, 10, 10, 10});
t.enqueue("/B", {10});

View File

@ -49,7 +49,7 @@ TEST(IOResourceStaticResourceManager, Prioritization)
{
// Lock is not required here because this is called during request execution and we have max_requests = 1
if (last_priority)
EXPECT_TRUE(priority <= *last_priority); // Should be true if every queue arrived at the same time at busy period start
EXPECT_TRUE(priority >= *last_priority); // Should be true if every queue arrived at the same time at busy period start
last_priority = priority;
};
@ -63,8 +63,8 @@ TEST(IOResourceStaticResourceManager, Prioritization)
<res1>
<node path="/"> <type>inflight_limit</type><max_requests>1</max_requests></node>
<node path="/prio"> <type>priority</type></node>
<node path="/prio/A"> <priority>-1</priority></node>
<node path="/prio/B"> <priority>1</priority></node>
<node path="/prio/A"> <priority>1</priority></node>
<node path="/prio/B"> <priority>-1</priority></node>
<node path="/prio/C"> </node>
<node path="/prio/D"> </node>
<node path="/prio/leader"></node>

View File

@ -92,8 +92,11 @@ WriteBufferFromS3::WriteBufferFromS3(
, write_settings(write_settings_)
, client_ptr(std::move(client_ptr_))
, object_metadata(std::move(object_metadata_))
, buffer_allocation_policy(ChooseBufferPolicy(request_settings_.getUploadSettings()))
, task_tracker(std::make_unique<WriteBufferFromS3::TaskTracker>(std::move(schedule_)))
, buffer_allocation_policy(ChooseBufferPolicy(upload_settings))
, task_tracker(
std::make_unique<WriteBufferFromS3::TaskTracker>(
std::move(schedule_),
upload_settings.max_inflight_parts_for_one_file))
{
LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails());
@ -109,8 +112,11 @@ void WriteBufferFromS3::nextImpl()
ErrorCodes::LOGICAL_ERROR,
"Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest");
/// Make sense to call to before adding new async task to check if there is an exception
task_tracker->waitReady();
/// It makes sense to call waitIfAny before adding a new async task to check if there is an exception.
/// The faster the exception is propagated, the less time is spent on cancellation.
/// Although `task_tracker->add()` collects task statuses and propagates their exceptions,
/// this call is necessary for the case when there is no in-flight limit and therefore `task_tracker->add()` does not wait for anything.
task_tracker->waitIfAny();
hidePartialData();
@ -134,7 +140,8 @@ void WriteBufferFromS3::preFinalize()
LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails());
task_tracker->waitReady();
/// This function should not be run again if an exception has occurred
is_prefinalized = true;
hidePartialData();
@ -166,8 +173,6 @@ void WriteBufferFromS3::preFinalize()
{
writeMultipartUpload();
}
is_prefinalized = true;
}
void WriteBufferFromS3::finalizeImpl()
@ -212,8 +217,8 @@ String WriteBufferFromS3::getLogDetails() const
multipart_upload_details = fmt::format(", upload id {}, upload has finished {}"
, multipart_upload_id, multipart_upload_finished);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), finalized, multipart_upload_details);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, prefinalized {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), is_prefinalized, finalized, multipart_upload_details);
}
void WriteBufferFromS3::tryToAbortMultipartUpload()
@ -234,7 +239,7 @@ WriteBufferFromS3::~WriteBufferFromS3()
{
LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails());
// That descructor could be call with finalized=false in case of exceptions
// That destructor could be called with finalized=false in case of exceptions
if (!finalized)
{
LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails());

View File

@ -4,12 +4,18 @@
#include <IO/WriteBufferFromS3TaskTracker.h>
namespace ProfileEvents
{
extern const Event WriteBufferFromS3WaitInflightLimitMicroseconds;
}
namespace DB
{
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_)
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_)
: is_async(bool(scheduler_))
, scheduler(scheduler_ ? std::move(scheduler_) : syncRunner())
, max_tasks_inflight(max_tasks_inflight_)
{}
WriteBufferFromS3::TaskTracker::~TaskTracker()
@ -28,36 +34,6 @@ ThreadPoolCallbackRunner<void> WriteBufferFromS3::TaskTracker::syncRunner()
};
}
void WriteBufferFromS3::TaskTracker::waitReady()
{
LOG_TEST(log, "waitReady, in queue {}", futures.size());
/// Exceptions are propagated
auto it = futures.begin();
while (it != futures.end())
{
chassert(it->valid());
if (it->wait_for(std::chrono::seconds(0)) != std::future_status::ready)
{
++it;
continue;
}
try
{
it->get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
it = futures.erase(it);
}
LOG_TEST(log, "waitReady ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::waitAll()
{
LOG_TEST(log, "waitAll, in queue {}", futures.size());
@ -65,66 +41,145 @@ void WriteBufferFromS3::TaskTracker::waitAll()
/// Exceptions are propagated
for (auto & future : futures)
{
try
{
future.get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
future.get();
}
futures.clear();
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::safeWaitAll()
{
LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size());
/// Exceptions are not propagated
for (auto & future : futures)
{
LOG_TEST(log, "safeWaitAll, wait future");
if (future.valid())
future.wait();
}
LOG_TEST(log, "safeWaitAll, get in queue {}", futures.size());
for (auto & future : futures)
{
if (future.valid())
{
try
{
/// Exceptions are not propagated
future.get();
} catch (...)
{
/// But at least they are printed
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
futures.clear();
LOG_TEST(log, "safeWaitAll ended, get in queue {}", futures.size());
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::waitIfAny()
{
LOG_TEST(log, "waitIfAny, in queue {}", futures.size());
if (futures.empty())
return;
Stopwatch watch;
{
std::lock_guard lock(mutex);
for (auto & it : finished_futures)
{
/// actually that call might block this thread until the future is finally set
/// however that won't block us for long, since the task is about to finish when its iterator appears in `finished_futures`
it->get();
/// in case of an exception in `it->get()`
/// it is not necessary to remove `it` from the list `futures`:
/// `TaskTracker` has to be destroyed after any exception occurs; for this, `safeWaitAll` is called.
/// `safeWaitAll` handles invalid futures in the list `futures`
futures.erase(it);
}
finished_futures.clear();
}
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::add(Callback && func)
{
LOG_TEST(log, "add, in queue {}", futures.size());
/// All this fuss is about 2 things. This is the most critical place of TaskTracker.
/// The first is not to fail the insertion into the list `futures`.
/// In order to face it, the element is allocated at the end of the list `futures` in advance.
/// The second is not to fail the notification of the task.
/// In order to face it, the list element, which would be inserted into the list `finished_futures`,
/// is allocated in advance as another list, `pre_allocated_finished`, with one element inside.
auto future = scheduler(std::move(func), Priority{});
auto exit_scope = scope_guard(
[&future]()
/// preallocation for the first issue
futures.emplace_back();
auto future_placeholder = std::prev(futures.end());
/// preallocation for the second issue
FinishedList pre_allocated_finished {future_placeholder};
Callback func_with_notification = [&, func=std::move(func), pre_allocated_finished=std::move(pre_allocated_finished)] () mutable
{
SCOPE_EXIT({
DENY_ALLOCATIONS_IN_SCOPE;
std::lock_guard lock(mutex);
finished_futures.splice(finished_futures.end(), pre_allocated_finished);
has_finished.notify_one();
});
func();
};
/// this move is nothrow
*future_placeholder = scheduler(std::move(func_with_notification), Priority{});
LOG_TEST(log, "add ended, in queue {}, limit {}", futures.size(), max_tasks_inflight);
waitTilInflightShrink();
}
void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()
{
if (!max_tasks_inflight)
return;
LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size());
Stopwatch watch;
/// An alternative approach is to wait until at least futures.size() - max_tasks_inflight elements are finished
/// However, the faster a finished task is collected, the faster CH checks if there is an exception
/// The faster an exception is propagated, the less time is spent on cancellation
while (futures.size() >= max_tasks_inflight)
{
std::unique_lock lock(mutex);
has_finished.wait(lock, [this] () TSA_REQUIRES(mutex) { return !finished_futures.empty(); });
for (auto & it : finished_futures)
{
future.wait();
SCOPE_EXIT({
/// According to basic exception safety, TaskTracker has to be destroyed after an exception
/// If that were guaranteed, this SCOPE_EXIT would be superfluous
/// However WriteBufferWithFinalizeCallback and WriteBufferFromFileDecorator do call finalize in the d-tor
/// TaskTracker has to cope with this until the issue with finalizing in the d-tor is addressed in #50274
futures.erase(it);
});
it->get();
}
);
futures.push_back(std::move(future));
finished_futures.clear();
}
exit_scope.release();
LOG_TEST(log, "add ended, in queue {}", futures.size());
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size());
}
bool WriteBufferFromS3::TaskTracker::isAsync() const

View File

@ -6,36 +6,61 @@
#include "WriteBufferFromS3.h"
#include <list>
namespace DB
{
/// That class is used only in WriteBufferFromS3 for now.
/// Therefore it is declared as a part of WriteBufferFromS3.
/// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool.
/// TaskTracker brings the methods waitReady, waitAll/safeWaitAll
/// TaskTracker brings the methods waitIfAny, waitAll/safeWaitAll
/// to help with coordination of the running tasks.
/// Basic exception safety is provided. If an exception occurs, the object has to be destroyed.
/// No thread safety is provided. Use this object with no concurrency.
class WriteBufferFromS3::TaskTracker
{
public:
using Callback = std::function<void()>;
explicit TaskTracker(ThreadPoolCallbackRunner<void> scheduler_);
TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_);
~TaskTracker();
static ThreadPoolCallbackRunner<void> syncRunner();
bool isAsync() const;
void waitReady();
/// waitIfAny collects statuses from already finished tasks
/// There could be no finished tasks yet, so waitIfAny does nothing useful in that case
/// the first exception is thrown if any task has failed
void waitIfAny();
/// waitAll waits for all tasks to finish and collects their statuses
void waitAll();
/// safeWaitAll does the same as waitAll but mutes the exceptions
void safeWaitAll();
void add(Callback && func);
private:
bool is_async;
/// waitTilInflightShrink waits until the number of in-flight tasks shrinks below the limit `max_tasks_inflight`.
void waitTilInflightShrink() TSA_NO_THREAD_SAFETY_ANALYSIS;
const bool is_async;
ThreadPoolCallbackRunner<void> scheduler;
std::list<std::future<void>> futures;
const size_t max_tasks_inflight;
using FutureList = std::list<std::future<void>>;
FutureList futures;
Poco::Logger * log = &Poco::Logger::get("TaskTracker");
std::mutex mutex;
std::condition_variable has_finished TSA_GUARDED_BY(mutex);
using FinishedList = std::list<FutureList::iterator>;
FinishedList finished_futures TSA_GUARDED_BY(mutex);
};
}
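A minimal standalone sketch of the allocation-free completion signal described above: the producer pre-allocates a one-element list, and the task's completion hook merely splices that node into `finished_futures` under the mutex; list::splice relinks nodes without allocating, so the notification cannot fail with bad_alloc (the payload type is illustrative):
#include <condition_variable>
#include <list>
#include <mutex>

std::mutex mutex;
std::condition_variable has_finished;
std::list<int> finished_futures; // guarded by `mutex`

void on_task_finished(std::list<int> & pre_allocated_finished)
{
    std::lock_guard lock(mutex);
    // O(1) node relink between lists: no allocation, hence nothrow here.
    finished_futures.splice(finished_futures.end(), pre_allocated_finished);
    has_finished.notify_one();
}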

View File

@ -2041,7 +2041,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
*/
if (data.hasNullKeyData())
{
has_null_key_data = Method::one_key_nullable_optimization;
has_null_key_data = true;
out_cols->key_columns[0]->insertDefault();
insertAggregatesIntoColumns(data.getNullKeyData(), out_cols->final_aggregate_columns, arena);
data.hasNullKeyData() = false;
@ -2076,6 +2076,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data));
places.clear();
out_cols.reset();
has_null_key_data = false;
}
}
});

View File

@ -45,7 +45,7 @@ public:
using SystemLog<AsynchronousInsertLogElement>::SystemLog;
/// This table is usually queried for fixed table name.
static const char * getDefaultOrderBy() { return "(database, table, event_date, event_time)"; }
static const char * getDefaultOrderBy() { return "database, table, event_date, event_time"; }
};
}

View File

@ -49,7 +49,7 @@ public:
void addValues(const AsynchronousMetricValues &);
/// This table is usually queried for fixed metric name.
static const char * getDefaultOrderBy() { return "(metric, event_date, event_time)"; }
static const char * getDefaultOrderBy() { return "metric, event_date, event_time"; }
};
}

View File

@ -125,10 +125,12 @@ ClusterDiscovery::ClusterDiscovery(
ClusterInfo(
/* name_= */ key,
/* zk_root_= */ config.getString(prefix + ".path"),
/* host_name= */ config.getString(prefix + ".my_hostname", getFQDNOrHostName()),
/* port= */ context->getTCPPort(),
/* secure= */ config.getBool(prefix + ".secure", false),
/* shard_id= */ config.getUInt(prefix + ".shard", 0),
/* observer_mode= */ ConfigHelper::getBool(config, prefix + ".observer")
/* observer_mode= */ ConfigHelper::getBool(config, prefix + ".observer"),
/* invisible= */ ConfigHelper::getBool(config, prefix + ".invisible")
)
);
}
@ -294,6 +296,12 @@ bool ClusterDiscovery::updateCluster(ClusterInfo & cluster_info)
return false;
}
if (cluster_info.current_cluster_is_invisible)
{
LOG_DEBUG(log, "cluster '{}' is invisible!", cluster_info.name);
return true;
}
if (!needUpdate(node_uuids, nodes_info))
{
LOG_DEBUG(log, "No update required for cluster '{}'", cluster_info.name);

View File

@ -3,7 +3,6 @@
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ThreadPool.h>
#include <Common/ZooKeeper/Common.h>
#include <base/getFQDNOrHostName.h>
#include <Interpreters/Cluster.h>
#include <Poco/Logger.h>
@ -78,16 +77,24 @@ private:
/// Current node may not belong to the cluster; it may be just an observer.
bool current_node_is_observer = false;
/// For internal management needs.
/// It is designed so that when deploying multiple compute groups,
/// they are mutually invisible to each other.
bool current_cluster_is_invisible = false;
explicit ClusterInfo(const String & name_,
const String & zk_root_,
const String & host_name,
UInt16 port,
bool secure,
size_t shard_id,
bool observer_mode)
bool observer_mode,
bool invisible)
: name(name_)
, zk_root(zk_root_)
, current_node(getFQDNOrHostName() + ":" + toString(port), secure, shard_id)
, current_node(host_name + ":" + toString(port), secure, shard_id)
, current_node_is_observer(observer_mode)
, current_cluster_is_invisible(invisible)
{
}
};

View File

@ -3555,9 +3555,9 @@ void Context::checkPartitionCanBeDropped(const String & database, const String &
}
InputFormatPtr Context::getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size, const std::optional<FormatSettings> & format_settings) const
InputFormatPtr Context::getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size, const std::optional<FormatSettings> & format_settings, const std::optional<size_t> max_parsing_threads) const
{
return FormatFactory::instance().getInput(name, buf, sample, shared_from_this(), max_block_size, format_settings);
return FormatFactory::instance().getInput(name, buf, sample, shared_from_this(), max_block_size, format_settings, max_parsing_threads);
}
OutputFormatPtr Context::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const

View File

@ -738,7 +738,8 @@ public:
BackupsWorker & getBackupsWorker() const;
/// I/O formats.
InputFormatPtr getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size, const std::optional<FormatSettings> & format_settings = std::nullopt) const;
InputFormatPtr getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size,
const std::optional<FormatSettings> & format_settings = std::nullopt, const std::optional<size_t> max_parsing_threads = std::nullopt) const;
OutputFormatPtr getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const;
OutputFormatPtr getOutputFormatParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const;

View File

@ -19,6 +19,8 @@
#include <Parsers/queryToString.h>
#include <Processors/Executors/PullingAsyncPipelineExecutor.h>
#include <Common/ProfileEvents.h>
#include <Common/FieldVisitorToString.h>
#include <IO/WriteBufferFromString.h>
namespace ProfileEvents
{
@ -68,17 +70,6 @@ void ExecuteScalarSubqueriesMatcher::visit(ASTPtr & ast, Data & data)
visit(*t, ast, data);
}
/// Converting to literal values might take a fair amount of overhead when the value is large, (e.g.
/// Array, BitMap, etc.), This conversion is required for constant folding, index lookup, branch
/// elimination. However, these optimizations should never be related to large values, thus we
/// blacklist them here.
static bool worthConvertingToLiteral(const Block & scalar)
{
const auto * scalar_type_name = scalar.safeGetByPosition(0).type->getFamilyName();
static const std::set<std::string_view> useless_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"};
return !useless_literal_types.contains(scalar_type_name);
}
static auto getQueryInterpreter(const ASTSubquery & subquery, ExecuteScalarSubqueriesMatcher::Data & data)
{
auto subquery_context = Context::createCopy(data.getContext());
@ -255,7 +246,9 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
const Settings & settings = data.getContext()->getSettingsRef();
// Always convert to literals when there is no query context.
if (data.only_analyze || !settings.enable_scalar_subquery_optimization || worthConvertingToLiteral(scalar)
if (data.only_analyze
|| !settings.enable_scalar_subquery_optimization
|| worthConvertingScalarToLiteral(scalar, data.max_literal_size)
|| !data.getContext()->hasQueryContext())
{
/// subquery and ast can be the same object and ast will be moved.
@ -278,7 +271,7 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
ast = std::move(func);
}
}
else
else if (!data.replace_only_to_literals)
{
auto func = makeASTFunction("__getScalar", std::make_shared<ASTLiteral>(scalar_query_hash_str));
func->alias = subquery.alias;
@ -318,4 +311,31 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTFunction & func, ASTPtr & as
Visitor(data).visit(*add_node);
}
static size_t getSizeOfSerializedLiteral(const Field & field)
{
auto field_str = applyVisitor(FieldVisitorToString(), field);
return field_str.size();
}
bool worthConvertingScalarToLiteral(const Block & scalar, std::optional<size_t> max_literal_size)
{
/// Converting to literal values might take a fair amount of overhead when the value is large, (e.g.
/// Array, BitMap, etc.), This conversion is required for constant folding, index lookup, branch
/// elimination. However, these optimizations should never be related to large values, thus we blacklist them here.
const auto * scalar_type_name = scalar.safeGetByPosition(0).type->getFamilyName();
static const std::set<std::string_view> maybe_large_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"};
if (!maybe_large_literal_types.contains(scalar_type_name))
return true;
if (!max_literal_size)
return false;
/// Size of serialized literal cannot be less than size in bytes.
if (scalar.bytes() > *max_literal_size)
return false;
return getSizeOfSerializedLiteral((*scalar.safeGetByPosition(0).column)[0]) <= *max_literal_size;
}
}

View File

@ -37,6 +37,8 @@ public:
Scalars & local_scalars;
bool only_analyze;
bool is_create_parameterized_view;
bool replace_only_to_literals;
std::optional<size_t> max_literal_size;
};
static bool needChildVisit(ASTPtr & node, const ASTPtr &);
@ -49,4 +51,6 @@ private:
using ExecuteScalarSubqueriesVisitor = ExecuteScalarSubqueriesMatcher::Visitor;
bool worthConvertingScalarToLiteral(const Block & scalar, std::optional<size_t> max_literal_size);
}

View File

@ -8,12 +8,14 @@
#include <Interpreters/Context.h>
#include <Interpreters/FunctionNameNormalizer.h>
#include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/MutationsNonDeterministicHelpers.h>
#include <Interpreters/QueryLog.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Parsers/ASTAlterQuery.h>
#include <Parsers/ASTAssignment.h>
#include <Parsers/ASTIdentifier_fwd.h>
#include <Parsers/ASTColumnDeclaration.h>
#include <Parsers/queryToString.h>
#include <Storages/AlterCommands.h>
#include <Storages/IStorage.h>
#include <Storages/LiveView/LiveViewCommands.h>
@ -67,7 +69,6 @@ BlockIO InterpreterAlterQuery::execute()
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown alter object type");
}
BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter)
{
BlockIO res;
@ -156,7 +157,8 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter)
if (mutation_commands.hasNonEmptyMutationCommands())
{
table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate();
MutationsInterpreter::Settings settings(false);
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), settings).validate();
table->mutate(mutation_commands, getContext());
}
@ -236,6 +238,7 @@ BlockIO InterpreterAlterQuery::executeToDatabase(const ASTAlterQuery & alter)
return res;
}
AccessRightsElements InterpreterAlterQuery::getRequiredAccess() const
{
AccessRightsElements required_access;

View File

@ -72,7 +72,8 @@ BlockIO InterpreterDeleteQuery::execute()
mutation_commands.emplace_back(mut_command);
table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate();
MutationsInterpreter::Settings settings(false);
MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), settings).validate();
table->mutate(mutation_commands, getContext());
return {};
}

View File

@ -114,6 +114,7 @@
namespace ProfileEvents
{
extern const Event Query;
extern const Event QueriesWithSubqueries;
extern const Event SelectQuery;
extern const Event InsertQuery;
}
@ -131,6 +132,15 @@ std::unique_ptr<IInterpreter> InterpreterFactory::get(ASTPtr & query, ContextMut
{
ProfileEvents::increment(ProfileEvents::Query);
/// SELECT and INSERT queries will handle QueriesWithSubqueries on their own.
if (!(query->as<ASTSelectQuery>() ||
query->as<ASTSelectWithUnionQuery>() ||
query->as<ASTSelectIntersectExceptQuery>() ||
query->as<ASTInsertQuery>()))
{
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
}
if (query->as<ASTSelectQuery>())
{
if (context->getSettingsRef().allow_experimental_analyzer)

View File

@ -34,8 +34,15 @@
#include <TableFunctions/TableFunctionFactory.h>
#include <Common/ThreadStatus.h>
#include <Common/checkStackSize.h>
#include <Common/ProfileEvents.h>
namespace ProfileEvents
{
extern const Event InsertQueriesWithSubqueries;
extern const Event QueriesWithSubqueries;
}
namespace DB
{
@ -234,6 +241,9 @@ Chain InterpreterInsertQuery::buildChain(
ThreadStatusesHolderPtr thread_status_holder,
std::atomic_uint64_t * elapsed_counter_ms)
{
ProfileEvents::increment(ProfileEvents::InsertQueriesWithSubqueries);
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
ThreadGroupPtr running_group;
if (current_thread)
running_group = current_thread->getThreadGroup();

View File

@ -13,6 +13,7 @@
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ExpressionListParsers.h>
#include <Parsers/parseQuery.h>
#include <Parsers/FunctionParameterValuesVisitor.h>
#include <Access/Common/AccessFlags.h>
#include <Access/ContextAccess.h>
@ -93,11 +94,17 @@
#include <Common/FieldVisitorsAccurateComparison.h>
#include <Common/checkStackSize.h>
#include <Common/scope_guard_safe.h>
#include <Parsers/FunctionParameterValuesVisitor.h>
#include <Common/typeid_cast.h>
#include <Common/ProfileEvents.h>
#include "config_version.h"
namespace ProfileEvents
{
extern const Event SelectQueriesWithSubqueries;
extern const Event QueriesWithSubqueries;
}
namespace DB
{
@ -437,7 +444,10 @@ InterpreterSelectQuery::InterpreterSelectQuery(
if (!metadata_snapshot)
metadata_snapshot = storage->getInMemoryMetadataPtr();
storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr, context);
if (options.only_analyze)
storage_snapshot = storage->getStorageSnapshotWithoutData(metadata_snapshot, context);
else
storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr, context);
}
if (has_input || !joined_tables.resolveTables())
@ -1329,6 +1339,9 @@ static bool hasWithTotalsInAnySubqueryInFromClause(const ASTSelectQuery & query)
void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<Pipe> prepared_pipe)
{
ProfileEvents::increment(ProfileEvents::SelectQueriesWithSubqueries);
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
/** Streams of data. When the query is executed in parallel, we have several data streams.
* If there is no GROUP BY, then perform all operations before ORDER BY and LIMIT in parallel, then
* if there is an ORDER BY, then glue the streams using ResizeProcessor, and then MergeSorting transforms,

View File

@ -1,9 +1,9 @@
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/MutationsNonDeterministicHelpers.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/StorageFromMergeTreeDataPart.h>
#include <Storages/StorageMergeTree.h>
@ -31,7 +31,6 @@
#include <Interpreters/PreparedSets.h>
#include <Storages/LightweightDeleteDescription.h>
#include <Storages/MergeTree/MergeTreeSequentialSource.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Processors/Sources/ThrowingExceptionSource.h>
#include <Analyzer/QueryTreeBuilder.h>
#include <Analyzer/QueryTreePassManager.h>
@ -53,90 +52,12 @@ namespace ErrorCodes
extern const int NO_SUCH_COLUMN_IN_TABLE;
extern const int CANNOT_UPDATE_COLUMN;
extern const int UNEXPECTED_EXPRESSION;
extern const int THERE_IS_NO_COLUMN;
}
namespace
{
/// Helps to detect situations where non-deterministic functions may be used in mutations of Replicated*MergeTree.
class FirstNonDeterministicFunctionMatcher
{
public:
struct Data
{
ContextPtr context;
std::optional<String> nondeterministic_function_name;
bool subquery = false;
};
static bool needChildVisit(const ASTPtr & /*node*/, const ASTPtr & /*child*/)
{
return true;
}
static void visit(const ASTPtr & node, Data & data)
{
if (data.nondeterministic_function_name || data.subquery)
return;
if (node->as<ASTSelectQuery>())
{
/// We cannot determine if a subquery is deterministic or not,
/// so we do not allow subqueries in mutations without allow_nondeterministic_mutations=1
data.subquery = true;
}
else if (const auto * function = typeid_cast<const ASTFunction *>(node.get()))
{
/// Whether a lambda expression is deterministic is completely determined
/// by the contents of its definition, so we just descend into it.
if (function->name != "lambda")
{
/// NOTE It may be an aggregate function, so get(...) may throw.
/// However, an aggregate function can be used only in a subquery, and we do not go into subqueries.
const auto func = FunctionFactory::instance().get(function->name, data.context);
if (!func->isDeterministic())
data.nondeterministic_function_name = func->getName();
}
}
}
};
using FirstNonDeterministicFunctionFinder = InDepthNodeVisitor<FirstNonDeterministicFunctionMatcher, true>;
using FirstNonDeterministicFunctionData = FirstNonDeterministicFunctionMatcher::Data;
FirstNonDeterministicFunctionData findFirstNonDeterministicFunctionName(const MutationCommand & command, ContextPtr context)
{
FirstNonDeterministicFunctionMatcher::Data finder_data{context, std::nullopt, false};
switch (command.type)
{
case MutationCommand::UPDATE:
{
auto update_assignments_ast = command.ast->as<const ASTAlterCommand &>().update_assignments->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(update_assignments_ast);
if (finder_data.nondeterministic_function_name)
return finder_data;
/// Currently UPDATE and DELETE both always have predicates, so we can use fallthrough
[[fallthrough]];
}
case MutationCommand::DELETE:
{
auto predicate_ast = command.predicate->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(predicate_ast);
return finder_data;
}
default:
break;
}
return {};
}
ASTPtr prepareQueryAffectedAST(const std::vector<MutationCommand> & commands, const StoragePtr & storage, ContextPtr context)
{
/// Execute `SELECT count() FROM storage WHERE predicate1 OR predicate2 OR ...` query.
@ -326,10 +247,10 @@ MutationsInterpreter::Source::Source(MergeTreeData & storage_, MergeTreeData::Da
StorageSnapshotPtr MutationsInterpreter::Source::getStorageSnapshot(const StorageMetadataPtr & snapshot_, const ContextPtr & context_) const
{
if (data)
return data->getStorageSnapshot(snapshot_, context_);
if (const auto * merge_tree = getMergeTreeData())
return merge_tree->getStorageSnapshotWithoutData(snapshot_, context_);
return storage->getStorageSnapshot(snapshot_, context_);
return storage->getStorageSnapshotWithoutData(snapshot_, context_);
}
StoragePtr MutationsInterpreter::Source::getStorage() const
@ -367,20 +288,27 @@ bool MutationsInterpreter::Source::materializeTTLRecalculateOnly() const
return data && data->getSettings()->materialize_ttl_recalculate_only;
}
static Names getAvailableColumnsWithVirtuals(StorageMetadataPtr metadata_snapshot, const IStorage & storage)
{
auto all_columns = metadata_snapshot->getColumns().getNamesOfPhysical();
for (const auto & column : storage.getVirtuals())
all_columns.push_back(column.name);
return all_columns;
}
MutationsInterpreter::MutationsInterpreter(
StoragePtr storage_,
const StorageMetadataPtr & metadata_snapshot_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
ContextPtr context_,
bool can_execute_,
bool return_all_columns_,
bool return_mutated_rows_)
Settings settings_)
: MutationsInterpreter(
Source(std::move(storage_)),
metadata_snapshot_, std::move(commands_), std::move(context_),
can_execute_, return_all_columns_, return_mutated_rows_)
Source(storage_),
metadata_snapshot_, std::move(commands_),
getAvailableColumnsWithVirtuals(metadata_snapshot_, *storage_),
std::move(context_), std::move(settings_))
{
if (can_execute_ && dynamic_cast<const MergeTreeData *>(source.getStorage().get()))
if (settings.can_execute && dynamic_cast<const MergeTreeData *>(source.getStorage().get()))
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
@ -392,37 +320,34 @@ MutationsInterpreter::MutationsInterpreter(
MutationsInterpreter::MutationsInterpreter(
MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_,
const StorageMetadataPtr & metadata_snapshot_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
Names available_columns_,
ContextPtr context_,
bool can_execute_,
bool return_all_columns_,
bool return_mutated_rows_)
Settings settings_)
: MutationsInterpreter(
Source(storage_, std::move(source_part_)),
metadata_snapshot_, std::move(commands_), std::move(context_),
can_execute_, return_all_columns_, return_mutated_rows_)
std::move(metadata_snapshot_), std::move(commands_),
std::move(available_columns_), std::move(context_), std::move(settings_))
{
}
MutationsInterpreter::MutationsInterpreter(
Source source_,
const StorageMetadataPtr & metadata_snapshot_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
Names available_columns_,
ContextPtr context_,
bool can_execute_,
bool return_all_columns_,
bool return_mutated_rows_)
Settings settings_)
: source(std::move(source_))
, metadata_snapshot(metadata_snapshot_)
, commands(std::move(commands_))
, available_columns(std::move(available_columns_))
, context(Context::createCopy(context_))
, can_execute(can_execute_)
, select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections())
, return_all_columns(return_all_columns_)
, return_mutated_rows(return_mutated_rows_)
, settings(std::move(settings_))
, select_limits(SelectQueryOptions().analyze(!settings.can_execute).ignoreLimits().ignoreProjections())
{
prepare(!can_execute);
prepare(!settings.can_execute);
}
static NameSet getKeyColumns(const MutationsInterpreter::Source & source, const StorageMetadataPtr & metadata_snapshot)
@ -546,16 +471,18 @@ void MutationsInterpreter::prepare(bool dry_run)
const ColumnsDescription & columns_desc = metadata_snapshot->getColumns();
const IndicesDescription & indices_desc = metadata_snapshot->getSecondaryIndices();
const ProjectionsDescription & projections_desc = metadata_snapshot->getProjections();
NamesAndTypesList all_columns = columns_desc.getAllPhysical();
auto storage_snapshot = std::make_shared<StorageSnapshot>(*source.getStorage(), metadata_snapshot);
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withVirtuals();
auto all_columns = storage_snapshot->getColumnsByNames(options, available_columns);
NameSet available_columns_set(available_columns.begin(), available_columns.end());
/// Add _row_exists column if it is physically present in the part
if (source.hasLightweightDeleteMask())
all_columns.push_back({LightweightDeleteDescription::FILTER_COLUMN});
if (return_all_columns)
{
for (const auto & column : source.getStorage()->getVirtuals())
all_columns.push_back(column);
all_columns.push_back({LightweightDeleteDescription::FILTER_COLUMN});
available_columns_set.insert(LightweightDeleteDescription::FILTER_COLUMN.name);
}
NameSet updated_columns;
@ -567,9 +494,13 @@ void MutationsInterpreter::prepare(bool dry_run)
|| command.type == MutationCommand::Type::DELETE)
materialize_ttl_recalculate_only = false;
for (const auto & kv : command.column_to_update_expression)
for (const auto & [name, _] : command.column_to_update_expression)
{
updated_columns.insert(kv.first);
if (!available_columns_set.contains(name) && name != LightweightDeleteDescription::FILTER_COLUMN.name)
throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
"Column {} is updated but not requested to read", name);
updated_columns.insert(name);
}
}
@ -580,29 +511,28 @@ void MutationsInterpreter::prepare(bool dry_run)
{
for (const auto & column : columns_desc)
{
if (column.default_desc.kind == ColumnDefaultKind::Materialized)
if (column.default_desc.kind == ColumnDefaultKind::Materialized && available_columns_set.contains(column.name))
{
auto query = column.default_desc.expression->clone();
auto syntax_result = TreeRewriter(context).analyze(query, all_columns);
for (const String & dependency : syntax_result->requiredSourceColumns())
{
for (const auto & dependency : syntax_result->requiredSourceColumns())
if (updated_columns.contains(dependency))
column_to_affected_materialized[dependency].push_back(column.name);
}
}
}
validateUpdateColumns(source, metadata_snapshot, updated_columns, column_to_affected_materialized);
}
dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns);
if (settings.recalculate_dependencies_of_updated_columns)
dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns);
std::vector<String> read_columns;
/// First, break a sequence of commands into stages.
for (auto & command : commands)
{
// We can return deleted rows only if it is the only command present.
assert(command.type == MutationCommand::DELETE || command.type == MutationCommand::UPDATE || !return_mutated_rows);
assert(command.type == MutationCommand::DELETE || command.type == MutationCommand::UPDATE || !settings.return_mutated_rows);
if (command.type == MutationCommand::DELETE)
{
@ -612,7 +542,7 @@ void MutationsInterpreter::prepare(bool dry_run)
auto predicate = getPartitionAndPredicateExpressionForMutationCommand(command);
if (!return_mutated_rows)
if (!settings.return_mutated_rows)
predicate = makeASTFunction("isZeroOrNull", predicate);
stages.back().filters.push_back(predicate);
@ -700,7 +630,7 @@ void MutationsInterpreter::prepare(bool dry_run)
stages.back().column_to_updated.emplace(column, updated_column);
if (condition && return_mutated_rows)
if (condition && settings.return_mutated_rows)
stages.back().filters.push_back(condition);
}
@ -909,17 +839,15 @@ void MutationsInterpreter::prepare(bool dry_run)
}
is_prepared = true;
prepareMutationStages(stages, dry_run);
}
void MutationsInterpreter::prepareMutationStages(std::vector<Stage> & prepared_stages, bool dry_run)
{
auto storage_snapshot = source.getStorageSnapshot(metadata_snapshot, context);
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects();
if (return_all_columns)
options.withVirtuals();
auto all_columns = storage_snapshot->getColumns(options);
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withVirtuals();
auto all_columns = storage_snapshot->getColumnsByNames(options, available_columns);
/// Add _row_exists column if it is present in the part
if (source.hasLightweightDeleteMask())
@ -928,7 +856,7 @@ void MutationsInterpreter::prepareMutationStages(std::vector<Stage> & prepared_s
/// Next, for each stage calculate columns changed by this and previous stages.
for (size_t i = 0; i < prepared_stages.size(); ++i)
{
if (return_all_columns || !prepared_stages[i].filters.empty())
if (settings.return_all_columns || !prepared_stages[i].filters.empty())
{
for (const auto & column : all_columns)
prepared_stages[i].output_columns.insert(column.name);
@ -1054,8 +982,7 @@ struct VirtualColumns
{
if (columns_to_read[i] == LightweightDeleteDescription::FILTER_COLUMN.name)
{
LoadedMergeTreeDataPartInfoForReader part_info_reader(part);
if (!part_info_reader.getColumns().contains(LightweightDeleteDescription::FILTER_COLUMN.name))
if (!part->getColumns().contains(LightweightDeleteDescription::FILTER_COLUMN.name))
{
ColumnWithTypeAndName mask_column;
mask_column.type = LightweightDeleteDescription::FILTER_COLUMN.type;
@ -1144,7 +1071,6 @@ void MutationsInterpreter::Source::read(
ActionsDAGPtr filter;
if (!first_stage.filter_column_names.empty())
{
ActionsDAG::NodeRawConstPtrs nodes(num_filters);
for (size_t i = 0; i < num_filters; ++i)
nodes[i] = &steps[i]->actions()->findInOutputs(names[i]);
@ -1155,7 +1081,9 @@ void MutationsInterpreter::Source::read(
VirtualColumns virtual_columns(std::move(required_columns), part);
createMergeTreeSequentialSource(
plan, *data, storage_snapshot, part, std::move(virtual_columns.columns_to_read), apply_deleted_mask_, filter, context_,
plan, *data, storage_snapshot, part,
std::move(virtual_columns.columns_to_read),
apply_deleted_mask_, filter, context_,
&Poco::Logger::get("MutationsInterpreter"));
virtual_columns.addVirtuals(plan);
@ -1208,7 +1136,7 @@ void MutationsInterpreter::Source::read(
void MutationsInterpreter::initQueryPlan(Stage & first_stage, QueryPlan & plan)
{
source.read(first_stage, plan, metadata_snapshot, context, apply_deleted_mask, can_execute);
source.read(first_stage, plan, metadata_snapshot, context, settings.apply_deleted_mask, settings.can_execute);
addCreatingSetsStep(plan, first_stage.analyzer->getPreparedSets(), context);
}
@ -1221,6 +1149,7 @@ QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::v
const auto & step = stage.expressions_chain.steps[i];
if (step->actions()->hasArrayJoin())
throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "arrayJoin is not allowed in mutations");
if (i < stage.filter_column_names.size())
{
/// Execute DELETEs.
@ -1253,15 +1182,13 @@ QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::v
void MutationsInterpreter::validate()
{
const Settings & settings = context->getSettingsRef();
/// For Replicated* storages mutations cannot employ non-deterministic functions
/// because that produces inconsistencies between replicas
if (startsWith(source.getStorage()->getName(), "Replicated") && !settings.allow_nondeterministic_mutations)
if (startsWith(source.getStorage()->getName(), "Replicated") && !context->getSettingsRef().allow_nondeterministic_mutations)
{
for (const auto & command : commands)
{
const auto nondeterministic_func_data = findFirstNonDeterministicFunctionName(command, context);
const auto nondeterministic_func_data = findFirstNonDeterministicFunction(command, context);
if (nondeterministic_func_data.subquery)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "ALTER UPDATE/ALTER DELETE statement with subquery may be nondeterministic, "
"see allow_nondeterministic_mutations setting");
@ -1281,7 +1208,7 @@ void MutationsInterpreter::validate()
QueryPipelineBuilder MutationsInterpreter::execute()
{
if (!can_execute)
if (!settings.can_execute)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot execute mutations interpreter because can_execute flag set to false");
QueryPlan plan;

View File

@ -36,30 +36,44 @@ ASTPtr getPartitionAndPredicateExpressionForMutationCommand(
/// to this data.
class MutationsInterpreter
{
private:
struct Stage;
public:
struct Settings
{
explicit Settings(bool can_execute_) : can_execute(can_execute_) {}
/// If false, only analyze mutation expressions.
bool can_execute = false;
/// Whether all columns should be returned, not just the updated ones.
bool return_all_columns = false;
/// Whether we should return only the mutated rows or all existing rows.
bool return_mutated_rows = false;
/// Whether we should filter out rows deleted by lightweight DELETE.
bool apply_deleted_mask = true;
/// Whether we should recalculate skip indexes, TTL expressions, etc. that depend on updated columns.
bool recalculate_dependencies_of_updated_columns = true;
};
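A minimal usage sketch of the new Settings struct, matching the updated call sites earlier in this changeset (table, metadata_snapshot, mutation_commands and context are assumed to be in scope):

    /// Validate-only run: analyze the mutation expressions without executing them.
    MutationsInterpreter::Settings interpreter_settings(/*can_execute_=*/ false);
    interpreter_settings.apply_deleted_mask = false; /// optional: also read rows hidden by lightweight DELETE
    MutationsInterpreter(table, metadata_snapshot, mutation_commands, context, interpreter_settings).validate();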
/// Storage to mutate, array of mutation commands and context. If you really want to execute the mutation,
/// use can_execute = true; in other cases (validation, counting commands) it can be false
MutationsInterpreter(
StoragePtr storage_,
const StorageMetadataPtr & metadata_snapshot_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
ContextPtr context_,
bool can_execute_,
bool return_all_columns_ = false,
bool return_mutated_rows_ = false);
Settings settings_);
/// Special case for *MergeTree
MutationsInterpreter(
MergeTreeData & storage_,
MergeTreeData::DataPartPtr source_part_,
const StorageMetadataPtr & metadata_snapshot_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
Names available_columns_,
ContextPtr context_,
bool can_execute_,
bool return_all_columns_ = false,
bool return_mutated_rows_ = false);
Settings settings_);
void validate();
size_t evaluateCommandsSize();
@ -93,8 +107,6 @@ public:
MutationKind::MutationKindEnum getMutationKind() const { return mutation_kind.mutation_kind; }
void setApplyDeletedMask(bool apply) { apply_deleted_mask = apply; }
/// Internal class which represents a data part for MergeTree,
/// or just the storage for other storages.
/// The main idea is to allow a dedicated way of reading from a MergeTree part.
@ -131,12 +143,11 @@ public:
private:
MutationsInterpreter(
Source source_,
const StorageMetadataPtr & metadata_snapshot_,
StorageMetadataPtr metadata_snapshot_,
MutationCommands commands_,
Names available_columns_,
ContextPtr context_,
bool can_execute_,
bool return_all_columns_,
bool return_mutated_rows_);
Settings settings_);
void prepare(bool dry_run);
@ -151,12 +162,11 @@ private:
Source source;
StorageMetadataPtr metadata_snapshot;
MutationCommands commands;
Names available_columns;
ContextPtr context;
bool can_execute;
Settings settings;
SelectQueryOptions select_limits;
bool apply_deleted_mask = true;
/// A sequence of mutation commands is executed as a sequence of stages. Each stage consists of several
/// filters, followed by updating values of some columns. Commands can reuse expressions calculated by the
/// previous commands in the same stage, but at the end of each stage intermediate columns are thrown away
@ -206,12 +216,6 @@ private:
/// Columns, that we need to read for calculation of skip indices, projections or TTL expressions.
ColumnDependencies dependencies;
// whether all columns should be returned, not just updated
bool return_all_columns;
// whether we should return mutated or all existing rows
bool return_mutated_rows;
};
}

View File

@ -0,0 +1,100 @@
#include "Parsers/IAST_fwd.h"
#include <Interpreters/MutationsNonDeterministicHelpers.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTAlterQuery.h>
#include <Storages/MutationCommands.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/ExecuteScalarSubqueriesVisitor.h>
#include <Interpreters/addTypeConversionToAST.h>
#include <Interpreters/Context.h>
#include <Functions/FunctionFactory.h>
namespace DB
{
namespace
{
/// Helps to detect situations where non-deterministic functions may be used in mutations.
class FirstNonDeterministicFunctionMatcher
{
public:
struct Data
{
ContextPtr context;
FirstNonDeterministicFunctionResult result;
};
static bool needChildVisit(const ASTPtr & /*node*/, const ASTPtr & /*child*/)
{
return true;
}
static void visit(const ASTPtr & node, Data & data)
{
if (data.result.nondeterministic_function_name || data.result.subquery)
return;
if (node->as<ASTSelectQuery>())
{
/// We cannot determine if a subquery is deterministic or not,
/// so we do not allow subqueries in mutations without allow_nondeterministic_mutations=1
data.result.subquery = true;
}
else if (const auto * function = typeid_cast<const ASTFunction *>(node.get()))
{
/// Whether a lambda expression is deterministic is completely determined
/// by the contents of its definition, so we just descend into it.
if (function->name != "lambda")
{
/// NOTE It may be an aggregate function, so get(...) may throw.
/// However, an aggregate function can be used only in a subquery, and we do not go into subqueries.
const auto func = FunctionFactory::instance().get(function->name, data.context);
if (!func->isDeterministic())
data.result.nondeterministic_function_name = func->getName();
}
}
}
};
using FirstNonDeterministicFunctionFinder = InDepthNodeVisitor<FirstNonDeterministicFunctionMatcher, true>;
using FirstNonDeterministicFunctionData = FirstNonDeterministicFunctionMatcher::Data;
}
FirstNonDeterministicFunctionResult findFirstNonDeterministicFunction(const MutationCommand & command, ContextPtr context)
{
FirstNonDeterministicFunctionMatcher::Data finder_data{context, {}};
switch (command.type)
{
case MutationCommand::UPDATE:
{
auto update_assignments_ast = command.ast->as<const ASTAlterCommand &>().update_assignments->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(update_assignments_ast);
if (finder_data.result.nondeterministic_function_name)
return finder_data.result;
/// Currently UPDATE and DELETE both always have predicates, so we can use fallthrough
[[fallthrough]];
}
case MutationCommand::DELETE:
{
auto predicate_ast = command.predicate->clone();
FirstNonDeterministicFunctionFinder(finder_data).visit(predicate_ast);
return finder_data.result;
}
default:
break;
}
return {};
}
}

View File

@ -0,0 +1,21 @@
#pragma once
#include <Interpreters/Context_fwd.h>
#include <Parsers/IAST_fwd.h>
#include <Core/Types.h>
namespace DB
{
struct MutationCommand;
struct FirstNonDeterministicFunctionResult
{
std::optional<String> nondeterministic_function_name;
bool subquery = false;
};
/// Searches the expressions of a mutation command for non-deterministic functions
/// and for subqueries, which may also be non-deterministic.
FirstNonDeterministicFunctionResult findFirstNonDeterministicFunction(const MutationCommand & command, ContextPtr context);
}
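For reference, a sketch of how this helper is consumed by the updated MutationsInterpreter::validate above. The first exception text is taken from that call site; the message in the second branch is hypothetical, since it falls outside the quoted hunk:

    const auto result = findFirstNonDeterministicFunction(command, context);
    if (result.subquery)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "ALTER UPDATE/ALTER DELETE statement with subquery may be nondeterministic, "
            "see allow_nondeterministic_mutations setting");
    if (result.nondeterministic_function_name)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Function '{}' is non-deterministic and cannot be used in mutations of replicated tables",
            *result.nondeterministic_function_name); /// hypothetical message text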

View File

@ -143,28 +143,58 @@ std::shared_ptr<TSystemLog> createSystemLog(
"If 'engine' is specified for system table, PARTITION BY parameters should "
"be specified directly inside 'engine' and 'partition_by' setting doesn't make sense");
if (config.has(config_prefix + ".ttl"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "If 'engine' is specified for system table, "
"TTL parameters should be specified directly inside 'engine' and 'ttl' setting doesn't make sense");
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"If 'engine' is specified for system table, TTL parameters should "
"be specified directly inside 'engine' and 'ttl' setting doesn't make sense");
if (config.has(config_prefix + ".order_by"))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"If 'engine' is specified for system table, ORDER BY parameters should "
"be specified directly inside 'engine' and 'order_by' setting doesn't make sense");
if (config.has(config_prefix + ".storage_policy"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "If 'engine' is specified for system table, SETTINGS storage_policy = '...' "
"should be specified directly inside 'engine' and 'storage_policy' setting doesn't make sense");
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"If 'engine' is specified for system table, SETTINGS storage_policy = '...' should "
"be specified directly inside 'engine' and 'storage_policy' setting doesn't make sense");
if (config.has(config_prefix + ".settings"))
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"If 'engine' is specified for system table, SETTINGS parameters should "
"be specified directly inside 'engine' and 'settings' setting doesn't make sense");
engine = config.getString(config_prefix + ".engine");
}
else
{
String partition_by = config.getString(config_prefix + ".partition_by", "toYYYYMM(event_date)");
/// ENGINE expr is necessary.
engine = "ENGINE = MergeTree";
/// PARTITION expr is not necessary.
String partition_by = config.getString(config_prefix + ".partition_by", "toYYYYMM(event_date)");
if (!partition_by.empty())
engine += " PARTITION BY (" + partition_by + ")";
/// TTL expr is not necessary.
String ttl = config.getString(config_prefix + ".ttl", "");
if (!ttl.empty())
engine += " TTL " + ttl;
engine += " ORDER BY ";
engine += TSystemLog::getDefaultOrderBy();
/// ORDER BY expr is necessary.
String order_by = config.getString(config_prefix + ".order_by", TSystemLog::getDefaultOrderBy());
engine += " ORDER BY (" + order_by + ")";
/// SETTINGS expr is not necessary.
/// https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#settings
///
/// The STORAGE POLICY expr is retained for backward compatibility.
String storage_policy = config.getString(config_prefix + ".storage_policy", "");
if (!storage_policy.empty())
engine += " SETTINGS storage_policy = " + quoteString(storage_policy);
String settings = config.getString(config_prefix + ".settings", "");
if (!storage_policy.empty() || !settings.empty())
{
engine += " SETTINGS";
/// If 'storage_policy' is repeated, the 'settings' configuration is preferred.
if (!storage_policy.empty())
engine += " storage_policy = " + quoteString(storage_policy);
if (!settings.empty())
engine += (storage_policy.empty() ? " " : ", ") + settings;
}
}
/// Validate engine definition syntax to prevent some configuration errors.
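To illustrate the branch above, a config setting all five keys (values here are hypothetical) would be assembled into a single engine string, shown wrapped for readability:

    /// partition_by = "toYYYYMM(event_date)", ttl = "event_date + INTERVAL 30 DAY DELETE",
    /// order_by = "event_date, event_time", storage_policy = "policy1",
    /// settings = "ttl_only_drop_parts = 1" would yield:
    ///
    ///   ENGINE = MergeTree PARTITION BY (toYYYYMM(event_date))
    ///   TTL event_date + INTERVAL 30 DAY DELETE
    ///   ORDER BY (event_date, event_time)
    ///   SETTINGS storage_policy = 'policy1', ttl_only_drop_parts = 1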

View File

@ -455,7 +455,11 @@ void executeScalarSubqueries(
ASTPtr & query, ContextPtr context, size_t subquery_depth, Scalars & scalars, Scalars & local_scalars, bool only_analyze, bool is_create_parameterized_view)
{
LogAST log;
ExecuteScalarSubqueriesVisitor::Data visitor_data{WithContext{context}, subquery_depth, scalars, local_scalars, only_analyze, is_create_parameterized_view};
ExecuteScalarSubqueriesVisitor::Data visitor_data{
WithContext{context}, subquery_depth, scalars,
local_scalars, only_analyze, is_create_parameterized_view,
/*replace_only_to_literals=*/ false, /*max_literal_size=*/ std::nullopt};
ExecuteScalarSubqueriesVisitor(visitor_data, log.stream()).visit(query);
}

View File

@ -666,9 +666,13 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal));
if (context->getCurrentTransaction() && !interpreter->supportsTransactions() &&
context->getSettingsRef().throw_on_unsupported_query_inside_transaction)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for this type of query ({})", ast->getID());
const auto & query_settings = context->getSettingsRef();
if (context->getCurrentTransaction() && query_settings.throw_on_unsupported_query_inside_transaction)
{
if (!interpreter->supportsTransactions())
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for this type of query ({})", ast->getID());
}
if (!interpreter->ignoreQuota() && !quota_checked)
{

View File

@ -2,6 +2,7 @@
#include <Core/ProtocolDefines.h>
#include <Common/logger_useful.h>
#include <Common/ProfileEvents.h>
#include <DataTypes/DataTypeString.h>
@ -73,6 +74,12 @@
#include <Planner/CollectColumnIdentifiers.h>
#include <Planner/PlannerQueryProcessingInfo.h>
namespace ProfileEvents
{
extern const Event SelectQueriesWithSubqueries;
extern const Event QueriesWithSubqueries;
}
namespace DB
{
@ -1155,6 +1162,9 @@ void Planner::buildPlanForUnionNode()
void Planner::buildPlanForQueryNode()
{
ProfileEvents::increment(ProfileEvents::SelectQueriesWithSubqueries);
ProfileEvents::increment(ProfileEvents::QueriesWithSubqueries);
auto & query_node = query_tree->as<QueryNode &>();
const auto & query_context = planner_context->getQueryContext();
@ -1192,13 +1202,14 @@ void Planner::buildPlanForQueryNode()
const auto & settings = query_context->getSettingsRef();
if (planner_context->getTableExpressionNodeToData().size() > 1
&& (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas > 0))
/// Check support for JOIN for parallel replicas with custom key
if (planner_context->getTableExpressionNodeToData().size() > 1)
{
if (settings.allow_experimental_parallel_reading_from_replicas == 1)
if (settings.allow_experimental_parallel_reading_from_replicas == 1 || !settings.parallel_replicas_custom_key.value.empty())
{
LOG_WARNING(
&Poco::Logger::get("Planner"), "JOINs are not supported with parallel replicas. Query will be executed without using them.");
LOG_WARNING(
&Poco::Logger::get("Planner"),
"JOINs are not supported with parallel replicas. Query will be executed without using them.");
auto & mutable_context = planner_context->getMutableQueryContext();
mutable_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));

View File

@ -75,7 +75,7 @@ void ArrowBlockOutputFormat::finalizeImpl()
{
if (!writer)
{
const Block & header = getPort(PortKind::Main).getHeader();
Block header = materializeBlock(getPort(PortKind::Main).getHeader());
consume(Chunk(header.getColumns(), 0));
}

View File

@ -76,7 +76,7 @@ public:
bool checkEndOfRow();
bool checkForSuffixImpl(bool check_eof);
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf); }
inline void skipSpaces() { if (ignore_spaces) skipWhitespaceIfAny(*buf, true); }
EscapingRule getEscapingRule() const override { return format_settings.custom.escaping_rule; }

View File

@ -105,7 +105,7 @@ void ParquetBlockOutputFormat::finalizeImpl()
if (!file_writer)
{
const Block & header = getPort(PortKind::Main).getHeader();
Block header = materializeBlock(getPort(PortKind::Main).getHeader());
write(Chunk(header.getColumns(), 0), 1);
}

View File

@ -582,7 +582,7 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes &
ContextPtr context = reading->getContext();
MergeTreeDataSelectExecutor reader(reading->getMergeTreeData());
auto ordinary_reading_select_result = reading->selectRangesToRead(parts);
auto ordinary_reading_select_result = reading->selectRangesToRead(parts, /* alter_conversions = */ {});
size_t ordinary_reading_marks = ordinary_reading_select_result->marks();
/// Selecting best candidate.
@ -640,7 +640,8 @@ bool optimizeUseAggregateProjections(QueryPlan::Node & node, QueryPlan::Nodes &
query_info_copy.prewhere_info = nullptr;
projection_reading = reader.readFromParts(
{},
/* parts = */ {},
/* alter_conversions = */ {},
best_candidate->dag->getRequiredColumnsNames(),
proj_snapshot,
query_info_copy,

View File

@ -110,10 +110,7 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
return false;
if (query.dag)
{
query.dag->removeUnusedActions();
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Query DAG: {}", query.dag->dumpDAG());
}
}
std::list<NormalProjectionCandidate> candidates;
@ -125,12 +122,9 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
ContextPtr context = reading->getContext();
MergeTreeDataSelectExecutor reader(reading->getMergeTreeData());
auto ordinary_reading_select_result = reading->selectRangesToRead(parts);
auto ordinary_reading_select_result = reading->selectRangesToRead(parts, /* alter_conversions = */ {});
size_t ordinary_reading_marks = ordinary_reading_select_result->marks();
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"),
// "Marks for ordinary reading {}", ordinary_reading_marks);
std::shared_ptr<PartitionIdToMaxBlock> max_added_blocks = getMaxAddedBlocks(reading);
for (const auto * projection : normal_projections)
@ -152,9 +146,6 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
if (!analyzed)
continue;
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"),
// "Marks for projection {} {}", projection->name ,candidate.sum_marks);
if (candidate.sum_marks >= ordinary_reading_marks)
continue;
@ -173,14 +164,12 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
storage_snapshot->storage, storage_snapshot->metadata, storage_snapshot->object_columns); //, storage_snapshot->data);
proj_snapshot->addProjection(best_candidate->projection);
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Proj snapshot {}",
// proj_snapshot->getColumns(GetColumnsOptions::Kind::All).toString());
auto query_info_copy = query_info;
query_info_copy.prewhere_info = nullptr;
auto projection_reading = reader.readFromParts(
{},
/*parts=*/ {},
/*alter_conversions=*/ {},
required_columns,
proj_snapshot,
query_info_copy,
@ -201,9 +190,6 @@ bool optimizeUseNormalProjections(Stack & stack, QueryPlan::Nodes & nodes)
if (has_ordinary_parts)
reading->setAnalyzedResult(std::move(best_candidate->merge_tree_ordinary_select_result_ptr));
// LOG_TRACE(&Poco::Logger::get("optimizeUseProjections"), "Projection reading header {}",
// projection_reading->getOutputStream().header.dumpStructure());
projection_reading->setStepDescription(best_candidate->projection->name);
auto & projection_reading_node = nodes.emplace_back(QueryPlan::Node{.step = std::move(projection_reading)});

View File

@ -248,7 +248,7 @@ bool analyzeProjectionCandidate(
if (!normal_parts.empty())
{
auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts));
auto normal_result_ptr = reading.selectRangesToRead(std::move(normal_parts), /* alter_conversions = */ {});
if (normal_result_ptr->error())
return false;

View File

@ -142,8 +142,10 @@ std::pair<std::vector<Values>, std::vector<RangesInDataParts>> split(RangesInDat
{
result_layers.back().emplace_back(
parts[part_idx].data_part,
parts[part_idx].alter_conversions,
parts[part_idx].part_index_in_query,
MarkRanges{{current_part_range_begin[part_idx], current.range.end}});
current_part_range_begin.erase(part_idx);
current_part_range_end.erase(part_idx);
continue;
@ -170,8 +172,10 @@ std::pair<std::vector<Values>, std::vector<RangesInDataParts>> split(RangesInDat
{
result_layers.back().emplace_back(
parts[part_idx].data_part,
parts[part_idx].alter_conversions,
parts[part_idx].part_index_in_query,
MarkRanges{{current_part_range_begin[part_idx], last_mark + 1}});
current_part_range_begin[part_idx] = current_part_range_end[part_idx];
}
}

View File

@ -171,6 +171,7 @@ void ReadFromMergeTree::AnalysisResult::checkLimits(const Settings & settings, c
ReadFromMergeTree::ReadFromMergeTree(
MergeTreeData::DataPartsVector parts_,
std::vector<AlterConversionsPtr> alter_conversions_,
Names real_column_names_,
Names virt_column_names_,
const MergeTreeData & data_,
@ -191,6 +192,7 @@ ReadFromMergeTree::ReadFromMergeTree(
virt_column_names_)})
, reader_settings(getMergeTreeReaderSettings(context_, query_info_))
, prepared_parts(std::move(parts_))
, alter_conversions_for_parts(std::move(alter_conversions_))
, real_column_names(std::move(real_column_names_))
, virt_column_names(std::move(virt_column_names_))
, data(data_)
@ -307,8 +309,7 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
reader_settings,
required_columns,
virt_column_names,
min_marks_for_concurrent_read
);
min_marks_for_concurrent_read);
Pipes pipes;
const auto & settings = context->getSettingsRef();
@ -459,7 +460,7 @@ ProcessorPtr ReadFromMergeTree::createSource(
bool set_rows_approx = !is_parallel_reading_from_replicas && !reader_settings.read_in_order;
auto algorithm = std::make_unique<Algorithm>(
data, storage_snapshot, part.data_part, max_block_size, preferred_block_size_bytes,
data, storage_snapshot, part.data_part, part.alter_conversions, max_block_size, preferred_block_size_bytes,
preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info,
actions_settings, reader_settings, pool, virt_column_names, part.part_index_in_query, has_limit_below_one_block);
@ -808,7 +809,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder(
}
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
new_parts.emplace_back(part.data_part, part.part_index_in_query, std::move(ranges_to_get_from_part));
new_parts.emplace_back(part.data_part, part.alter_conversions, part.part_index_in_query, std::move(ranges_to_get_from_part));
}
splitted_parts_and_ranges.emplace_back(std::move(new_parts));
@ -1000,7 +1001,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
{
for (auto part_it = parts_to_merge_ranges[range_index]; part_it != parts_to_merge_ranges[range_index + 1]; ++part_it)
{
new_parts.emplace_back(part_it->data_part, part_it->part_index_in_query, part_it->ranges);
new_parts.emplace_back(part_it->data_part, part_it->alter_conversions, part_it->part_index_in_query, part_it->ranges);
}
}
@ -1111,10 +1112,13 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
return Pipe::unitePipes(std::move(partition_pipes));
}
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(MergeTreeData::DataPartsVector parts) const
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions) const
{
return selectRangesToRead(
std::move(parts),
std::move(alter_conversions),
prewhere_info,
filter_nodes,
storage_snapshot->metadata,
@ -1131,6 +1135,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(Merge
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const PrewhereInfoPtr & prewhere_info,
const ActionDAGNodes & added_filter_nodes,
const StorageMetadataPtr & metadata_snapshot_base,
@ -1182,7 +1187,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
updated_query_info_with_filter_dag.filter_actions_dag = ActionsDAG::buildFilterActionsDAG(nodes, node_name_to_input_node_column, context);
return selectRangesToReadImpl(
parts,
std::move(parts),
std::move(alter_conversions),
metadata_snapshot_base,
metadata_snapshot,
updated_query_info_with_filter_dag,
@ -1196,7 +1202,8 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
}
return selectRangesToReadImpl(
parts,
std::move(parts),
std::move(alter_conversions),
metadata_snapshot_base,
metadata_snapshot,
query_info,
@ -1211,6 +1218,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const StorageMetadataPtr & metadata_snapshot_base,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
@ -1284,6 +1292,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
{
MergeTreeDataSelectExecutor::filterPartsByPartition(
parts,
alter_conversions,
part_values,
metadata_snapshot_base,
data,
@ -1321,6 +1330,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
result.parts_with_ranges = MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipIndexes(
std::move(parts),
std::move(alter_conversions),
metadata_snapshot,
query_info,
context,
@ -1491,7 +1501,7 @@ bool ReadFromMergeTree::requestOutputEachPartitionThroughSeparatePort()
ReadFromMergeTree::AnalysisResult ReadFromMergeTree::getAnalysisResult() const
{
auto result_ptr = analyzed_result_ptr ? analyzed_result_ptr : selectRangesToRead(prepared_parts);
auto result_ptr = analyzed_result_ptr ? analyzed_result_ptr : selectRangesToRead(prepared_parts, alter_conversions_for_parts);
if (std::holds_alternative<std::exception_ptr>(result_ptr->result))
std::rethrow_exception(std::get<std::exception_ptr>(result_ptr->result));
@ -1720,7 +1730,6 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons
for (const auto & processor : pipe.getProcessors())
processors.emplace_back(processor);
pipeline.init(std::move(pipe));
// Attach QueryIdHolder if needed
if (query_id_holder)

View File

@ -5,6 +5,7 @@
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/AlterConversions.h>
namespace DB
{
@ -97,6 +98,7 @@ public:
ReadFromMergeTree(
MergeTreeData::DataPartsVector parts_,
std::vector<AlterConversionsPtr> alter_conversions_,
Names real_column_names_,
Names virt_column_names_,
const MergeTreeData & data_,
@ -134,6 +136,7 @@ public:
static MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const PrewhereInfoPtr & prewhere_info,
const ActionDAGNodes & added_filter_nodes,
const StorageMetadataPtr & metadata_snapshot_base,
@ -147,7 +150,9 @@ public:
bool sample_factor_column_queried,
Poco::Logger * log);
MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(MergeTreeData::DataPartsVector parts) const;
MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions) const;
ContextPtr getContext() const { return context; }
const SelectQueryInfo & getQueryInfo() const { return query_info; }
@ -168,7 +173,12 @@ public:
bool hasAnalyzedResult() const { return analyzed_result_ptr != nullptr; }
void setAnalyzedResult(MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); }
void resetParts(MergeTreeData::DataPartsVector parts) { prepared_parts = std::move(parts); }
void resetParts(MergeTreeData::DataPartsVector parts)
{
prepared_parts = std::move(parts);
alter_conversions_for_parts = {};
}
const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; }
const MergeTreeData & getMergeTreeData() const { return data; }
@ -179,6 +189,7 @@ public:
private:
static MergeTreeDataSelectAnalysisResultPtr selectRangesToReadImpl(
MergeTreeData::DataPartsVector parts,
std::vector<AlterConversionsPtr> alter_conversions,
const StorageMetadataPtr & metadata_snapshot_base,
const StorageMetadataPtr & metadata_snapshot,
const SelectQueryInfo & query_info,
@ -202,6 +213,8 @@ private:
MergeTreeReaderSettings reader_settings;
MergeTreeData::DataPartsVector prepared_parts;
std::vector<AlterConversionsPtr> alter_conversions_for_parts;
Names real_column_names;
Names virt_column_names;

View File

@ -34,7 +34,7 @@ bool RemoteQueryExecutorReadContext::checkBeforeTaskResume()
}
void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, ResumeCallback suspend_callback)
void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, SuspendCallback suspend_callback)
{
read_context.executor.sendQueryUnlocked(ClientInfo::QueryKind::SECONDARY_QUERY, async_callback);
read_context.is_query_sent = true;

View File

@ -58,7 +58,7 @@ private:
RemoteQueryExecutorReadContext & read_context;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
std::atomic_bool is_in_progress = false;

View File

@ -244,7 +244,13 @@ struct IcebergMetadataParser<Configuration, MetadataReadHelper>::Impl
const auto * str_col = assert_cast<const ColumnString *>(col_str.get());
for (size_t i = 0; i < str_col->size(); ++i)
keys.emplace_back(str_col->getDataAt(i).toView());
{
const auto data_path = std::string(str_col->getDataAt(i).toView());
const auto pos = data_path.find(configuration.url.key);
if (pos == std::string::npos)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected to find {} in data path: {}", configuration.url.key, data_path);
keys.emplace_back(data_path.substr(pos));
}
}
return keys;
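A worked example of the trimming above, with hypothetical values:

    /// configuration.url.key = "warehouse/table"
    /// manifest data path    = "s3://bucket/warehouse/table/data/00000-0.parquet"
    /// data_path.find(key) == 12, so the stored key becomes
    /// "warehouse/table/data/00000-0.parquet", i.e. the path relative to the bucket;
    /// a data path that does not contain the key raises BAD_ARGUMENTS.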

View File

@ -661,6 +661,12 @@ public:
return getStorageSnapshot(metadata_snapshot, query_context);
}
/// Creates a storage snapshot but without holding data specific to the storage.
virtual StorageSnapshotPtr getStorageSnapshotWithoutData(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const
{
return getStorageSnapshot(metadata_snapshot, query_context);
}
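A sketch of how a storage with heavy per-snapshot state might override the new hook; it assumes the two-argument StorageSnapshot constructor used elsewhere in this changeset, so treat it as illustrative:

    StorageSnapshotPtr getStorageSnapshotWithoutData(
        const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const override
    {
        /// Metadata only: skip collecting object columns and per-part data.
        return std::make_shared<StorageSnapshot>(*this, metadata_snapshot);
    }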
/// A helper to implement read()
static void readFromPipe(
QueryPlan & query_plan,

View File

@ -9,6 +9,13 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
void AlterConversions::addMutationCommand(const MutationCommand & command)
{
/// Currently only RENAME_COLUMN is applied on the fly.
if (command.type == MutationCommand::Type::RENAME_COLUMN)
rename_map.emplace_back(RenamePair{command.rename_to, command.column_name});
}
bool AlterConversions::columnHasNewName(const std::string & old_name) const
{
for (const auto & [new_name, prev_name] : rename_map)
@ -31,7 +38,6 @@ std::string AlterConversions::getColumnNewName(const std::string & old_name) con
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column {} was not renamed", old_name);
}
bool AlterConversions::isColumnRenamed(const std::string & new_name) const
{
for (const auto & [name_to, name_from] : rename_map)
@ -41,6 +47,7 @@ bool AlterConversions::isColumnRenamed(const std::string & new_name) const
}
return false;
}
/// Get column old name before rename (lookup by key in rename_map)
std::string AlterConversions::getColumnOldName(const std::string & new_name) const
{

View File

@ -1,5 +1,6 @@
#pragma once
#include <Storages/MutationCommands.h>
#include <string>
#include <unordered_map>
@ -7,20 +8,23 @@
namespace DB
{
/// Alter conversions which should be applied on the fly for a part. Built from
/// the most recent mutation commands for the part. Now we have only the rename_map
/// here (from the ALTER_RENAME command), because for all other types of alters
/// we can deduce the conversions for a part from the difference between
/// part->getColumns() and storage->getColumns().
struct AlterConversions
/// Alter conversions which should be applied on the fly for a part.
/// Built from the most recent mutation commands for the part.
/// Now only ALTER RENAME COLUMN is applied.
class AlterConversions : private boost::noncopyable
{
public:
AlterConversions() = default;
struct RenamePair
{
std::string rename_to;
std::string rename_from;
};
/// Rename map new_name -> old_name
std::vector<RenamePair> rename_map;
void addMutationCommand(const MutationCommand & command);
const std::vector<RenamePair> & getRenameMap() const { return rename_map; }
/// Column was renamed (lookup by value in rename_map)
bool columnHasNewName(const std::string & old_name) const;
@ -30,6 +34,12 @@ struct AlterConversions
bool isColumnRenamed(const std::string & new_name) const;
/// Get column old name before rename (lookup by key in rename_map)
std::string getColumnOldName(const std::string & new_name) const;
private:
/// Rename map new_name -> old_name.
std::vector<RenamePair> rename_map;
};
using AlterConversionsPtr = std::shared_ptr<const AlterConversions>;
}
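A usage sketch for the encapsulated class (rename_command is a hypothetical MutationCommand of type RENAME_COLUMN renaming column a to b):

    auto conversions = std::make_shared<AlterConversions>();
    conversions->addMutationCommand(rename_command);   /// records {rename_to: "b", rename_from: "a"}
    if (conversions->isColumnRenamed("b"))
    {
        /// Translate the current name back to the on-disk name in an old part.
        String name_in_part = conversions->getColumnOldName("b"); /// "a"
    }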

View File

@ -6,6 +6,7 @@
#include <Core/NamesAndTypes.h>
#include <Storages/IStorage.h>
#include <Storages/LightweightDeleteDescription.h>
#include <Storages/MergeTree/AlterConversions.h>
#include <Storages/MergeTree/IDataPartStorage.h>
#include <Storages/MergeTree/MergeTreeDataPartState.h>
#include <Storages/MergeTree/MergeTreeIndexGranularity.h>
@ -92,6 +93,7 @@ public:
const MarkRanges & mark_ranges,
UncompressedCache * uncompressed_cache,
MarkCache * mark_cache,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReaderSettings & reader_settings_,
const ValueSizeMap & avg_value_size_hints_,
const ReadBufferFromFileBase::ProfileCallback & profile_callback_) const = 0;

View File

@ -52,7 +52,7 @@ public:
virtual const MergeTreeDataPartChecksums & getChecksums() const = 0;
virtual AlterConversions getAlterConversions() const = 0;
virtual AlterConversionsPtr getAlterConversions() const = 0;
virtual size_t getMarksCount() const = 0;

View File

@ -1,3 +1,4 @@
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeArray.h>
#include <Common/escapeForFileName.h>
@ -5,8 +6,6 @@
#include <Columns/ColumnArray.h>
#include <Interpreters/inplaceBlockConversions.h>
#include <Interpreters/Context.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Common/typeid_cast.h>
namespace DB
@ -133,9 +132,9 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
String IMergeTreeReader::getColumnNameInPart(const NameAndTypePair & required_column) const
{
auto name_in_storage = required_column.getNameInStorage();
if (alter_conversions.isColumnRenamed(name_in_storage))
if (alter_conversions->isColumnRenamed(name_in_storage))
{
name_in_storage = alter_conversions.getColumnOldName(name_in_storage);
name_in_storage = alter_conversions->getColumnOldName(name_in_storage);
return Nested::concatenateName(name_in_storage, required_column.getSubcolumnName());
}

View File

@ -50,8 +50,8 @@ public:
/// Evaluate defaulted columns if necessary.
void evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const;
/// If part metadata is not equal to storage metadata, than
/// try to perform conversions of columns.
/// If part metadata is not equal to storage metadata,
/// then try to perform conversions of columns.
void performRequiredConversions(Columns & res_columns) const;
const NamesAndTypesList & getColumns() const { return requested_columns; }
@ -104,7 +104,7 @@ protected:
private:
/// Alter conversions, which must be applied on the fly if required
AlterConversions alter_conversions;
AlterConversionsPtr alter_conversions;
/// Columns that are requested to read.
NamesAndTypesList requested_columns;

View File

@ -1,4 +1,5 @@
#pragma once
#include <Storages/MergeTree/AlterConversions.h>
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MergeTreeData.h>
@ -9,9 +10,11 @@ namespace DB
class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader
{
public:
explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_)
LoadedMergeTreeDataPartInfoForReader(
MergeTreeData::DataPartPtr data_part_, AlterConversionsPtr alter_conversions_)
: IMergeTreeDataPartInfoForReader(data_part_->storage.getContext())
, data_part(data_part_)
, data_part(std::move(data_part_))
, alter_conversions(std::move(alter_conversions_))
{
}
@ -33,7 +36,7 @@ public:
std::optional<size_t> getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); }
AlterConversions getAlterConversions() const override { return data_part->storage.getAlterConversionsForPart(data_part); }
AlterConversionsPtr getAlterConversions() const override { return alter_conversions; }
String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); }
@ -53,8 +56,11 @@ public:
SerializationPtr getSerialization(const NameAndTypePair & column) const override { return data_part->getSerialization(column.name); }
MergeTreeData::DataPartPtr getDataPart() const { return data_part; }
private:
MergeTreeData::DataPartPtr data_part;
AlterConversionsPtr alter_conversions;
};
}

View File

@ -193,8 +193,9 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical();
auto object_columns = MergeTreeData::getConcreteObjectColumns(global_ctx->future_part->parts, global_ctx->metadata_snapshot->getColumns());
global_ctx->storage_snapshot = std::make_shared<StorageSnapshot>(*global_ctx->data, global_ctx->metadata_snapshot, object_columns);
extendObjectColumns(global_ctx->storage_columns, object_columns, false);
global_ctx->storage_snapshot = std::make_shared<StorageSnapshot>(*global_ctx->data, global_ctx->metadata_snapshot, std::move(object_columns));
extractMergingAndGatheringColumns(
global_ctx->storage_columns,
@ -544,8 +545,8 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const
global_ctx->future_part->parts[part_num],
column_names,
ctx->read_with_direct_io,
true,
false,
/*take_column_types_from_storage=*/ true,
/*quiet=*/ false,
global_ctx->input_rows_filtered);
pipes.emplace_back(std::move(pipe));
@ -896,8 +897,8 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
part,
global_ctx->merging_column_names,
ctx->read_with_direct_io,
true,
false,
/*take_column_types_from_storage=*/ true,
/*quiet=*/ false,
global_ctx->input_rows_filtered);
if (global_ctx->metadata_snapshot->hasSortingKey())

View File

@ -80,49 +80,51 @@ IMergeTreeSelectAlgorithm::IMergeTreeSelectAlgorithm(
result_header = header_without_const_virtual_columns;
injectPartConstVirtualColumns(0, result_header, nullptr, partition_value_type, virt_column_names);
if (prewhere_actions)
LOG_TRACE(log, "PREWHERE condition was split into {} steps: {}", prewhere_actions->steps.size(), prewhere_actions->dumpConditions());
if (!prewhere_actions.steps.empty())
LOG_TRACE(log, "PREWHERE condition was split into {} steps: {}", prewhere_actions.steps.size(), prewhere_actions.dumpConditions());
if (prewhere_info)
LOG_TEST(log, "Original PREWHERE DAG:\n{}\nPREWHERE actions:\n{}",
(prewhere_info->prewhere_actions ? prewhere_info->prewhere_actions->dumpDAG(): std::string("<nullptr>")),
(prewhere_actions ? prewhere_actions->dump() : std::string("<nullptr>")));
(!prewhere_actions.steps.empty() ? prewhere_actions.dump() : std::string("<nullptr>")));
}
bool tryBuildPrewhereSteps(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, PrewhereExprInfo & prewhere);
std::unique_ptr<PrewhereExprInfo> IMergeTreeSelectAlgorithm::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps)
PrewhereExprInfo IMergeTreeSelectAlgorithm::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps)
{
std::unique_ptr<PrewhereExprInfo> prewhere_actions;
PrewhereExprInfo prewhere_actions;
if (prewhere_info)
{
prewhere_actions = std::make_unique<PrewhereExprInfo>();
if (prewhere_info->row_level_filter)
{
PrewhereExprStep row_level_filter_step
{
.type = PrewhereExprStep::Filter,
.actions = std::make_shared<ExpressionActions>(prewhere_info->row_level_filter, actions_settings),
.column_name = prewhere_info->row_level_column_name,
.remove_column = true,
.need_filter = true
.filter_column_name = prewhere_info->row_level_column_name,
.remove_filter_column = true,
.need_filter = true,
.perform_alter_conversions = true,
};
prewhere_actions->steps.emplace_back(std::move(row_level_filter_step));
prewhere_actions.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(row_level_filter_step)));
}
if (!enable_multiple_prewhere_read_steps ||
!tryBuildPrewhereSteps(prewhere_info, actions_settings, *prewhere_actions))
!tryBuildPrewhereSteps(prewhere_info, actions_settings, prewhere_actions))
{
PrewhereExprStep prewhere_step
{
.type = PrewhereExprStep::Filter,
.actions = std::make_shared<ExpressionActions>(prewhere_info->prewhere_actions, actions_settings),
.column_name = prewhere_info->prewhere_column_name,
.remove_column = prewhere_info->remove_prewhere_column,
.need_filter = prewhere_info->need_filter
.filter_column_name = prewhere_info->prewhere_column_name,
.remove_filter_column = prewhere_info->remove_prewhere_column,
.need_filter = prewhere_info->need_filter,
.perform_alter_conversions = true,
};
prewhere_actions->steps.emplace_back(std::move(prewhere_step));
prewhere_actions.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(prewhere_step)));
}
}
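Condensing the two branches above: PrewhereExprInfo is now returned by value and holds its steps as shared pointers, instead of the whole struct being heap-allocated (dag and cond_name are placeholders):

    PrewhereExprStep step
    {
        .type = PrewhereExprStep::Filter,
        .actions = std::make_shared<ExpressionActions>(dag, actions_settings),
        .filter_column_name = cond_name,
        .remove_filter_column = true,
        .need_filter = true,
        .perform_alter_conversions = true,
    };
    PrewhereExprInfo prewhere;
    prewhere.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(step)));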
@ -213,7 +215,7 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForCurrentTask(
reader = task->data_part->getReader(
task->task_columns.columns, metadata_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
reader_settings, value_size_map, profile_callback);
task->alter_conversions, reader_settings, value_size_map, profile_callback);
}
if (!task->pre_reader_for_step.empty())
@ -226,13 +228,15 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForCurrentTask(
else
{
initializeMergeTreePreReadersForPart(
task->data_part, task->task_columns, metadata_snapshot,
task->data_part, task->alter_conversions,
task->task_columns, metadata_snapshot,
task->mark_ranges, value_size_map, profile_callback);
}
}
void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForPart(
MergeTreeData::DataPartPtr & data_part,
const MergeTreeData::DataPartPtr & data_part,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReadTaskColumns & task_columns,
const StorageMetadataPtr & metadata_snapshot,
const MarkRanges & mark_ranges,
@ -242,15 +246,16 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForPart(
reader = data_part->getReader(
task_columns.columns, metadata_snapshot, mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
reader_settings, value_size_map, profile_callback);
alter_conversions, reader_settings, value_size_map, profile_callback);
initializeMergeTreePreReadersForPart(
data_part, task_columns, metadata_snapshot,
data_part, alter_conversions, task_columns, metadata_snapshot,
mark_ranges, value_size_map, profile_callback);
}
void IMergeTreeSelectAlgorithm::initializeMergeTreePreReadersForPart(
MergeTreeData::DataPartPtr & data_part,
const MergeTreeData::DataPartPtr & data_part,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReadTaskColumns & task_columns,
const StorageMetadataPtr & metadata_snapshot,
const MarkRanges & mark_ranges,
@ -266,36 +271,37 @@ void IMergeTreeSelectAlgorithm::initializeMergeTreePreReadersForPart(
data_part->getReader(
{LightweightDeleteDescription::FILTER_COLUMN}, metadata_snapshot,
mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(),
reader_settings, value_size_map, profile_callback));
alter_conversions, reader_settings, value_size_map, profile_callback));
}
if (prewhere_info)
for (const auto & pre_columns_per_step : task_columns.pre_columns)
{
for (const auto & pre_columns_per_step : task_columns.pre_columns)
{
pre_reader_for_step.push_back(
data_part->getReader(
pre_columns_per_step, metadata_snapshot, mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
reader_settings, value_size_map, profile_callback));
}
pre_reader_for_step.push_back(
data_part->getReader(
pre_columns_per_step, metadata_snapshot, mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
alter_conversions, reader_settings, value_size_map, profile_callback));
}
}
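These hunks thread the part's alter conversions through every getReader call site, so the main reader and each per-step pre-reader use the same snapshot captured when the read task was created. A rough sketch of that threading under simplified, assumed signatures (the real getReader takes many more arguments):

#include <cstddef>
#include <memory>
#include <vector>

struct AlterConversions {};
using AlterConversionsPtr = std::shared_ptr<const AlterConversions>;

struct Reader {};
using ReaderPtr = std::unique_ptr<Reader>;

struct Part
{
    /// Assumed shape only, not the real API.
    ReaderPtr getReader(const AlterConversionsPtr & /*alter_conversions*/) const
    {
        return std::make_unique<Reader>();
    }
};

std::vector<ReaderPtr> initReaders(const Part & part, const AlterConversionsPtr & conversions, size_t num_prewhere_steps)
{
    std::vector<ReaderPtr> readers;
    readers.push_back(part.getReader(conversions));      /// main reader
    for (size_t i = 0; i != num_prewhere_steps; ++i)
        readers.push_back(part.getReader(conversions));  /// one pre-reader per PREWHERE step
    return readers;
}

Passing the same conversions object everywhere keeps on-the-fly column renames consistent across all readers of one task.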
void IMergeTreeSelectAlgorithm::initializeRangeReaders(MergeTreeReadTask & current_task)
{
return initializeRangeReadersImpl(
current_task.range_reader, current_task.pre_range_readers, prewhere_info, prewhere_actions.get(),
current_task.range_reader, current_task.pre_range_readers, prewhere_actions,
reader.get(), current_task.data_part->hasLightweightDelete(), reader_settings,
pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names);
}
void IMergeTreeSelectAlgorithm::initializeRangeReadersImpl(
MergeTreeRangeReader & range_reader, std::deque<MergeTreeRangeReader> & pre_range_readers,
PrewhereInfoPtr prewhere_info, const PrewhereExprInfo * prewhere_actions,
IMergeTreeReader * reader, bool has_lightweight_delete, const MergeTreeReaderSettings & reader_settings,
MergeTreeRangeReader & range_reader,
std::deque<MergeTreeRangeReader> & pre_range_readers,
const PrewhereExprInfo & prewhere_actions,
IMergeTreeReader * reader,
bool has_lightweight_delete,
const MergeTreeReaderSettings & reader_settings,
const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
const PrewhereExprStep & lightweight_delete_filter_step, const Names & non_const_virtual_column_names)
const PrewhereExprStep & lightweight_delete_filter_step,
const Names & non_const_virtual_column_names)
{
MergeTreeRangeReader * prev_reader = nullptr;
bool last_reader = false;
@ -310,25 +316,25 @@ void IMergeTreeSelectAlgorithm::initializeRangeReadersImpl(
pre_readers_shift++;
}
if (prewhere_info)
if (prewhere_actions.steps.size() + pre_readers_shift != pre_reader_for_step.size())
{
if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size())
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"PREWHERE steps count mismatch, actions: {}, readers: {}",
prewhere_actions->steps.size(), pre_reader_for_step.size());
}
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"PREWHERE steps count mismatch, actions: {}, readers: {}",
prewhere_actions.steps.size(), pre_reader_for_step.size());
}
for (size_t i = 0; i < prewhere_actions->steps.size(); ++i)
{
last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size());
for (size_t i = 0; i < prewhere_actions.steps.size(); ++i)
{
last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions.steps.size());
MergeTreeRangeReader current_reader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names);
MergeTreeRangeReader current_reader(
pre_reader_for_step[i + pre_readers_shift].get(),
prev_reader, prewhere_actions.steps[i].get(),
last_reader, non_const_virtual_column_names);
pre_range_readers.push_back(std::move(current_reader));
prev_reader = &pre_range_readers.back();
}
pre_range_readers.push_back(std::move(current_reader));
prev_reader = &pre_range_readers.back();
}
if (!last_reader)
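The loop above links one MergeTreeRangeReader per PREWHERE step into a pipeline, each consuming the previous reader's output; the steps themselves are now passed as raw pointers obtained from the shared_ptr elements. A hedged sketch of the chaining idea, with invented minimal types:

#include <cstddef>
#include <deque>

struct StageReader
{
    const StageReader * previous = nullptr;  /// upstream stage in the pipeline
    size_t step_index = 0;
};

std::deque<StageReader> buildChain(size_t num_steps)
{
    /// std::deque is assumed here for the same reason as in the source:
    /// push_back must not invalidate the address taken by the next link.
    std::deque<StageReader> chain;
    const StageReader * prev = nullptr;
    for (size_t i = 0; i != num_steps; ++i)
    {
        chain.push_back(StageReader{prev, i});
        prev = &chain.back();  /// the next reader consumes this one's output
    }
    return chain;
}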

View File

@ -74,7 +74,7 @@ public:
virtual std::string getName() const = 0;
static std::unique_ptr<PrewhereExprInfo> getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps);
static PrewhereExprInfo getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps);
protected:
/// This struct allows returning a block with no columns but with a non-zero number of rows, similar to Chunk
@ -110,8 +110,7 @@ protected:
static void initializeRangeReadersImpl(
MergeTreeRangeReader & range_reader,
std::deque<MergeTreeRangeReader> & pre_range_readers,
PrewhereInfoPtr prewhere_info,
const PrewhereExprInfo * prewhere_actions,
const PrewhereExprInfo & prewhere_actions,
IMergeTreeReader * reader,
bool has_lightweight_delete,
const MergeTreeReaderSettings & reader_settings,
@ -126,7 +125,8 @@ protected:
const ReadBufferFromFileBase::ProfileCallback & profile_callback);
void initializeMergeTreeReadersForPart(
MergeTreeData::DataPartPtr & data_part,
const MergeTreeData::DataPartPtr & data_part,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReadTaskColumns & task_columns,
const StorageMetadataPtr & metadata_snapshot,
const MarkRanges & mark_ranges,
@ -140,10 +140,19 @@ protected:
StorageSnapshotPtr storage_snapshot;
/// This step is added when the part has a lightweight delete mask
const PrewhereExprStep lightweight_delete_filter_step { nullptr, LightweightDeleteDescription::FILTER_COLUMN.name, true, true };
const PrewhereExprStep lightweight_delete_filter_step
{
.type = PrewhereExprStep::Filter,
.actions = nullptr,
.filter_column_name = LightweightDeleteDescription::FILTER_COLUMN.name,
.remove_filter_column = true,
.need_filter = true,
.perform_alter_conversions = true,
};
PrewhereInfoPtr prewhere_info;
ExpressionActionsSettings actions_settings;
std::unique_ptr<PrewhereExprInfo> prewhere_actions;
PrewhereExprInfo prewhere_actions;
UInt64 max_block_size_rows;
UInt64 preferred_block_size_bytes;
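The lightweight_delete_filter_step definition above switches from positional aggregate initialization to designated initializers. The benefit, illustrated with invented toy structs: when a new field such as type is added at the front, positional initialization either stops compiling or silently rebinds values to the wrong fields when adjacent members share a type, while designated initializers keep each value tied to its named member.

struct StepV1 { void * actions; const char * column; bool remove; bool need; };
struct StepV2 { int type; void * actions; const char * column; bool remove; bool need; };

StepV1 v1 { nullptr, "_row_exists", true, true };     /// fine for the old layout
/// StepV2 v2 { nullptr, "_row_exists", true, true }; /// ill-formed: nullptr cannot initialize int
StepV2 v2
{
    .type = 0,
    .actions = nullptr,
    .column = "_row_exists",
    .remove = true,
    .need = true,
};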
@ -195,7 +204,8 @@ private:
/// Initialize pre readers.
void initializeMergeTreePreReadersForPart(
MergeTreeData::DataPartPtr & data_part,
const MergeTreeData::DataPartPtr & data_part,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReadTaskColumns & task_columns,
const StorageMetadataPtr & metadata_snapshot,
const MarkRanges & mark_ranges,

View File

@ -1,6 +1,7 @@
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MergeTreeRangeReader.h>
#include <DataTypes/NestedUtils.h>
#include <Core/NamesAndTypes.h>
#include <Common/checkStackSize.h>
@ -30,7 +31,7 @@ namespace
bool injectRequiredColumnsRecursively(
const String & column_name,
const StorageSnapshotPtr & storage_snapshot,
const AlterConversions & alter_conversions,
const AlterConversionsPtr & alter_conversions,
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const GetColumnsOptions & options,
Names & columns,
@ -46,8 +47,8 @@ bool injectRequiredColumnsRecursively(
if (column_in_storage)
{
auto column_name_in_part = column_in_storage->getNameInStorage();
if (alter_conversions.isColumnRenamed(column_name_in_part))
column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);
if (alter_conversions && alter_conversions->isColumnRenamed(column_name_in_part))
column_name_in_part = alter_conversions->getColumnOldName(column_name_in_part);
auto column_in_part = data_part_info_for_reader.getColumns().tryGetByName(column_name_in_part);
@ -98,13 +99,14 @@ NameSet injectRequiredColumns(
NameSet injected_columns;
bool have_at_least_one_physical_column = false;
AlterConversions alter_conversions;
AlterConversionsPtr alter_conversions;
if (!data_part_info_for_reader.isProjectionPart())
alter_conversions = data_part_info_for_reader.getAlterConversions();
auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
.withExtendedObjects()
.withSystemColumns();
if (with_subcolumns)
options.withSubcolumns();
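Changing the parameter from const AlterConversions & to const AlterConversionsPtr & makes a null pointer a legal state (projection parts carry no conversions), so every use must now be guarded, as the hunk above does. A small sketch of the guarded rename lookup, assuming AlterConversionsPtr is a shared_ptr and using an invented map-based implementation:

#include <map>
#include <memory>
#include <string>

struct AlterConversions
{
    std::map<std::string, std::string> new_to_old;  /// renamed columns, new name -> old name
    bool isColumnRenamed(const std::string & name) const { return new_to_old.contains(name); }
    std::string getColumnOldName(const std::string & name) const { return new_to_old.at(name); }
};

using AlterConversionsPtr = std::shared_ptr<const AlterConversions>;

std::string resolveNameInPart(const AlterConversionsPtr & conversions, std::string name)
{
    /// Before: a reference was passed, so calls were always safe.
    /// After: the pointer may be null, so guard every dereference.
    if (conversions && conversions->isColumnRenamed(name))
        name = conversions->getColumnOldName(name);
    return name;
}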
@ -137,6 +139,7 @@ NameSet injectRequiredColumns(
MergeTreeReadTask::MergeTreeReadTask(
const DataPartPtr & data_part_,
const AlterConversionsPtr & alter_conversions_,
const MarkRanges & mark_ranges_,
size_t part_index_in_query_,
const NameSet & column_name_set_,
@ -146,6 +149,7 @@ MergeTreeReadTask::MergeTreeReadTask(
std::future<MergeTreeReaderPtr> reader_,
std::vector<std::future<MergeTreeReaderPtr>> && pre_reader_for_step_)
: data_part{data_part_}
, alter_conversions{alter_conversions_}
, mark_ranges{mark_ranges_}
, part_index_in_query{part_index_in_query_}
, column_name_set{column_name_set_}
@ -306,10 +310,8 @@ MergeTreeReadTaskColumns getReadTaskColumns(
/// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part
for (const auto & name : system_columns)
{
if (data_part_info_for_reader.getColumns().contains(name))
column_to_read_after_prewhere.push_back(name);
}
/// Inject columns required for defaults evaluation
injectRequiredColumns(
@ -319,44 +321,50 @@ MergeTreeReadTaskColumns getReadTaskColumns(
auto options = GetColumnsOptions(GetColumnsOptions::All)
.withExtendedObjects()
.withSystemColumns();
if (with_subcolumns)
options.withSubcolumns();
NameSet columns_from_previous_steps;
auto add_step = [&](const PrewhereExprStep & step)
{
Names step_column_names = step.actions->getActionsDAG().getRequiredColumnsNames();
injectRequiredColumns(
data_part_info_for_reader, storage_snapshot, with_subcolumns, step_column_names);
Names columns_to_read_in_step;
for (const auto & name : step_column_names)
{
if (columns_from_previous_steps.contains(name))
continue;
columns_to_read_in_step.push_back(name);
columns_from_previous_steps.insert(name);
}
result.pre_columns.push_back(storage_snapshot->getColumnsByNames(options, columns_to_read_in_step));
};
if (prewhere_info)
{
auto prewhere_actions = IMergeTreeSelectAlgorithm::getPrewhereActions(
prewhere_info, actions_settings, reader_settings.enable_multiple_prewhere_read_steps);
prewhere_info,
actions_settings,
reader_settings.enable_multiple_prewhere_read_steps);
NameSet columns_from_previous_steps;
for (const auto & step : prewhere_actions->steps)
{
Names step_column_names = step.actions->getActionsDAG().getRequiredColumnsNames();
injectRequiredColumns(
data_part_info_for_reader, storage_snapshot, with_subcolumns, step_column_names);
Names columns_to_read_in_step;
for (const auto & name : step_column_names)
{
if (columns_from_previous_steps.contains(name))
continue;
columns_to_read_in_step.push_back(name);
columns_from_previous_steps.insert(name);
}
result.pre_columns.push_back(storage_snapshot->getColumnsByNames(options, columns_to_read_in_step));
}
/// Remove columns read in prewhere from the list of columns to read
Names post_column_names;
for (const auto & name : column_to_read_after_prewhere)
if (!columns_from_previous_steps.contains(name))
post_column_names.push_back(name);
column_to_read_after_prewhere = std::move(post_column_names);
for (const auto & step : prewhere_actions.steps)
add_step(*step);
}
/// Remove columns read in prewhere from the list of columns to read
Names post_column_names;
for (const auto & name : column_to_read_after_prewhere)
if (!columns_from_previous_steps.contains(name))
post_column_names.push_back(name);
column_to_read_after_prewhere = std::move(post_column_names);
/// Rest of the requested columns
result.columns = storage_snapshot->getColumnsByNames(options, column_to_read_after_prewhere);
return result;
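The add_step lambda above replaces an inlined loop: for every PREWHERE step it collects the columns the step's actions require, injects columns needed for defaults evaluation, skips anything an earlier step already reads, and the same seen-set is then used to filter the post-PREWHERE column list. A compact sketch of just the deduplication logic, with simplified names standing in for the real types:

#include <string>
#include <unordered_set>
#include <vector>

using Names = std::vector<std::string>;

struct TaskColumns
{
    std::vector<Names> pre_columns;  /// one entry per PREWHERE step
    Names columns;                   /// read after PREWHERE
};

TaskColumns splitColumns(const std::vector<Names> & step_requirements, const Names & after_prewhere)
{
    TaskColumns result;
    std::unordered_set<std::string> seen;

    for (const auto & required : step_requirements)
    {
        Names step_columns;
        for (const auto & name : required)
            if (seen.insert(name).second)  /// skip columns already read by an earlier step
                step_columns.push_back(name);
        result.pre_columns.push_back(std::move(step_columns));
    }

    for (const auto & name : after_prewhere)
        if (!seen.contains(name))          /// drop columns PREWHERE already produced
            result.columns.push_back(name);
    return result;
}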

View File

@ -6,6 +6,7 @@
#include <Storages/MergeTree/RangesInDataPart.h>
#include <Storages/MergeTree/MergeTreeRangeReader.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/AlterConversions.h>
namespace DB
@ -35,7 +36,6 @@ NameSet injectRequiredColumns(
bool with_subcolumns,
Names & columns);
struct MergeTreeReadTaskColumns
{
/// column names to read during WHERE
@ -49,8 +49,10 @@ struct MergeTreeReadTaskColumns
/// A batch of work for MergeTreeThreadSelectProcessor
struct MergeTreeReadTask
{
/// data part which should be read while performing this task
/// Data part which should be read while performing this task
DataPartPtr data_part;
/// Alter conversions that should be applied on the fly for the part.
AlterConversionsPtr alter_conversions;
/// Ranges to read from `data_part`.
MarkRanges mark_ranges;
/// For the `part_index` virtual column
@ -77,6 +79,7 @@ struct MergeTreeReadTask
MergeTreeReadTask(
const DataPartPtr & data_part_,
const AlterConversionsPtr & alter_conversions_,
const MarkRanges & mark_ranges_,
size_t part_index_in_query_,
const NameSet & column_name_set_,

Some files were not shown because too many files have changed in this diff.