Merge remote-tracking branch 'origin/master' into pr-distributed-prefer-localhost-replica

This commit is contained in:
Igor Nikonov 2023-09-09 19:48:50 +00:00
commit 9464433683
200 changed files with 3959 additions and 3523 deletions

View File

@ -76,6 +76,7 @@ jobs:
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

View File

@ -73,6 +73,7 @@ jobs:
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

View File

@ -60,6 +60,7 @@ jobs:
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

View File

@ -53,6 +53,7 @@ jobs:
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

View File

@ -94,6 +94,7 @@ jobs:
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

View File

@ -52,6 +52,7 @@ jobs:
uses: ClickHouse/checkout@v1
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

6
.gitmodules vendored
View File

@ -40,9 +40,6 @@
[submodule "contrib/boost"]
path = contrib/boost
url = https://github.com/ClickHouse/boost
[submodule "contrib/base64"]
path = contrib/base64
url = https://github.com/ClickHouse/Turbo-Base64
[submodule "contrib/arrow"]
path = contrib/arrow
url = https://github.com/ClickHouse/arrow
@ -348,3 +345,6 @@
[submodule "contrib/robin-map"]
path = contrib/robin-map
url = https://github.com/Tessil/robin-map.git
[submodule "contrib/aklomp-base64"]
path = contrib/aklomp-base64
url = https://github.com/aklomp/base64.git

View File

@ -5,9 +5,9 @@ set (CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
set (CMAKE_SYSTEM_NAME "Linux")
set (CMAKE_SYSTEM_PROCESSOR "ppc64le")
set (CMAKE_C_COMPILER_TARGET "ppc64le-linux-gnu")
set (CMAKE_CXX_COMPILER_TARGET "ppc64le-linux-gnu")
set (CMAKE_ASM_COMPILER_TARGET "ppc64le-linux-gnu")
set (CMAKE_C_COMPILER_TARGET "powerpc64le-linux-gnu")
set (CMAKE_CXX_COMPILER_TARGET "powerpc64le-linux-gnu")
set (CMAKE_ASM_COMPILER_TARGET "powerpc64le-linux-gnu")
# Will be changed later, but somehow needed to be set here.
set (CMAKE_AR "ar")

View File

@ -135,7 +135,7 @@ add_contrib (aws-cmake
aws-cmake
)
add_contrib (base64-cmake base64)
add_contrib (aklomp-base64-cmake aklomp-base64)
add_contrib (simdjson-cmake simdjson)
add_contrib (rapidjson-cmake rapidjson)
add_contrib (fastops-cmake fastops)

1
contrib/aklomp-base64 vendored Submodule

@ -0,0 +1 @@
Subproject commit e77bd70bdd860c52c561568cffb251d88bba064c

View File

@ -0,0 +1 @@
config.h

View File

@ -0,0 +1,68 @@
option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES})
if (NOT ENABLE_BASE64)
message(STATUS "Not using base64")
return()
endif()
SET(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/aklomp-base64")
if (ARCH_AMD64)
# These defines enable/disable SIMD codecs in base64's runtime codec dispatch.
# We don't want to limit ourselves --> enable all.
set(HAVE_SSSE3 1)
set(HAVE_SSE41 1)
set(HAVE_SSE42 1)
set(HAVE_AVX 1)
set(HAVE_AVX2 1)
set(HAVE_AVX512 1)
endif ()
if (ARCH_AARCH64)
# The choice of HAVE_NEON* depends on the target machine because base64 provides
# no runtime dispatch on ARM. NEON is only mandatory with the normal build profile.
if(NOT NO_ARMV81_OR_HIGHER)
set(HAVE_NEON64 1)
set(HAVE_NEON32 0)
endif ()
endif ()
configure_file(config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h)
add_library(_base64
"${LIBRARY_DIR}/lib/lib.c"
"${LIBRARY_DIR}/lib/codec_choose.c"
"${LIBRARY_DIR}/lib/tables/tables.c"
"${LIBRARY_DIR}/lib/tables/table_dec_32bit.h"
"${LIBRARY_DIR}/lib/tables/table_enc_12bit.h"
"${LIBRARY_DIR}/lib/codecs.h"
"${CMAKE_CURRENT_BINARY_DIR}/config.h"
"${LIBRARY_DIR}/lib/arch/generic/codec.c"
"${LIBRARY_DIR}/lib/arch/ssse3/codec.c"
"${LIBRARY_DIR}/lib/arch/sse41/codec.c"
"${LIBRARY_DIR}/lib/arch/sse42/codec.c"
"${LIBRARY_DIR}/lib/arch/avx/codec.c"
"${LIBRARY_DIR}/lib/arch/avx2/codec.c"
"${LIBRARY_DIR}/lib/arch/avx512/codec.c"
"${LIBRARY_DIR}/lib/arch/neon32/codec.c"
"${LIBRARY_DIR}/lib/arch/neon64/codec.c"
)
if (ARCH_AMD64)
set_source_files_properties(${LIBRARY_DIR}/lib/arch/ssse3/codec.c PROPERTIES COMPILE_FLAGS "-mssse3")
set_source_files_properties(${LIBRARY_DIR}/lib/arch/sse41/codec.c PROPERTIES COMPILE_FLAGS "-msse4.1")
set_source_files_properties(${LIBRARY_DIR}/lib/arch/sse42/codec.c PROPERTIES COMPILE_FLAGS "-msse4.2")
set_source_files_properties(${LIBRARY_DIR}/lib/arch/avx/codec.c PROPERTIES COMPILE_FLAGS "-mavx")
set_source_files_properties(${LIBRARY_DIR}/lib/arch/avx2/codec.c PROPERTIES COMPILE_FLAGS "-mavx2")
set_source_files_properties(${LIBRARY_DIR}/lib/arch/avx512/codec.c PROPERTIES COMPILE_FLAGS "-mavx512vl -mavx512vbmi")
endif()
target_include_directories(_base64 SYSTEM PUBLIC ${LIBRARY_DIR}/include)
target_include_directories(_base64 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
add_library(ch_contrib::base64 ALIAS _base64)

View File

@ -0,0 +1,9 @@
#cmakedefine01 HAVE_SSSE3
#cmakedefine01 HAVE_SSE41
#cmakedefine01 HAVE_SSE42
#cmakedefine01 HAVE_AVX
#cmakedefine01 HAVE_AVX2
#cmakedefine01 HAVE_AVX512
#cmakedefine01 HAVE_NEON32
#cmakedefine01 HAVE_NEON64

1
contrib/base64 vendored

@ -1 +0,0 @@
Subproject commit 8628e258090f9eb76d90ac3c91e1ab4690e9aa11

View File

@ -1,60 +0,0 @@
if(ARCH_AMD64 OR ARCH_AARCH64 OR ARCH_PPC64LE OR ARCH_S390X)
option (ENABLE_BASE64 "Enable base64" ${ENABLE_LIBRARIES})
elseif(ENABLE_BASE64)
message (${RECONFIGURE_MESSAGE_LEVEL} "base64 library is only supported on x86_64 and aarch64")
endif()
if (NOT ENABLE_BASE64)
message(STATUS "Not using base64")
return()
endif()
SET(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/base64")
add_library(_base64_scalar OBJECT "${LIBRARY_DIR}/turbob64c.c" "${LIBRARY_DIR}/turbob64d.c")
add_library(_base64_ssse3 OBJECT "${LIBRARY_DIR}/turbob64sse.c") # This file also contains code for ARM NEON
if (ARCH_AMD64)
add_library(_base64_avx OBJECT "${LIBRARY_DIR}/turbob64sse.c") # This is not a mistake. One file is compiled twice.
add_library(_base64_avx2 OBJECT "${LIBRARY_DIR}/turbob64avx2.c")
endif ()
target_compile_options(_base64_scalar PRIVATE -falign-loops)
if (ARCH_AMD64)
target_compile_options(_base64_ssse3 PRIVATE -mno-avx -mno-avx2 -mssse3 -falign-loops)
target_compile_options(_base64_avx PRIVATE -falign-loops -mavx)
target_compile_options(_base64_avx2 PRIVATE -falign-loops -mavx2)
else ()
if (ARCH_PPC64LE)
target_compile_options(_base64_ssse3 PRIVATE -D__SSSE3__ -falign-loops)
else()
target_compile_options(_base64_ssse3 PRIVATE -falign-loops)
endif()
endif ()
if (ARCH_AMD64)
add_library(_base64
$<TARGET_OBJECTS:_base64_scalar>
$<TARGET_OBJECTS:_base64_ssse3>
$<TARGET_OBJECTS:_base64_avx>
$<TARGET_OBJECTS:_base64_avx2>)
else ()
add_library(_base64
$<TARGET_OBJECTS:_base64_scalar>
$<TARGET_OBJECTS:_base64_ssse3>)
endif ()
target_include_directories(_base64 SYSTEM PUBLIC ${LIBRARY_DIR})
if (XCODE OR XCODE_VERSION)
# https://gitlab.kitware.com/cmake/cmake/issues/17457
# Some native build systems may not like targets that have only object files, so consider adding at least one real source file
# This applies to Xcode.
if (NOT EXISTS "${CMAKE_CURRENT_BINARY_DIR}/dummy.c")
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/dummy.c" "")
endif ()
target_sources(_base64 PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/dummy.c")
endif ()
add_library(ch_contrib::base64 ALIAS _base64)

2
contrib/libunwind vendored

@ -1 +1 @@
Subproject commit e48aa13f67dc722511b5af33a32ba9b7748176b5
Subproject commit 30cc1d3fd3655a5cfa0ab112fe320fb9fc0a8344

View File

@ -126,7 +126,7 @@ if(ENABLE_OPENSSL_DYNAMIC OR ENABLE_OPENSSL)
elseif(ARCH_PPC64LE)
macro(perl_generate_asm FILE_IN FILE_OUT)
add_custom_command(OUTPUT ${FILE_OUT}
COMMAND /usr/bin/env perl ${FILE_IN} "linux64" ${FILE_OUT})
COMMAND /usr/bin/env perl ${FILE_IN} "linux64v2" ${FILE_OUT})
endmacro()
perl_generate_asm(${OPENSSL_SOURCE_DIR}/crypto/aes/asm/aes-ppc.pl ${OPENSSL_BINARY_DIR}/crypto/aes/aes-ppc.s)

View File

@ -15,8 +15,8 @@ CLICKHOUSE_CI_LOGS_USER=${CLICKHOUSE_CI_LOGS_USER:-ci}
# Pre-configured destination cluster, where to export the data
CLICKHOUSE_CI_LOGS_CLUSTER=${CLICKHOUSE_CI_LOGS_CLUSTER:-system_logs_export}
EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime, check_name LowCardinality(String), instance_type LowCardinality(String), "}
EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"0 AS pull_request_number, '' AS commit_sha, now() AS check_start_time, '' AS check_name, '' AS instance_type"}
EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime, check_name LowCardinality(String), instance_type LowCardinality(String), instance_id String, "}
EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"0 AS pull_request_number, '' AS commit_sha, now() AS check_start_time, '' AS check_name, '' AS instance_type, '' AS instance_id"}
EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "}
function __set_connection_args
@ -125,9 +125,9 @@ function setup_logs_replication
echo 'Create %_log tables'
clickhouse-client --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table
do
# Calculate hash of its structure:
# Calculate hash of its structure. Note: 1 is the version of extra columns - increment it if extra columns are changed:
hash=$(clickhouse-client --query "
SELECT sipHash64(groupArray((name, type)))
SELECT sipHash64(1, groupArray((name, type)))
FROM (SELECT name, type FROM system.columns
WHERE database = 'system' AND table = '$table'
ORDER BY position)

View File

@ -120,7 +120,7 @@ function clone_submodules
contrib/libxml2
contrib/libunwind
contrib/fmtlib
contrib/base64
contrib/aklomp-base64
contrib/cctz
contrib/libcpuid
contrib/libdivide

View File

@ -30,7 +30,7 @@ It may lack support for new features.
## Usage {#cli_usage}
The client can be used in interactive and non-interactive (batch) mode.
The client can be used in interactive and non-interactive (batch) mode.
### Gather your connection details
<ConnectionDetails />
@ -177,8 +177,8 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
- `--user, -u` The username. Default value: default.
- `--password` The password. Default value: empty string.
- `--ask-password` - Prompt the user to enter a password.
- `--query, -q` The query to process when using non-interactive mode. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` file path with queries to execute. Cannot be used simultaneously with `--query`.
- `--query, -q` The query to process when using non-interactive mode. `--query` can be specified multiple times, e.g. `--query "SELECT 1" --query "SELECT 2"`. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` file path with queries to execute. `--queries-file` can be specified multiple times, e.g. `--query queries1.sql --query queries2.sql`. Cannot be used simultaneously with `--query`.
- `--multiquery, -n` If specified, multiple queries separated by semicolons can be listed after the `--query` option. For convenience, it is also possible to omit `--query` and pass the queries directly after `--multiquery`.
- `--multiline, -m` If specified, allow multiline queries (do not send the query on Enter).
- `--database, -d` Select the current default database. Default value: the current database from the server settings (default by default).

View File

@ -18,6 +18,8 @@ $ curl 'http://localhost:8123/'
Ok.
```
Also see: [HTTP response codes caveats](#http_response_codes_caveats).
Sometimes, `curl` command is not available on user operating systems. On Ubuntu or Debian, run `sudo apt install curl`. Please refer this [documentation](https://curl.se/download.html) to install it before running the examples.
Web UI can be accessed here: `http://localhost:8123/play`.
@ -323,6 +325,27 @@ $ curl -sS 'http://localhost:8123/?max_result_bytes=4000000&buffer_size=3000000&
Use buffering to avoid situations where a query processing error occurred after the response code and HTTP headers were sent to the client. In this situation, an error message is written at the end of the response body, and on the client-side, the error can only be detected at the parsing stage.
## HTTP response codes caveats {#http_response_codes_caveats}
Because of limitation of HTTP protocol, HTTP 200 response code does not guarantee that a query was successful.
Here is an example:
```
curl -v -Ss "http://localhost:8123/?max_block_size=1&query=select+sleepEachRow(0.001),throwIf(number=2)from+numbers(5)"
* Trying 127.0.0.1:8123...
...
< HTTP/1.1 200 OK
...
Code: 395. DB::Exception: Value passed to 'throwIf' function is non-zero: while executing 'FUNCTION throwIf(equals(number, 2) :: 1) -> throwIf(equals(number, 2))
```
The reason for this behavior is the nature of the HTTP protocol. The HTTP header is sent first with an HTTP code of 200, followed by the HTTP body, and then the error is injected into the body as plain text.
This behavior is independent of the format used, whether it's `Native`, `TSV`, or `JSON`; the error message will always be in the middle of the response stream.
You can mitigate this problem by enabling `wait_end_of_query=1` ([Response Buffering](#response-buffering)). In this case, the sending of the HTTP header is delayed until the entire query is resolved.
However, this does not completely solve the problem because the result must still fit within the `http_response_buffer_size`, and other settings like `send_progress_in_http_headers` can interfere with the delay of the header.
The only way to catch all errors is to analyze the HTTP body before parsing it using the required format.
### Queries with Parameters {#cli-queries-with-parameters}
You can create a query with parameters and pass values for them from the corresponding HTTP request parameters. For more information, see [Queries with Parameters for CLI](../interfaces/cli.md#cli-queries-with-parameters).

View File

@ -11,6 +11,8 @@ ClickHouse runs sampling profiler that allows analyzing query execution. Using p
Query profiler is automatically enabled in ClickHouse Cloud and you can run a sample query as follows
:::note If you are running the following query in ClickHouse Cloud, make sure to change `FROM system.trace_log` to `FROM clusterAllReplicas(default, system.trace_log)` to select from all nodes of the cluster :::
``` sql
SELECT
count(),

View File

@ -1,7 +1,7 @@
---
slug: /en/operations/system-tables/licenses
---
# licenses
# licenses
Contains licenses of third-party libraries that are located in the [contrib](https://github.com/ClickHouse/ClickHouse/tree/master/contrib) directory of ClickHouse sources.
@ -20,21 +20,10 @@ SELECT library_name, license_type, license_path FROM system.licenses LIMIT 15
``` text
┌─library_name───────┬─license_type─┬─license_path────────────────────────┐
│ FastMemcpy │ MIT │ /contrib/FastMemcpy/LICENSE │
│ arrow │ Apache │ /contrib/arrow/LICENSE.txt │
│ avro │ Apache │ /contrib/avro/LICENSE.txt │
│ aws-c-common │ Apache │ /contrib/aws-c-common/LICENSE │
│ aws-c-event-stream │ Apache │ /contrib/aws-c-event-stream/LICENSE │
│ aws-checksums │ Apache │ /contrib/aws-checksums/LICENSE │
│ aws │ Apache │ /contrib/aws/LICENSE.txt │
│ base64 │ BSD 2-clause │ /contrib/base64/LICENSE │
│ boost │ Boost │ /contrib/boost/LICENSE_1_0.txt │
│ base64 │ BSD 2-clause │ /contrib/aklomp-base64/LICENSE │
│ brotli │ MIT │ /contrib/brotli/LICENSE │
│ capnproto │ MIT │ /contrib/capnproto/LICENSE │
│ cassandra │ Apache │ /contrib/cassandra/LICENSE.txt │
│ cctz │ Apache │ /contrib/cctz/LICENSE.txt │
│ cityhash102 │ MIT │ /contrib/cityhash102/COPYING │
│ cppkafka │ BSD 2-clause │ /contrib/cppkafka/LICENSE │
│ [...] │ [...] │ [...] │
└────────────────────┴──────────────┴─────────────────────────────────────┘
```

View File

@ -202,8 +202,8 @@ Arguments:
- `-S`, `--structure` — table structure for input data.
- `--input-format` — input format, `TSV` by default.
- `-f`, `--file` — path to data, `stdin` by default.
- `-q`, `--query` — queries to execute with `;` as delimiter. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` - file path with queries to execute. Cannot be used simultaneously with `--query`.
- `-q`, `--query` — queries to execute with `;` as delimiter. `--query` can be specified multiple times, e.g. `--query "SELECT 1" --query "SELECT 2"`. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` - file path with queries to execute. `--queries-file` can be specified multiple times, e.g. `--query queries1.sql --query queries2.sql`. Cannot be used simultaneously with `--query`.
- `--multiquery, -n` If specified, multiple queries separated by semicolons can be listed after the `--query` option. For convenience, it is also possible to omit `--query` and pass the queries directly after `--multiquery`.
- `-N`, `--table` — table name where to put output data, `table` by default.
- `--format`, `--output-format` — output format, `TSV` by default.

View File

@ -7,6 +7,10 @@ sidebar_position: 30
The result is equal to the square root of [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md).
:::note
Alias:
- `STD`
- `STDDEV_POP`
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevPopStable` function. It works slower but provides a lower computational error.
:::

View File

@ -7,6 +7,8 @@ sidebar_position: 31
The result is equal to the square root of [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md).
:::note
Alias: `STDDEV_SAMP`.
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevSampStable` function. It works slower but provides a lower computational error.
:::

View File

@ -9,6 +9,8 @@ Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x
In other words, dispersion for a set of values. Returns `Float64`.
:::note
Alias: `VAR_POP`.
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error.
:::

View File

@ -11,6 +11,8 @@ It represents an unbiased estimate of the variance of a random variable if passe
Returns `Float64`. When `n <= 1`, returns `+∞`.
:::note
Alias: `VAR_SAMP`.
:::note
This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error.
:::

View File

@ -237,6 +237,11 @@ type_samoa: DateTime('US/Samoa')
int32samoa: 1546300800
```
**See Also**
- [formatDateTime](#date_time_functions-formatDateTime) - supports non-constant timezone.
- [toString](type-conversion-functions.md#tostring) - supports non-constant timezone.
## timeZoneOf
Returns the timezone name of [DateTime](../../sql-reference/data-types/datetime.md) or [DateTime64](../../sql-reference/data-types/datetime64.md) data types.
@ -1494,6 +1499,33 @@ Result:
└─────────────────────────────────────────────────────────────────────┘
```
Additionally, the `formatDateTime` function can take a third String argument containing the name of the time zone. Example: `Asia/Istanbul`. In this case, the time is formatted according to the specified time zone.
**Example**
```sql
SELECT
now() AS ts,
time_zone,
formatDateTime(ts, '%T', time_zone) AS str_tz_time
FROM system.time_zones
WHERE time_zone LIKE 'Europe%'
LIMIT 10
┌──────────────────ts─┬─time_zone─────────┬─str_tz_time─┐
│ 2023-09-08 19:13:40 │ Europe/Amsterdam │ 21:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Andorra │ 21:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Astrakhan │ 23:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Athens │ 22:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Belfast │ 20:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Belgrade │ 21:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Berlin │ 21:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Bratislava │ 21:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Brussels │ 21:13:40 │
│ 2023-09-08 19:13:40 │ Europe/Bucharest │ 22:13:40 │
└─────────────────────┴───────────────────┴─────────────┘
```
**See Also**
- [formatDateTimeInJodaSyntax](##formatDateTimeInJodaSyntax)

View File

@ -204,7 +204,7 @@ Other possible results:
Query:
```sql
SELECT detectLanguageMixed('Je pense que je ne parviendrai jamais à parler français comme un natif. Where theres a will, theres a way.');
SELECT detectLanguage('Je pense que je ne parviendrai jamais à parler français comme un natif. Where theres a will, theres a way.');
```
Result:

View File

@ -892,16 +892,29 @@ Query:
``` sql
SELECT
now() AS now_local,
toString(now(), 'Asia/Yekaterinburg') AS now_yekat;
now() AS ts,
time_zone,
toString(ts, time_zone) AS str_tz_datetime
FROM system.time_zones
WHERE time_zone LIKE 'Europe%'
LIMIT 10
```
Result:
```response
┌───────────now_local─┬─now_yekat───────────┐
│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │
└─────────────────────┴─────────────────────┘
┌──────────────────ts─┬─time_zone─────────┬─str_tz_datetime─────┐
│ 2023-09-08 19:14:59 │ Europe/Amsterdam │ 2023-09-08 21:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Andorra │ 2023-09-08 21:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Astrakhan │ 2023-09-08 23:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Athens │ 2023-09-08 22:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Belfast │ 2023-09-08 20:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Belgrade │ 2023-09-08 21:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Berlin │ 2023-09-08 21:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Bratislava │ 2023-09-08 21:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Brussels │ 2023-09-08 21:14:59 │
│ 2023-09-08 19:14:59 │ Europe/Bucharest │ 2023-09-08 22:14:59 │
└─────────────────────┴───────────────────┴─────────────────────┘
```
Also see the `toUnixTimestamp` function.

View File

@ -57,3 +57,9 @@ Output of a removed comment:
│ │
└─────────┘
```
**Caveats**
For Replicated tables, the comment can be different on different replicas. Modifying the comment applies to a single replica.
The feature is available since version 23.9. It does not work in previous ClickHouse versions.

View File

@ -391,19 +391,19 @@ DEFLATE_QPL is not available in ClickHouse Cloud.
### Specialized Codecs
These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation.
These codecs are designed to make compression more effective by exploiting specific features of the data. Some of these codecs do not compress data themself, they instead preprocess the data such that a second compression stage using a general-purpose codec can achieve a higher data compression rate.
#### Delta
`Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, its 1. Delta is a data preparation codec, i.e. cannot be used stand-alone.
`Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, its 1. Delta is a data preparation codec, i.e. it cannot be used stand-alone.
#### DoubleDelta
`DoubleDelta(bytes_size)` — Calculates delta of deltas and writes it in compact binary form. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, its 1. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-bit deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). DoubleDelta is a data preparation codec, i.e. cannot be used stand-alone.
`DoubleDelta(bytes_size)` — Calculates delta of deltas and writes it in compact binary form. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, its 1. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-bit deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). DoubleDelta is a data preparation codec, i.e. it cannot be used stand-alone.
#### GCD
`GCD()` - - Calculates the greatest common denominator (GCD) of the values in the column, then divides each value by the GCD. Can be used with integer, decimal and date/time columns. A viable use case are timestamps or monetary values with high precision. GCD is a data preparation codec, i.e. cannot be used stand-alone.
`GCD()` - - Calculates the greatest common denominator (GCD) of the values in the column, then divides each value by the GCD. Can be used with integer, decimal and date/time columns. The codec is well suited for columns with values that change (increase or decrease) in multiples of the GCD, e.g. 24, 28, 16, 24, 8, 24 (GCD = 4). GCD is a data preparation codec, i.e. it cannot be used stand-alone.
#### Gorilla

View File

@ -128,7 +128,7 @@ $ clickhouse-client --param_tbl="numbers" --param_db="system" --param_col="numbe
- `--port` — порт для подключения, по умолчанию — 9000. Обратите внимание: для HTTP-интерфейса и нативного интерфейса используются разные порты.
- `--user, -u` — имя пользователя, по умолчанию — default.
- `--password` — пароль, по умолчанию — пустая строка.
- `--query, -q` — запрос для выполнения, при использовании в неинтерактивном режиме.
- `--query, -q` — запрос для выполнения, при использовании в неинтерактивном режиме. Допускается указание `--query` несколько раз (`--query "SELECT 1;" --query "SELECT 2;"...`).
- `--queries-file` - путь к файлу с запросами для выполнения. Необходимо указать только одну из опций: `query` или `queries-file`.
- `--database, -d` — выбрать текущую БД. Без указания значение берется из настроек сервера (по умолчанию — БД default).
- `--multiline, -m` — если указано — разрешить многострочные запросы, не отправлять запрос по нажатию Enter.

View File

@ -20,21 +20,10 @@ SELECT library_name, license_type, license_path FROM system.licenses LIMIT 15
``` text
┌─library_name───────┬─license_type─┬─license_path────────────────────────┐
│ FastMemcpy │ MIT │ /contrib/FastMemcpy/LICENSE │
│ arrow │ Apache │ /contrib/arrow/LICENSE.txt │
│ avro │ Apache │ /contrib/avro/LICENSE.txt │
│ aws-c-common │ Apache │ /contrib/aws-c-common/LICENSE │
│ aws-c-event-stream │ Apache │ /contrib/aws-c-event-stream/LICENSE │
│ aws-checksums │ Apache │ /contrib/aws-checksums/LICENSE │
│ aws │ Apache │ /contrib/aws/LICENSE.txt │
│ base64 │ BSD 2-clause │ /contrib/base64/LICENSE │
│ boost │ Boost │ /contrib/boost/LICENSE_1_0.txt │
│ base64 │ BSD 2-clause │ /contrib/aklomp-base64/LICENSE │
│ brotli │ MIT │ /contrib/brotli/LICENSE │
│ capnproto │ MIT │ /contrib/capnproto/LICENSE │
│ cassandra │ Apache │ /contrib/cassandra/LICENSE.txt │
│ cctz │ Apache │ /contrib/cctz/LICENSE.txt │
│ cityhash102 │ MIT │ /contrib/cityhash102/COPYING │
│ cppkafka │ BSD 2-clause │ /contrib/cppkafka/LICENSE │
│ [...] │ [...] │ [...] │
└────────────────────┴──────────────┴─────────────────────────────────────┘
```

View File

@ -116,7 +116,7 @@ $ clickhouse-client --param_tuple_in_tuple="(10, ('dt', 10))" -q "SELECT * FROM
- `--port` 连接的端口默认值9000。注意HTTP接口以及TCP原生接口使用的是不同端口。
- `--user, -u` 用户名。 默认值:`default`。
- `--password` 密码。 默认值:空字符串。
- `--query, -q` 使用非交互模式查询。
- `--query, -q` 使用非交互模式查询。 允许多次指定 `--query``--query "SELECT 1;" --query "SELECT 2;"...`)。
- `--database, -d` 默认当前操作的数据库. 默认值:服务端默认的配置(默认是`default`)。
- `--multiline, -m` 如果指定允许多行语句查询Enter仅代表换行不代表查询语句完结
- `--multiquery, -n` 如果指定, 允许处理用`;`号分隔的多个查询,只在非交互模式下生效。

View File

@ -20,21 +20,9 @@ SELECT library_name, license_type, license_path FROM system.licenses LIMIT 15
``` text
┌─library_name───────┬─license_type─┬─license_path────────────────────────┐
│ FastMemcpy │ MIT │ /contrib/FastMemcpy/LICENSE │
│ arrow │ Apache │ /contrib/arrow/LICENSE.txt │
│ avro │ Apache │ /contrib/avro/LICENSE.txt │
│ aws-c-common │ Apache │ /contrib/aws-c-common/LICENSE │
│ aws-c-event-stream │ Apache │ /contrib/aws-c-event-stream/LICENSE │
│ aws-checksums │ Apache │ /contrib/aws-checksums/LICENSE │
│ aws │ Apache │ /contrib/aws/LICENSE.txt │
│ base64 │ BSD 2-clause │ /contrib/base64/LICENSE │
│ boost │ Boost │ /contrib/boost/LICENSE_1_0.txt │
│ base64 │ BSD 2-clause │ /contrib/aklomp-base64/LICENSE │
│ brotli │ MIT │ /contrib/brotli/LICENSE │
│ capnproto │ MIT │ /contrib/capnproto/LICENSE │
│ cassandra │ Apache │ /contrib/cassandra/LICENSE.txt │
│ cctz │ Apache │ /contrib/cctz/LICENSE.txt │
│ cityhash102 │ MIT │ /contrib/cityhash102/COPYING │
│ cppkafka │ BSD 2-clause │ /contrib/cppkafka/LICENSE │
│ [...] │ [...] │ [...] │
└────────────────────┴──────────────┴─────────────────────────────────────┘
```

View File

@ -1189,7 +1189,7 @@ void Client::processOptions(const OptionsDescription & options_description,
void Client::processConfig()
{
if (config().has("query") && config().has("queries-file"))
if (!queries.empty() && config().has("queries-file"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Options '--query' and '--queries-file' cannot be specified at the same time");
/// Batch mode is enabled if one of the following is true:
@ -1200,9 +1200,9 @@ void Client::processConfig()
/// - --queries-file command line option is present.
/// The value of the option is used as file with query (or of multiple queries) to execute.
delayed_interactive = config().has("interactive") && (config().has("query") || config().has("queries-file"));
delayed_interactive = config().has("interactive") && (!queries.empty() || config().has("queries-file"));
if (stdin_is_a_tty
&& (delayed_interactive || (!config().has("query") && queries_files.empty())))
&& (delayed_interactive || (queries.empty() && queries_files.empty())))
{
is_interactive = true;
}

View File

@ -319,7 +319,7 @@ static bool checkIfStdinIsRegularFile()
std::string LocalServer::getInitialCreateTableQuery()
{
if (!config().has("table-structure") && !config().has("table-file") && !config().has("table-data-format") && (!checkIfStdinIsRegularFile() || !config().has("query")))
if (!config().has("table-structure") && !config().has("table-file") && !config().has("table-data-format") && (!checkIfStdinIsRegularFile() || queries.empty()))
return {};
auto table_name = backQuoteIfNeed(config().getString("table-name", "table"));
@ -461,7 +461,7 @@ try
if (first_time)
{
if (queries_files.empty() && !config().has("query"))
if (queries_files.empty() && queries.empty())
{
std::cerr << "\033[31m" << "ClickHouse compiled in fuzzing mode." << "\033[0m" << std::endl;
std::cerr << "\033[31m" << "You have to provide a query with --query or --queries-file option." << "\033[0m" << std::endl;
@ -473,7 +473,7 @@ try
#else
is_interactive = stdin_is_a_tty
&& (config().hasOption("interactive")
|| (!config().has("query") && !config().has("table-structure") && queries_files.empty() && !config().has("table-file")));
|| (queries.empty() && !config().has("table-structure") && queries_files.empty() && !config().has("table-file")));
#endif
if (!is_interactive)
{
@ -569,10 +569,10 @@ void LocalServer::updateLoggerLevel(const String & logs_level)
void LocalServer::processConfig()
{
if (config().has("query") && config().has("queries-file"))
if (!queries.empty() && config().has("queries-file"))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Options '--query' and '--queries-file' cannot be specified at the same time");
delayed_interactive = config().has("interactive") && (config().has("query") || config().has("queries-file"));
delayed_interactive = config().has("interactive") && (!queries.empty() || config().has("queries-file"));
if (is_interactive && !delayed_interactive)
{
if (config().has("multiquery"))

View File

@ -19,6 +19,7 @@ void registerAggregateFunctionsStatisticsSecondMoment(AggregateFunctionFactory &
factory.registerAlias("VAR_POP", "varPop", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("STDDEV_SAMP", "stddevSamp", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("STDDEV_POP", "stddevPop", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("STD", "stddevPop", AggregateFunctionFactory::CaseInsensitive);
}
}

View File

@ -459,6 +459,10 @@ struct AnalysisOfVarianceMoments
void add(T value, size_t group)
{
if (group == std::numeric_limits<size_t>::max())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too many groups for analysis of variance (should be no more than {}, got {})",
MAX_GROUPS_NUMBER, group);
resizeIfNeeded(group + 1);
xs1[group] += value;
xs2[group] += value * value;

View File

@ -1,6 +1,9 @@
#include <Backups/BackupUtils.h>
#include <Backups/DDLAdjustingForBackupVisitor.h>
#include <Access/Common/AccessRightsElement.h>
#include <Databases/DDLRenamingVisitor.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/formatAST.h>
#include <Interpreters/DatabaseCatalog.h>
#include <Common/setThreadName.h>
@ -95,4 +98,26 @@ AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements &
return required_access;
}
bool compareRestoredTableDef(const IAST & restored_table_create_query, const IAST & create_query_from_backup, const ContextPtr & global_context)
{
auto adjust_before_comparison = [&](const IAST & query) -> ASTPtr
{
auto new_query = query.clone();
adjustCreateQueryForBackup(new_query, global_context, nullptr);
ASTCreateQuery & create = typeid_cast<ASTCreateQuery &>(*new_query);
create.setUUID({});
create.if_not_exists = false;
return new_query;
};
ASTPtr query1 = adjust_before_comparison(restored_table_create_query);
ASTPtr query2 = adjust_before_comparison(create_query_from_backup);
return serializeAST(*query1) == serializeAST(*query2);
}
/// Checks the definition of a restored database against its definition in the backup.
/// Database definitions are normalized and compared with exactly the same rules as table
/// definitions, so this simply delegates to compareRestoredTableDef().
bool compareRestoredDatabaseDef(const IAST & restored_database_create_query, const IAST & create_query_from_backup, const ContextPtr & global_context)
{
    return compareRestoredTableDef(restored_database_create_query, create_query_from_backup, global_context);
}
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <Parsers/ASTBackupQuery.h>
#include <Interpreters/Context_fwd.h>
namespace DB
@ -12,8 +13,11 @@ class DDLRenamingMap;
/// Initializes a DDLRenamingMap from a BACKUP or RESTORE query.
DDLRenamingMap makeRenamingMapFromBackupQuery(const ASTBackupQuery::Elements & elements);
/// Returns access required to execute BACKUP query.
AccessRightsElements getRequiredAccessToBackup(const ASTBackupQuery::Elements & elements);
/// Checks the definition of a restored table - it must correspond to the definition from the backup.
bool compareRestoredTableDef(const IAST & restored_table_create_query, const IAST & create_query_from_backup, const ContextPtr & global_context);
bool compareRestoredDatabaseDef(const IAST & restored_database_create_query, const IAST & create_query_from_backup, const ContextPtr & global_context);
}

View File

@ -81,9 +81,6 @@ namespace
void visitCreateQuery(ASTCreateQuery & create, const DDLAdjustingForBackupVisitor::Data & data)
{
create.uuid = UUIDHelpers::Nil;
create.to_inner_uuid = UUIDHelpers::Nil;
if (create.storage)
visitStorage(*create.storage, data);
}

View File

@ -7,6 +7,7 @@ namespace DB
{
class Exception;
enum class UserDefinedSQLObjectType;
class ASTCreateQuery;
/// Replicas use this class to coordinate what they're reading from a backup while executing RESTORE ON CLUSTER.
/// There are two implementation of this interface: RestoreCoordinationLocal and RestoreCoordinationRemote.
@ -40,10 +41,13 @@ public:
/// The function returns false if user-defined function at a specified zk path are being already restored by another replica.
virtual bool acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type) = 0;
/// Generates a new UUID for a table. The same UUID must be used for a replicated table on each replica,
/// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
virtual void generateUUIDForTable(ASTCreateQuery & create_query) = 0;
/// This function is used to check if concurrent restores are running
/// other than the restore passed to the function
virtual bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const = 0;
};
}

View File

@ -1,4 +1,5 @@
#include <Backups/RestoreCoordinationLocal.h>
#include <Parsers/formatAST.h>
#include <Common/logger_useful.h>
@ -51,6 +52,39 @@ bool RestoreCoordinationLocal::acquireReplicatedSQLObjects(const String &, UserD
return true;
}
/// Assigns UUIDs to a CREATE query, reusing previously generated UUIDs for an identical query.
/// The cache is keyed by the serialized form of the query *before* any UUIDs are assigned,
/// so repeated calls with the same original definition always end up with the same UUIDs.
void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_query)
{
    String query_str = serializeAST(create_query);

    /// Looks up `query_str` in the cache; on a hit, applies the cached UUIDs to `create_query`.
    /// Must be called with `mutex` held.
    auto find_in_map = [&]
    {
        auto it = create_query_uuids.find(query_str);
        if (it != create_query_uuids.end())
        {
            create_query.setUUID(it->second);
            return true;
        }
        return false;
    };

    {
        std::lock_guard lock{mutex};
        if (find_in_map())
            return;
    }

    /// Generate new UUIDs outside the lock.
    /// (The previous version also re-serialized the query here into an unused local - removed.)
    auto new_uuids = create_query.generateRandomUUID(/* always_generate_new_uuid= */ true);

    {
        std::lock_guard lock{mutex};
        /// Double-checked: another thread may have inserted UUIDs for the same query meanwhile;
        /// if so, adopt theirs so all callers stay consistent.
        if (find_in_map())
            return;
        create_query_uuids[query_str] = new_uuids;
    }
}
bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
{
if (num_active_restores > 1)

View File

@ -1,6 +1,7 @@
#pragma once
#include <Backups/IRestoreCoordination.h>
#include <Parsers/ASTCreateQuery.h>
#include <mutex>
#include <set>
#include <unordered_set>
@ -39,6 +40,10 @@ public:
/// The function returns false if user-defined function at a specified zk path are being already restored by another replica.
bool acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type) override;
/// Generates a new UUID for a table. The same UUID must be used for a replicated table on each replica,
/// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
void generateUUIDForTable(ASTCreateQuery & create_query) override;
bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
private:
@ -46,6 +51,8 @@ private:
std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases;
std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables;
std::unordered_map<String, ASTCreateQuery::UUIDs> create_query_uuids;
mutable std::mutex mutex;
};

View File

@ -2,6 +2,8 @@
#include <Backups/BackupCoordinationStage.h>
#include <Backups/RestoreCoordinationRemote.h>
#include <Backups/BackupCoordinationStageSync.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/formatAST.h>
#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/escapeForFileName.h>
@ -87,6 +89,7 @@ void RestoreCoordinationRemote::createRootNodes()
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_tables_data_acquired", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access_storages_acquired", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects_acquired", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/table_uuids", "", zkutil::CreateMode::Persistent));
zk->tryMulti(ops, responses);
});
}
@ -231,6 +234,33 @@ bool RestoreCoordinationRemote::acquireReplicatedSQLObjects(const String & loade
return result;
}
/// Assigns UUIDs to a CREATE query, coordinating through ZooKeeper so that every replica
/// restoring the same table definition ends up with the same UUIDs (required for the
/// "{uuid}" macro in replicated table paths to resolve identically on each replica).
void RestoreCoordinationRemote::generateUUIDForTable(ASTCreateQuery & create_query)
{
    /// Serialize the query before assigning UUIDs: this is the coordination key in ZooKeeper.
    String query_str = serializeAST(create_query);
    String new_uuids_str = create_query.generateRandomUUID(/* always_generate_new_uuid= */ true).toString();

    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
    holder.retries_ctl.retryLoop(
        [&, &zk = holder.faulty_zookeeper]()
        {
            with_retries.renewZooKeeper(zk);

            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);

            /// We won the race: our freshly generated UUIDs are now the agreed-upon ones.
            if (res == Coordination::Error::ZOK)
                return;

            /// Another replica published UUIDs for this table first - adopt them.
            if (res == Coordination::Error::ZNODEEXISTS)
            {
                create_query.setUUID(ASTCreateQuery::UUIDs::fromString(zk->get(path)));
                return;
            }

            /// Any other error must be propagated. (The previous code constructed the
            /// exception without `throw`, silently discarding the failure.)
            throw zkutil::KeeperException::fromPath(res, path);
        });
}
void RestoreCoordinationRemote::removeAllNodes()
{
/// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore.

View File

@ -46,6 +46,10 @@ public:
/// The function returns false if user-defined function at a specified zk path are being already restored by another replica.
bool acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type) override;
/// Generates a new UUID for a table. The same UUID must be used for a replicated table on each replica,
/// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
void generateUUIDForTable(ASTCreateQuery & create_query) override;
bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
private:

View File

@ -571,12 +571,14 @@ void RestorerFromBackup::createDatabase(const String & database_name) const
if (database_info.is_predefined_database)
return;
auto create_database_query = database_info.create_database_query;
if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists)
{
create_database_query = create_database_query->clone();
create_database_query->as<ASTCreateQuery &>().if_not_exists = true;
}
auto create_database_query = typeid_cast<std::shared_ptr<ASTCreateQuery>>(database_info.create_database_query->clone());
/// Generate a new UUID for a database.
/// The generated UUID will be ignored if the database does not support UUIDs.
restore_coordination->generateUUIDForTable(*create_database_query);
/// Add the clause `IF NOT EXISTS` if that is specified in the restore settings.
create_database_query->if_not_exists = (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists);
LOG_TRACE(log, "Creating database {}: {}", backQuoteIfNeed(database_name), serializeAST(*create_database_query));
@ -605,17 +607,17 @@ void RestorerFromBackup::checkDatabase(const String & database_name)
if (!restore_settings.allow_different_database_def && !database_info.is_predefined_database)
{
/// Check that the database's definition is the same as expected.
ASTPtr create_database_query = database->getCreateDatabaseQuery();
adjustCreateQueryForBackup(create_database_query, context->getGlobalContext(), nullptr);
ASTPtr expected_create_query = database_info.create_database_query;
if (serializeAST(*create_database_query) != serializeAST(*expected_create_query))
ASTPtr existing_database_def = database->getCreateDatabaseQuery();
ASTPtr database_def_from_backup = database_info.create_database_query;
if (!compareRestoredDatabaseDef(*existing_database_def, *database_def_from_backup, context->getGlobalContext()))
{
throw Exception(
ErrorCodes::CANNOT_RESTORE_DATABASE,
"The database has a different definition: {} "
"comparing to its definition in the backup: {}",
serializeAST(*create_database_query),
serializeAST(*expected_create_query));
serializeAST(*existing_database_def),
serializeAST(*database_def_from_backup));
}
}
}
@ -714,20 +716,23 @@ void RestorerFromBackup::createTable(const QualifiedTableName & table_name)
if (table_info.is_predefined_table)
return;
auto create_table_query = table_info.create_table_query;
if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists)
{
create_table_query = create_table_query->clone();
create_table_query->as<ASTCreateQuery &>().if_not_exists = true;
}
auto create_table_query = typeid_cast<std::shared_ptr<ASTCreateQuery>>(table_info.create_table_query->clone());
/// Generate a new UUID for a table (the same table on different hosts must use the same UUID, `restore_coordination` will make it so).
/// The generated UUID will be ignored if the database does not support UUIDs.
restore_coordination->generateUUIDForTable(*create_table_query);
/// Add the clause `IF NOT EXISTS` if that is specified in the restore settings.
create_table_query->if_not_exists = (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists);
LOG_TRACE(
log, "Creating {}: {}", tableNameWithTypeToString(table_name.database, table_name.table, false), serializeAST(*create_table_query));
try
{
DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_name.database);
table_info.database = database;
if (!table_info.database)
table_info.database = DatabaseCatalog::instance().getDatabase(table_name.database);
DatabasePtr database = table_info.database;
/// Execute CREATE TABLE query (we call IDatabase::createTableRestoredFromBackup() to allow the database to do some
/// database-specific things).
@ -747,37 +752,33 @@ void RestorerFromBackup::createTable(const QualifiedTableName & table_name)
void RestorerFromBackup::checkTable(const QualifiedTableName & table_name)
{
auto & table_info = table_infos.at(table_name);
auto database = table_info.database;
try
{
if (!database)
{
database = DatabaseCatalog::instance().getDatabase(table_name.database);
table_info.database = database;
}
auto resolved_id = (table_name.database == DatabaseCatalog::TEMPORARY_DATABASE)
? context->resolveStorageID(StorageID{"", table_name.table}, Context::ResolveExternal)
: context->resolveStorageID(StorageID{table_name.database, table_name.table}, Context::ResolveGlobal);
if (!table_info.database)
table_info.database = DatabaseCatalog::instance().getDatabase(table_name.database);
DatabasePtr database = table_info.database;
StoragePtr storage = database->getTable(resolved_id.table_name, context);
table_info.storage = storage;
table_info.table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
if (!restore_settings.allow_different_table_def && !table_info.is_predefined_table)
{
ASTPtr create_table_query = database->getCreateTableQuery(resolved_id.table_name, context);
adjustCreateQueryForBackup(create_table_query, context->getGlobalContext(), nullptr);
ASTPtr expected_create_query = table_info.create_table_query;
if (serializeAST(*create_table_query) != serializeAST(*expected_create_query))
ASTPtr existing_table_def = database->getCreateTableQuery(resolved_id.table_name, context);
ASTPtr table_def_from_backup = table_info.create_table_query;
if (!compareRestoredTableDef(*existing_table_def, *table_def_from_backup, context->getGlobalContext()))
{
throw Exception(
ErrorCodes::CANNOT_RESTORE_TABLE,
"The table has a different definition: {} "
"comparing to its definition in the backup: {}",
serializeAST(*create_table_query),
serializeAST(*expected_create_query));
serializeAST(*existing_table_def),
serializeAST(*table_def_from_backup));
}
}
}

View File

@ -2495,23 +2495,34 @@ void ClientBase::runNonInteractive()
return;
}
String text;
if (config().has("query"))
if (!queries.empty())
{
text += config().getRawString("query"); /// Poco configuration should not process substitutions in form of ${...} inside query.
for (const auto & query : queries)
{
if (query_fuzzer_runs)
{
if (!processWithFuzzing(query))
return;
}
else
{
if (!processQueryText(query))
return;
}
}
}
else
{
/// If 'query' parameter is not set, read a query from stdin.
/// The query is read entirely into memory (streaming is disabled).
ReadBufferFromFileDescriptor in(STDIN_FILENO);
String text;
readStringUntilEOF(text, in);
if (query_fuzzer_runs)
processWithFuzzing(text);
else
processQueryText(text);
}
if (query_fuzzer_runs)
processWithFuzzing(text);
else
processQueryText(text);
}
@ -2680,8 +2691,8 @@ void ClientBase::init(int argc, char ** argv)
stderr_is_a_tty = isatty(STDERR_FILENO);
terminal_width = getTerminalWidth();
Arguments common_arguments{""}; /// 0th argument is ignored.
std::vector<Arguments> external_tables_arguments;
Arguments common_arguments = {""}; /// 0th argument is ignored.
std::vector<Arguments> hosts_and_ports_arguments;
readArguments(argc, argv, common_arguments, external_tables_arguments, hosts_and_ports_arguments);
@ -2699,7 +2710,6 @@ void ClientBase::init(int argc, char ** argv)
}
po::variables_map options;
OptionsDescription options_description;
options_description.main_description.emplace(createOptionsDescription("Main options", terminal_width));
@ -2711,9 +2721,8 @@ void ClientBase::init(int argc, char ** argv)
("config-file,C", po::value<std::string>(), "config-file path")
("query,q", po::value<std::string>(), "query")
("queries-file", po::value<std::vector<std::string>>()->multitoken(),
"file path with queries to execute; multiple files can be specified (--queries-file file1 file2...)")
("query,q", po::value<std::vector<std::string>>()->multitoken(), R"(query; can be specified multiple times (--query "SELECT 1" --query "SELECT 2"...))")
("queries-file", po::value<std::vector<std::string>>()->multitoken(), "file path with queries to execute; multiple files can be specified (--queries-file file1 file2...)")
("multiquery,n", "If specified, multiple queries separated by semicolons can be listed after --query. For convenience, it is also possible to omit --query and pass the queries directly after --multiquery.")
("multiline,m", "If specified, allow multiline queries (do not send the query on Enter)")
("database,d", po::value<std::string>(), "database")
@ -2734,8 +2743,7 @@ void ClientBase::init(int argc, char ** argv)
("log-level", po::value<std::string>(), "log level")
("server_logs_file", po::value<std::string>(), "put server logs into specified file")
("suggestion_limit", po::value<int>()->default_value(10000),
"Suggestion limit for how many databases, tables and columns to fetch.")
("suggestion_limit", po::value<int>()->default_value(10000), "Suggestion limit for how many databases, tables and columns to fetch.")
("format,f", po::value<std::string>(), "default output format")
("vertical,E", "vertical output format, same as --format=Vertical or FORMAT Vertical or \\G at end of command")
@ -2773,6 +2781,7 @@ void ClientBase::init(int argc, char ** argv)
std::transform(external_options.begin(), external_options.end(), std::back_inserter(cmd_options), getter);
}
po::variables_map options;
parseAndCheckOptions(options_description, options, common_arguments);
po::notify(options);
@ -2800,7 +2809,7 @@ void ClientBase::init(int argc, char ** argv)
if (options.count("time"))
print_time_to_stderr = true;
if (options.count("query"))
config().setString("query", options["query"].as<std::string>());
queries = options["query"].as<std::vector<std::string>>();
if (options.count("query_id"))
config().setString("query_id", options["query_id"].as<std::string>());
if (options.count("database"))

View File

@ -202,6 +202,7 @@ protected:
std::optional<Suggest> suggest;
bool load_suggestions = false;
std::vector<String> queries; /// Queries passed via '--query'
std::vector<String> queries_files; /// If not empty, queries will be read from these files
std::vector<String> interleave_queries_files; /// If not empty, run queries from these files before processing every file from 'queries_files'.
std::vector<String> cmd_options;

View File

@ -5,7 +5,6 @@
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/WriteHelpers.h>
namespace DB

View File

@ -3,14 +3,8 @@
#include <Compression/CompressionFactory.h>
#include <base/unaligned.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/WriteHelpers.h>
#include "Common/Exception.h"
#include "DataTypes/IDataType.h"
#include "base/Decimal_fwd.h"
#include "base/types.h"
#include "config.h"
#include <boost/integer/common_factor.hpp>
#include <libdivide-config.h>
@ -84,7 +78,7 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
const char * const source_end = source + source_size;
T gcd_divider{};
T gcd_divider = 0;
const auto * cur_source = source;
while (gcd_divider != T(1) && cur_source < source_end)
{
@ -100,7 +94,7 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
if constexpr (sizeof(T) <= 8)
{
/// libdivide support only UInt32 and UInt64.
/// libdivide supports only UInt32 and UInt64.
using LibdivideT = std::conditional_t<sizeof(T) <= 4, UInt32, UInt64>;
libdivide::divider<LibdivideT> divider(static_cast<LibdivideT>(gcd_divider));
cur_source = source;
@ -126,8 +120,6 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest)
template <typename T>
void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size)
{
const char * const output_end = dest + output_size;
if (source_size % sizeof(T) != 0)
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is not aligned to {}", source_size, sizeof(T));
@ -135,11 +127,14 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest,
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is less than {}", source_size, sizeof(T));
const char * const source_end = source + source_size;
const char * const dest_end = dest + output_size;
const T gcd_multiplier = unalignedLoad<T>(source);
source += sizeof(T);
while (source < source_end)
{
if (dest + sizeof(T) > output_end) [[unlikely]]
if (dest + sizeof(T) > dest_end) [[unlikely]]
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data");
unalignedStore<T>(dest, unalignedLoad<T>(source) * gcd_multiplier);

View File

@ -8,53 +8,34 @@
# include <Functions/FunctionHelpers.h>
# include <Functions/IFunction.h>
# include <Interpreters/Context_fwd.h>
# include <turbob64.h>
# include <libbase64.h>
# include <Common/MemorySanitizer.h>
# include <cstddef>
# include <span>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int INCORRECT_DATA;
}
namespace Detail
{
inline size_t base64Decode(const std::span<const UInt8> src, UInt8 * dst)
{
# if defined(__aarch64__)
return tb64sdec(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
# else
return _tb64d(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
# endif
}
}
struct Base64Encode
{
static constexpr auto name = "base64Encode";
static size_t getBufferSize(const size_t string_length, const size_t string_count)
static size_t getBufferSize(size_t string_length, size_t string_count)
{
return ((string_length - string_count) / 3 + string_count) * 4 + string_count;
}
static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
static size_t perform(const std::span<const UInt8> src, UInt8 * dst)
{
/*
* Some bug in sse arm64 implementation?
* `base64Encode(repeat('a', 46))` returns wrong padding character
*/
# if defined(__aarch64__)
return tb64senc(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
# else
return _tb64e(reinterpret_cast<const uint8_t *>(src.data()), src.size(), reinterpret_cast<uint8_t *>(dst));
# endif
size_t outlen = 0;
base64_encode(reinterpret_cast<const char *>(src.data()), src.size(), reinterpret_cast<char *>(dst), &outlen, 0);
return outlen;
}
};
@ -62,15 +43,17 @@ struct Base64Decode
{
static constexpr auto name = "base64Decode";
static size_t getBufferSize(const size_t string_length, const size_t string_count)
static size_t getBufferSize(size_t string_length, size_t string_count)
{
return ((string_length - string_count) / 4 + string_count) * 3 + string_count;
}
static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
static size_t perform(const std::span<const UInt8> src, UInt8 * dst)
{
const auto outlen = Detail::base64Decode(src, dst);
if (src.size() > 0 && !outlen)
size_t outlen = 0;
int rc = base64_decode(reinterpret_cast<const char *>(src.data()), src.size(), reinterpret_cast<char *>(dst), &outlen, 0);
if (rc != 1)
throw Exception(
ErrorCodes::INCORRECT_DATA,
"Failed to {} input '{}'",
@ -85,17 +68,16 @@ struct TryBase64Decode
{
static constexpr auto name = "tryBase64Decode";
static size_t getBufferSize(const size_t string_length, const size_t string_count)
static size_t getBufferSize(size_t string_length, size_t string_count)
{
return Base64Decode::getBufferSize(string_length, string_count);
}
static size_t performCoding(const std::span<const UInt8> src, UInt8 * dst)
static size_t perform(const std::span<const UInt8> src, UInt8 * dst)
{
if (src.empty())
return 0;
size_t outlen = 0;
base64_decode(reinterpret_cast<const char *>(src.data()), src.size(), reinterpret_cast<char *>(dst), &outlen, 0);
const auto outlen = Detail::base64Decode(src, dst);
// during decoding character array can be partially polluted
// if fail, revert back and clean
if (!outlen)
@ -119,20 +101,16 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.size() != 1)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Wrong number of arguments for function {}: 1 expected.", getName());
FunctionArgumentDescriptors mandatory_arguments{
{"value", &isStringOrFixedString<IDataType>, nullptr, "String or FixedString"}
};
if (!WhichDataType(arguments[0].type).isStringOrFixedString())
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of 1st argument of function {}. Must be FixedString or String.",
arguments[0].type->getName(),
getName());
validateFunctionArgumentTypes(*this, arguments, mandatory_arguments);
return std::make_shared<DataTypeString>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, const size_t input_rows_count) const override
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const auto & input_column = arguments[0].column;
if (const auto * src_column_as_fixed_string = checkAndGetColumn<ColumnFixedString>(*input_column))
@ -148,7 +126,7 @@ public:
}
private:
static ColumnPtr execute(const ColumnString & src_column, const size_t src_row_count)
static ColumnPtr execute(const ColumnString & src_column, size_t src_row_count)
{
auto dst_column = ColumnString::create();
auto & dst_chars = dst_column->getChars();
@ -169,7 +147,7 @@ private:
for (size_t row = 0; row < src_row_count; ++row)
{
const size_t src_length = src_offsets[row] - src_offset_prev - 1;
const auto outlen = Func::performCoding({src, src_length}, dst_pos);
const auto outlen = Func::perform({src, src_length}, dst_pos);
/// Base64 library is using AVX-512 with some shuffle operations.
/// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
@ -188,7 +166,7 @@ private:
return dst_column;
}
static ColumnPtr execute(const ColumnFixedString & src_column, const size_t src_row_count)
static ColumnPtr execute(const ColumnFixedString & src_column, size_t src_row_count)
{
auto dst_column = ColumnString::create();
auto & dst_chars = dst_column->getChars();
@ -207,7 +185,7 @@ private:
for (size_t row = 0; row < src_row_count; ++row)
{
const auto outlen = Func::performCoding({src, src_n}, dst_pos);
const auto outlen = Func::perform({src, src_n}, dst_pos);
/// Base64 library is using AVX-512 with some shuffle operations.
/// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle.
@ -225,6 +203,7 @@ private:
return dst_column;
}
};
}
#endif

View File

@ -58,6 +58,7 @@
#include <Common/HashTable/HashMap.h>
#include <DataTypes/DataTypeIPv4andIPv6.h>
#include <Common/IPv6ToBinary.h>
#include "DataTypes/IDataType.h"
#include <Core/Types.h>
@ -87,7 +88,6 @@ namespace ErrorCodes
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NOT_IMPLEMENTED;
extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN;
extern const int CANNOT_PARSE_BOOL;
}
@ -884,75 +884,179 @@ struct ConvertImpl<FromDataType, DataTypeString, Name, ConvertDefaultBehaviorTag
static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/)
{
ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column);
const auto & col_with_type_and_name = columnGetNested(arguments[0]);
const auto & type = static_cast<const FromDataType &>(*col_with_type_and_name.type);
const DateLUTImpl * time_zone = nullptr;
if constexpr (std::is_same_v<FromDataType, DataTypeDate> || std::is_same_v<FromDataType, DataTypeDate32>)
time_zone = &DateLUT::instance();
/// For argument of Date or DateTime type, second argument with time zone could be specified.
if constexpr (std::is_same_v<FromDataType, DataTypeDateTime> || std::is_same_v<FromDataType, DataTypeDateTime64>)
if constexpr (IsDataTypeDateOrDateTime<FromDataType>)
{
auto non_null_args = createBlockWithNestedColumns(arguments);
time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0);
}
auto datetime_arg = arguments[0];
if (const auto col_from = checkAndGetColumn<ColVecType>(col_with_type_and_name.column.get()))
{
auto col_to = ColumnString::create();
const DateLUTImpl * time_zone = nullptr;
const ColumnConst * time_zone_column = nullptr;
const typename ColVecType::Container & vec_from = col_from->getData();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
size_t size = vec_from.size();
if constexpr (std::is_same_v<FromDataType, DataTypeDate>)
data_to.resize(size * (strlen("YYYY-MM-DD") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDate32>)
data_to.resize(size * (strlen("YYYY-MM-DD") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime>)
data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime64>)
data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1));
else
data_to.resize(size * 3); /// Arbitrary
offsets_to.resize(size);
WriteBufferFromVector<ColumnString::Chars> write_buffer(data_to);
if (null_map)
if (arguments.size() == 1)
{
for (size_t i = 0; i < size; ++i)
auto non_null_args = createBlockWithNestedColumns(arguments);
time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0);
}
else /// When we have a column for timezone
{
datetime_arg.column = datetime_arg.column->convertToFullColumnIfConst();
if constexpr (std::is_same_v<FromDataType, DataTypeDate> || std::is_same_v<FromDataType, DataTypeDate32>)
time_zone = &DateLUT::instance();
/// For argument of Date or DateTime type, second argument with time zone could be specified.
if constexpr (std::is_same_v<FromDataType, DataTypeDateTime> || std::is_same_v<FromDataType, DataTypeDateTime64>)
{
bool is_ok = FormatImpl<FromDataType>::template execute<bool>(vec_from[i], write_buffer, &type, time_zone);
null_map->getData()[i] |= !is_ok;
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
if ((time_zone_column = checkAndGetColumnConst<ColumnString>(arguments[1].column.get())))
{
auto non_null_args = createBlockWithNestedColumns(arguments);
time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0);
}
}
}
else
const auto & col_with_type_and_name = columnGetNested(datetime_arg);
if (const auto col_from = checkAndGetColumn<ColVecType>(col_with_type_and_name.column.get()))
{
for (size_t i = 0; i < size; ++i)
auto col_to = ColumnString::create();
const typename ColVecType::Container & vec_from = col_from->getData();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
size_t size = vec_from.size();
if constexpr (std::is_same_v<FromDataType, DataTypeDate>)
data_to.resize(size * (strlen("YYYY-MM-DD") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDate32>)
data_to.resize(size * (strlen("YYYY-MM-DD") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime>)
data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime64>)
data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1));
else
data_to.resize(size * 3); /// Arbitrary
offsets_to.resize(size);
WriteBufferFromVector<ColumnString::Chars> write_buffer(data_to);
const auto & type = static_cast<const FromDataType &>(*col_with_type_and_name.type);
ColumnUInt8::MutablePtr null_map = copyNullMap(datetime_arg.column);
if (null_map)
{
FormatImpl<FromDataType>::template execute<void>(vec_from[i], write_buffer, &type, time_zone);
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
for (size_t i = 0; i < size; ++i)
{
if (!time_zone_column && arguments.size() > 1)
{
if (!arguments[1].column.get()->getDataAt(i).toString().empty())
time_zone = &DateLUT::instance(arguments[1].column.get()->getDataAt(i).toString());
else
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty");
}
bool is_ok = FormatImpl<FromDataType>::template execute<bool>(vec_from[i], write_buffer, &type, time_zone);
null_map->getData()[i] |= !is_ok;
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
}
}
else
{
for (size_t i = 0; i < size; ++i)
{
if (!time_zone_column && arguments.size() > 1)
{
if (!arguments[1].column.get()->getDataAt(i).toString().empty())
time_zone = &DateLUT::instance(arguments[1].column.get()->getDataAt(i).toString());
else
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty");
}
FormatImpl<FromDataType>::template execute<void>(vec_from[i], write_buffer, &type, time_zone);
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
}
}
write_buffer.finalize();
if (null_map)
return ColumnNullable::create(std::move(col_to), std::move(null_map));
return col_to;
}
write_buffer.finalize();
if (null_map)
return ColumnNullable::create(std::move(col_to), std::move(null_map));
return col_to;
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
arguments[0].column->getName(), Name::name);
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
arguments[0].column->getName(), Name::name);
{
ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column);
const auto & col_with_type_and_name = columnGetNested(arguments[0]);
const auto & type = static_cast<const FromDataType &>(*col_with_type_and_name.type);
const DateLUTImpl * time_zone = nullptr;
if constexpr (std::is_same_v<FromDataType, DataTypeDate> || std::is_same_v<FromDataType, DataTypeDate32>)
time_zone = &DateLUT::instance();
/// For argument of Date or DateTime type, second argument with time zone could be specified.
if constexpr (std::is_same_v<FromDataType, DataTypeDateTime> || std::is_same_v<FromDataType, DataTypeDateTime64>)
{
auto non_null_args = createBlockWithNestedColumns(arguments);
time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0);
}
if (const auto col_from = checkAndGetColumn<ColVecType>(col_with_type_and_name.column.get()))
{
auto col_to = ColumnString::create();
const typename ColVecType::Container & vec_from = col_from->getData();
ColumnString::Chars & data_to = col_to->getChars();
ColumnString::Offsets & offsets_to = col_to->getOffsets();
size_t size = vec_from.size();
if constexpr (std::is_same_v<FromDataType, DataTypeDate>)
data_to.resize(size * (strlen("YYYY-MM-DD") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDate32>)
data_to.resize(size * (strlen("YYYY-MM-DD") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime>)
data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1));
else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime64>)
data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1));
else
data_to.resize(size * 3); /// Arbitrary
offsets_to.resize(size);
WriteBufferFromVector<ColumnString::Chars> write_buffer(data_to);
if (null_map)
{
for (size_t i = 0; i < size; ++i)
{
bool is_ok = FormatImpl<FromDataType>::template execute<bool>(vec_from[i], write_buffer, &type, time_zone);
null_map->getData()[i] |= !is_ok;
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
}
}
else
{
for (size_t i = 0; i < size; ++i)
{
FormatImpl<FromDataType>::template execute<void>(vec_from[i], write_buffer, &type, time_zone);
writeChar(0, write_buffer);
offsets_to[i] = write_buffer.count();
}
}
write_buffer.finalize();
if (null_map)
return ColumnNullable::create(std::move(col_to), std::move(null_map));
return col_to;
}
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
arguments[0].column->getName(), Name::name);
}
}
};
@ -1598,19 +1702,7 @@ struct ConvertImplGenericFromString
const auto & val = col_from_string->getDataAt(i);
ReadBufferFromMemory read_buffer(val.data, val.size);
try
{
serialization_from.deserializeWholeText(column_to, read_buffer, format_settings);
}
catch (const Exception & e)
{
if (e.code() == ErrorCodes::CANNOT_PARSE_BOOL && typeid_cast<ColumnNullable *>(&column_to))
{
column_to.insertDefault();
continue;
}
throw;
}
serialization_from.deserializeWholeText(column_to, read_buffer, format_settings);
if (!read_buffer.eof())
{
@ -1867,7 +1959,7 @@ public:
// toDateTime64(value, scale : Integer[, timezone: String])
|| std::is_same_v<ToDataType, DataTypeDateTime64>)
{
optional_args.push_back({"timezone", &isString<IDataType>, &isColumnConst, "const String"});
optional_args.push_back({"timezone", &isString<IDataType>, nullptr, "String"});
}
validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);
@ -1931,7 +2023,9 @@ public:
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
if constexpr (std::is_same_v<ToDataType, DataTypeDateTime64>)
if constexpr (std::is_same_v<ToDataType, DataTypeString>)
return {};
else if constexpr (std::is_same_v<ToDataType, DataTypeDateTime64>)
return {2};
return {1};
}
@ -4067,21 +4161,15 @@ private:
{
if constexpr (std::is_same_v<ToDataType, DataTypeIPv4>)
{
ret = [cast_ipv4_ipv6_default_on_conversion_error_value,
input_format_ipv4_default_on_conversion_error_value,
requested_result_is_nullable](
ColumnsWithTypeAndName & arguments,
const DataTypePtr & result_type,
const ColumnNullable * column_nullable,
size_t) -> ColumnPtr
ret = [cast_ipv4_ipv6_default_on_conversion_error_value, input_format_ipv4_default_on_conversion_error_value, requested_result_is_nullable](
ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t)
-> ColumnPtr
{
if (!WhichDataType(result_type).isIPv4())
throw Exception(ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv4", result_type->getName());
const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr;
if (requested_result_is_nullable)
return convertToIPv4<IPStringToNumExceptionMode::Null>(arguments[0].column, null_map);
else if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv4_default_on_conversion_error_value)
if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv4_default_on_conversion_error_value || requested_result_is_nullable)
return convertToIPv4<IPStringToNumExceptionMode::Default>(arguments[0].column, null_map);
else
return convertToIPv4<IPStringToNumExceptionMode::Throw>(arguments[0].column, null_map);
@ -4092,22 +4180,16 @@ private:
if constexpr (std::is_same_v<ToDataType, DataTypeIPv6>)
{
ret = [cast_ipv4_ipv6_default_on_conversion_error_value,
input_format_ipv6_default_on_conversion_error_value,
requested_result_is_nullable](
ColumnsWithTypeAndName & arguments,
const DataTypePtr & result_type,
const ColumnNullable * column_nullable,
size_t) -> ColumnPtr
ret = [cast_ipv4_ipv6_default_on_conversion_error_value, input_format_ipv6_default_on_conversion_error_value, requested_result_is_nullable](
ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t)
-> ColumnPtr
{
if (!WhichDataType(result_type).isIPv6())
throw Exception(
ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv6", result_type->getName());
const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr;
if (requested_result_is_nullable)
return convertToIPv6<IPStringToNumExceptionMode::Null>(arguments[0].column, null_map);
else if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv6_default_on_conversion_error_value)
if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv6_default_on_conversion_error_value || requested_result_is_nullable)
return convertToIPv6<IPStringToNumExceptionMode::Default>(arguments[0].column, null_map);
else
return convertToIPv6<IPStringToNumExceptionMode::Throw>(arguments[0].column, null_map);
@ -4118,18 +4200,7 @@ private:
if (to_type->getCustomSerialization() && to_type->getCustomName())
{
ret = [requested_result_is_nullable](
ColumnsWithTypeAndName & arguments,
const DataTypePtr & result_type,
const ColumnNullable * column_nullable,
size_t input_rows_count) -> ColumnPtr
{
auto wrapped_result_type = result_type;
if (requested_result_is_nullable)
wrapped_result_type = makeNullable(result_type);
return ConvertImplGenericFromString<typename FromDataType::ColumnType>::execute(
arguments, wrapped_result_type, column_nullable, input_rows_count);
};
ret = &ConvertImplGenericFromString<typename FromDataType::ColumnType>::execute;
return true;
}
}
@ -4144,9 +4215,7 @@ private:
ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv4", result_type->getName());
const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr;
if (requested_result_is_nullable)
return convertIPv6ToIPv4<IPStringToNumExceptionMode::Null>(arguments[0].column, null_map);
else if (cast_ipv4_ipv6_default_on_conversion_error_value)
if (cast_ipv4_ipv6_default_on_conversion_error_value || requested_result_is_nullable)
return convertIPv6ToIPv4<IPStringToNumExceptionMode::Default>(arguments[0].column, null_map);
else
return convertIPv6ToIPv4<IPStringToNumExceptionMode::Throw>(arguments[0].column, null_map);

View File

@ -7,7 +7,6 @@ namespace DB
{
REGISTER_FUNCTION(Base64Decode)
{
tb64ini(0, 0);
factory.registerFunction<FunctionBase64Conversion<Base64Decode>>();
/// MysQL compatibility alias.

View File

@ -7,7 +7,6 @@ namespace DB
{
REGISTER_FUNCTION(Base64Encode)
{
tb64ini(0, 0);
factory.registerFunction<FunctionBase64Conversion<Base64Encode>>();
/// MysQL compatibility alias.

View File

@ -746,7 +746,7 @@ public:
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
@ -855,17 +855,25 @@ public:
template <typename DataType>
ColumnPtr executeType(const ColumnsWithTypeAndName & arguments, const DataTypePtr &) const
{
auto * times = checkAndGetColumn<typename DataType::ColumnType>(arguments[0].column.get());
auto non_const_datetime = arguments[0].column->convertToFullColumnIfConst();
auto * times = checkAndGetColumn<typename DataType::ColumnType>(non_const_datetime.get());
if (!times)
return nullptr;
const ColumnConst * format_column = checkAndGetColumnConst<ColumnString>(arguments[1].column.get());
if (!format_column)
String format;
if (const auto * format_column = checkAndGetColumnConst<ColumnString>(arguments[1].column.get()))
format = format_column->getValue<String>();
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of second ('format') argument of function {}. Must be constant string.",
arguments[1].column->getName(), getName());
String format = format_column->getValue<String>();
const ColumnConst * const_time_zone_column = nullptr;
const DateLUTImpl * time_zone = nullptr;
if (arguments.size() == 2)
time_zone = &extractTimeZoneFromFunctionArguments(arguments, 2, 0);
else if (arguments.size() > 2)
const_time_zone_column = checkAndGetColumnConst<ColumnString>(arguments[2].column.get());
UInt32 scale [[maybe_unused]] = 0;
if constexpr (std::is_same_v<DataType, DataTypeDateTime64>)
@ -893,15 +901,19 @@ public:
String out_template;
size_t out_template_size = parseFormat(format, instructions, scale, mysql_with_only_fixed_length_formatters, out_template);
const DateLUTImpl * time_zone_tmp = nullptr;
if (castType(arguments[0].type.get(), [&]([[maybe_unused]] const auto & type) { return true; }))
time_zone_tmp = &extractTimeZoneFromFunctionArguments(arguments, 2, 0);
{
if (const_time_zone_column)
time_zone = &extractTimeZoneFromFunctionArguments(arguments, 2, 0);
}
else if (std::is_same_v<DataType, DataTypeDateTime64> || std::is_same_v<DataType, DataTypeDateTime>)
time_zone_tmp = &extractTimeZoneFromFunctionArguments(arguments, 2, 0);
{
if (const_time_zone_column)
time_zone = &extractTimeZoneFromFunctionArguments(arguments, 2, 0);
}
else
time_zone_tmp = &DateLUT::instance();
time_zone = &DateLUT::instance();
const DateLUTImpl & time_zone = *time_zone_tmp;
const auto & vec = times->getData();
auto col_res = ColumnString::create();
@ -941,6 +953,13 @@ public:
auto * pos = begin;
for (size_t i = 0; i < vec.size(); ++i)
{
if (!const_time_zone_column && arguments.size() > 2)
{
if (!arguments[2].column.get()->getDataAt(i).toString().empty())
time_zone = &DateLUT::instance(arguments[2].column.get()->getDataAt(i).toString());
else
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty");
}
if constexpr (std::is_same_v<DataType, DataTypeDateTime64>)
{
auto c = DecimalUtils::split(vec[i], scale);
@ -954,12 +973,14 @@ public:
}
for (auto & instruction : instructions)
instruction.perform(pos, static_cast<Int64>(c.whole), c.fractional, scale, time_zone);
{
instruction.perform(pos, static_cast<Int64>(c.whole), c.fractional, scale, *time_zone);
}
}
else
{
for (auto & instruction : instructions)
instruction.perform(pos, static_cast<UInt32>(vec[i]), 0, 0, time_zone);
instruction.perform(pos, static_cast<UInt32>(vec[i]), 0, 0, *time_zone);
}
*pos++ = '\0';

View File

@ -238,8 +238,15 @@ ReturnType readFloatTextPreciseImpl(T & x, ReadBuffer & buf)
++num_copied_chars;
}
auto res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x);
fast_float::from_chars_result res;
if constexpr (std::endian::native == std::endian::little)
res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x);
else
{
Float64 x64 = 0.0;
res = fast_float::from_chars(tmp_buf, tmp_buf + num_copied_chars, x64);
x = static_cast<T>(x64);
}
if (unlikely(res.ec != std::errc()))
{
if constexpr (throw_exception)

View File

@ -38,7 +38,7 @@ ContextMutablePtr updateSettingsForCluster(bool interserver_mode,
ContextPtr context,
const Settings & settings,
const StorageID & main_table,
const SelectQueryInfo * query_info,
ASTPtr additional_filter_ast,
Poco::Logger * log)
{
Settings new_settings = settings;
@ -115,11 +115,11 @@ ContextMutablePtr updateSettingsForCluster(bool interserver_mode,
///
/// Here we don't try to analyze setting again. In case if query_info->additional_filter_ast is not empty, some filter was applied.
/// It's just easier to add this filter for a source table.
if (query_info && query_info->additional_filter_ast)
if (additional_filter_ast)
{
Tuple tuple;
tuple.push_back(main_table.getShortName());
tuple.push_back(queryToString(query_info->additional_filter_ast));
tuple.push_back(queryToString(additional_filter_ast));
new_settings.additional_table_filters.value.push_back(std::move(tuple));
}
@ -174,7 +174,8 @@ void executeQuery(
std::vector<QueryPlanPtr> plans;
SelectStreamFactory::Shards remote_shards;
auto new_context = updateSettingsForCluster(!query_info.getCluster()->getSecret().empty(), context, settings, main_table, &query_info, log);
auto new_context = updateSettingsForCluster(!not_optimized_cluster->getSecret().empty(), context, settings,
main_table, query_info.additional_filter_ast, log);
new_context->increaseDistributedDepth();
size_t shards = query_info.getCluster()->getShardCount();
@ -269,7 +270,7 @@ void executeQueryWithParallelReplicas(
SelectStreamFactory & stream_factory,
const ASTPtr & query_ast,
ContextPtr context,
const SelectQueryInfo & query_info,
std::shared_ptr<const StorageLimitsList> storage_limits,
const ClusterPtr & not_optimized_cluster)
{
const auto & settings = context->getSettingsRef();
@ -333,7 +334,7 @@ void executeQueryWithParallelReplicas(
std::move(scalars),
std::move(external_tables),
&Poco::Logger::get("ReadFromParallelRemoteReplicasStep"),
query_info.storage_limits);
std::move(storage_limits));
query_plan.addStep(std::move(read_from_remote));
}

View File

@ -20,6 +20,9 @@ using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;
struct StorageID;
struct StorageLimits;
using StorageLimitsList = std::list<StorageLimits>;
namespace ClusterProxy
{
@ -38,7 +41,7 @@ ContextMutablePtr updateSettingsForCluster(bool interserver_mode,
ContextPtr context,
const Settings & settings,
const StorageID & main_table,
const SelectQueryInfo * query_info = nullptr,
ASTPtr additional_filter_ast = nullptr,
Poco::Logger * log = nullptr);
using AdditionalShardFilterGenerator = std::function<ASTPtr(uint64_t)>;
@ -66,7 +69,7 @@ void executeQueryWithParallelReplicas(
SelectStreamFactory & stream_factory,
const ASTPtr & query_ast,
ContextPtr context,
const SelectQueryInfo & query_info,
std::shared_ptr<const StorageLimitsList> storage_limits,
const ClusterPtr & not_optimized_cluster);
}

View File

@ -982,7 +982,7 @@ void Context::setTemporaryStorageInCache(const String & cache_disk_name, size_t
auto file_cache = FileCacheFactory::instance().getByName(disk_ptr->getCacheName()).cache;
if (!file_cache)
throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Cache '{}' is not found", file_cache->getBasePath());
throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Cache '{}' is not found", disk_ptr->getCacheName());
LOG_DEBUG(shared->log, "Using file cache ({}) for temporary files", file_cache->getBasePath());

View File

@ -156,10 +156,10 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter)
if (typeid_cast<DatabaseReplicated *>(database.get()))
{
int command_types_count = !mutation_commands.empty() + !partition_commands.empty() + !alter_commands.empty();
bool mixed_settings_amd_metadata_alter = alter_commands.hasSettingsAlterCommand() && !alter_commands.isSettingsAlter();
bool mixed_settings_amd_metadata_alter = alter_commands.hasNonReplicatedAlterCommand() && !alter_commands.areNonReplicatedAlterCommands();
if (1 < command_types_count || mixed_settings_amd_metadata_alter)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "For Replicated databases it's not allowed "
"to execute ALTERs of different types in single query");
"to execute ALTERs of different types (replicated and non replicated) in single query");
}
if (mutation_commands.hasNonEmptyMutationCommands())

View File

@ -219,10 +219,12 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
else
{
bool is_on_cluster = getContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY;
if (create.uuid != UUIDHelpers::Nil && !is_on_cluster)
if (create.uuid != UUIDHelpers::Nil && !is_on_cluster && !internal)
throw Exception(ErrorCodes::INCORRECT_QUERY, "Ordinary database engine does not support UUID");
/// Ignore UUID if it's ON CLUSTER query
/// The database doesn't support UUID so we'll ignore it. The UUID could be set here because of either
/// a) the initiator of `ON CLUSTER` query generated it to ensure the same UUIDs are used on different hosts; or
/// b) `RESTORE from backup` query generated it to ensure the same UUIDs are used on different hosts.
create.uuid = UUIDHelpers::Nil;
metadata_path = metadata_path / "metadata" / database_name_escaped;
}
@ -983,19 +985,6 @@ void InterpreterCreateQuery::setEngine(ASTCreateQuery & create) const
setDefaultTableEngine(*create.storage, getContext()->getSettingsRef().default_table_engine.value);
}
static void generateUUIDForTable(ASTCreateQuery & create)
{
if (create.uuid == UUIDHelpers::Nil)
create.uuid = UUIDHelpers::generateV4();
/// If destination table (to_table_id) is not specified for materialized view,
/// then MV will create inner table. We should generate UUID of inner table here,
/// so it will be the same on all hosts if query in ON CLUSTER or database engine is Replicated.
bool need_uuid_for_inner_table = !create.attach && create.is_materialized_view && !create.to_table_id;
if (need_uuid_for_inner_table && create.to_inner_uuid == UUIDHelpers::Nil)
create.to_inner_uuid = UUIDHelpers::generateV4();
}
void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const DatabasePtr & database) const
{
const auto * kind = create.is_dictionary ? "Dictionary" : "Table";
@ -1028,17 +1017,26 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data
kind_upper, create.table);
}
generateUUIDForTable(create);
create.generateRandomUUID();
}
else
{
bool is_on_cluster = getContext()->getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY;
bool has_uuid = create.uuid != UUIDHelpers::Nil || create.to_inner_uuid != UUIDHelpers::Nil;
if (has_uuid && !is_on_cluster)
if (has_uuid && !is_on_cluster && !internal)
{
/// We don't show the following error message either
/// 1) if it's a secondary query (an initiator of a CREATE TABLE ON CLUSTER query
/// doesn't know the exact database engines on replicas and generates an UUID, and then the replicas are free to ignore that UUID); or
/// 2) if it's an internal query (for example RESTORE uses internal queries to create tables and it generates an UUID
/// before creating a table to be possibly ignored if the database engine doesn't need it).
throw Exception(ErrorCodes::INCORRECT_QUERY,
"{} UUID specified, but engine of database {} is not Atomic", kind, create.getDatabase());
}
/// Ignore UUID if it's ON CLUSTER query
/// The database doesn't support UUID so we'll ignore it. The UUID could be set here because of either
/// a) the initiator of `ON CLUSTER` query generated it to ensure the same UUIDs are used on different hosts; or
/// b) `RESTORE from backup` query generated it to ensure the same UUIDs are used on different hosts.
create.uuid = UUIDHelpers::Nil;
create.to_inner_uuid = UUIDHelpers::Nil;
}
@ -1619,7 +1617,7 @@ void InterpreterCreateQuery::prepareOnClusterQuery(ASTCreateQuery & create, Cont
/// For CREATE query generate UUID on initiator, so it will be the same on all hosts.
/// It will be ignored if database does not support UUIDs.
generateUUIDForTable(create);
create.generateRandomUUID();
/// For cross-replication cluster we cannot use UUID in replica path.
String cluster_name_expanded = local_context->getMacros()->expand(cluster_name);

View File

@ -2,6 +2,7 @@
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTTablesInSelectQuery.h>
namespace DB
{
@ -14,6 +15,28 @@ void RewriteArrayExistsFunctionMatcher::visit(ASTPtr & ast, Data & data)
visit(*func, ast, data);
}
else if (auto * join = ast->as<ASTTableJoin>())
{
if (join->using_expression_list)
{
auto * it = std::find(join->children.begin(), join->children.end(), join->using_expression_list);
visit(join->using_expression_list, data);
if (it && *it != join->using_expression_list)
*it = join->using_expression_list;
}
if (join->on_expression)
{
auto * it = std::find(join->children.begin(), join->children.end(), join->on_expression);
visit(join->on_expression, data);
if (it && *it != join->on_expression)
*it = join->on_expression;
}
}
}
void RewriteArrayExistsFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, Data &)
@ -76,4 +99,14 @@ void RewriteArrayExistsFunctionMatcher::visit(const ASTFunction & func, ASTPtr &
}
}
bool RewriteArrayExistsFunctionMatcher::needChildVisit(const ASTPtr & ast, const ASTPtr &)
{
/// Children of ASTTableJoin are handled separately in visit() function
if (auto * join = ast->as<ASTTableJoin>())
return false;
return true;
}
}

View File

@ -18,7 +18,7 @@ public:
static void visit(ASTPtr & ast, Data &);
static void visit(const ASTFunction &, ASTPtr & ast, Data &);
static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; }
static bool needChildVisit(const ASTPtr & ast, const ASTPtr &);
};
using RewriteArrayExistsFunctionVisitor = InDepthNodeVisitor<RewriteArrayExistsFunctionMatcher, false>;

View File

@ -77,10 +77,6 @@ void ASTColumnDeclaration::formatImpl(const FormatSettings & settings, FormatSta
<< (*null_modifier ? "" : "NOT ") << "NULL" << (settings.hilite ? hilite_none : "");
}
if (primary_key_specifier)
settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "")
<< "PRIMARY KEY" << (settings.hilite ? hilite_none : "");
if (default_expression)
{
settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") << default_specifier << (settings.hilite ? hilite_none : "");

View File

@ -6,6 +6,8 @@
#include <Common/quoteString.h>
#include <Interpreters/StorageID.h>
#include <IO/Operators.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
namespace DB
@ -460,4 +462,49 @@ bool ASTCreateQuery::isParameterizedView() const
return false;
}
ASTCreateQuery::UUIDs::UUIDs(const ASTCreateQuery & query)
: uuid(query.uuid)
, to_inner_uuid(query.to_inner_uuid)
{
}
String ASTCreateQuery::UUIDs::toString() const
{
WriteBufferFromOwnString out;
out << "{" << uuid << "," << to_inner_uuid << "}";
return out.str();
}
ASTCreateQuery::UUIDs ASTCreateQuery::UUIDs::fromString(const String & str)
{
ReadBufferFromString in{str};
ASTCreateQuery::UUIDs res;
in >> "{" >> res.uuid >> "," >> res.to_inner_uuid >> "}";
return res;
}
ASTCreateQuery::UUIDs ASTCreateQuery::generateRandomUUID(bool always_generate_new_uuid)
{
if (always_generate_new_uuid)
setUUID({});
if (uuid == UUIDHelpers::Nil)
uuid = UUIDHelpers::generateV4();
/// If destination table (to_table_id) is not specified for materialized view,
/// then MV will create inner table. We should generate UUID of inner table here.
bool need_uuid_for_inner_table = !attach && is_materialized_view && !to_table_id;
if (need_uuid_for_inner_table && (to_inner_uuid == UUIDHelpers::Nil))
to_inner_uuid = UUIDHelpers::generateV4();
return UUIDs{*this};
}
void ASTCreateQuery::setUUID(const UUIDs & uuids)
{
uuid = uuids.uuid;
to_inner_uuid = uuids.to_inner_uuid;
}
}

View File

@ -146,6 +146,18 @@ public:
QueryKind getQueryKind() const override { return QueryKind::Create; }
struct UUIDs
{
UUID uuid = UUIDHelpers::Nil;
UUID to_inner_uuid = UUIDHelpers::Nil;
UUIDs() = default;
explicit UUIDs(const ASTCreateQuery & query);
String toString() const;
static UUIDs fromString(const String & str);
};
UUIDs generateRandomUUID(bool always_generate_new_uuid = false);
void setUUID(const UUIDs & uuids);
protected:
void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;

View File

@ -526,6 +526,7 @@ bool ParserStorage::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
break;
}
// If any part of storage definition is found create storage node
if (!storage_like)
return false;

View File

@ -21,10 +21,14 @@ ReplacingSortedAlgorithm::ReplacingSortedAlgorithm(
size_t max_block_size_bytes,
WriteBuffer * out_row_sources_buf_,
bool use_average_block_sizes,
bool cleanup_)
bool cleanup_,
size_t * cleanedup_rows_count_)
: IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs)
, merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows, max_block_size_bytes), cleanup(cleanup_)
, merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size_rows, max_block_size_bytes)
, cleanup(cleanup_)
, cleanedup_rows_count(cleanedup_rows_count_)
{
if (!is_deleted_column.empty())
is_deleted_column_number = header_.getPositionByName(is_deleted_column);
if (!version_column.empty())
@ -74,10 +78,13 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
/// Write the data for the previous primary key.
if (!selected_row.empty())
{
if (is_deleted_column_number!=-1)
if (is_deleted_column_number != -1)
{
if (!(cleanup && assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num]))
uint8_t value = assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num];
if (!cleanup || !value)
insertRow();
else if (cleanedup_rows_count != nullptr)
*cleanedup_rows_count += current_row_sources.size();
}
else
insertRow();
@ -91,7 +98,7 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
if (out_row_sources_buf)
current_row_sources.emplace_back(current.impl->order, true);
if ((is_deleted_column_number!=-1))
if (is_deleted_column_number != -1)
{
const UInt8 is_deleted = assert_cast<const ColumnUInt8 &>(*current->all_columns[is_deleted_column_number]).getData()[current->getRow()];
if ((is_deleted != 1) && (is_deleted != 0))
@ -129,10 +136,13 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
/// We will write the data for the last primary key.
if (!selected_row.empty())
{
if (is_deleted_column_number!=-1)
if (is_deleted_column_number != -1)
{
if (!(cleanup && assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num]))
uint8_t value = assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num];
if (!cleanup || !value)
insertRow();
else if (cleanedup_rows_count != nullptr)
*cleanedup_rows_count += current_row_sources.size();
}
else
insertRow();

View File

@ -27,7 +27,8 @@ public:
size_t max_block_size_bytes,
WriteBuffer * out_row_sources_buf_ = nullptr,
bool use_average_block_sizes = false,
bool cleanup = false);
bool cleanup = false,
size_t * cleanedup_rows_count = nullptr);
Status merge() override;
@ -37,6 +38,7 @@ private:
ssize_t is_deleted_column_number = -1;
ssize_t version_column_number = -1;
bool cleanup = false;
size_t * cleanedup_rows_count = nullptr;
using RowRef = detail::RowRefWithOwnedChunk;
static constexpr size_t max_row_refs = 2; /// last, current.

View File

@ -19,7 +19,8 @@ public:
size_t max_block_size_bytes,
WriteBuffer * out_row_sources_buf_ = nullptr,
bool use_average_block_sizes = false,
bool cleanup = false)
bool cleanup = false,
size_t * cleanedup_rows_count = nullptr)
: IMergingTransform(
num_inputs, header, header, /*have_all_inputs_=*/ true, /*limit_hint_=*/ 0, /*always_read_till_end_=*/ false,
header,
@ -31,7 +32,8 @@ public:
max_block_size_bytes,
out_row_sources_buf_,
use_average_block_sizes,
cleanup)
cleanup,
cleanedup_rows_count)
{
}

View File

@ -23,12 +23,12 @@
#include <Processors/Transforms/ReverseTransform.h>
#include <QueryPipeline/QueryPipelineBuilder.h>
#include <Storages/MergeTree/MergeTreeDataSelectExecutor.h>
#include <Storages/MergeTree/MergeTreeInOrderSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/MergeTreeReverseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreePrefetchedReadPool.h>
#include <Storages/MergeTree/MergeTreeReadPoolInOrder.h>
#include <Storages/MergeTree/MergeTreeReadPoolParallelReplicas.h>
#include <Storages/MergeTree/MergeTreeReadPoolParallelReplicasInOrder.h>
#include <Storages/MergeTree/MergeTreeSource.h>
#include <Storages/MergeTree/MergeTreeThreadSelectProcessor.h>
#include <Storages/MergeTree/RangesInDataPart.h>
#include <Storages/MergeTree/RequestResponse.h>
#include <Storages/VirtualColumnUtils.h>
@ -251,7 +251,7 @@ ReadFromMergeTree::ReadFromMergeTree(
Poco::Logger * log_,
MergeTreeDataSelectAnalysisResultPtr analyzed_result_ptr_,
bool enable_parallel_reading)
: SourceStepWithFilter(DataStream{.header = IMergeTreeSelectAlgorithm::transformHeader(
: SourceStepWithFilter(DataStream{.header = MergeTreeSelectProcessor::transformHeader(
storage_snapshot_->getSampleBlockForColumns(real_column_names_),
getPrewhereInfoFromQueryInfo(query_info_),
data_.getPartitionValueType(),
@ -268,10 +268,11 @@ ReadFromMergeTree::ReadFromMergeTree(
, storage_snapshot(std::move(storage_snapshot_))
, metadata_for_reading(storage_snapshot->getMetadataForQuery())
, context(std::move(context_))
, max_block_size(max_block_size_)
, block_size{
.max_block_size_rows = max_block_size_,
.preferred_block_size_bytes = context->getSettingsRef().preferred_block_size_bytes,
.preferred_max_column_in_block_size_bytes = context->getSettingsRef().preferred_max_column_in_block_size_bytes}
, requested_num_streams(num_streams_)
, preferred_block_size_bytes(context->getSettingsRef().preferred_block_size_bytes)
, preferred_max_column_in_block_size_bytes(context->getSettingsRef().preferred_max_column_in_block_size_bytes)
, sample_factor_column_queried(sample_factor_column_queried_)
, max_block_numbers_to_read(std::move(max_block_numbers_to_read_))
, log(log_)
@ -281,7 +282,7 @@ ReadFromMergeTree::ReadFromMergeTree(
if (sample_factor_column_queried)
{
/// Only _sample_factor virtual column is added by ReadFromMergeTree
/// Other virtual columns are added by MergeTreeBaseSelectProcessor.
/// Other virtual columns are added by MergeTreeSelectProcessor.
auto type = std::make_shared<DataTypeFloat64>();
output_stream->header.insert({type->createColumn(), type, "_sample_factor"});
}
@ -325,50 +326,50 @@ ReadFromMergeTree::ReadFromMergeTree(
Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
RangesInDataParts parts_with_range,
Names required_columns,
size_t max_streams,
size_t min_marks_for_concurrent_read,
bool use_uncompressed_cache
)
PoolSettings pool_settings)
{
const auto & client_info = context->getClientInfo();
auto extension = ParallelReadingExtension
{
.all_callback = all_ranges_callback.value(),
.callback = read_task_callback.value(),
.count_participating_replicas = client_info.count_participating_replicas,
.number_of_current_replica = client_info.number_of_current_replica,
.columns_to_read = required_columns
.columns_to_read = required_columns,
};
/// We have a special logic for local replica. It has to read less data, because in some cases it should
/// merge states of aggregate functions or do some other important stuff other than reading from Disk.
min_marks_for_concurrent_read = static_cast<size_t>(min_marks_for_concurrent_read * context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier);
pool_settings.min_marks_for_concurrent_read = static_cast<size_t>(pool_settings.min_marks_for_concurrent_read * context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier);
size_t total_rows = parts_with_range.getRowsCountAllParts();
auto pool = std::make_shared<MergeTreeReadPoolParallelReplicas>(
std::move(extension),
std::move(parts_with_range),
storage_snapshot,
max_streams,
extension,
parts_with_range,
prewhere_info,
actions_settings,
reader_settings,
required_columns,
virt_column_names,
min_marks_for_concurrent_read);
pool_settings,
context);
auto block_size_copy = block_size;
block_size_copy.min_marks_to_read = pool_settings.min_marks_for_concurrent_read;
Pipes pipes;
const auto & settings = context->getSettingsRef();
size_t total_rows = parts_with_range.getRowsCountAllParts();
for (size_t i = 0; i < max_streams; ++i)
for (size_t i = 0; i < pool_settings.threads; ++i)
{
auto algorithm = std::make_unique<MergeTreeThreadSelectAlgorithm>(
i, pool, min_marks_for_concurrent_read, max_block_size,
settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes,
data, storage_snapshot, use_uncompressed_cache,
prewhere_info, actions_settings, reader_settings, virt_column_names);
auto algorithm = std::make_unique<MergeTreeThreadSelectAlgorithm>(i);
auto source = std::make_shared<MergeTreeSource>(std::move(algorithm));
auto processor = std::make_unique<MergeTreeSelectProcessor>(
pool, std::move(algorithm), data, prewhere_info,
actions_settings, block_size_copy, reader_settings, virt_column_names);
auto source = std::make_shared<MergeTreeSource>(std::move(processor));
/// Set the approximate number of rows for the first source only
/// In case of parallel processing on replicas do not set approximate rows at all.
@ -387,12 +388,8 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
Pipe ReadFromMergeTree::readFromPool(
RangesInDataParts parts_with_range,
Names required_columns,
size_t max_streams,
size_t min_marks_for_concurrent_read,
bool use_uncompressed_cache)
PoolSettings pool_settings)
{
Pipes pipes;
size_t sum_marks = parts_with_range.getMarksCountAllParts();
size_t total_rows = parts_with_range.getRowsCountAllParts();
if (query_info.limit > 0 && query_info.limit < total_rows)
@ -403,11 +400,11 @@ Pipe ReadFromMergeTree::readFromPool(
/// round min_marks_to_read up to nearest multiple of block_size expressed in marks
/// If granularity is adaptive it doesn't make sense
/// Maybe it will make sense to add settings `max_block_size_bytes`
if (max_block_size && !data.canUseAdaptiveGranularity())
if (block_size.max_block_size_rows && !data.canUseAdaptiveGranularity())
{
size_t fixed_index_granularity = data.getSettings()->index_granularity;
min_marks_for_concurrent_read = (min_marks_for_concurrent_read * fixed_index_granularity + max_block_size - 1)
/ max_block_size * max_block_size / fixed_index_granularity;
pool_settings.min_marks_for_concurrent_read = (pool_settings.min_marks_for_concurrent_read * fixed_index_granularity + block_size.max_block_size_rows - 1)
/ block_size.max_block_size_rows * block_size.max_block_size_rows / fixed_index_granularity;
}
bool all_parts_are_remote = true;
@ -421,34 +418,30 @@ Pipe ReadFromMergeTree::readFromPool(
MergeTreeReadPoolPtr pool;
if ((all_parts_are_remote && settings.allow_prefetched_read_pool_for_remote_filesystem
&& MergeTreePrefetchedReadPool::checkReadMethodAllowed(reader_settings.read_settings.remote_fs_method))
|| (all_parts_are_local && settings.allow_prefetched_read_pool_for_local_filesystem
&& MergeTreePrefetchedReadPool::checkReadMethodAllowed(reader_settings.read_settings.local_fs_method)))
bool allow_prefetched_remote = all_parts_are_remote
&& settings.allow_prefetched_read_pool_for_remote_filesystem
&& MergeTreePrefetchedReadPool::checkReadMethodAllowed(reader_settings.read_settings.remote_fs_method);
bool allow_prefetched_local = all_parts_are_local
&& settings.allow_prefetched_read_pool_for_local_filesystem
&& MergeTreePrefetchedReadPool::checkReadMethodAllowed(reader_settings.read_settings.local_fs_method);
if (allow_prefetched_remote || allow_prefetched_local)
{
pool = std::make_shared<MergeTreePrefetchedReadPool>(
max_streams,
sum_marks,
min_marks_for_concurrent_read,
std::move(parts_with_range),
storage_snapshot,
prewhere_info,
actions_settings,
reader_settings,
required_columns,
virt_column_names,
settings.preferred_block_size_bytes,
reader_settings,
context,
use_uncompressed_cache,
all_parts_are_remote,
*data.getSettings());
pool_settings,
context);
}
else
{
pool = std::make_shared<MergeTreeReadPool>(
max_streams,
sum_marks,
min_marks_for_concurrent_read,
std::move(parts_with_range),
storage_snapshot,
prewhere_info,
@ -456,22 +449,28 @@ Pipe ReadFromMergeTree::readFromPool(
reader_settings,
required_columns,
virt_column_names,
context,
false);
pool_settings,
context);
}
auto * logger = &Poco::Logger::get(data.getLogName() + " (SelectExecutor)");
LOG_DEBUG(logger, "Reading approx. {} rows with {} streams", total_rows, max_streams);
LOG_DEBUG(log, "Reading approx. {} rows with {} streams", total_rows, pool_settings.threads);
for (size_t i = 0; i < max_streams; ++i)
/// The reason why we change this setting is because MergeTreeReadPool takes the full task
/// ignoring min_marks_to_read setting in case of remote disk (see MergeTreeReadPool::getTask).
/// In this case, we won't limit the number of rows to read based on adaptive granularity settings.
auto block_size_copy = block_size;
block_size_copy.min_marks_to_read = pool_settings.min_marks_for_concurrent_read;
Pipes pipes;
for (size_t i = 0; i < pool_settings.threads; ++i)
{
auto algorithm = std::make_unique<MergeTreeThreadSelectAlgorithm>(
i, pool, min_marks_for_concurrent_read, max_block_size,
settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes,
data, storage_snapshot, use_uncompressed_cache,
prewhere_info, actions_settings, reader_settings, virt_column_names);
auto algorithm = std::make_unique<MergeTreeThreadSelectAlgorithm>(i);
auto source = std::make_shared<MergeTreeSource>(std::move(algorithm));
auto processor = std::make_unique<MergeTreeSelectProcessor>(
pool, std::move(algorithm), data, prewhere_info,
actions_settings, block_size_copy, reader_settings, virt_column_names);
auto source = std::make_shared<MergeTreeSource>(std::move(processor));
if (i == 0)
source->addTotalRowsApprox(total_rows);
@ -485,17 +484,65 @@ Pipe ReadFromMergeTree::readFromPool(
return pipe;
}
template<typename Algorithm>
ProcessorPtr ReadFromMergeTree::createSource(
const RangesInDataPart & part,
const Names & required_columns,
bool use_uncompressed_cache,
bool has_limit_below_one_block,
MergeTreeInOrderReadPoolParallelReplicasPtr pool)
Pipe ReadFromMergeTree::readInOrder(
RangesInDataParts parts_with_ranges,
Names required_columns,
PoolSettings pool_settings,
ReadType read_type,
UInt64 limit)
{
auto total_rows = part.getRowsCount();
if (query_info.limit > 0 && query_info.limit < total_rows)
total_rows = query_info.limit;
/// For reading in order it makes sense to read only
/// one range per task to reduce number of read rows.
bool has_limit_below_one_block = read_type != ReadType::Default && limit && limit < block_size.max_block_size_rows;
MergeTreeReadPoolPtr pool;
if (is_parallel_reading_from_replicas)
{
const auto & client_info = context->getClientInfo();
ParallelReadingExtension extension
{
.all_callback = all_ranges_callback.value(),
.callback = read_task_callback.value(),
.count_participating_replicas = client_info.count_participating_replicas,
.number_of_current_replica = client_info.number_of_current_replica,
.columns_to_read = required_columns,
};
pool_settings.min_marks_for_concurrent_read = static_cast<size_t>(
pool_settings.min_marks_for_concurrent_read * context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier);
CoordinationMode mode = read_type == ReadType::InOrder
? CoordinationMode::WithOrder
: CoordinationMode::ReverseOrder;
pool = std::make_shared<MergeTreeReadPoolParallelReplicasInOrder>(
std::move(extension),
mode,
parts_with_ranges,
storage_snapshot,
prewhere_info,
actions_settings,
reader_settings,
required_columns,
virt_column_names,
pool_settings,
context);
}
else
{
pool = std::make_shared<MergeTreeReadPoolInOrder>(
has_limit_below_one_block,
read_type,
parts_with_ranges,
storage_snapshot,
prewhere_info,
actions_settings,
reader_settings,
required_columns,
virt_column_names,
pool_settings,
context);
}
/// Actually it means that parallel reading from replicas enabled
/// and we have to collaborate with initiator.
@ -504,37 +551,34 @@ ProcessorPtr ReadFromMergeTree::createSource(
/// because we don't know actual amount of read rows in case when limit is set.
bool set_rows_approx = !is_parallel_reading_from_replicas && !reader_settings.read_in_order;
auto algorithm = std::make_unique<Algorithm>(
data, storage_snapshot, part.data_part, part.alter_conversions, max_block_size, preferred_block_size_bytes,
preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info,
actions_settings, reader_settings, pool, virt_column_names, part.part_index_in_query, has_limit_below_one_block);
auto source = std::make_shared<MergeTreeSource>(std::move(algorithm));
if (set_rows_approx)
source->addTotalRowsApprox(total_rows);
return source;
}
Pipe ReadFromMergeTree::readInOrder(
RangesInDataParts parts_with_range,
Names required_columns,
ReadType read_type,
bool use_uncompressed_cache,
UInt64 limit,
MergeTreeInOrderReadPoolParallelReplicasPtr pool)
{
Pipes pipes;
/// For reading in order it makes sense to read only
/// one range per task to reduce number of read rows.
bool has_limit_below_one_block = read_type != ReadType::Default && limit && limit < max_block_size;
for (const auto & part : parts_with_range)
for (size_t i = 0; i < parts_with_ranges.size(); ++i)
{
auto source = read_type == ReadType::InReverseOrder
? createSource<MergeTreeReverseSelectAlgorithm>(part, required_columns, use_uncompressed_cache, has_limit_below_one_block, pool)
: createSource<MergeTreeInOrderSelectAlgorithm>(part, required_columns, use_uncompressed_cache, has_limit_below_one_block, pool);
const auto & part_with_ranges = parts_with_ranges[i];
UInt64 total_rows = part_with_ranges.getRowsCount();
if (query_info.limit > 0 && query_info.limit < total_rows)
total_rows = query_info.limit;
LOG_TRACE(log, "Reading {} ranges in{}order from part {}, approx. {} rows starting from {}",
part_with_ranges.ranges.size(),
read_type == ReadType::InReverseOrder ? " reverse " : " ",
part_with_ranges.data_part->name, total_rows,
part_with_ranges.data_part->index_granularity.getMarkStartingRow(part_with_ranges.ranges.front().begin));
MergeTreeSelectAlgorithmPtr algorithm;
if (read_type == ReadType::InReverseOrder)
algorithm = std::make_unique<MergeTreeInReverseOrderSelectAlgorithm>(i);
else
algorithm = std::make_unique<MergeTreeInOrderSelectAlgorithm>(i);
auto processor = std::make_unique<MergeTreeSelectProcessor>(
pool, std::move(algorithm), data, prewhere_info,
actions_settings, block_size, reader_settings, virt_column_names);
auto source = std::make_shared<MergeTreeSource>(std::move(processor));
if (set_rows_approx)
source->addTotalRowsApprox(total_rows);
pipes.emplace_back(std::move(source));
}
@ -553,16 +597,33 @@ Pipe ReadFromMergeTree::readInOrder(
}
Pipe ReadFromMergeTree::read(
RangesInDataParts parts_with_range, Names required_columns, ReadType read_type,
size_t max_streams, size_t min_marks_for_concurrent_read, bool use_uncompressed_cache)
RangesInDataParts parts_with_range,
Names required_columns,
ReadType read_type,
size_t max_streams,
size_t min_marks_for_concurrent_read,
bool use_uncompressed_cache)
{
const auto & settings = context->getSettingsRef();
size_t sum_marks = parts_with_range.getMarksCountAllParts();
PoolSettings pool_settings
{
.threads = max_streams,
.sum_marks = sum_marks,
.min_marks_for_concurrent_read = min_marks_for_concurrent_read,
.preferred_block_size_bytes = settings.preferred_block_size_bytes,
.use_uncompressed_cache = use_uncompressed_cache,
.use_const_size_tasks_for_remote_reading = settings.merge_tree_use_const_size_tasks_for_remote_reading,
};
if (read_type == ReadType::ParallelReplicas)
return readFromPoolParallelReplicas(parts_with_range, required_columns, max_streams, min_marks_for_concurrent_read, use_uncompressed_cache);
return readFromPoolParallelReplicas(std::move(parts_with_range), std::move(required_columns), std::move(pool_settings));
if (read_type == ReadType::Default && max_streams > 1)
return readFromPool(parts_with_range, required_columns, max_streams, min_marks_for_concurrent_read, use_uncompressed_cache);
return readFromPool(std::move(parts_with_range), std::move(required_columns), std::move(pool_settings));
auto pipe = readInOrder(parts_with_range, required_columns, read_type, use_uncompressed_cache, /*limit */0, /*pool*/nullptr);
auto pipe = readInOrder(parts_with_range, required_columns, pool_settings, read_type, /*limit=*/ 0);
/// Use ConcatProcessor to concat sources together.
/// It is needed to read in parts order (and so in PK order) if single thread is used.
@ -585,7 +646,6 @@ struct PartRangesReadInfo
size_t index_granularity_bytes = 0;
size_t max_marks_to_use_cache = 0;
size_t min_marks_for_concurrent_read = 0;
bool use_uncompressed_cache = false;
PartRangesReadInfo(
@ -663,8 +723,12 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreams(RangesInDataParts && parts_
auto read_type = is_parallel_reading_from_replicas ? ReadType::ParallelReplicas : ReadType::Default;
return read(std::move(parts_with_ranges), column_names, read_type,
num_streams, info.min_marks_for_concurrent_read, info.use_uncompressed_cache);
return read(std::move(parts_with_ranges),
column_names,
read_type,
num_streams,
info.min_marks_for_concurrent_read,
info.use_uncompressed_cache);
}
static ActionsDAGPtr createProjection(const Block & header)
@ -715,7 +779,8 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder(
/// Let's split ranges to avoid reading much data.
auto split_ranges
= [rows_granularity = data_settings->index_granularity, my_max_block_size = max_block_size](const auto & ranges, int direction)
= [rows_granularity = data_settings->index_granularity, my_max_block_size = block_size.max_block_size_rows]
(const auto & ranges, int direction)
{
MarkRanges new_ranges;
const size_t max_marks_in_range = (my_max_block_size + rows_granularity - 1) / rows_granularity;
@ -762,109 +827,94 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder(
const size_t min_marks_per_stream = (info.sum_marks - 1) / num_streams + 1;
bool need_preliminary_merge = (parts_with_ranges.size() > settings.read_in_order_two_level_merge_threshold);
std::vector<RangesInDataParts> splitted_parts_and_ranges;
splitted_parts_and_ranges.reserve(num_streams);
const auto read_type = input_order_info->direction == 1 ? ReadType::InOrder : ReadType::InReverseOrder;
const auto read_type = input_order_info->direction == 1
? ReadFromMergeTree::ReadType::InOrder
: ReadFromMergeTree::ReadType::InReverseOrder;
MergeTreeInOrderReadPoolParallelReplicasPtr pool;
if (is_parallel_reading_from_replicas)
PoolSettings pool_settings
{
const auto & client_info = context->getClientInfo();
auto extension = ParallelReadingExtension
{
.all_callback = all_ranges_callback.value(),
.callback = read_task_callback.value(),
.count_participating_replicas = client_info.count_participating_replicas,
.number_of_current_replica = client_info.number_of_current_replica,
.columns_to_read = column_names
};
auto min_marks_for_concurrent_read = info.min_marks_for_concurrent_read;
min_marks_for_concurrent_read = static_cast<size_t>(min_marks_for_concurrent_read * settings.parallel_replicas_single_task_marks_count_multiplier);
pool = std::make_shared<MergeTreeInOrderReadPoolParallelReplicas>(
parts_with_ranges,
extension,
read_type == ReadFromMergeTree::ReadType::InOrder ? CoordinationMode::WithOrder : CoordinationMode::ReverseOrder,
min_marks_for_concurrent_read);
}
for (size_t i = 0; i < num_streams && !parts_with_ranges.empty(); ++i)
{
size_t need_marks = min_marks_per_stream;
RangesInDataParts new_parts;
/// Loop over parts.
/// We will iteratively take part or some subrange of a part from the back
/// and assign a stream to read from it.
while (need_marks > 0 && !parts_with_ranges.empty())
{
RangesInDataPart part = parts_with_ranges.back();
parts_with_ranges.pop_back();
size_t & marks_in_part = info.sum_marks_in_parts.back();
/// We will not take too few rows from a part.
if (marks_in_part >= info.min_marks_for_concurrent_read && need_marks < info.min_marks_for_concurrent_read)
need_marks = info.min_marks_for_concurrent_read;
/// Do not leave too few rows in the part.
if (marks_in_part > need_marks && marks_in_part - need_marks < info.min_marks_for_concurrent_read)
need_marks = marks_in_part;
MarkRanges ranges_to_get_from_part;
/// We take full part if it contains enough marks or
/// if we know limit and part contains less than 'limit' rows.
bool take_full_part = marks_in_part <= need_marks || (input_order_info->limit && input_order_info->limit < part.getRowsCount());
/// We take the whole part if it is small enough.
if (take_full_part)
{
ranges_to_get_from_part = part.ranges;
need_marks -= marks_in_part;
info.sum_marks_in_parts.pop_back();
}
else
{
/// Loop through ranges in part. Take enough ranges to cover "need_marks".
while (need_marks > 0)
{
if (part.ranges.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected end of ranges while spreading marks among streams");
MarkRange & range = part.ranges.front();
const size_t marks_in_range = range.end - range.begin;
const size_t marks_to_get_from_range = std::min(marks_in_range, need_marks);
ranges_to_get_from_part.emplace_back(range.begin, range.begin + marks_to_get_from_range);
range.begin += marks_to_get_from_range;
marks_in_part -= marks_to_get_from_range;
need_marks -= marks_to_get_from_range;
if (range.begin == range.end)
part.ranges.pop_front();
}
parts_with_ranges.emplace_back(part);
}
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
new_parts.emplace_back(part.data_part, part.alter_conversions, part.part_index_in_query, std::move(ranges_to_get_from_part));
}
splitted_parts_and_ranges.emplace_back(std::move(new_parts));
}
.min_marks_for_concurrent_read = info.min_marks_for_concurrent_read,
.preferred_block_size_bytes = settings.preferred_block_size_bytes,
.use_uncompressed_cache = info.use_uncompressed_cache,
};
Pipes pipes;
for (auto & item : splitted_parts_and_ranges)
/// For parallel replicas the split will be performed on the initiator side.
if (is_parallel_reading_from_replicas)
{
pipes.emplace_back(readInOrder(std::move(item), column_names, read_type,
info.use_uncompressed_cache, input_order_info->limit, pool));
pipes.emplace_back(readInOrder(std::move(parts_with_ranges), column_names, pool_settings, read_type, input_order_info->limit));
}
else
{
std::vector<RangesInDataParts> splitted_parts_and_ranges;
splitted_parts_and_ranges.reserve(num_streams);
for (size_t i = 0; i < num_streams && !parts_with_ranges.empty(); ++i)
{
size_t need_marks = min_marks_per_stream;
RangesInDataParts new_parts;
/// Loop over parts.
/// We will iteratively take part or some subrange of a part from the back
/// and assign a stream to read from it.
while (need_marks > 0 && !parts_with_ranges.empty())
{
RangesInDataPart part = parts_with_ranges.back();
parts_with_ranges.pop_back();
size_t & marks_in_part = info.sum_marks_in_parts.back();
/// We will not take too few rows from a part.
if (marks_in_part >= info.min_marks_for_concurrent_read && need_marks < info.min_marks_for_concurrent_read)
need_marks = info.min_marks_for_concurrent_read;
/// Do not leave too few rows in the part.
if (marks_in_part > need_marks && marks_in_part - need_marks < info.min_marks_for_concurrent_read)
need_marks = marks_in_part;
MarkRanges ranges_to_get_from_part;
/// We take full part if it contains enough marks or
/// if we know limit and part contains less than 'limit' rows.
bool take_full_part = marks_in_part <= need_marks || (input_order_info->limit && input_order_info->limit < part.getRowsCount());
/// We take the whole part if it is small enough.
if (take_full_part)
{
ranges_to_get_from_part = part.ranges;
need_marks -= marks_in_part;
info.sum_marks_in_parts.pop_back();
}
else
{
/// Loop through ranges in part. Take enough ranges to cover "need_marks".
while (need_marks > 0)
{
if (part.ranges.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected end of ranges while spreading marks among streams");
MarkRange & range = part.ranges.front();
const size_t marks_in_range = range.end - range.begin;
const size_t marks_to_get_from_range = std::min(marks_in_range, need_marks);
ranges_to_get_from_part.emplace_back(range.begin, range.begin + marks_to_get_from_range);
range.begin += marks_to_get_from_range;
marks_in_part -= marks_to_get_from_range;
need_marks -= marks_to_get_from_range;
if (range.begin == range.end)
part.ranges.pop_front();
}
parts_with_ranges.emplace_back(part);
}
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
new_parts.emplace_back(part.data_part, part.alter_conversions, part.part_index_in_query, std::move(ranges_to_get_from_part));
}
splitted_parts_and_ranges.emplace_back(std::move(new_parts));
}
for (auto && item : splitted_parts_and_ranges)
pipes.emplace_back(readInOrder(std::move(item), column_names, pool_settings, read_type, input_order_info->limit));
}
Block pipe_header;
@ -898,7 +948,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsWithOrder(
if (pipe.numOutputPorts() > 1)
{
auto transform = std::make_shared<MergingSortedTransform>(
pipe.getHeader(), pipe.numOutputPorts(), sort_description, max_block_size, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch);
pipe.getHeader(), pipe.numOutputPorts(), sort_description, block_size.max_block_size_rows, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch);
pipe.addTransform(std::move(transform));
}
@ -931,7 +981,7 @@ static void addMergingFinal(
const SortDescription & sort_description,
MergeTreeData::MergingParams merging_params,
Names partition_key_columns,
size_t max_block_size)
size_t max_block_size_rows)
{
const auto & header = pipe.getHeader();
size_t num_outputs = pipe.numOutputPorts();
@ -944,31 +994,31 @@ static void addMergingFinal(
{
case MergeTreeData::MergingParams::Ordinary:
return std::make_shared<MergingSortedTransform>(header, num_outputs,
sort_description, max_block_size, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch);
sort_description, max_block_size_rows, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch);
case MergeTreeData::MergingParams::Collapsing:
return std::make_shared<CollapsingSortedTransform>(header, num_outputs,
sort_description, merging_params.sign_column, true, max_block_size, /*max_block_size_bytes=*/0);
sort_description, merging_params.sign_column, true, max_block_size_rows, /*max_block_size_bytes=*/0);
case MergeTreeData::MergingParams::Summing:
return std::make_shared<SummingSortedTransform>(header, num_outputs,
sort_description, merging_params.columns_to_sum, partition_key_columns, max_block_size, /*max_block_size_bytes=*/0);
sort_description, merging_params.columns_to_sum, partition_key_columns, max_block_size_rows, /*max_block_size_bytes=*/0);
case MergeTreeData::MergingParams::Aggregating:
return std::make_shared<AggregatingSortedTransform>(header, num_outputs,
sort_description, max_block_size, /*max_block_size_bytes=*/0);
sort_description, max_block_size_rows, /*max_block_size_bytes=*/0);
case MergeTreeData::MergingParams::Replacing:
return std::make_shared<ReplacingSortedTransform>(header, num_outputs,
sort_description, merging_params.is_deleted_column, merging_params.version_column, max_block_size, /*max_block_size_bytes=*/0, /*out_row_sources_buf_*/ nullptr, /*use_average_block_sizes*/ false, /*cleanup*/ !merging_params.is_deleted_column.empty());
sort_description, merging_params.is_deleted_column, merging_params.version_column, max_block_size_rows, /*max_block_size_bytes=*/0, /*out_row_sources_buf_*/ nullptr, /*use_average_block_sizes*/ false, /*cleanup*/ !merging_params.is_deleted_column.empty());
case MergeTreeData::MergingParams::VersionedCollapsing:
return std::make_shared<VersionedCollapsingTransform>(header, num_outputs,
sort_description, merging_params.sign_column, max_block_size, /*max_block_size_bytes=*/0);
sort_description, merging_params.sign_column, max_block_size_rows, /*max_block_size_bytes=*/0);
case MergeTreeData::MergingParams::Graphite:
return std::make_shared<GraphiteRollupSortedTransform>(header, num_outputs,
sort_description, max_block_size, /*max_block_size_bytes=*/0, merging_params.graphite_params, now);
sort_description, max_block_size_rows, /*max_block_size_bytes=*/0, merging_params.graphite_params, now);
}
UNREACHABLE();
@ -1064,11 +1114,12 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
return this->read(
std::move(parts),
column_names,
ReadFromMergeTree::ReadType::InOrder,
ReadType::InOrder,
1 /* num_streams */,
0 /* min_marks_for_concurrent_read */,
info.use_uncompressed_cache);
};
pipes = buildPipesForReadingByPKRanges(
metadata_for_reading->getPrimaryKey(),
sorting_expr,
@ -1080,7 +1131,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
else
{
pipes.emplace_back(read(
std::move(new_parts), column_names, ReadFromMergeTree::ReadType::InOrder, num_streams, 0, info.use_uncompressed_cache));
std::move(new_parts), column_names, ReadType::InOrder, num_streams, 0, info.use_uncompressed_cache));
pipes.back().addSimpleTransform([sorting_expr](const Block & header)
{ return std::make_shared<ExpressionTransform>(header, sorting_expr); });
@ -1121,7 +1172,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
sort_description,
data.merging_params,
partition_key_columns,
max_block_size);
block_size.max_block_size_rows);
partition_pipes.emplace_back(Pipe::unitePipes(std::move(pipes)));
}
@ -1141,7 +1192,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
if (sum_marks_in_lonely_parts < num_streams_for_lonely_parts * min_marks_for_concurrent_read && lonely_parts.size() < num_streams_for_lonely_parts)
num_streams_for_lonely_parts = std::max((sum_marks_in_lonely_parts + min_marks_for_concurrent_read - 1) / min_marks_for_concurrent_read, lonely_parts.size());
auto pipe = read(std::move(lonely_parts), column_names, ReadFromMergeTree::ReadType::Default,
auto pipe = read(std::move(lonely_parts), column_names, ReadType::Default,
num_streams_for_lonely_parts, min_marks_for_concurrent_read, info.use_uncompressed_cache);
/// Drop temporary columns, added by 'sorting_key_expr'
@ -1605,11 +1656,13 @@ void ReadFromMergeTree::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info
{
query_info.prewhere_info = prewhere_info_value;
prewhere_info = prewhere_info_value;
output_stream = DataStream{.header = IMergeTreeSelectAlgorithm::transformHeader(
output_stream = DataStream{.header = MergeTreeSelectProcessor::transformHeader(
storage_snapshot->getSampleBlockForColumns(real_column_names),
prewhere_info_value,
data.getPartitionValueType(),
virt_column_names)};
updateSortDescriptionForOutputStream(
*output_stream,
storage_snapshot->getMetadataForQuery()->getSortingKeyColumns(),

View File

@ -87,25 +87,7 @@ public:
};
using IndexStats = std::vector<IndexStat>;
enum class ReadType
{
/// By default, read will use MergeTreeReadPool and return pipe with num_streams outputs.
/// If num_streams == 1, will read without pool, in order specified in parts.
Default,
/// Read in sorting key order.
/// Returned pipe will have the number of ports equals to parts.size().
/// Parameter num_streams_ is ignored in this case.
/// User should add MergingSorted itself if needed.
InOrder,
/// The same as InOrder, but in reverse order.
/// For every part, read ranges and granules from end to begin. Also add ReverseTransform.
InReverseOrder,
/// A special type of reading where every replica
/// talks to a remote coordinator (which is located on the initiator node)
/// and who spreads marks and parts across them.
ParallelReplicas,
};
using ReadType = MergeTreeReadType;
struct AnalysisResult
{
@ -113,7 +95,7 @@ public:
MergeTreeDataSelectSamplingData sampling;
IndexStats index_stats;
Names column_names_to_read;
ReadFromMergeTree::ReadType read_type = ReadFromMergeTree::ReadType::Default;
ReadType read_type = ReadType::Default;
UInt64 total_parts = 0;
UInt64 parts_before_pk = 0;
UInt64 selected_parts = 0;
@ -223,7 +205,7 @@ public:
const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; }
const MergeTreeData & getMergeTreeData() const { return data; }
size_t getMaxBlockSize() const { return max_block_size; }
size_t getMaxBlockSize() const { return block_size.max_block_size_rows; }
size_t getNumStreams() const { return requested_num_streams; }
bool isParallelReadingEnabled() const { return read_task_callback != std::nullopt; }
@ -271,12 +253,10 @@ private:
StorageMetadataPtr metadata_for_reading;
ContextPtr context;
const MergeTreeReadTask::BlockSizeParams block_size;
const size_t max_block_size;
size_t requested_num_streams;
size_t output_streams_limit = 0;
const size_t preferred_block_size_bytes;
const size_t preferred_max_column_in_block_size_bytes;
const bool sample_factor_column_queried;
/// Used for aggregation optimisation (see DB::QueryPlanOptimizations::tryAggregateEachPartitionIndependently).
@ -292,16 +272,14 @@ private:
UInt64 selected_rows = 0;
UInt64 selected_marks = 0;
using PoolSettings = MergeTreeReadPoolBase::PoolSettings;
Pipe read(RangesInDataParts parts_with_range, Names required_columns, ReadType read_type, size_t max_streams, size_t min_marks_for_concurrent_read, bool use_uncompressed_cache);
Pipe readFromPool(RangesInDataParts parts_with_ranges, Names required_columns, size_t max_streams, size_t min_marks_for_concurrent_read, bool use_uncompressed_cache);
Pipe readFromPoolParallelReplicas(RangesInDataParts parts_with_ranges, Names required_columns, size_t max_streams, size_t min_marks_for_concurrent_read, bool use_uncompressed_cache);
Pipe readInOrder(RangesInDataParts parts_with_range, Names required_columns, ReadType read_type, bool use_uncompressed_cache, UInt64 limit, MergeTreeInOrderReadPoolParallelReplicasPtr pool);
Pipe readFromPool(RangesInDataParts parts_with_range, Names required_columns, PoolSettings pool_settings);
Pipe readFromPoolParallelReplicas(RangesInDataParts parts_with_range, Names required_columns, PoolSettings pool_settings);
Pipe readInOrder(RangesInDataParts parts_with_ranges, Names required_columns, PoolSettings pool_settings, ReadType read_type, UInt64 limit);
template<typename TSource>
ProcessorPtr createSource(const RangesInDataPart & part, const Names & required_columns, bool use_uncompressed_cache, bool has_limit_below_one_block, MergeTreeInOrderReadPoolParallelReplicasPtr pool);
Pipe spreadMarkRanges(
RangesInDataParts && parts_with_ranges, size_t num_streams, AnalysisResult & result, ActionsDAGPtr & result_projection);
Pipe spreadMarkRanges(RangesInDataParts && parts_with_ranges, size_t num_streams, AnalysisResult & result, ActionsDAGPtr & result_projection);
Pipe groupStreamsByPartition(AnalysisResult & result, ActionsDAGPtr & result_projection);

View File

@ -52,12 +52,12 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column,
if (left_nullable && right_nullable)
{
int res = left_column.compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint);
int res = left_nullable->compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint);
if (res)
return res;
/// NULL != NULL case
if (left_column.isNullAt(lhs_pos))
if (left_nullable->isNullAt(lhs_pos))
return null_direction_hint;
return 0;
@ -68,7 +68,7 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column,
{
if (const auto * left_nullable = checkAndGetColumn<ColumnNullable>(left_column))
{
if (left_column.isNullAt(lhs_pos))
if (left_nullable->isNullAt(lhs_pos))
return null_direction_hint;
return left_nullable->getNestedColumn().compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint);
}
@ -78,7 +78,7 @@ int nullableCompareAt(const IColumn & left_column, const IColumn & right_column,
{
if (const auto * right_nullable = checkAndGetColumn<ColumnNullable>(right_column))
{
if (right_column.isNullAt(rhs_pos))
if (right_nullable->isNullAt(rhs_pos))
return -null_direction_hint;
return left_column.compareAt(lhs_pos, rhs_pos, right_nullable->getNestedColumn(), null_direction_hint);
}

View File

@ -1350,9 +1350,14 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const
validateColumnsDefaultsAndGetSampleBlock(default_expr_list, all_columns.getAll(), context);
}
bool AlterCommands::hasSettingsAlterCommand() const
bool AlterCommands::hasNonReplicatedAlterCommand() const
{
return std::any_of(begin(), end(), [](const AlterCommand & c) { return c.isSettingsAlter(); });
return std::any_of(begin(), end(), [](const AlterCommand & c) { return c.isSettingsAlter() || c.isCommentAlter(); });
}
bool AlterCommands::areNonReplicatedAlterCommands() const
{
return std::all_of(begin(), end(), [](const AlterCommand & c) { return c.isSettingsAlter() || c.isCommentAlter(); });
}
bool AlterCommands::isSettingsAlter() const

View File

@ -196,8 +196,11 @@ public:
/// Commands have to be prepared before apply.
void apply(StorageInMemoryMetadata & metadata, ContextPtr context) const;
/// At least one command modify settings.
bool hasSettingsAlterCommand() const;
/// At least one command modify settings or comments.
bool hasNonReplicatedAlterCommand() const;
/// All commands modify settings or comments.
bool areNonReplicatedAlterCommands() const;
/// All commands modify settings only.
bool isSettingsAlter() const;

View File

@ -3,24 +3,27 @@
#include <boost/noncopyable.hpp>
#include <Core/Block.h>
#include <IO/ReadBufferFromFileBase.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeReadTask.h>
namespace DB
{
struct MergeTreeReadTask;
using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
/// The interface that determines how tasks for reading (MergeTreeReadTask)
/// are distributed among data parts with ranges.
class IMergeTreeReadPool : private boost::noncopyable
{
public:
virtual ~IMergeTreeReadPool() = default;
virtual String getName() const = 0;
virtual Block getHeader() const = 0;
virtual MergeTreeReadTaskPtr getTask(size_t thread) = 0;
/// Returns true if tasks are returned in the same order as the order of ranges passed to pool
virtual bool preservesOrderOfRanges() const = 0;
/// task_idx is an implementation defined identifier that helps
/// to get required task. E.g. it may be number of thread in case of Default reading type or an index of a part in case of InOrder/InReverseOrder reading type.
virtual MergeTreeReadTaskPtr getTask(size_t task_idx, MergeTreeReadTask * previous_task) = 0;
virtual void profileFeedback(ReadBufferFromFileBase::ProfileInfo info) = 0;
};

View File

@ -3,7 +3,6 @@
#include <Core/NamesAndTypes.h>
#include <Common/HashTable/HashMap.h>
#include <Storages/MergeTree/MergeTreeReaderStream.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>

View File

@ -487,6 +487,7 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const
size_t sum_input_rows_exact = global_ctx->merge_list_element_ptr->rows_read;
size_t input_rows_filtered = *global_ctx->input_rows_filtered;
size_t cleanedup_rows_count = global_ctx->cleanedup_rows_count;
global_ctx->merge_list_element_ptr->columns_written = global_ctx->merging_column_names.size();
global_ctx->merge_list_element_ptr->progress.store(ctx->column_sizes->keyColumnsWeight(), std::memory_order_relaxed);
@ -499,12 +500,13 @@ bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const
/// In special case, when there is only one source part, and no rows were skipped, we may have
/// skipped writing rows_sources file. Otherwise rows_sources_count must be equal to the total
/// number of input rows.
if ((rows_sources_count > 0 || global_ctx->future_part->parts.size() > 1) && sum_input_rows_exact != rows_sources_count + input_rows_filtered)
if ((rows_sources_count > 0 || global_ctx->future_part->parts.size() > 1)
&& sum_input_rows_exact != rows_sources_count + input_rows_filtered + cleanedup_rows_count)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Number of rows in source parts ({}) excluding filtered rows ({}) differs from number "
"of bytes written to rows_sources file ({}). It is a bug.",
sum_input_rows_exact, input_rows_filtered, rows_sources_count);
ErrorCodes::LOGICAL_ERROR,
"Number of rows in source parts ({}) excluding filtered rows ({}) and cleaned up rows ({}) differs from number "
"of bytes written to rows_sources file ({}). It is a bug.",
sum_input_rows_exact, input_rows_filtered, cleanedup_rows_count, rows_sources_count);
ctx->rows_sources_read_buf = std::make_unique<CompressedReadBufferFromFile>(ctx->tmp_disk->readFile(fileName(ctx->rows_sources_file->path())));
@ -975,7 +977,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
merged_transform = std::make_shared<ReplacingSortedTransform>(
header, pipes.size(), sort_description, ctx->merging_params.is_deleted_column, ctx->merging_params.version_column,
merge_block_size_rows, merge_block_size_bytes, ctx->rows_sources_write_buf.get(), ctx->blocks_are_granules_size,
(data_settings->clean_deleted_rows != CleanDeletedRows::Never) || global_ctx->cleanup);
(data_settings->clean_deleted_rows != CleanDeletedRows::Never) || global_ctx->cleanup, &global_ctx->cleanedup_rows_count);
break;
case MergeTreeData::MergingParams::Graphite:

View File

@ -145,6 +145,7 @@ private:
bool deduplicate{false};
Names deduplicate_by_columns{};
bool cleanup{false};
size_t cleanedup_rows_count{0};
NamesAndTypesList gathering_columns{};
NamesAndTypesList merging_columns{};

View File

@ -1,696 +0,0 @@
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeRangeReader.h>
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/RequestResponse.h>
#include <Columns/FilterDescription.h>
#include <Common/ElapsedTimeProfileEventIncrement.h>
#include <Common/logger_useful.h>
#include <Common/typeid_cast.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypeArray.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <city.h>
namespace ProfileEvents
{
extern const Event WaitPrefetchTaskMicroseconds;
};
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER;
extern const int LOGICAL_ERROR;
extern const int QUERY_WAS_CANCELLED;
}
static void injectNonConstVirtualColumns(
size_t rows,
Block & block,
const Names & virtual_columns);
static void injectPartConstVirtualColumns(
size_t rows,
Block & block,
MergeTreeReadTask * task,
const DataTypePtr & partition_value_type,
const Names & virtual_columns);
IMergeTreeSelectAlgorithm::IMergeTreeSelectAlgorithm(
Block header,
const MergeTreeData & storage_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
const ExpressionActionsSettings & actions_settings_,
UInt64 max_block_size_rows_,
UInt64 preferred_block_size_bytes_,
UInt64 preferred_max_column_in_block_size_bytes_,
const MergeTreeReaderSettings & reader_settings_,
bool use_uncompressed_cache_,
const Names & virt_column_names_)
: storage(storage_)
, storage_snapshot(storage_snapshot_)
, prewhere_info(prewhere_info_)
, actions_settings(actions_settings_)
, prewhere_actions(getPrewhereActions(prewhere_info, actions_settings, reader_settings_.enable_multiple_prewhere_read_steps))
, max_block_size_rows(max_block_size_rows_)
, preferred_block_size_bytes(preferred_block_size_bytes_)
, preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_)
, reader_settings(reader_settings_)
, use_uncompressed_cache(use_uncompressed_cache_)
, virt_column_names(virt_column_names_)
, partition_value_type(storage.getPartitionValueType())
, owned_uncompressed_cache(use_uncompressed_cache ? storage.getContext()->getUncompressedCache() : nullptr)
, owned_mark_cache(storage.getContext()->getMarkCache())
{
header_without_const_virtual_columns = applyPrewhereActions(std::move(header), prewhere_info);
size_t non_const_columns_offset = header_without_const_virtual_columns.columns();
injectNonConstVirtualColumns(0, header_without_const_virtual_columns, virt_column_names);
for (size_t col_num = non_const_columns_offset; col_num < header_without_const_virtual_columns.columns(); ++col_num)
non_const_virtual_column_names.emplace_back(header_without_const_virtual_columns.getByPosition(col_num).name);
result_header = header_without_const_virtual_columns;
injectPartConstVirtualColumns(0, result_header, nullptr, partition_value_type, virt_column_names);
if (!prewhere_actions.steps.empty())
LOG_TRACE(log, "PREWHERE condition was split into {} steps: {}", prewhere_actions.steps.size(), prewhere_actions.dumpConditions());
if (prewhere_info)
LOG_TEST(log, "Original PREWHERE DAG:\n{}\nPREWHERE actions:\n{}",
(prewhere_info->prewhere_actions ? prewhere_info->prewhere_actions->dumpDAG(): std::string("<nullptr>")),
(!prewhere_actions.steps.empty() ? prewhere_actions.dump() : std::string("<nullptr>")));
}
bool tryBuildPrewhereSteps(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, PrewhereExprInfo & prewhere);
PrewhereExprInfo IMergeTreeSelectAlgorithm::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps)
{
PrewhereExprInfo prewhere_actions;
if (prewhere_info)
{
if (prewhere_info->row_level_filter)
{
PrewhereExprStep row_level_filter_step
{
.type = PrewhereExprStep::Filter,
.actions = std::make_shared<ExpressionActions>(prewhere_info->row_level_filter, actions_settings),
.filter_column_name = prewhere_info->row_level_column_name,
.remove_filter_column = true,
.need_filter = true,
.perform_alter_conversions = true,
};
prewhere_actions.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(row_level_filter_step)));
}
if (!enable_multiple_prewhere_read_steps ||
!tryBuildPrewhereSteps(prewhere_info, actions_settings, prewhere_actions))
{
PrewhereExprStep prewhere_step
{
.type = PrewhereExprStep::Filter,
.actions = std::make_shared<ExpressionActions>(prewhere_info->prewhere_actions, actions_settings),
.filter_column_name = prewhere_info->prewhere_column_name,
.remove_filter_column = prewhere_info->remove_prewhere_column,
.need_filter = prewhere_info->need_filter,
.perform_alter_conversions = true,
};
prewhere_actions.steps.emplace_back(std::make_shared<PrewhereExprStep>(std::move(prewhere_step)));
}
}
return prewhere_actions;
}
bool IMergeTreeSelectAlgorithm::getNewTask()
{
if (getNewTaskImpl())
{
finalizeNewTask();
return true;
}
return false;
}
ChunkAndProgress IMergeTreeSelectAlgorithm::read()
{
while (!is_cancelled)
{
try
{
if ((!task || task->isFinished()) && !getNewTask())
break;
}
catch (const Exception & e)
{
/// See MergeTreeBaseSelectProcessor::getTaskFromBuffer()
if (e.code() == ErrorCodes::QUERY_WAS_CANCELLED)
break;
throw;
}
auto res = readFromPart();
if (res.row_count)
{
injectVirtualColumns(res.block, res.row_count, task.get(), partition_value_type, virt_column_names);
/// Reorder the columns according to result_header
Columns ordered_columns;
ordered_columns.reserve(result_header.columns());
for (size_t i = 0; i < result_header.columns(); ++i)
{
auto name = result_header.getByPosition(i).name;
ordered_columns.push_back(res.block.getByName(name).column);
}
return ChunkAndProgress{
.chunk = Chunk(ordered_columns, res.row_count),
.num_read_rows = res.num_read_rows,
.num_read_bytes = res.num_read_bytes,
.is_finished = false};
}
else
{
return {Chunk(), res.num_read_rows, res.num_read_bytes, false};
}
}
return {Chunk(), 0, 0, true};
}
void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForCurrentTask(
const IMergeTreeReader::ValueSizeMap & value_size_map,
const ReadBufferFromFileBase::ProfileCallback & profile_callback)
{
if (!task)
throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no task");
if (task->reader.valid())
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::WaitPrefetchTaskMicroseconds);
reader = task->reader.get();
}
else
{
reader = task->data_part->getReader(
task->task_columns.columns, storage_snapshot, task->mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
task->alter_conversions, reader_settings, value_size_map, profile_callback);
}
if (!task->pre_reader_for_step.empty())
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::WaitPrefetchTaskMicroseconds);
pre_reader_for_step.clear();
for (auto & pre_reader : task->pre_reader_for_step)
pre_reader_for_step.push_back(pre_reader.get());
}
else
{
initializeMergeTreePreReadersForPart(
task->data_part, task->alter_conversions,
task->task_columns, task->mark_ranges,
value_size_map, profile_callback);
}
}
void IMergeTreeSelectAlgorithm::initializeMergeTreeReadersForPart(
const MergeTreeData::DataPartPtr & data_part,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReadTaskColumns & task_columns,
const MarkRanges & mark_ranges,
const IMergeTreeReader::ValueSizeMap & value_size_map,
const ReadBufferFromFileBase::ProfileCallback & profile_callback)
{
reader = data_part->getReader(
task_columns.columns, storage_snapshot, mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
alter_conversions, reader_settings, value_size_map, profile_callback);
initializeMergeTreePreReadersForPart(
data_part, alter_conversions, task_columns,
mark_ranges, value_size_map, profile_callback);
}
void IMergeTreeSelectAlgorithm::initializeMergeTreePreReadersForPart(
const MergeTreeData::DataPartPtr & data_part,
const AlterConversionsPtr & alter_conversions,
const MergeTreeReadTaskColumns & task_columns,
const MarkRanges & mark_ranges,
const IMergeTreeReader::ValueSizeMap & value_size_map,
const ReadBufferFromFileBase::ProfileCallback & profile_callback)
{
pre_reader_for_step.clear();
/// Add lightweight delete filtering step
if (reader_settings.apply_deleted_mask && data_part->hasLightweightDelete())
{
pre_reader_for_step.push_back(
data_part->getReader(
{LightweightDeleteDescription::FILTER_COLUMN}, storage_snapshot,
mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(),
alter_conversions, reader_settings, value_size_map, profile_callback));
}
for (const auto & pre_columns_per_step : task_columns.pre_columns)
{
pre_reader_for_step.push_back(
data_part->getReader(
pre_columns_per_step, storage_snapshot, mark_ranges,
owned_uncompressed_cache.get(), owned_mark_cache.get(),
alter_conversions, reader_settings, value_size_map, profile_callback));
}
}
void IMergeTreeSelectAlgorithm::initializeRangeReaders(MergeTreeReadTask & current_task)
{
return initializeRangeReadersImpl(
current_task.range_reader, current_task.pre_range_readers, prewhere_actions,
reader.get(), current_task.data_part->hasLightweightDelete(), reader_settings,
pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names);
}
void IMergeTreeSelectAlgorithm::initializeRangeReadersImpl(
MergeTreeRangeReader & range_reader,
std::deque<MergeTreeRangeReader> & pre_range_readers,
const PrewhereExprInfo & prewhere_actions,
IMergeTreeReader * reader,
bool has_lightweight_delete,
const MergeTreeReaderSettings & reader_settings,
const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
const PrewhereExprStep & lightweight_delete_filter_step,
const Names & non_const_virtual_column_names)
{
MergeTreeRangeReader * prev_reader = nullptr;
bool last_reader = false;
size_t pre_readers_shift = 0;
/// Add filtering step with lightweight delete mask
if (reader_settings.apply_deleted_mask && has_lightweight_delete)
{
MergeTreeRangeReader pre_range_reader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names);
pre_range_readers.push_back(std::move(pre_range_reader));
prev_reader = &pre_range_readers.back();
pre_readers_shift++;
}
if (prewhere_actions.steps.size() + pre_readers_shift != pre_reader_for_step.size())
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"PREWHERE steps count mismatch, actions: {}, readers: {}",
prewhere_actions.steps.size(), pre_reader_for_step.size());
}
for (size_t i = 0; i < prewhere_actions.steps.size(); ++i)
{
last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions.steps.size());
MergeTreeRangeReader current_reader(
pre_reader_for_step[i + pre_readers_shift].get(),
prev_reader, prewhere_actions.steps[i].get(),
last_reader, non_const_virtual_column_names);
pre_range_readers.push_back(std::move(current_reader));
prev_reader = &pre_range_readers.back();
}
if (!last_reader)
{
range_reader = MergeTreeRangeReader(reader, prev_reader, nullptr, true, non_const_virtual_column_names);
}
else
{
/// If all columns are read by pre_range_readers than move last pre_range_reader into range_reader
range_reader = std::move(pre_range_readers.back());
pre_range_readers.pop_back();
}
}
static UInt64 estimateNumRows(const MergeTreeReadTask & current_task, UInt64 current_preferred_block_size_bytes,
UInt64 current_max_block_size_rows, UInt64 current_preferred_max_column_in_block_size_bytes, double min_filtration_ratio, size_t min_marks_to_read)
{
const MergeTreeRangeReader & current_reader = current_task.range_reader;
if (!current_task.size_predictor)
return static_cast<size_t>(current_max_block_size_rows);
/// Calculates number of rows will be read using preferred_block_size_bytes.
/// Can't be less than avg_index_granularity.
size_t rows_to_read = current_task.size_predictor->estimateNumRows(current_preferred_block_size_bytes);
if (!rows_to_read)
return rows_to_read;
auto total_row_in_current_granule = current_reader.numRowsInCurrentGranule();
rows_to_read = std::max(total_row_in_current_granule, rows_to_read);
if (current_preferred_max_column_in_block_size_bytes)
{
/// Calculates number of rows will be read using preferred_max_column_in_block_size_bytes.
auto rows_to_read_for_max_size_column
= current_task.size_predictor->estimateNumRowsForMaxSizeColumn(current_preferred_max_column_in_block_size_bytes);
double filtration_ratio = std::max(min_filtration_ratio, 1.0 - current_task.size_predictor->filtered_rows_ratio);
auto rows_to_read_for_max_size_column_with_filtration
= static_cast<size_t>(rows_to_read_for_max_size_column / filtration_ratio);
/// If preferred_max_column_in_block_size_bytes is used, number of rows to read can be less than current_index_granularity.
rows_to_read = std::min(rows_to_read, rows_to_read_for_max_size_column_with_filtration);
}
auto unread_rows_in_current_granule = current_reader.numPendingRowsInCurrentGranule();
if (unread_rows_in_current_granule >= rows_to_read)
return rows_to_read;
const MergeTreeIndexGranularity & index_granularity = current_task.data_part->index_granularity;
return index_granularity.countMarksForRows(current_reader.currentMark(), rows_to_read, current_reader.numReadRowsInCurrentGranule(), min_marks_to_read);
}
IMergeTreeSelectAlgorithm::BlockAndProgress IMergeTreeSelectAlgorithm::readFromPartImpl()
{
if (task->size_predictor)
task->size_predictor->startBlock();
const UInt64 current_max_block_size_rows = max_block_size_rows;
const UInt64 current_preferred_block_size_bytes = preferred_block_size_bytes;
const UInt64 current_preferred_max_column_in_block_size_bytes = preferred_max_column_in_block_size_bytes;
const double min_filtration_ratio = 0.00001;
UInt64 recommended_rows = estimateNumRows(*task, current_preferred_block_size_bytes,
current_max_block_size_rows, current_preferred_max_column_in_block_size_bytes, min_filtration_ratio, min_marks_to_read);
UInt64 rows_to_read = std::max(static_cast<UInt64>(1), std::min(current_max_block_size_rows, recommended_rows));
auto read_result = task->range_reader.read(rows_to_read, task->mark_ranges);
/// All rows were filtered. Repeat.
if (read_result.num_rows == 0)
read_result.columns.clear();
const auto & sample_block = task->range_reader.getSampleBlock();
if (read_result.num_rows != 0 && sample_block.columns() != read_result.columns.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Inconsistent number of columns got from MergeTreeRangeReader. "
"Have {} in sample block and {} columns in list",
toString(sample_block.columns()), toString(read_result.columns.size()));
/// TODO: check columns have the same types as in header.
UInt64 num_filtered_rows = read_result.numReadRows() - read_result.num_rows;
size_t num_read_rows = read_result.numReadRows();
size_t num_read_bytes = read_result.numBytesRead();
if (task->size_predictor)
{
task->size_predictor->updateFilteredRowsRation(read_result.numReadRows(), num_filtered_rows);
if (!read_result.columns.empty())
task->size_predictor->update(sample_block, read_result.columns, read_result.num_rows);
}
Block block;
if (read_result.num_rows != 0)
block = sample_block.cloneWithColumns(read_result.columns);
BlockAndProgress res = {
.block = std::move(block),
.row_count = read_result.num_rows,
.num_read_rows = num_read_rows,
.num_read_bytes = num_read_bytes };
return res;
}
IMergeTreeSelectAlgorithm::BlockAndProgress IMergeTreeSelectAlgorithm::readFromPart()
{
if (!task->range_reader.isInitialized())
initializeRangeReaders(*task);
return readFromPartImpl();
}
namespace
{
struct VirtualColumnsInserter
{
explicit VirtualColumnsInserter(Block & block_) : block(block_) {}
bool columnExists(const String & name) const { return block.has(name); }
void insertUInt8Column(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeUInt8>(), name});
}
void insertUInt64Column(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeUInt64>(), name});
}
void insertUUIDColumn(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeUUID>(), name});
}
void insertLowCardinalityColumn(const ColumnPtr & column, const String & name)
{
block.insert({column, std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()), name});
}
void insertPartitionValueColumn(
size_t rows, const Row & partition_value, const DataTypePtr & partition_value_type, const String & name)
{
ColumnPtr column;
if (rows)
column = partition_value_type->createColumnConst(rows, Tuple(partition_value.begin(), partition_value.end()))
->convertToFullColumnIfConst();
else
column = partition_value_type->createColumn();
block.insert({column, partition_value_type, name});
}
Block & block;
};
}
/// Adds virtual columns that are not const for all rows
static void injectNonConstVirtualColumns(
size_t rows,
Block & block,
const Names & virtual_columns)
{
VirtualColumnsInserter inserter(block);
for (const auto & virtual_column_name : virtual_columns)
{
if (virtual_column_name == "_part_offset")
{
if (!rows)
{
inserter.insertUInt64Column(DataTypeUInt64().createColumn(), virtual_column_name);
}
else
{
if (!inserter.columnExists(virtual_column_name))
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Column {} must have been filled part reader",
virtual_column_name);
}
}
if (virtual_column_name == LightweightDeleteDescription::FILTER_COLUMN.name)
{
/// If _row_exists column isn't present in the part then fill it here with 1s
ColumnPtr column;
if (rows)
column = LightweightDeleteDescription::FILTER_COLUMN.type->createColumnConst(rows, 1)->convertToFullColumnIfConst();
else
column = LightweightDeleteDescription::FILTER_COLUMN.type->createColumn();
inserter.insertUInt8Column(column, virtual_column_name);
}
}
}
/// Adds virtual columns that are const for the whole part
static void injectPartConstVirtualColumns(
size_t rows,
Block & block,
MergeTreeReadTask * task,
const DataTypePtr & partition_value_type,
const Names & virtual_columns)
{
VirtualColumnsInserter inserter(block);
/// add virtual columns
/// Except _sample_factor, which is added from the outside.
if (!virtual_columns.empty())
{
if (unlikely(rows && !task))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insert virtual columns to non-empty chunk without specified task.");
const IMergeTreeDataPart * part = nullptr;
if (rows)
{
part = task->data_part.get();
if (part->isProjectionPart())
part = part->getParentPart();
}
for (const auto & virtual_column_name : virtual_columns)
{
if (virtual_column_name == "_part")
{
ColumnPtr column;
if (rows)
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}
.createColumnConst(rows, part->name)
->convertToFullColumnIfConst();
else
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumn();
inserter.insertLowCardinalityColumn(column, virtual_column_name);
}
else if (virtual_column_name == "_part_index")
{
ColumnPtr column;
if (rows)
column = DataTypeUInt64().createColumnConst(rows, task->part_index_in_query)->convertToFullColumnIfConst();
else
column = DataTypeUInt64().createColumn();
inserter.insertUInt64Column(column, virtual_column_name);
}
else if (virtual_column_name == "_part_uuid")
{
ColumnPtr column;
if (rows)
column = DataTypeUUID().createColumnConst(rows, part->uuid)->convertToFullColumnIfConst();
else
column = DataTypeUUID().createColumn();
inserter.insertUUIDColumn(column, virtual_column_name);
}
else if (virtual_column_name == "_partition_id")
{
ColumnPtr column;
if (rows)
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}
.createColumnConst(rows, part->info.partition_id)
->convertToFullColumnIfConst();
else
column = DataTypeLowCardinality{std::make_shared<DataTypeString>()}.createColumn();
inserter.insertLowCardinalityColumn(column, virtual_column_name);
}
else if (virtual_column_name == "_partition_value")
{
if (rows)
inserter.insertPartitionValueColumn(rows, part->partition.value, partition_value_type, virtual_column_name);
else
inserter.insertPartitionValueColumn(rows, {}, partition_value_type, virtual_column_name);
}
}
}
}
void IMergeTreeSelectAlgorithm::injectVirtualColumns(
Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns)
{
/// First add non-const columns that are filled by the range reader and then const columns that we will fill ourselves.
/// Note that the order is important: virtual columns filled by the range reader must go first
injectNonConstVirtualColumns(row_count, block, virtual_columns);
injectPartConstVirtualColumns(row_count, block, task, partition_value_type, virtual_columns);
}
/// Transform `block`'s header the same way PREWHERE execution would:
/// apply the row-level filter header change (and drop its filter column),
/// then the PREWHERE actions. The PREWHERE result column is either erased
/// (remove_prewhere_column) or, when rows are pre-filtered (need_filter),
/// replaced by an all-ones column of a matching native type.
/// Throws ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER if a filter column cannot be
/// used in a boolean context.
Block IMergeTreeSelectAlgorithm::applyPrewhereActions(Block block, const PrewhereInfoPtr & prewhere_info)
{
    if (!prewhere_info)
        return block;

    if (prewhere_info->row_level_filter)
    {
        block = prewhere_info->row_level_filter->updateHeader(std::move(block));

        auto & filter_column = block.getByName(prewhere_info->row_level_column_name);
        if (!filter_column.type->canBeUsedInBooleanContext())
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER, "Invalid type for filter in PREWHERE: {}",
                filter_column.type->getName());

        /// The row-level filter column is internal and never reaches the caller.
        block.erase(prewhere_info->row_level_column_name);
    }

    if (prewhere_info->prewhere_actions)
    {
        block = prewhere_info->prewhere_actions->updateHeader(std::move(block));

        auto & result_column = block.getByName(prewhere_info->prewhere_column_name);
        if (!result_column.type->canBeUsedInBooleanContext())
            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER, "Invalid type for filter in PREWHERE: {}",
                result_column.type->getName());

        if (prewhere_info->remove_prewhere_column)
        {
            block.erase(prewhere_info->prewhere_column_name);
        }
        else if (prewhere_info->need_filter)
        {
            /// Rows are already filtered, so the surviving rows all have a
            /// truthy filter value: materialize the column as constant 1.
            WhichDataType which(removeNullable(recursiveRemoveLowCardinality(result_column.type)));

            if (which.isNativeInt() || which.isNativeUInt())
                result_column.column = result_column.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst();
            else if (which.isFloat())
                result_column.column = result_column.type->createColumnConst(block.rows(), 1.0f)->convertToFullColumnIfConst();
            else
                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER,
                    "Illegal type {} of column for filter",
                    result_column.type->getName());
        }
    }

    return block;
}
/// Build the output header for this algorithm: run the PREWHERE header
/// transformation, then append all virtual columns as empty columns
/// (row_count = 0, no task).
Block IMergeTreeSelectAlgorithm::transformHeader(
    Block block, const PrewhereInfoPtr & prewhere_info, const DataTypePtr & partition_value_type, const Names & virtual_columns)
{
    Block result = applyPrewhereActions(std::move(block), prewhere_info);
    injectVirtualColumns(result, 0, nullptr, partition_value_type, virtual_columns);
    return result;
}
/// Create a block-size predictor over the union of all columns this task
/// touches: the WHERE columns plus the columns of every PREWHERE step
/// (deduplicated via a NameSet).
std::unique_ptr<MergeTreeBlockSizePredictor> IMergeTreeSelectAlgorithm::getSizePredictor(
    const MergeTreeData::DataPartPtr & data_part,
    const MergeTreeReadTaskColumns & task_columns,
    const Block & sample_block)
{
    const auto & where_column_names = task_columns.columns.getNames();
    NameSet all_column_names(where_column_names.begin(), where_column_names.end());

    for (const auto & step_columns : task_columns.pre_columns)
    {
        const auto & step_column_names = step_columns.getNames();
        all_column_names.insert(step_column_names.begin(), step_column_names.end());
    }

    return std::make_unique<MergeTreeBlockSizePredictor>(
        data_part, Names(all_column_names.begin(), all_column_names.end()), sample_block);
}
IMergeTreeSelectAlgorithm::~IMergeTreeSelectAlgorithm() = default;
}

View File

@ -1,217 +0,0 @@
#pragma once
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/RequestResponse.h>
#include <Processors/Chunk.h>
namespace DB
{
class IMergeTreeReader;
class UncompressedCache;
class MarkCache;
struct PrewhereExprInfo;
/// Result of a single read() call: the produced chunk plus progress counters.
struct ChunkAndProgress
{
    Chunk chunk;
    /// Rows/bytes read from storage while producing this chunk.
    size_t num_read_rows = 0;
    size_t num_read_bytes = 0;
    /// Explicitly indicate that we have read all data.
    /// This is needed to occasionally return empty chunk to indicate the progress while the rows are filtered out in PREWHERE.
    bool is_finished = false;
};
/// Extra context used when reading in parallel from several replicas.
struct ParallelReadingExtension
{
    MergeTreeAllRangesCallback all_callback;
    MergeTreeReadTaskCallback callback;
    size_t count_participating_replicas{0};
    size_t number_of_current_replica{0};
    /// This is needed to estimate the number of bytes
    /// between a pair of marks to perform one request
    /// over the network for a 1Gb of data.
    Names columns_to_read;
};
/// Base class for MergeTreeThreadSelectAlgorithm and MergeTreeSelectAlgorithm
class IMergeTreeSelectAlgorithm
{
public:
    IMergeTreeSelectAlgorithm(
        Block header,
        const MergeTreeData & storage_,
        const StorageSnapshotPtr & storage_snapshot_,
        const PrewhereInfoPtr & prewhere_info_,
        const ExpressionActionsSettings & actions_settings,
        UInt64 max_block_size_rows_,
        UInt64 preferred_block_size_bytes_,
        UInt64 preferred_max_column_in_block_size_bytes_,
        const MergeTreeReaderSettings & reader_settings_,
        bool use_uncompressed_cache_,
        const Names & virt_column_names_ = {});

    virtual ~IMergeTreeSelectAlgorithm();

    /// Builds the header this algorithm will produce: PREWHERE header
    /// transformation plus empty virtual columns.
    static Block transformHeader(
        Block block, const PrewhereInfoPtr & prewhere_info, const DataTypePtr & partition_value_type, const Names & virtual_columns);

    /// Predictor over the union of WHERE and PREWHERE columns; used to
    /// satisfy preferred_block_size_bytes.
    static std::unique_ptr<MergeTreeBlockSizePredictor> getSizePredictor(
        const MergeTreeData::DataPartPtr & data_part,
        const MergeTreeReadTaskColumns & task_columns,
        const Block & sample_block);

    Block getHeader() const { return result_header; }

    ChunkAndProgress read();

    void cancel() { is_cancelled = true; }

    const MergeTreeReaderSettings & getSettings() const { return reader_settings; }

    virtual std::string getName() const = 0;

    static PrewhereExprInfo getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings, bool enable_multiple_prewhere_read_steps);

protected:
    /// This struct allows to return block with no columns but with non-zero number of rows similar to Chunk
    struct BlockAndProgress
    {
        Block block;
        size_t row_count = 0;
        size_t num_read_rows = 0;
        size_t num_read_bytes = 0;
    };

    /// Creates new this->task and return a flag whether it was successful or not
    virtual bool getNewTaskImpl() = 0;
    /// Creates new readers for a task if it is needed. These methods are separate, because
    /// in case of parallel reading from replicas the whole task could be denied by a coordinator
    /// or it could be modified somehow.
    virtual void finalizeNewTask() = 0;

    size_t estimateMaxBatchSizeForHugeRanges();

    /// Closes readers and unlocks part locks
    virtual void finish() = 0;

    virtual BlockAndProgress readFromPart();

    BlockAndProgress readFromPartImpl();

    /// Used for filling header with no rows as well as block with data
    static void
    injectVirtualColumns(Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns);

protected:
    static void initializeRangeReadersImpl(
        MergeTreeRangeReader & range_reader,
        std::deque<MergeTreeRangeReader> & pre_range_readers,
        const PrewhereExprInfo & prewhere_actions,
        IMergeTreeReader * reader,
        bool has_lightweight_delete,
        const MergeTreeReaderSettings & reader_settings,
        const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
        const PrewhereExprStep & lightweight_delete_filter_step,
        const Names & non_const_virtual_column_names);

    /// Sets up data readers for each step of prewhere and where
    void initializeMergeTreeReadersForCurrentTask(
        const IMergeTreeReader::ValueSizeMap & value_size_map,
        const ReadBufferFromFileBase::ProfileCallback & profile_callback);

    void initializeMergeTreeReadersForPart(
        const MergeTreeData::DataPartPtr & data_part,
        const AlterConversionsPtr & alter_conversions,
        const MergeTreeReadTaskColumns & task_columns,
        const MarkRanges & mark_ranges,
        const IMergeTreeReader::ValueSizeMap & value_size_map,
        const ReadBufferFromFileBase::ProfileCallback & profile_callback);

    /// Sets up range readers corresponding to data readers
    void initializeRangeReaders(MergeTreeReadTask & task);

    const MergeTreeData & storage;
    StorageSnapshotPtr storage_snapshot;

    /// This step is added when the part has lightweight delete mask
    const PrewhereExprStep lightweight_delete_filter_step
    {
        .type = PrewhereExprStep::Filter,
        .actions = nullptr,
        .filter_column_name = LightweightDeleteDescription::FILTER_COLUMN.name,
        .remove_filter_column = true,
        .need_filter = true,
        .perform_alter_conversions = true,
    };

    PrewhereInfoPtr prewhere_info;
    ExpressionActionsSettings actions_settings;
    PrewhereExprInfo prewhere_actions;

    UInt64 max_block_size_rows;
    UInt64 preferred_block_size_bytes;
    UInt64 preferred_max_column_in_block_size_bytes;

    MergeTreeReaderSettings reader_settings;

    bool use_uncompressed_cache;

    Names virt_column_names;

    /// These columns will be filled by the merge tree range reader
    Names non_const_virtual_column_names;

    DataTypePtr partition_value_type;

    /// This header is used for chunks from readFromPart().
    Block header_without_const_virtual_columns;
    /// A result of getHeader(). A chunk which this header is returned from read().
    Block result_header;

    UncompressedCachePtr owned_uncompressed_cache;
    MarkCachePtr owned_mark_cache;

    using MergeTreeReaderPtr = std::unique_ptr<IMergeTreeReader>;
    MergeTreeReaderPtr reader;
    std::vector<MergeTreeReaderPtr> pre_reader_for_step;

    MergeTreeReadTaskPtr task;

    /// This setting is used in base algorithm only to additionally limit the number of granules to read.
    /// It is changed in ctor of MergeTreeThreadSelectAlgorithm.
    ///
    /// The reason why we have it here is because MergeTreeReadPool takes the full task
    /// ignoring min_marks_to_read setting in case of remote disk (see MergeTreeReadPool::getTask).
    /// In this case, we won't limit the number of rows to read based on adaptive granularity settings.
    ///
    /// Big reading tasks are better for remote disk and prefetches.
    /// So, for now it's easier to limit max_rows_to_read.
    /// Somebody need to refactor this later.
    size_t min_marks_to_read = 0;

private:
    Poco::Logger * log = &Poco::Logger::get("MergeTreeBaseSelectProcessor");

    std::atomic<bool> is_cancelled{false};

    bool getNewTask();

    /// Initialize pre readers.
    void initializeMergeTreePreReadersForPart(
        const MergeTreeData::DataPartPtr & data_part,
        const AlterConversionsPtr & alter_conversions,
        const MergeTreeReadTaskColumns & task_columns,
        const MarkRanges & mark_ranges,
        const IMergeTreeReader::ValueSizeMap & value_size_map,
        const ReadBufferFromFileBase::ProfileCallback & profile_callback);

    static Block applyPrewhereActions(Block block, const PrewhereInfoPtr & prewhere_info);
};

using MergeTreeSelectAlgorithmPtr = std::unique_ptr<IMergeTreeSelectAlgorithm>;
}

View File

@ -6,7 +6,7 @@
#include <Core/NamesAndTypes.h>
#include <Common/checkStackSize.h>
#include <Common/typeid_cast.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
#include <Columns/ColumnConst.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
@ -16,6 +16,7 @@
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
@ -136,43 +137,6 @@ NameSet injectRequiredColumns(
return injected_columns;
}
/// Member-wise initialization only. `reader_` / `pre_reader_for_step_` may hold
/// still-pending futures (prefetched readers); the destructor waits on any
/// future that is never consumed.
MergeTreeReadTask::MergeTreeReadTask(
    const DataPartPtr & data_part_,
    const AlterConversionsPtr & alter_conversions_,
    const MarkRanges & mark_ranges_,
    size_t part_index_in_query_,
    const NameSet & column_name_set_,
    const MergeTreeReadTaskColumns & task_columns_,
    MergeTreeBlockSizePredictorPtr size_predictor_,
    Priority priority_,
    std::future<MergeTreeReaderPtr> reader_,
    std::vector<std::future<MergeTreeReaderPtr>> && pre_reader_for_step_)
    : data_part{data_part_}
    , alter_conversions{alter_conversions_}
    , mark_ranges{mark_ranges_}
    , part_index_in_query{part_index_in_query_}
    , column_name_set{column_name_set_}
    , task_columns{task_columns_}
    , size_predictor{size_predictor_}
    , reader(std::move(reader_))
    , pre_reader_for_step(std::move(pre_reader_for_step_))
    , priority(priority_)
{
}
MergeTreeReadTask::~MergeTreeReadTask()
{
    /// Wait for any reader futures that were scheduled but never consumed,
    /// so in-flight reader creation does not outlive this task.
    if (reader.valid())
        reader.wait();

    for (const auto & pre_reader : pre_reader_for_step)
    {
        if (pre_reader.valid())
            pre_reader.wait();
    }
}
MergeTreeBlockSizePredictor::MergeTreeBlockSizePredictor(
const DataPartPtr & data_part_, const Names & columns, const Block & sample_block)
: data_part(data_part_)
@ -195,9 +159,8 @@ void MergeTreeBlockSizePredictor::initialize(const Block & sample_block, const C
for (size_t pos = 0; pos < num_columns; ++pos)
{
const auto & column_with_type_and_name = sample_block.getByPosition(pos);
const String & column_name = column_with_type_and_name.name;
const ColumnPtr & column_data = from_update ? columns[pos]
: column_with_type_and_name.column;
const auto & column_name = column_with_type_and_name.name;
const auto & column_data = from_update ? columns[pos] : column_with_type_and_name.column;
if (!from_update && !names_set.contains(column_name))
continue;
@ -246,7 +209,6 @@ void MergeTreeBlockSizePredictor::startBlock()
info.size_bytes = 0;
}
/// TODO: add last_read_row_in_part parameter to take into account gaps between adjacent ranges
void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Columns & columns, size_t num_rows, double decay)
{
@ -296,7 +258,7 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum
}
MergeTreeReadTaskColumns getReadTaskColumns(
MergeTreeReadTask::Columns getReadTaskColumns(
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const StorageSnapshotPtr & storage_snapshot,
const Names & required_columns,
@ -317,7 +279,7 @@ MergeTreeReadTaskColumns getReadTaskColumns(
injectRequiredColumns(
data_part_info_for_reader, storage_snapshot, with_subcolumns, column_to_read_after_prewhere);
MergeTreeReadTaskColumns result;
MergeTreeReadTask::Columns result;
auto options = GetColumnsOptions(GetColumnsOptions::All)
.withExtendedObjects()
.withSystemColumns();
@ -365,7 +327,7 @@ MergeTreeReadTaskColumns getReadTaskColumns(
if (prewhere_info)
{
auto prewhere_actions = IMergeTreeSelectAlgorithm::getPrewhereActions(
auto prewhere_actions = MergeTreeSelectProcessor::getPrewhereActions(
prewhere_info,
actions_settings,
reader_settings.enable_multiple_prewhere_read_steps);
@ -387,16 +349,4 @@ MergeTreeReadTaskColumns getReadTaskColumns(
return result;
}
std::string MergeTreeReadTaskColumns::dump() const
{
WriteBufferFromOwnString s;
for (size_t i = 0; i < pre_columns.size(); ++i)
{
s << "STEP " << i << ": " << pre_columns[i].toString() << "\n";
}
s << "COLUMNS: " << columns.toString() << "\n";
return s.str();
}
}

View File

@ -2,29 +2,15 @@
#include <optional>
#include <Core/NamesAndTypes.h>
#include <Storages/StorageSnapshot.h>
#include <Storages/MergeTree/RangesInDataPart.h>
#include <Storages/MergeTree/MergeTreeRangeReader.h>
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/AlterConversions.h>
#include <Storages/MergeTree/MergeTreeReadTask.h>
namespace DB
{
class MergeTreeData;
struct MergeTreeReadTask;
struct MergeTreeReaderSettings;
struct MergeTreeBlockSizePredictor;
class IMergeTreeDataPartInfoForReader;
using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
using MergeTreeBlockSizePredictorPtr = std::shared_ptr<MergeTreeBlockSizePredictor>;
class IMergeTreeDataPart;
using DataPartPtr = std::shared_ptr<const IMergeTreeDataPart>;
/** If some of the requested columns are not in the part,
* then find out which columns may need to be read further,
* so that you can calculate the DEFAULT expression for these columns.
@ -36,64 +22,7 @@ NameSet injectRequiredColumns(
bool with_subcolumns,
Names & columns);
/// The set of columns a read task must fetch, split by filtering stage.
struct MergeTreeReadTaskColumns
{
    /// column names to read during WHERE
    NamesAndTypesList columns;
    /// column names to read during each PREWHERE step
    std::vector<NamesAndTypesList> pre_columns;

    /// Human-readable representation for logging.
    std::string dump() const;
};
/// A batch of work for MergeTreeThreadSelectProcessor
struct MergeTreeReadTask
{
    /// Data part which should be read while performing this task
    DataPartPtr data_part;
    /// Alter conversions that should be applied on-the-fly for part.
    AlterConversionsPtr alter_conversions;
    /// Ranges to read from `data_part`.
    MarkRanges mark_ranges;
    /// For the `_part_index` virtual column
    size_t part_index_in_query;
    /// used to determine whether column should be filtered during PREWHERE or WHERE
    const NameSet & column_name_set;
    /// column names to read during PREWHERE and WHERE
    const MergeTreeReadTaskColumns & task_columns;
    /// Used to satisfy preferred_block_size_bytes limitation
    MergeTreeBlockSizePredictorPtr size_predictor;
    /// Used to save current range processing status
    MergeTreeRangeReader range_reader;
    /// Range readers for multiple filtering steps: row level security, PREWHERE etc.
    /// NOTE: we take references to elements and push_back new elements, that's why it is a deque but not a vector
    std::deque<MergeTreeRangeReader> pre_range_readers;

    using MergeTreeReaderPtr = std::unique_ptr<IMergeTreeReader>;
    /// Readers may be created asynchronously (prefetched), hence futures.
    std::future<MergeTreeReaderPtr> reader;
    std::vector<std::future<MergeTreeReaderPtr>> pre_reader_for_step;

    Priority priority;

    bool isFinished() const { return mark_ranges.empty() && range_reader.isCurrentRangeFinished(); }

    MergeTreeReadTask(
        const DataPartPtr & data_part_,
        const AlterConversionsPtr & alter_conversions_,
        const MarkRanges & mark_ranges_,
        size_t part_index_in_query_,
        const NameSet & column_name_set_,
        const MergeTreeReadTaskColumns & task_columns_,
        MergeTreeBlockSizePredictorPtr size_predictor_,
        Priority priority_ = {},
        std::future<MergeTreeReaderPtr> reader_ = {},
        std::vector<std::future<MergeTreeReaderPtr>> && pre_reader_for_step_ = {});

    ~MergeTreeReadTask();
};
MergeTreeReadTaskColumns getReadTaskColumns(
MergeTreeReadTask::Columns getReadTaskColumns(
const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
const StorageSnapshotPtr & storage_snapshot,
const Names & required_columns,
@ -119,7 +48,6 @@ struct MergeTreeBlockSizePredictor
return block_size_bytes;
}
/// Predicts what number of rows should be read to exhaust byte quota per column
inline size_t estimateNumRowsForMaxSizeColumn(size_t bytes_quota) const
{
@ -153,7 +81,6 @@ struct MergeTreeBlockSizePredictor
static double calculateDecay() { return 1. - std::pow(TARGET_WEIGHT, 1. / NUM_UPDATES_TO_TARGET_WEIGHT); }
protected:
DataPartPtr data_part;
struct ColumnInfo

View File

@ -68,7 +68,7 @@
#include <Storages/AlterCommands.h>
#include <Storages/Freeze.h>
#include <Storages/MergeTree/checkDataPart.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeDataPartCompact.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergeTreeDataPartWide.h>

View File

@ -1,76 +0,0 @@
#include <Storages/MergeTree/MergeTreeInOrderSelectProcessor.h>
#include "Storages/MergeTree/RangesInDataPart.h"
#include <Storages/MergeTree/IntersectionsIndexes.h>
namespace DB
{
namespace ErrorCodes
{
    /// Referenced below: a memory-limit failure must not mark the part as broken.
    extern const int MEMORY_LIMIT_EXCEEDED;
}
/// Create the next read task for in-order reading from a single part.
/// Returns false when there is nothing left to read. Uses a function-try-block
/// so that any failure (except a memory-limit error) reports the part as
/// possibly broken before rethrowing.
bool MergeTreeInOrderSelectAlgorithm::getNewTaskImpl()
try
{
    if (all_mark_ranges.empty())
        return false;

    if (!reader)
        initializeReaders();

    MarkRanges mark_ranges_for_task;

    if (!pool)
    {
        /// If we need to read few rows, set one range per task to reduce number of read data.
        if (has_limit_below_one_block)
        {
            mark_ranges_for_task = MarkRanges{};
            mark_ranges_for_task.emplace_front(std::move(all_mark_ranges.front()));
            all_mark_ranges.pop_front();
        }
        else
        {
            /// Otherwise take all remaining ranges in a single task.
            mark_ranges_for_task = std::move(all_mark_ranges);
            all_mark_ranges.clear();
        }
    }
    else
    {
        /// Parallel-replicas mode: ask the pool for the ranges to read.
        auto description = RangesInDataPartDescription{
            .info = data_part->info,
            /// We just ignore all the distribution done before
            /// Everything will be done on coordinator side
            .ranges = {},
        };

        mark_ranges_for_task = pool->getNewTask(description);

        if (mark_ranges_for_task.empty())
            return false;
    }

    /// Size prediction is only needed when preferred_block_size_bytes is set.
    auto size_predictor = (preferred_block_size_bytes == 0) ? nullptr
        : getSizePredictor(data_part, task_columns, sample_block);

    task = std::make_unique<MergeTreeReadTask>(
        data_part,
        alter_conversions,
        mark_ranges_for_task,
        part_index_in_query,
        column_name_set,
        task_columns,
        std::move(size_predictor));

    return true;
}
catch (...)
{
    /// Suspicion of the broken part. A part is added to the queue for verification.
    if (getCurrentExceptionCode() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
        storage.reportBrokenPart(data_part);
    throw;
}

View File

@ -1,33 +0,0 @@
#pragma once
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
#include <Common/logger_useful.h>
namespace DB
{
/// Used to read data from single part with select query in order of primary key.
/// Cares about PREWHERE, virtual columns, indexes etc.
/// To read data from multiple parts, Storage (MergeTree) creates multiple such objects.
class MergeTreeInOrderSelectAlgorithm final : public MergeTreeSelectAlgorithm
{
public:
    /// Forwards all arguments to MergeTreeSelectAlgorithm and logs what is
    /// about to be read from the part.
    template <typename... Args>
    explicit MergeTreeInOrderSelectAlgorithm(Args &&... args)
        : MergeTreeSelectAlgorithm{std::forward<Args>(args)...}
    {
        LOG_TRACE(log, "Reading {} ranges in order from part {}, approx. {} rows starting from {}",
            all_mark_ranges.size(), data_part->name, total_rows,
            data_part->index_granularity.getMarkStartingRow(all_mark_ranges.front().begin));
    }

    String getName() const override { return "MergeTreeInOrder"; }

private:
    bool getNewTaskImpl() override;
    /// Nothing to finalize: the task is fully formed by getNewTaskImpl().
    void finalizeNewTask() override {}

    Poco::Logger * log = &Poco::Logger::get("MergeTreeInOrderSelectProcessor");
};
}

View File

@ -5,7 +5,6 @@
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MarkRange.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/MergeTreePrefetchedReadPool.h>
#include <Storages/MergeTree/MergeTreeRangeReader.h>
@ -14,10 +13,10 @@
#include <Common/ElapsedTimeProfileEventIncrement.h>
#include <Common/logger_useful.h>
namespace ProfileEvents
{
extern const Event MergeTreePrefetchedReadPoolInit;
extern const Event WaitPrefetchTaskMicroseconds;
}
namespace DB
@ -29,145 +28,124 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
/// Ordering for the prefetch priority queue.
bool MergeTreePrefetchedReadPool::TaskHolder::operator<(const TaskHolder & other) const
{
    chassert(task->priority >= 0);
    chassert(other.task->priority >= 0);
    /// With default std::priority_queue, top() returns largest element.
    /// So closest to 0 will be on top with this comparator.
    return task->priority > other.task->priority; /// Less is better.
}
/// Takes ownership of the readers and schedules a prefetch for the main
/// reader and for each prewhere-step reader via the pool's prefetch threadpool.
MergeTreePrefetchedReadPool::PrefetechedReaders::PrefetechedReaders(
    MergeTreeReadTask::Readers readers_,
    Priority priority_,
    MergeTreePrefetchedReadPool & pool_)
    : is_valid(true)
    , readers(std::move(readers_))
{
    prefetch_futures.push_back(pool_.createPrefetchedFuture(readers.main.get(), priority_));

    for (const auto & reader : readers.prewhere)
        prefetch_futures.push_back(pool_.createPrefetchedFuture(reader.get(), priority_));
}
/// Block until every scheduled prefetch has finished; the waiting time is
/// accounted in the WaitPrefetchTaskMicroseconds profile event.
void MergeTreePrefetchedReadPool::PrefetechedReaders::wait()
{
    ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::WaitPrefetchTaskMicroseconds);
    for (auto & prefetch_future : prefetch_futures)
        prefetch_future.wait();
}
/// Wait for all prefetches (propagating any stored exception via future::get)
/// and hand the readers over to the caller; the holder becomes invalid.
MergeTreeReadTask::Readers MergeTreePrefetchedReadPool::PrefetechedReaders::get()
{
    SCOPE_EXIT({ is_valid = false; });
    ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::WaitPrefetchTaskMicroseconds);
    for (auto & prefetch_future : prefetch_futures)
        prefetch_future.get();

    return std::move(readers);
}
MergeTreePrefetchedReadPool::MergeTreePrefetchedReadPool(
size_t threads,
size_t sum_marks_,
size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
const ExpressionActionsSettings & actions_settings_,
const MergeTreeReaderSettings & reader_settings_,
const Names & column_names_,
const Names & virtual_column_names_,
size_t preferred_block_size_bytes_,
const MergeTreeReaderSettings & reader_settings_,
ContextPtr context_,
bool use_uncompressed_cache_,
bool is_remote_read_,
const MergeTreeSettings & storage_settings_)
: WithContext(context_)
, log(&Poco::Logger::get("MergeTreePrefetchedReadPool(" + (parts_.empty() ? "" : parts_.front().data_part->storage.getStorageID().getNameForLogs()) + ")"))
, header(storage_snapshot_->getSampleBlockForColumns(column_names_))
, mark_cache(context_->getGlobalContext()->getMarkCache().get())
, uncompressed_cache(use_uncompressed_cache_ ? context_->getGlobalContext()->getUncompressedCache().get() : nullptr)
, profile_callback([this](ReadBufferFromFileBase::ProfileInfo info_) { profileFeedback(info_); })
, index_granularity_bytes(storage_settings_.index_granularity_bytes)
, fixed_index_granularity(storage_settings_.index_granularity)
, storage_snapshot(storage_snapshot_)
, column_names(column_names_)
, virtual_column_names(virtual_column_names_)
, prewhere_info(prewhere_info_)
, actions_settings(actions_settings_)
, reader_settings(reader_settings_)
, is_remote_read(is_remote_read_)
const PoolSettings & settings_,
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
storage_snapshot_,
prewhere_info_,
actions_settings_,
reader_settings_,
column_names_,
virtual_column_names_,
settings_,
context_)
, WithContext(context_)
, prefetch_threadpool(getContext()->getPrefetchThreadpool())
, log(&Poco::Logger::get("MergeTreePrefetchedReadPool(" + (parts_.empty() ? "" : parts_.front().data_part->storage.getStorageID().getNameForLogs()) + ")"))
{
/// Tasks creation might also create a lost of readers - check they do not
/// do any time consuming operations in ctor.
ProfileEventTimeIncrement<Milliseconds> watch(ProfileEvents::MergeTreePrefetchedReadPoolInit);
parts_infos = getPartsInfos(parts_, preferred_block_size_bytes_);
threads_tasks = createThreadsTasks(threads, sum_marks_, min_marks_for_concurrent_read_);
fillPerPartStatistics();
fillPerThreadTasks(pool_settings.threads, pool_settings.sum_marks);
}
struct MergeTreePrefetchedReadPool::PartInfo
std::future<void> MergeTreePrefetchedReadPool::createPrefetchedFuture(IMergeTreeReader * reader, Priority priority)
{
MergeTreeData::DataPartPtr data_part;
AlterConversionsPtr alter_conversions;
size_t part_index_in_query;
size_t sum_marks = 0;
MarkRanges ranges;
NameSet column_name_set;
MergeTreeReadTaskColumns task_columns;
MergeTreeBlockSizePredictorPtr size_predictor;
size_t approx_size_of_mark = 0;
size_t prefetch_step_marks = 0;
size_t estimated_memory_usage_for_single_prefetch = 0;
size_t required_readers_num = 0;
};
std::future<MergeTreeReaderPtr> MergeTreePrefetchedReadPool::createPrefetchedReader(
const IMergeTreeDataPart & data_part,
const NamesAndTypesList & columns,
const AlterConversionsPtr & alter_conversions,
const MarkRanges & required_ranges,
Priority priority) const
{
auto reader = data_part.getReader(
columns, storage_snapshot, required_ranges,
uncompressed_cache, mark_cache, alter_conversions, reader_settings,
IMergeTreeReader::ValueSizeMap{}, profile_callback);
/// In order to make a prefetch we need to wait for marks to be loaded. But we just created
/// a reader (which starts loading marks in its constructor), then if we do prefetch right
/// after creating a reader, it will be very inefficient. We can do prefetch for all parts
/// only inside this MergeTreePrefetchedReadPool, where read tasks are created and distributed,
/// and we cannot block either, therefore make prefetch inside the pool and put the future
/// into the read task (MergeTreeReadTask). When a thread calls getTask(), it will wait for
/// it (if not yet ready) after getting the task.
auto task = [=, my_reader = std::move(reader), context = getContext()]() mutable -> MergeTreeReaderPtr &&
/// into the thread task. When a thread calls getTask(), it will wait for it is not ready yet.
auto task = [=, context = getContext()]() mutable
{
/// For async read metrics in system.query_log.
PrefetchIncrement watch(context->getAsyncReadCounters());
my_reader->prefetchBeginOfRange(priority);
return std::move(my_reader);
reader->prefetchBeginOfRange(priority);
};
return scheduleFromThreadPool<IMergeTreeDataPart::MergeTreeReaderPtr>(std::move(task), prefetch_threadpool, "ReadPrepare", priority);
return scheduleFromThreadPool<void>(std::move(task), prefetch_threadpool, "ReadPrepare", priority);
}
void MergeTreePrefetchedReadPool::createPrefetchedReaderForTask(MergeTreeReadTask & task) const
void MergeTreePrefetchedReadPool::createPrefetchedReadersForTask(ThreadTask & task)
{
if (task.reader.valid())
if (task.readers_future.valid())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Task already has a reader");
task.reader = createPrefetchedReader(*task.data_part, task.task_columns.columns, task.alter_conversions, task.mark_ranges, task.priority);
if (reader_settings.apply_deleted_mask && task.data_part->hasLightweightDelete())
{
auto pre_reader = createPrefetchedReader(*task.data_part, {LightweightDeleteDescription::FILTER_COLUMN}, task.alter_conversions, task.mark_ranges, task.priority);
task.pre_reader_for_step.push_back(std::move(pre_reader));
}
for (const auto & pre_columns_per_step : task.task_columns.pre_columns)
{
auto pre_reader = createPrefetchedReader(*task.data_part, pre_columns_per_step, task.alter_conversions, task.mark_ranges, task.priority);
task.pre_reader_for_step.push_back(std::move(pre_reader));
}
auto extras = getExtras();
auto readers = MergeTreeReadTask::createReaders(task.read_info, extras, task.ranges);
task.readers_future = PrefetechedReaders(std::move(readers), task.priority, *this);
}
/// Ordering for the prefetch priority queue.
bool MergeTreePrefetchedReadPool::TaskHolder::operator <(const TaskHolder & other) const
{
    chassert(task->priority >= 0);
    chassert(other.task->priority >= 0);
    /// With default std::priority_queue, top() returns largest element.
    /// So closest to 0 will be on top with this comparator.
    return task->priority > other.task->priority; /// Less is better.
}
void MergeTreePrefetchedReadPool::startPrefetches() const
void MergeTreePrefetchedReadPool::startPrefetches()
{
if (prefetch_queue.empty())
return;
[[maybe_unused]] TaskHolder prev(nullptr, 0);
[[maybe_unused]] TaskHolder prev;
[[maybe_unused]] const Priority highest_priority{reader_settings.read_settings.priority.value + 1};
assert(prefetch_queue.top().task->priority == highest_priority);
while (!prefetch_queue.empty())
{
const auto & top = prefetch_queue.top();
createPrefetchedReaderForTask(*top.task);
createPrefetchedReadersForTask(*top.task);
#ifndef NDEBUG
if (prev.task)
{
assert(top.task->priority >= highest_priority);
if (prev.thread_id == top.thread_id)
{
assert(prev.task->priority < top.task->priority);
}
}
prev = top;
#endif
@ -175,11 +153,11 @@ void MergeTreePrefetchedReadPool::startPrefetches() const
}
}
MergeTreeReadTaskPtr MergeTreePrefetchedReadPool::getTask(size_t thread)
MergeTreeReadTaskPtr MergeTreePrefetchedReadPool::getTask(size_t task_idx, MergeTreeReadTask * previous_task)
{
std::lock_guard lock(mutex);
if (threads_tasks.empty())
if (per_thread_tasks.empty())
return nullptr;
if (!started_prefetches)
@ -188,112 +166,129 @@ MergeTreeReadTaskPtr MergeTreePrefetchedReadPool::getTask(size_t thread)
startPrefetches();
}
auto it = threads_tasks.find(thread);
if (it == threads_tasks.end())
{
ThreadsTasks::iterator non_prefetched_tasks_to_steal = threads_tasks.end();
ThreadsTasks::iterator prefetched_tasks_to_steal = threads_tasks.end();
int64_t best_prefetched_task_priority = -1;
/// There is no point stealing in order (like in MergeTreeReadPool, where tasks can be stolen
/// only from the next thread). Even if we steal task from the next thread, which reads from
/// the same part as we just read, it might seem that we can reuse our own reader, do some
/// seek avoiding and it will have a good result as we avoided seek (new request). But it is
/// not so, because this next task will most likely have its own reader a prefetch already on
/// the fly. (Not to mention that in fact we cannot reuse our own reader if initially we did
/// not accounted this range into range request to object storage).
for (auto thread_tasks_it = threads_tasks.begin(); thread_tasks_it != threads_tasks.end(); ++thread_tasks_it)
{
/// Prefer to steal tasks which have an initialized reader (with prefetched data). Thus we avoid
/// losing a prefetch by creating our own reader (or resusing our own reader if the part
/// is the same as last read by this thread).
auto & thread_tasks = thread_tasks_it->second;
auto task_it = std::find_if(
thread_tasks.begin(), thread_tasks.end(),
[](const auto & task) { return task->reader.valid(); });
if (task_it == thread_tasks.end())
{
/// The follow back to non-prefetched task should lie on the thread which
/// has more tasks than others.
if (non_prefetched_tasks_to_steal == threads_tasks.end()
|| non_prefetched_tasks_to_steal->second.size() < thread_tasks.size())
non_prefetched_tasks_to_steal = thread_tasks_it;
}
/// Try to steal task with the best (lowest) priority (because it will be executed faster).
else if (prefetched_tasks_to_steal == threads_tasks.end()
|| (*task_it)->priority < best_prefetched_task_priority)
{
best_prefetched_task_priority = (*task_it)->priority;
chassert(best_prefetched_task_priority >= 0);
prefetched_tasks_to_steal = thread_tasks_it;
}
}
if (prefetched_tasks_to_steal != threads_tasks.end())
{
auto & thread_tasks = prefetched_tasks_to_steal->second;
assert(!thread_tasks.empty());
auto task_it = std::find_if(
thread_tasks.begin(), thread_tasks.end(),
[](const auto & task) { return task->reader.valid(); });
assert(task_it != thread_tasks.end());
auto task = std::move(*task_it);
thread_tasks.erase(task_it);
if (thread_tasks.empty())
threads_tasks.erase(prefetched_tasks_to_steal);
return task;
}
/// TODO: it also makes sense to first try to steal from the next thread if it has ranges
/// from the same part as current thread last read - to reuse the reader.
if (non_prefetched_tasks_to_steal != threads_tasks.end())
{
auto & thread_tasks = non_prefetched_tasks_to_steal->second;
assert(!thread_tasks.empty());
/// Get second half of the tasks.
const size_t total_tasks = thread_tasks.size();
const size_t half = total_tasks / 2;
auto half_it = thread_tasks.begin() + half;
assert(half_it != thread_tasks.end());
/// Give them to current thread, as current thread's tasks list is empty.
auto & current_thread_tasks = threads_tasks[thread];
current_thread_tasks.insert(
current_thread_tasks.end(), make_move_iterator(half_it), make_move_iterator(thread_tasks.end()));
/// Erase them from the thread from which we steal.
thread_tasks.resize(half);
if (thread_tasks.empty())
threads_tasks.erase(non_prefetched_tasks_to_steal);
auto task = std::move(current_thread_tasks.front());
current_thread_tasks.erase(current_thread_tasks.begin());
if (current_thread_tasks.empty())
threads_tasks.erase(thread);
return task;
}
return nullptr;
}
auto it = per_thread_tasks.find(task_idx);
if (it == per_thread_tasks.end())
return stealTask(task_idx, previous_task);
auto & thread_tasks = it->second;
assert(!thread_tasks.empty());
auto task = std::move(thread_tasks.front());
auto thread_task = std::move(thread_tasks.front());
thread_tasks.pop_front();
if (thread_tasks.empty())
threads_tasks.erase(it);
per_thread_tasks.erase(it);
return task;
return createTask(*thread_task, previous_task);
}
MergeTreeReadTaskPtr MergeTreePrefetchedReadPool::stealTask(size_t thread, MergeTreeReadTask * previous_task)
{
auto non_prefetched_tasks_to_steal = per_thread_tasks.end();
auto prefetched_tasks_to_steal = per_thread_tasks.end();
int64_t best_prefetched_task_priority = -1;
/// There is no point stealing in order (like in MergeTreeReadPool, where tasks can be stolen
/// only from the next thread). Even if we steal task from the next thread, which reads from
/// the same part as we just read, it might seem that we can reuse our own reader, do some
/// seek avoiding and it will have a good result as we avoided seek (new request). But it is
/// not so, because this next task will most likely have its own reader a prefetch already on
/// the fly. (Not to mention that in fact we cannot reuse our own reader if initially we did
/// not accounted this range into range request to object storage).
for (auto thread_tasks_it = per_thread_tasks.begin(); thread_tasks_it != per_thread_tasks.end(); ++thread_tasks_it)
{
/// Prefer to steal tasks which have an initialized reader (with prefetched data). Thus we avoid
/// losing a prefetch by creating our own reader (or resusing our own reader if the part
/// is the same as last read by this thread).
auto & thread_tasks = thread_tasks_it->second;
auto task_it = std::find_if(
thread_tasks.begin(), thread_tasks.end(),
[](const auto & task) { return task->readers_future.valid(); });
if (task_it == thread_tasks.end())
{
/// The follow back to non-prefetched task should lie on the thread which
/// has more tasks than others.
if (non_prefetched_tasks_to_steal == per_thread_tasks.end()
|| non_prefetched_tasks_to_steal->second.size() < thread_tasks.size())
non_prefetched_tasks_to_steal = thread_tasks_it;
}
/// Try to steal task with the best (lowest) priority (because it will be executed faster).
else if (prefetched_tasks_to_steal == per_thread_tasks.end()
|| (*task_it)->priority < best_prefetched_task_priority)
{
best_prefetched_task_priority = (*task_it)->priority;
chassert(best_prefetched_task_priority >= 0);
prefetched_tasks_to_steal = thread_tasks_it;
}
}
if (prefetched_tasks_to_steal != per_thread_tasks.end())
{
auto & thread_tasks = prefetched_tasks_to_steal->second;
assert(!thread_tasks.empty());
auto task_it = std::find_if(
thread_tasks.begin(), thread_tasks.end(),
[](const auto & task) { return task->readers_future.valid(); });
assert(task_it != thread_tasks.end());
auto thread_task = std::move(*task_it);
thread_tasks.erase(task_it);
if (thread_tasks.empty())
per_thread_tasks.erase(prefetched_tasks_to_steal);
return createTask(*thread_task, previous_task);
}
/// TODO: it also makes sense to first try to steal from the next thread if it has ranges
/// from the same part as current thread last read - to reuse the reader.
if (non_prefetched_tasks_to_steal != per_thread_tasks.end())
{
auto & thread_tasks = non_prefetched_tasks_to_steal->second;
assert(!thread_tasks.empty());
/// Get second half of the tasks.
const size_t total_tasks = thread_tasks.size();
const size_t half = total_tasks / 2;
auto half_it = thread_tasks.begin() + half;
assert(half_it != thread_tasks.end());
/// Give them to current thread, as current thread's tasks list is empty.
auto & current_thread_tasks = per_thread_tasks[thread];
current_thread_tasks.insert(
current_thread_tasks.end(), make_move_iterator(half_it), make_move_iterator(thread_tasks.end()));
/// Erase them from the thread from which we steal.
thread_tasks.resize(half);
if (thread_tasks.empty())
per_thread_tasks.erase(non_prefetched_tasks_to_steal);
auto thread_task = std::move(current_thread_tasks.front());
current_thread_tasks.erase(current_thread_tasks.begin());
if (current_thread_tasks.empty())
per_thread_tasks.erase(thread);
return createTask(*thread_task, previous_task);
}
return nullptr;
}
MergeTreeReadTaskPtr MergeTreePrefetchedReadPool::createTask(ThreadTask & task, MergeTreeReadTask * previous_task)
{
if (task.readers_future.valid())
{
auto size_predictor = task.read_info->shared_size_predictor
? std::make_unique<MergeTreeBlockSizePredictor>(*task.read_info->shared_size_predictor)
: nullptr;
return std::make_unique<MergeTreeReadTask>(task.read_info, task.readers_future.get(), task.ranges, std::move(size_predictor));
}
return MergeTreeReadPoolBase::createTask(task.read_info, task.ranges, previous_task);
}
size_t getApproximateSizeOfGranule(const IMergeTreeDataPart & part, const Names & columns_to_read)
@ -304,154 +299,111 @@ size_t getApproximateSizeOfGranule(const IMergeTreeDataPart & part, const Names
return columns_size.data_compressed / part.getMarksCount();
}
MergeTreePrefetchedReadPool::PartsInfos MergeTreePrefetchedReadPool::getPartsInfos(
const RangesInDataParts & parts, size_t preferred_block_size_bytes) const
void MergeTreePrefetchedReadPool::fillPerPartStatistics()
{
PartsInfos result;
Block sample_block = storage_snapshot->metadata->getSampleBlock();
per_part_statistics.clear();
per_part_statistics.reserve(parts_ranges.size());
const auto & settings = getContext()->getSettingsRef();
const bool predict_block_size_bytes = preferred_block_size_bytes > 0;
for (const auto & part : parts)
for (size_t i = 0; i < parts_ranges.size(); ++i)
{
auto part_info = std::make_unique<PartInfo>();
part_info->data_part = part.data_part;
part_info->alter_conversions = part.alter_conversions;
part_info->part_index_in_query = part.part_index_in_query;
part_info->ranges = part.ranges;
std::sort(part_info->ranges.begin(), part_info->ranges.end());
LoadedMergeTreeDataPartInfoForReader part_reader_info(part.data_part, part_info->alter_conversions);
auto & part_stat = per_part_statistics.emplace_back();
const auto & read_info = *per_part_infos[i];
/// Sum up total size of all mark ranges in a data part.
for (const auto & range : part.ranges)
part_info->sum_marks += range.end - range.begin;
for (const auto & range : parts_ranges[i].ranges)
part_stat.sum_marks += range.end - range.begin;
const auto & columns = settings.merge_tree_determine_task_size_by_prewhere_columns && prewhere_info
? prewhere_info->prewhere_actions->getRequiredColumnsNames()
: column_names;
part_info->approx_size_of_mark = getApproximateSizeOfGranule(*part_info->data_part, columns);
const auto task_columns = getReadTaskColumns(
part_reader_info,
storage_snapshot,
column_names,
virtual_column_names,
prewhere_info,
actions_settings,
reader_settings,
/* with_subcolumns */ true);
part_stat.approx_size_of_mark = getApproximateSizeOfGranule(*read_info.data_part, columns);
part_info->size_predictor = !predict_block_size_bytes
? nullptr
: IMergeTreeSelectAlgorithm::getSizePredictor(part.data_part, task_columns, sample_block);
/// Will be used to distinguish between PREWHERE and WHERE columns when applying filter.
const auto & required_column_names = task_columns.columns.getNames();
part_info->column_name_set = {required_column_names.begin(), required_column_names.end()};
part_info->task_columns = task_columns;
auto update_stat_for_column = [&](const auto & column_name)
{
size_t column_size = read_info.data_part->getColumnSize(column_name).data_compressed;
part_stat.estimated_memory_usage_for_single_prefetch += std::min<size_t>(column_size, settings.prefetch_buffer_size);
++part_stat.required_readers_num;
};
/// adjustBufferSize(), which is done in MergeTreeReaderStream and MergeTreeReaderCompact,
/// lowers buffer size if file size (or required read range) is less. So we know that the
/// settings.prefetch_buffer_size will be lowered there, therefore we account it here as well.
/// But here we make a more approximate lowering (because we do not have loaded marks yet),
/// while in adjustBufferSize it will be presize.
for (const auto & col : task_columns.columns)
{
const auto col_size = part.data_part->getColumnSize(col.name).data_compressed;
part_info->estimated_memory_usage_for_single_prefetch += std::min<size_t>(col_size, settings.prefetch_buffer_size);
++part_info->required_readers_num;
}
if (reader_settings.apply_deleted_mask && part.data_part->hasLightweightDelete())
{
const auto col_size = part.data_part->getColumnSize(
LightweightDeleteDescription::FILTER_COLUMN.name).data_compressed;
part_info->estimated_memory_usage_for_single_prefetch += std::min<size_t>(col_size, settings.prefetch_buffer_size);
++part_info->required_readers_num;
}
if (prewhere_info)
{
for (const auto & cols : task_columns.pre_columns)
{
for (const auto & col : cols)
{
const size_t col_size = part.data_part->getColumnSize(col.name).data_compressed;
part_info->estimated_memory_usage_for_single_prefetch += std::min<size_t>(col_size, settings.prefetch_buffer_size);
++part_info->required_readers_num;
}
}
}
for (const auto & column : read_info.task_columns.columns)
update_stat_for_column(column.name);
result.push_back(std::move(part_info));
if (reader_settings.apply_deleted_mask && read_info.data_part->hasLightweightDelete())
update_stat_for_column(LightweightDeleteDescription::FILTER_COLUMN.name);
for (const auto & pre_columns : read_info.task_columns.pre_columns)
for (const auto & column : pre_columns)
update_stat_for_column(column.name);
}
return result;
}
MergeTreePrefetchedReadPool::ThreadsTasks MergeTreePrefetchedReadPool::createThreadsTasks(
size_t threads, size_t sum_marks, size_t /* min_marks_for_concurrent_read */) const
void MergeTreePrefetchedReadPool::fillPerThreadTasks(size_t threads, size_t sum_marks)
{
if (parts_infos.empty())
return {};
if (per_part_infos.empty())
return;
const auto & context = getContext();
const auto & settings = context->getSettingsRef();
size_t total_size_approx = 0;
for (const auto & part : parts_infos)
{
total_size_approx += part->sum_marks * part->approx_size_of_mark;
}
for (const auto & part : per_part_statistics)
total_size_approx += part.sum_marks * part.approx_size_of_mark;
size_t min_prefetch_step_marks = 0;
for (const auto & part : parts_infos)
for (size_t i = 0; i < per_part_infos.size(); ++i)
{
auto & part_stat = per_part_statistics[i];
if (settings.filesystem_prefetch_step_marks)
{
part->prefetch_step_marks = settings.filesystem_prefetch_step_marks;
part_stat.prefetch_step_marks = settings.filesystem_prefetch_step_marks;
}
else if (settings.filesystem_prefetch_step_bytes && part->approx_size_of_mark)
else if (settings.filesystem_prefetch_step_bytes && part_stat.approx_size_of_mark)
{
part->prefetch_step_marks = std::max<size_t>(
1, static_cast<size_t>(std::round(static_cast<double>(settings.filesystem_prefetch_step_bytes) / part->approx_size_of_mark)));
part_stat.prefetch_step_marks = std::max<size_t>(
1, static_cast<size_t>(std::round(static_cast<double>(settings.filesystem_prefetch_step_bytes) / part_stat.approx_size_of_mark)));
}
/// This limit is important to avoid spikes of slow aws getObject requests when parallelizing within one file.
/// (The default is taken from here https://docs.aws.amazon.com/whitepapers/latest/s3-optimizing-performance-best-practices/use-byte-range-fetches.html).
if (part->approx_size_of_mark
if (part_stat.approx_size_of_mark
&& settings.filesystem_prefetch_min_bytes_for_single_read_task
&& part->approx_size_of_mark < settings.filesystem_prefetch_min_bytes_for_single_read_task)
&& part_stat.approx_size_of_mark < settings.filesystem_prefetch_min_bytes_for_single_read_task)
{
const size_t min_prefetch_step_marks_by_total_cols = static_cast<size_t>(
std::ceil(static_cast<double>(settings.filesystem_prefetch_min_bytes_for_single_read_task) / part->approx_size_of_mark));
std::ceil(static_cast<double>(settings.filesystem_prefetch_min_bytes_for_single_read_task) / part_stat.approx_size_of_mark));
/// At least one task to start working on it right now and another one to prefetch in the meantime.
const size_t new_min_prefetch_step_marks = std::min<size_t>(min_prefetch_step_marks_by_total_cols, sum_marks / threads / 2);
if (min_prefetch_step_marks < new_min_prefetch_step_marks)
{
LOG_DEBUG(log, "Increasing min prefetch step from {} to {}", min_prefetch_step_marks, new_min_prefetch_step_marks);
min_prefetch_step_marks = new_min_prefetch_step_marks;
}
}
if (part->prefetch_step_marks < min_prefetch_step_marks)
if (part_stat.prefetch_step_marks < min_prefetch_step_marks)
{
LOG_DEBUG(log, "Increasing prefetch step from {} to {}", part->prefetch_step_marks, min_prefetch_step_marks);
part->prefetch_step_marks = min_prefetch_step_marks;
LOG_DEBUG(log, "Increasing prefetch step from {} to {}", part_stat.prefetch_step_marks, min_prefetch_step_marks);
part_stat.prefetch_step_marks = min_prefetch_step_marks;
}
LOG_DEBUG(
log,
"Part: {}, sum_marks: {}, approx mark size: {}, prefetch_step_bytes: {}, prefetch_step_marks: {}, (ranges: {})",
part->data_part->name,
part->sum_marks,
part->approx_size_of_mark,
parts_ranges[i].data_part->name,
part_stat.sum_marks,
part_stat.approx_size_of_mark,
settings.filesystem_prefetch_step_bytes,
part->prefetch_step_marks,
toString(part->ranges));
part_stat.prefetch_step_marks,
toString(parts_ranges[i].ranges));
}
const size_t min_marks_per_thread = (sum_marks - 1) / threads + 1;
@ -469,13 +421,24 @@ MergeTreePrefetchedReadPool::ThreadsTasks MergeTreePrefetchedReadPool::createThr
size_t allowed_memory_usage = settings.filesystem_prefetch_max_memory_usage;
if (!allowed_memory_usage)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `filesystem_prefetch_max_memory_usage` must be non-zero");
std::optional<size_t> allowed_prefetches_num = settings.filesystem_prefetches_limit
? std::optional<size_t>(settings.filesystem_prefetches_limit)
: std::nullopt;
ThreadsTasks result_threads_tasks;
per_thread_tasks.clear();
size_t total_tasks = 0;
for (size_t i = 0, part_idx = 0; i < threads && part_idx < parts_infos.size(); ++i)
/// Make a copy to modify ranges.
std::vector<MarkRanges> per_part_ranges;
per_part_ranges.reserve(parts_ranges.size());
for (const auto & part_with_ranges : parts_ranges)
{
auto & part_ranges = per_part_ranges.emplace_back(part_with_ranges.ranges);
std::sort(part_ranges.begin(), part_ranges.end());
}
for (size_t i = 0, part_idx = 0; i < threads && part_idx < per_part_infos.size(); ++i)
{
int64_t need_marks = min_marks_per_thread;
@ -486,119 +449,102 @@ MergeTreePrefetchedReadPool::ThreadsTasks MergeTreePrefetchedReadPool::createThr
/// reads from pool which are from reader.
Priority priority{reader_settings.read_settings.priority.value + 1};
while (need_marks > 0 && part_idx < parts_infos.size())
while (need_marks > 0 && part_idx < per_part_infos.size())
{
auto & part = *parts_infos[part_idx];
size_t & marks_in_part = part.sum_marks;
auto & part_stat = per_part_statistics[part_idx];
auto & part_ranges = per_part_ranges[part_idx];
if (marks_in_part == 0)
if (part_stat.sum_marks == 0)
{
++part_idx;
continue;
}
MarkRanges ranges_to_get_from_part;
size_t marks_to_get_from_part = std::min<size_t>(need_marks, marks_in_part);
size_t marks_to_get_from_part = std::min<size_t>(need_marks, part_stat.sum_marks);
/// Split by prefetch step even if !allow_prefetch below. Because it will allow
/// to make a better distribution of tasks which did not fill into memory limit
/// or prefetches limit through tasks stealing.
if (part.prefetch_step_marks)
if (part_stat.prefetch_step_marks)
{
marks_to_get_from_part = std::min<size_t>(marks_to_get_from_part, part.prefetch_step_marks);
marks_to_get_from_part = std::min<size_t>(marks_to_get_from_part, part_stat.prefetch_step_marks);
}
if (marks_in_part == marks_to_get_from_part)
if (part_stat.sum_marks == marks_to_get_from_part)
{
ranges_to_get_from_part = part.ranges;
ranges_to_get_from_part = part_ranges;
}
else
{
if (part.sum_marks < marks_to_get_from_part)
if (part_stat.sum_marks < marks_to_get_from_part)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Requested {} marks from part {}, but part has only {} marks",
marks_to_get_from_part, part.data_part->name, part.sum_marks);
marks_to_get_from_part, per_part_infos[part_idx]->data_part->name, part_stat.sum_marks);
}
size_t get_marks_num = marks_to_get_from_part;
while (get_marks_num > 0)
size_t num_marks_to_get = marks_to_get_from_part;
while (num_marks_to_get > 0)
{
MarkRange & range = part.ranges.front();
MarkRange & range = part_ranges.front();
const size_t marks_in_range = range.end - range.begin;
const size_t marks_to_get_from_range = std::min(marks_in_range, get_marks_num);
get_marks_num -= marks_to_get_from_range;
const size_t marks_to_get_from_range = std::min(marks_in_range, num_marks_to_get);
num_marks_to_get -= marks_to_get_from_range;
ranges_to_get_from_part.emplace_back(range.begin, range.begin + marks_to_get_from_range);
range.begin += marks_to_get_from_range;
if (range.begin == range.end)
{
part.ranges.pop_front();
part_ranges.pop_front();
}
else if (!get_marks_num && part.prefetch_step_marks && range.end - range.begin < part.prefetch_step_marks)
else if (!num_marks_to_get && part_stat.prefetch_step_marks && range.end - range.begin < part_stat.prefetch_step_marks)
{
/// We already have `get_marks_num` marks, but current mark range has
/// We already have `num_marks_to_get` marks, but current mark range has
/// less than `prefetch_step_marks` marks, then add them too.
ranges_to_get_from_part.emplace_back(range.begin, range.end);
marks_to_get_from_part += range.end - range.begin;
part.ranges.pop_front();
part_ranges.pop_front();
}
}
}
need_marks -= marks_to_get_from_part;
sum_marks -= marks_to_get_from_part;
marks_in_part -= marks_to_get_from_part;
auto curr_task_size_predictor = !part.size_predictor ? nullptr
: std::make_unique<MergeTreeBlockSizePredictor>(*part.size_predictor); /// make a copy
auto read_task = std::make_unique<MergeTreeReadTask>(
part.data_part,
part.alter_conversions,
ranges_to_get_from_part,
part.part_index_in_query,
part.column_name_set,
part.task_columns,
std::move(curr_task_size_predictor));
read_task->priority = priority;
part_stat.sum_marks -= marks_to_get_from_part;
bool allow_prefetch = false;
if (allowed_memory_usage
&& (allowed_prefetches_num.has_value() == false || allowed_prefetches_num.value() > 0))
&& (!allowed_prefetches_num.has_value() || allowed_prefetches_num.value() > 0))
{
allow_prefetch = part.estimated_memory_usage_for_single_prefetch <= allowed_memory_usage
&& (allowed_prefetches_num.has_value() == false
|| part.required_readers_num <= allowed_prefetches_num.value());
allow_prefetch = part_stat.estimated_memory_usage_for_single_prefetch <= allowed_memory_usage
&& (!allowed_prefetches_num.has_value() || part_stat.required_readers_num <= allowed_prefetches_num.value());
if (allow_prefetch)
{
allowed_memory_usage -= part.estimated_memory_usage_for_single_prefetch;
allowed_memory_usage -= part_stat.estimated_memory_usage_for_single_prefetch;
if (allowed_prefetches_num.has_value())
*allowed_prefetches_num -= part.required_readers_num;
*allowed_prefetches_num -= part_stat.required_readers_num;
}
}
auto thread_task = std::make_unique<ThreadTask>(per_part_infos[part_idx], ranges_to_get_from_part, priority);
if (allow_prefetch)
{
prefetch_queue.emplace(TaskHolder(read_task.get(), i));
}
++priority.value;
prefetch_queue.emplace(TaskHolder{thread_task.get(), i});
result_threads_tasks[i].push_back(std::move(read_task));
per_thread_tasks[i].push_back(std::move(thread_task));
++priority.value;
++total_tasks;
}
}
LOG_TEST(log, "Result tasks {} for {} threads: {}", total_tasks, threads, dumpTasks(result_threads_tasks));
return result_threads_tasks;
LOG_TEST(log, "Result tasks {} for {} threads: {}", total_tasks, threads, dumpTasks(per_thread_tasks));
}
std::string MergeTreePrefetchedReadPool::dumpTasks(const ThreadsTasks & tasks)
std::string MergeTreePrefetchedReadPool::dumpTasks(const TasksPerThread & tasks)
{
WriteBufferFromOwnString result;
for (const auto & [thread_id, thread_tasks] : tasks)
@ -611,9 +557,9 @@ std::string MergeTreePrefetchedReadPool::dumpTasks(const ThreadsTasks & tasks)
{
result << '\t';
result << ++no << ": ";
result << "reader: " << task->reader.valid() << ", ";
result << "part: " << task->data_part->name << ", ";
result << "ranges: " << toString(task->mark_ranges);
result << "reader future: " << task->readers_future.valid() << ", ";
result << "part: " << task->read_info->data_part->name << ", ";
result << "ranges: " << toString(task->ranges);
}
}
}

View File

@ -1,9 +1,6 @@
#pragma once
#include <Storages/MergeTree/MergeTreeReadPoolBase.h>
#include <Common/ThreadPool_fwd.h>
#include <Interpreters/ExpressionActionsSettings.h>
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <Storages/MergeTree/MergeTreeIOSettings.h>
#include <IO/AsyncReadCounters.h>
#include <boost/heap/priority_queue.hpp>
#include <queue>
@ -16,99 +13,110 @@ using MergeTreeReaderPtr = std::unique_ptr<IMergeTreeReader>;
/// A class which is responsible for creating read tasks
/// which are later taken by readers via getTask method.
/// Does prefetching for the read tasks it creates.
class MergeTreePrefetchedReadPool : public IMergeTreeReadPool, private WithContext
class MergeTreePrefetchedReadPool : public MergeTreeReadPoolBase, private WithContext
{
public:
MergeTreePrefetchedReadPool(
size_t threads,
size_t sum_marks_,
size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
const ExpressionActionsSettings & actions_settings_,
const MergeTreeReaderSettings & reader_settings_,
const Names & column_names_,
const Names & virtual_column_names_,
size_t preferred_block_size_bytes_,
const MergeTreeReaderSettings & reader_settings_,
ContextPtr context_,
bool use_uncompressed_cache_,
bool is_remote_read_,
const MergeTreeSettings & storage_settings_);
const PoolSettings & settings_,
const ContextPtr & context_);
MergeTreeReadTaskPtr getTask(size_t thread) override;
String getName() const override { return "PrefetchedReadPool"; }
bool preservesOrderOfRanges() const override { return false; }
MergeTreeReadTaskPtr getTask(size_t task_idx, MergeTreeReadTask * previous_task) override;
void profileFeedback(ReadBufferFromFileBase::ProfileInfo) override {}
Block getHeader() const override { return header; }
static bool checkReadMethodAllowed(LocalFSReadMethod method);
static bool checkReadMethodAllowed(RemoteFSReadMethod method);
private:
struct PartInfo;
using PartInfoPtr = std::shared_ptr<PartInfo>;
using PartsInfos = std::vector<PartInfoPtr>;
using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
using ThreadTasks = std::deque<MergeTreeReadTaskPtr>;
using ThreadsTasks = std::map<size_t, ThreadTasks>;
struct PartStatistic
{
size_t sum_marks = 0;
std::future<MergeTreeReaderPtr> createPrefetchedReader(
const IMergeTreeDataPart & data_part,
const NamesAndTypesList & columns,
const AlterConversionsPtr & alter_conversions,
const MarkRanges & required_ranges,
Priority priority) const;
size_t approx_size_of_mark = 0;
size_t prefetch_step_marks = 0;
void createPrefetchedReaderForTask(MergeTreeReadTask & task) const;
size_t estimated_memory_usage_for_single_prefetch = 0;
size_t required_readers_num = 0;
};
size_t getApproxSizeOfGranule(const IMergeTreeDataPart & part) const;
class PrefetechedReaders
{
public:
PrefetechedReaders() = default;
PrefetechedReaders(MergeTreeReadTask::Readers readers_, Priority priority_, MergeTreePrefetchedReadPool & pool_);
PartsInfos getPartsInfos(const RangesInDataParts & parts, size_t preferred_block_size_bytes) const;
void wait();
MergeTreeReadTask::Readers get();
bool valid() const { return is_valid; }
ThreadsTasks createThreadsTasks(
size_t threads,
size_t sum_marks,
size_t min_marks_for_concurrent_read) const;
private:
bool is_valid = false;
MergeTreeReadTask::Readers readers;
std::vector<std::future<void>> prefetch_futures;
};
void startPrefetches() const;
struct ThreadTask
{
using InfoPtr = MergeTreeReadTask::InfoPtr;
static std::string dumpTasks(const ThreadsTasks & tasks);
ThreadTask(InfoPtr read_info_, MarkRanges ranges_, Priority priority_)
: read_info(std::move(read_info_)), ranges(std::move(ranges_)), priority(priority_)
{
}
Poco::Logger * log;
~ThreadTask()
{
if (readers_future.valid())
readers_future.wait();
}
Block header;
MarkCache * mark_cache;
UncompressedCache * uncompressed_cache;
ReadBufferFromFileBase::ProfileCallback profile_callback;
size_t index_granularity_bytes;
size_t fixed_index_granularity;
StorageSnapshotPtr storage_snapshot;
const Names column_names;
const Names virtual_column_names;
PrewhereInfoPtr prewhere_info;
const ExpressionActionsSettings actions_settings;
const MergeTreeReaderSettings reader_settings;
RangesInDataParts parts_ranges;
[[ maybe_unused ]] const bool is_remote_read;
ThreadPool & prefetch_threadpool;
PartsInfos parts_infos;
ThreadsTasks threads_tasks;
std::mutex mutex;
InfoPtr read_info;
MarkRanges ranges;
Priority priority;
PrefetechedReaders readers_future;
};
struct TaskHolder
{
explicit TaskHolder(MergeTreeReadTask * task_, size_t thread_id_) : task(task_), thread_id(thread_id_) {}
MergeTreeReadTask * task;
size_t thread_id;
bool operator <(const TaskHolder & other) const;
ThreadTask * task = nullptr;
size_t thread_id = 0;
bool operator<(const TaskHolder & other) const;
};
mutable std::priority_queue<TaskHolder> prefetch_queue; /// the smallest on top
using ThreadTaskPtr = std::unique_ptr<ThreadTask>;
using ThreadTasks = std::deque<ThreadTaskPtr>;
using TasksPerThread = std::map<size_t, ThreadTasks>;
using PartStatistics = std::vector<PartStatistic>;
void fillPerPartStatistics();
void fillPerThreadTasks(size_t threads, size_t sum_marks);
void startPrefetches();
void createPrefetchedReadersForTask(ThreadTask & task);
std::future<void> createPrefetchedFuture(IMergeTreeReader * reader, Priority priority);
MergeTreeReadTaskPtr stealTask(size_t thread, MergeTreeReadTask * previous_task);
MergeTreeReadTaskPtr createTask(ThreadTask & thread_task, MergeTreeReadTask * previous_task);
static std::string dumpTasks(const TasksPerThread & tasks);
mutable std::mutex mutex;
ThreadPool & prefetch_threadpool;
PartStatistics per_part_statistics;
TasksPerThread per_thread_tasks;
std::priority_queue<TaskHolder> prefetch_queue; /// the smallest on top
bool started_prefetches = false;
Poco::Logger * log;
/// A struct which allows to track max number of tasks which were in the
/// threadpool simultaneously (similar to CurrentMetrics, but the result

View File

@ -1,5 +1,6 @@
#include "Storages/MergeTree/MergeTreeBlockReadUtils.h"
#include "Storages/MergeTree/MergeTreeReadTask.h"
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeReadPool.h>
#include <base/range.h>
#include <Interpreters/Context_fwd.h>
@ -32,9 +33,6 @@ size_t getApproxSizeOfPart(const IMergeTreeDataPart & part, const Names & column
}
MergeTreeReadPool::MergeTreeReadPool(
size_t threads_,
size_t sum_marks_,
size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -42,29 +40,22 @@ MergeTreeReadPool::MergeTreeReadPool(
const MergeTreeReaderSettings & reader_settings_,
const Names & column_names_,
const Names & virtual_column_names_,
ContextPtr context_,
bool do_not_steal_tasks_)
: storage_snapshot(storage_snapshot_)
, column_names(column_names_)
, virtual_column_names(virtual_column_names_)
, min_marks_for_concurrent_read(min_marks_for_concurrent_read_)
, prewhere_info(prewhere_info_)
, actions_settings(actions_settings_)
, reader_settings(reader_settings_)
, parts_ranges(std::move(parts_))
, predict_block_size_bytes(context_->getSettingsRef().preferred_block_size_bytes > 0)
, do_not_steal_tasks(do_not_steal_tasks_)
, merge_tree_use_const_size_tasks_for_remote_reading(context_->getSettingsRef().merge_tree_use_const_size_tasks_for_remote_reading)
const PoolSettings & settings_,
const ContextPtr & context_)
: MergeTreeReadPoolBase(
std::move(parts_),
storage_snapshot_,
prewhere_info_,
actions_settings_,
reader_settings_,
column_names_,
virtual_column_names_,
settings_,
context_)
, min_marks_for_concurrent_read(pool_settings.min_marks_for_concurrent_read)
, backoff_settings{context_->getSettingsRef()}
, backoff_state{threads_}
, backoff_state{pool_settings.threads}
{
/// parts don't contain duplicate MergeTreeDataPart's.
const auto per_part_sum_marks = fillPerPartInfo(
parts_ranges, storage_snapshot, is_part_on_remote_disk,
predict_block_size_bytes,
column_names, virtual_column_names, prewhere_info,
actions_settings, reader_settings, per_part_params);
if (std::ranges::count(is_part_on_remote_disk, true))
{
const auto & settings = context_->getSettingsRef();
@ -76,6 +67,7 @@ MergeTreeReadPool::MergeTreeReadPool(
const auto & columns = settings.merge_tree_determine_task_size_by_prewhere_columns && prewhere_info
? prewhere_info->prewhere_actions->getRequiredColumnsNames()
: column_names_;
total_compressed_bytes += getApproxSizeOfPart(*part.data_part, columns);
total_marks += part.getMarksCount();
}
@ -85,118 +77,60 @@ MergeTreeReadPool::MergeTreeReadPool(
const auto min_bytes_per_task = settings.merge_tree_min_bytes_per_task_for_remote_reading;
const auto avg_mark_bytes = std::max<size_t>(total_compressed_bytes / total_marks, 1);
/// We're taking min here because number of tasks shouldn't be too low - it will make task stealing impossible.
const auto heuristic_min_marks = std::min<size_t>(total_marks / threads_, min_bytes_per_task / avg_mark_bytes);
const auto heuristic_min_marks = std::min<size_t>(total_marks / pool_settings.threads, min_bytes_per_task / avg_mark_bytes);
if (heuristic_min_marks > min_marks_for_concurrent_read)
{
min_marks_for_concurrent_read = heuristic_min_marks;
}
}
}
fillPerThreadInfo(threads_, sum_marks_, per_part_sum_marks, parts_ranges);
fillPerThreadInfo(pool_settings.threads, pool_settings.sum_marks);
}
std::vector<size_t> MergeTreeReadPool::fillPerPartInfo(
const RangesInDataParts & parts,
const StorageSnapshotPtr & storage_snapshot,
std::vector<bool> & is_part_on_remote_disk,
bool & predict_block_size_bytes,
const Names & column_names,
const Names & virtual_column_names,
const PrewhereInfoPtr & prewhere_info,
const ExpressionActionsSettings & actions_settings,
const MergeTreeReaderSettings & reader_settings,
std::vector<MergeTreeReadPool::PerPartParams> & per_part_params)
{
std::vector<size_t> per_part_sum_marks;
Block sample_block = storage_snapshot->metadata->getSampleBlock();
is_part_on_remote_disk.resize(parts.size());
for (const auto i : collections::range(0, parts.size()))
{
const auto & part = parts[i];
#ifndef NDEBUG
assertSortedAndNonIntersecting(part.ranges);
#endif
bool part_on_remote_disk = part.data_part->isStoredOnRemoteDisk();
is_part_on_remote_disk[i] = part_on_remote_disk;
/// Read marks for every data part.
size_t sum_marks = 0;
for (const auto & range : part.ranges)
sum_marks += range.end - range.begin;
per_part_sum_marks.push_back(sum_marks);
auto & per_part = per_part_params.emplace_back();
per_part.data_part = part;
LoadedMergeTreeDataPartInfoForReader part_info(part.data_part, part.alter_conversions);
auto task_columns = getReadTaskColumns(
part_info, storage_snapshot, column_names, virtual_column_names,
prewhere_info, actions_settings,
reader_settings, /*with_subcolumns=*/ true);
auto size_predictor = !predict_block_size_bytes ? nullptr
: IMergeTreeSelectAlgorithm::getSizePredictor(part.data_part, task_columns, sample_block);
per_part.size_predictor = std::move(size_predictor);
/// will be used to distinguish between PREWHERE and WHERE columns when applying filter
const auto & required_column_names = task_columns.columns.getNames();
per_part.column_name_set = {required_column_names.begin(), required_column_names.end()};
per_part.task_columns = std::move(task_columns);
}
return per_part_sum_marks;
}
MergeTreeReadTaskPtr MergeTreeReadPool::getTask(size_t thread)
MergeTreeReadTaskPtr MergeTreeReadPool::getTask(size_t task_idx, MergeTreeReadTask * previous_task)
{
const std::lock_guard lock{mutex};
/// If number of threads was lowered due to backoff, then will assign work only for maximum 'backoff_state.current_threads' threads.
if (thread >= backoff_state.current_threads)
if (task_idx >= backoff_state.current_threads)
return nullptr;
if (remaining_thread_tasks.empty())
return nullptr;
const auto tasks_remaining_for_this_thread = !threads_tasks[thread].sum_marks_in_parts.empty();
if (!tasks_remaining_for_this_thread && do_not_steal_tasks)
const auto tasks_remaining_for_this_thread = !threads_tasks[task_idx].sum_marks_in_parts.empty();
if (!tasks_remaining_for_this_thread && pool_settings.do_not_steal_tasks)
return nullptr;
/// Steal task if nothing to do and it's not prohibited
auto thread_idx = thread;
auto thread_idx = task_idx;
if (!tasks_remaining_for_this_thread)
{
auto it = remaining_thread_tasks.lower_bound(backoff_state.current_threads);
// Grab the entire tasks of a thread which is killed by backoff
if (it != remaining_thread_tasks.end())
{
threads_tasks[thread] = std::move(threads_tasks[*it]);
threads_tasks[task_idx] = std::move(threads_tasks[*it]);
remaining_thread_tasks.erase(it);
remaining_thread_tasks.insert(thread);
remaining_thread_tasks.insert(task_idx);
}
else // Try steal tasks from the next thread
{
it = remaining_thread_tasks.upper_bound(thread);
it = remaining_thread_tasks.upper_bound(task_idx);
if (it == remaining_thread_tasks.end())
it = remaining_thread_tasks.begin();
thread_idx = *it;
}
}
auto & thread_tasks = threads_tasks[thread_idx];
auto & thread_task = thread_tasks.parts_and_ranges.back();
const auto part_idx = thread_task.part_idx;
auto & part = per_part_params[part_idx].data_part;
const auto part_idx = thread_task.part_idx;
auto & marks_in_part = thread_tasks.sum_marks_in_parts.back();
size_t need_marks;
if (is_part_on_remote_disk[part_idx] && !merge_tree_use_const_size_tasks_for_remote_reading)
if (is_part_on_remote_disk[part_idx] && !pool_settings.use_const_size_tasks_for_remote_reading)
need_marks = marks_in_part;
else /// Get whole part to read if it is small enough.
need_marks = std::min(marks_in_part, min_marks_for_concurrent_read);
@ -239,28 +173,12 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(size_t thread)
}
}
const auto & per_part = per_part_params[part_idx];
auto curr_task_size_predictor = !per_part.size_predictor ? nullptr
: std::make_unique<MergeTreeBlockSizePredictor>(*per_part.size_predictor); /// make a copy
return std::make_unique<MergeTreeReadTask>(
part.data_part,
part.alter_conversions,
ranges_to_get_from_part,
part.part_index_in_query,
per_part.column_name_set,
per_part.task_columns,
std::move(curr_task_size_predictor));
}
Block MergeTreeReadPool::getHeader() const
{
return storage_snapshot->getSampleBlockForColumns(column_names);
return createTask(per_part_infos[part_idx], std::move(ranges_to_get_from_part), previous_task);
}
void MergeTreeReadPool::profileFeedback(ReadBufferFromFileBase::ProfileInfo info)
{
if (backoff_settings.min_read_latency_ms == 0 || do_not_steal_tasks)
if (backoff_settings.min_read_latency_ms == 0 || pool_settings.do_not_steal_tasks)
return;
if (info.nanoseconds < backoff_settings.min_read_latency_ms * 1000000)
@ -297,13 +215,10 @@ void MergeTreeReadPool::profileFeedback(ReadBufferFromFileBase::ProfileInfo info
LOG_DEBUG(log, "Will lower number of threads to {}", backoff_state.current_threads);
}
void MergeTreeReadPool::fillPerThreadInfo(
size_t threads, size_t sum_marks, std::vector<size_t> per_part_sum_marks,
const RangesInDataParts & parts)
void MergeTreeReadPool::fillPerThreadInfo(size_t threads, size_t sum_marks)
{
threads_tasks.resize(threads);
if (parts.empty())
if (parts_ranges.empty())
return;
struct PartInfo
@ -316,17 +231,19 @@ void MergeTreeReadPool::fillPerThreadInfo(
using PartsInfo = std::vector<PartInfo>;
std::queue<PartsInfo> parts_queue;
auto per_part_sum_marks = getPerPartSumMarks();
{
/// Group parts by disk name.
/// We try minimize the number of threads concurrently read from the same disk.
/// It improves the performance for JBOD architecture.
std::map<String, std::vector<PartInfo>> parts_per_disk;
for (size_t i = 0; i < parts.size(); ++i)
for (size_t i = 0; i < parts_ranges.size(); ++i)
{
PartInfo part_info{parts[i], per_part_sum_marks[i], i};
if (parts[i].data_part->isStoredOnDisk())
parts_per_disk[parts[i].data_part->getDataPartStorage().getDiskName()].push_back(std::move(part_info));
PartInfo part_info{parts_ranges[i], per_part_sum_marks[i], i};
if (parts_ranges[i].data_part->isStoredOnDisk())
parts_per_disk[parts_ranges[i].data_part->getDataPartStorage().getDiskName()].push_back(std::move(part_info));
else
parts_per_disk[""].push_back(std::move(part_info));
}
@ -346,7 +263,7 @@ void MergeTreeReadPool::fillPerThreadInfo(
while (need_marks > 0 && !parts_queue.empty())
{
auto & current_parts = parts_queue.front();
RangesInDataPart & part = current_parts.back().part;
auto & part_with_ranges = current_parts.back().part;
size_t & marks_in_part = current_parts.back().sum_marks;
const auto part_idx = current_parts.back().part_idx;
@ -366,7 +283,7 @@ void MergeTreeReadPool::fillPerThreadInfo(
/// Get whole part to read if it is small enough.
if (marks_in_part <= need_marks)
{
ranges_to_get_from_part = part.ranges;
ranges_to_get_from_part = part_with_ranges.ranges;
marks_in_ranges = marks_in_part;
need_marks -= marks_in_part;
@ -379,10 +296,10 @@ void MergeTreeReadPool::fillPerThreadInfo(
/// Loop through part ranges.
while (need_marks > 0)
{
if (part.ranges.empty())
if (part_with_ranges.ranges.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected end of ranges while spreading marks among threads");
MarkRange & range = part.ranges.front();
MarkRange & range = part_with_ranges.ranges.front();
const size_t marks_in_range = range.end - range.begin;
const size_t marks_to_get_from_range = std::min(marks_in_range, need_marks);
@ -392,11 +309,11 @@ void MergeTreeReadPool::fillPerThreadInfo(
marks_in_part -= marks_to_get_from_range;
need_marks -= marks_to_get_from_range;
if (range.begin == range.end)
part.ranges.pop_front();
part_with_ranges.ranges.pop_front();
}
}
threads_tasks[i].parts_and_ranges.push_back({ part_idx, ranges_to_get_from_part });
threads_tasks[i].parts_and_ranges.push_back({part_idx, ranges_to_get_from_part});
threads_tasks[i].sum_marks_in_parts.push_back(marks_in_ranges);
if (marks_in_ranges != 0)
remaining_thread_tasks.insert(i);
@ -415,152 +332,4 @@ void MergeTreeReadPool::fillPerThreadInfo(
}
}
MergeTreeReadPoolParallelReplicas::~MergeTreeReadPoolParallelReplicas() = default;
Block MergeTreeReadPoolParallelReplicas::getHeader() const
{
return storage_snapshot->getSampleBlockForColumns(extension.columns_to_read);
}
MergeTreeReadTaskPtr MergeTreeReadPoolParallelReplicas::getTask(size_t thread)
{
/// This parameter is needed only to satisfy the interface
UNUSED(thread);
std::lock_guard lock(mutex);
if (no_more_tasks_available)
return nullptr;
if (buffered_ranges.empty())
{
auto result = extension.callback(ParallelReadRequest(
CoordinationMode::Default,
extension.number_of_current_replica,
min_marks_for_concurrent_read * threads,
/// For Default coordination mode we don't need to pass part names.
RangesInDataPartsDescription{}));
if (!result || result->finish)
{
no_more_tasks_available = true;
return nullptr;
}
buffered_ranges = std::move(result->description);
}
if (buffered_ranges.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "No tasks to read. This is a bug");
auto & current_task = buffered_ranges.front();
RangesInDataPart part;
size_t part_idx = 0;
for (size_t index = 0; index < per_part_params.size(); ++index)
{
auto & other_part = per_part_params[index];
if (other_part.data_part.data_part->info == current_task.info)
{
part = other_part.data_part;
part_idx = index;
break;
}
}
MarkRanges ranges_to_read;
size_t current_sum_marks = 0;
while (current_sum_marks < min_marks_for_concurrent_read && !current_task.ranges.empty())
{
auto diff = min_marks_for_concurrent_read - current_sum_marks;
auto range = current_task.ranges.front();
if (range.getNumberOfMarks() > diff)
{
auto new_range = range;
new_range.end = range.begin + diff;
range.begin += diff;
current_task.ranges.front() = range;
ranges_to_read.push_back(new_range);
current_sum_marks += new_range.getNumberOfMarks();
continue;
}
ranges_to_read.push_back(range);
current_sum_marks += range.getNumberOfMarks();
current_task.ranges.pop_front();
}
if (current_task.ranges.empty())
buffered_ranges.pop_front();
const auto & per_part = per_part_params[part_idx];
auto curr_task_size_predictor
= !per_part.size_predictor ? nullptr : std::make_unique<MergeTreeBlockSizePredictor>(*per_part.size_predictor); /// make a copy
return std::make_unique<MergeTreeReadTask>(
part.data_part,
part.alter_conversions,
ranges_to_read,
part.part_index_in_query,
per_part.column_name_set,
per_part.task_columns,
std::move(curr_task_size_predictor));
}
MarkRanges MergeTreeInOrderReadPoolParallelReplicas::getNewTask(RangesInDataPartDescription description)
{
std::lock_guard lock(mutex);
auto get_from_buffer = [&]() -> std::optional<MarkRanges>
{
for (auto & desc : buffered_tasks)
{
if (desc.info == description.info && !desc.ranges.empty())
{
auto result = std::move(desc.ranges);
desc.ranges = MarkRanges{};
return result;
}
}
return std::nullopt;
};
if (auto result = get_from_buffer(); result)
return result.value();
if (no_more_tasks)
return {};
auto response = extension.callback(ParallelReadRequest(
mode,
extension.number_of_current_replica,
min_marks_for_concurrent_read * request.size(),
request
));
if (!response || response->description.empty() || response->finish)
{
no_more_tasks = true;
return {};
}
/// Fill the buffer
for (size_t i = 0; i < request.size(); ++i)
{
auto & new_ranges = response->description[i].ranges;
auto & old_ranges = buffered_tasks[i].ranges;
std::move(new_ranges.begin(), new_ranges.end(), std::back_inserter(old_ranges));
}
if (auto result = get_from_buffer(); result)
return result.value();
return {};
}
}

View File

@ -1,38 +1,30 @@
#pragma once
#include <Storages/MergeTree/MergeTreeReadPoolBase.h>
#include <Core/NamesAndTypes.h>
#include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/RangesInDataPart.h>
#include <Storages/MergeTree/RequestResponse.h>
#include <Storages/MergeTree/IMergeTreeReadPool.h>
#include <Storages/SelectQueryInfo.h>
#include <Storages/MergeTree/AlterConversions.h>
#include <Interpreters/Context_fwd.h>
#include <mutex>
namespace DB
{
/** Provides read tasks for MergeTreeThreadSelectProcessor`s in fine-grained batches, allowing for more
/** Provides read tasks for MergeTreeThreadSelectAlgorithm in fine-grained batches, allowing for more
* uniform distribution of work amongst multiple threads. All parts and their ranges are divided into `threads`
* workloads with at most `sum_marks / threads` marks. Then, threads are performing reads from these workloads
* in "sequential" manner, requesting work in small batches. As soon as some thread has exhausted
* it's workload, it either is signaled that no more work is available (`do_not_steal_tasks == false`) or
* continues taking small batches from other threads' workloads (`do_not_steal_tasks == true`).
*/
class MergeTreeReadPool : public IMergeTreeReadPool
class MergeTreeReadPool : public MergeTreeReadPoolBase
{
public:
struct BackoffSettings;
MergeTreeReadPool(
size_t threads_,
size_t sum_marks_,
size_t min_marks_for_concurrent_read_,
RangesInDataParts && parts_,
const StorageSnapshotPtr & storage_snapshot_,
const PrewhereInfoPtr & prewhere_info_,
@ -40,12 +32,14 @@ public:
const MergeTreeReaderSettings & reader_settings_,
const Names & column_names_,
const Names & virtual_column_names_,
ContextPtr context_,
bool do_not_steal_tasks_ = false);
const PoolSettings & settings_,
const ContextPtr & context_);
~MergeTreeReadPool() override = default;
MergeTreeReadTaskPtr getTask(size_t thread) override;
String getName() const override { return "ReadPool"; }
bool preservesOrderOfRanges() const override { return false; }
MergeTreeReadTaskPtr getTask(size_t task_idx, MergeTreeReadTask * previous_task) override;
/** Each worker could call this method and pass information about read performance.
* If read performance is too low, pool could decide to lower number of threads: do not assign more tasks to several threads.
@ -53,8 +47,6 @@ public:
*/
void profileFeedback(ReadBufferFromFileBase::ProfileInfo info) override;
Block getHeader() const override;
/** Pull could dynamically lower (backoff) number of threads, if read operation are too slow.
* Settings for that backoff.
*/
@ -82,50 +74,12 @@ public:
BackoffSettings() : min_read_latency_ms(0) {}
};
struct PerPartParams
{
MergeTreeReadTaskColumns task_columns;
NameSet column_name_set;
MergeTreeBlockSizePredictorPtr size_predictor;
RangesInDataPart data_part;
};
static std::vector<size_t> fillPerPartInfo(
const RangesInDataParts & parts,
const StorageSnapshotPtr & storage_snapshot,
std::vector<bool> & is_part_on_remote_disk,
bool & predict_block_size_bytes,
const Names & column_names,
const Names & virtual_column_names,
const PrewhereInfoPtr & prewhere_info,
const ExpressionActionsSettings & actions_settings_,
const MergeTreeReaderSettings & reader_settings_,
std::vector<MergeTreeReadPool::PerPartParams> & per_part_params);
private:
void fillPerThreadInfo(
size_t threads, size_t sum_marks, std::vector<size_t> per_part_sum_marks,
const RangesInDataParts & parts);
/// Initialized in constructor
StorageSnapshotPtr storage_snapshot;
const Names column_names;
const Names virtual_column_names;
size_t min_marks_for_concurrent_read{0};
PrewhereInfoPtr prewhere_info;
ExpressionActionsSettings actions_settings;
MergeTreeReaderSettings reader_settings;
RangesInDataParts parts_ranges;
bool predict_block_size_bytes;
bool do_not_steal_tasks;
bool merge_tree_use_const_size_tasks_for_remote_reading = false;
std::vector<PerPartParams> per_part_params;
std::vector<bool> is_part_on_remote_disk;
BackoffSettings backoff_settings;
void fillPerThreadInfo(size_t threads, size_t sum_marks);
mutable std::mutex mutex;
size_t min_marks_for_concurrent_read = 0;
/// State to track numbers of slow reads.
struct BackoffState
{
@ -135,16 +89,10 @@ private:
explicit BackoffState(size_t threads) : current_threads(threads) {}
};
const BackoffSettings backoff_settings;
BackoffState backoff_state;
struct Part
{
MergeTreeData::DataPartPtr data_part;
size_t part_index_in_query;
};
std::vector<Part> parts_with_idx;
struct ThreadTask
{
struct PartIndexAndRange
@ -159,123 +107,8 @@ private:
std::vector<ThreadTask> threads_tasks;
std::set<size_t> remaining_thread_tasks;
Poco::Logger * log = &Poco::Logger::get("MergeTreeReadPool");
};
class MergeTreeReadPoolParallelReplicas : public IMergeTreeReadPool
{
public:
MergeTreeReadPoolParallelReplicas(
StorageSnapshotPtr storage_snapshot_,
size_t threads_,
ParallelReadingExtension extension_,
const RangesInDataParts & parts_,
const PrewhereInfoPtr & prewhere_info_,
const ExpressionActionsSettings & actions_settings_,
const MergeTreeReaderSettings & reader_settings_,
const Names & column_names_,
const Names & virtual_column_names_,
size_t min_marks_for_concurrent_read_)
: extension(extension_)
, threads(threads_)
, prewhere_info(prewhere_info_)
, actions_settings(actions_settings_)
, reader_settings(reader_settings_)
, storage_snapshot(storage_snapshot_)
, min_marks_for_concurrent_read(min_marks_for_concurrent_read_)
, column_names(column_names_)
, virtual_column_names(virtual_column_names_)
, parts_ranges(std::move(parts_))
{
MergeTreeReadPool::fillPerPartInfo(
parts_ranges, storage_snapshot, is_part_on_remote_disk,
predict_block_size_bytes, column_names, virtual_column_names, prewhere_info,
actions_settings, reader_settings, per_part_params);
extension.all_callback(InitialAllRangesAnnouncement(
CoordinationMode::Default,
parts_ranges.getDescriptions(),
extension.number_of_current_replica
));
}
~MergeTreeReadPoolParallelReplicas() override;
Block getHeader() const override;
MergeTreeReadTaskPtr getTask(size_t thread) override;
void profileFeedback(ReadBufferFromFileBase::ProfileInfo) override {}
private:
ParallelReadingExtension extension;
RangesInDataPartsDescription buffered_ranges;
size_t threads;
bool no_more_tasks_available{false};
Poco::Logger * log = &Poco::Logger::get("MergeTreeReadPoolParallelReplicas");
std::mutex mutex;
PrewhereInfoPtr prewhere_info;
ExpressionActionsSettings actions_settings;
MergeTreeReaderSettings reader_settings;
StorageSnapshotPtr storage_snapshot;
size_t min_marks_for_concurrent_read;
const Names column_names;
const Names virtual_column_names;
RangesInDataParts parts_ranges;
bool predict_block_size_bytes = false;
std::vector<bool> is_part_on_remote_disk;
std::vector<MergeTreeReadPool::PerPartParams> per_part_params;
};
using MergeTreeReadPoolParallelReplicasPtr = std::shared_ptr<MergeTreeReadPoolParallelReplicas>;
class MergeTreeInOrderReadPoolParallelReplicas : private boost::noncopyable
{
public:
MergeTreeInOrderReadPoolParallelReplicas(
RangesInDataParts parts_,
ParallelReadingExtension extension_,
CoordinationMode mode_,
size_t min_marks_for_concurrent_read_)
: parts_ranges(parts_)
, extension(extension_)
, mode(mode_)
, min_marks_for_concurrent_read(min_marks_for_concurrent_read_)
{
for (const auto & part : parts_ranges)
request.push_back({part.data_part->info, MarkRanges{}});
for (const auto & part : parts_ranges)
buffered_tasks.push_back({part.data_part->info, MarkRanges{}});
extension.all_callback(InitialAllRangesAnnouncement(
mode,
parts_ranges.getDescriptions(),
extension.number_of_current_replica
));
}
MarkRanges getNewTask(RangesInDataPartDescription description);
RangesInDataParts parts_ranges;
ParallelReadingExtension extension;
CoordinationMode mode;
size_t min_marks_for_concurrent_read{0};
bool no_more_tasks{false};
RangesInDataPartsDescription request;
RangesInDataPartsDescription buffered_tasks;
std::mutex mutex;
};
using MergeTreeInOrderReadPoolParallelReplicasPtr = std::shared_ptr<MergeTreeInOrderReadPoolParallelReplicas>;
}

View File

@ -0,0 +1,149 @@
#include <Storages/MergeTree/MergeTreeReadPoolBase.h>
#include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
namespace DB
{
/// Base class constructor for all MergeTree read pools: takes ownership of the
/// parts to read and snapshots the per-query reading parameters shared by the
/// concrete pool implementations, then precomputes per-part task info.
MergeTreeReadPoolBase::MergeTreeReadPoolBase(
    RangesInDataParts && parts_,
    const StorageSnapshotPtr & storage_snapshot_,
    const PrewhereInfoPtr & prewhere_info_,
    const ExpressionActionsSettings & actions_settings_,
    const MergeTreeReaderSettings & reader_settings_,
    const Names & column_names_,
    const Names & virtual_column_names_,
    const PoolSettings & pool_settings_,
    const ContextPtr & context_)
    : parts_ranges(std::move(parts_))
    , storage_snapshot(storage_snapshot_)
    , prewhere_info(prewhere_info_)
    , actions_settings(actions_settings_)
    , reader_settings(reader_settings_)
    , column_names(column_names_)
    , virtual_column_names(virtual_column_names_)
    , pool_settings(pool_settings_)
    /// Caches come from the global context, so they are shared between queries.
    , owned_mark_cache(context_->getGlobalContext()->getMarkCache())
    /// Uncompressed cache is optional and controlled by a per-pool setting.
    , owned_uncompressed_cache(pool_settings_.use_uncompressed_cache ? context_->getGlobalContext()->getUncompressedCache() : nullptr)
    , header(storage_snapshot->getSampleBlockForColumns(column_names))
    /// Routes reader profile info back into profileFeedback() of the concrete pool.
    , profile_callback([this](ReadBufferFromFileBase::ProfileInfo info_) { profileFeedback(info_); })
{
    fillPerPartInfos();
}
/// Precomputes a MergeTreeReadTask::Info for every entry of `parts_ranges`:
/// the columns to read (including PREWHERE step columns), an optional shared
/// block-size predictor, and whether the part is stored on a remote disk.
/// Results are stored in `per_part_infos` / `is_part_on_remote_disk`,
/// index-aligned with `parts_ranges`.
void MergeTreeReadPoolBase::fillPerPartInfos()
{
    per_part_infos.reserve(parts_ranges.size());
    is_part_on_remote_disk.reserve(parts_ranges.size());

    auto sample_block = storage_snapshot->metadata->getSampleBlock();

    for (const auto & part_with_ranges : parts_ranges)
    {
#ifndef NDEBUG
        assertSortedAndNonIntersecting(part_with_ranges.ranges);
#endif

        MergeTreeReadTask::Info read_task_info;

        read_task_info.data_part = part_with_ranges.data_part;
        read_task_info.part_index_in_query = part_with_ranges.part_index_in_query;
        read_task_info.alter_conversions = part_with_ranges.alter_conversions;

        LoadedMergeTreeDataPartInfoForReader part_info(part_with_ranges.data_part, part_with_ranges.alter_conversions);

        read_task_info.task_columns = getReadTaskColumns(
            part_info, storage_snapshot, column_names, virtual_column_names,
            prewhere_info, actions_settings,
            reader_settings, /*with_subcolumns=*/ true);

        /// A size predictor is built only when adaptive block sizing is requested.
        if (pool_settings.preferred_block_size_bytes > 0)
        {
            /// Collect the union of the result columns and all PREWHERE step
            /// columns: the predictor must account for everything that is read.
            const auto & result_column_names = read_task_info.task_columns.columns.getNames();
            NameSet all_column_names(result_column_names.begin(), result_column_names.end());

            for (const auto & pre_columns_per_step : read_task_info.task_columns.pre_columns)
            {
                const auto & pre_column_names = pre_columns_per_step.getNames();
                all_column_names.insert(pre_column_names.begin(), pre_column_names.end());
            }

            /// The shared predictor is copied for every task (see createTask).
            read_task_info.shared_size_predictor = std::make_unique<MergeTreeBlockSizePredictor>(
                read_task_info.data_part,
                Names(all_column_names.begin(), all_column_names.end()),
                sample_block);
        }

        is_part_on_remote_disk.push_back(part_with_ranges.data_part->isStoredOnRemoteDisk());
        per_part_infos.push_back(std::make_shared<MergeTreeReadTask::Info>(std::move(read_task_info)));
    }
}
/// Returns, for every part in `parts_ranges` (same order), the total number
/// of marks covered by that part's mark ranges.
std::vector<size_t> MergeTreeReadPoolBase::getPerPartSumMarks() const
{
    std::vector<size_t> marks_per_part(parts_ranges.size(), 0);

    for (size_t i = 0; i < parts_ranges.size(); ++i)
        for (const auto & mark_range : parts_ranges[i].ranges)
            marks_per_part[i] += mark_range.end - mark_range.begin;

    return marks_per_part;
}
/// Builds a read task over `ranges` for the part described by `read_info`.
/// Readers are reused from `previous_task` when it was reading the same part
/// (projection parts are identified by their parent part's name), which avoids
/// re-opening files; otherwise new readers are created, carrying over the
/// average value size hints accumulated by the previous task's main reader.
MergeTreeReadTaskPtr MergeTreeReadPoolBase::createTask(
    MergeTreeReadTask::InfoPtr read_info,
    MarkRanges ranges,
    MergeTreeReadTask * previous_task) const
{
    /// Copy the shared predictor: each task mutates its own instance.
    auto task_size_predictor = read_info->shared_size_predictor
        ? std::make_unique<MergeTreeBlockSizePredictor>(*read_info->shared_size_predictor)
        : nullptr; /// make a copy

    auto get_part_name = [](const auto & task_info) -> const String &
    {
        /// For a projection part, compare by the parent part's name.
        return task_info.data_part->isProjectionPart() ? task_info.data_part->getParentPart()->name : task_info.data_part->name;
    };

    auto extras = getExtras();
    MergeTreeReadTask::Readers task_readers;

    if (!previous_task)
    {
        /// First task for this stream: create readers from scratch.
        task_readers = MergeTreeReadTask::createReaders(read_info, extras, ranges);
    }
    else if (get_part_name(previous_task->getInfo()) != get_part_name(*read_info))
    {
        /// Switching to another part: new readers, but keep the accumulated
        /// value-size hints to improve block size estimation.
        extras.value_size_map = previous_task->getMainReader().getAvgValueSizeHints();
        task_readers = MergeTreeReadTask::createReaders(read_info, extras, ranges);
    }
    else
    {
        /// Same part as the previous task: reuse the already opened readers.
        task_readers = previous_task->releaseReaders();
    }

    return std::make_unique<MergeTreeReadTask>(
        read_info,
        std::move(task_readers),
        std::move(ranges),
        std::move(task_size_predictor));
}
/// Packs the pool-level shared resources needed to construct part readers.
MergeTreeReadTask::Extras MergeTreeReadPoolBase::getExtras() const
{
    MergeTreeReadTask::Extras extras
    {
        .uncompressed_cache = owned_uncompressed_cache.get(),
        .mark_cache = owned_mark_cache.get(),
        .reader_settings = reader_settings,
        .storage_snapshot = storage_snapshot,
        .profile_callback = profile_callback,
    };

    return extras;
}
}

View File

@ -0,0 +1,67 @@
#pragma once
#include <Storages/MergeTree/MergeTreeReadTask.h>
#include <Storages/MergeTree/RangesInDataPart.h>
#include <Storages/MergeTree/IMergeTreeReadPool.h>
namespace DB
{
/// Common base for MergeTree read pools. Owns the parts/ranges to read and the
/// shared reading parameters, precomputes per-part task info in the constructor,
/// and provides createTask() used by concrete pools to hand out read tasks.
class MergeTreeReadPoolBase : public IMergeTreeReadPool
{
public:
    /// Knobs shared by all pool implementations.
    struct PoolSettings
    {
        /// Number of reading streams the pool serves.
        size_t threads = 0;
        /// Total number of marks to read across all parts.
        size_t sum_marks = 0;
        /// Lower bound (in marks) on the size of a single task.
        size_t min_marks_for_concurrent_read = 0;
        /// Non-zero enables adaptive block sizing via a size predictor.
        size_t preferred_block_size_bytes = 0;
        bool use_uncompressed_cache = false;
        /// Forbid a stream from taking tasks assigned to other streams.
        bool do_not_steal_tasks = false;
        bool use_const_size_tasks_for_remote_reading = false;
    };

    MergeTreeReadPoolBase(
        RangesInDataParts && parts_,
        const StorageSnapshotPtr & storage_snapshot_,
        const PrewhereInfoPtr & prewhere_info_,
        const ExpressionActionsSettings & actions_settings_,
        const MergeTreeReaderSettings & reader_settings_,
        const Names & column_names_,
        const Names & virtual_column_names_,
        const PoolSettings & settings_,
        const ContextPtr & context_);

    Block getHeader() const override { return header; }

protected:
    /// Initialized in constructor
    const RangesInDataParts parts_ranges;
    const StorageSnapshotPtr storage_snapshot;
    const PrewhereInfoPtr prewhere_info;
    const ExpressionActionsSettings actions_settings;
    const MergeTreeReaderSettings reader_settings;
    const Names column_names;
    const Names virtual_column_names;
    const PoolSettings pool_settings;
    /// Caches are pinned here so they outlive all tasks created by the pool.
    const MarkCachePtr owned_mark_cache;
    const UncompressedCachePtr owned_uncompressed_cache;
    const Block header;

    /// Fills per_part_infos / is_part_on_remote_disk from parts_ranges.
    void fillPerPartInfos();
    /// Total marks per part, index-aligned with parts_ranges.
    std::vector<size_t> getPerPartSumMarks() const;

    /// Creates a task over `ranges`, reusing readers of `previous_task`
    /// when it was reading the same part.
    MergeTreeReadTaskPtr createTask(
        MergeTreeReadTask::InfoPtr read_info,
        MarkRanges ranges,
        MergeTreeReadTask * previous_task) const;

    MergeTreeReadTask::Extras getExtras() const;

    /// Index-aligned with parts_ranges; filled by fillPerPartInfos().
    std::vector<MergeTreeReadTask::InfoPtr> per_part_infos;
    std::vector<bool> is_part_on_remote_disk;

    /// Forwards reader profile info to profileFeedback() of the concrete pool.
    ReadBufferFromFileBase::ProfileCallback profile_callback;
}

View File

@ -0,0 +1,73 @@
#include <Storages/MergeTree/MergeTreeReadPoolInOrder.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// In-order pool: assigns one part per task index and keeps the per-part mark
/// ranges in their original order so the output preserves the sorting key order.
MergeTreeReadPoolInOrder::MergeTreeReadPoolInOrder(
    bool has_limit_below_one_block_,
    MergeTreeReadType read_type_,
    RangesInDataParts parts_,
    const StorageSnapshotPtr & storage_snapshot_,
    const PrewhereInfoPtr & prewhere_info_,
    const ExpressionActionsSettings & actions_settings_,
    const MergeTreeReaderSettings & reader_settings_,
    const Names & column_names_,
    const Names & virtual_column_names_,
    const PoolSettings & settings_,
    const ContextPtr & context_)
    : MergeTreeReadPoolBase(
        std::move(parts_),
        storage_snapshot_,
        prewhere_info_,
        actions_settings_,
        reader_settings_,
        column_names_,
        virtual_column_names_,
        settings_,
        context_)
    , has_limit_below_one_block(has_limit_below_one_block_)
    , read_type(read_type_)
{
    /// Copy the mark ranges per part; getTask() consumes them incrementally.
    per_part_mark_ranges.reserve(parts_ranges.size());
    for (const auto & part_with_ranges : parts_ranges)
        per_part_mark_ranges.push_back(part_with_ranges.ranges);
}
/// Hands out the next chunk of mark ranges for the part assigned to `task_idx`.
/// Returns nullptr once that part's ranges are exhausted.
MergeTreeReadTaskPtr MergeTreeReadPoolInOrder::getTask(size_t task_idx, MergeTreeReadTask * previous_task)
{
    /// In-order pools map task indexes one-to-one onto parts.
    if (task_idx >= per_part_infos.size())
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Requested task with idx {}, but there are only {} parts",
            task_idx, per_part_infos.size());

    auto & remaining_ranges = per_part_mark_ranges[task_idx];
    if (remaining_ranges.empty())
        return nullptr;

    MarkRanges task_ranges;

    if (read_type == MergeTreeReadType::InReverseOrder)
    {
        /// Read ranges from right to left.
        task_ranges.emplace_back(std::move(remaining_ranges.back()));
        remaining_ranges.pop_back();
    }
    else if (has_limit_below_one_block)
    {
        /// If we need to read few rows, set one range per task to reduce number of read data.
        task_ranges.emplace_back(std::move(remaining_ranges.front()));
        remaining_ranges.pop_front();
    }
    else
    {
        /// Otherwise hand out everything left for this part in a single task.
        task_ranges = std::move(remaining_ranges);
    }

    return createTask(per_part_infos[task_idx], std::move(task_ranges), previous_task);
}
}

View File

@ -0,0 +1,35 @@
#pragma once
#include <Storages/MergeTree/MergeTreeReadPoolBase.h>
namespace DB
{
/// Read pool that preserves the order of mark ranges within each part.
/// Each task index corresponds to exactly one part; ranges of that part are
/// handed out front-to-back (or back-to-front for reverse-order reading).
class MergeTreeReadPoolInOrder : public MergeTreeReadPoolBase
{
public:
    MergeTreeReadPoolInOrder(
        bool has_limit_below_one_block_,
        MergeTreeReadType read_type_,
        RangesInDataParts parts_,
        const StorageSnapshotPtr & storage_snapshot_,
        const PrewhereInfoPtr & prewhere_info_,
        const ExpressionActionsSettings & actions_settings_,
        const MergeTreeReaderSettings & reader_settings_,
        const Names & column_names_,
        const Names & virtual_column_names_,
        const PoolSettings & settings_,
        const ContextPtr & context_);

    String getName() const override { return "ReadPoolInOrder"; }
    bool preservesOrderOfRanges() const override { return true; }
    MergeTreeReadTaskPtr getTask(size_t task_idx, MergeTreeReadTask * previous_task) override;
    /// No backoff: in-order reading keeps a fixed task-to-part assignment.
    void profileFeedback(ReadBufferFromFileBase::ProfileInfo) override {}

private:
    /// When reading few rows, each task gets a single range to limit read data.
    const bool has_limit_below_one_block;
    const MergeTreeReadType read_type;
    /// Remaining ranges per part; consumed by getTask(), index-aligned with parts_ranges.
    std::vector<MarkRanges> per_part_mark_ranges;
};
}

View File

@ -0,0 +1,110 @@
#include <Storages/MergeTree/MergeTreeReadPoolParallelReplicas.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// Parallel-replicas pool: work distribution is driven by a coordinator.
/// On construction the replica announces all its parts/ranges; getTask()
/// later requests concrete assignments from the coordinator.
MergeTreeReadPoolParallelReplicas::MergeTreeReadPoolParallelReplicas(
    ParallelReadingExtension extension_,
    RangesInDataParts && parts_,
    const StorageSnapshotPtr & storage_snapshot_,
    const PrewhereInfoPtr & prewhere_info_,
    const ExpressionActionsSettings & actions_settings_,
    const MergeTreeReaderSettings & reader_settings_,
    const Names & column_names_,
    const Names & virtual_column_names_,
    const PoolSettings & settings_,
    const ContextPtr & context_)
    : MergeTreeReadPoolBase(
        std::move(parts_),
        storage_snapshot_,
        prewhere_info_,
        actions_settings_,
        reader_settings_,
        column_names_,
        virtual_column_names_,
        settings_,
        context_)
    , extension(std::move(extension_))
{
    /// Tell the coordinator the full set of parts/ranges this replica can read
    /// before any assignment is requested.
    extension.all_callback(InitialAllRangesAnnouncement(
        CoordinationMode::Default,
        parts_ranges.getDescriptions(),
        extension.number_of_current_replica
    ));
}
/// Returns the next read task assigned by the parallel-replicas coordinator,
/// or nullptr when the coordinator reports that no work is left.
/// `task_idx` is unused: assignment is fully coordinator-driven.
/// Thread-safe: the whole body runs under `mutex`.
MergeTreeReadTaskPtr MergeTreeReadPoolParallelReplicas::getTask(size_t /*task_idx*/, MergeTreeReadTask * previous_task)
{
    std::lock_guard lock(mutex);

    if (no_more_tasks_available)
        return nullptr;

    /// Refill the local buffer of assigned ranges from the coordinator.
    if (buffered_ranges.empty())
    {
        auto result = extension.callback(ParallelReadRequest(
            CoordinationMode::Default,
            extension.number_of_current_replica,
            pool_settings.min_marks_for_concurrent_read * pool_settings.threads,
            /// For Default coordination mode we don't need to pass part names.
            RangesInDataPartsDescription{}));

        if (!result || result->finish)
        {
            no_more_tasks_available = true;
            return nullptr;
        }

        buffered_ranges = std::move(result->description);
    }

    if (buffered_ranges.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "No tasks to read. This is a bug");

    auto & current_task = buffered_ranges.front();

    /// Find the local part matching the coordinator's assignment.
    /// Previously a failed lookup silently fell through with part_idx == 0,
    /// which would read ranges against the wrong part; fail loudly instead.
    size_t part_idx = 0;
    bool part_found = false;
    for (size_t index = 0; index < per_part_infos.size(); ++index)
    {
        if (per_part_infos[index]->data_part->info == current_task.info)
        {
            part_idx = index;
            part_found = true;
            break;
        }
    }

    if (!part_found)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Got task for a part which is unknown to this replica. This is a bug");

    /// Cut off up to min_marks_for_concurrent_read marks from the front of the
    /// assignment, splitting the first range if it is larger than needed.
    MarkRanges ranges_to_read;
    size_t current_sum_marks = 0;
    while (current_sum_marks < pool_settings.min_marks_for_concurrent_read && !current_task.ranges.empty())
    {
        auto diff = pool_settings.min_marks_for_concurrent_read - current_sum_marks;
        auto range = current_task.ranges.front();
        if (range.getNumberOfMarks() > diff)
        {
            /// Take only `diff` marks from this range and leave the rest buffered.
            auto new_range = range;
            new_range.end = range.begin + diff;
            range.begin += diff;

            current_task.ranges.front() = range;
            ranges_to_read.push_back(new_range);
            current_sum_marks += new_range.getNumberOfMarks();
            continue;
        }

        ranges_to_read.push_back(range);
        current_sum_marks += range.getNumberOfMarks();
        current_task.ranges.pop_front();
    }

    if (current_task.ranges.empty())
        buffered_ranges.pop_front();

    return createTask(per_part_infos[part_idx], std::move(ranges_to_read), previous_task);
}
}

View File

@ -0,0 +1,39 @@
#pragma once
#include <Storages/MergeTree/MergeTreeReadPoolBase.h>
#include <Storages/MergeTree/MergeTreeSelectProcessor.h>
namespace DB
{
/// Read pool used when a query is executed cooperatively by several replicas.
/// Task assignment is delegated to a coordinator via `extension` callbacks;
/// getTask() buffers the assigned ranges and slices them into tasks.
class MergeTreeReadPoolParallelReplicas : public MergeTreeReadPoolBase
{
public:
    MergeTreeReadPoolParallelReplicas(
        ParallelReadingExtension extension_,
        RangesInDataParts && parts_,
        const StorageSnapshotPtr & storage_snapshot_,
        const PrewhereInfoPtr & prewhere_info_,
        const ExpressionActionsSettings & actions_settings_,
        const MergeTreeReaderSettings & reader_settings_,
        const Names & column_names_,
        const Names & virtual_column_names_,
        const PoolSettings & settings_,
        const ContextPtr & context_);

    ~MergeTreeReadPoolParallelReplicas() override = default;

    String getName() const override { return "ReadPoolParallelReplicas"; }
    bool preservesOrderOfRanges() const override { return false; }
    /// No backoff: thread count is decided by the coordinator, not locally.
    void profileFeedback(ReadBufferFromFileBase::ProfileInfo) override {}
    MergeTreeReadTaskPtr getTask(size_t task_idx, MergeTreeReadTask * previous_task) override;

private:
    /// Guards buffered_ranges / no_more_tasks_available in getTask().
    mutable std::mutex mutex;

    const ParallelReadingExtension extension;
    /// Ranges already assigned by the coordinator but not yet turned into tasks.
    RangesInDataPartsDescription buffered_ranges;
    /// Set once the coordinator signals that the replica's work is finished.
    bool no_more_tasks_available{false};
    Poco::Logger * log = &Poco::Logger::get("MergeTreeReadPoolParallelReplicas");
};
}

Some files were not shown because too many files have changed in this diff Show More