Merge branch 'master' into keeper-with-disks

This commit is contained in:
Antonio Andelic 2023-05-26 10:48:41 +00:00
commit 8a2a63a7bd
131 changed files with 1940 additions and 584 deletions

View File

@ -23,7 +23,6 @@ curl https://clickhouse.com/ | sh
## Upcoming Events
* [**v23.5 Release Webinar**](https://clickhouse.com/company/events/v23-5-release-webinar?utm_source=github&utm_medium=social&utm_campaign=release-webinar-2023-05) - May 31 - 23.5 is rapidly approaching. Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release.
* [**ClickHouse Meetup in Berlin**](https://www.meetup.com/clickhouse-berlin-user-group/events/292892466) - May 16
* [**ClickHouse Meetup in Barcelona**](https://www.meetup.com/clickhouse-barcelona-user-group/events/292892669) - May 25
* [**ClickHouse Meetup in London**](https://www.meetup.com/clickhouse-london-user-group/events/292892824) - May 25
* [**ClickHouse Meetup in San Francisco**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/293426725/) - Jun 7

2
contrib/aws vendored

@ -1 +1 @@
Subproject commit ecccfc026a42b30023289410a67024d561f4bf3e
Subproject commit ca02358dcc7ce3ab733dd4cbcc32734eecfa4ee3

2
contrib/aws-c-auth vendored

@ -1 +1 @@
Subproject commit 30df6c407e2df43bd244e2c34c9b4a4b87372bfb
Subproject commit 97133a2b5dbca1ccdf88cd6f44f39d0531d27d12

@ -1 +1 @@
Subproject commit 324fd1d973ccb25c813aa747bf1759cfde5121c5
Subproject commit 45dcb2849c891dba2100b270b4676765c92949ff

@ -1 +1 @@
Subproject commit 39bfa94a14b7126bf0c1330286ef8db452d87e66
Subproject commit 2f9b60c42f90840ec11822acda3d8cdfa97a773d

2
contrib/aws-c-http vendored

@ -1 +1 @@
Subproject commit 2c5a2a7d5556600b9782ffa6c9d7e09964df1abc
Subproject commit dd34461987947672444d0bc872c5a733dfdb9711

2
contrib/aws-c-io vendored

@ -1 +1 @@
Subproject commit 5d32c453560d0823df521a686bf7fbacde7f9be3
Subproject commit d58ed4f272b1cb4f89ac9196526ceebe5f2b0d89

2
contrib/aws-c-mqtt vendored

@ -1 +1 @@
Subproject commit 882c689561a3db1466330ccfe3b63637e0a575d3
Subproject commit 33c3455cec82b16feb940e12006cefd7b3ef4194

2
contrib/aws-c-s3 vendored

@ -1 +1 @@
Subproject commit a41255ece72a7c887bba7f9d998ca3e14f4c8a1b
Subproject commit d7bfe602d6925948f1fff95784e3613cca6a3900

@ -1 +1 @@
Subproject commit 25bf5cf225f977c3accc6a05a0a7a181ef2a4a30
Subproject commit 208a701fa01e99c7c8cc3dcebc8317da71362972

@ -1 +1 @@
Subproject commit 48e7c0e01479232f225c8044d76c84e74192889d
Subproject commit ad53be196a25bbefa3700a01187fdce573a7d2d0

View File

@ -52,8 +52,8 @@ endif()
# Directories.
SET(AWS_SDK_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws")
SET(AWS_SDK_CORE_DIR "${AWS_SDK_DIR}/aws-cpp-sdk-core")
SET(AWS_SDK_S3_DIR "${AWS_SDK_DIR}/aws-cpp-sdk-s3")
SET(AWS_SDK_CORE_DIR "${AWS_SDK_DIR}/src/aws-cpp-sdk-core")
SET(AWS_SDK_S3_DIR "${AWS_SDK_DIR}/generated/src/aws-cpp-sdk-s3")
SET(AWS_AUTH_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws-c-auth")
SET(AWS_CAL_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws-c-cal")
@ -118,7 +118,7 @@ configure_file("${AWS_SDK_CORE_DIR}/include/aws/core/SDKConfig.h.in"
list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_MAJOR=1")
list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_MINOR=10")
list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_PATCH=36")
list(APPEND AWS_SOURCES ${AWS_SDK_CORE_SRC} ${AWS_SDK_CORE_NET_SRC} ${AWS_SDK_CORE_PLATFORM_SRC})
list(APPEND AWS_PUBLIC_INCLUDES

2
contrib/aws-crt-cpp vendored

@ -1 +1 @@
Subproject commit ec0bea288f451d884c0d80d534bc5c66241c39a4
Subproject commit 8a301b7e842f1daed478090c869207300972379f

2
contrib/aws-s2n-tls vendored

@ -1 +1 @@
Subproject commit 0f1ba9e5c4a67cb3898de0c0b4f911d4194dc8de
Subproject commit 71f4794b7580cf780eb4aca77d69eded5d3c7bb4

View File

@ -5,8 +5,8 @@ echo "Using sparse checkout for aws"
FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout
echo '/*' > $FILES_TO_CHECKOUT
echo '!/*/*' >> $FILES_TO_CHECKOUT
echo '/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT
echo '/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT
echo '/src/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT
echo '/generated/src/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT
git config core.sparsecheckout true
git checkout $1

View File

@ -132,6 +132,9 @@ function run_tests()
ADDITIONAL_OPTIONS+=('--report-logs-stats')
clickhouse-test "00001_select_1" > /dev/null ||:
clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" ||:
set +e
clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \
--test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \

View File

@ -65,6 +65,9 @@ sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
sudo mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml
start
stop
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.initial.log
@ -94,6 +97,9 @@ sudo cat /etc/clickhouse-server/config.d/storage_conf.xml \
> /etc/clickhouse-server/config.d/storage_conf.xml.tmp
sudo mv /etc/clickhouse-server/config.d/storage_conf.xml.tmp /etc/clickhouse-server/config.d/storage_conf.xml
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml
start
clickhouse-client --query="SELECT 'Server version: ', version()"

View File

@ -177,11 +177,11 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
- `--user, -u` The username. Default value: default.
- `--password` The password. Default value: empty string.
- `--ask-password` - Prompt the user to enter a password.
- `--query, -q` The query to process when using non-interactive mode. You must specify either `query` or `queries-file` option.
- `--queries-file` file path with queries to execute. You must specify either `query` or `queries-file` option.
- `--database, -d` Select the current default database. Default value: the current database from the server settings (default by default).
- `--multiline, -m` If specified, allow multiline queries (do not send the query on Enter).
- `--query, -q` The query to process when using non-interactive mode. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` file path with queries to execute. Cannot be used simultaneously with `--query`. See the example after this list.
- `--multiquery, -n` If specified, multiple queries separated by semicolons can be listed after the `--query` option. For convenience, it is also possible to omit `--query` and pass the queries directly after `--multiquery`.
- `--multiline, -m` If specified, allow multiline queries (do not send the query on Enter).
- `--database, -d` Select the current default database. Default value: the current database from the server settings (default by default).
- `--format, -f` Use the specified default format to output the result.
- `--vertical, -E` If specified, use the [Vertical format](../interfaces/formats.md#vertical) by default to output the result. This is the same as `format=Vertical`. In this format, each value is printed on a separate line, which is helpful when displaying wide tables.
- `--time, -t` If specified, print the query execution time to stderr in non-interactive mode.
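A minimal sketch of how the mutually exclusive query options combine on the command line (`queries.sql` is a placeholder file; the server is assumed to be local with default credentials):

``` bash
# Single query in non-interactive mode
clickhouse-client --query "SELECT version()"

# Several semicolon-separated queries after --query require --multiquery (-n)
clickhouse-client --multiquery --query "CREATE TABLE t (x UInt8) ENGINE = Memory; INSERT INTO t VALUES (1); SELECT * FROM t"

# Queries read from a file; --queries-file cannot be combined with --query
clickhouse-client --queries-file queries.sql
```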

View File

@ -0,0 +1,27 @@
---
slug: /en/operations/system-tables/build_options
---
# build_options
Contains information about the ClickHouse server's build options.
Columns:
- `name` (String) — Name of the build option, e.g. `USE_ODBC`
- `value` (String) — Value of the build option, e.g. `1`
**Example**
``` sql
SELECT * FROM system.build_options LIMIT 5
```
``` text
┌─name─────────────┬─value─┐
│ USE_BROTLI │ 1 │
│ USE_BZIP2 │ 1 │
│ USE_CAPNP │ 1 │
│ USE_CASSANDRA │ 1 │
│ USE_DATASKETCHES │ 1 │
└──────────────────┴───────┘
```

View File

@ -5,16 +5,18 @@ This table contains profiling on processors level (that you can find in [`EXPLAI
Columns:
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the event happened.
- `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the event happened.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the event happened.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time with microseconds precision when the event happened.
- `id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of processor
- `parent_ids` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Parent processors IDs
- `plan_step` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the query plan step which created this processor. The value is zero if the processor was not added from any step.
- `plan_group` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Group of the processor if it was created by query plan step. A group is a logical partitioning of processors added from the same query plan step. The group is used only for beautifying the result of `EXPLAIN PIPELINE`.
- `initial_query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the initial query (for distributed query execution).
- `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query
- `name` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — Name of the processor.
- `elapsed_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of microseconds this processor was executed.
- `input_wait_elapsed_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of microseconds this processor was waiting for data (from other processor).
- `output_wait_elapsed_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of microseconds this processor was waiting because output port was full.
- `plan_step` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the query plan step which created this processor. The value is zero if the processor was not added from any step.
- `plan_group` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Group of the processor if it was created by query plan step. A group is a logical partitioning of processors added from the same query plan step. The group is used only for beautifying the result of `EXPLAIN PIPELINE`.
- `input_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of rows consumed by processor.
- `input_bytes` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of bytes consumed by processor.
- `output_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The number of rows generated by processor.

View File

@ -59,9 +59,10 @@ Columns:
- `query_kind` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — Type of the query.
- `databases` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the databases present in the query.
- `tables` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the tables present in the query.
- `views` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the (materialized or live) views present in the query.
- `columns` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the columns present in the query.
- `partitions` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the partitions present in the query.
- `projections` ([String](../../sql-reference/data-types/string.md)) — Names of the projections used during the query execution.
- `views` ([Array](../../sql-reference/data-types/array.md)([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md))) — Names of the (materialized or live) views present in the query.
- `exception_code` ([Int32](../../sql-reference/data-types/int-uint.md)) — Code of an exception.
- `exception` ([String](../../sql-reference/data-types/string.md)) — Exception message.
- `stack_trace` ([String](../../sql-reference/data-types/string.md)) — [Stack trace](https://en.wikipedia.org/wiki/Stack_trace). An empty string, if the query was completed successfully.

View File

@ -183,12 +183,12 @@ Arguments:
- `-S`, `--structure` — table structure for input data.
- `--input-format` — input format, `TSV` by default.
- `-f`, `--file` — path to data, `stdin` by default.
- `-q`, `--query` — queries to execute with `;` as delimiter. You must specify either `query` or `queries-file` option.
- `--queries-file` - file path with queries to execute. You must specify either `query` or `queries-file` option.
- `-q`, `--query` — queries to execute with `;` as delimiter. Cannot be used simultaneously with `--queries-file`.
- `--queries-file` - file path with queries to execute. Cannot be used simultaneously with `--query`. See the sketch after this list.
- `--multiquery, -n` If specified, multiple queries separated by semicolons can be listed after the `--query` option. For convenience, it is also possible to omit `--query` and pass the queries directly after `--multiquery`.
- `-N`, `--table` — table name where to put output data, `table` by default.
- `--format`, `--output-format` — output format, `TSV` by default.
- `-d`, `--database` — default database, `_local` by default.
- `--multiquery, -n` If specified, multiple queries separated by semicolons can be listed after the `--query` option. For convenience, it is also possible to omit `--query` and pass the queries directly after `--multiquery`.
- `--stacktrace` — whether to dump debug output in case of exception.
- `--echo` — print query before execution.
- `--verbose` — more details on query execution.
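A minimal sketch of invoking `clickhouse-local` with inline queries versus a query file (`data.tsv` and `queries.sql` are placeholder paths; the input data is available under the default table name `table`):

``` bash
# Inline semicolon-separated queries; cannot be combined with --queries-file
clickhouse-local --structure "a UInt32, b String" --input-format TSV --file data.tsv \
    --query "SELECT count() FROM table; SELECT max(a) FROM table"

# The same queries read from a file instead of being passed via --query
clickhouse-local --structure "a UInt32, b String" --input-format TSV --file data.tsv \
    --queries-file queries.sql
```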

View File

@ -544,6 +544,54 @@ Result:
└─────┴──────────┴───────┘
```
## Filling grouped by sorting prefix
It can be useful to fill rows which have the same values in particular columns independently; a good example is filling missing values in time series.
Assume there is the following time series table:
``` sql
CREATE TABLE timeseries
(
`sensor_id` UInt64,
`timestamp` DateTime64(3, 'UTC'),
`value` Float64
)
ENGINE = Memory;
SELECT * FROM timeseries;
┌─sensor_id─┬───────────────timestamp─┬─value─┐
│ 234 │ 2021-12-01 00:00:03.000 │ 3 │
│ 432 │ 2021-12-01 00:00:01.000 │ 1 │
│ 234 │ 2021-12-01 00:00:07.000 │ 7 │
│ 432 │ 2021-12-01 00:00:05.000 │ 5 │
└───────────┴─────────────────────────┴───────┘
```
We'd like to fill the missing values for each sensor independently with a 1 second interval.
The way to achieve this is to use the `sensor_id` column as a sorting prefix for the filling column `timestamp`:
``` sql
SELECT *
FROM timeseries
ORDER BY
sensor_id,
timestamp WITH FILL
INTERPOLATE ( value AS 9999 )
┌─sensor_id─┬───────────────timestamp─┬─value─┐
│ 234 │ 2021-12-01 00:00:03.000 │ 3 │
│ 234 │ 2021-12-01 00:00:04.000 │ 9999 │
│ 234 │ 2021-12-01 00:00:05.000 │ 9999 │
│ 234 │ 2021-12-01 00:00:06.000 │ 9999 │
│ 234 │ 2021-12-01 00:00:07.000 │ 7 │
│ 432 │ 2021-12-01 00:00:01.000 │ 1 │
│ 432 │ 2021-12-01 00:00:02.000 │ 9999 │
│ 432 │ 2021-12-01 00:00:03.000 │ 9999 │
│ 432 │ 2021-12-01 00:00:04.000 │ 9999 │
│ 432 │ 2021-12-01 00:00:05.000 │ 5 │
└───────────┴─────────────────────────┴───────┘
```
Here, the `value` column was interpolated with `9999` just to make the filled rows more noticeable.
This behavior is controlled by the setting `use_with_fill_by_sorting_prefix` (enabled by default).
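For comparison, a minimal sketch of the same query with the setting disabled; with it turned off, the sorting prefix is ignored and rows are no longer filled per sensor (the exact output depends on the data):
``` sql
SELECT *
FROM timeseries
ORDER BY
    sensor_id,
    timestamp WITH FILL
INTERPOLATE ( value AS 9999 )
SETTINGS use_with_fill_by_sorting_prefix = 0
```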
## Related content
- Blog: [Working with time series data in ClickHouse](https://clickhouse.com/blog/working-with-time-series-data-and-functions-ClickHouse)

View File

@ -1,6 +1,7 @@
#include <Backups/BackupEntryFromAppendOnlyFile.h>
#include <Disks/IDisk.h>
#include <IO/LimitSeekableReadBuffer.h>
#include <IO/ReadBufferFromFileBase.h>
namespace DB

View File

@ -1,5 +1,7 @@
#include <Backups/BackupEntryFromImmutableFile.h>
#include <IO/ReadBufferFromFileBase.h>
#include <Disks/IDisk.h>
#include <city.h>
namespace DB

View File

@ -3,7 +3,7 @@
#include <Disks/IDisk.h>
#include <IO/copyData.h>
#include <IO/WriteBufferFromFileBase.h>
#include <IO/SeekableReadBuffer.h>
#include <IO/ReadBufferFromFileBase.h>
#include <Interpreters/Context.h>
#include <Common/logger_useful.h>

View File

@ -15,7 +15,7 @@
#include <IO/Archives/createArchiveWriter.h>
#include <IO/ConcatSeekableReadBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/SeekableReadBuffer.h>
#include <IO/ReadBufferFromFileBase.h>
#include <IO/WriteBufferFromFileBase.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>

View File

@ -174,7 +174,7 @@ void HedgedConnections::sendQuery(
modified_settings.group_by_two_level_threshold_bytes = 0;
}
const bool enable_sample_offset_parallel_processing = settings.max_parallel_replicas > 1 && !settings.allow_experimental_parallel_reading_from_replicas;
const bool enable_sample_offset_parallel_processing = settings.max_parallel_replicas > 1 && settings.allow_experimental_parallel_reading_from_replicas == 0;
if (offset_states.size() > 1 && enable_sample_offset_parallel_processing)
{

View File

@ -142,7 +142,7 @@ void MultiplexedConnections::sendQuery(
}
}
const bool enable_sample_offset_parallel_processing = settings.max_parallel_replicas > 1 && !settings.allow_experimental_parallel_reading_from_replicas;
const bool enable_sample_offset_parallel_processing = settings.max_parallel_replicas > 1 && settings.allow_experimental_parallel_reading_from_replicas == 0;
size_t num_replicas = replica_states.size();
if (num_replicas > 1)

View File

@ -3,6 +3,7 @@
#include <base/types.h>
#include <Common/Exception.h>
#include <Coordination/KeeperConstants.h>
#include <Poco/Net/SocketAddress.h>
#include <vector>
#include <memory>
@ -466,7 +467,7 @@ public:
/// Useful to check owner of ephemeral node.
virtual int64_t getSessionID() const = 0;
virtual String getConnectedAddress() const = 0;
virtual Poco::Net::SocketAddress getConnectedAddress() const = 0;
/// If the method will throw an exception, callbacks won't be called.
///

View File

@ -39,7 +39,7 @@ public:
bool isExpired() const override { return expired; }
int64_t getSessionID() const override { return 0; }
String getConnectedAddress() const override { return connected_zk_address; }
Poco::Net::SocketAddress getConnectedAddress() const override { return connected_zk_address; }
void create(
@ -127,7 +127,7 @@ private:
zkutil::ZooKeeperArgs args;
String connected_zk_address;
Poco::Net::SocketAddress connected_zk_address;
std::mutex push_request_mutex;
std::atomic<bool> expired{false};

View File

@ -112,11 +112,10 @@ void ZooKeeper::init(ZooKeeperArgs args_)
else
LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(args.hosts, ","), args.chroot);
String address = impl->getConnectedAddress();
Poco::Net::SocketAddress address = impl->getConnectedAddress();
size_t colon_pos = address.find(':');
connected_zk_host = address.substr(0, colon_pos);
connected_zk_port = address.substr(colon_pos + 1);
connected_zk_host = address.host().toString();
connected_zk_port = address.port();
connected_zk_index = 0;
@ -124,7 +123,7 @@ void ZooKeeper::init(ZooKeeperArgs args_)
{
for (size_t i = 0; i < args.hosts.size(); i++)
{
if (args.hosts[i] == address)
if (args.hosts[i] == address.toString())
{
connected_zk_index = i;
break;

View File

@ -524,7 +524,7 @@ public:
void setServerCompletelyStarted();
String getConnectedZooKeeperHost() const { return connected_zk_host; }
String getConnectedZooKeeperPort() const { return connected_zk_port; }
UInt16 getConnectedZooKeeperPort() const { return connected_zk_port; }
size_t getConnectedZooKeeperIndex() const { return connected_zk_index; }
private:
@ -591,7 +591,7 @@ private:
ZooKeeperArgs args;
String connected_zk_host;
String connected_zk_port;
UInt16 connected_zk_port;
size_t connected_zk_index;
std::mutex mutex;

View File

@ -433,7 +433,7 @@ void ZooKeeper::connect(
}
connected = true;
connected_zk_address = node.address.toString();
connected_zk_address = node.address;
break;
}
@ -450,7 +450,7 @@ void ZooKeeper::connect(
if (!connected)
{
WriteBufferFromOwnString message;
connected_zk_address = "";
connected_zk_address = Poco::Net::SocketAddress();
message << "All connection tries failed while connecting to ZooKeeper. nodes: ";
bool first = true;

View File

@ -125,7 +125,7 @@ public:
/// Useful to check owner of ephemeral node.
int64_t getSessionID() const override { return session_id; }
String getConnectedAddress() const override { return connected_zk_address; }
Poco::Net::SocketAddress getConnectedAddress() const override { return connected_zk_address; }
void executeGenericRequest(
const ZooKeeperRequestPtr & request,
@ -203,7 +203,7 @@ public:
private:
ACLs default_acls;
String connected_zk_address;
Poco::Net::SocketAddress connected_zk_address;
zkutil::ZooKeeperArgs args;

View File

@ -6,6 +6,7 @@
#include <Common/Exception.h>
#include <Common/isLocalAddress.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromFile.h>
#include <Common/getMultipleKeysFromConfig.h>
#include <Disks/DiskLocal.h>
#include <Common/logger_useful.h>

View File

@ -154,7 +154,7 @@ class IColumn;
M(ParallelReplicasCustomKeyFilterType, parallel_replicas_custom_key_filter_type, ParallelReplicasCustomKeyFilterType::DEFAULT, "Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key.", 0) \
\
M(String, cluster_for_parallel_replicas, "default", "Cluster for a shard in which current server is located", 0) \
M(Bool, allow_experimental_parallel_reading_from_replicas, false, "If true, ClickHouse will send a SELECT query to all replicas of a table. It will work for any kind on MergeTree table.", 0) \
M(UInt64, allow_experimental_parallel_reading_from_replicas, 0, "Use all the replicas from a shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure", 0) \
M(Float, parallel_replicas_single_task_marks_count_multiplier, 2, "A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas.", 0) \
M(Bool, parallel_replicas_for_non_replicated_merge_tree, false, "If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables", 0) \
\
@ -729,6 +729,7 @@ class IColumn;
M(UInt64, http_max_request_param_data_size, 10_MiB, "Limit on size of request data used as a query parameter in predefined HTTP requests.", 0) \
M(Bool, function_json_value_return_type_allow_nullable, false, "Allow function JSON_VALUE to return nullable type.", 0) \
M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \
M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently", 0) \
\
/** Experimental functions */ \
M(Bool, allow_experimental_funnel_functions, false, "Enable experimental functions for funnel analysis.", 0) \

View File

@ -82,6 +82,7 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
{
{"23.5", {{"input_format_parquet_preserve_order", true, false, "Allow Parquet reader to reorder rows for better parallelism."},
{"parallelize_output_from_storages", false, true, "Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows."},
{"use_with_fill_by_sorting_prefix", false, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently"},
{"output_format_parquet_compliant_nested_types", false, true, "Change an internal field name in output Parquet file schema."}}},
{"23.4", {{"allow_suspicious_indices", true, false, "If true, index can defined with identical expressions"},
{"connect_timeout_with_failover_ms", 50, 1000, "Increase default connect timeout because of async connect"},

View File

@ -2,6 +2,7 @@
#include <Interpreters/Context_fwd.h>
#include <Core/Defines.h>
#include <Core/Names.h>
#include <base/types.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
@ -20,6 +21,7 @@
#include <boost/noncopyable.hpp>
#include <Poco/Timestamp.h>
#include <filesystem>
#include <sys/stat.h>
namespace fs = std::filesystem;

View File

@ -4,6 +4,7 @@
#include <vector>
#include <boost/noncopyable.hpp>
#include <Disks/IDisk.h>
#include <sys/types.h>
namespace DB
{

View File

@ -1,4 +1,4 @@
#include "AsynchronousReadIndirectBufferFromRemoteFS.h"
#include "AsynchronousBoundedReadBuffer.h"
#include <Common/Stopwatch.h>
#include <Common/logger_useful.h>
@ -43,105 +43,77 @@ namespace ErrorCodes
}
AsynchronousReadIndirectBufferFromRemoteFS::AsynchronousReadIndirectBufferFromRemoteFS(
AsynchronousBoundedReadBuffer::AsynchronousBoundedReadBuffer(
ImplPtr impl_,
IAsynchronousReader & reader_,
const ReadSettings & settings_,
std::shared_ptr<ReadBufferFromRemoteFSGather> impl_,
std::shared_ptr<AsyncReadCounters> async_read_counters_,
std::shared_ptr<FilesystemReadPrefetchesLog> prefetches_log_)
AsyncReadCountersPtr async_read_counters_,
FilesystemReadPrefetchesLogPtr prefetches_log_)
: ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0)
, impl(std::move(impl_))
, read_settings(settings_)
, reader(reader_)
, base_priority(settings_.priority)
, impl(impl_)
, prefetch_buffer(settings_.prefetch_buffer_size)
, query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr
? CurrentThread::getQueryId() : "")
, query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr ? CurrentThread::getQueryId() : "")
, current_reader_id(getRandomASCIIString(8))
#ifndef NDEBUG
, log(&Poco::Logger::get("AsynchronousBufferFromRemoteFS"))
#else
, log(&Poco::Logger::get("AsyncBuffer(" + impl->getFileName() + ")"))
#endif
, log(&Poco::Logger::get("AsynchronousBoundedReadBuffer"))
, async_read_counters(async_read_counters_)
, prefetches_log(prefetches_log_)
{
ProfileEvents::increment(ProfileEvents::RemoteFSBuffers);
}
String AsynchronousReadIndirectBufferFromRemoteFS::getFileName() const
bool AsynchronousBoundedReadBuffer::hasPendingDataToRead()
{
return impl->getFileName();
}
String AsynchronousReadIndirectBufferFromRemoteFS::getInfoForLog()
{
return impl->getInfoForLog();
}
size_t AsynchronousReadIndirectBufferFromRemoteFS::getFileSize()
{
return impl->getFileSize();
}
bool AsynchronousReadIndirectBufferFromRemoteFS::hasPendingDataToRead()
{
/**
* Note: read_until_position here can be std::nullopt only for non-MergeTree tables.
* For mergeTree tables it must be guaranteed that setReadUntilPosition() or
* setReadUntilEnd() is called before any read or prefetch.
* setReadUntilEnd() always sets read_until_position to file size.
* setReadUntilPosition(pos) always has pos > 0, because if
* right_offset_in_compressed_file is 0, then setReadUntilEnd() is used.
*/
if (read_until_position)
{
/// Everything is already read.
if (file_offset_of_buffer_end == *read_until_position)
if (file_offset_of_buffer_end == *read_until_position) /// Everything is already read.
return false;
if (file_offset_of_buffer_end > *read_until_position)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Read beyond last offset ({} > {}, info: {})",
file_offset_of_buffer_end, *read_until_position, impl->getInfoForLog());
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Read beyond last offset ({} > {}, info: {})",
file_offset_of_buffer_end, *read_until_position, impl->getInfoForLog());
}
}
return true;
}
std::future<IAsynchronousReader::Result> AsynchronousReadIndirectBufferFromRemoteFS::asyncReadInto(char * data, size_t size, int64_t priority)
std::future<IAsynchronousReader::Result>
AsynchronousBoundedReadBuffer::asyncReadInto(char * data, size_t size, int64_t priority)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<RemoteFSFileDescriptor>(*impl, async_read_counters);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
request.priority = base_priority + priority;
request.priority = read_settings.priority + priority;
request.ignore = bytes_to_ignore;
return reader.submit(request);
}
void AsynchronousReadIndirectBufferFromRemoteFS::prefetch(int64_t priority)
void AsynchronousBoundedReadBuffer::prefetch(int64_t priority)
{
if (prefetch_future.valid())
return;
/// Check boundary, which was set in readUntilPosition().
if (!hasPendingDataToRead())
return;
last_prefetch_info.submit_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
last_prefetch_info.submit_time = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch()).count();
last_prefetch_info.priority = priority;
/// Prefetch even in case hasPendingData() == true.
chassert(prefetch_buffer.size() == read_settings.prefetch_buffer_size || prefetch_buffer.size() == read_settings.remote_fs_buffer_size);
chassert(prefetch_buffer.size() == read_settings.prefetch_buffer_size
|| prefetch_buffer.size() == read_settings.remote_fs_buffer_size);
prefetch_future = asyncReadInto(prefetch_buffer.data(), prefetch_buffer.size(), priority);
ProfileEvents::increment(ProfileEvents::RemoteFSPrefetches);
}
void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t position)
void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position)
{
if (!read_until_position || position != *read_until_position)
{
@ -157,21 +129,16 @@ void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t pos
}
}
void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilEnd()
void AsynchronousBoundedReadBuffer::appendToPrefetchLog(
FilesystemPrefetchState state,
int64_t size,
const std::unique_ptr<Stopwatch> & execution_watch)
{
setReadUntilPosition(impl->getFileSize());
}
void AsynchronousReadIndirectBufferFromRemoteFS::appendToPrefetchLog(FilesystemPrefetchState state, int64_t size, const std::unique_ptr<Stopwatch> & execution_watch)
{
const auto & object = impl->getCurrentObject();
FilesystemReadPrefetchesLogElement elem
{
.event_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()),
.query_id = query_id,
.path = object.local_path,
.path = impl->getFileName(),
.offset = file_offset_of_buffer_end,
.size = size,
.prefetch_submit_time = last_prefetch_info.submit_time,
@ -187,7 +154,7 @@ void AsynchronousReadIndirectBufferFromRemoteFS::appendToPrefetchLog(FilesystemP
}
bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
bool AsynchronousBoundedReadBuffer::nextImpl()
{
if (!hasPendingDataToRead())
return false;
@ -245,14 +212,14 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
/// In case of multiple files for the same file in clickhouse (i.e. log family)
/// file_offset_of_buffer_end will not match getImplementationBufferOffset()
/// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()]
chassert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
chassert(file_offset_of_buffer_end >= impl->getFileOffsetOfBufferEnd());
chassert(file_offset_of_buffer_end <= impl->getFileSize());
return bytes_read;
}
off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
{
ProfileEvents::increment(ProfileEvents::RemoteFSSeeks);
@ -268,7 +235,7 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
}
else
{
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "ReadBufferFromFileDescriptor::seek expects SEEK_SET or SEEK_CUR as whence");
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Expected SEEK_SET or SEEK_CUR as whence");
}
/// Position is unchanged.
@ -322,9 +289,8 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
if (read_until_position && new_pos > *read_until_position)
{
ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset);
impl->reset();
file_offset_of_buffer_end = new_pos = *read_until_position; /// read_until_position is a non-included boundary.
impl->seek(file_offset_of_buffer_end, SEEK_SET);
return new_pos;
}
@ -332,8 +298,7 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
* Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer.
* Note: we read in range [file_offset_of_buffer_end, read_until_position).
*/
if (impl->initialized()
&& read_until_position && new_pos < *read_until_position
if (read_until_position && new_pos < *read_until_position
&& new_pos > file_offset_of_buffer_end
&& new_pos < file_offset_of_buffer_end + read_settings.remote_read_min_bytes_for_seek)
{
@ -342,31 +307,21 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
}
else
{
if (impl->initialized())
{
ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset);
impl->reset();
}
ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset);
file_offset_of_buffer_end = new_pos;
impl->seek(file_offset_of_buffer_end, SEEK_SET);
}
return new_pos;
}
off_t AsynchronousReadIndirectBufferFromRemoteFS::getPosition()
{
return file_offset_of_buffer_end - available() + bytes_to_ignore;
}
void AsynchronousReadIndirectBufferFromRemoteFS::finalize()
void AsynchronousBoundedReadBuffer::finalize()
{
resetPrefetch(FilesystemPrefetchState::UNNEEDED);
}
AsynchronousReadIndirectBufferFromRemoteFS::~AsynchronousReadIndirectBufferFromRemoteFS()
AsynchronousBoundedReadBuffer::~AsynchronousBoundedReadBuffer()
{
try
{
@ -378,7 +333,7 @@ AsynchronousReadIndirectBufferFromRemoteFS::~AsynchronousReadIndirectBufferFromR
}
}
void AsynchronousReadIndirectBufferFromRemoteFS::resetPrefetch(FilesystemPrefetchState state)
void AsynchronousBoundedReadBuffer::resetPrefetch(FilesystemPrefetchState state)
{
if (!prefetch_future.valid())
return;

View File

@ -0,0 +1,96 @@
#pragma once
#include "config.h"
#include <IO/ReadBufferFromFile.h>
#include <IO/AsynchronousReader.h>
#include <IO/ReadSettings.h>
#include <Interpreters/FilesystemReadPrefetchesLog.h>
#include <utility>
namespace Poco { class Logger; }
namespace DB
{
struct AsyncReadCounters;
using AsyncReadCountersPtr = std::shared_ptr<AsyncReadCounters>;
class ReadBufferFromRemoteFSGather;
class AsynchronousBoundedReadBuffer : public ReadBufferFromFileBase
{
public:
using Impl = ReadBufferFromFileBase;
using ImplPtr = std::unique_ptr<Impl>;
explicit AsynchronousBoundedReadBuffer(
ImplPtr impl_,
IAsynchronousReader & reader_,
const ReadSettings & settings_,
AsyncReadCountersPtr async_read_counters_ = nullptr,
FilesystemReadPrefetchesLogPtr prefetches_log_ = nullptr);
~AsynchronousBoundedReadBuffer() override;
String getFileName() const override { return impl->getFileName(); }
size_t getFileSize() override { return impl->getFileSize(); }
String getInfoForLog() override { return impl->getInfoForLog(); }
off_t seek(off_t offset_, int whence) override;
void prefetch(int64_t priority) override;
void setReadUntilPosition(size_t position) override; /// [..., position).
void setReadUntilEnd() override { return setReadUntilPosition(getFileSize()); }
off_t getPosition() override { return file_offset_of_buffer_end - available() + bytes_to_ignore; }
private:
const ImplPtr impl;
const ReadSettings read_settings;
IAsynchronousReader & reader;
size_t file_offset_of_buffer_end = 0;
std::optional<size_t> read_until_position;
/// If nonzero then working_buffer is empty.
/// If a prefetch is in flight, the prefetch task has been instructed to ignore this many bytes.
size_t bytes_to_ignore = 0;
Memory<> prefetch_buffer;
std::future<IAsynchronousReader::Result> prefetch_future;
const std::string query_id;
const std::string current_reader_id;
Poco::Logger * log;
AsyncReadCountersPtr async_read_counters;
FilesystemReadPrefetchesLogPtr prefetches_log;
struct LastPrefetchInfo
{
UInt64 submit_time = 0;
size_t priority = 0;
};
LastPrefetchInfo last_prefetch_info;
bool nextImpl() override;
void finalize();
bool hasPendingDataToRead();
void appendToPrefetchLog(
FilesystemPrefetchState state,
int64_t size,
const std::unique_ptr<Stopwatch> & execution_watch);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, int64_t priority);
void resetPrefetch(FilesystemPrefetchState state);
};
}

View File

@ -1,111 +0,0 @@
#pragma once
#include "config.h"
#include <IO/ReadBufferFromFile.h>
#include <IO/AsynchronousReader.h>
#include <IO/ReadSettings.h>
#include <Interpreters/FilesystemReadPrefetchesLog.h>
#include <utility>
namespace Poco { class Logger; }
namespace DB
{
struct AsyncReadCounters;
class ReadBufferFromRemoteFSGather;
/**
* Reads data from S3/HDFS/Web using stored paths in metadata.
* This class is an asynchronous version of ReadIndirectBufferFromRemoteFS.
*
* Buffers chain for diskS3:
* AsynchronousIndirectReadBufferFromRemoteFS -> ReadBufferFromRemoteFS ->
* -> ReadBufferFromS3 -> ReadBufferFromIStream.
*
* Buffers chain for diskWeb:
* AsynchronousIndirectReadBufferFromRemoteFS -> ReadBufferFromRemoteFS ->
* -> ReadIndirectBufferFromWebServer -> ReadBufferFromHTTP -> ReadBufferFromIStream.
*
* We pass either `memory` or `prefetch_buffer` through all this chain and return it back.
*/
class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
{
public:
explicit AsynchronousReadIndirectBufferFromRemoteFS(
IAsynchronousReader & reader_, const ReadSettings & settings_,
std::shared_ptr<ReadBufferFromRemoteFSGather> impl_,
std::shared_ptr<AsyncReadCounters> async_read_counters_,
std::shared_ptr<FilesystemReadPrefetchesLog> prefetches_log_);
~AsynchronousReadIndirectBufferFromRemoteFS() override;
off_t seek(off_t offset_, int whence) override;
off_t getPosition() override;
String getFileName() const override;
void prefetch(int64_t priority) override;
void setReadUntilPosition(size_t position) override; /// [..., position).
void setReadUntilEnd() override;
String getInfoForLog() override;
size_t getFileSize() override;
bool isIntegratedWithFilesystemCache() const override { return true; }
private:
bool nextImpl() override;
void finalize();
bool hasPendingDataToRead();
void appendToPrefetchLog(FilesystemPrefetchState state, int64_t size, const std::unique_ptr<Stopwatch> & execution_watch);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, int64_t priority);
void resetPrefetch(FilesystemPrefetchState state);
ReadSettings read_settings;
IAsynchronousReader & reader;
int64_t base_priority;
std::shared_ptr<ReadBufferFromRemoteFSGather> impl;
std::future<IAsynchronousReader::Result> prefetch_future;
size_t file_offset_of_buffer_end = 0;
Memory<> prefetch_buffer;
std::string query_id;
std::string current_reader_id;
/// If nonzero then working_buffer is empty.
/// If a prefetch is in flight, the prefetch task has been instructed to ignore this many bytes.
size_t bytes_to_ignore = 0;
std::optional<size_t> read_until_position;
Poco::Logger * log;
std::shared_ptr<AsyncReadCounters> async_read_counters;
std::shared_ptr<FilesystemReadPrefetchesLog> prefetches_log;
struct LastPrefetchInfo
{
UInt64 submit_time = 0;
size_t priority = 0;
};
LastPrefetchInfo last_prefetch_info;
};
}

View File

@ -5,6 +5,7 @@
#include <Disks/IO/CachedOnDiskReadBufferFromFile.h>
#include <Disks/ObjectStorages/Cached/CachedObjectStorage.h>
#include <Common/logger_useful.h>
#include <IO/SwapHelper.h>
#include <iostream>
#include <base/hex.h>
#include <Interpreters/FilesystemCacheLog.h>
@ -12,22 +13,24 @@
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
}
ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(
ReadBufferCreator && read_buffer_creator_,
const StoredObjects & blobs_to_read_,
const ReadSettings & settings_,
std::shared_ptr<FilesystemCacheLog> cache_log_)
: ReadBuffer(nullptr, 0)
, read_buffer_creator(std::move(read_buffer_creator_))
, blobs_to_read(blobs_to_read_)
: ReadBufferFromFileBase(0, nullptr, 0)
, settings(settings_)
, blobs_to_read(blobs_to_read_)
, read_buffer_creator(std::move(read_buffer_creator_))
, cache_log(settings.enable_filesystem_cache_log ? cache_log_ : nullptr)
, query_id(CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() != nullptr ? CurrentThread::getQueryId() : "")
, log(&Poco::Logger::get("ReadBufferFromRemoteFSGather"))
{
if (cache_log_ && settings.enable_filesystem_cache_log)
cache_log = cache_log_;
if (!blobs_to_read.empty())
current_object = blobs_to_read.front();
@ -38,13 +41,12 @@ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(
SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object)
{
if (current_buf != nullptr && !with_cache)
if (current_buf && !with_cache)
{
appendFilesystemCacheLog();
appendUncachedReadInfo();
}
current_object = object;
total_bytes_read_from_current_file = 0;
const auto & object_path = object.remote_path;
size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size;
@ -72,7 +74,7 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c
return current_read_buffer_creator();
}
void ReadBufferFromRemoteFSGather::appendFilesystemCacheLog()
void ReadBufferFromRemoteFSGather::appendUncachedReadInfo()
{
if (!cache_log || current_object.remote_path.empty())
return;
@ -84,7 +86,7 @@ void ReadBufferFromRemoteFSGather::appendFilesystemCacheLog()
.source_file_path = current_object.remote_path,
.file_segment_range = { 0, current_object.bytes_size },
.cache_type = FilesystemCacheLogElement::CacheType::READ_FROM_FS_BYPASSING_CACHE,
.file_segment_size = total_bytes_read_from_current_file,
.file_segment_size = current_object.bytes_size,
.read_from_cache_attempted = false,
};
cache_log->add(elem);
@ -176,7 +178,7 @@ bool ReadBufferFromRemoteFSGather::moveToNextBuffer()
bool ReadBufferFromRemoteFSGather::readImpl()
{
swap(*current_buf);
SwapHelper swap(*this, *current_buf);
bool result = false;
@ -187,7 +189,6 @@ bool ReadBufferFromRemoteFSGather::readImpl()
*/
if (bytes_to_ignore)
{
total_bytes_read_from_current_file += bytes_to_ignore;
current_buf->ignore(bytes_to_ignore);
result = current_buf->hasPendingData();
file_offset_of_buffer_end += bytes_to_ignore;
@ -207,57 +208,41 @@ bool ReadBufferFromRemoteFSGather::readImpl()
file_offset_of_buffer_end += current_buf->available();
}
swap(*current_buf);
/// Required for non-async reads.
if (result)
{
assert(available());
nextimpl_working_buffer_offset = offset();
total_bytes_read_from_current_file += available();
assert(current_buf->available());
nextimpl_working_buffer_offset = current_buf->offset();
}
return result;
}
size_t ReadBufferFromRemoteFSGather::getFileOffsetOfBufferEnd() const
{
return file_offset_of_buffer_end;
}
void ReadBufferFromRemoteFSGather::setReadUntilPosition(size_t position)
{
if (position != read_until_position)
{
read_until_position = position;
reset();
}
if (position == read_until_position)
return;
reset();
read_until_position = position;
}
void ReadBufferFromRemoteFSGather::reset()
{
current_object = {};
current_buf_idx = {};
current_buf.reset();
bytes_to_ignore = 0;
}
String ReadBufferFromRemoteFSGather::getFileName() const
off_t ReadBufferFromRemoteFSGather::seek(off_t offset, int whence)
{
return current_object.remote_path;
}
if (whence != SEEK_SET)
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only seeking with SEEK_SET is allowed");
size_t ReadBufferFromRemoteFSGather::getFileSize() const
{
size_t size = 0;
for (const auto & object : blobs_to_read)
size += object.bytes_size;
return size;
}
String ReadBufferFromRemoteFSGather::getInfoForLog()
{
if (!current_buf)
return "";
return current_buf->getInfoForLog();
reset();
file_offset_of_buffer_end = offset;
return file_offset_of_buffer_end;
}
size_t ReadBufferFromRemoteFSGather::getImplementationBufferOffset() const
@ -271,7 +256,7 @@ size_t ReadBufferFromRemoteFSGather::getImplementationBufferOffset() const
ReadBufferFromRemoteFSGather::~ReadBufferFromRemoteFSGather()
{
if (!with_cache)
appendFilesystemCacheLog();
appendUncachedReadInfo();
}
}

View File

@ -10,12 +10,13 @@ namespace Poco { class Logger; }
namespace DB
{
class FilesystemCacheLog;
/**
* Remote disk might need to split one clickhouse file into multiple files in remote fs.
* This class works like a proxy to allow transition from one file into multiple.
*/
class ReadBufferFromRemoteFSGather final : public ReadBuffer
class ReadBufferFromRemoteFSGather final : public ReadBufferFromFileBase
{
friend class ReadIndirectBufferFromRemoteFS;
@ -30,25 +31,25 @@ public:
~ReadBufferFromRemoteFSGather() override;
String getFileName() const;
String getFileName() const override { return current_object.remote_path; }
void reset();
String getInfoForLog() override { return current_buf ? current_buf->getInfoForLog() : ""; }
void setReadUntilPosition(size_t position) override;
IAsynchronousReader::Result readInto(char * data, size_t size, size_t offset, size_t ignore) override;
size_t getFileSize() const;
size_t getFileSize() override { return getTotalSize(blobs_to_read); }
size_t getFileOffsetOfBufferEnd() const;
size_t getFileOffsetOfBufferEnd() const override { return file_offset_of_buffer_end; }
bool initialized() const { return current_buf != nullptr; }
String getInfoForLog();
size_t getImplementationBufferOffset() const;
const StoredObject & getCurrentObject() const { return current_object; }
off_t seek(off_t offset, int whence) override;
off_t getPosition() override { return file_offset_of_buffer_end - available() + bytes_to_ignore; }
private:
SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object);
@ -61,40 +62,26 @@ private:
bool moveToNextBuffer();
void appendFilesystemCacheLog();
void appendUncachedReadInfo();
ReadBufferCreator read_buffer_creator;
StoredObjects blobs_to_read;
ReadSettings settings;
size_t read_until_position = 0;
StoredObject current_object;
void reset();
const ReadSettings settings;
const StoredObjects blobs_to_read;
const ReadBufferCreator read_buffer_creator;
const std::shared_ptr<FilesystemCacheLog> cache_log;
const String query_id;
bool with_cache;
String query_id;
Poco::Logger * log;
SeekableReadBufferPtr current_buf;
size_t current_buf_idx = 0;
size_t read_until_position = 0;
size_t file_offset_of_buffer_end = 0;
/**
* File: |___________________|
* Buffer: |~~~~~~~|
* file_offset_of_buffer_end: ^
*/
size_t bytes_to_ignore = 0;
size_t total_bytes_read_from_current_file = 0;
StoredObject current_object;
size_t current_buf_idx = 0;
SeekableReadBufferPtr current_buf;
std::shared_ptr<FilesystemCacheLog> cache_log;
Poco::Logger * log;
};
}

View File

@ -82,7 +82,7 @@ off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence)
else
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET or SEEK_CUR modes are allowed.");
impl->reset();
impl->seek(impl->file_offset_of_buffer_end, SEEK_SET);
resetWorkingBuffer();
file_offset_of_buffer_end = impl->file_offset_of_buffer_end;

View File

@ -31,8 +31,6 @@ public:
void setReadUntilEnd() override;
bool isIntegratedWithFilesystemCache() const override { return true; }
size_t getFileSize() override;
private:

View File

@ -8,6 +8,7 @@
#include <Disks/IO/WriteBufferFromAzureBlobStorage.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/AsynchronousBoundedReadBuffer.h>
#include <Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.h>
#include <Interpreters/Context.h>
@ -112,8 +113,8 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
if (disk_read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{
auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(
reader, disk_read_settings, std::move(reader_impl),
return std::make_unique<AsynchronousBoundedReadBuffer>(
std::move(reader_impl), reader, disk_read_settings,
global_context->getAsyncReadCounters(),
global_context->getFilesystemReadPrefetchesLog());
}

View File

@ -5,7 +5,6 @@
#include <Disks/ObjectStorages/DiskObjectStorageCommon.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/ObjectStorages/IObjectStorage.h>
#include <Common/MultiVersion.h>

View File

@ -2,6 +2,7 @@
#include <Disks/ObjectStorages/IObjectStorage.h>
#include <base/getFQDNOrHostName.h>
#include <future>
namespace DB
{

View File

@ -7,7 +7,6 @@
#include <Storages/HDFS/HDFSCommon.h>
#include <Storages/HDFS/ReadBufferFromHDFS.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Common/getRandomASCIIString.h>

View File

@ -3,6 +3,7 @@
#include <Common/getRandomASCIIString.h>
#include <IO/WriteBufferFromFileBase.h>
#include <IO/copyData.h>
#include <IO/ReadBufferFromFileBase.h>
#include <Interpreters/Context.h>

View File

@ -12,12 +12,14 @@
#include <Common/Exception.h>
#include <IO/ReadSettings.h>
#include <IO/WriteSettings.h>
#include <IO/copyData.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/ObjectStorages/StoredObject.h>
#include <Disks/DiskType.h>
#include <Common/ThreadPool_fwd.h>
#include <Disks/WriteMode.h>
#include <Interpreters/Context_fwd.h>
#include <Core/Types.h>
namespace DB

View File

@ -7,6 +7,7 @@
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/createReadBufferFromFileBase.h>
#include <Disks/IO/AsynchronousBoundedReadBuffer.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/copyData.h>
@ -63,12 +64,12 @@ std::unique_ptr<ReadBufferFromFileBase> LocalObjectStorage::readObjects( /// NOL
global_context->getFilesystemCacheLog());
/// We use `remove_fs_method` (not `local_fs_method`) because we are about to use
/// AsynchronousReadIndirectBufferFromRemoteFS which works by the remote_fs_* settings.
/// AsynchronousBoundedReadBuffer which works by the remote_fs_* settings.
if (modified_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{
auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(
reader, modified_settings, std::move(impl),
return std::make_unique<AsynchronousBoundedReadBuffer>(
std::move(impl), reader, modified_settings,
global_context->getAsyncReadCounters(),
global_context->getFilesystemReadPrefetchesLog());
}

View File

@ -6,7 +6,7 @@
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/ObjectStorages/DiskObjectStorageCommon.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/AsynchronousBoundedReadBuffer.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <IO/WriteBufferFromS3.h>
@ -127,8 +127,8 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{
auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(
reader, disk_read_settings, std::move(s3_impl),
return std::make_unique<AsynchronousBoundedReadBuffer>(
std::move(s3_impl), reader, disk_read_settings,
global_context->getAsyncReadCounters(),
global_context->getFilesystemReadPrefetchesLog());
}

View File

@ -0,0 +1,14 @@
#include <Disks/ObjectStorages/StoredObject.h>
namespace DB
{
size_t getTotalSize(const StoredObjects & objects)
{
size_t size = 0;
for (const auto & object : objects)
size += object.bytes_size;
return size;
}
}

View File

@ -29,4 +29,6 @@ struct StoredObject
using StoredObjects = std::vector<StoredObject>;
size_t getTotalSize(const StoredObjects & objects);
}

View File

@ -9,6 +9,7 @@
#include <IO/WriteHelpers.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/AsynchronousBoundedReadBuffer.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/ReadBufferFromWebServer.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
@ -189,8 +190,8 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{
auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(
reader, read_settings, std::move(web_impl),
return std::make_unique<AsynchronousBoundedReadBuffer>(
std::move(web_impl), reader, read_settings,
global_context->getAsyncReadCounters(),
global_context->getFilesystemReadPrefetchesLog());
}

View File

@ -27,5 +27,6 @@ struct AsyncReadCounters
void dumpToMapColumn(IColumn * column) const;
};
using AsyncReadCountersPtr = std::shared_ptr<AsyncReadCounters>;
}

View File

@ -27,8 +27,6 @@ public:
ReadBuffer & getWrappedReadBuffer() { return *impl; }
bool isIntegratedWithFilesystemCache() const override { return impl->isIntegratedWithFilesystemCache(); }
size_t getFileSize() override;
protected:

View File

@ -49,8 +49,6 @@ public:
/// If true, setReadUntilPosition() guarantees that eof will be reported at the given position.
virtual bool supportsRightBoundedReads() const { return false; }
virtual bool isIntegratedWithFilesystemCache() const { return false; }
/// Returns true if seek() actually works, false if seek() will always throw (or make subsequent
/// nextImpl() calls throw).
///

View File

@ -1386,6 +1386,20 @@ void Context::addQueryAccessInfo(
query_access_info.views.emplace(view_name);
}
void Context::addQueryAccessInfo(const Names & partition_names)
{
if (isGlobalContext())
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have query access info");
}
std::lock_guard<std::mutex> lock(query_access_info.mutex);
for (const auto & partition_name : partition_names)
{
query_access_info.partitions.emplace(partition_name);
}
}
void Context::addQueryFactoriesInfo(QueryLogFactories factory_type, const String & created_object) const
{
if (isGlobalContext())
@ -2796,11 +2810,7 @@ zkutil::ZooKeeperPtr Context::getAuxiliaryZooKeeper(const String & name) const
std::map<String, zkutil::ZooKeeperPtr> Context::getAuxiliaryZooKeepers() const
{
std::lock_guard lock(shared->auxiliary_zookeepers_mutex);
if (!shared->auxiliary_zookeepers.empty())
return shared->auxiliary_zookeepers;
else
return std::map<String, zkutil::ZooKeeperPtr>();
return shared->auxiliary_zookeepers;
}
#if USE_ROCKSDB
@ -4314,7 +4324,7 @@ Context::ParallelReplicasMode Context::getParallelReplicasMode() const
if (!settings_.parallel_replicas_custom_key.value.empty())
return CUSTOM_KEY;
if (settings_.allow_experimental_parallel_reading_from_replicas
if (settings_.allow_experimental_parallel_reading_from_replicas > 0
&& !settings_.use_hedged_requests)
return READ_TASKS;

View File

@ -297,6 +297,7 @@ private:
databases = rhs.databases;
tables = rhs.tables;
columns = rhs.columns;
partitions = rhs.partitions;
projections = rhs.projections;
views = rhs.views;
}
@ -314,6 +315,7 @@ private:
std::swap(databases, rhs.databases);
std::swap(tables, rhs.tables);
std::swap(columns, rhs.columns);
std::swap(partitions, rhs.partitions);
std::swap(projections, rhs.projections);
std::swap(views, rhs.views);
}
@ -323,6 +325,7 @@ private:
std::set<std::string> databases{};
std::set<std::string> tables{};
std::set<std::string> columns{};
std::set<std::string> partitions{};
std::set<std::string> projections{};
std::set<std::string> views{};
};
@ -631,6 +634,7 @@ public:
const Names & column_names,
const String & projection_name = {},
const String & view_name = {});
void addQueryAccessInfo(const Names & partition_names);
/// Supported factories for records in query_log

View File

@ -969,6 +969,15 @@ const ASTSelectQuery * ExpressionAnalyzer::getSelectQuery() const
return select_query;
}
bool ExpressionAnalyzer::isRemoteStorage() const
{
const Settings & csettings = getContext()->getSettingsRef();
// Consider any storage used with parallel replicas as remote, so the query is executed on multiple servers
const bool enable_parallel_processing_of_joins
= csettings.max_parallel_replicas > 1 && csettings.allow_experimental_parallel_reading_from_replicas > 0;
return syntax->is_remote_storage || enable_parallel_processing_of_joins;
}
const ASTSelectQuery * SelectQueryExpressionAnalyzer::getAggregatingQuery() const
{
if (!has_aggregation)

View File

@ -201,7 +201,7 @@ protected:
const ASTSelectQuery * getSelectQuery() const;
bool isRemoteStorage() const { return syntax->is_remote_storage; }
bool isRemoteStorage() const;
NamesAndTypesList getColumnsAfterArrayJoin(ActionsDAGPtr & actions, const NamesAndTypesList & src_columns);
NamesAndTypesList analyzeJoin(ActionsDAGPtr & actions, const NamesAndTypesList & src_columns);

View File

@ -45,4 +45,6 @@ public:
using SystemLog<FilesystemReadPrefetchesLogElement>::SystemLog;
};
using FilesystemReadPrefetchesLogPtr = std::shared_ptr<FilesystemReadPrefetchesLog>;
}

View File

@ -50,7 +50,16 @@ bool FillingRow::operator>=(const FillingRow & other) const
return !(*this < other);
}
bool FillingRow::next(const FillingRow & to_row)
bool FillingRow::isNull() const
{
for (const auto & field : row)
if (!field.isNull())
return false;
return true;
}
std::pair<bool, bool> FillingRow::next(const FillingRow & to_row)
{
const size_t row_size = size();
size_t pos = 0;
@ -61,22 +70,24 @@ bool FillingRow::next(const FillingRow & to_row)
break;
if (pos == row_size || less(to_row.row[pos], row[pos], getDirection(pos)))
return false;
return {false, false};
/// If we have any 'fill_to' value at position greater than 'pos',
/// we need to generate rows up to 'fill_to' value.
for (size_t i = row_size - 1; i > pos; --i)
{
if (getFillDescription(i).fill_to.isNull() || row[i].isNull())
auto & fill_column_desc = getFillDescription(i);
if (fill_column_desc.fill_to.isNull() || row[i].isNull())
continue;
auto next_value = row[i];
getFillDescription(i).step_func(next_value);
if (less(next_value, getFillDescription(i).fill_to, getDirection(i)))
Field next_value = row[i];
fill_column_desc.step_func(next_value);
if (less(next_value, fill_column_desc.fill_to, getDirection(i)))
{
row[i] = next_value;
initFromDefaults(i + 1);
return true;
return {true, true};
}
}
@ -84,14 +95,13 @@ bool FillingRow::next(const FillingRow & to_row)
getFillDescription(pos).step_func(next_value);
if (less(to_row.row[pos], next_value, getDirection(pos)) || equals(next_value, getFillDescription(pos).fill_to))
return false;
return {false, false};
row[pos] = next_value;
if (equals(row[pos], to_row.row[pos]))
{
bool is_less = false;
size_t i = pos + 1;
for (; i < row_size; ++i)
for (size_t i = pos + 1; i < row_size; ++i)
{
const auto & fill_from = getFillDescription(i).fill_from;
if (!fill_from.isNull())
@ -101,11 +111,11 @@ bool FillingRow::next(const FillingRow & to_row)
is_less |= less(row[i], to_row.row[i], getDirection(i));
}
return is_less;
return {is_less, true};
}
initFromDefaults(pos + 1);
return true;
return {true, true};
}
void FillingRow::initFromDefaults(size_t from_pos)

View File

@ -19,7 +19,10 @@ public:
explicit FillingRow(const SortDescription & sort_description);
/// Generates next row according to fill 'from', 'to' and 'step' values.
bool next(const FillingRow & to_row);
/// Returns a pair of booleans:
/// apply - true if filling values should be inserted into the result set
/// value_changed - true if the filling row value was changed
std::pair<bool, bool> next(const FillingRow & to_row);
void initFromDefaults(size_t from_pos = 0);
@ -29,9 +32,11 @@ public:
bool operator<(const FillingRow & other) const;
bool operator==(const FillingRow & other) const;
bool operator>=(const FillingRow & other) const;
bool isNull() const;
int getDirection(size_t index) const { return sort_description[index].direction; }
FillColumnDescription & getFillDescription(size_t index) { return sort_description[index].fill_description; }
const FillColumnDescription & getFillDescription(size_t index) const { return sort_description[index].fill_description; }
String dump() const;
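The two flags returned by `next()` are easiest to read next to the loop that consumes them. Below is a minimal, self-contained sketch (illustrative types only, not the real FillingRow API) of driving such a generator until `apply` turns false while tracking whether the last generated value was actually emitted, which is the role `filling_row_inserted` plays in the transform:

```cpp
#include <iostream>
#include <utility>

// Illustrative stand-in for a row generator: steps an integer towards a bound.
struct StepGenerator
{
    int current = 0;
    int step = 3;

    // Returns {apply, value_changed}: apply means the new value is still below
    // the bound and should be emitted; value_changed means current was advanced.
    std::pair<bool, bool> next(int bound)
    {
        int candidate = current + step;
        if (candidate >= bound)
            return {false, false};
        current = candidate;
        return {true, true};
    }
};

int main()
{
    StepGenerator gen;
    bool last_value_emitted = true;

    while (true)
    {
        const auto [apply, changed] = gen.next(10);
        if (changed)
            last_value_emitted = false; // a new value exists but has not been emitted yet
        if (!apply)
            break;
        std::cout << "emit " << gen.current << '\n';
        last_value_emitted = true;
    }
    std::cout << std::boolalpha << last_value_emitted << '\n';
}
```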

View File

@ -205,10 +205,19 @@ public:
}
private:
static bool shouldBeExecutedGlobally(const Data & data)
{
const Settings & settings = data.getContext()->getSettingsRef();
/// For parallel replicas we reinterpret JOIN as GLOBAL JOIN as a way to broadcast data
const bool enable_parallel_processing_of_joins = data.getContext()->canUseParallelReplicasOnInitiator();
return settings.prefer_global_in_and_join || enable_parallel_processing_of_joins;
}
/// GLOBAL IN
static void visit(ASTFunction & func, ASTPtr &, Data & data)
{
if ((data.getContext()->getSettingsRef().prefer_global_in_and_join
if ((shouldBeExecutedGlobally(data)
&& (func.name == "in" || func.name == "notIn" || func.name == "nullIn" || func.name == "notNullIn"))
|| func.name == "globalIn" || func.name == "globalNotIn" || func.name == "globalNullIn" || func.name == "globalNotNullIn")
{
@ -238,8 +247,7 @@ private:
static void visit(ASTTablesInSelectQueryElement & table_elem, ASTPtr &, Data & data)
{
if (table_elem.table_join
&& (table_elem.table_join->as<ASTTableJoin &>().locality == JoinLocality::Global
|| data.getContext()->getSettingsRef().prefer_global_in_and_join))
&& (table_elem.table_join->as<ASTTableJoin &>().locality == JoinLocality::Global || shouldBeExecutedGlobally(data)))
{
data.addExternalStorage(table_elem.table_expression, true);
data.has_global_subqueries = true;

View File

@ -116,6 +116,7 @@ namespace ErrorCodes
extern const int ACCESS_DENIED;
extern const int UNKNOWN_IDENTIFIER;
extern const int BAD_ARGUMENTS;
extern const int SUPPORT_IS_DISABLED;
}
/// Assumes `storage` is set and the table filter (row-level security) is not empty.
@ -385,6 +386,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
query_info.ignore_projections = options.ignore_projections;
query_info.is_projection_query = options.is_projection_query;
query_info.is_internal = options.is_internal;
initSettings();
const Settings & settings = context->getSettingsRef();
@ -408,6 +410,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
ApplyWithSubqueryVisitor().visit(query_ptr);
}
query_info.query = query_ptr->clone();
query_info.original_query = query_ptr->clone();
if (settings.count_distinct_optimization)
@ -455,25 +458,42 @@ InterpreterSelectQuery::InterpreterSelectQuery(
}
}
if (joined_tables.tablesCount() > 1 && (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas))
/// Check support for JOIN for parallel replicas with custom key
if (joined_tables.tablesCount() > 1 && !settings.parallel_replicas_custom_key.value.empty())
{
LOG_WARNING(log, "Joins are not supported with parallel replicas. Query will be executed without using them.");
context->setSetting("allow_experimental_parallel_reading_from_replicas", false);
LOG_WARNING(log, "JOINs are not supported with parallel_replicas_custom_key. Query will be executed without using them.");
context->setSetting("parallel_replicas_custom_key", String{""});
}
/// Try to execute query without parallel replicas if we find that there is a FINAL modifier there.
bool is_query_with_final = false;
if (query_info.table_expression_modifiers)
is_query_with_final = query_info.table_expression_modifiers->hasFinal();
else if (query_info.query)
is_query_with_final = query_info.query->as<ASTSelectQuery &>().final();
if (is_query_with_final && (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas))
/// Check support for FINAL for parallel replicas
bool is_query_with_final = isQueryWithFinal(query_info);
if (is_query_with_final && (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas > 0))
{
LOG_WARNING(log, "FINAL modifier is supported with parallel replicas. Will try to execute the query without using them.");
context->setSetting("allow_experimental_parallel_reading_from_replicas", false);
context->setSetting("parallel_replicas_custom_key", String{""});
if (settings.allow_experimental_parallel_reading_from_replicas == 1)
{
LOG_WARNING(log, "FINAL modifier is not supported with parallel replicas. Query will be executed without using them.");
context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
context->setSetting("parallel_replicas_custom_key", String{""});
}
else if (settings.allow_experimental_parallel_reading_from_replicas == 2)
{
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "FINAL modifier is not supported with parallel replicas");
}
}
/// Check support for parallel replicas for non-replicated storage (plain MergeTree)
bool is_plain_merge_tree = storage && storage->isMergeTree() && !storage->supportsReplication();
if (is_plain_merge_tree && settings.allow_experimental_parallel_reading_from_replicas > 0 && !settings.parallel_replicas_for_non_replicated_merge_tree)
{
if (settings.allow_experimental_parallel_reading_from_replicas == 1)
{
LOG_WARNING(log, "To use parallel replicas with plain MergeTree tables please enable setting `parallel_replicas_for_non_replicated_merge_tree`. For now query will be executed without using them.");
context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
}
else if (settings.allow_experimental_parallel_reading_from_replicas == 2)
{
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "To use parallel replicas with plain MergeTree tables please enable setting `parallel_replicas_for_non_replicated_merge_tree`");
}
}
/// Rewrite JOINs
@ -2994,20 +3014,27 @@ void InterpreterSelectQuery::executeWithFill(QueryPlan & query_plan)
auto & query = getSelectQuery();
if (query.orderBy())
{
SortDescription order_descr = getSortDescription(query, context);
SortDescription fill_descr;
for (auto & desc : order_descr)
SortDescription sort_description = getSortDescription(query, context);
SortDescription fill_description;
for (auto & desc : sort_description)
{
if (desc.with_fill)
fill_descr.push_back(desc);
fill_description.push_back(desc);
}
if (fill_descr.empty())
if (fill_description.empty())
return;
InterpolateDescriptionPtr interpolate_descr =
getInterpolateDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context);
auto filling_step = std::make_unique<FillingStep>(query_plan.getCurrentDataStream(), std::move(fill_descr), interpolate_descr);
const Settings & settings = context->getSettingsRef();
auto filling_step = std::make_unique<FillingStep>(
query_plan.getCurrentDataStream(),
std::move(sort_description),
std::move(fill_description),
interpolate_descr,
settings.use_with_fill_by_sorting_prefix);
query_plan.addStep(std::move(filling_step));
}
}
@ -3126,4 +3153,14 @@ void InterpreterSelectQuery::initSettings()
}
}
bool InterpreterSelectQuery::isQueryWithFinal(const SelectQueryInfo & info)
{
bool result = info.query->as<ASTSelectQuery &>().final();
if (info.table_expression_modifiers)
result |= info.table_expression_modifiers->hasFinal();
return result;
}
}
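The hunks above repeat a warn-or-throw pattern: `allow_experimental_parallel_reading_from_replicas` is now an integer, where 1 means best effort (log a warning and disable the feature for this query) and 2 means strict (fail the query). A minimal sketch of that decision, with hypothetical names and a plain `std::runtime_error` standing in for ClickHouse's Exception class:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical helper mirroring the warn-or-throw handling of the tri-state setting (0/1/2).
void checkUnsupportedFeature(uint64_t parallel_replicas_mode, const std::string & reason, bool & use_parallel_replicas)
{
    if (parallel_replicas_mode == 0 || !use_parallel_replicas)
        return;

    if (parallel_replicas_mode == 1)
    {
        // Best effort: warn and silently disable parallel replicas for this query.
        std::cerr << reason << ". Query will be executed without parallel replicas.\n";
        use_parallel_replicas = false;
    }
    else // mode == 2
    {
        // Strict: the user explicitly asked for parallel replicas, so fail loudly.
        throw std::runtime_error(reason);
    }
}

int main()
{
    bool use_parallel_replicas = true;
    checkUnsupportedFeature(1, "FINAL modifier is not supported with parallel replicas", use_parallel_replicas);
    std::cout << std::boolalpha << use_parallel_replicas << '\n'; // false
}
```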

View File

@ -131,6 +131,8 @@ public:
static SortDescription getSortDescription(const ASTSelectQuery & query, const ContextPtr & context);
static UInt64 getLimitForSorting(const ASTSelectQuery & query, const ContextPtr & context);
static bool isQueryWithFinal(const SelectQueryInfo & info);
private:
InterpreterSelectQuery(
const ASTPtr & query_ptr_,

View File

@ -0,0 +1,121 @@
#include <Interpreters/OptimizeDateFilterVisitor.h>
#include <Common/DateLUT.h>
#include <Common/DateLUTImpl.h>
#include <Parsers/ASTIdentifier.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
namespace DB
{
ASTPtr generateOptimizedDateFilterAST(const String & comparator, const String & converter, const String & column, UInt64 year)
{
const DateLUTImpl & date_lut = DateLUT::instance();
if (converter != "toYear") return {};
String start_date = date_lut.dateToString(date_lut.makeDayNum(year, 1, 1));
String end_date = date_lut.dateToString(date_lut.makeDayNum(year, 12, 31));
if (comparator == "equals")
{
return makeASTFunction("and",
makeASTFunction("greaterOrEquals",
std::make_shared<ASTIdentifier>(column),
std::make_shared<ASTLiteral>(start_date)
),
makeASTFunction("lessOrEquals",
std::make_shared<ASTIdentifier>(column),
std::make_shared<ASTLiteral>(end_date)
)
);
}
else if (comparator == "notEquals")
{
return makeASTFunction("or",
makeASTFunction("less",
std::make_shared<ASTIdentifier>(column),
std::make_shared<ASTLiteral>(start_date)
),
makeASTFunction("greater",
std::make_shared<ASTIdentifier>(column),
std::make_shared<ASTLiteral>(end_date)
)
);
}
else if (comparator == "less" || comparator == "greaterOrEquals")
{
return makeASTFunction(comparator,
std::make_shared<ASTIdentifier>(column),
std::make_shared<ASTLiteral>(start_date)
);
}
else
{
return makeASTFunction(comparator,
std::make_shared<ASTIdentifier>(column),
std::make_shared<ASTLiteral>(end_date)
);
}
}
bool rewritePredicateInPlace(ASTFunction & function, ASTPtr & ast)
{
const static std::unordered_map<String, String> swap_relations = {
{"equals", "equals"},
{"notEquals", "notEquals"},
{"less", "greater"},
{"greater", "less"},
{"lessOrEquals", "greaterOrEquals"},
{"greaterOrEquals", "lessOrEquals"},
};
if (!swap_relations.contains(function.name)) return false;
if (!function.arguments || function.arguments->children.size() != 2) return false;
size_t func_id = function.arguments->children.size();
for (size_t i = 0; i < function.arguments->children.size(); i++)
{
if (const auto * func = function.arguments->children[i]->as<ASTFunction>(); func)
{
if (func->name == "toYear")
{
func_id = i;
}
}
}
if (func_id == function.arguments->children.size()) return false;
size_t literal_id = 1 - func_id;
const auto * literal = function.arguments->children[literal_id]->as<ASTLiteral>();
if (!literal || literal->value.getType() != Field::Types::UInt64) return false;
UInt64 compare_to = literal->value.get<UInt64>();
String comparator = literal_id > func_id ? function.name : swap_relations.at(function.name);
const auto * func = function.arguments->children[func_id]->as<ASTFunction>();
const auto * column_id = func->arguments->children.at(0)->as<ASTIdentifier>();
if (!column_id) return false;
String column = column_id->name();
const auto new_ast = generateOptimizedDateFilterAST(comparator, func->name, column, compare_to);
if (!new_ast) return false;
ast = new_ast;
return true;
}
void OptimizeDateFilterInPlaceData::visit(ASTFunction & function, ASTPtr & ast) const
{
rewritePredicateInPlace(function, ast);
}
}
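`generateOptimizedDateFilterAST` rewrites comparisons on `toYear(column)` into range predicates on the column itself, so indexes on the raw date column stay usable. A standalone sketch of the same comparator mapping that renders the rewritten predicate as text (fixed year boundaries, illustrative only):

```cpp
#include <iostream>
#include <string>

// Illustrative: rewrite `toYear(col) <cmp> year` into a predicate on `col` itself,
// using the first and last day of the year as boundaries.
std::string rewriteToYearPredicate(const std::string & cmp, const std::string & column, unsigned year)
{
    const std::string start = std::to_string(year) + "-01-01";
    const std::string end = std::to_string(year) + "-12-31";

    if (cmp == "equals")
        return column + " >= '" + start + "' AND " + column + " <= '" + end + "'";
    if (cmp == "notEquals")
        return column + " < '" + start + "' OR " + column + " > '" + end + "'";
    if (cmp == "less" || cmp == "greaterOrEquals")
        return column + (cmp == "less" ? " < '" : " >= '") + start + "'";
    // greater / lessOrEquals compare against the end of the year.
    return column + (cmp == "greater" ? " > '" : " <= '") + end + "'";
}

int main()
{
    // toYear(d) = 2023  ->  d >= '2023-01-01' AND d <= '2023-12-31'
    std::cout << rewriteToYearPredicate("equals", "d", 2023) << '\n';
    // toYear(d) > 2023  ->  d > '2023-12-31'
    std::cout << rewriteToYearPredicate("greater", "d", 2023) << '\n';
}
```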

View File

@ -0,0 +1,20 @@
#pragma once
#include <Interpreters/InDepthNodeVisitor.h>
namespace DB
{
class ASTFunction;
/// Rewrite the predicates in place
class OptimizeDateFilterInPlaceData
{
public:
using TypeToVisit = ASTFunction;
void visit(ASTFunction & function, ASTPtr & ast) const;
};
using OptimizeDateFilterInPlaceMatcher = OneTypeMatcher<OptimizeDateFilterInPlaceData>;
using OptimizeDateFilterInPlaceVisitor = InDepthNodeVisitor<OptimizeDateFilterInPlaceMatcher, true>;
}

View File

@ -29,6 +29,7 @@ NamesAndTypesList ProcessorProfileLogElement::getNamesAndTypes()
{"plan_step", std::make_shared<DataTypeUInt64>()},
{"plan_group", std::make_shared<DataTypeUInt64>()},
{"initial_query_id", std::make_shared<DataTypeString>()},
{"query_id", std::make_shared<DataTypeString>()},
{"name", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"elapsed_us", std::make_shared<DataTypeUInt64>()},
@ -60,6 +61,7 @@ void ProcessorProfileLogElement::appendToBlock(MutableColumns & columns) const
columns[i++]->insert(plan_step);
columns[i++]->insert(plan_group);
columns[i++]->insertData(initial_query_id.data(), initial_query_id.size());
columns[i++]->insertData(query_id.data(), query_id.size());
columns[i++]->insertData(processor_name.data(), processor_name.size());
columns[i++]->insert(elapsed_us);

View File

@ -19,6 +19,7 @@ struct ProcessorProfileLogElement
UInt64 plan_step{};
UInt64 plan_group{};
String initial_query_id;
String query_id;
String processor_name;

View File

@ -70,6 +70,7 @@ NamesAndTypesList QueryLogElement::getNamesAndTypes()
{"databases", array_low_cardinality_string},
{"tables", array_low_cardinality_string},
{"columns", array_low_cardinality_string},
{"partitions", array_low_cardinality_string},
{"projections", array_low_cardinality_string},
{"views", array_low_cardinality_string},
{"exception_code", std::make_shared<DataTypeInt32>()},
@ -176,6 +177,7 @@ void QueryLogElement::appendToBlock(MutableColumns & columns) const
auto & column_databases = typeid_cast<ColumnArray &>(*columns[i++]);
auto & column_tables = typeid_cast<ColumnArray &>(*columns[i++]);
auto & column_columns = typeid_cast<ColumnArray &>(*columns[i++]);
auto & column_partitions = typeid_cast<ColumnArray &>(*columns[i++]);
auto & column_projections = typeid_cast<ColumnArray &>(*columns[i++]);
auto & column_views = typeid_cast<ColumnArray &>(*columns[i++]);
@ -194,6 +196,7 @@ void QueryLogElement::appendToBlock(MutableColumns & columns) const
fill_column(query_databases, column_databases);
fill_column(query_tables, column_tables);
fill_column(query_columns, column_columns);
fill_column(query_partitions, column_partitions);
fill_column(query_projections, column_projections);
fill_column(query_views, column_views);
}

View File

@ -65,6 +65,7 @@ struct QueryLogElement
std::set<String> query_databases;
std::set<String> query_tables;
std::set<String> query_columns;
std::set<String> query_partitions;
std::set<String> query_projections;
std::set<String> query_views;

View File

@ -25,6 +25,7 @@
#include <Interpreters/GatherFunctionQuantileVisitor.h>
#include <Interpreters/RewriteSumIfFunctionVisitor.h>
#include <Interpreters/RewriteArrayExistsFunctionVisitor.h>
#include <Interpreters/OptimizeDateFilterVisitor.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
@ -677,6 +678,21 @@ void optimizeInjectiveFunctionsInsideUniq(ASTPtr & query, ContextPtr context)
RemoveInjectiveFunctionsVisitor(data).visit(query);
}
void optimizeDateFilters(ASTSelectQuery * select_query)
{
/// Predicates in the HAVING clause have been moved to the WHERE clause.
if (select_query->where())
{
OptimizeDateFilterInPlaceVisitor::Data data;
OptimizeDateFilterInPlaceVisitor(data).visit(select_query->refWhere());
}
if (select_query->prewhere())
{
OptimizeDateFilterInPlaceVisitor::Data data;
OptimizeDateFilterInPlaceVisitor(data).visit(select_query->refPrewhere());
}
}
void transformIfStringsIntoEnum(ASTPtr & query)
{
std::unordered_set<String> function_names = {"if", "transform"};
@ -780,6 +796,9 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result,
tables_with_columns, result.storage_snapshot->metadata, result.storage);
}
/// Rewrite date filters to avoid calls to converters such as toYear, toYYYYMM, toISOWeek, etc.
optimizeDateFilters(select_query);
/// GROUP BY injective function elimination.
optimizeGroupBy(select_query, context);

View File

@ -837,6 +837,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
elem.query_databases = info.databases;
elem.query_tables = info.tables;
elem.query_columns = info.columns;
elem.query_partitions = info.partitions;
elem.query_projections = info.projections;
elem.query_views = info.views;
}
@ -901,6 +902,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
element.query_databases.insert(access_info.databases.begin(), access_info.databases.end());
element.query_tables.insert(access_info.tables.begin(), access_info.tables.end());
element.query_columns.insert(access_info.columns.begin(), access_info.columns.end());
element.query_partitions.insert(access_info.partitions.begin(), access_info.partitions.end());
element.query_projections.insert(access_info.projections.begin(), access_info.projections.end());
element.query_views.insert(access_info.views.begin(), access_info.views.end());
@ -1003,6 +1005,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
ProcessorProfileLogElement processor_elem;
processor_elem.event_time = elem.event_time;
processor_elem.event_time_microseconds = elem.event_time_microseconds;
processor_elem.initial_query_id = elem.client_info.initial_query_id;
processor_elem.query_id = elem.client_info.current_query_id;
auto get_proc_id = [](const IProcessor & proc) -> UInt64

View File

@ -112,8 +112,6 @@ std::shared_ptr<InterpreterSelectWithUnionQuery> interpretSubquery(
subquery_options.removeDuplicates();
}
/// We don't want to execute reading for subqueries in parallel
subquery_context->setSetting("allow_experimental_parallel_reading_from_replicas", false);
return std::make_shared<InterpreterSelectWithUnionQuery>(query, subquery_context, subquery_options, required_source_columns);
}

View File

@ -83,6 +83,7 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
extern const int TOO_DEEP_SUBQUERIES;
extern const int NOT_IMPLEMENTED;
extern const int SUPPORT_IS_DISABLED;
}
/** ClickHouse query planner.
@ -622,7 +623,14 @@ void addWithFillStepIfNeeded(QueryPlan & query_plan,
interpolate_description = std::make_shared<InterpolateDescription>(std::move(interpolate_actions_dag), empty_aliases);
}
auto filling_step = std::make_unique<FillingStep>(query_plan.getCurrentDataStream(), std::move(fill_description), interpolate_description);
const auto & query_context = planner_context->getQueryContext();
const Settings & settings = query_context->getSettingsRef();
auto filling_step = std::make_unique<FillingStep>(
query_plan.getCurrentDataStream(),
sort_description,
std::move(fill_description),
interpolate_description,
settings.use_with_fill_by_sorting_prefix);
query_plan.addStep(std::move(filling_step));
}
@ -1185,16 +1193,25 @@ void Planner::buildPlanForQueryNode()
const auto & settings = query_context->getSettingsRef();
if (planner_context->getTableExpressionNodeToData().size() > 1
&& (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas))
&& (!settings.parallel_replicas_custom_key.value.empty() || settings.allow_experimental_parallel_reading_from_replicas > 0))
{
LOG_WARNING(
&Poco::Logger::get("Planner"), "Joins are not supported with parallel replicas. Query will be executed without using them.");
if (settings.allow_experimental_parallel_reading_from_replicas == 1)
{
LOG_WARNING(
&Poco::Logger::get("Planner"), "JOINs are not supported with parallel replicas. Query will be executed without using them.");
auto & mutable_context = planner_context->getMutableQueryContext();
mutable_context->setSetting("allow_experimental_parallel_reading_from_replicas", false);
mutable_context->setSetting("parallel_replicas_custom_key", String{""});
auto & mutable_context = planner_context->getMutableQueryContext();
mutable_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
mutable_context->setSetting("parallel_replicas_custom_key", String{""});
}
else if (settings.allow_experimental_parallel_reading_from_replicas == 2)
{
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "JOINs are not supported with parallel replicas");
}
}
/// TODO: Also disable parallel replicas in case of FINAL
auto top_level_identifiers = collectTopLevelColumnIdentifiers(query_tree, planner_context);
auto join_tree_query_plan = buildJoinTreeQueryPlan(query_tree,
select_query_info,
@ -1432,7 +1449,8 @@ void Planner::buildPlanForQueryNode()
addLimitByStep(query_plan, limit_by_analysis_result, query_node);
}
addWithFillStepIfNeeded(query_plan, query_analysis_result, planner_context, query_node);
if (query_node.hasOrderBy())
addWithFillStepIfNeeded(query_plan, query_analysis_result, planner_context, query_node);
bool apply_offset = query_processing_info.getToStage() != QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit;

View File

@ -27,9 +27,17 @@ static ITransformingStep::Traits getTraits()
};
}
FillingStep::FillingStep(const DataStream & input_stream_, SortDescription sort_description_, InterpolateDescriptionPtr interpolate_description_)
FillingStep::FillingStep(
const DataStream & input_stream_,
SortDescription sort_description_,
SortDescription fill_description_,
InterpolateDescriptionPtr interpolate_description_,
bool use_with_fill_by_sorting_prefix_)
: ITransformingStep(input_stream_, FillingTransform::transformHeader(input_stream_.header, sort_description_), getTraits())
, sort_description(std::move(sort_description_)), interpolate_description(interpolate_description_)
, sort_description(std::move(sort_description_))
, fill_description(std::move(fill_description_))
, interpolate_description(interpolate_description_)
, use_with_fill_by_sorting_prefix(use_with_fill_by_sorting_prefix_)
{
if (!input_stream_.has_single_port)
throw Exception(ErrorCodes::LOGICAL_ERROR, "FillingStep expects single input");
@ -40,9 +48,10 @@ void FillingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build
pipeline.addSimpleTransform([&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
{
if (stream_type == QueryPipelineBuilder::StreamType::Totals)
return std::make_shared<FillingNoopTransform>(header, sort_description);
return std::make_shared<FillingNoopTransform>(header, fill_description);
return std::make_shared<FillingTransform>(header, sort_description, std::move(interpolate_description));
return std::make_shared<FillingTransform>(
header, sort_description, fill_description, std::move(interpolate_description), use_with_fill_by_sorting_prefix);
});
}

View File

@ -10,7 +10,12 @@ namespace DB
class FillingStep : public ITransformingStep
{
public:
FillingStep(const DataStream & input_stream_, SortDescription sort_description_, InterpolateDescriptionPtr interpolate_description_);
FillingStep(
const DataStream & input_stream_,
SortDescription sort_description_,
SortDescription fill_description_,
InterpolateDescriptionPtr interpolate_description_,
bool use_with_fill_by_sorting_prefix);
String getName() const override { return "Filling"; }
@ -25,7 +30,9 @@ private:
void updateOutputStream() override;
SortDescription sort_description;
SortDescription fill_description;
InterpolateDescriptionPtr interpolate_description;
const bool use_with_fill_by_sorting_prefix;
};
}

View File

@ -3,6 +3,7 @@
#include <IO/Operators.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExpressionAnalyzer.h>
#include <Interpreters/InterpreterSelectQuery.h>
#include <Interpreters/TreeRewriter.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
@ -99,7 +100,6 @@ namespace ErrorCodes
extern const int INDEX_NOT_USED;
extern const int LOGICAL_ERROR;
extern const int TOO_MANY_ROWS;
extern const int SUPPORT_IS_DISABLED;
}
static MergeTreeReaderSettings getMergeTreeReaderSettings(
@ -1314,7 +1314,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
auto reader_settings = getMergeTreeReaderSettings(context, query_info);
bool use_skip_indexes = settings.use_skip_indexes;
bool final = isFinal(query_info);
bool final = InterpreterSelectQuery::isQueryWithFinal(query_info);
if (final && !settings.use_skip_indexes_if_final)
use_skip_indexes = false;
@ -1377,7 +1377,7 @@ bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction,
/// Disable read-in-order optimization for reverse order with final.
/// Otherwise, it can lead to incorrect final behavior because the implementation may rely on reading in direct order.
if (direction != 1 && isFinal(query_info))
if (direction != 1 && isQueryWithFinal())
return false;
auto order_info = std::make_shared<InputOrderInfo>(SortDescription{}, prefix_size, direction, limit);
@ -1500,11 +1500,7 @@ ReadFromMergeTree::AnalysisResult ReadFromMergeTree::getAnalysisResult() const
bool ReadFromMergeTree::isQueryWithFinal() const
{
const auto & select = query_info.query->as<ASTSelectQuery &>();
if (query_info.table_expression_modifiers)
return query_info.table_expression_modifiers->hasFinal();
else
return select.final();
return InterpreterSelectQuery::isQueryWithFinal(query_info);
}
bool ReadFromMergeTree::isQueryWithSampling() const
@ -1522,7 +1518,7 @@ bool ReadFromMergeTree::isQueryWithSampling() const
Pipe ReadFromMergeTree::spreadMarkRanges(
RangesInDataParts && parts_with_ranges, size_t num_streams, AnalysisResult & result, ActionsDAGPtr & result_projection)
{
bool final = isQueryWithFinal();
const bool final = isQueryWithFinal();
const auto & input_order_info = query_info.getInputOrderInfo();
Names column_names_to_read = result.column_names_to_read;
@ -1539,8 +1535,7 @@ Pipe ReadFromMergeTree::spreadMarkRanges(
if (final)
{
if (is_parallel_reading_from_replicas)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "FINAL modifier is not supported with parallel replicas");
chassert(!is_parallel_reading_from_replicas);
if (output_each_partition_through_separate_port)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Optimisation isn't supposed to be used for queries with final");
@ -1618,6 +1613,18 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons
result.selected_marks,
result.selected_ranges);
// Adding partition info to QueryAccessInfo.
if (context->hasQueryContext() && !query_info.is_internal)
{
Names partition_names;
for (const auto & part : result.parts_with_ranges)
{
partition_names.emplace_back(
fmt::format("{}.{}", data.getStorageID().getFullNameNotQuoted(), part.data_part->info.partition_id));
}
context->getQueryContext()->addQueryAccessInfo(partition_names);
}
ProfileEvents::increment(ProfileEvents::SelectedParts, result.selected_parts);
ProfileEvents::increment(ProfileEvents::SelectedRanges, result.selected_ranges);
ProfileEvents::increment(ProfileEvents::SelectedMarks, result.selected_marks);
@ -1948,15 +1955,6 @@ void ReadFromMergeTree::describeIndexes(JSONBuilder::JSONMap & map) const
}
}
bool ReadFromMergeTree::isFinal(const SelectQueryInfo & query_info)
{
if (query_info.table_expression_modifiers)
return query_info.table_expression_modifiers->hasFinal();
const auto & select = query_info.query->as<ASTSelectQuery &>();
return select.final();
}
bool MergeTreeDataSelectAnalysisResult::error() const
{
return std::holds_alternative<std::exception_ptr>(result);
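With the new `partitions` entry in QueryAccessInfo and query_log (see the Context and QueryLog hunks above), each selected partition is recorded as `<database>.<table>.<partition_id>`, as formatted in `initializePipeline` earlier in this file's diff. A tiny sketch of that formatting, using fmt as the diff does, with made-up table and partition values:

```cpp
#include <fmt/format.h>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    // Hypothetical values: a fully qualified table name and the partition ids of the selected parts.
    const std::string table_full_name = "default.hits";
    const std::vector<std::string> partition_ids = {"202305", "202306"};

    std::vector<std::string> partition_names;
    for (const auto & partition_id : partition_ids)
        partition_names.emplace_back(fmt::format("{}.{}", table_full_name, partition_id));

    for (const auto & name : partition_names)
        std::cout << name << '\n'; // e.g. default.hits.202305
}
```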

View File

@ -159,7 +159,6 @@ public:
void updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value);
static bool isFinal(const SelectQueryInfo & query_info);
bool isQueryWithFinal() const;
bool isQueryWithSampling() const;

View File

@ -187,25 +187,31 @@ static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr &
}
FillingTransform::FillingTransform(
const Block & header_, const SortDescription & sort_description_, InterpolateDescriptionPtr interpolate_description_)
: ISimpleTransform(header_, transformHeader(header_, sort_description_), true)
, sort_description(sort_description_)
, interpolate_description(interpolate_description_)
, filling_row(sort_description_)
, next_row(sort_description_)
const Block & header_,
const SortDescription & sort_description_,
const SortDescription & fill_description_,
InterpolateDescriptionPtr interpolate_description_,
const bool use_with_fill_by_sorting_prefix_)
: ISimpleTransform(header_, transformHeader(header_, fill_description_), true)
, sort_description(sort_description_)
, fill_description(fill_description_)
, interpolate_description(interpolate_description_)
, filling_row(fill_description_)
, next_row(fill_description_)
, use_with_fill_by_sorting_prefix(use_with_fill_by_sorting_prefix_)
{
if (interpolate_description)
interpolate_actions = std::make_shared<ExpressionActions>(interpolate_description->actions);
std::vector<bool> is_fill_column(header_.columns());
for (size_t i = 0, size = sort_description.size(); i < size; ++i)
for (size_t i = 0, size = fill_description.size(); i < size; ++i)
{
if (interpolate_description && interpolate_description->result_columns_set.contains(sort_description[i].column_name))
if (interpolate_description && interpolate_description->result_columns_set.contains(fill_description[i].column_name))
throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION,
"Column '{}' is participating in ORDER BY ... WITH FILL expression and can't be INTERPOLATE output",
sort_description[i].column_name);
fill_description[i].column_name);
size_t block_position = header_.getPositionByName(sort_description[i].column_name);
size_t block_position = header_.getPositionByName(fill_description[i].column_name);
is_fill_column[block_position] = true;
fill_column_positions.push_back(block_position);
@ -226,21 +232,40 @@ FillingTransform::FillingTransform(
"WITH FILL bound values cannot be negative for unsigned type {}", type->getName());
}
}
logDebug("fill description", dumpSortDescription(fill_description));
std::set<size_t> unique_positions;
for (auto pos : fill_column_positions)
if (!unique_positions.insert(pos).second)
throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION, "Multiple WITH FILL for identical expressions is not supported in ORDER BY");
if (use_with_fill_by_sorting_prefix)
{
/// build sorting prefix for first fill column
for (const auto & desc : sort_description)
{
if (desc.column_name == fill_description[0].column_name)
break;
size_t pos = header_.getPositionByName(desc.column_name);
sort_prefix_positions.push_back(pos);
sort_prefix.push_back(desc);
}
logDebug("sort prefix", dumpSortDescription(sort_prefix));
last_range_sort_prefix.reserve(sort_prefix.size());
}
size_t idx = 0;
for (const ColumnWithTypeAndName & column : header_.getColumnsWithTypeAndName())
{
if (interpolate_description)
if (const auto & p = interpolate_description->required_columns_map.find(column.name);
p != interpolate_description->required_columns_map.end())
input_positions.emplace_back(idx, p->second);
input_positions.emplace_back(idx, p->second);
if (!is_fill_column[idx] && !(interpolate_description && interpolate_description->result_columns_set.contains(column.name)))
if (!is_fill_column[idx] && !(interpolate_description && interpolate_description->result_columns_set.contains(column.name))
&& sort_prefix_positions.end() == std::find(sort_prefix_positions.begin(), sort_prefix_positions.end(), idx))
other_column_positions.push_back(idx);
++idx;
@ -249,6 +274,20 @@ FillingTransform::FillingTransform(
if (interpolate_description)
for (const auto & name : interpolate_description->result_columns_order)
interpolate_column_positions.push_back(header_.getPositionByName(name));
/// check for position conflicts between interpolate and sorting prefix columns
if (!sort_prefix_positions.empty() && !interpolate_column_positions.empty())
{
std::unordered_set<size_t> interpolate_positions(interpolate_column_positions.begin(), interpolate_column_positions.end());
for (auto sort_prefix_pos : sort_prefix_positions)
{
if (interpolate_positions.contains(sort_prefix_pos))
throw Exception(
ErrorCodes::INVALID_WITH_FILL_EXPRESSION,
"The same column in ORDER BY before WITH FILL (sorting prefix) and INTERPOLATE is not allowed. Column: {}",
(header_.begin() + sort_prefix_pos)->name);
}
}
}
/// prepare() is overridden to call transform() after all chunks are processed
@ -313,9 +352,14 @@ void FillingTransform::interpolate(const MutableColumns & result_columns, Block
using MutableColumnRawPtrs = std::vector<IColumn*>;
static void insertFromFillingRow(const MutableColumnRawPtrs & filling_columns, const MutableColumnRawPtrs & interpolate_columns, const MutableColumnRawPtrs & other_columns,
const FillingRow & filling_row, const Block & interpolate_block)
void FillingTransform::insertFromFillingRow(
const MutableColumnRawPtrs & filling_columns,
const MutableColumnRawPtrs & interpolate_columns,
const MutableColumnRawPtrs & other_columns,
const Block & interpolate_block)
{
logDebug("insertFromFillingRow", filling_row);
for (size_t i = 0, size = filling_columns.size(); i < size; ++i)
{
if (filling_row[i].isNull())
@ -338,10 +382,14 @@ static void insertFromFillingRow(const MutableColumnRawPtrs & filling_columns, c
for (auto * other_column : other_columns)
other_column->insertDefault();
filling_row_inserted = true;
}
static void copyRowFromColumns(const MutableColumnRawPtrs & dest, const Columns & source, size_t row_num)
{
chassert(dest.size() == source.size());
for (size_t i = 0, size = source.size(); i < size; ++i)
dest[i]->insertFrom(*source[i], row_num);
}
@ -353,7 +401,7 @@ static void initColumnsByPositions(
MutableColumnRawPtrs & output_columns_by_position,
const std::vector<size_t> & positions)
{
for (size_t pos : positions)
for (const size_t pos : positions)
{
input_columns_by_positions.push_back(input_columns[pos]);
output_columns_by_position.push_back(output_columns[pos].get());
@ -364,10 +412,12 @@ void FillingTransform::initColumns(
const Columns & input_columns,
Columns & input_fill_columns,
Columns & input_interpolate_columns,
Columns & input_sort_prefix_columns,
Columns & input_other_columns,
MutableColumns & output_columns,
MutableColumnRawPtrs & output_fill_columns,
MutableColumnRawPtrs & output_interpolate_columns,
MutableColumnRawPtrs & output_sort_prefix_columns,
MutableColumnRawPtrs & output_other_columns)
{
Columns non_const_columns;
@ -382,65 +432,236 @@ void FillingTransform::initColumns(
initColumnsByPositions(non_const_columns, input_fill_columns, output_columns, output_fill_columns, fill_column_positions);
initColumnsByPositions(
non_const_columns, input_interpolate_columns, output_columns, output_interpolate_columns, interpolate_column_positions);
initColumnsByPositions(non_const_columns, input_sort_prefix_columns, output_columns, output_sort_prefix_columns, sort_prefix_positions);
initColumnsByPositions(non_const_columns, input_other_columns, output_columns, output_other_columns, other_column_positions);
}
bool FillingTransform::generateSuffixIfNeeded(const Columns & input_columns, MutableColumns & result_columns)
{
logDebug("generateSuffixIfNeeded() filling_row", filling_row);
logDebug("generateSuffixIfNeeded() next_row", next_row);
logDebug("generateSuffixIfNeeded() first", first);
/// Determines should we insert filling row before start generating next rows.
bool should_insert_first = next_row < filling_row || first;
for (size_t i = 0, size = filling_row.size(); i < size; ++i)
next_row[i] = filling_row.getFillDescription(i).fill_to;
logDebug("generateSuffixIfNeeded() next_row updated", next_row);
if (!first && filling_row >= next_row)
{
logDebug("generateSuffixIfNeeded()", "no need to generate suffix");
return false;
}
Columns input_fill_columns;
Columns input_interpolate_columns;
Columns input_sort_prefix_columns;
Columns input_other_columns;
MutableColumnRawPtrs res_fill_columns;
MutableColumnRawPtrs res_interpolate_columns;
MutableColumnRawPtrs res_sort_prefix_columns;
MutableColumnRawPtrs res_other_columns;
initColumns(
input_columns,
input_fill_columns,
input_interpolate_columns,
input_sort_prefix_columns,
input_other_columns,
result_columns,
res_fill_columns,
res_interpolate_columns,
res_sort_prefix_columns,
res_other_columns);
if (first)
filling_row.initFromDefaults();
return generateSuffixIfNeeded(result_columns, res_fill_columns, res_interpolate_columns, res_sort_prefix_columns, res_other_columns);
}
bool FillingTransform::generateSuffixIfNeeded(
const MutableColumns & result_columns,
MutableColumnRawPtrs res_fill_columns,
MutableColumnRawPtrs res_interpolate_columns,
MutableColumnRawPtrs res_sort_prefix_columns,
MutableColumnRawPtrs res_other_columns)
{
logDebug("generateSuffixIfNeeded() filling_row", filling_row);
logDebug("generateSuffixIfNeeded() next_row", next_row);
/// Determines whether we should insert the filling row before starting to generate next rows
bool should_insert_first = (next_row < filling_row && !filling_row_inserted) || next_row.isNull();
logDebug("should_insert_first", should_insert_first);
for (size_t i = 0, size = filling_row.size(); i < size; ++i)
next_row[i] = filling_row.getFillDescription(i).fill_to;
logDebug("generateSuffixIfNeeded() next_row updated", next_row);
if (filling_row >= next_row)
{
logDebug("generateSuffixIfNeeded()", "no need to generate suffix");
return false;
}
Block interpolate_block;
if (should_insert_first && filling_row < next_row)
{
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, filling_row, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, interpolate_block);
/// fill sort prefix columns with last row values or defaults
if (!last_range_sort_prefix.empty())
copyRowFromColumns(res_sort_prefix_columns, last_range_sort_prefix, 0);
else
for (auto * sort_prefix_column : res_sort_prefix_columns)
sort_prefix_column->insertDefault();
}
while (filling_row.next(next_row))
bool filling_row_changed = false;
while (true)
{
const auto [apply, changed] = filling_row.next(next_row);
filling_row_changed = changed;
if (!apply)
break;
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, filling_row, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, interpolate_block);
/// fill sort prefix columns with last row values or defaults
if (!last_range_sort_prefix.empty())
copyRowFromColumns(res_sort_prefix_columns, last_range_sort_prefix, 0);
else
for (auto * sort_prefix_column : res_sort_prefix_columns)
sort_prefix_column->insertDefault();
}
/// new valid filling row was generated but not inserted
if (filling_row_changed)
filling_row_inserted = false;
return true;
}
template <typename Predicate>
size_t getRangeEnd(size_t begin, size_t end, Predicate pred)
{
chassert(begin < end);
const size_t linear_probe_threshold = 16;
size_t linear_probe_end = begin + linear_probe_threshold;
if (linear_probe_end > end)
linear_probe_end = end;
for (size_t pos = begin; pos < linear_probe_end; ++pos)
{
if (!pred(begin, pos))
return pos;
}
size_t low = linear_probe_end;
size_t high = end - 1;
while (low <= high)
{
size_t mid = low + (high - low) / 2;
if (pred(begin, mid))
low = mid + 1;
else
{
high = mid - 1;
end = mid;
}
}
return end;
}
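`getRangeEnd` probes linearly first (cheap for short runs) and falls back to binary search for long runs of rows that share the same sorting prefix. A self-contained sketch of the same hybrid scan over plain integers; the threshold of 16 mirrors the code above, everything else is illustrative:

```cpp
#include <algorithm>
#include <cassert>
#include <iostream>
#include <vector>

// Find the end (one past the last index) of the run of elements equal to data[begin],
// assuming equal elements are stored contiguously in [begin, end).
size_t runEnd(const std::vector<int> & data, size_t begin, size_t end)
{
    assert(begin < end);
    const size_t linear_probe_threshold = 16;

    // Cheap linear probe first: most runs are short.
    size_t linear_probe_end = std::min(begin + linear_probe_threshold, end);
    for (size_t pos = begin; pos < linear_probe_end; ++pos)
        if (data[pos] != data[begin])
            return pos;

    // Long run: binary search for the first element that differs.
    size_t low = linear_probe_end;
    size_t high = end;
    while (low < high)
    {
        size_t mid = low + (high - low) / 2;
        if (data[mid] == data[begin])
            low = mid + 1;
        else
            high = mid;
    }
    return low;
}

int main()
{
    std::vector<int> v(100, 7);
    v.resize(130, 9); // 100 sevens followed by 30 nines
    std::cout << runEnd(v, 0, v.size()) << '\n';   // 100
    std::cout << runEnd(v, 100, v.size()) << '\n'; // 130
}
```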
void FillingTransform::transformRange(
const Columns & input_fill_columns,
const Columns & input_interpolate_columns,
const Columns & input_sort_prefix_columns,
const Columns & input_other_columns,
const MutableColumns & result_columns,
const MutableColumnRawPtrs & res_fill_columns,
const MutableColumnRawPtrs & res_interpolate_columns,
const MutableColumnRawPtrs & res_sort_prefix_columns,
const MutableColumnRawPtrs & res_other_columns,
std::pair<size_t, size_t> range,
const bool new_sorting_prefix)
{
const size_t range_begin = range.first;
const size_t range_end = range.second;
Block interpolate_block;
if (new_sorting_prefix)
{
logDebug("--- new range ---", range_end);
for (size_t i = 0, size = filling_row.size(); i < size; ++i)
{
const auto current_value = (*input_fill_columns[i])[range_begin];
const auto & fill_from = filling_row.getFillDescription(i).fill_from;
if (!fill_from.isNull() && !equals(current_value, fill_from))
{
filling_row.initFromDefaults(i);
filling_row_inserted = false;
if (less(fill_from, current_value, filling_row.getDirection(i)))
{
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, interpolate_block);
copyRowFromColumns(res_sort_prefix_columns, input_sort_prefix_columns, range_begin);
}
break;
}
filling_row[i] = current_value;
}
}
for (size_t row_ind = range_begin; row_ind < range_end; ++row_ind)
{
logDebug("row", row_ind);
logDebug("filling_row", filling_row);
logDebug("next_row", next_row);
bool should_insert_first = next_row < filling_row;
logDebug("should_insert_first", should_insert_first);
for (size_t i = 0, size = filling_row.size(); i < size; ++i)
{
const auto current_value = (*input_fill_columns[i])[row_ind];
const auto & fill_to = filling_row.getFillDescription(i).fill_to;
if (fill_to.isNull() || less(current_value, fill_to, filling_row.getDirection(i)))
next_row[i] = current_value;
else
next_row[i] = fill_to;
}
logDebug("next_row updated", next_row);
/// The condition is true when the filling row is initialized from value(s) in FILL FROM,
/// and there are row(s) in the current range with value(s) less than those in the filling row.
/// It can happen only once per range.
if (should_insert_first && filling_row < next_row)
{
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, interpolate_block);
copyRowFromColumns(res_sort_prefix_columns, input_sort_prefix_columns, row_ind);
}
bool filling_row_changed = false;
while (true)
{
const auto [apply, changed] = filling_row.next(next_row);
filling_row_changed = changed;
if (!apply)
break;
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, interpolate_block);
copyRowFromColumns(res_sort_prefix_columns, input_sort_prefix_columns, row_ind);
}
/// new valid filling row was generated but not inserted, will use it during suffix generation
if (filling_row_changed)
filling_row_inserted = false;
logDebug("filling_row after", filling_row);
copyRowFromColumns(res_fill_columns, input_fill_columns, row_ind);
copyRowFromColumns(res_interpolate_columns, input_interpolate_columns, row_ind);
copyRowFromColumns(res_sort_prefix_columns, input_sort_prefix_columns, row_ind);
copyRowFromColumns(res_other_columns, input_other_columns, row_ind);
}
/// save the sort prefix of the last row in the range; it's used to generate the suffix
last_range_sort_prefix.clear();
for (const auto & sort_prefix_column : input_sort_prefix_columns)
{
auto column = sort_prefix_column->cloneEmpty();
column->insertFrom(*sort_prefix_column, range_end - 1);
last_range_sort_prefix.push_back(std::move(column));
}
}
void FillingTransform::transform(Chunk & chunk)
{
logDebug("new chunk rows", chunk.getNumRows());
@ -453,9 +674,11 @@ void FillingTransform::transform(Chunk & chunk)
Columns input_fill_columns;
Columns input_interpolate_columns;
Columns input_sort_prefix_columns;
Columns input_other_columns;
MutableColumnRawPtrs res_fill_columns;
MutableColumnRawPtrs res_interpolate_columns;
MutableColumnRawPtrs res_sort_prefix_columns;
MutableColumnRawPtrs res_other_columns;
MutableColumns result_columns;
@ -468,6 +691,14 @@ void FillingTransform::transform(Chunk & chunk)
/// if all chunks are processed, then we may need to generate suffix for the following cases:
/// (1) when all data are processed and WITH FILL .. TO is provided
/// (2) for empty result set when WITH FILL FROM .. TO is provided (see PR #30888)
/// if no data was processed, we need to initialize filling_row
if (last_row.empty())
{
filling_row.initFromDefaults();
filling_row_inserted = false;
}
if (generateSuffixIfNeeded(input.getHeader().getColumns(), result_columns))
{
size_t num_output_rows = result_columns[0]->size();
@ -485,72 +716,95 @@ void FillingTransform::transform(Chunk & chunk)
input_columns,
input_fill_columns,
input_interpolate_columns,
input_sort_prefix_columns,
input_other_columns,
result_columns,
res_fill_columns,
res_interpolate_columns,
res_sort_prefix_columns,
res_other_columns);
if (first)
if (sort_prefix.empty() || !use_with_fill_by_sorting_prefix)
{
for (size_t i = 0, size = filling_row.size(); i < size; ++i)
{
auto current_value = (*input_fill_columns[i])[0];
const auto & fill_from = filling_row.getFillDescription(i).fill_from;
transformRange(
input_fill_columns,
input_interpolate_columns,
input_sort_prefix_columns,
input_other_columns,
result_columns,
res_fill_columns,
res_interpolate_columns,
res_sort_prefix_columns,
res_other_columns,
{0, num_rows},
last_row.empty());
if (!fill_from.isNull() && !equals(current_value, fill_from))
{
filling_row.initFromDefaults(i);
if (less(fill_from, current_value, filling_row.getDirection(i)))
{
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, filling_row, interpolate_block);
}
break;
}
filling_row[i] = current_value;
}
first = false;
saveLastRow(result_columns);
size_t num_output_rows = result_columns[0]->size();
chunk.setColumns(std::move(result_columns), num_output_rows);
return;
}
for (size_t row_ind = 0; row_ind < num_rows; ++row_ind)
/// check if the last row in the previous chunk had the same sorting prefix as the first row in the new one
/// if not, we need to reinitialize filling row
bool new_sort_prefix = last_row.empty();
if (!last_row.empty())
{
logDebug("row", row_ind);
logDebug("filling_row", filling_row);
logDebug("next_row", next_row);
ColumnRawPtrs last_sort_prefix_columns;
last_sort_prefix_columns.reserve(sort_prefix.size());
for (size_t pos : sort_prefix_positions)
last_sort_prefix_columns.push_back(last_row[pos].get());
bool should_insert_first = next_row < filling_row;
logDebug("should_insert_first", should_insert_first);
for (size_t i = 0, size = filling_row.size(); i < size; ++i)
new_sort_prefix = false;
for (size_t i = 0; i < input_sort_prefix_columns.size(); ++i)
{
auto current_value = (*input_fill_columns[i])[row_ind];
const auto & fill_to = filling_row.getFillDescription(i).fill_to;
if (fill_to.isNull() || less(current_value, fill_to, filling_row.getDirection(i)))
next_row[i] = current_value;
else
next_row[i] = fill_to;
const int res = input_sort_prefix_columns[i]->compareAt(0, 0, *last_sort_prefix_columns[i], sort_prefix[i].nulls_direction);
if (res != 0)
{
new_sort_prefix = true;
break;
}
}
logDebug("next_row updated", next_row);
}
/// A case, when at previous step row was initialized from defaults 'fill_from' values
/// and probably we need to insert it to block.
if (should_insert_first && filling_row < next_row)
{
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, filling_row, interpolate_block);
}
for (size_t row_ind = 0; row_ind < num_rows;)
{
/// find next range
auto current_sort_prefix_end_pos = getRangeEnd(
row_ind,
num_rows,
[&](size_t pos_with_current_sort_prefix, size_t row_pos)
{
for (size_t i = 0; i < input_sort_prefix_columns.size(); ++i)
{
const int res = input_sort_prefix_columns[i]->compareAt(
pos_with_current_sort_prefix, row_pos, *input_sort_prefix_columns[i], sort_prefix[i].nulls_direction);
if (res != 0)
return false;
}
return true;
});
while (filling_row.next(next_row))
{
interpolate(result_columns, interpolate_block);
insertFromFillingRow(res_fill_columns, res_interpolate_columns, res_other_columns, filling_row, interpolate_block);
}
/// generate suffix for the previous range
if (!last_range_sort_prefix.empty() && new_sort_prefix)
generateSuffixIfNeeded(result_columns, res_fill_columns, res_interpolate_columns, res_sort_prefix_columns, res_other_columns);
copyRowFromColumns(res_fill_columns, input_fill_columns, row_ind);
copyRowFromColumns(res_interpolate_columns, input_interpolate_columns, row_ind);
copyRowFromColumns(res_other_columns, input_other_columns, row_ind);
transformRange(
input_fill_columns,
input_interpolate_columns,
input_sort_prefix_columns,
input_other_columns,
result_columns,
res_fill_columns,
res_interpolate_columns,
res_sort_prefix_columns,
res_other_columns,
{row_ind, current_sort_prefix_end_pos},
new_sort_prefix);
logDebug("range end", current_sort_prefix_end_pos);
row_ind = current_sort_prefix_end_pos;
new_sort_prefix = true;
}
saveLastRow(result_columns);
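With `use_with_fill_by_sorting_prefix` enabled, the transform above splits every chunk into ranges that share a sorting prefix and fills gaps independently inside each range rather than across the whole chunk. A much-reduced sketch of that control flow on plain integers, with one key column standing in for the sorting prefix and consecutive integers as the fill values (names and the simplistic gap filling are illustrative):

```cpp
#include <iostream>
#include <vector>

struct Row { int key; int value; };

// Fill integer gaps between consecutive values inside one range of rows
// that share the same key (the "sorting prefix").
void fillRange(const std::vector<Row> & rows, size_t begin, size_t end, std::vector<Row> & out)
{
    for (size_t i = begin; i < end; ++i)
    {
        if (i > begin)
            for (int v = rows[i - 1].value + 1; v < rows[i].value; ++v)
                out.push_back({rows[i].key, v}); // generated filling row
        out.push_back(rows[i]);                  // original row
    }
}

int main()
{
    // Two ranges: key=1 with a gap between 1 and 4, key=2 with a gap between 10 and 12.
    std::vector<Row> rows = {{1, 1}, {1, 4}, {2, 10}, {2, 12}};
    std::vector<Row> out;

    size_t i = 0;
    while (i < rows.size())
    {
        size_t j = i;
        while (j < rows.size() && rows[j].key == rows[i].key)
            ++j;                    // find the end of the current sorting-prefix range
        fillRange(rows, i, j, out); // fill gaps only within this range
        i = j;
    }

    for (const auto & r : out)
        std::cout << r.key << ' ' << r.value << '\n';
}
```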

View File

@ -16,7 +16,12 @@ namespace DB
class FillingTransform : public ISimpleTransform
{
public:
FillingTransform(const Block & header_, const SortDescription & sort_description_, InterpolateDescriptionPtr interpolate_description_);
FillingTransform(
const Block & header_,
const SortDescription & sort_description_,
const SortDescription & fill_description_,
InterpolateDescriptionPtr interpolate_description_,
bool use_with_fill_by_sorting_prefix_);
String getName() const override { return "FillingTransform"; }
@ -25,42 +30,72 @@ public:
static Block transformHeader(Block header, const SortDescription & sort_description);
protected:
void transform(Chunk & Chunk) override;
void transform(Chunk & chunk) override;
private:
using MutableColumnRawPtrs = std::vector<IColumn *>;
void transformRange(
const Columns & input_fill_columns,
const Columns & input_interpolate_columns,
const Columns & input_sort_prefix_columns,
const Columns & input_other_columns,
const MutableColumns & result_columns,
const MutableColumnRawPtrs & res_fill_columns,
const MutableColumnRawPtrs & res_interpolate_columns,
const MutableColumnRawPtrs & res_sort_prefix_columns,
const MutableColumnRawPtrs & res_other_columns,
std::pair<size_t, size_t> range,
bool new_sorting_prefix);
void saveLastRow(const MutableColumns & result_columns);
void interpolate(const MutableColumns & result_columns, Block & interpolate_block);
using MutableColumnRawPtrs = std::vector<IColumn *>;
void initColumns(
const Columns & input_columns,
Columns & input_fill_columns,
Columns & input_interpolate_columns,
Columns & input_sort_prefix_columns,
Columns & input_other_columns,
MutableColumns & output_columns,
MutableColumnRawPtrs & output_fill_columns,
MutableColumnRawPtrs & output_interpolate_columns,
MutableColumnRawPtrs & output_sort_prefix_columns,
MutableColumnRawPtrs & output_other_columns);
bool generateSuffixIfNeeded(
const Columns & input_columns,
MutableColumns & result_columns);
const MutableColumns & result_columns,
MutableColumnRawPtrs res_fill_columns,
MutableColumnRawPtrs res_interpolate_columns,
MutableColumnRawPtrs res_sort_prefix_columns,
MutableColumnRawPtrs res_other_columns);
bool generateSuffixIfNeeded(const Columns & input_columns, MutableColumns & result_columns);
const SortDescription sort_description; /// Contains only columns with WITH FILL.
void insertFromFillingRow(
const MutableColumnRawPtrs & filling_columns,
const MutableColumnRawPtrs & interpolate_columns,
const MutableColumnRawPtrs & other_columns,
const Block & interpolate_block);
const SortDescription sort_description;
const SortDescription fill_description; /// Contains only columns with WITH FILL.
SortDescription sort_prefix;
const InterpolateDescriptionPtr interpolate_description; /// Contains INTERPOLATE columns
FillingRow filling_row; /// Current row, which is used to fill gaps.
FillingRow next_row; /// Row to which we need to generate filling rows.
bool filling_row_inserted = false;
using Positions = std::vector<size_t>;
Positions fill_column_positions;
Positions interpolate_column_positions;
Positions other_column_positions;
Positions sort_prefix_positions;
std::vector<std::pair<size_t, NameAndTypePair>> input_positions; /// positions in result columns required for actions
ExpressionActionsPtr interpolate_actions;
Columns last_row;
bool first = true; /// flag to determine if transform is/will be called for the first time
Columns last_range_sort_prefix;
bool all_chunks_processed = false; /// flag to determine if we have already processed all chunks
const bool use_with_fill_by_sorting_prefix;
};
class FillingNoopTransform : public ISimpleTransform

View File

@ -369,6 +369,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
const String & part_name,
const String & zookeeper_name,
const String & replica_path,
const String & host,
int port,
@ -401,13 +402,18 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
/// Validation of the input that may come from a malicious replica.
auto part_info = MergeTreePartInfo::fromPartName(part_name, data.format_version);
String endpoint_id = getEndpointId(
data_settings->enable_the_endpoint_id_with_zookeeper_name_prefix ?
zookeeper_name + ":" + replica_path :
replica_path);
Poco::URI uri;
uri.setScheme(interserver_scheme);
uri.setHost(host);
uri.setPort(port);
uri.setQueryParameters(
{
{"endpoint", getEndpointId(replica_path)},
{"endpoint", endpoint_id},
{"part", part_name},
{"client_protocol_version", toString(REPLICATION_PROTOCOL_VERSION_WITH_METADATA_VERSION)},
{"compress", "false"}
@ -630,7 +636,15 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
temporary_directory_lock = {};
/// Try again but without zero-copy
return fetchSelectedPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
return fetchSelectedPart(
metadata_snapshot,
context,
part_name,
zookeeper_name,
replica_path,
host,
port,
timeouts,
user, password, interserver_scheme, throttler, to_detached, tmp_prefix, nullptr, false, disk);
}
}
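The new `enable_the_endpoint_id_with_zookeeper_name_prefix` setting (added in MergeTreeSettings.h below) only changes how the interserver endpoint id passed in the `endpoint` query parameter is composed. A tiny sketch of the two resulting forms, with made-up names:

```cpp
#include <iostream>
#include <string>

// Illustrative: the endpoint id is either the bare replica path (old behaviour)
// or the ZooKeeper name plus ':' plus the replica path (new, prefix-enabled behaviour).
std::string makeEndpointId(bool with_zookeeper_name_prefix, const std::string & zookeeper_name, const std::string & replica_path)
{
    return with_zookeeper_name_prefix ? zookeeper_name + ":" + replica_path : replica_path;
}

int main()
{
    const std::string zk = "default";
    const std::string path = "/clickhouse/tables/0/hits/replicas/r1";
    std::cout << makeEndpointId(false, zk, path) << '\n'; // /clickhouse/tables/0/hits/replicas/r1
    std::cout << makeEndpointId(true, zk, path) << '\n';  // default:/clickhouse/tables/0/hits/replicas/r1
}
```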

View File

@ -70,6 +70,7 @@ public:
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
const String & part_name,
const String & zookeeper_name,
const String & replica_path,
const String & host,
int port,

View File

@ -7198,9 +7198,17 @@ QueryProcessingStage::Enum MergeTreeData::getQueryProcessingStage(
if (query_context->getClientInfo().collaborate_with_initiator)
return QueryProcessingStage::Enum::FetchColumns;
if (query_context->canUseParallelReplicasOnInitiator()
&& to_stage >= QueryProcessingStage::WithMergeableState)
return QueryProcessingStage::Enum::WithMergeableState;
/// Parallel replicas
if (query_context->canUseParallelReplicasOnInitiator() && to_stage >= QueryProcessingStage::WithMergeableState)
{
/// ReplicatedMergeTree
if (supportsReplication())
return QueryProcessingStage::Enum::WithMergeableState;
/// For non-replicated MergeTree we allow them only if parallel_replicas_for_non_replicated_merge_tree is enabled
if (query_context->getSettingsRef().parallel_replicas_for_non_replicated_merge_tree)
return QueryProcessingStage::Enum::WithMergeableState;
}
if (to_stage >= QueryProcessingStage::Enum::WithMergeableState)
{
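Condensed into a single pure function (the names below are illustrative, not the actual MergeTreeData API), the stage decision this hunk adds on the initiator amounts to:

#include <optional>

enum class Stage { FetchColumns, WithMergeableState };

/// Illustrative only. supports_replication = the table is ReplicatedMergeTree,
/// non_replicated_allowed = the parallel_replicas_for_non_replicated_merge_tree setting.
std::optional<Stage> parallelReplicasStage(
    bool can_use_parallel_replicas_on_initiator,
    bool to_stage_at_least_mergeable,
    bool supports_replication,
    bool non_replicated_allowed)
{
    if (can_use_parallel_replicas_on_initiator && to_stage_at_least_mergeable)
    {
        if (supports_replication || non_replicated_allowed)
            return Stage::WithMergeableState;
    }
    return std::nullopt; /// fall through to the regular stage selection
}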

View File

@ -5,6 +5,7 @@
#include <boost/algorithm/string/trim.hpp>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromFileBase.h>
#include <IO/WriteBufferFromFileBase.h>
#include <Disks/WriteMode.h>
#include <Disks/IDisk.h>

View File

@ -159,6 +159,7 @@ struct Settings;
M(UInt64, min_bytes_to_rebalance_partition_over_jbod, 0, "Minimal amount of bytes to enable part rebalance over JBOD array (0 - disabled).", 0) \
M(Bool, check_sample_column_is_correct, true, "Check columns or columns by hash for sampling are unsigned integer.", 0) \
M(Bool, allow_vertical_merges_from_compact_to_wide_parts, false, "Allows vertical merges from compact to wide parts. This setting must have the same value on all replicas", 0) \
M(Bool, enable_the_endpoint_id_with_zookeeper_name_prefix, false, "Enable the endpoint id with zookeeper name prefix for the replicated merge tree table", 0) \
\
/** Experimental/work in progress feature. Unsafe for production. */ \
M(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. Does not take into account sharding expressions.", 0) \

View File

@ -251,6 +251,7 @@ struct SelectQueryInfo
bool is_projection_query = false;
bool merge_tree_empty_result = false;
bool settings_limit_offset_done = false;
bool is_internal = false;
Block minmax_count_projection_block;
MergeTreeDataSelectAnalysisResultPtr merge_tree_select_result_ptr;

View File

@ -145,7 +145,6 @@ namespace ErrorCodes
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int INCORRECT_NUMBER_OF_COLUMNS;
extern const int INFINITE_LOOP;
extern const int ILLEGAL_FINAL;
extern const int TYPE_MISMATCH;
extern const int TOO_MANY_ROWS;
extern const int UNABLE_TO_SKIP_UNUSED_SHARDS;
@ -1045,10 +1044,6 @@ void StorageDistributed::read(
const size_t /*max_block_size*/,
const size_t /*num_streams*/)
{
const auto * select_query = query_info.query->as<ASTSelectQuery>();
if (select_query->final() && local_context->getSettingsRef().allow_experimental_parallel_reading_from_replicas)
throw Exception(ErrorCodes::ILLEGAL_FINAL, "Final modifier is not allowed together with parallel reading from replicas feature");
Block header;
ASTPtr query_ast;

View File

@ -633,10 +633,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources(
auto & modified_select = modified_query_info.query->as<ASTSelectQuery &>();
QueryPipelineBuilderPtr builder;
bool final = isFinal(modified_query_info);
if (!final && storage->needRewriteQueryWithFinal(real_column_names))
if (!InterpreterSelectQuery::isQueryWithFinal(modified_query_info) && storage->needRewriteQueryWithFinal(real_column_names))
{
/// NOTE: It may not work correctly in some cases, because query was analyzed without final.
/// However, it's needed for MaterializedMySQL and it's unlikely that someone will use it with Merge tables.
@ -1010,21 +1007,13 @@ bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_)
{
/// Disable read-in-order optimization for reverse order with final.
/// Otherwise, it can lead to incorrect final behavior because the implementation may rely on reading in direct order.
if (order_info_->direction != 1 && isFinal(query_info))
if (order_info_->direction != 1 && InterpreterSelectQuery::isQueryWithFinal(query_info))
return false;
order_info = order_info_;
return true;
}
bool ReadFromMerge::isFinal(const SelectQueryInfo & query_info)
{
if (query_info.table_expression_modifiers)
return query_info.table_expression_modifiers->hasFinal();
const auto & select_query = query_info.query->as<ASTSelectQuery &>();
return select_query.final();
}
IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const
{
ColumnSizeByName column_sizes;

View File

@ -145,7 +145,6 @@ public:
/// Returns `false` if requested reading cannot be performed.
bool requestReadingInOrder(InputOrderInfoPtr order_info_);
static bool isFinal(const SelectQueryInfo & query_info);
private:
const size_t required_max_block_size;

View File

@ -240,6 +240,15 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeperAndAssertNotReadonl
return res;
}
String StorageReplicatedMergeTree::getEndpointName() const
{
const MergeTreeSettings & settings = getContext()->getReplicatedMergeTreeSettings();
if (settings.enable_the_endpoint_id_with_zookeeper_name_prefix)
return zookeeper_name + ":" + replica_path;
return replica_path;
}
static ConnectionTimeouts getHTTPTimeouts(ContextPtr context)
{
return ConnectionTimeouts::getHTTPTimeouts(context->getSettingsRef(), {context->getConfigRef().getUInt("keep_alive_timeout", DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT), 0});
@ -1841,6 +1850,7 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che
String source_replica_path = fs::path(zookeeper_path) / "replicas" / replica;
if (!fetchPart(part_name,
metadata_snapshot,
zookeeper_name,
source_replica_path,
/* to_detached= */ false,
entry.quorum,
@ -2341,7 +2351,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
interserver_scheme, address.scheme, address.host);
part_desc->res_part = fetcher.fetchSelectedPart(
metadata_snapshot, getContext(), part_desc->found_new_part_name, source_replica_path,
metadata_snapshot, getContext(), part_desc->found_new_part_name, zookeeper_name, source_replica_path,
address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(),
interserver_scheme, replicated_fetches_throttler, false, TMP_PREFIX + "fetch_");
@ -2458,7 +2468,7 @@ void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entr
interserver_scheme, address.scheme, address.host);
return fetcher.fetchSelectedPart(
metadata_snapshot, getContext(), entry.new_part_name, source_replica_path,
metadata_snapshot, getContext(), entry.new_part_name, zookeeper_name, source_replica_path,
address.host, address.replication_port,
timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme,
replicated_fetches_throttler, true);
@ -4042,6 +4052,7 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo &
bool StorageReplicatedMergeTree::fetchPart(
const String & part_name,
const StorageMetadataPtr & metadata_snapshot,
const String & source_zookeeper_name,
const String & source_replica_path,
bool to_detached,
size_t quorum,
@ -4077,7 +4088,7 @@ bool StorageReplicatedMergeTree::fetchPart(
currently_fetching_parts.erase(part_name);
});
LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path);
LOG_DEBUG(log, "Fetching part {} from {}:{}", part_name, source_zookeeper_name, source_replica_path);
auto settings_ptr = getSettings();
TableLockHolder table_lock_holder;
@ -4134,7 +4145,8 @@ bool StorageReplicatedMergeTree::fetchPart(
}
else
{
LOG_INFO(log, "Not checking checksums of part {} with replica {} because part was removed from ZooKeeper", part_name, source_replica_path);
LOG_INFO(log, "Not checking checksums of part {} with replica {}:{} because part was removed from ZooKeeper",
part_name, source_zookeeper_name, source_replica_path);
}
}
@ -4187,6 +4199,7 @@ bool StorageReplicatedMergeTree::fetchPart(
metadata_snapshot,
getContext(),
part_name,
source_zookeeper_name,
source_replica_path,
address.host,
address.replication_port,
@ -4279,7 +4292,7 @@ bool StorageReplicatedMergeTree::fetchPart(
if (part_to_clone)
LOG_DEBUG(log, "Cloned part {} from {}{}", part_name, part_to_clone->name, to_detached ? " (to 'detached' directory)" : "");
else
LOG_DEBUG(log, "Fetched part {} from {}{}", part_name, source_replica_path, to_detached ? " (to 'detached' directory)" : "");
LOG_DEBUG(log, "Fetched part {} from {}:{}{}", part_name, source_zookeeper_name, source_replica_path, to_detached ? " (to 'detached' directory)" : "");
return true;
}
@ -4318,7 +4331,7 @@ MutableDataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
currently_fetching_parts.erase(part_name);
});
LOG_DEBUG(log, "Fetching already known part {} from {}", part_name, source_replica_path);
LOG_DEBUG(log, "Fetching already known part {} from {}:{}", part_name, zookeeper_name, source_replica_path);
TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
@ -4350,7 +4363,7 @@ MutableDataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
"'{}' != '{}', can't fetch part from {}", interserver_scheme, address.scheme, address.host);
return fetcher.fetchSelectedPart(
metadata_snapshot, getContext(), part_name, source_replica_path,
metadata_snapshot, getContext(), part_name, zookeeper_name, source_replica_path,
address.host, address.replication_port,
timeouts, credentials->getUser(), credentials->getPassword(),
interserver_scheme, replicated_fetches_throttler, false, "", nullptr, true,
@ -4387,7 +4400,7 @@ MutableDataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
ProfileEvents::increment(ProfileEvents::ReplicatedPartFetches);
LOG_DEBUG(log, "Fetched part {} from {}", part_name, source_replica_path);
LOG_DEBUG(log, "Fetched part {} from {}:{}", part_name, zookeeper_name, source_replica_path);
return part->getDataPartStoragePtr();
}
@ -4430,7 +4443,16 @@ void StorageReplicatedMergeTree::startupImpl(bool from_attach_thread)
InterserverIOEndpointPtr data_parts_exchange_ptr = std::make_shared<DataPartsExchange::Service>(*this);
[[maybe_unused]] auto prev_ptr = std::atomic_exchange(&data_parts_exchange_endpoint, data_parts_exchange_ptr);
assert(prev_ptr == nullptr);
getContext()->getInterserverIOHandler().addEndpoint(data_parts_exchange_ptr->getId(replica_path), data_parts_exchange_ptr);
/// The endpoint id:
/// old format: DataPartsExchange:/clickhouse/tables/default/t1/{shard}/{replica}
/// new format: DataPartsExchange:{zookeeper_name}:/clickhouse/tables/default/t1/{shard}/{replica}
/// Notice:
/// They are incompatible and the default is the old format.
/// If you want to use the new format, please ensure that 'enable_the_endpoint_id_with_zookeeper_name_prefix' is set to true on all nodes.
///
getContext()->getInterserverIOHandler().addEndpoint(
data_parts_exchange_ptr->getId(getEndpointName()), data_parts_exchange_ptr);
startBeingLeader();
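For illustration, a self-contained sketch of the two endpoint id formats described in the comment above; the replica path and ZooKeeper name are hypothetical values, and makeEndpointId is not the real getEndpointId helper:

#include <iostream>
#include <string>

std::string makeEndpointId(bool with_zookeeper_name_prefix, const std::string & zookeeper_name, const std::string & replica_path)
{
    return "DataPartsExchange:" + (with_zookeeper_name_prefix ? zookeeper_name + ":" + replica_path : replica_path);
}

int main()
{
    const std::string replica_path = "/clickhouse/tables/default/t1/01/replica1"; /// hypothetical
    std::cout << makeEndpointId(false, "default", replica_path) << '\n'; /// old format (setting disabled)
    std::cout << makeEndpointId(true, "default", replica_path) << '\n';  /// new format (setting enabled)
}

Because a fetch looks up the interserver endpoint by this id, replicas with different values of enable_the_endpoint_id_with_zookeeper_name_prefix would register and request mismatched ids, which is why the comment asks for the setting to be enabled on all nodes together.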
@ -4555,7 +4577,7 @@ void StorageReplicatedMergeTree::shutdown()
auto data_parts_exchange_ptr = std::atomic_exchange(&data_parts_exchange_endpoint, InterserverIOEndpointPtr{});
if (data_parts_exchange_ptr)
{
getContext()->getInterserverIOHandler().removeEndpointIfExists(data_parts_exchange_ptr->getId(replica_path));
getContext()->getInterserverIOHandler().removeEndpointIfExists(data_parts_exchange_ptr->getId(getEndpointName()));
/// Ask all parts exchange handlers to finish asap. New ones will fail to start
data_parts_exchange_ptr->blocker.cancelForever();
/// Wait for all of them
@ -6237,14 +6259,14 @@ void StorageReplicatedMergeTree::fetchPartition(
info.table_id = getStorageID();
info.table_id.uuid = UUIDHelpers::Nil;
auto expand_from = query_context->getMacros()->expand(from_, info);
String auxiliary_zookeeper_name = zkutil::extractZooKeeperName(expand_from);
String from_zookeeper_name = zkutil::extractZooKeeperName(expand_from);
String from = zkutil::extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true);
if (from.empty())
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "ZooKeeper path should not be empty");
zkutil::ZooKeeperPtr zookeeper;
if (auxiliary_zookeeper_name != default_zookeeper_name)
zookeeper = getContext()->getAuxiliaryZooKeeper(auxiliary_zookeeper_name);
if (from_zookeeper_name != default_zookeeper_name)
zookeeper = getContext()->getAuxiliaryZooKeeper(from_zookeeper_name);
else
zookeeper = getZooKeeper();
@ -6263,12 +6285,12 @@ void StorageReplicatedMergeTree::fetchPartition(
*/
if (checkIfDetachedPartExists(part_name))
throw Exception(ErrorCodes::DUPLICATE_DATA_PART, "Detached part {} already exists.", part_name);
LOG_INFO(log, "Will fetch part {} from shard {} (zookeeper '{}')", part_name, from_, auxiliary_zookeeper_name);
LOG_INFO(log, "Will fetch part {} from shard {}", part_name, from_);
try
{
/// part name, metadata, part_path, true, 0, zookeeper
if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper, /* try_fetch_shared = */ false))
if (!fetchPart(part_name, metadata_snapshot, from_zookeeper_name, part_path, true, 0, zookeeper, /* try_fetch_shared = */ false))
throw Exception(ErrorCodes::UNFINISHED, "Failed to fetch part {} from {}", part_name, from_);
}
catch (const DB::Exception & e)
@ -6283,7 +6305,7 @@ void StorageReplicatedMergeTree::fetchPartition(
}
String partition_id = getPartitionIDFromQuery(partition, query_context);
LOG_INFO(log, "Will fetch partition {} from shard {} (zookeeper '{}')", partition_id, from_, auxiliary_zookeeper_name);
LOG_INFO(log, "Will fetch partition {} from shard {}", partition_id, from_);
/** Let's check that there is no such partition in the `detached` directory (where we will write the downloaded parts).
* Unreliable (there is a race condition) - such a partition may appear a little later.
@ -6307,7 +6329,7 @@ void StorageReplicatedMergeTree::fetchPartition(
active_replicas.push_back(replica);
if (active_replicas.empty())
throw Exception(ErrorCodes::NO_ACTIVE_REPLICAS, "No active replicas for shard {}", from);
throw Exception(ErrorCodes::NO_ACTIVE_REPLICAS, "No active replicas for shard {}", from_);
/** You must select the best (most relevant) replica.
* This is a replica with the maximum `log_pointer`, then with the minimum `queue` size.
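A minimal sketch of that ordering, with illustrative names (ReplicaCandidate and chooseBestReplica are not the actual implementation): prefer the largest log_pointer, break ties by the smallest queue size.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct ReplicaCandidate
{
    std::string path;
    uint64_t log_pointer = 0;
    size_t queue_size = 0;
};

/// Illustrative: pick the replica with the maximum log_pointer; ties broken by the minimum queue size.
const ReplicaCandidate * chooseBestReplica(const std::vector<ReplicaCandidate> & candidates)
{
    if (candidates.empty())
        return nullptr;
    return &*std::min_element(candidates.begin(), candidates.end(),
        [](const ReplicaCandidate & a, const ReplicaCandidate & b)
        {
            if (a.log_pointer != b.log_pointer)
                return a.log_pointer > b.log_pointer;
            return a.queue_size < b.queue_size;
        });
}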
@ -6361,7 +6383,8 @@ void StorageReplicatedMergeTree::fetchPartition(
LOG_INFO(log, "Some of parts ({}) are missing. Will try to fetch covering parts.", missing_parts.size());
if (try_no >= query_context->getSettings().max_fetch_partition_retries_count)
throw Exception(ErrorCodes::TOO_MANY_RETRIES_TO_FETCH_PARTS, "Too many retries to fetch parts from {}", best_replica_path);
throw Exception(ErrorCodes::TOO_MANY_RETRIES_TO_FETCH_PARTS,
"Too many retries to fetch parts from {}:{}", from_zookeeper_name, best_replica_path);
Strings parts = zookeeper->getChildren(fs::path(best_replica_path) / "parts");
ActiveDataPartSet active_parts_set(format_version, parts);
@ -6382,7 +6405,8 @@ void StorageReplicatedMergeTree::fetchPartition(
parts_to_fetch = std::move(parts_to_fetch_partition);
if (parts_to_fetch.empty())
throw Exception(ErrorCodes::PARTITION_DOESNT_EXIST, "Partition {} on {} doesn't exist", partition_id, best_replica_path);
throw Exception(ErrorCodes::PARTITION_DOESNT_EXIST,
"Partition {} on {}:{} doesn't exist", partition_id, from_zookeeper_name, best_replica_path);
}
else
{
@ -6392,7 +6416,7 @@ void StorageReplicatedMergeTree::fetchPartition(
if (!containing_part.empty())
parts_to_fetch.push_back(containing_part);
else
LOG_WARNING(log, "Part {} on replica {} has been vanished.", missing_part, best_replica_path);
LOG_WARNING(log, "Part {} on replica {}:{} has been vanished.", missing_part, from_zookeeper_name, best_replica_path);
}
}
@ -6405,7 +6429,7 @@ void StorageReplicatedMergeTree::fetchPartition(
try
{
fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper, /* try_fetch_shared = */ false);
fetched = fetchPart(part, metadata_snapshot, from_zookeeper_name, best_replica_path, true, 0, zookeeper, /* try_fetch_shared = */ false);
}
catch (const DB::Exception & e)
{

View File

@ -382,6 +382,7 @@ private:
zkutil::ZooKeeperPtr getZooKeeperIfTableShutDown() const;
zkutil::ZooKeeperPtr getZooKeeperAndAssertNotReadonly() const;
void setZooKeeper();
String getEndpointName() const;
/// If true, the table is offline and can not be written to it.
/// This flag is managed by RestartingThread.
@ -699,6 +700,7 @@ private:
bool fetchPart(
const String & part_name,
const StorageMetadataPtr & metadata_snapshot,
const String & source_zookeeper_name,
const String & source_replica_path,
bool to_detached,
size_t quorum,

View File

@ -31,7 +31,7 @@
#include <Storages/NamedCollectionsHelpers.h>
#include <Storages/ReadFromStorageProgress.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/AsynchronousBoundedReadBuffer.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/ObjectStorages/StoredObject.h>
@ -676,8 +676,8 @@ std::unique_ptr<ReadBuffer> StorageS3Source::createAsyncS3ReadBuffer(
modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size;
auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
auto async_reader = std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(
pool_reader, modified_settings, std::move(s3_impl),
auto async_reader = std::make_unique<AsynchronousBoundedReadBuffer>(
std::move(s3_impl), pool_reader, modified_settings,
context->getAsyncReadCounters(), context->getFilesystemReadPrefetchesLog());
async_reader->setReadUntilEnd();

View File

@ -10,6 +10,7 @@
#include <Common/formatReadable.h>
#include <Common/StringUtils/StringUtils.h>
#include <Interpreters/Context.h>
#include <IO/ReadBufferFromFileBase.h>
#include <Common/logger_useful.h>
#include <Interpreters/Set.h>
#include <Processors/Sinks/SinkToStorage.h>

Some files were not shown because too many files have changed in this diff.