Merge branch 'ClickHouse:master' into group_sorted_array_function

Yarik Briukhovetskyi 2023-08-21 13:06:37 +03:00 committed by GitHub
commit c6af1fd6c8
92 changed files with 2135 additions and 474 deletions

contrib/base64 vendored

@ -1 +1 @@
Subproject commit 9499e0c4945589973b9ea1bc927377cfbc84aa46
Subproject commit 8628e258090f9eb76d90ac3c91e1ab4690e9aa11

@ -1 +1 @@
Subproject commit d857c707fccd50423bea1c4710dc469cf89607a9
Subproject commit 4ef26de16c229429141e424375142c9b03234b66

View File

@ -1,18 +1,16 @@
if (APPLE OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined")
if (APPLE OR SANITIZE STREQUAL "undefined")
set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF)
else()
set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON)
endif()
option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ${ENABLE_EMBEDDED_COMPILER_DEFAULT})
option (ENABLE_EMBEDDED_COMPILER "Enable support for JIT compilation during query execution" ${ENABLE_EMBEDDED_COMPILER_DEFAULT})
if (NOT ENABLE_EMBEDDED_COMPILER)
message(STATUS "Not using LLVM")
return()
endif()
# TODO: Enable compilation on AArch64
set (LLVM_VERSION "15.0.0bundled")
set (LLVM_INCLUDE_DIRS
"${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm/include"
@ -58,18 +56,30 @@ set (REQUIRED_LLVM_LIBRARIES
LLVMDemangle
)
# if (ARCH_AMD64)
if (ARCH_AMD64)
set (LLVM_TARGETS_TO_BUILD "X86" CACHE INTERNAL "")
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMX86Info LLVMX86Desc LLVMX86CodeGen)
# elseif (ARCH_AARCH64)
# list(APPEND REQUIRED_LLVM_LIBRARIES LLVMAArch64Info LLVMAArch64Desc LLVMAArch64CodeGen)
# endif ()
elseif (ARCH_AARCH64)
set (LLVM_TARGETS_TO_BUILD "AArch64" CACHE INTERNAL "")
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMAArch64Info LLVMAArch64Desc LLVMAArch64CodeGen)
elseif (ARCH_PPC64LE)
set (LLVM_TARGETS_TO_BUILD "PowerPC" CACHE INTERNAL "")
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMPowerPCInfo LLVMPowerPCDesc LLVMPowerPCCodeGen)
elseif (ARCH_S390X)
set (LLVM_TARGETS_TO_BUILD "SystemZ" CACHE INTERNAL "")
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMSystemZInfo LLVMSystemZDesc LLVMSystemZCodeGen)
elseif (ARCH_RISCV64)
set (LLVM_TARGETS_TO_BUILD "RISCV" CACHE INTERNAL "")
list(APPEND REQUIRED_LLVM_LIBRARIES LLVMRISCVInfo LLVMRISCVDesc LLVMRISCVCodeGen)
endif ()
message (STATUS "LLVM TARGETS TO BUILD ${LLVM_TARGETS_TO_BUILD}")
set (CMAKE_INSTALL_RPATH "ON") # Do not adjust RPATH in llvm, since then it will not be able to find libcxx/libcxxabi/libunwind
set (LLVM_COMPILER_CHECKED 1 CACHE INTERNAL "") # Skip internal compiler selection
set (LLVM_ENABLE_EH 1 CACHE INTERNAL "") # With exception handling
set (LLVM_ENABLE_RTTI 1 CACHE INTERNAL "")
set (LLVM_ENABLE_PIC 0 CACHE INTERNAL "")
set (LLVM_TARGETS_TO_BUILD "X86" CACHE STRING "") # for x86 + ARM: "X86;AArch64"
# Omit unnecessary stuff (just the options which are ON by default)
set(LLVM_ENABLE_BACKTRACES 0 CACHE INTERNAL "")
@ -99,15 +109,12 @@ set(LLVM_ENABLE_BINDINGS 0 CACHE INTERNAL "")
set (LLVM_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm-project/llvm")
set (LLVM_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/llvm-project/llvm")
# Since we always use toolchain files to generate hermetic builds, cmake will
# think it's a cross compilation, and LLVM will try to configure NATIVE LLVM
# targets with all tests enabled, which will slow down cmake configuration and
# compilation (You'll see Building native llvm-tblgen...). Let's disable the
# cross compiling indicator for now.
#
# TODO We should let cmake know whether it's indeed a cross compilation in the
# first place.
set (CMAKE_CROSSCOMPILING 0)
message (STATUS "LLVM CMAKE CROSS COMPILING ${CMAKE_CROSSCOMPILING}")
if (CMAKE_CROSSCOMPILING)
set (LLVM_HOST_TRIPLE "${CMAKE_C_COMPILER_TARGET}" CACHE INTERNAL "")
message (STATUS "CROSS COMPILING SET LLVM HOST TRIPLE ${LLVM_HOST_TRIPLE}")
endif()
add_subdirectory ("${LLVM_SOURCE_DIR}" "${LLVM_BINARY_DIR}")
set_directory_properties (PROPERTIES

View File

@ -19,8 +19,9 @@
<max_threads>12</max_threads>
<!-- disable JIT for perf tests -->
<compile_expressions>0</compile_expressions>
<compile_aggregate_expressions>0</compile_aggregate_expressions>
<compile_expressions>1</compile_expressions>
<compile_aggregate_expressions>1</compile_aggregate_expressions>
<compile_sort_description>1</compile_sort_description>
<!-- Don't fail some prewarm queries too early -->
<timeout_before_checking_execution_speed>60</timeout_before_checking_execution_speed>
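
The override above switches the JIT settings back on for the perf tests. As a hedged per-session sketch (the setting names are the ones shown in the override; the values and the sample query are illustrative):

``` sql
-- Enable JIT compilation of expressions, aggregate functions and sort
-- descriptions for the current session only.
SET compile_expressions = 1;
SET compile_aggregate_expressions = 1;
SET compile_sort_description = 1;

-- Subsequent queries in this session become eligible for JIT compilation, e.g.:
SELECT number * 2 + 1 AS x FROM system.numbers LIMIT 10;
```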

View File

@ -211,6 +211,11 @@ mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/cli
sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
sudo cat /etc/clickhouse-server/config.d/logger_trace.xml \
| sed "s|<level>trace</level>|<level>test</level>|" \
> /etc/clickhouse-server/config.d/logger_trace.xml.tmp
mv /etc/clickhouse-server/config.d/logger_trace.xml.tmp /etc/clickhouse-server/config.d/logger_trace.xml
start
stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" --global-time-limit 1200 \

View File

@ -0,0 +1,45 @@
---
sidebar_position: 1
sidebar_label: 2023
---
# 2023 Changelog
### ClickHouse release v23.3.9.55-lts (b9c5c8622d3) FIXME as compared to v23.3.8.21-lts (1675f2264f3)
#### Performance Improvement
* Backported in [#52213](https://github.com/ClickHouse/ClickHouse/issues/52213): Do not store blocks in `ANY` hash join if nothing is inserted. [#48633](https://github.com/ClickHouse/ClickHouse/pull/48633) ([vdimir](https://github.com/vdimir)).
* Backported in [#52826](https://github.com/ClickHouse/ClickHouse/issues/52826): Fix incorrect projection analysis which invalidates primary keys. This issue only exists when `query_plan_optimize_primary_key = 1, query_plan_optimize_projection = 1`. This fixes [#48823](https://github.com/ClickHouse/ClickHouse/issues/48823). This fixes [#51173](https://github.com/ClickHouse/ClickHouse/issues/51173). [#52308](https://github.com/ClickHouse/ClickHouse/pull/52308) ([Amos Bird](https://github.com/amosbird)).
#### Build/Testing/Packaging Improvement
* Backported in [#53019](https://github.com/ClickHouse/ClickHouse/issues/53019): Packing inline cache into docker images sometimes causes strange special effects. Since we don't use it at all, it's good to go. [#53008](https://github.com/ClickHouse/ClickHouse/pull/53008) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Backported in [#53288](https://github.com/ClickHouse/ClickHouse/issues/53288): The compiler's profile data (`-ftime-trace`) is uploaded to ClickHouse Cloud; this is the second attempt after [#53100](https://github.com/ClickHouse/ClickHouse/issues/53100). [#53213](https://github.com/ClickHouse/ClickHouse/pull/53213) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Backported in [#53461](https://github.com/ClickHouse/ClickHouse/issues/53461): Preserve environment parameters in `clickhouse start` command. Fixes [#51962](https://github.com/ClickHouse/ClickHouse/issues/51962). [#53418](https://github.com/ClickHouse/ClickHouse/pull/53418) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in an official stable release)
* Fix optimization to move functions before sorting. [#51481](https://github.com/ClickHouse/ClickHouse/pull/51481) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix Block structure mismatch in Pipe::unitePipes for FINAL [#51492](https://github.com/ClickHouse/ClickHouse/pull/51492) ([Nikita Taranov](https://github.com/nickitat)).
* Fix binary arithmetic for Nullable(IPv4) [#51642](https://github.com/ClickHouse/ClickHouse/pull/51642) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Support IPv4 and IPv6 as dictionary attributes [#51756](https://github.com/ClickHouse/ClickHouse/pull/51756) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix ORDER BY tuple of WINDOW functions [#52145](https://github.com/ClickHouse/ClickHouse/pull/52145) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Disable expression templates for time intervals [#52335](https://github.com/ClickHouse/ClickHouse/pull/52335) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Fix `countSubstrings()` hang with empty needle and a column haystack [#52409](https://github.com/ClickHouse/ClickHouse/pull/52409) ([Sergei Trifonov](https://github.com/serxa)).
* Fixed inserting into Buffer engine [#52440](https://github.com/ClickHouse/ClickHouse/pull/52440) ([Vasily Nemkov](https://github.com/Enmk)).
* The implementation of AnyHash was non-conformant. [#52448](https://github.com/ClickHouse/ClickHouse/pull/52448) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Init and destroy ares channel on demand. [#52634](https://github.com/ClickHouse/ClickHouse/pull/52634) ([Arthur Passos](https://github.com/arthurpassos)).
* Fix crash in function `tuple` with one sparse column argument [#52659](https://github.com/ClickHouse/ClickHouse/pull/52659) ([Anton Popov](https://github.com/CurtizJ)).
* clickhouse-keeper: fix implementation of server with poll() [#52833](https://github.com/ClickHouse/ClickHouse/pull/52833) ([Andy Fiddaman](https://github.com/citrus-it)).
* Fix password leak in show create mysql table [#52962](https://github.com/ClickHouse/ClickHouse/pull/52962) ([Duc Canh Le](https://github.com/canhld94)).
* Fix incorrect normal projection AST format [#53347](https://github.com/ClickHouse/ClickHouse/pull/53347) ([Amos Bird](https://github.com/amosbird)).
* Fix loading lazy database during system.table select query [#53372](https://github.com/ClickHouse/ClickHouse/pull/53372) ([SmitaRKulkarni](https://github.com/SmitaRKulkarni)).
* Fix wrong columns order for queries with parallel FINAL. [#53489](https://github.com/ClickHouse/ClickHouse/pull/53489) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Fix: interpolate expression takes source column instead of same name aliased from select expression. [#53572](https://github.com/ClickHouse/ClickHouse/pull/53572) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Fix crash in comparison functions due to incorrect query analysis [#52172](https://github.com/ClickHouse/ClickHouse/pull/52172) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix deadlocks in StorageTableFunctionProxy [#52626](https://github.com/ClickHouse/ClickHouse/pull/52626) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Disable test_reverse_dns_query/test.py [#53195](https://github.com/ClickHouse/ClickHouse/pull/53195) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Disable test_host_regexp_multiple_ptr_records/test.py [#53211](https://github.com/ClickHouse/ClickHouse/pull/53211) ([Alexander Tokmakov](https://github.com/tavplubix)).

View File

@ -173,6 +173,7 @@ Similar to GraphiteMergeTree, the Kafka engine supports extended configuration u
<!-- Global configuration options for all tables of Kafka engine type -->
<debug>cgrp</debug>
<auto_offset_reset>smallest</auto_offset_reset>
<statistics_interval_ms>600</statistics_interval_ms>
<!-- Configuration specific to topics "logs" and "stats" -->
@ -260,3 +261,4 @@ The number of rows in one Kafka message depends on whether the format is row-bas
- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns)
- [background_message_broker_schedule_pool_size](../../../operations/server-configuration-parameters/settings.md#background_message_broker_schedule_pool_size)
- [system.kafka_consumers](../../../operations/system-tables/kafka_consumers.md)

View File

@ -2136,6 +2136,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](/docs/en/engines/t
- [input_format_parquet_case_insensitive_column_matching](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_case_insensitive_column_matching) - ignore case when matching Parquet columns with ClickHouse columns. Default value - `false`.
- [input_format_parquet_allow_missing_columns](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_allow_missing_columns) - allow missing columns while reading Parquet data. Default value - `false`.
- [input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference) - allow skipping columns with unsupported types while schema inference for Parquet format. Default value - `false`.
- [input_format_parquet_local_file_min_bytes_for_seek](/docs/en/operations/settings/settings-formats.md/#input_format_parquet_local_file_min_bytes_for_seek) - minimum number of bytes required for a local file read to perform a seek instead of reading and discarding data in the Parquet input format. Default value - `8192`.
- [output_format_parquet_fixed_string_as_fixed_byte_array](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_fixed_string_as_fixed_byte_array) - use Parquet FIXED_LENGTH_BYTE_ARRAY type instead of Binary/String for FixedString columns. Default value - `true`.
- [output_format_parquet_version](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_version) - The version of Parquet format used in output format. Default value - `2.latest`.
- [output_format_parquet_compression_method](/docs/en/operations/settings/settings-formats.md/#output_format_parquet_compression_method) - compression method used in output Parquet format. Default value - `snappy`.

View File

@ -21,6 +21,11 @@ In most cases it is recommended to use an appropriate tool or library instead of
- [ODBC driver](../interfaces/odbc.md)
- [C++ client library](../interfaces/cpp.md)
ClickHouse server provides embedded visual interfaces for power users:
- Play UI: open `/play` in the browser;
- Advanced Dashboard: open `/dashboard` in the browser;
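
For a default local installation (host and port are illustrative; 8123 is the default HTTP port), these are reachable at:

``` text
http://localhost:8123/play
http://localhost:8123/dashboard
```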
There are also a wide range of third-party libraries for working with ClickHouse:
- [Client libraries](../interfaces/third-party/client-libraries.md)

View File

@ -1223,6 +1223,12 @@ Allow skipping columns with unsupported types while schema inference for format
Disabled by default.
### input_format_parquet_local_file_min_bytes_for_seek {#input_format_parquet_local_file_min_bytes_for_seek}
Minimum number of bytes required for a local file read to perform a seek instead of reading and discarding data in the Parquet input format.
Default value - `8192`.
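
A hedged usage sketch for this setting (the file name is hypothetical and the 1 MiB threshold is chosen only for illustration):

``` sql
-- Raise the seek threshold for reading a local Parquet file.
SET input_format_parquet_local_file_min_bytes_for_seek = 1048576;
SELECT count() FROM file('data.parquet', 'Parquet');
```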
### output_format_parquet_string_as_string {#output_format_parquet_string_as_string}
Use Parquet String type instead of Binary for String columns.

View File

@ -0,0 +1,58 @@
---
slug: /en/operations/system-tables/kafka_consumers
---
# kafka_consumers
Contains information about Kafka consumers.
Applicable for the [Kafka table engine](../../../engines/table-engines/integrations/kafka) (native ClickHouse integration).
Columns:
- `database` (String) - database of the table with Kafka Engine.
- `table` (String) - name of the table with Kafka Engine.
- `consumer_id` (String) - Kafka consumer identifier. Note that a table can have many consumers, as specified by the `kafka_num_consumers` parameter.
- `assignments.topic` (Array(String)) - Kafka topic.
- `assignments.partition_id` (Array(Int32)) - Kafka partition id. Note that only one consumer can be assigned to a partition.
- `assignments.current_offset` (Array(Int64)) - current offset.
- `exceptions.time` (Array(DateTime)) - timestamps when the 10 most recent exceptions were generated.
- `exceptions.text` (Array(String)) - text of the 10 most recent exceptions.
- `last_poll_time` (DateTime) - timestamp of the most recent poll.
- `num_messages_read` (UInt64) - number of messages read by the consumer.
- `last_commit_time` (DateTime) - timestamp of the most recent commit.
- `num_commits` (UInt64) - total number of commits for the consumer.
- `last_rebalance_time` (DateTime) - timestamp of the most recent Kafka rebalance.
- `num_rebalance_revocations` (UInt64) - number of times the consumer's partitions were revoked.
- `num_rebalance_assignments` (UInt64) - number of times partitions were assigned to the consumer.
- `is_currently_used` (UInt8) - whether the consumer is currently in use.
- `rdkafka_stat` (String) - internal librdkafka statistics. See https://github.com/ClickHouse/librdkafka/blob/master/STATISTICS.md. Set `statistics_interval_ms` to 0 to disable statistics collection; the default is 3000 (once every three seconds).
Example:
``` sql
SELECT *
FROM system.kafka_consumers
FORMAT Vertical
```
``` text
Row 1:
──────
database: test
table: kafka
consumer_id: ClickHouse-instance-test-kafka-1caddc7f-f917-4bb1-ac55-e28bd103a4a0
assignments.topic: ['system_kafka_cons']
assignments.partition_id: [0]
assignments.current_offset: [18446744073709550615]
exceptions.time: []
exceptions.text: []
last_poll_time: 2006-11-09 18:47:47
num_messages_read: 4
last_commit_time: 2006-11-10 04:39:40
num_commits: 1
last_rebalance_time: 1970-01-01 00:00:00
num_rebalance_revocations: 0
num_rebalance_assignments: 1
is_currently_used: 1
rdkafka_stat: {...}
```

View File

@ -1572,6 +1572,11 @@ try
global_context->setFormatSchemaPath(format_schema_path);
fs::create_directories(format_schema_path);
/// Set path for filesystem caches
fs::path filesystem_caches_path(config().getString("filesystem_caches_path", ""));
if (!filesystem_caches_path.empty())
global_context->setFilesystemCachesPath(filesystem_caches_path);
/// Check sanity of MergeTreeSettings on server startup
{
size_t background_pool_tasks = global_context->getMergeMutateExecutor()->getMaxTasksCount();
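
The `filesystem_caches_path` key read a few lines above comes from the server configuration. A minimal sketch of such an override (the file name and directory are illustrative, not taken from this diff):

``` xml
<!-- e.g. config.d/filesystem_caches_path.xml (hypothetical file name) -->
<clickhouse>
    <!-- Base directory for filesystem caches; relative cache paths are placed under it. -->
    <filesystem_caches_path>/var/lib/clickhouse/filesystem_caches/</filesystem_caches_path>
</clickhouse>
```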

View File

@ -11,6 +11,7 @@
--background: linear-gradient(to bottom, #00CCFF, #00D0D0);
--chart-background: white;
--shadow-color: rgba(0, 0, 0, 0.25);
--moving-shadow-color: rgba(0, 0, 0, 0.5);
--input-shadow-color: rgba(0, 255, 0, 1);
--error-color: red;
--auth-error-color: white;
@ -34,6 +35,7 @@
--background: #151C2C;
--chart-background: #1b2834;
--shadow-color: rgba(0, 0, 0, 0);
--moving-shadow-color: rgba(255, 255, 255, 0.25);
--input-shadow-color: rgba(255, 128, 0, 0.25);
--error-color: #F66;
--legend-background: rgba(255, 255, 255, 0.25);
@ -91,6 +93,21 @@
position: relative;
}
.chart-maximized {
flex: 1 100%;
height: 75vh
}
.chart-moving {
z-index: 11;
box-shadow: 0 0 2rem var(--moving-shadow-color);
}
.chart-displaced {
opacity: 75%;
filter: blur(1px);
}
.chart div { position: absolute; }
.inputs {
@ -230,8 +247,8 @@
filter: contrast(125%);
}
#add, #reload {
padding: .25rem 0.5rem;
#add, #reload, #edit {
padding: 0.25rem 0.5rem;
text-align: center;
font-weight: bold;
user-select: none;
@ -244,13 +261,10 @@
margin-right: 1rem !important;
margin-left: 0rem;
margin-bottom: 1rem;
height: 3ex;
}
/* .unconnected #reload {
margin-left: 3px;
} */
#add:hover, #reload:hover {
#add:hover, #reload:hover, #edit:hover {
background: var(--button-background-color);
}
@ -306,6 +320,7 @@
}
.chart-buttons a {
margin-right: 0.25rem;
user-select: none;
}
.chart-buttons a:hover {
color: var(--chart-button-hover-color);
@ -333,18 +348,21 @@
padding: 2rem;
}
.query-editor textarea {
grid-row: 1;
grid-column: 1 / span 2;
z-index: 11;
textarea {
padding: 0.5rem;
outline: none;
border: none;
font-size: 12pt;
border-bottom: 1px solid var(--edit-title-border);
background: var(--chart-background);
color: var(--text-color);
resize: none;
}
.query-editor textarea {
grid-row: 1;
grid-column: 1 / span 2;
z-index: 11;
border-bottom: 1px solid var(--edit-title-border);
margin: 0;
}
@ -367,10 +385,41 @@
filter: contrast(125%);
}
.edit-cancel {
cursor: pointer;
background: var(--new-chart-background-color);
}
.edit-cancel:hover {
filter: contrast(125%);
}
.nowrap {
white-space: pre;
}
#mass-editor {
display: none;
grid-template-columns: auto fit-content(10%) fit-content(10%);
grid-template-rows: auto fit-content(10%);
row-gap: 1rem;
column-gap: 1rem;
}
#mass-editor-textarea {
width: 100%;
height: 100%;
grid-row: 1;
grid-column: 1 / span 3;
}
#mass-editor input {
padding: 0.5rem;
}
#mass-editor-message {
color: var(--auth-error-color);
}
/* Source: https://cdn.jsdelivr.net/npm/uplot@1.6.21/dist/uPlot.min.css
* It is copy-pasted to lower the number of requests.
*/
@ -389,6 +438,7 @@
</div>
<div id="button-options">
<span class="nowrap themes"><span id="toggle-dark">🌚</span><span id="toggle-light">🌞</span></span>
<input id="edit" type="button" value="✎" style="display: none;">
<input id="add" type="button" value="Add chart" style="display: none;">
<input id="reload" type="button" value="Reload">
<div id="chart-params"></div>
@ -397,6 +447,12 @@
<div id="auth-error"></div>
</div>
<div id="charts"></div>
<div id="mass-editor">
<textarea id="mass-editor-textarea" spellcheck="false" data-gramm="false"></textarea>
<span id="mass-editor-message">&nbsp;</span>
<input type="submit" id="mass-editor-cancel" class="edit-cancel" value="Cancel">
<input type="submit" id="mass-editor-confirm" class="edit-confirm" value="Apply">
</div>
<script>
/** Implementation note: it might be more natural to use some reactive framework.
@ -405,9 +461,7 @@
*
* TODO:
* - zoom on the graphs should work on touch devices;
* - add mass edit capability (edit the JSON config as a whole);
* - compress the state for URL's #hash;
* - save/load JSON configuration;
* - footer with "about" or a link to source code;
* - allow to configure a table on a server to save the dashboards;
* - multiple lines on chart;
@ -418,11 +472,13 @@
let host = 'https://play.clickhouse.com/';
let user = 'explorer';
let password = '';
let add_http_cors_header = true;
/// If it is hosted on server, assume that it is the address of ClickHouse.
if (location.protocol != 'file:') {
host = location.origin;
user = 'default';
add_http_cors_header = false;
}
const errorCodeMessageMap = {
@ -702,6 +758,7 @@ function insertChart(i) {
query_editor_textarea.spellcheck = false;
query_editor_textarea.value = q.query;
query_editor_textarea.placeholder = 'Query';
query_editor_textarea.setAttribute('data-gramm', false);
query_editor.appendChild(query_editor_textarea);
let query_editor_title = document.createElement('input');
@ -756,6 +813,92 @@ function insertChart(i) {
let edit_buttons = document.createElement('div');
edit_buttons.className = 'chart-buttons';
let move = document.createElement('a');
let move_text = document.createTextNode('✥');
move.appendChild(move_text);
let is_dragging = false;
move.addEventListener('mousedown', e => {
const idx = getCurrentIndex();
is_dragging = true;
chart.className = 'chart chart-moving';
let offset_x = e.clientX;
let offset_y = e.clientY;
let displace_idx = null;
let displace_chart = null;
function mouseup(e) {
is_dragging = false;
chart.className = 'chart';
chart.style.left = null;
chart.style.top = null;
if (displace_idx !== null) {
const elem = queries[idx];
queries.splice(idx, 1);
queries.splice(displace_idx, 0, elem);
displace_chart.className = 'chart';
drawAll();
}
}
function mousemove(e) {
if (!is_dragging) {
document.body.removeEventListener('mousemove', mousemove);
document.body.removeEventListener('mouseup', mouseup);
return;
}
let x = e.clientX - offset_x;
let y = e.clientY - offset_y;
chart.style.left = `${x}px`;
chart.style.top = `${y}px`;
displace_idx = null;
displace_chart = null;
let current_idx = -1;
for (const elem of charts.querySelectorAll('.chart')) {
++current_idx;
if (current_idx == idx) {
continue;
}
const this_rect = chart.getBoundingClientRect();
const this_center_x = this_rect.left + this_rect.width / 2;
const this_center_y = this_rect.top + this_rect.height / 2;
const elem_rect = elem.getBoundingClientRect();
if (this_center_x >= elem_rect.left && this_center_x <= elem_rect.right
&& this_center_y >= elem_rect.top && this_center_y <= elem_rect.bottom) {
elem.className = 'chart chart-displaced';
displace_idx = current_idx;
displace_chart = elem;
} else {
elem.className = 'chart';
}
}
}
document.body.addEventListener('mouseup', mouseup);
document.body.addEventListener('mousemove', mousemove);
});
let maximize = document.createElement('a');
let maximize_text = document.createTextNode('🗖');
maximize.appendChild(maximize_text);
maximize.addEventListener('click', e => {
const idx = getCurrentIndex();
chart.className = (chart.className == 'chart' ? 'chart chart-maximized' : 'chart');
resize();
});
let edit = document.createElement('a');
let edit_text = document.createTextNode('✎');
edit.appendChild(edit_text);
@ -788,6 +931,8 @@ function insertChart(i) {
saveState();
});
edit_buttons.appendChild(move);
edit_buttons.appendChild(maximize);
edit_buttons.appendChild(edit);
edit_buttons.appendChild(trash);
@ -815,6 +960,66 @@ document.getElementById('reload').addEventListener('click', e => {
reloadAll();
});
let mass_editor_active = false;
function showMassEditor() {
document.getElementById('charts').style.display = 'none';
let editor_div = document.getElementById('mass-editor');
editor_div.style.display = 'grid';
let editor = document.getElementById('mass-editor-textarea');
editor.value = JSON.stringify({params: params, queries: queries}, null, 2);
mass_editor_active = true;
}
function hideMassEditor() {
document.getElementById('mass-editor').style.display = 'none';
document.getElementById('charts').style.display = 'flex';
mass_editor_active = false;
}
function massEditorApplyChanges() {
let editor = document.getElementById('mass-editor-textarea');
({params, queries} = JSON.parse(editor.value));
hideMassEditor();
regenerate();
drawAll();
saveState();
}
document.getElementById('edit').addEventListener('click', e => {
if (mass_editor_active) {
massEditorApplyChanges();
hideMassEditor();
} else {
showMassEditor();
}
});
document.getElementById('mass-editor-confirm').addEventListener('click', e => {
massEditorApplyChanges();
hideMassEditor();
});
document.getElementById('mass-editor-cancel').addEventListener('click', e => {
hideMassEditor();
});
document.getElementById('mass-editor-textarea').addEventListener('input', e => {
let message = document.getElementById('mass-editor-message').firstChild;
message.data = '';
if (e.target.value != '') {
try { JSON.parse(e.target.value) } catch (e) {
message.data = e.toString();
}
}
});
function legendAsTooltipPlugin({ className, style = { background: "var(--legend-background)" } } = {}) {
let legendEl;
@ -865,8 +1070,6 @@ function legendAsTooltipPlugin({ className, style = { background: "var(--legend-
};
}
let add_http_cors_header = false;
async function draw(idx, chart, url_params, query) {
if (plots[idx]) {
plots[idx].destroy();
@ -877,7 +1080,7 @@ async function draw(idx, chart, url_params, query) {
user = document.getElementById('user').value;
password = document.getElementById('password').value;
let url = `${host}?default_format=JSONCompactColumns`
let url = `${host}?default_format=JSONCompactColumns&enable_http_compression=1`
if (add_http_cors_header) {
// For debug purposes, you may set add_http_cors_header from a browser console
@ -906,12 +1109,14 @@ async function draw(idx, chart, url_params, query) {
}
if (error) {
const errorMatch = errorMessages.find(({ regex }) => error.match(regex))
const match = error.match(errorMatch.regex)
const message = errorMatch.messageFunc(match)
const errorMatch = errorMessages.find(({ regex }) => error.match(regex));
if (!errorMatch) {
throw new Error(error);
}
const match = error.match(errorMatch.regex);
const message = errorMatch.messageFunc(match);
if (message) {
const authError = new Error(message)
throw authError
throw new Error(message);
}
}
@ -978,23 +1183,23 @@ async function draw(idx, chart, url_params, query) {
}
function showAuthError(message) {
const charts = document.querySelector('#charts');
const charts = document.getElementById('charts');
charts.style.height = '0px';
charts.style.opacity = '0';
const add = document.querySelector('#add');
add.style.display = 'none';
document.getElementById('add').style.display = 'none';
document.getElementById('edit').style.display = 'none';
const authError = document.querySelector('#auth-error');
const authError = document.getElementById('auth-error');
authError.textContent = message;
authError.style.display = 'flex';
}
function hideAuthError() {
const charts = document.querySelector('#charts');
const charts = document.getElementById('charts');
charts.style.height = 'auto';
charts.style.opacity = '1';
const authError = document.querySelector('#auth-error');
const authError = document.getElementById('auth-error');
authError.textContent = '';
authError.style.display = 'none';
}
@ -1025,11 +1230,11 @@ async function drawAll() {
if (results.includes(true)) {
const element = document.querySelector('.inputs');
element.classList.remove('unconnected');
const add = document.querySelector('#add');
add.style.display = 'block';
document.getElementById('add').style.display = 'inline-block';
document.getElementById('edit').style.display = 'inline-block';
}
else {
const charts = document.querySelector('#charts')
const charts = document.getElementById('charts')
charts.style.height = '0px';
}
})
@ -1048,14 +1253,14 @@ new ResizeObserver(resize).observe(document.body);
function disableReloadButton() {
const reloadButton = document.getElementById('reload')
reloadButton.value = 'Reloading...'
reloadButton.value = 'Reloading'
reloadButton.disabled = true
reloadButton.classList.add('disabled')
}
function disableRunButton() {
const runButton = document.getElementById('run')
runButton.value = 'Reloading...'
runButton.value = 'Reloading'
runButton.disabled = true
runButton.classList.add('disabled')
}

View File

@ -465,7 +465,7 @@
<input class="monospace shadow" id="url" type="text" value="http://localhost:8123/" placeholder="url" /><input class="monospace shadow" id="user" type="text" value="default" placeholder="user" /><input class="monospace shadow" id="password" type="password" placeholder="password" />
</div>
<div id="query_div">
<textarea autofocus spellcheck="false" class="monospace shadow" id="query"></textarea>
<textarea autofocus spellcheck="false" data-gramm="false" class="monospace shadow" id="query"></textarea>
</div>
<div id="run_div">
<button class="shadow" id="run">Run</button>

View File

@ -384,7 +384,7 @@ target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::abseil_swiss_tabl
dbms_target_link_libraries(PUBLIC ch_contrib::roaring)
if (TARGET ch_contrib::rdkafka)
dbms_target_link_libraries(PRIVATE ch_contrib::rdkafka ch_contrib::cppkafka)
dbms_target_link_libraries(PUBLIC ch_contrib::rdkafka ch_contrib::cppkafka)
endif()
if (TARGET ch_contrib::nats_io)

View File

@ -194,6 +194,7 @@
M(FilesystemCacheSizeLimit, "Filesystem cache size limit in bytes") \
M(FilesystemCacheElements, "Filesystem cache elements (file segments)") \
M(FilesystemCacheDownloadQueueElements, "Filesystem cache elements in download queue") \
M(FilesystemCacheDelayedCleanupElements, "Filesystem cache elements in background cleanup queue") \
M(AsyncInsertCacheSize, "Number of async insert hash id in cache") \
M(S3Requests, "S3 requests") \
M(KeeperAliveConnections, "Number of alive connections") \

View File

@ -261,7 +261,7 @@ class IColumn;
\
M(Float, memory_tracker_fault_probability, 0., "For testing of `exception safety` - throw an exception every time you allocate memory with the specified probability.", 0) \
\
M(Bool, enable_http_compression, false, "Compress the result if the client over HTTP said that it understands data compressed by gzip or deflate.", 0) \
M(Bool, enable_http_compression, false, "Compress the result if the client over HTTP said that it understands data compressed by gzip, deflate, zstd, br, lz4, bz2, xz.", 0) \
M(Int64, http_zlib_compression_level, 3, "Compression level - used if the client on HTTP said that it understands data compressed by gzip or deflate.", 0) \
\
M(Bool, http_native_compression_disable_checksumming_on_decompress, false, "If you uncompress the POST data from the client compressed by the native format, do not check the checksum.", 0) \
@ -270,7 +270,7 @@ class IColumn;
\
M(Bool, add_http_cors_header, false, "Write add http CORS header.", 0) \
\
M(UInt64, max_http_get_redirects, 0, "Max number of http GET redirects hops allowed. Make sure additional security measures are in place to prevent a malicious server to redirect your requests to unexpected services.", 0) \
M(UInt64, max_http_get_redirects, 0, "Max number of http GET redirects hops allowed. Ensures additional security measures are in place to prevent a malicious server to redirect your requests to unexpected services.\n\nIt is the case when an external server redirects to another address, but that address appears to be internal to the company's infrastructure, and by sending an HTTP request to an internal server, you could request an internal API from the internal network, bypassing the auth, or even query other services, such as Redis or Memcached. When you don't have an internal infrastructure (including something running on your localhost), or you trust the server, it is safe to allow redirects. Although keep in mind, that if the URL uses HTTP instead of HTTPS, and you will have to trust not only the remote server but also your ISP and every network in the middle.", 0) \
\
M(Bool, use_client_time_zone, false, "Use client timezone for interpreting DateTime string values, instead of adopting server timezone.", 0) \
\
@ -644,7 +644,7 @@ class IColumn;
M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \
M(Bool, database_replicated_allow_only_replicated_engine, false, "Allow to create only Replicated tables in database with engine Replicated", 0) \
M(Bool, database_replicated_allow_replicated_engine_arguments, true, "Allow to create only Replicated tables in database with engine Replicated with explicit arguments", 0) \
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \
M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result, one of: 'none', 'throw', 'null_status_on_timeout', 'never_throw'", 0) \
M(UInt64, distributed_ddl_entry_format_version, 5, "Compatibility version of distributed DDL (ON CLUSTER) queries", 0) \
\
M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializedMySQL. If equal to 0, this setting is disabled", 0) \
@ -879,6 +879,7 @@ class IColumn;
M(Bool, input_format_allow_seeks, true, "Allow seeks while reading in ORC/Parquet/Arrow input formats", 0) \
M(Bool, input_format_orc_allow_missing_columns, false, "Allow missing columns while reading ORC input formats", 0) \
M(Bool, input_format_parquet_allow_missing_columns, false, "Allow missing columns while reading Parquet input formats", 0) \
M(UInt64, input_format_parquet_local_file_min_bytes_for_seek, 8192, "Min bytes required for local read (file) to do seek, instead of read with ignore in Parquet input format", 0) \
M(Bool, input_format_arrow_allow_missing_columns, false, "Allow missing columns while reading Arrow input formats", 0) \
M(Char, input_format_hive_text_fields_delimiter, '\x01', "Delimiter between fields in Hive Text File", 0) \
M(Char, input_format_hive_text_collection_items_delimiter, '\x02', "Delimiter between collection(array or map) items in Hive Text File", 0) \

View File

@ -15,14 +15,14 @@ template <typename T>
void SerializationDecimalBase<T>::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const
{
FieldType x = field.get<DecimalField<T>>();
writeBinary(x, ostr);
writeBinaryLittleEndian(x, ostr);
}
template <typename T>
void SerializationDecimalBase<T>::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const
{
const FieldType & x = assert_cast<const ColumnType &>(column).getElement(row_num);
writeBinary(x, ostr);
writeBinaryLittleEndian(x, ostr);
}
template <typename T>
@ -34,15 +34,26 @@ void SerializationDecimalBase<T>::serializeBinaryBulk(const IColumn & column, Wr
if (limit == 0 || offset + limit > size)
limit = size - offset;
ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(FieldType) * limit);
if constexpr (std::endian::native == std::endian::big && sizeof(T) >= 2)
{
for (size_t i = 0; i < limit; i++)
{
auto tmp(x[offset+i]);
char *start = reinterpret_cast<char*>(&tmp);
char *end = start + sizeof(FieldType);
std::reverse(start, end);
ostr.write(reinterpret_cast<const char *>(&tmp), sizeof(FieldType));
}
}
else
ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(FieldType) * limit);
}
template <typename T>
void SerializationDecimalBase<T>::deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const
{
typename FieldType::NativeType x;
readBinary(x, istr);
readBinaryLittleEndian(x, istr);
field = DecimalField(T(x), this->scale);
}
@ -50,7 +61,7 @@ template <typename T>
void SerializationDecimalBase<T>::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
{
typename FieldType::NativeType x;
readBinary(x, istr);
readBinaryLittleEndian(x, istr);
assert_cast<ColumnType &>(column).getData().push_back(FieldType(x));
}
@ -61,6 +72,16 @@ void SerializationDecimalBase<T>::deserializeBinaryBulk(IColumn & column, ReadBu
size_t initial_size = x.size();
x.resize(initial_size + limit);
size_t size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(FieldType) * limit);
if constexpr (std::endian::native == std::endian::big && sizeof(T) >= 2)
{
for (size_t i = 0; i < limit; i++)
{
char *start = reinterpret_cast<char*>(&x[initial_size + i]);
char *end = start + sizeof(FieldType);
std::reverse(start, end);
}
}
x.resize(initial_size + size / sizeof(FieldType));
}

View File

@ -132,13 +132,13 @@ struct IndexesSerializationType
val |= NeedGlobalDictionaryBit;
if (need_update_dictionary)
val |= NeedUpdateDictionary;
writeIntBinary(val, buffer);
writeBinaryLittleEndian(val, buffer);
}
void deserialize(ReadBuffer & buffer, const ISerialization::DeserializeBinaryBulkSettings & settings)
{
SerializationType val;
readIntBinary(val, buffer);
readBinaryLittleEndian(val, buffer);
checkType(val);
has_additional_keys = (val & HasAdditionalKeysBit) != 0;
@ -235,7 +235,7 @@ void SerializationLowCardinality::serializeBinaryBulkStatePrefix(
/// Write version and create SerializeBinaryBulkState.
UInt64 key_version = KeysSerializationVersion::SharedDictionariesWithAdditionalKeys;
writeIntBinary(key_version, *stream);
writeBinaryLittleEndian(key_version, *stream);
state = std::make_shared<SerializeStateLowCardinality>(key_version);
}
@ -259,7 +259,7 @@ void SerializationLowCardinality::serializeBinaryBulkStateSuffix(
throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream in SerializationLowCardinality::serializeBinaryBulkStateSuffix");
UInt64 num_keys = nested_column->size();
writeIntBinary(num_keys, *stream);
writeBinaryLittleEndian(num_keys, *stream);
dict_inner_serialization->serializeBinaryBulk(*nested_column, *stream, 0, num_keys);
low_cardinality_state->shared_dictionary = nullptr;
}
@ -277,7 +277,7 @@ void SerializationLowCardinality::deserializeBinaryBulkStatePrefix(
return;
UInt64 keys_version;
readIntBinary(keys_version, *stream);
readBinaryLittleEndian(keys_version, *stream);
state = std::make_shared<DeserializeStateLowCardinality>(keys_version);
}
@ -492,7 +492,7 @@ void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams(
{
const auto & nested_column = global_dictionary->getNestedNotNullableColumn();
UInt64 num_keys = nested_column->size();
writeIntBinary(num_keys, *keys_stream);
writeBinaryLittleEndian(num_keys, *keys_stream);
dict_inner_serialization->serializeBinaryBulk(*nested_column, *keys_stream, 0, num_keys);
low_cardinality_state->shared_dictionary = nullptr;
}
@ -500,12 +500,12 @@ void SerializationLowCardinality::serializeBinaryBulkWithMultipleStreams(
if (need_additional_keys)
{
UInt64 num_keys = keys->size();
writeIntBinary(num_keys, *indexes_stream);
writeBinaryLittleEndian(num_keys, *indexes_stream);
dict_inner_serialization->serializeBinaryBulk(*keys, *indexes_stream, 0, num_keys);
}
UInt64 num_rows = positions->size();
writeIntBinary(num_rows, *indexes_stream);
writeBinaryLittleEndian(num_rows, *indexes_stream);
auto index_serialization = index_version.getDataType()->getDefaultSerialization();
index_serialization->serializeBinaryBulk(*positions, *indexes_stream, 0, num_rows);
}
@ -541,7 +541,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams(
auto read_dictionary = [this, low_cardinality_state, keys_stream]()
{
UInt64 num_keys;
readIntBinary(num_keys, *keys_stream);
readBinaryLittleEndian(num_keys, *keys_stream);
auto keys_type = removeNullable(dictionary_type);
auto global_dict_keys = keys_type->createColumn();
@ -554,7 +554,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams(
auto read_additional_keys = [this, low_cardinality_state, indexes_stream]()
{
UInt64 num_keys;
readIntBinary(num_keys, *indexes_stream);
readBinaryLittleEndian(num_keys, *indexes_stream);
auto keys_type = removeNullable(dictionary_type);
auto additional_keys = keys_type->createColumn();
@ -660,7 +660,7 @@ void SerializationLowCardinality::deserializeBinaryBulkWithMultipleStreams(
else
low_cardinality_state->additional_keys = nullptr;
readIntBinary(low_cardinality_state->num_pending_rows, *indexes_stream);
readBinaryLittleEndian(low_cardinality_state->num_pending_rows, *indexes_stream);
}
size_t num_rows_to_read = std::min<UInt64>(limit, low_cardinality_state->num_pending_rows);

View File

@ -3,6 +3,7 @@
#include <Interpreters/Cache/FileCache.h>
#include <Common/logger_useful.h>
#include <Common/assert_cast.h>
#include <Common/filesystemHelpers.h>
#include <Disks/DiskFactory.h>
#include <Disks/ObjectStorages/Cached/CachedObjectStorage.h>
#include <Disks/ObjectStorages/DiskObjectStorage.h>
@ -40,10 +41,24 @@ void registerDiskCache(DiskFactory & factory, bool /* global_skip_access_check *
FileCacheSettings file_cache_settings;
file_cache_settings.loadFromConfig(config, config_prefix);
if (file_cache_settings.base_path.empty())
file_cache_settings.base_path = fs::path(context->getPath()) / "disks" / name / "cache/";
else if (fs::path(file_cache_settings.base_path).is_relative())
file_cache_settings.base_path = fs::path(context->getPath()) / "caches" / file_cache_settings.base_path;
auto config_fs_caches_dir = context->getFilesystemCachesPath();
if (config_fs_caches_dir.empty())
{
if (fs::path(file_cache_settings.base_path).is_relative())
file_cache_settings.base_path = fs::path(context->getPath()) / "caches" / file_cache_settings.base_path;
}
else
{
if (fs::path(file_cache_settings.base_path).is_relative())
file_cache_settings.base_path = fs::path(config_fs_caches_dir) / file_cache_settings.base_path;
if (!pathStartsWith(file_cache_settings.base_path, config_fs_caches_dir))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Filesystem cache path {} must lie inside default filesystem cache path `{}`",
file_cache_settings.base_path, config_fs_caches_dir);
}
}
auto cache = FileCacheFactory::instance().getOrCreate(name, file_cache_settings);
auto disk = disk_it->second;
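
With the resolution logic above, a relative cache `path` is placed under the configured `filesystem_caches_path` and must stay inside it, otherwise `BAD_ARGUMENTS` is thrown. A hedged sketch of a cache-disk definition using a relative path (the disk names, the underlying disk and the size are illustrative):

``` xml
<clickhouse>
    <storage_configuration>
        <disks>
            <!-- hypothetical cache disk over a hypothetical "s3_disk" -->
            <s3_cache_disk>
                <type>cache</type>
                <disk>s3_disk</disk>
                <!-- relative: resolved to <filesystem_caches_path>/s3_cache/ by the code above -->
                <path>s3_cache/</path>
                <max_size>10737418240</max_size> <!-- 10 GiB, for illustration -->
            </s3_cache_disk>
        </disks>
    </storage_configuration>
</clickhouse>
```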

View File

@ -133,6 +133,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.parquet.parallel_encoding = settings.output_format_parquet_parallel_encoding;
format_settings.parquet.data_page_size = settings.output_format_parquet_data_page_size;
format_settings.parquet.write_batch_size = settings.output_format_parquet_batch_size;
format_settings.parquet.local_read_min_bytes_for_seek = settings.input_format_parquet_local_file_min_bytes_for_seek;
format_settings.pretty.charset = settings.output_format_pretty_grid_charset.toString() == "ASCII" ? FormatSettings::Pretty::Charset::ASCII : FormatSettings::Pretty::Charset::UTF8;
format_settings.pretty.color = settings.output_format_pretty_color;
format_settings.pretty.max_column_pad_width = settings.output_format_pretty_max_column_pad_width;

View File

@ -241,6 +241,7 @@ struct FormatSettings
bool output_compliant_nested_types = true;
size_t data_page_size = 1024 * 1024;
size_t write_batch_size = 1024;
size_t local_read_min_bytes_for_seek = 8192;
} parquet;
struct Pretty

View File

@ -1354,7 +1354,7 @@ Exception readException(ReadBuffer & buf, const String & additional_message, boo
String stack_trace;
bool has_nested = false; /// Obsolete
readBinary(code, buf);
readBinaryLittleEndian(code, buf);
readBinary(name, buf);
readBinary(message, buf);
readBinary(stack_trace, buf);

View File

@ -40,7 +40,12 @@ void UpdatableSession<TSessionFactory>::updateSession(const Poco::URI & uri)
if (redirects <= max_redirects)
session = session_factory->buildNewSession(uri);
else
throw Exception(ErrorCodes::TOO_MANY_REDIRECTS, "Too many redirects while trying to access {}", initial_uri.toString());
throw Exception(ErrorCodes::TOO_MANY_REDIRECTS,
"Too many redirects while trying to access {}."
" You can {} redirects by changing the setting 'max_http_get_redirects'."
" Example: `SET max_http_get_redirects = 10`."
" Redirects are restricted to prevent possible attack when a malicious server redirects to an internal resource, bypassing the authentication or firewall.",
initial_uri.toString(), max_redirects ? "increase the allowed maximum number of" : "allow");
}
template <typename TSessionFactory>
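
The expanded message points users at the session setting itself; following that hint looks like this (the value 10 is just the example quoted in the message):

``` sql
-- Allow up to 10 redirect hops for HTTP(S) reads in this session,
-- as suggested by the exception text above.
SET max_http_get_redirects = 10;
```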

View File

@ -52,9 +52,9 @@ URI::URI(const std::string & uri_)
has_version_id = true;
}
/// Poco::URI will ignore '?' when parsing the path, but if there is a vestionId in the http parameter,
/// Poco::URI will ignore '?' when parsing the path, but if there is a versionId in the http parameter,
/// '?' can not be used as a wildcard, otherwise it will be ambiguous.
/// If no "vertionId" in the http parameter, '?' can be used as a wildcard.
/// If no "versionId" in the http parameter, '?' can be used as a wildcard.
/// It is necessary to encode '?' to avoid deletion during parsing path.
if (!has_version_id && uri_.find('?') != String::npos)
{

View File

@ -77,7 +77,7 @@ void writeIPv6Text(const IPv6 & ip, WriteBuffer & buf)
void writeException(const Exception & e, WriteBuffer & buf, bool with_stack_trace)
{
writeBinary(e.code(), buf);
writeBinaryLittleEndian(e.code(), buf);
writeBinary(String(e.name()), buf);
writeBinary(e.displayText() + getExtraExceptionInfo(e), buf);

View File

@ -51,13 +51,12 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
FileCache::FileCache(const FileCacheSettings & settings)
FileCache::FileCache(const std::string & cache_name, const FileCacheSettings & settings)
: max_file_segment_size(settings.max_file_segment_size)
, bypass_cache_threshold(settings.enable_bypass_cache_with_threashold ? settings.bypass_cache_threashold : 0)
, delayed_cleanup_interval_ms(settings.delayed_cleanup_interval_ms)
, boundary_alignment(settings.boundary_alignment)
, background_download_threads(settings.background_download_threads)
, log(&Poco::Logger::get("FileCache"))
, log(&Poco::Logger::get("FileCache(" + cache_name + ")"))
, metadata(settings.base_path)
{
main_priority = std::make_unique<LRUFileCachePriority>(settings.max_size, settings.max_elements);
@ -134,9 +133,7 @@ void FileCache::initialize()
for (size_t i = 0; i < background_download_threads; ++i)
download_threads.emplace_back([this] { metadata.downloadThreadFunc(); });
cleanup_task = Context::getGlobalContextInstance()->getSchedulePool().createTask("FileCacheCleanup", [this]{ cleanupThreadFunc(); });
cleanup_task->activate();
cleanup_task->scheduleAfter(delayed_cleanup_interval_ms);
cleanup_thread = std::make_unique<ThreadFromGlobalPool>(std::function{ [this]{ metadata.cleanupThreadFunc(); }});
}
CacheGuard::Lock FileCache::lockCache() const
@ -823,23 +820,13 @@ bool FileCache::tryReserve(FileSegment & file_segment, const size_t size, FileCa
void FileCache::removeKey(const Key & key)
{
assertInitialized();
auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::THROW);
locked_key->removeAll();
metadata.removeKey(key, /* if_exists */false, /* if_releasable */true);
}
void FileCache::removeKeyIfExists(const Key & key)
{
assertInitialized();
auto locked_key = metadata.lockKeyMetadata(key, CacheMetadata::KeyNotFoundPolicy::RETURN_NULL);
if (!locked_key)
return;
/// In ordinary case we remove data from cache when it's not used by anyone.
/// But if we have multiple replicated zero-copy tables on the same server
/// it became possible to start removing something from cache when it is used
/// by other "zero-copy" tables. That is why it's not an error.
locked_key->removeAll(/* if_releasable */true);
metadata.removeKey(key, /* if_exists */true, /* if_releasable */true);
}
void FileCache::removeFileSegment(const Key & key, size_t offset)
@ -857,8 +844,7 @@ void FileCache::removePathIfExists(const String & path)
void FileCache::removeAllReleasable()
{
assertInitialized();
metadata.iterate([](LockedKey & locked_key) { locked_key.removeAll(/* if_releasable */true); });
metadata.removeAllKeys(/* if_releasable */true);
if (stash)
{
@ -990,6 +976,7 @@ void FileCache::loadMetadata()
fs::remove(offset_it->path());
continue;
}
LOG_TEST(log, "Added file segment {}:{} (size: {}) with path: {}", key, offset, size, offset_it->path().string());
const auto & file_segment_metadata = file_segment_metadata_it->second;
chassert(file_segment_metadata->file_segment->assertCorrectness());
@ -1039,33 +1026,15 @@ FileCache::~FileCache()
void FileCache::deactivateBackgroundOperations()
{
if (cleanup_task)
cleanup_task->deactivate();
metadata.cancelDownload();
metadata.cancelCleanup();
for (auto & thread : download_threads)
if (thread.joinable())
thread.join();
}
void FileCache::cleanup()
{
metadata.doCleanup();
}
void FileCache::cleanupThreadFunc()
{
try
{
cleanup();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
chassert(false);
}
cleanup_task->scheduleAfter(delayed_cleanup_interval_ms);
if (cleanup_thread && cleanup_thread->joinable())
cleanup_thread->join();
}
FileSegmentsHolderPtr FileCache::getSnapshot()

View File

@ -12,7 +12,7 @@
#include <IO/ReadSettings.h>
#include <Core/BackgroundSchedulePool.h>
#include <Common/ThreadPool.h>
#include <Interpreters/Cache/LRUFileCachePriority.h>
#include <Interpreters/Cache/FileCache_fwd.h>
#include <Interpreters/Cache/FileSegment.h>
@ -58,7 +58,7 @@ public:
using PriorityIterator = IFileCachePriority::Iterator;
using PriorityIterationResult = IFileCachePriority::IterationResult;
explicit FileCache(const FileCacheSettings & settings);
FileCache(const std::string & cache_name, const FileCacheSettings & settings);
~FileCache();
@ -130,8 +130,6 @@ public:
FileSegmentsHolderPtr dumpQueue();
void cleanup();
void deactivateBackgroundOperations();
/// For per query cache limit.
@ -157,7 +155,6 @@ private:
const size_t max_file_segment_size;
const size_t bypass_cache_threshold = 0;
const size_t delayed_cleanup_interval_ms;
const size_t boundary_alignment;
const size_t background_download_threads;
@ -202,9 +199,8 @@ private:
* A background cleanup task.
* Clears removed cache entries from metadata.
*/
BackgroundSchedulePool::TaskHolder cleanup_task;
std::vector<ThreadFromGlobalPool> download_threads;
std::unique_ptr<ThreadFromGlobalPool> cleanup_thread;
void assertInitialized() const;
@ -235,8 +231,6 @@ private:
FileSegment::State state,
const CreateFileSegmentSettings & create_settings,
const CacheGuard::Lock *);
void cleanupThreadFunc();
};
}

View File

@ -29,7 +29,7 @@ FileCachePtr FileCacheFactory::getOrCreate(
auto it = caches_by_name.find(cache_name);
if (it == caches_by_name.end())
{
auto cache = std::make_shared<FileCache>(file_cache_settings);
auto cache = std::make_shared<FileCache>(cache_name, file_cache_settings);
it = caches_by_name.emplace(
cache_name, std::make_unique<FileCacheData>(cache, file_cache_settings)).first;
}

View File

@ -49,8 +49,6 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
if (config.has(config_prefix + ".background_download_threads"))
background_download_threads = config.getUInt(config_prefix + ".background_download_threads");
delayed_cleanup_interval_ms = config.getUInt64(config_prefix + ".delayed_cleanup_interval_ms", FILECACHE_DELAYED_CLEANUP_INTERVAL_MS);
}
}

View File

@ -24,7 +24,6 @@ struct FileCacheSettings
bool enable_bypass_cache_with_threashold = false;
size_t bypass_cache_threashold = FILECACHE_BYPASS_THRESHOLD;
size_t delayed_cleanup_interval_ms = FILECACHE_DELAYED_CLEANUP_INTERVAL_MS;
size_t boundary_alignment = FILECACHE_DEFAULT_FILE_SEGMENT_ALIGNMENT;
size_t background_download_threads = FILECACHE_DEFAULT_BACKGROUND_DOWNLOAD_THREADS;

View File

@ -10,7 +10,6 @@ static constexpr int FILECACHE_DEFAULT_BACKGROUND_DOWNLOAD_THREADS = 2;
static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 10000000;
static constexpr int FILECACHE_DEFAULT_HITS_THRESHOLD = 0;
static constexpr size_t FILECACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024;
static constexpr size_t FILECACHE_DELAYED_CLEANUP_INTERVAL_MS = 1000 * 60; /// 1 min
class FileCache;
using FileCachePtr = std::shared_ptr<FileCache>;

View File

@ -1,7 +1,6 @@
#include <Interpreters/Cache/Metadata.h>
#include <Interpreters/Cache/FileCache.h>
#include <Interpreters/Cache/FileSegment.h>
#include "Common/Exception.h"
#include <Common/logger_useful.h>
#include <Common/ElapsedTimeProfileEventIncrement.h>
#include <filesystem>
@ -11,6 +10,7 @@ namespace fs = std::filesystem;
namespace CurrentMetrics
{
extern const Metric FilesystemCacheDownloadQueueElements;
extern const Metric FilesystemCacheDelayedCleanupElements;
}
namespace ProfileEvents
@ -59,8 +59,8 @@ size_t FileSegmentMetadata::size() const
KeyMetadata::KeyMetadata(
const Key & key_,
const std::string & key_path_,
CleanupQueue & cleanup_queue_,
DownloadQueue & download_queue_,
CleanupQueuePtr cleanup_queue_,
DownloadQueuePtr download_queue_,
Poco::Logger * log_,
std::shared_mutex & key_prefix_directory_mutex_,
bool created_base_directory_)
@ -89,15 +89,19 @@ LockedKeyPtr KeyMetadata::lock()
LockedKeyPtr KeyMetadata::tryLock()
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheLockKeyMicroseconds);
auto locked = std::make_unique<LockedKey>(shared_from_this());
auto locked = lockNoStateCheck();
if (key_state == KeyMetadata::KeyState::ACTIVE)
return locked;
return nullptr;
}
LockedKeyPtr KeyMetadata::lockNoStateCheck()
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheLockKeyMicroseconds);
return std::make_unique<LockedKey>(shared_from_this());
}
bool KeyMetadata::createBaseDirectory()
{
if (!created_base_directory.exchange(true))
@ -131,26 +135,10 @@ std::string KeyMetadata::getFileSegmentPath(const FileSegment & file_segment)
}
class CleanupQueue
{
friend struct CacheMetadata;
public:
void add(const FileCacheKey & key);
void remove(const FileCacheKey & key);
size_t getSize() const;
private:
bool tryPop(FileCacheKey & key);
std::unordered_set<FileCacheKey> keys;
mutable std::mutex mutex;
};
CacheMetadata::CacheMetadata(const std::string & path_)
: path(path_)
, cleanup_queue(std::make_unique<CleanupQueue>())
, download_queue(std::make_unique<DownloadQueue>())
, cleanup_queue(std::make_shared<CleanupQueue>())
, download_queue(std::make_shared<DownloadQueue>())
, log(&Poco::Logger::get("CacheMetadata"))
{
}
@ -207,20 +195,16 @@ LockedKeyPtr CacheMetadata::lockKeyMetadata(
it = emplace(
key, std::make_shared<KeyMetadata>(
key, getPathForKey(key), *cleanup_queue, *download_queue, log, key_prefix_directory_mutex, is_initial_load)).first;
key, getPathForKey(key), cleanup_queue, download_queue, log, key_prefix_directory_mutex, is_initial_load)).first;
}
key_metadata = it->second;
}
{
LockedKeyPtr locked_metadata;
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheLockKeyMicroseconds);
locked_metadata = std::make_unique<LockedKey>(key_metadata);
}
auto locked_metadata = key_metadata->lockNoStateCheck();
const auto key_state = locked_metadata->getKeyState();
if (key_state == KeyMetadata::KeyState::ACTIVE)
return locked_metadata;
@ -249,17 +233,12 @@ LockedKeyPtr CacheMetadata::lockKeyMetadata(
return lockKeyMetadata(key, key_not_found_policy);
}
void CacheMetadata::iterate(IterateCacheMetadataFunc && func)
void CacheMetadata::iterate(IterateFunc && func)
{
auto lock = lockMetadata();
for (const auto & [key, key_metadata] : *this)
for (auto & [key, key_metadata] : *this)
{
LockedKeyPtr locked_key;
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheLockKeyMicroseconds);
locked_key = std::make_unique<LockedKey>(key_metadata);
}
auto locked_key = key_metadata->lockNoStateCheck();
const auto key_state = locked_key->getKeyState();
if (key_state == KeyMetadata::KeyState::ACTIVE)
@ -267,8 +246,7 @@ void CacheMetadata::iterate(IterateCacheMetadataFunc && func)
func(*locked_key);
continue;
}
if (key_state == KeyMetadata::KeyState::REMOVING)
else if (key_state == KeyMetadata::KeyState::REMOVING)
continue;
throw Exception(
@ -276,62 +254,185 @@ void CacheMetadata::iterate(IterateCacheMetadataFunc && func)
}
}
void CacheMetadata::doCleanup()
void CacheMetadata::removeAllKeys(bool if_releasable)
{
auto lock = lockMetadata();
FileCacheKey cleanup_key;
while (cleanup_queue->tryPop(cleanup_key))
for (auto it = begin(); it != end();)
{
auto it = find(cleanup_key);
if (it == end())
continue;
LockedKeyPtr locked_metadata;
auto locked_key = it->second->lockNoStateCheck();
if (locked_key->getKeyState() == KeyMetadata::KeyState::ACTIVE)
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheLockKeyMicroseconds);
locked_metadata = std::make_unique<LockedKey>(it->second);
bool removed_all = locked_key->removeAllFileSegments(if_releasable);
if (removed_all)
{
it = removeEmptyKey(it, *locked_key, lock);
continue;
}
}
++it;
}
}
const auto key_state = locked_metadata->getKeyState();
if (key_state == KeyMetadata::KeyState::ACTIVE)
void CacheMetadata::removeKey(const Key & key, bool if_exists, bool if_releasable)
{
auto metadata_lock = lockMetadata();
auto it = find(key);
if (it == end())
{
if (if_exists)
return;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key: {}", key);
}
auto locked_key = it->second->lockNoStateCheck();
auto state = locked_key->getKeyState();
if (state != KeyMetadata::KeyState::ACTIVE)
{
if (if_exists)
return;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key: {} (state: {})", key, magic_enum::enum_name(state));
}
bool removed_all = locked_key->removeAllFileSegments(if_releasable);
if (removed_all)
removeEmptyKey(it, *locked_key, metadata_lock);
}
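/// Illustrative call sites for the two flags above (not part of the patch):
///
/// Tolerant removal, e.g. when the source object may already be gone:
///     metadata.removeKey(key, /* if_exists */ true, /* if_releasable */ true);
///
/// Strict removal: throws BAD_ARGUMENTS if the key is missing or not ACTIVE,
/// and does not skip segments that are still in use:
///     metadata.removeKey(key, /* if_exists */ false, /* if_releasable */ false);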
CacheMetadata::iterator CacheMetadata::removeEmptyKey(iterator it, LockedKey & locked_key, const CacheMetadataGuard::Lock &)
{
const auto & key = locked_key.getKey();
if (!it->second->empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot remove non-empty key: {}", key);
locked_key.markAsRemoved();
auto next_it = erase(it);
LOG_DEBUG(log, "Key {} is removed from metadata", key);
const fs::path key_directory = getPathForKey(key);
const fs::path key_prefix_directory = key_directory.parent_path();
try
{
if (fs::exists(key_directory))
fs::remove_all(key_directory);
}
catch (...)
{
LOG_ERROR(log, "Error while removing key {}: {}", key, getCurrentExceptionMessage(true));
chassert(false);
return next_it;
}
try
{
std::unique_lock mutex(key_prefix_directory_mutex);
if (fs::exists(key_prefix_directory) && fs::is_empty(key_prefix_directory))
fs::remove(key_prefix_directory);
}
catch (...)
{
LOG_ERROR(log, "Error while removing key {}: {}", key, getCurrentExceptionMessage(true));
chassert(false);
}
return next_it;
}
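/// Note on the second block above: the prefix directory is the parent of the key directory
/// (key_directory.parent_path()) and it is removed only when empty; the unique_lock on
/// key_prefix_directory_mutex presumably pairs with a shared lock taken where a key (re)creates
/// its base directory, so the prefix directory cannot disappear under a concurrent creation.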
class CleanupQueue
{
friend struct CacheMetadata;
public:
void add(const FileCacheKey & key)
{
bool inserted;
{
/// Key was added back to cache after we submitted it to removal queue.
continue;
std::lock_guard lock(mutex);
if (cancelled)
return;
inserted = keys.insert(key).second;
}
chassert(it->second->empty());
locked_metadata->markAsRemoved();
erase(it);
LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key);
const fs::path key_directory = getPathForKey(cleanup_key);
const fs::path key_prefix_directory = key_directory.parent_path();
try
/// There is an invariant that a key cannot be submitted for removal if it is already in the removal queue,
/// because:
/// 1) when a key is submitted for removal it acquires state REMOVING, and we submit a key for removal only if it has ACTIVE state;
/// 2) if a key is added to the cache and is found in the removal queue, it is removed from the queue and gets state ACTIVE;
/// and both these actions are synchronized by the same KeyGuard.
chassert(inserted);
if (inserted)
{
if (fs::exists(key_directory))
fs::remove_all(key_directory);
}
catch (...)
{
LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true));
chassert(false);
continue;
}
try
{
std::unique_lock mutex(key_prefix_directory_mutex);
if (fs::exists(key_prefix_directory) && fs::is_empty(key_prefix_directory))
fs::remove(key_prefix_directory);
}
catch (...)
{
LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true));
chassert(false);
CurrentMetrics::add(CurrentMetrics::FilesystemCacheDelayedCleanupElements);
cv.notify_one();
}
}
void cancel()
{
{
std::lock_guard lock(mutex);
cancelled = true;
}
cv.notify_all();
}
private:
std::unordered_set<FileCacheKey> keys;
mutable std::mutex mutex;
std::condition_variable cv;
bool cancelled = false;
};
void CacheMetadata::cleanupThreadFunc()
{
while (true)
{
Key key;
{
std::unique_lock lock(cleanup_queue->mutex);
if (cleanup_queue->cancelled)
return;
auto & keys = cleanup_queue->keys;
if (keys.empty())
{
cleanup_queue->cv.wait(lock, [&](){ return cleanup_queue->cancelled || !keys.empty(); });
if (cleanup_queue->cancelled)
return;
}
auto it = keys.begin();
key = *it;
keys.erase(it);
}
CurrentMetrics::sub(CurrentMetrics::FilesystemCacheDelayedCleanupElements);
try
{
auto lock = lockMetadata();
auto it = find(key);
if (it == end())
continue;
auto locked_key = it->second->lockNoStateCheck();
if (locked_key->getKeyState() == KeyMetadata::KeyState::REMOVING)
{
removeEmptyKey(it, *locked_key, lock);
}
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
void CacheMetadata::cancelCleanup()
{
cleanup_queue->cancel();
}
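/// A sketch of the intended lifecycle of the pieces above (thread ownership is an assumption
/// here; in the real code the thread is managed by FileCache, which is not part of this hunk):
///
///     CacheMetadata metadata("/var/lib/clickhouse/filesystem_cache/");
///     ThreadFromGlobalPool cleanup_thread([&] { metadata.cleanupThreadFunc(); });
///     /// ... when the last LockedKey owner finds a key empty, ~LockedKey marks it REMOVING
///     /// and pushes it into the CleanupQueue, cleanupThreadFunc() picks it up ...
///     metadata.cancelCleanup();   /// wakes the thread and makes it return
///     cleanup_thread.join();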
class DownloadQueue
@ -342,6 +443,8 @@ public:
{
{
std::lock_guard lock(mutex);
if (cancelled)
return;
queue.push(DownloadInfo{file_segment->key(), file_segment->offset(), file_segment});
}
@ -390,14 +493,14 @@ void CacheMetadata::downloadThreadFunc()
{
std::unique_lock lock(download_queue->mutex);
if (download_queue->cancelled)
return;
if (download_queue->queue.empty())
{
download_queue->cv.wait(lock);
continue;
download_queue->cv.wait(lock, [&](){ return download_queue->cancelled || !download_queue->queue.empty(); });
if (download_queue->cancelled)
return;
}
auto entry = download_queue->queue.front();
@ -541,9 +644,15 @@ LockedKey::~LockedKey()
if (!key_metadata->empty() || getKeyState() != KeyMetadata::KeyState::ACTIVE)
return;
/// If the state is ACTIVE and the key turns out to be empty, we submit it for delayed removal,
/// because we do not want to take the whole cache metadata lock every time we remove file segments.
/// Sometimes we do remove the empty key without delay - in that case the key state
/// is already REMOVED here and we return in the check above.
/// See the comment near cleanupThreadFunc() for more details.
key_metadata->key_state = KeyMetadata::KeyState::REMOVING;
LOG_DEBUG(key_metadata->log, "Submitting key {} for removal", getKey());
key_metadata->cleanup_queue.add(getKey());
key_metadata->cleanup_queue->add(getKey());
}
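/// Key state transitions implied by this patch, summarised as a reading aid (no new behaviour):
///   ACTIVE   --(~LockedKey finds the key empty)-------------------> REMOVING, pushed to CleanupQueue
///   REMOVING --(cleanupThreadFunc() -> removeEmptyKey())----------> erased from metadata, markAsRemoved()
///   REMOVING --(key requested again, removeFromCleanupQueue())----> ACTIVE (see the CleanupQueue invariant comment above)
///   ACTIVE   --(removeKey()/removeAllKeys() emptied the key)------> erased immediately, bypassing the queue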
void LockedKey::removeFromCleanupQueue()
@ -566,13 +675,15 @@ bool LockedKey::isLastOwnerOfFileSegment(size_t offset) const
return file_segment_metadata->file_segment.use_count() == 2;
}
void LockedKey::removeAll(bool if_releasable)
bool LockedKey::removeAllFileSegments(bool if_releasable)
{
bool removed_all = true;
for (auto it = key_metadata->begin(); it != key_metadata->end();)
{
if (if_releasable && !it->second->releasable())
{
++it;
removed_all = false;
continue;
}
else if (it->second->evicting())
@ -583,12 +694,14 @@ void LockedKey::removeAll(bool if_releasable)
/// so if we remove file segment now, we break the freeable_count
/// calculation in tryReserve.
++it;
removed_all = false;
continue;
}
auto file_segment = it->second->file_segment;
it = removeFileSegment(file_segment->offset(), file_segment->lock());
}
return removed_all;
}
KeyMetadata::iterator LockedKey::removeFileSegment(size_t offset)
@ -687,7 +800,7 @@ void LockedKey::addToDownloadQueue(size_t offset, const FileSegmentGuard::Lock &
auto it = key_metadata->find(offset);
if (it == key_metadata->end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no offset {}", offset);
key_metadata->download_queue.add(it->second->file_segment);
key_metadata->download_queue->add(it->second->file_segment);
}
std::optional<FileSegment::Range> LockedKey::hasIntersectingRange(const FileSegment::Range & range) const
@ -757,35 +870,4 @@ std::string LockedKey::toString() const
return result;
}
void CleanupQueue::add(const FileCacheKey & key)
{
std::lock_guard lock(mutex);
keys.insert(key);
}
void CleanupQueue::remove(const FileCacheKey & key)
{
std::lock_guard lock(mutex);
bool erased = keys.erase(key);
if (!erased)
throw Exception(ErrorCodes::LOGICAL_ERROR, "No such key {} in removal queue", key);
}
bool CleanupQueue::tryPop(FileCacheKey & key)
{
std::lock_guard lock(mutex);
if (keys.empty())
return false;
auto it = keys.begin();
key = *it;
keys.erase(it);
return true;
}
size_t CleanupQueue::getSize() const
{
std::lock_guard lock(mutex);
return keys.size();
}
}

View File

@ -48,8 +48,8 @@ struct KeyMetadata : public std::map<size_t, FileSegmentMetadataPtr>,
KeyMetadata(
const Key & key_,
const std::string & key_path_,
CleanupQueue & cleanup_queue_,
DownloadQueue & download_queue_,
CleanupQueuePtr cleanup_queue_,
DownloadQueuePtr download_queue_,
Poco::Logger * log_,
std::shared_mutex & key_prefix_directory_mutex_,
bool created_base_directory_ = false);
@ -69,6 +69,8 @@ struct KeyMetadata : public std::map<size_t, FileSegmentMetadataPtr>,
/// Return nullptr if key has non-ACTIVE state.
LockedKeyPtr tryLock();
LockedKeyPtr lockNoStateCheck();
bool createBaseDirectory();
std::string getFileSegmentPath(const FileSegment & file_segment);
@ -76,8 +78,8 @@ struct KeyMetadata : public std::map<size_t, FileSegmentMetadataPtr>,
private:
KeyState key_state = KeyState::ACTIVE;
KeyGuard guard;
CleanupQueue & cleanup_queue;
DownloadQueue & download_queue;
const CleanupQueuePtr cleanup_queue;
const DownloadQueuePtr download_queue;
std::shared_mutex & key_prefix_directory_mutex;
std::atomic<bool> created_base_directory = false;
Poco::Logger * log;
@ -90,7 +92,7 @@ struct CacheMetadata : public std::unordered_map<FileCacheKey, KeyMetadataPtr>,
{
public:
using Key = FileCacheKey;
using IterateCacheMetadataFunc = std::function<void(LockedKey &)>;
using IterateFunc = std::function<void(LockedKey &)>;
explicit CacheMetadata(const std::string & path_);
@ -104,7 +106,7 @@ public:
String getPathForKey(const Key & key) const;
static String getFileNameForFileSegment(size_t offset, FileSegmentKind segment_kind);
void iterate(IterateCacheMetadataFunc && func);
void iterate(IterateFunc && func);
enum class KeyNotFoundPolicy
{
@ -119,7 +121,20 @@ public:
KeyNotFoundPolicy key_not_found_policy,
bool is_initial_load = false);
void doCleanup();
void removeKey(const Key & key, bool if_exists, bool if_releasable);
void removeAllKeys(bool if_releasable);
void cancelCleanup();
/// Firstly, this cleanup does not delete cache files,
/// but only removes empty keys from cache_metadata_map and key (prefix) directories from the filesystem.
/// Secondly, it deletes those only if they became empty as a result of
/// (1) eviction in FileCache::tryReserve();
/// (2) removal of cancelled non-downloaded file segments after FileSegment::complete();
/// it does not cover removal of cache files via FileCache::removeKey/removeAllKeys,
/// which is triggered by removal of source files from object storage.
/// Therefore the number of elements submitted to background cleanup should remain low.
void cleanupThreadFunc();
void downloadThreadFunc();
@ -129,12 +144,13 @@ private:
CacheMetadataGuard::Lock lockMetadata() const;
const std::string path; /// Cache base path
mutable CacheMetadataGuard guard;
const CleanupQueuePtr cleanup_queue;
const DownloadQueuePtr download_queue;
CleanupQueuePtr cleanup_queue;
DownloadQueuePtr download_queue;
std::shared_mutex key_prefix_directory_mutex;
Poco::Logger * log;
void downloadImpl(FileSegment & file_segment, std::optional<Memory<>> & memory);
iterator removeEmptyKey(iterator it, LockedKey &, const CacheMetadataGuard::Lock &);
};
@ -174,7 +190,7 @@ struct LockedKey : private boost::noncopyable
std::shared_ptr<const KeyMetadata> getKeyMetadata() const { return key_metadata; }
std::shared_ptr<KeyMetadata> getKeyMetadata() { return key_metadata; }
void removeAll(bool if_releasable = true);
bool removeAllFileSegments(bool if_releasable = true);
KeyMetadata::iterator removeFileSegment(size_t offset, const FileSegmentGuard::Lock &);
KeyMetadata::iterator removeFileSegment(size_t offset);

View File

@ -273,6 +273,8 @@ public:
/// Are distributed DDL Queries (ON CLUSTER Clause) allowed for this cluster
bool areDistributedDDLQueriesAllowed() const { return allow_distributed_ddl_queries; }
const String & getName() const { return name; }
private:
SlotToShard slot_to_shard;

View File

@ -28,7 +28,7 @@ namespace DB
namespace ErrorCodes
{
extern const int TOO_LARGE_DISTRIBUTED_DEPTH;
extern const int SUPPORT_IS_DISABLED;
extern const int LOGICAL_ERROR;
}
namespace ClusterProxy
@ -234,7 +234,8 @@ void executeQuery(
std::move(external_tables),
log,
shards,
query_info.storage_limits);
query_info.storage_limits,
query_info.getCluster()->getName());
read_from_remote->setStepDescription("Read from remote replica");
plan->addStep(std::move(read_from_remote));
@ -266,20 +267,57 @@ void executeQueryWithParallelReplicas(
const StorageID & main_table,
const ASTPtr & table_func_ptr,
SelectStreamFactory & stream_factory,
const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info,
const ASTPtr & query_ast,
ContextPtr context,
const SelectQueryInfo & query_info,
const ClusterPtr & not_optimized_cluster)
{
if (not_optimized_cluster->getShardsInfo().size() != 1)
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Cluster for parallel replicas should consist only from one shard");
auto shard_info = not_optimized_cluster->getShardsInfo().front();
const auto & settings = context->getSettingsRef();
ClusterPtr new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings);
auto new_context = Context::createCopy(context);
auto scalars = new_context->hasQueryContext() ? new_context->getQueryContext()->getScalars() : Scalars{};
UInt64 shard_num = 0; /// shard_num is 1-based, so 0 means no shard is specified
const auto it = scalars.find("_shard_num");
if (it != scalars.end())
{
const Block & block = it->second;
const auto & column = block.safeGetByPosition(0).column;
shard_num = column->getUInt(0);
}
size_t all_replicas_count = 0;
ClusterPtr new_cluster;
/// If a valid shard_num was received from the query initiator, the parallel replicas scope is that specific shard.
/// Shards are numbered in order of appearance in the cluster config.
if (shard_num > 0)
{
const auto shard_count = not_optimized_cluster->getShardCount();
if (shard_num > shard_count)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Shard number is greater than shard count: shard_num={} shard_count={} cluster={}",
shard_num,
shard_count,
not_optimized_cluster->getName());
chassert(shard_count == not_optimized_cluster->getShardsAddresses().size());
LOG_DEBUG(&Poco::Logger::get("executeQueryWithParallelReplicas"), "Parallel replicas query in shard scope: shard_num={} cluster={}",
shard_num, not_optimized_cluster->getName());
const auto shard_replicas_num = not_optimized_cluster->getShardsAddresses()[shard_num - 1].size();
all_replicas_count = std::min(static_cast<size_t>(settings.max_parallel_replicas), shard_replicas_num);
/// shard_num is 1-based, but getClusterWithSingleShard expects 0-based index
new_cluster = not_optimized_cluster->getClusterWithSingleShard(shard_num - 1);
}
else
{
new_cluster = not_optimized_cluster->getClusterWithReplicasAsShards(settings);
all_replicas_count = std::min(static_cast<size_t>(settings.max_parallel_replicas), new_cluster->getShardCount());
}
auto all_replicas_count = std::min(static_cast<size_t>(settings.max_parallel_replicas), new_cluster->getShardCount());
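/// A small worked example of the two branches above (numbers are made up, not from the patch),
/// with max_parallel_replicas = 4:
///   - initiator sent _shard_num = 2 and shard 2 has 3 replicas:
///       scope is shard 2 only, new_cluster = getClusterWithSingleShard(1),
///       all_replicas_count = min(4, 3) = 3;
///   - no _shard_num scalar (shard_num = 0):
///       every replica becomes its own "shard" via getClusterWithReplicasAsShards(),
///       all_replicas_count = min(4, new_cluster->getShardCount()).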
auto coordinator = std::make_shared<ParallelReplicasReadingCoordinator>(all_replicas_count);
auto remote_plan = std::make_unique<QueryPlan>();
/// This is a little bit weird, but we construct an "empty" coordinator without
/// any specified reading/coordination method (like Default, InOrder, InReverseOrder)
@ -288,8 +326,6 @@ void executeQueryWithParallelReplicas(
/// to then tell it about the reading method we chose.
query_info.coordinator = coordinator;
auto new_context = Context::createCopy(context);
auto scalars = new_context->hasQueryContext() ? new_context->getQueryContext()->getScalars() : Scalars{};
auto external_tables = new_context->getExternalTables();
auto read_from_remote = std::make_unique<ReadFromParallelRemoteReplicasStep>(

View File

@ -215,6 +215,7 @@ struct ContextSharedPart : boost::noncopyable
String user_files_path; /// Path to the directory with user provided files, usable by 'file' table function.
String dictionaries_lib_path; /// Path to the directory with user provided binaries and libraries for external dictionaries.
String user_scripts_path; /// Path to the directory with user provided scripts.
String filesystem_caches_path; /// Path to the directory with filesystem caches.
ConfigurationPtr config; /// Global configuration settings.
String tmp_path; /// Path to the temporary files that occur when processing the request.
@ -771,6 +772,12 @@ String Context::getUserScriptsPath() const
return shared->user_scripts_path;
}
String Context::getFilesystemCachesPath() const
{
auto lock = getLock();
return shared->filesystem_caches_path;
}
Strings Context::getWarnings() const
{
Strings common_warnings;
@ -862,6 +869,16 @@ void Context::setPath(const String & path)
shared->user_scripts_path = shared->path + "user_scripts/";
}
void Context::setFilesystemCachesPath(const String & path)
{
auto lock = getLock();
if (!fs::path(path).is_absolute())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Filesystem caches path must be absolute: {}", path);
shared->filesystem_caches_path = path;
}
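/// A sketch of how the setter might be wired from the server configuration (the config key name
/// "filesystem_caches_path" is an assumption, it is not taken from this diff):
///
///     if (config().has("filesystem_caches_path"))
///         global_context->setFilesystemCachesPath(config().getString("filesystem_caches_path"));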
static void setupTmpPath(Poco::Logger * log, const std::string & path)
try
{

View File

@ -491,6 +491,7 @@ public:
String getUserFilesPath() const;
String getDictionariesLibPath() const;
String getUserScriptsPath() const;
String getFilesystemCachesPath() const;
/// A list of warnings about server configuration to place in `system.warnings` table.
Strings getWarnings() const;
@ -501,6 +502,8 @@ public:
TemporaryDataOnDiskScopePtr getSharedTempDataOnDisk() const;
void setTempDataOnDisk(TemporaryDataOnDiskScopePtr temp_data_on_disk_);
void setFilesystemCachesPath(const String & path);
void setPath(const String & path);
void setFlagsPath(const String & path);
void setUserFilesPath(const String & path);

View File

@ -25,7 +25,6 @@ static Block getSampleBlock()
ColumnWithTypeAndName{std::make_shared<DataTypeUInt64>(), "current_size"},
ColumnWithTypeAndName{std::make_shared<DataTypeUInt64>(), "current_elements"},
ColumnWithTypeAndName{std::make_shared<DataTypeString>(), "path"},
ColumnWithTypeAndName{std::make_shared<DataTypeNumber<UInt64>>(), "delayed_cleanup_interval_ms"},
ColumnWithTypeAndName{std::make_shared<DataTypeNumber<UInt64>>(), "background_download_threads"},
ColumnWithTypeAndName{std::make_shared<DataTypeNumber<UInt64>>(), "enable_bypass_cache_with_threshold"},
};
@ -54,7 +53,6 @@ BlockIO InterpreterDescribeCacheQuery::execute()
res_columns[i++]->insert(cache->getUsedCacheSize());
res_columns[i++]->insert(cache->getFileSegmentsNum());
res_columns[i++]->insert(cache->getBasePath());
res_columns[i++]->insert(settings.delayed_cleanup_interval_ms);
res_columns[i++]->insert(settings.background_download_threads);
res_columns[i++]->insert(settings.enable_bypass_cache_with_threashold);

View File

@ -1193,12 +1193,12 @@ static InterpolateDescriptionPtr getInterpolateDescription(
}
col_set.clear();
for (const auto & column : source_block)
for (const auto & column : result_block)
{
source_columns.emplace_back(column.name, column.type);
col_set.insert(column.name);
}
for (const auto & column : result_block)
for (const auto & column : source_block)
if (!col_set.contains(column.name))
source_columns.emplace_back(column.name, column.type);
}

View File

@ -16,7 +16,6 @@ NamesAndTypesList MetricLogElement::getNamesAndTypes()
columns_with_type_and_name.emplace_back("event_date", std::make_shared<DataTypeDate>());
columns_with_type_and_name.emplace_back("event_time", std::make_shared<DataTypeDateTime>());
columns_with_type_and_name.emplace_back("event_time_microseconds", std::make_shared<DataTypeDateTime64>(6));
columns_with_type_and_name.emplace_back("milliseconds", std::make_shared<DataTypeUInt64>());
for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i)
{
@ -45,7 +44,6 @@ void MetricLogElement::appendToBlock(MutableColumns & columns) const
columns[column_idx++]->insert(DateLUT::instance().toDayNum(event_time).toUnderType());
columns[column_idx++]->insert(event_time);
columns[column_idx++]->insert(event_time_microseconds);
columns[column_idx++]->insert(milliseconds);
for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i)
columns[column_idx++]->insert(profile_events[i]);
@ -96,7 +94,6 @@ void MetricLog::metricThreadFunction()
MetricLogElement elem;
elem.event_time = std::chrono::system_clock::to_time_t(current_time);
elem.event_time_microseconds = timeInMicroseconds(current_time);
elem.milliseconds = timeInMilliseconds(current_time) - timeInSeconds(current_time) * 1000;
elem.profile_events.resize(ProfileEvents::end());
for (ProfileEvents::Event i = ProfileEvents::Event(0), end = ProfileEvents::end(); i < end; ++i)

View File

@ -22,7 +22,6 @@ struct MetricLogElement
{
time_t event_time{};
Decimal64 event_time_microseconds{};
UInt64 milliseconds{};
std::vector<ProfileEvents::Count> profile_events;
std::vector<CurrentMetrics::Metric> current_metrics;

View File

@ -251,15 +251,26 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
size_t total_number_of_rows = 0;
size_t total_number_of_parts = 0;
size_t total_number_of_tables_system = 0;
size_t total_number_of_bytes_system = 0;
size_t total_number_of_rows_system = 0;
size_t total_number_of_parts_system = 0;
for (const auto & db : databases)
{
/// Check if database can contain MergeTree tables
if (!db.second->canContainMergeTreeTables())
continue;
bool is_system = db.first == DatabaseCatalog::SYSTEM_DATABASE;
for (auto iterator = db.second->getTablesIterator(getContext()); iterator->isValid(); iterator->next())
{
++total_number_of_tables;
if (is_system)
++total_number_of_tables_system;
const auto & table = iterator->table();
if (!table)
continue;
@ -269,9 +280,21 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
const auto & settings = getContext()->getSettingsRef();
calculateMax(max_part_count_for_partition, table_merge_tree->getMaxPartsCountAndSizeForPartition().first);
total_number_of_bytes += table_merge_tree->totalBytes(settings).value();
total_number_of_rows += table_merge_tree->totalRows(settings).value();
total_number_of_parts += table_merge_tree->getActivePartsCount();
size_t bytes = table_merge_tree->totalBytes(settings).value();
size_t rows = table_merge_tree->totalRows(settings).value();
size_t parts = table_merge_tree->getActivePartsCount();
total_number_of_bytes += bytes;
total_number_of_rows += rows;
total_number_of_parts += parts;
if (is_system)
{
total_number_of_bytes_system += bytes;
total_number_of_rows_system += rows;
total_number_of_parts_system += parts;
}
}
if (StorageReplicatedMergeTree * table_replicated_merge_tree = typeid_cast<StorageReplicatedMergeTree *>(table.get()))
@ -325,6 +348,12 @@ void ServerAsynchronousMetrics::updateImpl(AsynchronousMetricValues & new_values
new_values["TotalRowsOfMergeTreeTables"] = { total_number_of_rows, "Total amount of rows (records) stored in all tables of MergeTree family." };
new_values["TotalPartsOfMergeTreeTables"] = { total_number_of_parts, "Total amount of data parts in all tables of MergeTree family."
" Numbers larger than 10 000 will negatively affect the server startup time and it may indicate unreasonable choice of the partition key." };
new_values["NumberOfTablesSystem"] = { total_number_of_tables_system, "Total number of tables in the system database on the server stored in tables of MergeTree family."};
new_values["TotalBytesOfMergeTreeTablesSystem"] = { total_number_of_bytes_system, "Total amount of bytes (compressed, including data and indices) stored in tables of MergeTree family in the system database." };
new_values["TotalRowsOfMergeTreeTablesSystem"] = { total_number_of_rows_system, "Total amount of rows (records) stored in tables of MergeTree family in the system database." };
new_values["TotalPartsOfMergeTreeTablesSystem"] = { total_number_of_parts_system, "Total amount of data parts in tables of MergeTree family in the system database." };
}
#if USE_NURAFT

View File

@ -36,7 +36,6 @@ NamesAndTypesList TextLogElement::getNamesAndTypes()
{"event_date", std::make_shared<DataTypeDate>()},
{"event_time", std::make_shared<DataTypeDateTime>()},
{"event_time_microseconds", std::make_shared<DataTypeDateTime64>(6)},
{"microseconds", std::make_shared<DataTypeUInt32>()},
{"thread_name", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"thread_id", std::make_shared<DataTypeUInt64>()},
@ -62,7 +61,6 @@ void TextLogElement::appendToBlock(MutableColumns & columns) const
columns[i++]->insert(DateLUT::instance().toDayNum(event_time).toUnderType());
columns[i++]->insert(event_time);
columns[i++]->insert(event_time_microseconds);
columns[i++]->insert(microseconds);
columns[i++]->insertData(thread_name.data(), thread_name.size());
columns[i++]->insert(thread_id);

View File

@ -14,7 +14,6 @@ struct TextLogElement
{
time_t event_time{};
Decimal64 event_time_microseconds{};
UInt32 microseconds{};
String thread_name;
UInt64 thread_id{};

View File

@ -209,7 +209,7 @@ TEST_F(FileCacheTest, get)
{
std::cerr << "Step 1\n";
auto cache = DB::FileCache(settings);
auto cache = DB::FileCache("1", settings);
cache.initialize();
auto key = cache.createKeyForPath("key1");
@ -568,7 +568,7 @@ TEST_F(FileCacheTest, get)
{
/// Test LRUCache::restore().
auto cache2 = DB::FileCache(settings);
auto cache2 = DB::FileCache("2", settings);
cache2.initialize();
auto key = cache2.createKeyForPath("key1");
@ -587,7 +587,7 @@ TEST_F(FileCacheTest, get)
settings2.max_file_segment_size = 10;
settings2.base_path = caches_dir / "cache2";
fs::create_directories(settings2.base_path);
auto cache2 = DB::FileCache(settings2);
auto cache2 = DB::FileCache("3", settings2);
cache2.initialize();
auto key = cache2.createKeyForPath("key1");
@ -600,11 +600,10 @@ TEST_F(FileCacheTest, get)
std::cerr << "Step 13\n";
{
/// Test delated cleanup
/// Test delayed cleanup
auto cache = FileCache(settings);
auto cache = FileCache("4", settings);
cache.initialize();
cache.cleanup();
const auto key = cache.createKeyForPath("key10");
const auto key_path = cache.getPathInLocalCache(key);
@ -619,21 +618,15 @@ TEST_F(FileCacheTest, get)
cache.removeAllReleasable();
ASSERT_EQ(cache.getUsedCacheSize(), 0);
ASSERT_TRUE(fs::exists(key_path));
ASSERT_TRUE(!fs::exists(cache.getPathInLocalCache(key, 0, FileSegmentKind::Regular)));
cache.cleanup();
ASSERT_TRUE(!fs::exists(key_path));
ASSERT_TRUE(!fs::exists(fs::path(key_path).parent_path()));
ASSERT_TRUE(!fs::exists(cache.getPathInLocalCache(key, 0, FileSegmentKind::Regular)));
}
std::cerr << "Step 14\n";
{
/// Test background thread delated cleanup
auto settings2{settings};
settings2.delayed_cleanup_interval_ms = 0;
auto cache = DB::FileCache(settings2);
auto cache = DB::FileCache("5", settings);
cache.initialize();
const auto key = cache.createKeyForPath("key10");
const auto key_path = cache.getPathInLocalCache(key);
@ -662,7 +655,7 @@ TEST_F(FileCacheTest, writeBuffer)
settings.max_file_segment_size = 5;
settings.base_path = cache_base_path;
FileCache cache(settings);
FileCache cache("6", settings);
cache.initialize();
auto write_to_cache = [&cache](const String & key, const Strings & data, bool flush)
@ -767,7 +760,7 @@ TEST_F(FileCacheTest, temporaryData)
settings.max_file_segment_size = 1_KiB;
settings.base_path = cache_base_path;
DB::FileCache file_cache(settings);
DB::FileCache file_cache("7", settings);
file_cache.initialize();
auto tmp_data_scope = std::make_shared<TemporaryDataOnDiskScope>(nullptr, &file_cache, 0);
@ -908,7 +901,7 @@ TEST_F(FileCacheTest, CachedReadBuffer)
wb->next();
wb->finalize();
auto cache = std::make_shared<DB::FileCache>(settings);
auto cache = std::make_shared<DB::FileCache>("8", settings);
cache->initialize();
auto key = cache->createKeyForPath(file_path);

View File

@ -117,7 +117,6 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
elem.event_time = msg_ext.time_seconds;
elem.event_time_microseconds = msg_ext.time_in_microseconds;
elem.microseconds = msg_ext.time_microseconds;
elem.thread_name = getThreadName();
elem.thread_id = msg_ext.thread_id;

View File

@ -84,7 +84,24 @@ void ParquetBlockInputFormat::initializeIfNeeded()
std::shared_ptr<arrow::Schema> schema;
THROW_ARROW_NOT_OK(parquet::arrow::FromParquetSchema(metadata->schema(), &schema));
row_groups.resize(metadata->num_row_groups());
int num_row_groups = metadata->num_row_groups();
if (num_row_groups == 0)
return;
row_group_batches.reserve(num_row_groups);
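/// Group row groups into decoding batches: a new batch is started once the previous one has
/// accumulated at least min_bytes_for_seek of compressed data, so several small row groups are
/// decoded together and each batch is read through a single RecordBatchReader
/// (see initializeRowGroupBatchReader() below). Row groups listed in skip_row_groups are
/// excluded here instead of being scheduled and skipped later.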
for (int row_group = 0; row_group < num_row_groups; ++row_group)
{
if (skip_row_groups.contains(row_group))
continue;
if (row_group_batches.empty() || row_group_batches.back().total_bytes_compressed >= min_bytes_for_seek)
row_group_batches.emplace_back();
row_group_batches.back().row_groups_idxs.push_back(row_group);
row_group_batches.back().total_rows += metadata->RowGroup(row_group)->num_rows();
row_group_batches.back().total_bytes_compressed += metadata->RowGroup(row_group)->total_compressed_size();
}
ArrowFieldIndexUtil field_util(
format_settings.parquet.case_insensitive_column_matching,
@ -92,9 +109,9 @@ void ParquetBlockInputFormat::initializeIfNeeded()
column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
}
void ParquetBlockInputFormat::initializeRowGroupReader(size_t row_group_idx)
void ParquetBlockInputFormat::initializeRowGroupBatchReader(size_t row_group_batch_idx)
{
auto & row_group = row_groups[row_group_idx];
auto & row_group_batch = row_group_batches[row_group_batch_idx];
parquet::ArrowReaderProperties properties;
properties.set_use_threads(false);
@ -140,33 +157,30 @@ void ParquetBlockInputFormat::initializeRowGroupReader(size_t row_group_idx)
builder.Open(arrow_file, /* not to be confused with ArrowReaderProperties */ parquet::default_reader_properties(), metadata));
builder.properties(properties);
// TODO: Pass custom memory_pool() to enable memory accounting with non-jemalloc allocators.
THROW_ARROW_NOT_OK(builder.Build(&row_group.file_reader));
THROW_ARROW_NOT_OK(builder.Build(&row_group_batch.file_reader));
THROW_ARROW_NOT_OK(
row_group.file_reader->GetRecordBatchReader({static_cast<int>(row_group_idx)}, column_indices, &row_group.record_batch_reader));
row_group_batch.file_reader->GetRecordBatchReader(row_group_batch.row_groups_idxs, column_indices, &row_group_batch.record_batch_reader));
row_group.arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
row_group_batch.arrow_column_to_ch_column = std::make_unique<ArrowColumnToCHColumn>(
getPort().getHeader(),
"Parquet",
format_settings.parquet.allow_missing_columns,
format_settings.null_as_default,
format_settings.parquet.case_insensitive_column_matching);
row_group.row_group_bytes_uncompressed = metadata->RowGroup(static_cast<int>(row_group_idx))->total_compressed_size();
row_group.row_group_rows = metadata->RowGroup(static_cast<int>(row_group_idx))->num_rows();
}
void ParquetBlockInputFormat::scheduleRowGroup(size_t row_group_idx)
void ParquetBlockInputFormat::scheduleRowGroup(size_t row_group_batch_idx)
{
chassert(!mutex.try_lock());
auto & status = row_groups[row_group_idx].status;
chassert(status == RowGroupState::Status::NotStarted || status == RowGroupState::Status::Paused);
auto & status = row_group_batches[row_group_batch_idx].status;
chassert(status == RowGroupBatchState::Status::NotStarted || status == RowGroupBatchState::Status::Paused);
status = RowGroupState::Status::Running;
status = RowGroupBatchState::Status::Running;
pool->scheduleOrThrowOnError(
[this, row_group_idx, thread_group = CurrentThread::getGroup()]()
[this, row_group_batch_idx, thread_group = CurrentThread::getGroup()]()
{
if (thread_group)
CurrentThread::attachToGroupIfDetached(thread_group);
@ -176,7 +190,7 @@ void ParquetBlockInputFormat::scheduleRowGroup(size_t row_group_idx)
{
setThreadName("ParquetDecoder");
threadFunction(row_group_idx);
threadFunction(row_group_batch_idx);
}
catch (...)
{
@ -187,44 +201,44 @@ void ParquetBlockInputFormat::scheduleRowGroup(size_t row_group_idx)
});
}
void ParquetBlockInputFormat::threadFunction(size_t row_group_idx)
void ParquetBlockInputFormat::threadFunction(size_t row_group_batch_idx)
{
std::unique_lock lock(mutex);
auto & row_group = row_groups[row_group_idx];
chassert(row_group.status == RowGroupState::Status::Running);
auto & row_group_batch = row_group_batches[row_group_batch_idx];
chassert(row_group_batch.status == RowGroupBatchState::Status::Running);
while (true)
{
if (is_stopped || row_group.num_pending_chunks >= max_pending_chunks_per_row_group)
if (is_stopped || row_group_batch.num_pending_chunks >= max_pending_chunks_per_row_group_batch)
{
row_group.status = RowGroupState::Status::Paused;
row_group_batch.status = RowGroupBatchState::Status::Paused;
return;
}
decodeOneChunk(row_group_idx, lock);
decodeOneChunk(row_group_batch_idx, lock);
if (row_group.status == RowGroupState::Status::Done)
if (row_group_batch.status == RowGroupBatchState::Status::Done)
return;
}
}
void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_idx, std::unique_lock<std::mutex> & lock)
void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_batch_idx, std::unique_lock<std::mutex> & lock)
{
auto & row_group = row_groups[row_group_idx];
chassert(row_group.status != RowGroupState::Status::Done);
auto & row_group_batch = row_group_batches[row_group_batch_idx];
chassert(row_group_batch.status != RowGroupBatchState::Status::Done);
chassert(lock.owns_lock());
SCOPE_EXIT({ chassert(lock.owns_lock() || std::uncaught_exceptions()); });
lock.unlock();
auto end_of_row_group = [&] {
row_group.arrow_column_to_ch_column.reset();
row_group.record_batch_reader.reset();
row_group.file_reader.reset();
row_group_batch.arrow_column_to_ch_column.reset();
row_group_batch.record_batch_reader.reset();
row_group_batch.file_reader.reset();
lock.lock();
row_group.status = RowGroupState::Status::Done;
row_group_batch.status = RowGroupBatchState::Status::Done;
// We may be able to schedule more work now, but can't call scheduleMoreWorkIfNeeded() right
// here because we're running on the same thread pool, so it'll deadlock if thread limit is
@ -232,23 +246,11 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_idx, std::unique_l
condvar.notify_all();
};
if (!row_group.record_batch_reader)
{
if (skip_row_groups.contains(static_cast<int>(row_group_idx)))
{
// Pretend that the row group is empty.
// (We could avoid scheduling the row group on a thread in the first place. But the
// skip_row_groups feature is mostly unused, so it's better to be a little inefficient
// than to add a bunch of extra mostly-dead code for this.)
end_of_row_group();
return;
}
initializeRowGroupReader(row_group_idx);
}
if (!row_group_batch.record_batch_reader)
initializeRowGroupBatchReader(row_group_batch_idx);
auto batch = row_group.record_batch_reader->Next();
auto batch = row_group_batch.record_batch_reader->Next();
if (!batch.ok())
throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading Parquet data: {}", batch.status().ToString());
@ -260,44 +262,44 @@ void ParquetBlockInputFormat::decodeOneChunk(size_t row_group_idx, std::unique_l
auto tmp_table = arrow::Table::FromRecordBatches({*batch});
size_t approx_chunk_original_size = static_cast<size_t>(std::ceil(static_cast<double>(row_group.row_group_bytes_uncompressed) / row_group.row_group_rows * (*tmp_table)->num_rows()));
PendingChunk res = {.chunk_idx = row_group.next_chunk_idx, .row_group_idx = row_group_idx, .approx_original_chunk_size = approx_chunk_original_size};
size_t approx_chunk_original_size = static_cast<size_t>(std::ceil(static_cast<double>(row_group_batch.total_bytes_compressed) / row_group_batch.total_rows * (*tmp_table)->num_rows()));
PendingChunk res = {.chunk_idx = row_group_batch.next_chunk_idx, .row_group_batch_idx = row_group_batch_idx, .approx_original_chunk_size = approx_chunk_original_size};
/// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields.
/// Otherwise fill the missing columns with zero values of its type.
BlockMissingValues * block_missing_values_ptr = format_settings.defaults_for_omitted_fields ? &res.block_missing_values : nullptr;
row_group.arrow_column_to_ch_column->arrowTableToCHChunk(res.chunk, *tmp_table, (*tmp_table)->num_rows(), block_missing_values_ptr);
row_group_batch.arrow_column_to_ch_column->arrowTableToCHChunk(res.chunk, *tmp_table, (*tmp_table)->num_rows(), block_missing_values_ptr);
lock.lock();
++row_group.next_chunk_idx;
++row_group.num_pending_chunks;
++row_group_batch.next_chunk_idx;
++row_group_batch.num_pending_chunks;
pending_chunks.push(std::move(res));
condvar.notify_all();
}
void ParquetBlockInputFormat::scheduleMoreWorkIfNeeded(std::optional<size_t> row_group_touched)
void ParquetBlockInputFormat::scheduleMoreWorkIfNeeded(std::optional<size_t> row_group_batch_touched)
{
while (row_groups_completed < row_groups.size())
while (row_group_batches_completed < row_group_batches.size())
{
auto & row_group = row_groups[row_groups_completed];
if (row_group.status != RowGroupState::Status::Done || row_group.num_pending_chunks != 0)
auto & row_group = row_group_batches[row_group_batches_completed];
if (row_group.status != RowGroupBatchState::Status::Done || row_group.num_pending_chunks != 0)
break;
++row_groups_completed;
++row_group_batches_completed;
}
if (pool)
{
while (row_groups_started - row_groups_completed < max_decoding_threads &&
row_groups_started < row_groups.size())
scheduleRowGroup(row_groups_started++);
while (row_group_batches_started - row_group_batches_completed < max_decoding_threads &&
row_group_batches_started < row_group_batches.size())
scheduleRowGroup(row_group_batches_started++);
if (row_group_touched)
if (row_group_batch_touched)
{
auto & row_group = row_groups[*row_group_touched];
if (row_group.status == RowGroupState::Status::Paused &&
row_group.num_pending_chunks < max_pending_chunks_per_row_group)
scheduleRowGroup(*row_group_touched);
auto & row_group = row_group_batches[*row_group_batch_touched];
if (row_group.status == RowGroupBatchState::Status::Paused &&
row_group.num_pending_chunks < max_pending_chunks_per_row_group_batch)
scheduleRowGroup(*row_group_batch_touched);
}
}
}
@ -322,30 +324,30 @@ Chunk ParquetBlockInputFormat::generate()
if (!pending_chunks.empty() &&
(!format_settings.parquet.preserve_order ||
pending_chunks.top().row_group_idx == row_groups_completed))
pending_chunks.top().row_group_batch_idx == row_group_batches_completed))
{
PendingChunk chunk = std::move(const_cast<PendingChunk&>(pending_chunks.top()));
pending_chunks.pop();
auto & row_group = row_groups[chunk.row_group_idx];
auto & row_group = row_group_batches[chunk.row_group_batch_idx];
chassert(row_group.num_pending_chunks != 0);
chassert(chunk.chunk_idx == row_group.next_chunk_idx - row_group.num_pending_chunks);
--row_group.num_pending_chunks;
scheduleMoreWorkIfNeeded(chunk.row_group_idx);
scheduleMoreWorkIfNeeded(chunk.row_group_batch_idx);
previous_block_missing_values = std::move(chunk.block_missing_values);
previous_approx_bytes_read_for_chunk = chunk.approx_original_chunk_size;
return std::move(chunk.chunk);
}
if (row_groups_completed == row_groups.size())
if (row_group_batches_completed == row_group_batches.size())
return {};
if (pool)
condvar.wait(lock);
else
decodeOneChunk(row_groups_completed, lock);
decodeOneChunk(row_group_batches_completed, lock);
}
}
@ -358,12 +360,12 @@ void ParquetBlockInputFormat::resetParser()
arrow_file.reset();
metadata.reset();
column_indices.clear();
row_groups.clear();
row_group_batches.clear();
while (!pending_chunks.empty())
pending_chunks.pop();
row_groups_completed = 0;
row_group_batches_completed = 0;
previous_block_missing_values.clear();
row_groups_started = 0;
row_group_batches_started = 0;
background_exception = nullptr;
is_stopped = false;
@ -411,7 +413,7 @@ void registerInputFormatParquet(FormatFactory & factory)
size_t /* max_download_threads */,
size_t max_parsing_threads)
{
size_t min_bytes_for_seek = is_remote_fs ? read_settings.remote_read_min_bytes_for_seek : 8 * 1024;
size_t min_bytes_for_seek = is_remote_fs ? read_settings.remote_read_min_bytes_for_seek : settings.parquet.local_read_min_bytes_for_seek;
return std::make_shared<ParquetBlockInputFormat>(
buf,
sample,

View File

@ -52,6 +52,7 @@ public:
const FormatSettings & format_settings,
size_t max_decoding_threads,
size_t min_bytes_for_seek);
~ParquetBlockInputFormat() override;
void resetParser() override;
@ -71,14 +72,14 @@ private:
}
void initializeIfNeeded();
void initializeRowGroupReader(size_t row_group_idx);
void initializeRowGroupBatchReader(size_t row_group_batch_idx);
void decodeOneChunk(size_t row_group_idx, std::unique_lock<std::mutex> & lock);
void decodeOneChunk(size_t row_group_batch_idx, std::unique_lock<std::mutex> & lock);
void scheduleMoreWorkIfNeeded(std::optional<size_t> row_group_touched = std::nullopt);
void scheduleRowGroup(size_t row_group_idx);
void scheduleMoreWorkIfNeeded(std::optional<size_t> row_group_batch_touched = std::nullopt);
void scheduleRowGroup(size_t row_group_batch_idx);
void threadFunction(size_t row_group_idx);
void threadFunction(size_t row_group_batch_idx);
// Data layout in the file:
//
@ -165,7 +166,7 @@ private:
// * The max_pending_chunks_per_row_group limit could be based on actual memory usage too.
// Useful for preserve_order.
struct RowGroupState
struct RowGroupBatchState
{
// Transitions:
//
@ -202,8 +203,10 @@ private:
size_t next_chunk_idx = 0;
size_t num_pending_chunks = 0;
size_t row_group_bytes_uncompressed = 0;
size_t row_group_rows = 0;
size_t total_rows = 0;
size_t total_bytes_compressed = 0;
std::vector<int> row_groups_idxs;
// These are only used by the decoding thread, so don't require locking the mutex.
std::unique_ptr<parquet::arrow::FileReader> file_reader;
@ -217,7 +220,7 @@ private:
Chunk chunk;
BlockMissingValues block_missing_values;
size_t chunk_idx; // within row group
size_t row_group_idx;
size_t row_group_batch_idx;
size_t approx_original_chunk_size;
// For priority_queue.
@ -230,8 +233,8 @@ private:
bool operator()(const PendingChunk & a, const PendingChunk & b) const
{
auto tuplificate = [this](const PendingChunk & c)
{ return row_group_first ? std::tie(c.row_group_idx, c.chunk_idx)
: std::tie(c.chunk_idx, c.row_group_idx); };
{ return row_group_first ? std::tie(c.row_group_batch_idx, c.chunk_idx)
: std::tie(c.chunk_idx, c.row_group_batch_idx); };
return tuplificate(a) > tuplificate(b);
}
};
@ -241,7 +244,7 @@ private:
const std::unordered_set<int> & skip_row_groups;
size_t max_decoding_threads;
size_t min_bytes_for_seek;
const size_t max_pending_chunks_per_row_group = 2;
const size_t max_pending_chunks_per_row_group_batch = 2;
// RandomAccessFile is thread safe, so we share it among threads.
// FileReader is not, so each thread creates its own.
@ -264,12 +267,12 @@ private:
// Wakes up the generate() call, if any.
std::condition_variable condvar;
std::vector<RowGroupState> row_groups;
std::vector<RowGroupBatchState> row_group_batches;
std::priority_queue<PendingChunk, std::vector<PendingChunk>, PendingChunk::Compare> pending_chunks;
size_t row_groups_completed = 0;
size_t row_group_batches_completed = 0;
// These are only used when max_decoding_threads > 1.
size_t row_groups_started = 0;
size_t row_group_batches_started = 0;
std::unique_ptr<ThreadPool> pool;
BlockMissingValues previous_block_missing_values;

View File

@ -103,7 +103,8 @@ ReadFromRemote::ReadFromRemote(
Tables external_tables_,
Poco::Logger * log_,
UInt32 shard_count_,
std::shared_ptr<const StorageLimitsList> storage_limits_)
std::shared_ptr<const StorageLimitsList> storage_limits_,
const String & cluster_name_)
: ISourceStep(DataStream{.header = std::move(header_)})
, shards(std::move(shards_))
, stage(stage_)
@ -116,6 +117,7 @@ ReadFromRemote::ReadFromRemote(
, storage_limits(std::move(storage_limits_))
, log(log_)
, shard_count(shard_count_)
, cluster_name(cluster_name_)
{
}
@ -234,13 +236,37 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact
scalars["_shard_num"]
= Block{{DataTypeUInt32().createColumnConst(1, shard.shard_info.shard_num), std::make_shared<DataTypeUInt32>(), "_shard_num"}};
if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS)
{
if (context->getSettingsRef().cluster_for_parallel_replicas.changed)
{
const String cluster_for_parallel_replicas = context->getSettingsRef().cluster_for_parallel_replicas;
if (cluster_for_parallel_replicas != cluster_name)
LOG_INFO(log, "cluster_for_parallel_replicas has been set for the query but has no effect: {}. Distributed table cluster is used: {}",
cluster_for_parallel_replicas, cluster_name);
}
context->setSetting("cluster_for_parallel_replicas", cluster_name);
}
std::shared_ptr<RemoteQueryExecutor> remote_query_executor;
remote_query_executor = std::make_shared<RemoteQueryExecutor>(
shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage);
remote_query_executor->setLogger(log);
remote_query_executor->setPoolMode(PoolMode::GET_MANY);
if (context->getParallelReplicasMode() == Context::ParallelReplicasMode::READ_TASKS)
{
// When doing parallel reading from replicas (ParallelReplicasMode::READ_TASKS) on a shard:
// we establish a connection to one replica of the shard, and that replica instantiates a coordinator to manage parallel reading from the replicas of this shard.
// The coordinator returns the query result for the shard.
// Only one coordinator per shard is necessary, therefore PoolMode::GET_ONE is used to establish only one connection per shard.
// Using PoolMode::GET_MANY in this mode can lead to instantiation of several coordinators (depending on the max_parallel_replicas setting),
// each executing parallel reading from replicas, so the query result would be multiplied by the number of created coordinators.
remote_query_executor->setPoolMode(PoolMode::GET_ONE);
}
else
remote_query_executor->setPoolMode(PoolMode::GET_MANY);
if (!table_func_ptr)
remote_query_executor->setMainTable(shard.main_table ? shard.main_table : main_table);

View File

@ -35,7 +35,8 @@ public:
Tables external_tables_,
Poco::Logger * log_,
UInt32 shard_count_,
std::shared_ptr<const StorageLimitsList> storage_limits_);
std::shared_ptr<const StorageLimitsList> storage_limits_,
const String & cluster_name_);
String getName() const override { return "ReadFromRemote"; }
@ -55,8 +56,9 @@ private:
Tables external_tables;
std::shared_ptr<const StorageLimitsList> storage_limits;
Poco::Logger * log;
UInt32 shard_count;
String cluster_name;
void addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard);
void addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard);
};

View File

@ -349,7 +349,7 @@ namespace
{
for (auto && send_data_task : send_data_tasks)
{
send_data_threads.emplace_back([task = std::move(send_data_task), this]()
send_data_threads.emplace_back([task = std::move(send_data_task), this]() mutable
{
try
{
@ -359,6 +359,10 @@ namespace
{
std::lock_guard lock(send_data_lock);
exception_during_send_data = std::current_exception();
/// The task must be reset inside the catch block, otherwise it breaks destructor
/// invariants such as in ~WriteBuffer.
task = {};
}
});
}

View File

@ -61,6 +61,7 @@ KafkaConsumer::KafkaConsumer(
, stopped(stopped_)
, current(messages.begin())
, topics(_topics)
, exceptions_buffer(EXCEPTIONS_DEPTH)
{
// called (synchronously, during poll) when we enter the consumer group
consumer->set_assignment_callback([this](const cppkafka::TopicPartitionList & topic_partitions)
@ -79,6 +80,7 @@ KafkaConsumer::KafkaConsumer(
}
assignment = topic_partitions;
num_rebalance_assignments++;
});
// called (synchronously, during poll) when we leave the consumer group
@ -106,6 +108,8 @@ KafkaConsumer::KafkaConsumer(
cleanUnprocessed();
stalled_status = REBALANCE_HAPPENED;
last_rebalance_timestamp_usec = static_cast<UInt64>(Poco::Timestamp().epochTime());
assignment.reset();
waited_for_assignment = 0;
@ -118,12 +122,14 @@ KafkaConsumer::KafkaConsumer(
// {
// LOG_WARNING(log, "Commit error: {}", e.what());
// }
num_rebalance_revocations++;
});
consumer->set_rebalance_error_callback([this](cppkafka::Error err)
{
LOG_ERROR(log, "Rebalance error: {}", err);
ProfileEvents::increment(ProfileEvents::KafkaRebalanceErrors);
setExceptionInfo(err);
});
}
@ -177,6 +183,7 @@ void KafkaConsumer::drain()
else
{
LOG_ERROR(log, "Error during draining: {}", error);
setExceptionInfo(error);
}
}
@ -251,6 +258,8 @@ void KafkaConsumer::commit()
consumer->commit();
committed = true;
print_offsets("Committed offset", consumer->get_offsets_committed(consumer->get_assignment()));
last_commit_timestamp_usec = static_cast<UInt64>(Poco::Timestamp().epochTime());
num_commits += 1;
}
catch (const cppkafka::HandleException & e)
{
@ -259,7 +268,10 @@ void KafkaConsumer::commit()
if (e.get_error() == RD_KAFKA_RESP_ERR__NO_OFFSET)
committed = true;
else
{
LOG_ERROR(log, "Exception during commit attempt: {}", e.what());
setExceptionInfo(e.what());
}
}
--max_retries;
}
@ -399,6 +411,8 @@ ReadBufferPtr KafkaConsumer::consume()
/// Don't drop old messages immediately, since we may need them for virtual columns.
auto new_messages = consumer->poll_batch(batch_size,
std::chrono::milliseconds(actual_poll_timeout_ms));
last_poll_timestamp_usec = static_cast<UInt64>(Poco::Timestamp().epochTime());
num_messages_read += new_messages.size();
resetIfStopped();
if (stalled_status == CONSUMER_STOPPED)
@ -495,6 +509,7 @@ size_t KafkaConsumer::filterMessageErrors()
{
ProfileEvents::increment(ProfileEvents::KafkaConsumerErrors);
LOG_ERROR(log, "Consumer error: {}", error);
setExceptionInfo(error);
return true;
}
return false;
@ -527,4 +542,64 @@ void KafkaConsumer::storeLastReadMessageOffset()
}
}
void KafkaConsumer::setExceptionInfo(const cppkafka::Error & err)
{
setExceptionInfo(err.to_string());
}
void KafkaConsumer::setExceptionInfo(const String & text)
{
std::lock_guard<std::mutex> lock(exception_mutex);
exceptions_buffer.push_back({text, static_cast<UInt64>(Poco::Timestamp().epochTime())});
}
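/// exceptions_buffer is a boost::circular_buffer constructed with capacity EXCEPTIONS_DEPTH (10),
/// so pushing into a full buffer silently evicts the oldest entry and the per-consumer exception
/// history stays bounded without any explicit trimming.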
/*
* Needed until
* https://github.com/mfontanini/cppkafka/pull/309
* is merged,
* because consumer->get_member_id() leaks memory.
*/
std::string KafkaConsumer::getMemberId() const
{
char * memberid_ptr = rd_kafka_memberid(consumer->get_handle());
std::string memberid_string = memberid_ptr;
rd_kafka_mem_free(nullptr, memberid_ptr);
return memberid_string;
}
KafkaConsumer::Stat KafkaConsumer::getStat() const
{
KafkaConsumer::Stat::Assignments assignments;
auto cpp_assignments = consumer->get_assignment();
auto cpp_offsets = consumer->get_offsets_position(cpp_assignments);
for (size_t num = 0; num < cpp_assignments.size(); ++num)
{
assignments.push_back({
cpp_assignments[num].get_topic(),
cpp_assignments[num].get_partition(),
cpp_offsets[num].get_offset(),
});
}
return {
.consumer_id = getMemberId() /* consumer->get_member_id() */ ,
.assignments = std::move(assignments),
.last_poll_time = last_poll_timestamp_usec.load(),
.num_messages_read = num_messages_read.load(),
.last_commit_timestamp_usec = last_commit_timestamp_usec.load(),
.last_rebalance_timestamp_usec = last_rebalance_timestamp_usec.load(),
.num_commits = num_commits.load(),
.num_rebalance_assignments = num_rebalance_assignments.load(),
.num_rebalance_revocations = num_rebalance_revocations.load(),
.exceptions_buffer = [&](){std::lock_guard<std::mutex> lock(exception_mutex);
return exceptions_buffer;}(),
.in_use = in_use.load(),
.rdkafka_stat = [&](){std::lock_guard<std::mutex> lock(rdkafka_stat_mutex);
return rdkafka_stat;}(),
};
}
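/// A hedged sketch of how the snapshot could be consumed, e.g. when filling system.kafka_consumers
/// (the surrounding iteration and the logger are assumptions, not shown in this diff):
///
///     KafkaConsumer::Stat stat = consumer->getStat();
///     for (const auto & a : stat.assignments)
///         LOG_TRACE(log, "assigned {} [{}] at offset {}", a.topic_str, a.partition_id, a.current_offset);
///     for (const auto & e : stat.exceptions_buffer)
///         LOG_TRACE(log, "exception at {}: {}", e.timestamp_usec, e.text);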
}

View File

@ -1,5 +1,7 @@
#pragma once
#include <boost/circular_buffer.hpp>
#include <Core/Names.h>
#include <base/types.h>
#include <IO/ReadBuffer.h>
@ -20,10 +22,44 @@ namespace Poco
namespace DB
{
class StorageSystemKafkaConsumers;
using ConsumerPtr = std::shared_ptr<cppkafka::Consumer>;
class KafkaConsumer
{
public:
struct ExceptionInfo
{
String text;
UInt64 timestamp_usec;
};
using ExceptionsBuffer = boost::circular_buffer<ExceptionInfo>;
struct Stat // system.kafka_consumers data
{
struct Assignment
{
String topic_str;
Int32 partition_id;
Int64 current_offset;
};
using Assignments = std::vector<Assignment>;
String consumer_id;
Assignments assignments;
UInt64 last_poll_time;
UInt64 num_messages_read;
UInt64 last_commit_timestamp_usec;
UInt64 last_rebalance_timestamp_usec;
UInt64 num_commits;
UInt64 num_rebalance_assignments;
UInt64 num_rebalance_revocations;
KafkaConsumer::ExceptionsBuffer exceptions_buffer;
bool in_use;
std::string rdkafka_stat;
};
public:
KafkaConsumer(
ConsumerPtr consumer_,
@ -69,6 +105,18 @@ public:
auto currentTimestamp() const { return current[-1].get_timestamp(); }
const auto & currentHeaderList() const { return current[-1].get_header_list(); }
String currentPayload() const { return current[-1].get_payload(); }
void setExceptionInfo(const cppkafka::Error & err);
void setExceptionInfo(const String & text);
void setRDKafkaStat(const std::string & stat_json_string)
{
std::lock_guard<std::mutex> lock(rdkafka_stat_mutex);
rdkafka_stat = stat_json_string;
}
void inUse() { in_use = true; }
void notInUse() { in_use = false; }
// For system.kafka_consumers
Stat getStat() const;
private:
using Messages = std::vector<cppkafka::Message>;
@ -105,12 +153,33 @@ private:
std::optional<cppkafka::TopicPartitionList> assignment;
const Names topics;
/// system.kafka_consumers data is retrieved asynchronously
/// so we have to protect exceptions_buffer
mutable std::mutex exception_mutex;
const size_t EXCEPTIONS_DEPTH = 10;
ExceptionsBuffer exceptions_buffer;
std::atomic<UInt64> last_exception_timestamp_usec = 0;
std::atomic<UInt64> last_poll_timestamp_usec = 0;
std::atomic<UInt64> num_messages_read = 0;
std::atomic<UInt64> last_commit_timestamp_usec = 0;
std::atomic<UInt64> num_commits = 0;
std::atomic<UInt64> last_rebalance_timestamp_usec = 0;
std::atomic<UInt64> num_rebalance_assignments = 0;
std::atomic<UInt64> num_rebalance_revocations = 0;
std::atomic<bool> in_use = 0;
mutable std::mutex rdkafka_stat_mutex;
std::string rdkafka_stat;
void drain();
void cleanUnprocessed();
void resetIfStopped();
/// Return number of messages with an error.
size_t filterMessageErrors();
ReadBufferPtr getNextMessage();
std::string getMemberId() const;
};
}

View File

@ -133,6 +133,7 @@ Chunk KafkaSource::generateImpl()
{
e.addMessage("while parsing Kafka message (topic: {}, partition: {}, offset: {})'",
consumer->currentTopic(), consumer->currentPartition(), consumer->currentOffset());
consumer->setExceptionInfo(e.message());
throw std::move(e);
}
};

View File

@ -416,7 +416,9 @@ void StorageKafka::startup()
{
try
{
pushConsumer(createConsumer(i));
auto consumer = createConsumer(i);
pushConsumer(consumer);
all_consumers.push_back(consumer);
++num_created_consumers;
}
catch (const cppkafka::Exception &)
@ -456,6 +458,7 @@ void StorageKafka::shutdown()
void StorageKafka::pushConsumer(KafkaConsumerPtr consumer)
{
std::lock_guard lock(mutex);
consumer->notInUse();
consumers.push_back(consumer);
semaphore.set();
CurrentMetrics::sub(CurrentMetrics::KafkaConsumersInUse, 1);
@ -484,6 +487,7 @@ KafkaConsumerPtr StorageKafka::popConsumer(std::chrono::milliseconds timeout)
auto consumer = consumers.back();
consumers.pop_back();
CurrentMetrics::add(CurrentMetrics::KafkaConsumersInUse, 1);
consumer->inUse();
return consumer;
}
@ -512,7 +516,11 @@ KafkaConsumerPtr StorageKafka::createConsumer(size_t consumer_number)
size_t default_queued_min_messages = 100000; // we don't want to decrease the default
conf.set("queued.min.messages", std::max(getMaxBlockSize(),default_queued_min_messages));
updateConfiguration(conf);
/// The statistics callback needs a reference to the consumer,
/// but the consumer does not exist yet when the callback is registered,
/// so a shared_ptr<weak_ptr<KafkaConsumer>> is used: the weak_ptr is filled in once the consumer is created.
auto consumer_weak_ptr_ptr = std::make_shared<KafkaConsumerWeakPtr>();
updateConfiguration(conf, consumer_weak_ptr_ptr);
// those settings should not be changed by users.
conf.set("enable.auto.commit", "false"); // We manually commit offsets after a stream successfully finished
@ -523,13 +531,20 @@ KafkaConsumerPtr StorageKafka::createConsumer(size_t consumer_number)
auto consumer_impl = std::make_shared<cppkafka::Consumer>(conf);
consumer_impl->set_destroy_flags(RD_KAFKA_DESTROY_F_NO_CONSUMER_CLOSE);
KafkaConsumerPtr kafka_consumer_ptr;
/// NOTE: we pass |stream_cancelled| by reference here, so the buffers should not outlive the storage.
if (thread_per_consumer)
{
auto& stream_cancelled = tasks[consumer_number]->stream_cancelled;
return std::make_shared<KafkaConsumer>(consumer_impl, log, getPollMaxBatchSize(), getPollTimeoutMillisecond(), intermediate_commit, stream_cancelled, topics);
kafka_consumer_ptr = std::make_shared<KafkaConsumer>(consumer_impl, log, getPollMaxBatchSize(), getPollTimeoutMillisecond(), intermediate_commit, stream_cancelled, topics);
}
return std::make_shared<KafkaConsumer>(consumer_impl, log, getPollMaxBatchSize(), getPollTimeoutMillisecond(), intermediate_commit, tasks.back()->stream_cancelled, topics);
else
{
kafka_consumer_ptr = std::make_shared<KafkaConsumer>(consumer_impl, log, getPollMaxBatchSize(), getPollTimeoutMillisecond(), intermediate_commit, tasks.back()->stream_cancelled, topics);
}
*consumer_weak_ptr_ptr = kafka_consumer_ptr;
return kafka_consumer_ptr;
}
size_t StorageKafka::getMaxBlockSize() const
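The shared_ptr<weak_ptr<KafkaConsumer>> trick mentioned in the comment above is worth a standalone illustration: the statistics callback has to be registered while the configuration is being built, before the consumer exists, so it captures a shared "slot" that is filled in only after the consumer has been created. A hedged, self-contained sketch follows; Consumer, makeStatsCallback and setStat are made-up names for the example, not the real API.

#include <functional>
#include <iostream>
#include <memory>
#include <string>

struct Consumer { void setStat(const std::string & s) { std::cout << "stat: " << s << "\n"; } };
using ConsumerWeakPtr = std::weak_ptr<Consumer>;

// The callback is created before the Consumer object exists; a shared_ptr
// to a weak_ptr gives it a stable slot that is filled in afterwards.
std::function<void(const std::string &)> makeStatsCallback(std::shared_ptr<ConsumerWeakPtr> slot)
{
    return [slot](const std::string & stat_json)
    {
        if (auto consumer = slot->lock())   // no-op until the slot is filled
            consumer->setStat(stat_json);
    };
}

int main()
{
    auto slot = std::make_shared<ConsumerWeakPtr>();
    auto callback = makeStatsCallback(slot);   // registered "too early"

    callback("{}");                            // consumer not created yet: ignored

    auto consumer = std::make_shared<Consumer>();
    *slot = consumer;                          // late binding, as in createConsumer()

    callback("{\"type\":\"consumer\"}");       // now reaches the consumer
}

Because the slot holds only a weak_ptr, the callback also degrades to a no-op once the consumer is destroyed instead of keeping it alive.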
@ -562,7 +577,8 @@ String StorageKafka::getConfigPrefix() const
return CONFIG_KAFKA_TAG;
}
void StorageKafka::updateConfiguration(cppkafka::Configuration & kafka_config)
void StorageKafka::updateConfiguration(cppkafka::Configuration & kafka_config,
std::shared_ptr<KafkaConsumerWeakPtr> kafka_consumer_weak_ptr_ptr)
{
// Update consumer configuration from the configuration. Example:
// <kafka>
@ -642,6 +658,26 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & kafka_config)
LOG_IMPL(log, client_logs_level, poco_level, "[rdk:{}] {}", facility, message);
});
if (kafka_consumer_weak_ptr_ptr)
{
if (!config.has(config_prefix + "." + "statistics_interval_ms"))
{
kafka_config.set("statistics.interval.ms", "3000"); // every 3 seconds by default. set to 0 to disable.
}
if (kafka_config.get("statistics.interval.ms") != "0")
{
kafka_config.set_stats_callback([kafka_consumer_weak_ptr_ptr](cppkafka::KafkaHandleBase &, const std::string & stat_json_string)
{
auto kafka_consumer_ptr = kafka_consumer_weak_ptr_ptr->lock();
if (kafka_consumer_ptr)
{
kafka_consumer_ptr->setRDKafkaStat(stat_json_string);
}
});
}
}
// Configure interceptor to change thread name
//
// TODO: add interceptors support into the cppkafka.
@ -952,7 +988,7 @@ void registerStorageKafka(StorageFactory & factory)
"of getting data from Kafka, consider using a setting kafka_thread_per_consumer=1, "
"and ensure you have enough threads "
"in MessageBrokerSchedulePool (background_message_broker_schedule_pool_size). "
"See also https://clickhouse.com/docs/integrations/kafka/kafka-table-engine#tuning-performance", max_consumers);
"See also https://clickhouse.com/docs/en/integrations/kafka#tuning-performance", max_consumers);
}
else if (num_consumers < 1)
{

View File

@ -23,9 +23,12 @@ class Configuration;
namespace DB
{
class StorageSystemKafkaConsumers;
struct StorageKafkaInterceptors;
using KafkaConsumerPtr = std::shared_ptr<KafkaConsumer>;
using KafkaConsumerWeakPtr = std::weak_ptr<KafkaConsumer>;
/** Implements a Kafka queue table engine that can be used as a persistent queue / buffer,
* or as a basic building block for creating pipelines with a continuous insertion / ETL.
@ -77,6 +80,15 @@ public:
Names getVirtualColumnNames() const;
HandleKafkaErrorMode getHandleKafkaErrorMode() const { return kafka_settings->kafka_handle_error_mode; }
struct SafeConsumers
{
std::shared_ptr<IStorage> storage_ptr;
std::unique_lock<std::mutex> lock;
std::vector<KafkaConsumerWeakPtr> & consumers;
};
SafeConsumers getSafeConsumers() { return {shared_from_this(), std::unique_lock(mutex), all_consumers}; }
private:
// Configuration and state
std::unique_ptr<KafkaSettings> kafka_settings;
@ -101,6 +113,7 @@ private:
size_t num_created_consumers = 0; /// number of actually created consumers.
std::vector<KafkaConsumerPtr> consumers; /// available consumers
std::vector<KafkaConsumerWeakPtr> all_consumers; /// busy (belong to a KafkaSource) and vacant consumers
std::mutex mutex;
@ -129,7 +142,12 @@ private:
std::atomic<bool> shutdown_called = false;
// Update Kafka configuration with values from CH user configuration.
void updateConfiguration(cppkafka::Configuration & kafka_config);
void updateConfiguration(cppkafka::Configuration & kafka_config, std::shared_ptr<KafkaConsumerWeakPtr>);
void updateConfiguration(cppkafka::Configuration & kafka_config)
{
updateConfiguration(kafka_config, std::make_shared<KafkaConsumerWeakPtr>());
}
String getConfigPrefix() const;
void threadFunc(size_t idx);
@ -142,6 +160,7 @@ private:
bool streamToViews();
bool checkDependencies(const StorageID & table_id);
};
}
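getSafeConsumers() returns an aggregate rather than a bare reference on purpose: the shared_ptr obtained via shared_from_this() keeps the storage alive, and the unique_lock keeps its mutex held, for as long as the caller (the system.kafka_consumers implementation) iterates over all_consumers. Below is a small sketch of the same idea under simplified, placeholder names (Storage, Consumer, addConsumer); it is an illustration of the pattern, not the actual classes.

#include <memory>
#include <mutex>
#include <vector>

struct Consumer { int id = 0; };

class Storage : public std::enable_shared_from_this<Storage>
{
public:
    struct SafeConsumers
    {
        std::shared_ptr<Storage> keep_alive;               // keeps the storage from being destroyed
        std::unique_lock<std::mutex> lock;                 // keeps the consumer list stable
        std::vector<std::weak_ptr<Consumer>> & consumers;
    };

    SafeConsumers getSafeConsumers() { return {shared_from_this(), std::unique_lock(mutex), all_consumers}; }

    void addConsumer(std::shared_ptr<Consumer> c)
    {
        std::lock_guard guard(mutex);
        all_consumers.push_back(c);
    }

private:
    std::mutex mutex;
    std::vector<std::weak_ptr<Consumer>> all_consumers;
};

int main()
{
    auto storage = std::make_shared<Storage>();
    auto consumer = std::make_shared<Consumer>();
    storage->addConsumer(consumer);

    auto safe = storage->getSafeConsumers();       // mutex is held from here on
    for (auto & weak : safe.consumers)
        if (auto locked = weak.lock())             // skip consumers that were already destroyed
            ++locked->id;
}                                                  // lock and keep-alive reference released here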

View File

@ -220,7 +220,8 @@ void StorageMergeTree::read(
local_context, query_info.query,
table_id.database_name, table_id.table_name, /*remote_table_function_ptr*/nullptr);
auto cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas);
String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas;
auto cluster = local_context->getCluster(cluster_for_parallel_replicas);
Block header;

View File

@ -5157,7 +5157,9 @@ void StorageReplicatedMergeTree::readParallelReplicasImpl(
{
auto table_id = getStorageID();
auto parallel_replicas_cluster = local_context->getCluster(local_context->getSettingsRef().cluster_for_parallel_replicas);
auto scalars = local_context->hasQueryContext() ? local_context->getQueryContext()->getScalars() : Scalars{};
String cluster_for_parallel_replicas = local_context->getSettingsRef().cluster_for_parallel_replicas;
auto parallel_replicas_cluster = local_context->getCluster(cluster_for_parallel_replicas);
ASTPtr modified_query_ast;
Block header;

View File

@ -0,0 +1,175 @@
#include "config.h"
#if USE_RDKAFKA
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnsDateTime.h>
#include <Access/ContextAccess.h>
#include <Storages/System/StorageSystemKafkaConsumers.h>
#include <Storages/Kafka/StorageKafka.h>
#include <Interpreters/Context.h>
#include <Interpreters/DatabaseCatalog.h>
#include "base/types.h"
namespace DB
{
NamesAndTypesList StorageSystemKafkaConsumers::getNamesAndTypes()
{
NamesAndTypesList names_and_types{
{"database", std::make_shared<DataTypeString>()},
{"table", std::make_shared<DataTypeString>()},
{"consumer_id", std::make_shared<DataTypeString>()}, //(number? or string? - single clickhouse table can have many consumers)
{"assignments.topic", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
{"assignments.partition_id", std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt32>())},
{"assignments.current_offset", std::make_shared<DataTypeArray>(std::make_shared<DataTypeInt64>())},
{"exceptions.time", std::make_shared<DataTypeArray>(std::make_shared<DataTypeDateTime>())},
{"exceptions.text", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
{"last_poll_time", std::make_shared<DataTypeDateTime>()},
{"num_messages_read", std::make_shared<DataTypeUInt64>()},
{"last_commit_time", std::make_shared<DataTypeDateTime>()},
{"num_commits", std::make_shared<DataTypeUInt64>()},
{"last_rebalance_time", std::make_shared<DataTypeDateTime>()},
{"num_rebalance_revocations", std::make_shared<DataTypeUInt64>()},
{"num_rebalance_assignments", std::make_shared<DataTypeUInt64>()},
{"is_currently_used", std::make_shared<DataTypeUInt8>()},
{"rdkafka_stat", std::make_shared<DataTypeString>()},
};
return names_and_types;
}
void StorageSystemKafkaConsumers::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const
{
auto tables_mark_dropped = DatabaseCatalog::instance().getTablesMarkedDropped();
size_t index = 0;
auto & database = assert_cast<ColumnString &>(*res_columns[index++]);
auto & table = assert_cast<ColumnString &>(*res_columns[index++]);
auto & consumer_id = assert_cast<ColumnString &>(*res_columns[index++]); // a string: a single ClickHouse table can have many consumers
auto & assignments_topics = assert_cast<ColumnString &>(assert_cast<ColumnArray &>(*res_columns[index]).getData());
auto & assignments_topics_offsets = assert_cast<ColumnArray &>(*res_columns[index++]).getOffsets();
auto & assignments_partition_id = assert_cast<ColumnInt32 &>(assert_cast<ColumnArray &>(*res_columns[index]).getData());
auto & assignments_partition_id_offsets = assert_cast<ColumnArray &>(*res_columns[index++]).getOffsets();
auto & assignments_current_offset = assert_cast<ColumnInt64 &>(assert_cast<ColumnArray &>(*res_columns[index]).getData());
auto & assignments_current_offset_offsets = assert_cast<ColumnArray &>(*res_columns[index++]).getOffsets();
auto & exceptions_time = assert_cast<ColumnDateTime &>(assert_cast<ColumnArray &>(*res_columns[index]).getData());
auto & exceptions_time_offset = assert_cast<ColumnArray &>(*res_columns[index++]).getOffsets();
auto & exceptions_text = assert_cast<ColumnString &>(assert_cast<ColumnArray &>(*res_columns[index]).getData());
auto & exceptions_text_offset = assert_cast<ColumnArray &>(*res_columns[index++]).getOffsets();
auto & last_poll_time = assert_cast<ColumnDateTime &>(*res_columns[index++]);
auto & num_messages_read = assert_cast<ColumnUInt64 &>(*res_columns[index++]);
auto & last_commit_time = assert_cast<ColumnDateTime &>(*res_columns[index++]);
auto & num_commits = assert_cast<ColumnUInt64 &>(*res_columns[index++]);
auto & last_rebalance_time = assert_cast<ColumnDateTime &>(*res_columns[index++]);
auto & num_rebalance_revocations = assert_cast<ColumnUInt64 &>(*res_columns[index++]);
auto & num_rebalance_assignments = assert_cast<ColumnUInt64 &>(*res_columns[index++]);
auto & is_currently_used = assert_cast<ColumnUInt8 &>(*res_columns[index++]);
auto & rdkafka_stat = assert_cast<ColumnString &>(*res_columns[index++]);
const auto access = context->getAccess();
size_t last_assignment_num = 0;
size_t exceptions_num = 0;
auto add_row = [&](const DatabaseTablesIteratorPtr & it, StorageKafka * storage_kafka_ptr)
{
if (!access->isGranted(AccessType::SHOW_TABLES, it->databaseName(), it->name()))
{
return;
}
std::string database_str = it->databaseName();
std::string table_str = it->name();
auto safe_consumers = storage_kafka_ptr->getSafeConsumers();
for (const auto & weak_consumer : safe_consumers.consumers)
{
if (auto consumer = weak_consumer.lock())
{
auto consumer_stat = consumer->getStat();
database.insertData(database_str.data(), database_str.size());
table.insertData(table_str.data(), table_str.size());
consumer_id.insertData(consumer_stat.consumer_id.data(), consumer_stat.consumer_id.size());
const auto num_assignments = consumer_stat.assignments.size();
for (size_t num = 0; num < num_assignments; ++num)
{
const auto & assign = consumer_stat.assignments[num];
assignments_topics.insertData(assign.topic_str.data(), assign.topic_str.size());
assignments_partition_id.insert(assign.partition_id);
assignments_current_offset.insert(assign.current_offset);
}
last_assignment_num += num_assignments;
assignments_topics_offsets.push_back(last_assignment_num);
assignments_partition_id_offsets.push_back(last_assignment_num);
assignments_current_offset_offsets.push_back(last_assignment_num);
for (const auto & exc : consumer_stat.exceptions_buffer)
{
exceptions_text.insertData(exc.text.data(), exc.text.size());
exceptions_time.insert(exc.timestamp_usec);
}
exceptions_num += consumer_stat.exceptions_buffer.size();
exceptions_text_offset.push_back(exceptions_num);
exceptions_time_offset.push_back(exceptions_num);
last_poll_time.insert(consumer_stat.last_poll_time);
num_messages_read.insert(consumer_stat.num_messages_read);
last_commit_time.insert(consumer_stat.last_commit_timestamp_usec);
num_commits.insert(consumer_stat.num_commits);
last_rebalance_time.insert(consumer_stat.last_rebalance_timestamp_usec);
num_rebalance_revocations.insert(consumer_stat.num_rebalance_revocations);
num_rebalance_assignments.insert(consumer_stat.num_rebalance_assignments);
is_currently_used.insert(consumer_stat.in_use);
rdkafka_stat.insertData(consumer_stat.rdkafka_stat.data(), consumer_stat.rdkafka_stat.size());
}
}
};
const bool show_tables_granted = access->isGranted(AccessType::SHOW_TABLES);
if (show_tables_granted)
{
auto databases = DatabaseCatalog::instance().getDatabases();
for (const auto & db : databases)
{
for (auto iterator = db.second->getTablesIterator(context); iterator->isValid(); iterator->next())
{
StoragePtr storage = iterator->table();
if (auto * kafka_table = dynamic_cast<StorageKafka *>(storage.get()))
{
add_row(iterator, kafka_table);
}
}
}
}
}
}
#endif
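The assignments.* and exceptions.* columns above are Nested (Array) columns: each is filled as a flat data column plus a shared offsets column, where every row appends its elements to the data and then records the running element count as its offset. A minimal standalone sketch of that layout follows, using plain vectors instead of the real ColumnArray classes; ArrayColumn and appendRow are made up for the example.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Simplified stand-in for an Array(String) column: flat data + cumulative offsets.
struct ArrayColumn
{
    std::vector<std::string> data;     // all elements of all rows, concatenated
    std::vector<size_t> offsets;       // offsets[i] = total number of elements after row i

    void appendRow(const std::vector<std::string> & row)
    {
        data.insert(data.end(), row.begin(), row.end());
        offsets.push_back(data.size());          // running total, like push_back(last_assignment_num)
    }

    // Row i spans data[offsets[i - 1] .. offsets[i]).
    std::vector<std::string> row(size_t i) const
    {
        size_t begin = i == 0 ? 0 : offsets[i - 1];
        return {data.begin() + begin, data.begin() + offsets[i]};
    }
};

int main()
{
    ArrayColumn assignments_topic;
    assignments_topic.appendRow({"system_kafka_cons"});    // consumer with one assignment
    assignments_topic.appendRow({});                       // consumer with no assignments
    assignments_topic.appendRow({"topic_a", "topic_b"});   // consumer with two assignments

    for (size_t i = 0; i < assignments_topic.offsets.size(); ++i)
        std::cout << "row " << i << " has " << assignments_topic.row(i).size() << " element(s)\n";
}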

View File

@ -0,0 +1,27 @@
#pragma once
#include "config.h"
#if USE_RDKAFKA
#include <Storages/System/IStorageSystemOneBlock.h>
namespace DB
{
class StorageSystemKafkaConsumers final : public IStorageSystemOneBlock<StorageSystemKafkaConsumers>
{
public:
std::string getName() const override { return "SystemKafkaConsumers"; }
static NamesAndTypesList getNamesAndTypes();
protected:
using IStorageSystemOneBlock::IStorageSystemOneBlock;
void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override;
};
}
#endif

View File

@ -84,6 +84,10 @@
#include <Storages/System/StorageSystemZooKeeperConnection.h>
#include <Storages/System/StorageSystemJemalloc.h>
#if USE_RDKAFKA
#include <Storages/System/StorageSystemKafkaConsumers.h>
#endif
#ifdef OS_LINUX
#include <Storages/System/StorageSystemStackTrace.h>
#endif
@ -144,6 +148,9 @@ void attachSystemTablesLocal(ContextPtr context, IDatabase & system_database)
attach<StorageSystemBackups>(context, system_database, "backups");
attach<StorageSystemSchemaInferenceCache>(context, system_database, "schema_inference_cache");
attach<StorageSystemDroppedTables>(context, system_database, "dropped_tables");
#if USE_RDKAFKA
attach<StorageSystemKafkaConsumers>(context, system_database, "kafka_consumers");
#endif
#ifdef OS_LINUX
attach<StorageSystemStackTrace>(context, system_database, "stack_trace");
#endif

View File

@ -176,6 +176,38 @@
</replica>
</shard-->
</test_cluster_one_shard_three_replicas_localhost>
<test_cluster_two_shard_three_replicas_localhost>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>127.0.0.1</host>
<port>9000</port>
</replica>
<replica>
<host>127.0.0.2</host>
<port>9000</port>
</replica>
<replica>
<host>127.0.0.3</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>false</internal_replication>
<replica>
<host>127.0.0.4</host>
<port>9000</port>
</replica>
<replica>
<host>127.0.0.5</host>
<port>9000</port>
</replica>
<replica>
<host>127.0.0.6</host>
<port>9000</port>
</replica>
</shard>
</test_cluster_two_shard_three_replicas_localhost>
<test_cluster_two_shards_localhost>
<shard>
<replica>

View File

@ -10,7 +10,7 @@
<cached_s3>
<type>cache</type>
<max_size>1Gi</max_size>
<path>/var/lib/clickhouse/s3_cache/</path>
<path>cached_s3/</path>
<disk>s3</disk>
</cached_s3>
</disks>

View File

@ -1,4 +1,5 @@
<clickhouse>
<filesystem_caches_path>/var/lib/clickhouse/filesystem_caches/</filesystem_caches_path>
<storage_configuration>
<disks>
<s3_disk>

View File

@ -1,6 +1,6 @@
<clickhouse>
<remote_servers>
<two_shards>
<one_shard_two_nodes>
<shard>
<replica>
<host>node1</host>
@ -11,6 +11,6 @@
<port>9000</port>
</replica>
</shard>
</two_shards>
</one_shard_two_nodes>
</remote_servers>
</clickhouse>

View File

@ -4,12 +4,8 @@ from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance(
"node1", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node2 = cluster.add_instance(
"node2", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node1 = cluster.add_instance("node1", main_configs=["configs/remote_servers.xml"])
node2 = cluster.add_instance("node2", main_configs=["configs/remote_servers.xml"])
@pytest.fixture(scope="module")
@ -24,11 +20,13 @@ def start_cluster():
def test_remote(start_cluster):
assert (
node1.query(
"""SELECT hostName() FROM clusterAllReplicas("two_shards", system.one)"""
"""SELECT hostName() FROM clusterAllReplicas("one_shard_two_nodes", system.one)"""
)
== "node1\nnode2\n"
)
assert (
node1.query("""SELECT hostName() FROM cluster("two_shards", system.one)""")
node1.query(
"""SELECT hostName() FROM cluster("one_shard_two_nodes", system.one)"""
)
== "node1\n"
)

View File

@ -90,7 +90,9 @@ def producer_serializer(x):
return x.encode() if isinstance(x, str) else x
def kafka_produce(kafka_cluster, topic, messages, timestamp=None, retries=15):
def kafka_produce(
kafka_cluster, topic, messages, timestamp=None, retries=15, partition=None
):
logging.debug(
"kafka_produce server:{}:{} topic:{}".format(
"localhost", kafka_cluster.kafka_port, topic
@ -100,7 +102,9 @@ def kafka_produce(kafka_cluster, topic, messages, timestamp=None, retries=15):
kafka_cluster.kafka_port, producer_serializer, retries
)
for message in messages:
producer.send(topic=topic, value=message, timestamp_ms=timestamp)
producer.send(
topic=topic, value=message, timestamp_ms=timestamp, partition=partition
)
producer.flush()
@ -115,7 +119,7 @@ def kafka_cluster():
cluster.shutdown()
def test_bad_messages_parsing(kafka_cluster):
def test_bad_messages_parsing_stream(kafka_cluster):
admin_client = KafkaAdminClient(
bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)
)
@ -244,7 +248,7 @@ struct Message
f"""
DROP TABLE IF EXISTS view;
DROP TABLE IF EXISTS kafka;
CREATE TABLE kafka (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
@ -253,9 +257,9 @@ struct Message
kafka_format = 'CapnProto',
kafka_handle_error_mode='stream',
kafka_schema='schema_test_errors:Message';
CREATE MATERIALIZED VIEW view Engine=Log AS
SELECT _error FROM kafka WHERE length(_error) != 0 ;
SELECT _error FROM kafka WHERE length(_error) != 0;
"""
)
@ -279,6 +283,68 @@ struct Message
kafka_delete_topic(admin_client, "CapnProto_err")
def test_bad_messages_parsing_exception(kafka_cluster, max_retries=20):
admin_client = KafkaAdminClient(
bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)
)
for format_name in [
"Avro",
"JSONEachRow",
]:
print(format_name)
kafka_create_topic(admin_client, f"{format_name}_err")
instance.query(
f"""
DROP TABLE IF EXISTS view_{format_name};
DROP TABLE IF EXISTS kafka_{format_name};
DROP TABLE IF EXISTS kafka;
CREATE TABLE kafka_{format_name} (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{format_name}_err',
kafka_group_name = '{format_name}',
kafka_format = '{format_name}',
kafka_num_consumers = 1;
CREATE MATERIALIZED VIEW view_{format_name} Engine=Log AS
SELECT * FROM kafka_{format_name};
"""
)
kafka_produce(
kafka_cluster, f"{format_name}_err", ["qwertyuiop", "asdfghjkl", "zxcvbnm"]
)
expected_result = """avro::Exception: Invalid data file. Magic does not match: : while parsing Kafka message (topic: Avro_err, partition: 0, offset: 0)\\'|1|1|1|default|kafka_Avro
Cannot parse input: expected \\'{\\' before: \\'qwertyuiop\\': while parsing Kafka message (topic: JSONEachRow_err, partition: 0, offset: 0)\\'|1|1|1|default|kafka_JSONEachRow
"""
retries = 0
result_system_kafka_consumers = ""
while True:
result_system_kafka_consumers = instance.query(
"""
SELECT exceptions.text[1], length(exceptions.text) > 1 AND length(exceptions.text) < 15, length(exceptions.time) > 1 AND length(exceptions.time) < 15, abs(dateDiff('second', exceptions.time[1], now())) < 40, database, table FROM system.kafka_consumers ORDER BY table, assignments.partition_id[1]
"""
)
result_system_kafka_consumers = result_system_kafka_consumers.replace("\t", "|")
if result_system_kafka_consumers == expected_result or retries > max_retries:
break
retries += 1
time.sleep(1)
assert result_system_kafka_consumers == expected_result
for format_name in [
"Avro",
"JSONEachRow",
]:
kafka_delete_topic(admin_client, f"{format_name}_err")
if __name__ == "__main__":
cluster.start()
input("Cluster created, press any key to destroy...")

View File

@ -0,0 +1,58 @@
<clickhouse>
<remote_servers>
<test_multiple_shards_multiple_replicas>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>n1</host>
<port>9000</port>
</replica>
<replica>
<host>n2</host>
<port>9000</port>
</replica>
<replica>
<host>n3</host>
<port>9000</port>
</replica>
</shard>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>n4</host>
<port>9000</port>
</replica>
<replica>
<host>n5</host>
<port>9000</port>
</replica>
<replica>
<host>n6</host>
<port>9000</port>
</replica>
</shard>
</test_multiple_shards_multiple_replicas>
<test_single_shard_multiple_replicas>
<shard>
<internal_replication>true</internal_replication>
<replica>
<host>n1</host>
<port>9000</port>
</replica>
<replica>
<host>n2</host>
<port>9000</port>
</replica>
<replica>
<host>n3</host>
<port>9000</port>
</replica>
<replica>
<host>n4</host>
<port>9000</port>
</replica>
</shard>
</test_single_shard_multiple_replicas>
</remote_servers>
</clickhouse>

View File

@ -0,0 +1,154 @@
import pytest
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
nodes = [
cluster.add_instance(
f"n{i}", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
for i in (1, 2, 3, 4, 5, 6)
]
@pytest.fixture(scope="module", autouse=True)
def start_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def create_tables(cluster, table_name):
# create replicated tables
for node in nodes:
node.query(f"DROP TABLE IF EXISTS {table_name} SYNC")
if cluster == "test_single_shard_multiple_replicas":
nodes[0].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"
)
nodes[1].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)"
)
nodes[2].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"
)
nodes[3].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r4') ORDER BY (key)"
)
elif cluster == "test_multiple_shards_multiple_replicas":
# shard 1
nodes[0].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"
)
nodes[1].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r2') ORDER BY (key)"
)
nodes[2].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"
)
# shard 2
nodes[3].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r1') ORDER BY (key)"
)
nodes[4].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r2') ORDER BY (key)"
)
nodes[5].query(
f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard2/{table_name}', 'r3') ORDER BY (key)"
)
else:
raise Exception(f"Unexpected cluster: {cluster}")
# create distributed table
nodes[0].query(f"DROP TABLE IF EXISTS {table_name}_d SYNC")
nodes[0].query(
f"""
CREATE TABLE {table_name}_d AS {table_name}
Engine=Distributed(
{cluster},
currentDatabase(),
{table_name},
key
)
"""
)
# populate data
nodes[0].query(
f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(1000)",
settings={"insert_distributed_sync": 1},
)
nodes[0].query(
f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(2000)",
settings={"insert_distributed_sync": 1},
)
nodes[0].query(
f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(1000)",
settings={"insert_distributed_sync": 1},
)
nodes[0].query(
f"INSERT INTO {table_name}_d SELECT -number, -number FROM numbers(2000)",
settings={"insert_distributed_sync": 1},
)
nodes[0].query(
f"INSERT INTO {table_name}_d SELECT number, number FROM numbers(3)",
settings={"insert_distributed_sync": 1},
)
@pytest.mark.parametrize(
"cluster,max_parallel_replicas,prefer_localhost_replica",
[
# prefer_localhost_replica=0
pytest.param("test_single_shard_multiple_replicas", 2, 0),
pytest.param("test_single_shard_multiple_replicas", 3, 0),
pytest.param("test_single_shard_multiple_replicas", 4, 0),
pytest.param("test_single_shard_multiple_replicas", 10, 0),
# prefer_localhost_replica=1
pytest.param("test_single_shard_multiple_replicas", 2, 1),
pytest.param("test_single_shard_multiple_replicas", 3, 1),
pytest.param("test_single_shard_multiple_replicas", 4, 1),
pytest.param("test_single_shard_multiple_replicas", 10, 1),
# prefer_localhost_replica=0
pytest.param("test_multiple_shards_multiple_replicas", 2, 0),
pytest.param("test_multiple_shards_multiple_replicas", 3, 0),
pytest.param("test_multiple_shards_multiple_replicas", 4, 0),
pytest.param("test_multiple_shards_multiple_replicas", 10, 0),
# prefer_localhost_replica=1
pytest.param("test_multiple_shards_multiple_replicas", 2, 1),
pytest.param("test_multiple_shards_multiple_replicas", 3, 1),
pytest.param("test_multiple_shards_multiple_replicas", 4, 1),
pytest.param("test_multiple_shards_multiple_replicas", 10, 1),
],
)
def test_parallel_replicas_over_distributed(
start_cluster, cluster, max_parallel_replicas, prefer_localhost_replica
):
table_name = "test_table"
create_tables(cluster, table_name)
node = nodes[0]
expected_result = "6003\t-1999\t1999\t3\n"
# w/o parallel replicas
assert (
node.query(f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d")
== expected_result
)
# parallel replicas
assert (
node.query(
f"SELECT count(), min(key), max(key), sum(key) FROM {table_name}_d",
settings={
"allow_experimental_parallel_reading_from_replicas": 2,
"prefer_localhost_replica": prefer_localhost_replica,
"max_parallel_replicas": max_parallel_replicas,
"use_hedged_requests": 0,
},
)
== expected_result
)

View File

@ -10,6 +10,10 @@
-->
<debug>cgrp,consumer,topic,protocol</debug>
<!-- librdkafka stat in system.kafka_consumers -->
<!-- the default of 3000 (every three seconds) seems too long for the test -->
<statistics_interval_ms>600</statistics_interval_ms>
<kafka_topic>
<name>consumer_hang</name>
<!-- default: 3000 -->

View File

@ -1186,6 +1186,7 @@ def test_kafka_consumer_hang2(kafka_cluster):
instance.query(
"""
DROP TABLE IF EXISTS test.kafka;
DROP TABLE IF EXISTS test.kafka2;
CREATE TABLE test.kafka (key UInt64, value UInt64)
ENGINE = Kafka
@ -4545,6 +4546,294 @@ def test_block_based_formats_2(kafka_cluster):
kafka_delete_topic(admin_client, format_name)
def test_system_kafka_consumers(kafka_cluster):
admin_client = KafkaAdminClient(
bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)
)
topic = "system_kafka_cons"
kafka_create_topic(admin_client, topic)
# Check that the format_csv_delimiter parameter now works as part of all available format settings.
kafka_produce(
kafka_cluster,
topic,
["1|foo", "2|bar", "42|answer", "100|multi\n101|row\n103|message"],
)
instance.query(
f"""
DROP TABLE IF EXISTS test.kafka;
CREATE TABLE test.kafka (a UInt64, b String)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{topic}',
kafka_group_name = '{topic}',
kafka_commit_on_select = 1,
kafka_format = 'CSV',
kafka_row_delimiter = '\\n',
format_csv_delimiter = '|';
"""
)
result = instance.query("SELECT * FROM test.kafka ORDER BY a;")
result_system_kafka_consumers = instance.query(
"""
create or replace function stable_timestamp as
(d)->multiIf(d==toDateTime('1970-01-01 00:00:00'), 'never', abs(dateDiff('second', d, now())) < 30, 'now', toString(d));
SELECT database, table, length(consumer_id), assignments.topic, assignments.partition_id,
assignments.current_offset,
if(length(exceptions.time)>0, exceptions.time[1]::String, 'never') as last_exception_time_,
if(length(exceptions.text)>0, exceptions.text[1], 'no exception') as last_exception_,
stable_timestamp(last_poll_time) as last_poll_time_, num_messages_read, stable_timestamp(last_commit_time) as last_commit_time_,
num_commits, stable_timestamp(last_rebalance_time) as last_rebalance_time_,
num_rebalance_revocations, num_rebalance_assignments, is_currently_used
FROM system.kafka_consumers WHERE database='test' and table='kafka' format Vertical;
"""
)
logging.debug(f"result_system_kafka_consumers: {result_system_kafka_consumers}")
assert (
result_system_kafka_consumers
== """Row 1:
database: test
table: kafka
length(consumer_id): 67
assignments.topic: ['system_kafka_cons']
assignments.partition_id: [0]
assignments.current_offset: [4]
last_exception_time_: never
last_exception_: no exception
last_poll_time_: now
num_messages_read: 4
last_commit_time_: now
num_commits: 1
last_rebalance_time_: never
num_rebalance_revocations: 0
num_rebalance_assignments: 1
is_currently_used: 0
"""
)
instance.query("DROP TABLE test.kafka")
kafka_delete_topic(admin_client, topic)
def test_system_kafka_consumers_rebalance(kafka_cluster, max_retries=15):
# based on test_kafka_consumer_hang2
admin_client = KafkaAdminClient(
bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)
)
producer = KafkaProducer(
bootstrap_servers="localhost:{}".format(cluster.kafka_port),
value_serializer=producer_serializer,
key_serializer=producer_serializer,
)
topic = "system_kafka_cons2"
kafka_create_topic(admin_client, topic, num_partitions=2)
instance.query(
f"""
DROP TABLE IF EXISTS test.kafka;
DROP TABLE IF EXISTS test.kafka2;
CREATE TABLE test.kafka (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{topic}',
kafka_group_name = '{topic}',
kafka_commit_on_select = 1,
kafka_format = 'JSONEachRow';
CREATE TABLE test.kafka2 (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{topic}',
kafka_commit_on_select = 1,
kafka_group_name = '{topic}',
kafka_format = 'JSONEachRow';
"""
)
producer.send(topic=topic, value=json.dumps({"key": 1, "value": 1}), partition=0)
producer.send(topic=topic, value=json.dumps({"key": 11, "value": 11}), partition=1)
time.sleep(3)
# The first consumer subscribes to the topic, tries to poll some data, and goes idle.
instance.query("SELECT * FROM test.kafka")
# The second consumer does the same, leading to a rebalance in the first
# consumer, and tries to poll some data.
instance.query("SELECT * FROM test.kafka2")
producer.send(topic=topic, value=json.dumps({"key": 1, "value": 1}), partition=0)
producer.send(topic=topic, value=json.dumps({"key": 10, "value": 10}), partition=1)
time.sleep(3)
instance.query("SELECT * FROM test.kafka")
instance.query("SELECT * FROM test.kafka2")
instance.query("SELECT * FROM test.kafka")
instance.query("SELECT * FROM test.kafka2")
result_system_kafka_consumers = instance.query(
"""
create or replace function stable_timestamp as
(d)->multiIf(d==toDateTime('1970-01-01 00:00:00'), 'never', abs(dateDiff('second', d, now())) < 30, 'now', toString(d));
SELECT database, table, length(consumer_id), assignments.topic, assignments.partition_id,
assignments.current_offset,
if(length(exceptions.time)>0, exceptions.time[1]::String, 'never') as last_exception_time_,
if(length(exceptions.text)>0, exceptions.text[1], 'no exception') as last_exception_,
stable_timestamp(last_poll_time) as last_poll_time_, num_messages_read, stable_timestamp(last_commit_time) as last_commit_time_,
num_commits, stable_timestamp(last_rebalance_time) as last_rebalance_time_,
num_rebalance_revocations, num_rebalance_assignments, is_currently_used
FROM system.kafka_consumers WHERE database='test' and table IN ('kafka', 'kafka2') format Vertical;
"""
)
logging.debug(f"result_system_kafka_consumers: {result_system_kafka_consumers}")
assert (
result_system_kafka_consumers
== """Row 1:
database: test
table: kafka
length(consumer_id): 67
assignments.topic: ['system_kafka_cons2']
assignments.partition_id: [0]
assignments.current_offset: [2]
last_exception_time_: never
last_exception_: no exception
last_poll_time_: now
num_messages_read: 4
last_commit_time_: now
num_commits: 2
last_rebalance_time_: now
num_rebalance_revocations: 1
num_rebalance_assignments: 2
is_currently_used: 0
Row 2:
database: test
table: kafka2
length(consumer_id): 68
assignments.topic: ['system_kafka_cons2']
assignments.partition_id: [1]
assignments.current_offset: [2]
last_exception_time_: never
last_exception_: no exception
last_poll_time_: now
num_messages_read: 1
last_commit_time_: now
num_commits: 1
last_rebalance_time_: never
num_rebalance_revocations: 0
num_rebalance_assignments: 1
is_currently_used: 0
"""
)
instance.query("DROP TABLE test.kafka")
instance.query("DROP TABLE test.kafka2")
kafka_delete_topic(admin_client, topic)
def test_system_kafka_consumers_rebalance_mv(kafka_cluster, max_retries=15):
admin_client = KafkaAdminClient(
bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port)
)
producer = KafkaProducer(
bootstrap_servers="localhost:{}".format(cluster.kafka_port),
value_serializer=producer_serializer,
key_serializer=producer_serializer,
)
topic = "system_kafka_cons_mv"
kafka_create_topic(admin_client, topic, num_partitions=2)
instance.query(
f"""
DROP TABLE IF EXISTS test.kafka;
DROP TABLE IF EXISTS test.kafka2;
DROP TABLE IF EXISTS test.kafka_persistent;
DROP TABLE IF EXISTS test.kafka_persistent2;
CREATE TABLE test.kafka (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{topic}',
kafka_group_name = '{topic}',
kafka_commit_on_select = 1,
kafka_format = 'JSONEachRow';
CREATE TABLE test.kafka2 (key UInt64, value UInt64)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'kafka1:19092',
kafka_topic_list = '{topic}',
kafka_commit_on_select = 1,
kafka_group_name = '{topic}',
kafka_format = 'JSONEachRow';
CREATE TABLE test.kafka_persistent (key UInt64, value UInt64)
ENGINE = MergeTree()
ORDER BY key;
CREATE TABLE test.kafka_persistent2 (key UInt64, value UInt64)
ENGINE = MergeTree()
ORDER BY key;
CREATE MATERIALIZED VIEW test.persistent_kafka_mv TO test.kafka_persistent AS
SELECT key, value
FROM test.kafka;
CREATE MATERIALIZED VIEW test.persistent_kafka_mv2 TO test.kafka_persistent2 AS
SELECT key, value
FROM test.kafka2;
"""
)
producer.send(topic=topic, value=json.dumps({"key": 1, "value": 1}), partition=0)
producer.send(topic=topic, value=json.dumps({"key": 11, "value": 11}), partition=1)
time.sleep(3)
retries = 0
result_rdkafka_stat = ""
while True:
result_rdkafka_stat = instance.query(
"""
SELECT table, JSONExtractString(rdkafka_stat, 'type')
FROM system.kafka_consumers WHERE database='test' and table = 'kafka' format Vertical;
"""
)
if "consumer" in result_rdkafka_stat or retries > max_retries:
break
retries += 1
time.sleep(1)
assert (
result_rdkafka_stat
== """Row 1:
table: kafka
JSONExtractString(rdkafka_stat, 'type'): consumer
"""
)
instance.query("DROP TABLE test.kafka")
instance.query("DROP TABLE test.kafka2")
instance.query("DROP TABLE test.kafka_persistent")
instance.query("DROP TABLE test.kafka_persistent2")
instance.query("DROP TABLE test.persistent_kafka_mv")
instance.query("DROP TABLE test.persistent_kafka_mv2")
kafka_delete_topic(admin_client, topic)
if __name__ == "__main__":
cluster.start()
input("Cluster created, press any key to destroy...")

View File

@ -1,11 +0,0 @@
<test>
<query>select lower(randomString(16))</query>
<query>select lower(randomString(32))</query>
<query>select lower(randomString(64))</query>
<query>select lower(randomString(128))</query>
<query>select lower(randomString(256))</query>
<query>select lower(randomString(512))</query>
<query>select lower(randomString(1024))</query>
<query>select lower(randomString(832))</query>
<query>select lower(randomString(416))</query>
</test>

View File

@ -10,14 +10,14 @@ set -o pipefail
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
verify_sql="SELECT
(SELECT sumIf(value, metric = 'PartsInMemory'), sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) =
(SELECT countIf(part_type == 'InMemory'), countIf(part_type == 'Compact'), countIf(part_type == 'Wide')
FROM (SELECT part_type FROM system.parts UNION ALL SELECT part_type FROM system.projection_parts))"
(SELECT sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) =
(SELECT countIf(part_type = 'Compact'), countIf(part_type = 'Wide')
FROM (SELECT part_type FROM system.parts UNION ALL SELECT part_type FROM system.projection_parts))"
# The query is not atomic - it can compare states between system.parts and system.metrics from different points in time.
# So, there is inherent race condition (especially in fasttest that runs tests in parallel).
#
# But it should get expected result eventually.
# But it should get the expected result eventually.
# In case of test failure, this code will do infinite loop and timeout.
verify()
{
@ -32,21 +32,16 @@ verify()
}
$CLICKHOUSE_CLIENT --database_atomic_wait_for_drop_and_detach_synchronously=1 --query="DROP TABLE IF EXISTS data_01600"
# InMemory - [0..5]
# Compact - (5..10]
# Wide - >10
$CLICKHOUSE_CLIENT --query="CREATE TABLE data_01600 (part_type String, key Int) ENGINE = MergeTree PARTITION BY part_type ORDER BY key SETTINGS min_bytes_for_wide_part=0, min_rows_for_wide_part=10, index_granularity = 8192, index_granularity_bytes = '10Mi'"
# InMemory
$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'InMemory', number FROM system.numbers LIMIT 1"
verify
# Compact
$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Compact', number FROM system.numbers LIMIT 6 OFFSET 1"
$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Compact', number FROM system.numbers LIMIT 6"
verify
# Wide
$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Wide', number FROM system.numbers LIMIT 11 OFFSET 7"
$CLICKHOUSE_CLIENT --query="INSERT INTO data_01600 SELECT 'Wide', number FROM system.numbers LIMIT 11 OFFSET 6"
verify
# DROP and check

View File

@ -55,7 +55,7 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, [[1, 2, 3], [4, 5], []] as a FORMAT RowBinaryWithNames" | $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames" 2>&1 | grep -F -c "CANNOT_SKIP_UNKNOWN_FIELD"
$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, [[1, 2, 3], [4, 5], []] as a FORMAT RowBinaryWithNames" 2>&1 | $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_with_names_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNames" 2>&1 | grep -F -c "CANNOT_SKIP_UNKNOWN_FIELD"
$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, [[1, 2, 3], [4, 5], []] as a FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_skip_unknown_fields=1 --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes"
@ -63,8 +63,8 @@ $CLICKHOUSE_CLIENT -q "SELECT * FROM test_02102"
$CLICKHOUSE_CLIENT -q "TRUNCATE TABLE test_02102"
$CLICKHOUSE_CLIENT -q "SELECT 'text' AS x, toDate('2020-01-01') AS y, toUInt32(1) AS z FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes" 2>&1 | grep -F -c "INCORRECT_DATA"
$CLICKHOUSE_CLIENT -q "SELECT 'text' AS x, toDate('2020-01-01') AS y, toUInt32(1) AS z FORMAT RowBinaryWithNamesAndTypes" 2>&1 | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes" 2>&1 | grep -F -c "INCORRECT_DATA"
$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' as z, toDate('2020-01-01') AS y FORMAT RowBinaryWithNamesAndTypes" | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes" 2>&1 | grep -F -c "INCORRECT_DATA"
$CLICKHOUSE_CLIENT -q "SELECT toUInt32(1) AS x, 'text' as z, toDate('2020-01-01') AS y FORMAT RowBinaryWithNamesAndTypes" 2>&1 | $CLICKHOUSE_CLIENT --input_format_with_names_use_header=1 --input_format_with_types_use_header=1 -q "INSERT INTO test_02102 FORMAT RowBinaryWithNamesAndTypes" 2>&1 | grep -F -c "INCORRECT_DATA"
$CLICKHOUSE_CLIENT -q "DROP TABLE test_02102"

View File

@ -12,7 +12,7 @@ SETTINGS min_bytes_for_wide_part = 10485760,
disk = disk(
type = cache,
max_size = '128Mi',
path = '/var/lib/clickhouse/${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
enable_bypass_cache_with_threashold = 1,
bypass_cache_threashold = 100,
delayed_cleanup_interval_ms = 100,

View File

@ -15,7 +15,7 @@ SETTINGS min_bytes_for_wide_part = 10485760,
disk = disk(
type = cache,
max_size = '128Mi',
path = '/var/lib/clickhouse/${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
enable_bypass_cache_with_threashold = 1,
bypass_cache_threashold = 100,
delayed_cleanup_interval_ms = 100,

View File

@ -14,7 +14,7 @@ SETTINGS min_bytes_for_wide_part = 10485760,
disk = disk(
type = cache,
max_size = '128Mi',
path = '/var/lib/clickhouse/${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
cache_on_write_operations= 1,
enable_filesystem_query_cache_limit = 1,
delayed_cleanup_interval_ms = 100,

View File

@ -17,7 +17,7 @@ SETTINGS min_bytes_for_wide_part = 10485760,
disk = disk(
type = cache,
max_size = '128Mi',
path = '/var/lib/clickhouse/${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
cache_on_write_operations= 1,
enable_filesystem_query_cache_limit = 1,
delayed_cleanup_interval_ms = 100,

View File

@ -1 +1 @@
134217728 10000000 33554432 4194304 1 0 0 0 /var/lib/clickhouse/caches/s3_cache/ 100 2 0
134217728 10000000 33554432 4194304 1 0 0 0 /var/lib/clickhouse/filesystem_caches/s3_cache/ 2 0

View File

@ -22,7 +22,7 @@ SETTINGS min_bytes_for_wide_part = 0,
type = cache,
max_size = '128Mi',
max_file_segment_size = '10Ki',
path = '/var/lib/clickhouse/${CLICKHOUSE_TEST_UNIQUE_NAME}_cache',
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}',
cache_on_write_operations = 1,
enable_filesystem_query_cache_limit = 1,
delayed_cleanup_interval_ms = 100,

View File

@ -7,7 +7,7 @@ CREATE TABLE test (a Int32, b String)
ENGINE = MergeTree() ORDER BY tuple()
SETTINGS disk = disk(
type = 'local_blob_storage',
path = '/var/lib/clickhouse/disks/${CLICKHOUSE_TEST_UNIQUE_NAME}/');
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}/');
INSERT INTO test SELECT 1, 'test';
SELECT * FROM test;
@ -19,7 +19,7 @@ ENGINE = MergeTree() ORDER BY tuple()
SETTINGS disk = disk(
type = 'cache',
max_size = '10Mi',
path = '/var/lib/clickhouse/caches/${CLICKHOUSE_TEST_UNIQUE_NAME}/',
path = '${CLICKHOUSE_TEST_UNIQUE_NAME}/',
disk = disk(type='local_blob_storage', path='/var/lib/clickhouse/disks/${CLICKHOUSE_TEST_UNIQUE_NAME}/'));
INSERT INTO test SELECT 1, 'test';

View File

@ -10,7 +10,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# It'll be read into two blocks. The first block will sleep 2x longer than the second.
# So reordering is very likely if the order-preservation doesn't work.
$CLICKHOUSE_LOCAL -q "select number + sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=1, function_sleep_max_microseconds_per_block = 6000000"
$CLICKHOUSE_LOCAL -q "select number + sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=1, function_sleep_max_microseconds_per_block = 6000000, input_format_parquet_local_file_min_bytes_for_seek=0"
$CLICKHOUSE_LOCAL -q "explain pipeline select number + sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=1, max_threads=2"
$CLICKHOUSE_LOCAL -q "explain pipeline select number + sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=0, parallelize_output_from_storages=1, max_threads=2"
$CLICKHOUSE_LOCAL -q "explain pipeline select number + sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=1, max_threads=2, input_format_parquet_local_file_min_bytes_for_seek=0"
$CLICKHOUSE_LOCAL -q "explain pipeline select number + sleepEachRow(3) from file('$CURDIR/data_parquet/02725_data.parquet') settings input_format_parquet_preserve_order=0, parallelize_output_from_storages=1, max_threads=2, input_format_parquet_local_file_min_bytes_for_seek=0"

View File

@ -0,0 +1,6 @@
-- 1 shard, 3 replicas
100 0 99 49.5
200 0 99 49.5
-- 2 shards, 3 replicas each
200 0 99 49.5
400 0 99 49.5

View File

@ -0,0 +1,47 @@
-- 1 shard
SELECT '-- 1 shard, 3 replicas';
DROP TABLE IF EXISTS test_d;
DROP TABLE IF EXISTS test;
CREATE TABLE test (id UInt64, date Date)
ENGINE = MergeTree
ORDER BY id;
CREATE TABLE IF NOT EXISTS test_d as test
ENGINE = Distributed(test_cluster_one_shard_three_replicas_localhost, currentDatabase(), test);
insert into test select *, today() from numbers(100);
SELECT count(), min(id), max(id), avg(id)
FROM test_d
SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0;
insert into test select *, today() from numbers(100);
SELECT count(), min(id), max(id), avg(id)
FROM test_d
SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0;
-- 2 shards
SELECT '-- 2 shards, 3 replicas each';
DROP TABLE IF EXISTS test2_d;
DROP TABLE IF EXISTS test2;
CREATE TABLE test2 (id UInt64, date Date)
ENGINE = MergeTree
ORDER BY id;
CREATE TABLE IF NOT EXISTS test2_d as test2
ENGINE = Distributed(test_cluster_two_shard_three_replicas_localhost, currentDatabase(), test2, id);
insert into test2 select *, today() from numbers(100);
SELECT count(), min(id), max(id), avg(id)
FROM test2_d
SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0;
insert into test2 select *, today() from numbers(100);
SELECT count(), min(id), max(id), avg(id)
FROM test2_d
SETTINGS allow_experimental_parallel_reading_from_replicas = 1, max_parallel_replicas = 3, prefer_localhost_replica = 0, parallel_replicas_for_non_replicated_merge_tree=1, use_hedged_requests=0;

View File

@ -0,0 +1,45 @@
-- Tags: no-fasttest
DROP TABLE IF EXISTS test;
DROP TABLE IF EXISTS test_1;
DROP TABLE IF EXISTS test_2;
CREATE TABLE test (a Int32)
ENGINE = MergeTree()
ORDER BY tuple()
SETTINGS disk = disk(type = cache,
max_size = '1Mi',
path = '/kek',
disk = 'local_disk'); -- {serverError BAD_ARGUMENTS}
CREATE TABLE test (a Int32)
ENGINE = MergeTree()
ORDER BY tuple()
SETTINGS disk = disk(type = cache,
max_size = '1Mi',
path = '/var/lib/clickhouse/filesystem_caches/../kek',
disk = 'local_disk'); -- {serverError BAD_ARGUMENTS}
CREATE TABLE test (a Int32)
ENGINE = MergeTree()
ORDER BY tuple()
SETTINGS disk = disk(type = cache,
max_size = '1Mi',
path = '../kek',
disk = 'local_disk'); -- {serverError BAD_ARGUMENTS}
CREATE TABLE test_1 (a Int32)
ENGINE = MergeTree()
ORDER BY tuple()
SETTINGS disk = disk(type = cache,
max_size = '1Mi',
path = '/var/lib/clickhouse/filesystem_caches/kek',
disk = 'local_disk');
CREATE TABLE test_2 (a Int32)
ENGINE = MergeTree()
ORDER BY tuple()
SETTINGS disk = disk(type = cache,
max_size = '1Mi',
path = 'kek',
disk = 'local_disk');

View File

@ -0,0 +1,8 @@
2023-05-15 1
2023-05-16 1
2023-05-17 1
2023-05-18 1
2023-05-19 1
2023-05-20 1
2023-05-21 1
2023-05-22 15

View File

@ -0,0 +1,8 @@
DROP TABLE IF EXISTS 02861_interpolate;
CREATE TABLE 02861_interpolate (date Date, id String, f Int16) ENGINE=MergeTree() ORDER BY (date);
INSERT INTO 02861_interpolate VALUES ('2023-05-15', '1', 1), ('2023-05-22', '1', 15);
SELECT date AS d, toNullable(f) AS f FROM 02861_interpolate WHERE id = '1' ORDER BY d ASC WITH FILL STEP toIntervalDay(1) INTERPOLATE (f);
DROP TABLE 02861_interpolate;

View File

@ -14,6 +14,7 @@ v23.4.4.16-stable 2023-06-17
v23.4.3.48-stable 2023-06-12
v23.4.2.11-stable 2023-05-02
v23.4.1.1943-stable 2023-04-27
v23.3.9.55-lts 2023-08-21
v23.3.8.21-lts 2023-07-13
v23.3.7.5-lts 2023-06-29
v23.3.6.7-lts 2023-06-28
