Merge remote-tracking branch 'origin/master' into analyzer-refactor-constant-name

This commit is contained in:
Dmitry Novik 2024-02-26 15:11:57 +00:00
commit 42b91201a5
556 changed files with 14706 additions and 3487 deletions

View File

@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy
- 'backport/**'
jobs:
RunConfig:
runs-on: [self-hosted, style-checker]
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:

View File

@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy
- 'master'
jobs:
RunConfig:
runs-on: [self-hosted, style-checker]
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
@ -375,28 +375,12 @@ jobs:
test_name: Stateless tests (release)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestReleaseDatabaseReplicated:
FunctionalStatelessTestReleaseAnalyzerS3Replicated:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Stateless tests (release, DatabaseReplicated)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestReleaseAnalyzer:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Stateless tests (release, analyzer)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestReleaseS3:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Stateless tests (release, s3 storage)
test_name: Stateless tests (release, analyzer, s3, DatabaseReplicated)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestS3Debug:
@ -825,9 +809,7 @@ jobs:
- MarkReleaseReady
- FunctionalStatelessTestDebug
- FunctionalStatelessTestRelease
- FunctionalStatelessTestReleaseDatabaseReplicated
- FunctionalStatelessTestReleaseAnalyzer
- FunctionalStatelessTestReleaseS3
- FunctionalStatelessTestReleaseAnalyzerS3Replicated
- FunctionalStatelessTestAarch64
- FunctionalStatelessTestAsan
- FunctionalStatelessTestTsan

View File

@ -14,7 +14,7 @@ jobs:
# The task for having a preserved ENV and event.json for later investigation
uses: ./.github/workflows/debug.yml
RunConfig:
runs-on: [self-hosted, style-checker]
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:

View File

@ -18,7 +18,7 @@ on: # yamllint disable-line rule:truthy
##########################################################################################
jobs:
RunConfig:
runs-on: [self-hosted, style-checker]
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
@ -391,28 +391,12 @@ jobs:
test_name: Stateless tests (release)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestReleaseDatabaseReplicated:
FunctionalStatelessTestReleaseAnalyzerS3Replicated:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Stateless tests (release, DatabaseReplicated)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestReleaseAnalyzer:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Stateless tests (release, analyzer)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestReleaseS3:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Stateless tests (release, s3 storage)
test_name: Stateless tests (release, analyzer, s3, DatabaseReplicated)
runner_type: func-tester
data: ${{ needs.RunConfig.outputs.data }}
FunctionalStatelessTestS3Debug:
@ -750,14 +734,6 @@ jobs:
#############################################################################################
############################# INTEGRATION TESTS #############################################
#############################################################################################
IntegrationTestsAsan:
needs: [RunConfig, BuilderDebAsan]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Integration tests (asan)
runner_type: stress-tester
data: ${{ needs.RunConfig.outputs.data }}
IntegrationTestsAnalyzerAsan:
needs: [RunConfig, BuilderDebAsan]
if: ${{ !failure() && !cancelled() }}
@ -774,14 +750,6 @@ jobs:
test_name: Integration tests (tsan)
runner_type: stress-tester
data: ${{ needs.RunConfig.outputs.data }}
IntegrationTestsRelease:
needs: [RunConfig, BuilderDebRelease]
if: ${{ !failure() && !cancelled() }}
uses: ./.github/workflows/reusable_test.yml
with:
test_name: Integration tests (release)
runner_type: stress-tester
data: ${{ needs.RunConfig.outputs.data }}
IntegrationTestsAarch64:
needs: [RunConfig, BuilderDebAarch64]
if: ${{ !failure() && !cancelled() }}
@ -890,8 +858,6 @@ jobs:
- TestsBugfixCheck
- FunctionalStatelessTestDebug
- FunctionalStatelessTestRelease
- FunctionalStatelessTestReleaseDatabaseReplicated
- FunctionalStatelessTestReleaseAnalyzer
- FunctionalStatelessTestAarch64
- FunctionalStatelessTestAsan
- FunctionalStatelessTestTsan
@ -904,9 +870,9 @@ jobs:
- FunctionalStatefulTestTsan
- FunctionalStatefulTestMsan
- FunctionalStatefulTestUBsan
- FunctionalStatelessTestReleaseS3
- FunctionalStatelessTestS3Debug
- FunctionalStatelessTestS3Tsan
- FunctionalStatelessTestReleaseAnalyzerS3Replicated
- FunctionalStatefulTestReleaseParallelReplicas
- FunctionalStatefulTestAsanParallelReplicas
- FunctionalStatefulTestTsanParallelReplicas
@ -927,10 +893,8 @@ jobs:
- ASTFuzzerTestTsan
- ASTFuzzerTestMSan
- ASTFuzzerTestUBSan
- IntegrationTestsAsan
- IntegrationTestsAnalyzerAsan
- IntegrationTestsTsan
- IntegrationTestsRelease
- IntegrationTestsAarch64
- IntegrationTestsFlakyCheck
- PerformanceComparisonX86

View File

@ -14,7 +14,7 @@ on: # yamllint disable-line rule:truthy
jobs:
RunConfig:
runs-on: [self-hosted, style-checker]
runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:

View File

@ -10,6 +10,7 @@ set (CMAKE_CXX_STANDARD 20)
set (SRCS
argsToConfig.cpp
cgroupsv2.cpp
coverage.cpp
demangle.cpp
getAvailableMemoryAmount.cpp

View File

@ -1,6 +1,7 @@
#pragma once
#include <base/types.h>
#include <base/extended_types.h>
namespace wide
{
@ -44,3 +45,8 @@ concept is_over_big_int =
|| std::is_same_v<T, Decimal128>
|| std::is_same_v<T, Decimal256>;
}
template <> struct is_signed<DB::Decimal32> { static constexpr bool value = true; };
template <> struct is_signed<DB::Decimal64> { static constexpr bool value = true; };
template <> struct is_signed<DB::Decimal128> { static constexpr bool value = true; };
template <> struct is_signed<DB::Decimal256> { static constexpr bool value = true; };
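A minimal illustration of what these specializations enable; the generic helper below and the include paths are assumptions for the sketch, and only the `is_signed<...>::value` member shown above is relied on:

```cpp
#include <base/extended_types.h>   /// defines the is_signed trait specialized above (path assumed)
#include <base/Decimal.h>          /// DB::Decimal64 (path assumed)

/// Generic code can now treat Decimal types as signed without enumerating
/// every Decimal type explicitly.
template <typename T>
constexpr bool canHoldNegativeValues()
{
    return is_signed<T>::value;
}

static_assert(canHoldNegativeValues<DB::Decimal64>());
```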

base/base/cgroupsv2.cpp (new file, 64 lines)
View File

@ -0,0 +1,64 @@
#include <base/cgroupsv2.h>
#include <base/defines.h>
#include <fstream>
#include <sstream>
bool cgroupsV2Enabled()
{
#if defined(OS_LINUX)
/// This file exists iff the host has cgroups v2 enabled.
auto controllers_file = default_cgroups_mount / "cgroup.controllers";
if (!std::filesystem::exists(controllers_file))
return false;
return true;
#else
return false;
#endif
}
bool cgroupsV2MemoryControllerEnabled()
{
#if defined(OS_LINUX)
chassert(cgroupsV2Enabled());
/// According to https://docs.kernel.org/admin-guide/cgroup-v2.html:
/// - file 'cgroup.controllers' defines which controllers *can* be enabled
/// - file 'cgroup.subtree_control' defines which controllers *are* enabled
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
if (!subtree_control_file.is_open())
return false;
std::string controllers;
std::getline(subtree_control_file, controllers);
if (controllers.find("memory") == std::string::npos)
return false;
return true;
#else
return false;
#endif
}
std::string cgroupV2OfProcess()
{
#if defined(OS_LINUX)
chassert(cgroupsV2Enabled());
/// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs
/// A simpler way to get the membership is:
std::ifstream cgroup_name_file("/proc/self/cgroup");
if (!cgroup_name_file.is_open())
return "";
/// With cgroups v2, there will be a *single* line with prefix "0::/"
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
std::string cgroup;
std::getline(cgroup_name_file, cgroup);
static const std::string v2_prefix = "0::/";
if (!cgroup.starts_with(v2_prefix))
return "";
cgroup = cgroup.substr(v2_prefix.length());
return cgroup;
#else
return "";
#endif
}

base/base/cgroupsv2.h (new file, 22 lines)
View File

@ -0,0 +1,22 @@
#pragma once
#include <filesystem>
#include <string>
#if defined(OS_LINUX)
/// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers).
/// /sys/fs/cgroup was still symlinked to the actual mount in the cases that I have seen.
static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
#endif
/// Is cgroups v2 enabled on the system?
bool cgroupsV2Enabled();
/// Is the memory controller of cgroups v2 enabled on the system?
/// Assumes that cgroups v2 is enabled (i.e. cgroupsV2Enabled() returns true).
bool cgroupsV2MemoryControllerEnabled();
/// Which cgroup does the process belong to?
/// Returns an empty string if the cgroup cannot be determined.
/// Assumes that cgroups v2 is enabled (i.e. cgroupsV2Enabled() returns true).
std::string cgroupV2OfProcess();
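To show how a caller might combine these three helpers, here is a minimal, Linux-only sketch (the `memory.max` file name and the printing are illustrative; the real consumer of this header is `getMemoryAmount.cpp` below):

```cpp
#include <base/cgroupsv2.h>
#include <filesystem>
#include <iostream>

int main()
{
    /// Both checks follow the header's contract: the memory-controller and
    /// membership helpers assume cgroups v2 is enabled, so test that first.
    if (!cgroupsV2Enabled() || !cgroupsV2MemoryControllerEnabled())
    {
        std::cout << "cgroups v2 memory controller not available\n";
        return 0;
    }

    /// An empty string means the process sits in the root cgroup.
    std::string cgroup = cgroupV2OfProcess();
    std::filesystem::path dir = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);

    /// Illustrative: the memory limit of this cgroup would live in dir / "memory.max".
    std::cout << "memory limit file: " << (dir / "memory.max") << '\n';
}
```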

View File

@ -1,17 +1,14 @@
#include <base/getMemoryAmount.h>
#include <base/cgroupsv2.h>
#include <base/getPageSize.h>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <unistd.h>
#include <sys/types.h>
#include <sys/param.h>
#if defined(BSD)
#include <sys/sysctl.h>
#endif
namespace
@ -20,49 +17,14 @@ namespace
std::optional<uint64_t> getCgroupsV2MemoryLimit()
{
#if defined(OS_LINUX)
const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
/// This file exists iff the host has cgroups v2 enabled.
std::ifstream controllers_file(default_cgroups_mount / "cgroup.controllers");
if (!controllers_file.is_open())
if (!cgroupsV2Enabled())
return {};
/// Make sure that the memory controller is enabled.
/// - cgroup.controllers defines which controllers *can* be enabled.
/// - cgroup.subtree_control defines which controllers *are* enabled.
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
/// ReadBufferFromFile subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
/// std::string subtree_control;
/// readString(subtree_control, subtree_control_file);
/// if (subtree_control.find("memory") == std::string::npos)
/// return {};
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
std::stringstream subtree_control_buf;
subtree_control_buf << subtree_control_file.rdbuf();
std::string subtree_control = subtree_control_buf.str();
if (subtree_control.find("memory") == std::string::npos)
if (!cgroupsV2MemoryControllerEnabled())
return {};
/// Identify the cgroup the process belongs to
/// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs
/// A simpler way to get the membership is:
std::ifstream cgroup_name_file("/proc/self/cgroup");
if (!cgroup_name_file.is_open())
return {};
std::stringstream cgroup_name_buf;
cgroup_name_buf << cgroup_name_file.rdbuf();
std::string cgroup_name = cgroup_name_buf.str();
if (!cgroup_name.empty() && cgroup_name.back() == '\n')
cgroup_name.pop_back(); /// remove trailing newline, if any
/// With cgroups v2, there will be a *single* line with prefix "0::/"
const std::string v2_prefix = "0::/";
if (!cgroup_name.starts_with(v2_prefix))
return {};
cgroup_name = cgroup_name.substr(v2_prefix.length());
std::filesystem::path current_cgroup = cgroup_name.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup_name);
std::string cgroup = cgroupV2OfProcess();
auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);
/// Open the bottom-most nested memory limit setting file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
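The hunk is cut off here, but the walk-up described in that last comment could look roughly like the following sketch; the `memory.max` file name and the `"max"` sentinel come from the cgroups v2 documentation, while the actual loop in `getMemoryAmount.cpp` may differ in detail:

```cpp
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <optional>
#include <string>

/// Sketch: start at the process's cgroup directory and walk towards the mount
/// point until a "memory.max" file is found, since memory settings are inherited.
std::optional<uint64_t> readMemoryLimitWalkingUp(std::filesystem::path current, const std::filesystem::path & root)
{
    while (true)
    {
        std::ifstream limit_file(current / "memory.max");
        if (limit_file.is_open())
        {
            std::string value;
            limit_file >> value;
            if (value == "max")
                return {};                  /// explicit "no limit"
            return std::stoull(value);      /// limit in bytes
        }
        if (current == root)
            return {};                      /// reached the mount point without finding a limit
        current = current.parent_path();    /// retry one level up
    }
}
```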

contrib/NuRaft (vendored, 2 lines changed)

@ -1 +1 @@
Subproject commit 1278e32bb0d5dc489f947e002bdf8c71b0ddaa63
Subproject commit 4a12f99dfc9d47c687ff7700b927cc76856225d1

contrib/aws (vendored, 2 lines changed)

@ -1 +1 @@
Subproject commit 9eb5097a0abfa837722cca7a5114a25837817bf2
Subproject commit 5f0542b3ad7eef25b0540d37d778207e0345ea8f

contrib/qpl (vendored, 2 lines changed)

@ -1 +1 @@
Subproject commit a61bdd845fd7ca363b2bcc55454aa520dfcd8298
Subproject commit d4715e0e79896b85612158e135ee1a85f3b3e04d

contrib/rapidjson (vendored, 2 lines changed)

@ -1 +1 @@
Subproject commit c4ef90ccdbc21d5d5a628d08316bfd301e32d6fa
Subproject commit 800ca2f38fc3b387271d9e1926fcfc9070222104

View File

@ -387,6 +387,11 @@ if [ -f core.zst ]; then
fi
rg --text -F '<Fatal>' server.log > fatal.log ||:
FATAL_LINK=''
if [ -s fatal.log ]; then
FATAL_LINK='<a href="fatal.log">fatal.log</a>'
fi
dmesg -T > dmesg.log ||:
zstd --threads=0 --rm server.log
@ -419,6 +424,7 @@ p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-s
<a href="main.log">main.log</a>
<a href="dmesg.log">dmesg.log</a>
${CORE_LINK}
${FATAL_LINK}
</p>
<table>
<tr>

View File

@ -1,7 +1,7 @@
version: '2.3'
services:
mysql2:
image: mysql:5.7
image: mysql:8.0
restart: always
environment:
MYSQL_ROOT_PASSWORD: clickhouse
@ -23,7 +23,7 @@ services:
source: ${MYSQL_CLUSTER_LOGS:-}
target: /mysql/
mysql3:
image: mysql:5.7
image: mysql:8.0
restart: always
environment:
MYSQL_ROOT_PASSWORD: clickhouse
@ -45,7 +45,7 @@ services:
source: ${MYSQL_CLUSTER_LOGS:-}
target: /mysql/
mysql4:
image: mysql:5.7
image: mysql:8.0
restart: always
environment:
MYSQL_ROOT_PASSWORD: clickhouse

View File

@ -214,8 +214,7 @@ function check_server_start()
function check_logs_for_critical_errors()
{
# Sanitizer asserts
rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
sed -n '/WARNING:.*anitizer/,/^$/p' >> /test_output/tmp
rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
&& echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
|| echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
@ -233,8 +232,8 @@ function check_logs_for_critical_errors()
# Remove file logical_errors.txt if it's empty
[ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
# No such key errors
rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
# No such key errors (ignore a.myext which is used in 02724_database_s3.sh and does not exist)
rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log | grep -v "a.myext" > /test_output/no_such_key_errors.txt \
&& echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
|| echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv

View File

@ -77,11 +77,18 @@ remove_keeper_config "async_replication" "1"
# create_if_not_exists feature flag doesn't exist on some older versions
remove_keeper_config "create_if_not_exists" "[01]"
# latest_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
# commit_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
rm /etc/clickhouse-server/config.d/zero_copy_destructive_operations.xml
rm /etc/clickhouse-server/config.d/storage_conf_02963.xml
rm /etc/clickhouse-server/config.d/backoff_failed_mutation.xml
rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
rm /etc/clickhouse-server/users.d/s3_cache_new.xml
rm /etc/clickhouse-server/users.d/replicated_ddl_entry.xml
@ -109,6 +116,12 @@ remove_keeper_config "async_replication" "1"
# create_if_not_exists feature flag doesn't exist on some older versions
remove_keeper_config "create_if_not_exists" "[01]"
# latest_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
# commit_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
# But we still need default disk because some tables loaded only into it
sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
| sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \
@ -122,6 +135,7 @@ rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
rm /etc/clickhouse-server/config.d/zero_copy_destructive_operations.xml
rm /etc/clickhouse-server/config.d/storage_conf_02963.xml
rm /etc/clickhouse-server/config.d/backoff_failed_mutation.xml
rm /etc/clickhouse-server/config.d/block_number.xml
rm /etc/clickhouse-server/users.d/nonconst_timezone.xml
rm /etc/clickhouse-server/users.d/s3_cache_new.xml

View File

@ -37,7 +37,7 @@ sudo xcode-select --install
``` bash
brew update
brew install ccache cmake ninja libtool gettext llvm gcc binutils grep findutils
brew install ccache cmake ninja libtool gettext llvm gcc binutils grep findutils nasm
```
## Checkout ClickHouse Sources {#checkout-clickhouse-sources}

View File

@ -10,7 +10,7 @@ Allows to connect to databases on a remote [PostgreSQL](https://www.postgresql.o
Gives the real-time access to table list and table structure from remote PostgreSQL with the help of `SHOW TABLES` and `DESCRIBE TABLE` queries.
Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) it set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries.
Supports table structure modifications (`ALTER TABLE ... ADD|DROP COLUMN`). If `use_table_cache` parameter (see the Engine Parameters below) is set to `1`, the table structure is cached and not checked for being modified, but can be updated with `DETACH` and `ATTACH` queries.
## Creating a Database {#creating-a-database}

View File

@ -80,6 +80,7 @@ The BACKUP and RESTORE statements take a list of DATABASE and TABLE names, a des
- ASYNC: backup or restore asynchronously
- PARTITIONS: a list of partitions to restore
- SETTINGS:
- `id`: id of the backup or restore operation; a randomly generated UUID is used if it is not specified manually. If there is an already running operation with the same `id`, an exception is thrown.
- [`compression_method`](/docs/en/sql-reference/statements/create/table.md/#column-compression-codecs) and compression_level
- `password` for the file on disk
- `base_backup`: the destination of the previous backup of this source. For example, `Disk('backups', '1.zip')`
@ -206,7 +207,7 @@ end_time: 2022-08-30 09:21:46
1 row in set. Elapsed: 0.002 sec.
```
Along with `system.backups` table, all backup and restore operations are also tracked in the system log table [backup_log](../operations/system-tables/backup_log.md):
Along with `system.backups` table, all backup and restore operations are also tracked in the system log table [backup_log](../operations/system-tables/backup_log.md):
```
SELECT *
FROM system.backup_log
@ -222,7 +223,7 @@ event_time_microseconds: 2023-08-18 11:13:43.097414
id: 7678b0b3-f519-4e6e-811f-5a0781a4eb52
name: Disk('backups', '1.zip')
status: CREATING_BACKUP
error:
error:
start_time: 2023-08-18 11:13:43
end_time: 1970-01-01 03:00:00
num_files: 0
@ -252,7 +253,7 @@ compressed_size: 0
files_read: 0
bytes_read: 0
2 rows in set. Elapsed: 0.075 sec.
2 rows in set. Elapsed: 0.075 sec.
```
## Configuring BACKUP/RESTORE to use an S3 Endpoint
@ -271,7 +272,7 @@ Creating an S3 bucket is covered in [Use S3 Object Storage as a ClickHouse disk]
The destination for a backup will be specified like this:
```
S3('<S3 endpoint>/<directory>', '<Access key ID>', '<Secret access key>)
S3('<S3 endpoint>/<directory>', '<Access key ID>', '<Secret access key>')
```
```sql

View File

@ -10,11 +10,62 @@ The ClickHouse server can be configured with configuration files in XML or YAML
It is possible to mix XML and YAML configuration files, for example you could have a main configuration file `config.xml` and additional configuration files `config.d/network.xml`, `config.d/timezone.yaml` and `config.d/keeper.yaml`. Mixing XML and YAML within a single configuration file is not supported. XML configuration files should use `<clickhouse>...</clickhouse>` as top-level tag. In YAML configuration files, `clickhouse:` is optional, the parser inserts it implicitly if absent.
## Overriding Configuration {#override}
## Merging Configuration {#merging}
The merge of configuration files behaves as one intuitively expects: the contents of both files are combined recursively, and children with the same name are replaced by the element of the more specific configuration file. The merge can be customized using the attributes `replace` and `remove`.
- Attribute `replace` means that the element is replaced by the specified one.
- Attribute `remove` means that the element is deleted.
Two configuration files (usually the main configuration file and other configuration files from `config.d/`) are merged as follows:
- If a node (i.e. a path leading to an element) appears in both files and does not have attributes `replace` or `remove`, it is included in the merged configuration file and children from both nodes are included and merged recursively.
- If one of the two nodes has the attribute `replace`, it is included in the merged configuration file, but only children from the node with the `replace` attribute are included.
- If one of the two nodes has the attribute `remove`, the node is not included in the merged configuration file (if it already exists, it is deleted).
Example:
```xml
<!-- config.xml -->
<clickhouse>
<config_a>
<setting_1>1</setting_1>
</config_a>
<config_b>
<setting_2>2</setting_2>
</config_b>
<config_c>
<setting_3>3</setting_3>
</config_c>
</clickhouse>
```
and
```xml
<!-- config.d/other_config.xml -->
<clickhouse>
<config_a>
<setting_4>4</setting_4>
</config_a>
<config_b replace="replace">
<setting_5>5</setting_5>
</config_b>
<config_c remove="remove">
<setting_6>6</setting_6>
</config_c>
</clickhouse>
```
generates merged configuration file:
```xml
<clickhouse>
<config_a>
<setting_1>1</setting_1>
<setting_4>4</setting_4>
</config_a>
<config_b>
<setting_5>5</setting_5>
</config_b>
</clickhouse>
```
To specify that a value of an element should be replaced by the value of an environment variable, you can use attribute `from_env`.
@ -125,7 +176,7 @@ Users configuration can be split into separate files similar to `config.xml` and
Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`.
Directory `users.d` is used by default, as `users_config` defaults to `users.xml`.
Note that configuration files are first merged taking into account [Override](#override) settings and includes are processed after that.
Note that configuration files are first [merged](#merging) taking into account settings, and includes are processed after that.
## XML example {#example}

View File

@ -458,6 +458,38 @@ Type: Double
Default: 0.9
## cgroups_memory_usage_observer_wait_time
Interval in seconds at which the server's maximum allowed memory consumption is adjusted according to the corresponding thresholds in cgroups (see
settings `cgroup_memory_watcher_hard_limit_ratio` and `cgroup_memory_watcher_soft_limit_ratio`).
Type: UInt64
Default: 15
## cgroup_memory_watcher_hard_limit_ratio
Specifies the "hard" threshold with regard to the memory consumption of the server process according to cgroups, after which the server's
maximum memory consumption is adjusted to the threshold value.
See settings `cgroups_memory_usage_observer_wait_time` and `cgroup_memory_watcher_soft_limit_ratio`.
Type: Double
Default: 0.95
## cgroup_memory_watcher_soft_limit_ratio
Specifies the "soft" threshold with regard to the memory consumption of the server process according to cgroups, after which arenas in
jemalloc are purged.
See settings `cgroups_memory_usage_observer_wait_time` and `cgroup_memory_watcher_hard_limit_ratio`.
Type: Double
Default: 0.95
## max_table_size_to_drop
Restriction on deleting tables.
@ -472,10 +504,10 @@ The value 0 means that you can delete all tables without any restrictions.
``` xml
<max_table_size_to_drop>0</max_table_size_to_drop>
```
## max\_database\_num\_to\_warn {#max-database-num-to-warn}
If the number of attached databases exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table.
## max\_database\_num\_to\_warn {#max-database-num-to-warn}
If the number of attached databases exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table.
Default value: 1000
**Example**
@ -483,10 +515,10 @@ Default value: 1000
``` xml
<max_database_num_to_warn>50</max_database_num_to_warn>
```
## max\_table\_num\_to\_warn {#max-table-num-to-warn}
If the number of attached tables exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table.
Default value: 5000
## max\_table\_num\_to\_warn {#max-table-num-to-warn}
If the number of attached tables exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table.
Default value: 5000
**Example**
@ -495,9 +527,9 @@ Default value: 5000
```
## max\_part\_num\_to\_warn {#max-part-num-to-warn}
If the number of active parts exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table.
Default value: 100000
## max\_part\_num\_to\_warn {#max-part-num-to-warn}
If the number of active parts exceeds the specified value, clickhouse server will add warning messages to `system.warnings` table.
Default value: 100000
**Example**
@ -2873,3 +2905,11 @@ A limit on the number of materialized views attached to a table.
Note that only directly dependent views are considered here, and the creation of one view on top of another view is not considered.
Default value: `0`.
## format_alter_operations_with_parentheses {#format_alter_operations_with_parentheses}
If set to true, then alter operations will be surrounded by parentheses in formatted queries. This makes the parsing of formatted alter queries less ambiguous.
Type: Bool
Default: 0

View File

@ -755,7 +755,7 @@ By default: 1,000,000. It only works when reading from MergeTree engines.
## max_concurrent_queries_for_user {#max-concurrent-queries-for-user}
The maximum number of simultaneously processed queries related to MergeTree table per user.
The maximum number of simultaneously processed queries per user.
Possible values:
@ -4281,7 +4281,7 @@ Result:
## enable_order_by_all {#enable-order-by-all}
Enables or disables sorting by `ALL` columns, i.e. [ORDER BY](../../sql-reference/statements/select/order-by.md)
Enables or disables sorting with `ORDER BY ALL` syntax, see [ORDER BY](../../sql-reference/statements/select/order-by.md).
Possible values:
@ -4301,7 +4301,7 @@ INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20);
SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous
SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all;
SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all = 0;
```
Result:

View File

@ -0,0 +1,32 @@
---
slug: /en/operations/system-tables/settings_changes
---
# settings_changes
Contains information about setting changes in previous ClickHouse versions.
Columns:
- `version` ([String](../../sql-reference/data-types/string.md)) — The ClickHouse version in which settings were changed
- `changes` ([Array](../../sql-reference/data-types/array.md) of [Tuple](../../sql-reference/data-types/tuple.md)) — A description of the setting changes: (setting name, previous value, new value, reason for the change)
**Example**
``` sql
SELECT *
FROM system.settings_changes
WHERE version = '23.5'
FORMAT Vertical
```
``` text
Row 1:
──────
version: 23.5
changes: [('input_format_parquet_preserve_order','1','0','Allow Parquet reader to reorder rows for better parallelism.'),('parallelize_output_from_storages','0','1','Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows.'),('use_with_fill_by_sorting_prefix','0','1','Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently'),('output_format_parquet_compliant_nested_types','0','1','Change an internal field name in output Parquet file schema.')]
```
**See also**
- [Settings](../../operations/settings/index.md#session-settings-intro)
- [system.settings](settings.md)

View File

@ -13,8 +13,8 @@ simpleLinearRegression(x, y)
Parameters:
- `x` — Column with dependent variable values.
- `y` — Column with explanatory variable values.
- `x` — Column with explanatory variable values.
- `y` — Column with dependent variable values.
Returned values:
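For reference, the constants `(k, b)` of the fitted line `y = k*x + b` are the usual least-squares estimates, with `x` as the explanatory and `y` as the dependent variable:

$$
k = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}, \qquad b = \bar{y} - k\,\bar{x}
$$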

View File

@ -509,7 +509,7 @@ Result:
## cosineDistance
Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The less the returned value is, the more similar are the vectors.
Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The smaller the returned value is, the more similar are the vectors.
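For reference, treating the two tuples as vectors `a` and `b`, the value follows the usual definition of cosine distance (one minus cosine similarity), which is why smaller values mean more similar vectors:

$$
\operatorname{cosineDistance}(a, b) = 1 - \frac{\sum_i a_i b_i}{\sqrt{\sum_i a_i^2}\,\sqrt{\sum_i b_i^2}}
$$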
**Syntax**

View File

@ -542,7 +542,7 @@ Alias: `scalarProduct`.
- Scalar product.
Type: [Int/UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
Type: [Int/UInt](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md).
**Example**

View File

@ -9,6 +9,7 @@ The following operations with [partitions](/docs/en/engines/table-engines/merget
- [DETACH PARTITION\|PART](#detach-partitionpart) — Moves a partition or part to the `detached` directory and forget it.
- [DROP PARTITION\|PART](#drop-partitionpart) — Deletes a partition or part.
- [FORGET PARTITION](#forget-partition) — Deletes a partition metadata from zookeeper if it's empty.
- [ATTACH PARTITION\|PART](#attach-partitionpart) — Adds a partition or part from the `detached` directory to the table.
- [ATTACH PARTITION FROM](#attach-partition-from) — Copies the data partition from one table to another and adds.
- [REPLACE PARTITION](#replace-partition) — Copies the data partition from one table to another and replaces.
@ -73,6 +74,22 @@ ALTER TABLE table_name [ON CLUSTER cluster] DROP DETACHED PARTITION|PART partiti
Removes the specified part or all parts of the specified partition from `detached`.
Read more about setting the partition expression in a section [How to set the partition expression](#how-to-set-partition-expression).
## FORGET PARTITION
``` sql
ALTER TABLE table_name FORGET PARTITION partition_expr
```
Removes all metadata about an empty partition from ZooKeeper. The query fails if the partition is not empty or is unknown. Make sure to execute it only for partitions that will never be used again.
Read about setting the partition expression in a section [How to set the partition expression](#how-to-set-partition-expression).
Example:
``` sql
ALTER TABLE mt FORGET PARTITION '20201121';
```
## ATTACH PARTITION\|PART
``` sql

View File

@ -11,11 +11,11 @@ Creates a temporary [Merge](../../engines/table-engines/special/merge.md) table.
**Syntax**
```sql
merge('db_name', 'tables_regexp')
merge(['db_name',] 'tables_regexp')
```
**Arguments**
- `db_name` — Possible values:
- `db_name` — Possible values (optional, default is `currentDatabase()`):
- database name,
- constant expression that returns a string with a database name, for example, `currentDatabase()`,
- `REGEXP(expression)`, where `expression` is a regular expression to match the DB names.

View File

@ -2,7 +2,6 @@
#include <cstdlib>
#include <csignal>
#include <iostream>
#include <fstream>
#include <iomanip>
#include <optional>
#include <random>

View File

@ -330,6 +330,7 @@ try
processConfig();
adjustSettings();
initTTYBuffer(toProgressOption(config().getString("progress", "default")));
ASTAlterCommand::setFormatAlterCommandsWithParentheses(true);
{
// All that just to set DB::CurrentThread::get().getGlobalContext()

View File

@ -91,8 +91,8 @@ static std::vector<std::string> extractFromConfig(
zkutil::validateZooKeeperConfig(*bootstrap_configuration);
zkutil::ZooKeeperPtr zookeeper = std::make_shared<zkutil::ZooKeeper>(
*bootstrap_configuration, bootstrap_configuration->has("zookeeper") ? "zookeeper" : "keeper", nullptr);
zkutil::ZooKeeperPtr zookeeper = zkutil::ZooKeeper::createWithoutKillingPreviousSessions(
*bootstrap_configuration, bootstrap_configuration->has("zookeeper") ? "zookeeper" : "keeper");
zkutil::ZooKeeperNodeCache zk_node_cache([&] { return zookeeper; });
config_xml = processor.processConfig(&has_zk_includes, &zk_node_cache);

View File

@ -400,7 +400,7 @@ int KeeperClient::main(const std::vector<String> & /* args */)
zk_args.connection_timeout_ms = config().getInt("connection-timeout", 10) * 1000;
zk_args.session_timeout_ms = config().getInt("session-timeout", 10) * 1000;
zk_args.operation_timeout_ms = config().getInt("operation-timeout", 10) * 1000;
zookeeper = std::make_unique<zkutil::ZooKeeper>(zk_args);
zookeeper = zkutil::ZooKeeper::createWithoutKillingPreviousSessions(zk_args);
if (config().has("no-confirmation") || config().has("query"))
ask_confirmation = false;

View File

@ -1,6 +1,7 @@
#include <iostream>
#include <boost/program_options.hpp>
#include <Coordination/CoordinationSettings.h>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/ZooKeeperDataReader.h>
#include <Common/TerminalSize.h>
@ -39,7 +40,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
try
{
auto keeper_context = std::make_shared<KeeperContext>(true);
auto keeper_context = std::make_shared<KeeperContext>(true, std::make_shared<CoordinationSettings>());
keeper_context->setDigestEnabled(true);
keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("Keeper-snapshots", options["output-dir"].as<std::string>()));

View File

@ -41,7 +41,7 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperConstants.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperCommon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/WriteBufferFromNuraftBuffer.cpp

View File

@ -560,7 +560,7 @@ try
auto main_config_reloader = std::make_unique<ConfigReloader>(
config_path,
extra_paths,
config().getString("path", ""),
config().getString("path", KEEPER_DEFAULT_PATH),
std::move(unused_cache),
unused_event,
[&](ConfigurationPtr config, bool /* initial_loading */)

View File

@ -506,6 +506,7 @@ try
processConfig();
adjustSettings();
initTTYBuffer(toProgressOption(config().getString("progress", "default")));
ASTAlterCommand::setFormatAlterCommandsWithParentheses(true);
applyCmdSettings(global_context);

View File

@ -24,6 +24,7 @@
#include <Common/MemoryTracker.h>
#include <Common/ClickHouseRevision.h>
#include <Common/DNSResolver.h>
#include <Common/CgroupsMemoryUsageObserver.h>
#include <Common/CurrentMetrics.h>
#include <Common/ConcurrencyControl.h>
#include <Common/Macros.h>
@ -623,6 +624,8 @@ try
ServerSettings server_settings;
server_settings.loadSettingsFromConfig(config());
ASTAlterCommand::setFormatAlterCommandsWithParentheses(server_settings.format_alter_operations_with_parentheses);
StackTrace::setShowAddresses(server_settings.show_addresses_in_stack_traces);
#if USE_HDFS
@ -1280,6 +1283,18 @@ try
SensitiveDataMasker::setInstance(std::make_unique<SensitiveDataMasker>(config(), "query_masking_rules"));
}
std::optional<CgroupsMemoryUsageObserver> cgroups_memory_usage_observer;
try
{
UInt64 wait_time = server_settings.cgroups_memory_usage_observer_wait_time;
if (wait_time != 0)
cgroups_memory_usage_observer.emplace(std::chrono::seconds(wait_time));
}
catch (Exception &)
{
tryLogCurrentException(log, "Disabling cgroup memory observer because of an error during initialization");
}
const std::string cert_path = config().getString("openSSL.server.certificateFile", "");
const std::string key_path = config().getString("openSSL.server.privateKeyFile", "");
@ -1292,7 +1307,7 @@ try
auto main_config_reloader = std::make_unique<ConfigReloader>(
config_path,
extra_paths,
config().getString("path", ""),
config().getString("path", DBMS_DEFAULT_PATH),
std::move(main_config_zk_node_cache),
main_config_zk_changed_event,
[&](ConfigurationPtr config, bool initial_loading)
@ -1333,6 +1348,15 @@ try
total_memory_tracker.setDescription("(total)");
total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking);
if (cgroups_memory_usage_observer)
{
double hard_limit_ratio = new_server_settings.cgroup_memory_watcher_hard_limit_ratio;
double soft_limit_ratio = new_server_settings.cgroup_memory_watcher_soft_limit_ratio;
cgroups_memory_usage_observer->setLimits(
static_cast<uint64_t>(max_server_memory_usage * hard_limit_ratio),
static_cast<uint64_t>(max_server_memory_usage * soft_limit_ratio));
}
size_t merges_mutations_memory_usage_soft_limit = new_server_settings.merges_mutations_memory_usage_soft_limit;
size_t default_merges_mutations_server_memory_usage = static_cast<size_t>(current_physical_server_memory * new_server_settings.merges_mutations_memory_usage_to_ram_ratio);
@ -1391,7 +1415,7 @@ try
global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn);
global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn);
ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited;
SlotCount concurrent_threads_soft_limit = UnlimitedSlots;
if (new_server_settings.concurrent_threads_soft_limit_num > 0 && new_server_settings.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit)
concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num;
if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0)

View File

@ -1569,6 +1569,11 @@
<backups>
<allowed_path>backups</allowed_path>
<!-- If the BACKUP command fails and this setting is true then the files
copied before the failure will be removed automatically.
-->
<remove_backup_files_after_failure>true</remove_backup_files_after_failure>
</backups>
<!-- This allows to disable exposing addresses in stack traces for security reasons.

View File

@ -219,7 +219,7 @@ public:
: IAggregateFunctionDataHelper<AggregateFunctionCountData, AggregateFunctionCountNotNullUnary>({argument}, params, createResultType())
{
if (!argument->isNullable())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not Nullable data type passed to AggregateFunctionCountNotNullUnary");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: not Nullable data type passed to AggregateFunctionCountNotNullUnary");
}
String getName() const override { return "count"; }

View File

@ -100,7 +100,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
{
AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix("Null");
if (!combinator)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find aggregate function combinator "
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: cannot find aggregate function combinator "
"to apply a function to Nullable arguments.");
DataTypes nested_types = combinator->transformArguments(types_without_low_cardinality);
@ -123,7 +123,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
auto with_original_arguments = getImpl(name, action, types_without_low_cardinality, parameters, out_properties, false);
if (!with_original_arguments)
throw Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionFactory returned nullptr");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: AggregateFunctionFactory returned nullptr");
return with_original_arguments;
}

View File

@ -146,9 +146,7 @@ struct AggregateFunctionSumData
size_t count = end - start;
const auto * end_ptr = ptr + count;
if constexpr (
(is_integer<T> && !is_big_int_v<T>)
|| (is_decimal<T> && !std::is_same_v<T, Decimal256> && !std::is_same_v<T, Decimal128>))
if constexpr ((is_integer<T> || is_decimal<T>) && !is_over_big_int<T>)
{
/// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
/// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
@ -163,8 +161,39 @@ struct AggregateFunctionSumData
Impl::add(sum, local_sum);
return;
}
else if constexpr (is_over_big_int<T>)
{
/// Use a mask to discard or keep the value to reduce branch miss.
/// Notice that for (U)Int128 or Decimal128, MaskType is Int8 instead of Int64, otherwise extra branches will be introduced by compiler (for unknown reason) and performance will be worse.
using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int64>;
alignas(64) const MaskType masks[2] = {0, -1};
T local_sum{};
while (ptr < end_ptr)
{
Value v = *ptr;
if constexpr (!add_if_zero)
{
if constexpr (is_integer<T>)
v &= masks[!!*condition_map];
else
v.value &= masks[!!*condition_map];
}
else
{
if constexpr (is_integer<T>)
v &= masks[!*condition_map];
else
v.value &= masks[!*condition_map];
}
Impl::add(local_sum, v);
++ptr;
++condition_map;
}
Impl::add(sum, local_sum);
return;
}
else if constexpr (std::is_floating_point_v<T>)
{
/// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
/// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
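As a standalone illustration of the branchless masking idea used in the big-integer branch above (plain `int64_t` instead of the wide types; the function name is illustrative):

```cpp
#include <cstddef>
#include <cstdint>

/// Branchless conditional sum: instead of "if (condition_map[i]) sum += values[i];",
/// AND each value with a mask that is all-ones when the element should be kept and
/// zero when it should be discarded, removing the data-dependent branch from the loop.
int64_t sumIfBranchless(const int64_t * values, const uint8_t * condition_map, size_t n)
{
    alignas(64) static constexpr int64_t masks[2] = {0, -1};   /// -1 is 0xFF...FF in two's complement
    int64_t sum = 0;
    for (size_t i = 0; i < n; ++i)
        sum += values[i] & masks[!!condition_map[i]];          /// keep when the condition byte is non-zero
    return sum;
}
```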

View File

@ -249,7 +249,7 @@ public:
: Base(std::move(nested_function_), arguments, params), number_of_arguments(arguments.size())
{
if (number_of_arguments == 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Single argument is passed to AggregateFunctionIfNullVariadic");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: single argument is passed to AggregateFunctionIfNullVariadic");
if (number_of_arguments > MAX_ARGS)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,

View File

@ -429,7 +429,7 @@ public:
, number_of_arguments(arguments.size())
{
if (number_of_arguments == 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Single argument is passed to AggregateFunctionNullVariadic");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: single argument is passed to AggregateFunctionNullVariadic");
if (number_of_arguments > MAX_ARGS)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,

View File

@ -1,6 +1,7 @@
#include <Analyzer/Passes/ArrayExistsToHasPass.h>
#include <Functions/FunctionFactory.h>
#include <Functions/array/has.h>
#include <Interpreters/Context.h>
@ -83,7 +84,8 @@ public:
return;
}
auto has_function = FunctionFactory::instance().get("has", getContext());
auto has_function = createInternalFunctionHasOverloadResolver();
array_exists_function_arguments_nodes[0] = std::move(array_exists_function_arguments_nodes[1]);
array_exists_function_arguments_nodes[1] = std::move(has_constant_element_argument);
array_exists_function_node->resolveAsFunction(has_function->build(array_exists_function_node->getArgumentColumns()));

View File

@ -10,6 +10,7 @@
#include <IO/Operators.h>
#include <Functions/FunctionFactory.h>
#include <Functions/logical.h>
#include <Common/checkStackSize.h>
@ -79,7 +80,7 @@ public:
if (name == "and" || name == "or")
{
auto function_resolver = FunctionFactory::instance().get(name, current_context);
auto function_resolver = name == "and" ? createInternalFunctionAndOverloadResolver() : createInternalFunctionOrOverloadResolver();
const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() > 2)
@ -110,10 +111,10 @@ private:
class PushNotVisitor
{
public:
explicit PushNotVisitor(const ContextPtr & context)
: not_function_resolver(FunctionFactory::instance().get("not", context))
, or_function_resolver(FunctionFactory::instance().get("or", context))
, and_function_resolver(FunctionFactory::instance().get("and", context))
explicit PushNotVisitor()
: not_function_resolver(createInternalFunctionNotOverloadResolver())
, or_function_resolver(createInternalFunctionOrOverloadResolver())
, and_function_resolver(createInternalFunctionAndOverloadResolver())
{}
void visit(QueryTreeNodePtr & node, bool add_negation)
@ -162,10 +163,10 @@ private:
class PushOrVisitor
{
public:
PushOrVisitor(ContextPtr context, size_t max_atoms_)
explicit PushOrVisitor(size_t max_atoms_)
: max_atoms(max_atoms_)
, and_resolver(FunctionFactory::instance().get("and", context))
, or_resolver(FunctionFactory::instance().get("or", context))
, and_resolver(createInternalFunctionAndOverloadResolver())
, or_resolver(createInternalFunctionOrOverloadResolver())
{}
bool visit(QueryTreeNodePtr & node, size_t num_atoms)
@ -513,11 +514,11 @@ std::optional<CNF> CNF::tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr co
}
{
PushNotVisitor visitor(context);
PushNotVisitor visitor;
visitor.visit(node_cloned, false);
}
if (PushOrVisitor visitor(context, max_atoms);
if (PushOrVisitor visitor(max_atoms);
!visitor.visit(node_cloned, atom_count))
return std::nullopt;
@ -542,7 +543,7 @@ CNF CNF::toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_gro
return *cnf;
}
QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const
QueryTreeNodePtr CNF::toQueryTree() const
{
if (statements.empty())
return nullptr;
@ -550,9 +551,9 @@ QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const
QueryTreeNodes and_arguments;
and_arguments.reserve(statements.size());
auto not_resolver = FunctionFactory::instance().get("not", context);
auto or_resolver = FunctionFactory::instance().get("or", context);
auto and_resolver = FunctionFactory::instance().get("and", context);
auto not_resolver = createInternalFunctionNotOverloadResolver();
auto or_resolver = createInternalFunctionOrOverloadResolver();
auto and_resolver = createInternalFunctionAndOverloadResolver();
const auto function_node_from_atom = [&](const auto & atom) -> QueryTreeNodePtr
{

View File

@ -54,7 +54,7 @@ public:
static std::optional<CNF> tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
static CNF toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
QueryTreeNodePtr toQueryTree(ContextPtr context) const;
QueryTreeNodePtr toQueryTree() const;
const auto & getStatements() const
{

View File

@ -11,6 +11,8 @@
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/multiMatchAny.h>
#include <Functions/logical.h>
#include <Interpreters/Context.h>
@ -134,8 +136,10 @@ private:
void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{
auto or_function_resolver = FunctionFactory::instance().get("or", context);
auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context);
const auto & settings = context->getSettingsRef();
auto match_function_resolver = createInternalMultiMatchAnyOverloadResolver(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps);
auto or_function_resolver = createInternalFunctionOrOverloadResolver();
ConvertOrLikeChainVisitor visitor(std::move(or_function_resolver), std::move(match_function_resolver), std::move(context));
visitor.visit(query_tree_node);
}

View File

@ -339,7 +339,7 @@ void addIndexConstraint(Analyzer::CNF & cnf, const QueryTreeNodes & table_expres
{
Analyzer::CNF::OrGroup new_group;
auto index_hint_node = std::make_shared<FunctionNode>("indexHint");
index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree(context));
index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree());
index_hint_node->resolveAsFunction(FunctionFactory::instance().get("indexHint", context));
new_group.insert({false, QueryTreeNodePtrWithHash{std::move(index_hint_node)}});
@ -676,7 +676,7 @@ void optimizeNode(QueryTreeNodePtr & node, const QueryTreeNodes & table_expressi
if (settings.optimize_using_constraints)
optimizeWithConstraints(*cnf, table_expressions, context);
auto new_node = cnf->toQueryTree(context);
auto new_node = cnf->toQueryTree();
node = std::move(new_node);
}

View File

@ -12,6 +12,7 @@
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <Functions/logical.h>
#include <Common/logger_useful.h>
@ -256,7 +257,7 @@ private:
for (const auto & node : nodes)
function_node->getArguments().getNodes().push_back(node);
const auto & function = FunctionFactory::instance().get("and", getContext());
const auto & function = createInternalFunctionAndOverloadResolver();
function_node->resolveAsFunction(function->build(function_node->getArgumentColumns()));
return function_node;
}

View File

@ -5,6 +5,7 @@
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/FunctionNode.h>
#include <Functions/FunctionFactory.h>
#include <Functions/multiIf.h>
namespace DB
{
@ -75,7 +76,8 @@ private:
void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{
auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context);
const auto & settings = context->getSettingsRef();
auto multi_if_function_ptr = createInternalMultiIfOverloadResolver(settings.allow_execute_multiif_columnar, settings.allow_experimental_variant_type, settings.use_variant_as_common_type);
IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context));
visitor.visit(query_tree_node);
}

View File

@ -3,6 +3,7 @@
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/FunctionNode.h>
#include <Functions/FunctionFactory.h>
#include <Functions/if.h>
namespace DB
{
@ -54,7 +55,8 @@ private:
void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{
auto if_function_ptr = FunctionFactory::instance().get("if", context);
const auto & settings = context->getSettingsRef();
auto if_function_ptr = createInternalFunctionIfOverloadResolver(settings.allow_experimental_variant_type, settings.use_variant_as_common_type);
MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context));
visitor.visit(query_tree_node);
}

View File

@ -80,6 +80,8 @@
#include <Analyzer/IQueryTreeNode.h>
#include <Analyzer/Identifier.h>
#include <boost/algorithm/string.hpp>
namespace ProfileEvents
{
extern const Event ScalarSubqueriesGlobalCacheHit;
@ -2382,6 +2384,13 @@ void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Se
for (auto & node : projection_nodes)
{
/// Detect and reject ambiguous statements:
/// E.g. for a table with columns "all", "a", "b":
/// - SELECT all, a, b ORDER BY all; -- should we sort by all columns in SELECT or by column "all"?
/// - SELECT a, b AS all ORDER BY all; -- like before but "all" as alias
/// - SELECT func(...) AS all ORDER BY all; -- like before but "all" as function
/// - SELECT a, b ORDER BY all; -- tricky in other way: does the user want to sort by columns in SELECT clause or by not SELECTed column "all"?
auto resolved_expression_it = resolved_expressions.find(node);
if (resolved_expression_it != resolved_expressions.end())
{
@ -2390,7 +2399,7 @@ void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Se
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expression nodes list expected 1 projection names. Actual {}",
projection_names.size());
if (Poco::toUpper(projection_names[0]) == "ALL")
if (boost::iequals(projection_names[0], "all"))
throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION,
"Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again");
}
@ -5122,6 +5131,15 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
true /*allow_lambda_expression*/,
allow_table_expressions /*allow_table_expression*/);
if (function_node_ptr->toAST()->hasSecretParts())
{
for (auto & argument : arguments_projection_names)
{
SipHash hash;
hash.update(argument);
argument = getHexUIntLowercase(hash.get128());
}
}
auto & function_node = *function_node_ptr;
/// Replace right IN function argument if it is table or table function with subquery that read ordinary columns

View File

@ -2,7 +2,7 @@
#if USE_AZURE_BLOB_STORAGE
#include <Common/quoteString.h>
#include <Interpreters/threadPoolCallbackRunner.h>
#include <Common/threadPoolCallbackRunner.h>
#include <Interpreters/Context.h>
#include <IO/SharedThreadPools.h>
#include <IO/HTTPHeaderEntries.h>

View File

@ -2,7 +2,7 @@
#if USE_AWS_S3
#include <Common/quoteString.h>
#include <Interpreters/threadPoolCallbackRunner.h>
#include <Common/threadPoolCallbackRunner.h>
#include <Interpreters/Context.h>
#include <IO/SharedThreadPools.h>
#include <IO/ReadBufferFromS3.h>
@ -127,7 +127,7 @@ BackupReaderS3::BackupReaderS3(
: BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3"))
, s3_uri(s3_uri_)
, data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
, s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()))
, s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName()))
{
auto & request_settings = s3_settings.request_settings;
request_settings.updateFromSettings(context_->getSettingsRef());
@ -217,7 +217,7 @@ BackupWriterS3::BackupWriterS3(
: BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3"))
, s3_uri(s3_uri_)
, data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
, s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()))
, s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName()))
{
auto & request_settings = s3_settings.request_settings;
request_settings.updateFromSettings(context_->getSettingsRef());

View File

@ -144,6 +144,13 @@ BackupImpl::BackupImpl(
BackupImpl::~BackupImpl()
{
if ((open_mode == OpenMode::WRITE) && !is_internal_backup && !writing_finalized && !std::uncaught_exceptions() && !std::current_exception())
{
/// It is suspicious to destroy BackupImpl without finalization while writing a backup when there is no exception.
LOG_ERROR(log, "BackupImpl is not finalized when destructor is called. Stack trace: {}", StackTrace().toString());
chassert(false && "BackupImpl is not finalized when destructor is called.");
}
try
{
close();
@ -195,10 +202,6 @@ void BackupImpl::close()
{
std::lock_guard lock{mutex};
closeArchive(/* finalize= */ false);
if (!is_internal_backup && writer && !writing_finalized)
removeAllFilesAfterFailure();
writer.reset();
reader.reset();
coordination.reset();
@ -1005,14 +1008,18 @@ void BackupImpl::setCompressedSize()
}
void BackupImpl::removeAllFilesAfterFailure()
void BackupImpl::tryRemoveAllFiles()
{
if (open_mode != OpenMode::WRITE)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");
if (is_internal_backup)
return; /// Let the initiator remove unnecessary files.
return;
try
{
LOG_INFO(log, "Removing all files of backup {} after failure", backup_name_for_logging);
LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
closeArchive(/* finalize= */ false);
Strings files_to_remove;
if (use_archive)

View File

@ -81,8 +81,9 @@ public:
size_t copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const override;
size_t copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const override;
void writeFile(const BackupFileInfo & info, BackupEntryPtr entry) override;
void finalizeWriting() override;
bool supportsWritingInMultipleThreads() const override { return !use_archive; }
void finalizeWriting() override;
void tryRemoveAllFiles() override;
private:
void open();
@ -107,8 +108,6 @@ private:
bool checkLockFile(bool throw_if_failed) const;
void removeLockFile();
void removeAllFilesAfterFailure();
/// Calculates and sets `compressed_size`.
void setCompressedSize();

View File

@ -20,6 +20,9 @@ struct BackupOperationInfo
/// Base Backup Operation name, a string like "Disk('backups', 'my_base_backup')"
String base_backup_name;
/// Query ID of a query that started backup
String query_id;
/// This operation is internal and should not be shown in system.backups
bool internal = false;

View File

@ -375,11 +375,12 @@ private:
};
BackupsWorker::BackupsWorker(ContextMutablePtr global_context, size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_, bool test_inject_sleep_)
BackupsWorker::BackupsWorker(ContextMutablePtr global_context, size_t num_backup_threads, size_t num_restore_threads)
: thread_pools(std::make_unique<ThreadPools>(num_backup_threads, num_restore_threads))
, allow_concurrent_backups(allow_concurrent_backups_)
, allow_concurrent_restores(allow_concurrent_restores_)
, test_inject_sleep(test_inject_sleep_)
, allow_concurrent_backups(global_context->getConfigRef().getBool("backups.allow_concurrent_backups", true))
, allow_concurrent_restores(global_context->getConfigRef().getBool("backups.allow_concurrent_restores", true))
, remove_backup_files_after_failure(global_context->getConfigRef().getBool("backups.remove_backup_files_after_failure", true))
, test_inject_sleep(global_context->getConfigRef().getBool("backups.test_inject_sleep", false))
, log(getLogger("BackupsWorker"))
, backup_log(global_context->getBackupLog())
, process_list(global_context->getProcessList())
@ -411,6 +412,9 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
auto backup_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
auto backup_settings = BackupSettings::fromBackupQuery(*backup_query);
auto backup_info = BackupInfo::fromAST(*backup_query->backup_name);
String backup_name_for_logging = backup_info.toStringForLogging();
if (!backup_settings.backup_uuid)
backup_settings.backup_uuid = UUIDHelpers::generateV4();
@ -424,23 +428,43 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
backup_id = toString(*backup_settings.backup_uuid);
std::shared_ptr<IBackupCoordination> backup_coordination;
if (backup_settings.internal)
{
/// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination
/// if it's not created here. However to handle errors better it's better to make a coordination here because this way
/// if an exception will be thrown in startMakingBackup() other hosts will know about that.
backup_coordination = makeBackupCoordination(context, backup_settings, /* remote= */ true);
}
BackupMutablePtr backup;
auto backup_info = BackupInfo::fromAST(*backup_query->backup_name);
String backup_name_for_logging = backup_info.toStringForLogging();
String base_backup_name;
if (backup_settings.base_backup_info)
base_backup_name = backup_settings.base_backup_info->toStringForLogging();
/// Called in exception handlers below. This lambda function can be called on a separate thread, so it can't capture local variables by reference.
auto on_exception = [this](BackupMutablePtr & backup_, const OperationID & backup_id_, const String & backup_name_for_logging_,
const BackupSettings & backup_settings_, const std::shared_ptr<IBackupCoordination> & backup_coordination_)
{
/// Something bad happened, the backup has not been built.
tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings_.internal ? "internal backup" : "backup"), backup_name_for_logging_));
setStatusSafe(backup_id_, getBackupStatusFromCurrentException());
sendCurrentExceptionToCoordination(backup_coordination_);
if (backup_ && remove_backup_files_after_failure)
backup_->tryRemoveAllFiles();
backup_.reset();
};
try
{
addInfo(backup_id, backup_name_for_logging, base_backup_name, backup_settings.internal, context->getProcessListElement(), BackupStatus::CREATING_BACKUP);
String base_backup_name;
if (backup_settings.base_backup_info)
base_backup_name = backup_settings.base_backup_info->toStringForLogging();
addInfo(backup_id,
backup_name_for_logging,
base_backup_name,
context->getCurrentQueryId(),
backup_settings.internal,
context->getProcessListElement(),
BackupStatus::CREATING_BACKUP);
if (backup_settings.internal)
{
/// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination
/// if it's not created here. However, it's better to create the coordination here so that, if an exception
/// is thrown in startMakingBackup(), the other hosts will know about it.
backup_coordination = makeBackupCoordination(context, backup_settings, /* remote= */ true);
}
/// Prepare context to use.
ContextPtr context_in_use = context;
@ -462,7 +486,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
/// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously.
auto process_list_element = context_in_use->getProcessListElement();
thread_pool.scheduleOrThrowOnError(
scheduleFromThreadPool<void>(
[this,
backup_query,
backup_id,
@ -472,25 +496,34 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
backup_coordination,
context_in_use,
mutable_context,
thread_group = CurrentThread::getGroup(),
on_exception,
process_list_element_holder = process_list_element ? process_list_element->getProcessListEntry() : nullptr]
{
doBackup(
backup_query,
backup_id,
backup_name_for_logging,
backup_info,
backup_settings,
backup_coordination,
context_in_use,
mutable_context,
thread_group,
/* called_async= */ true);
});
BackupMutablePtr backup_async;
try
{
doBackup(
backup_async,
backup_query,
backup_id,
backup_name_for_logging,
backup_info,
backup_settings,
backup_coordination,
context_in_use,
mutable_context);
}
catch (...)
{
on_exception(backup_async, backup_id, backup_name_for_logging, backup_settings, backup_coordination);
}
},
thread_pool, "BackupWorker");
}
else
{
doBackup(
backup,
backup_query,
backup_id,
backup_name_for_logging,
@ -498,25 +531,21 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
backup_settings,
backup_coordination,
context_in_use,
mutable_context,
nullptr,
/* called_async= */ false);
mutable_context);
}
return backup_id;
}
catch (...)
{
tryLogCurrentException(log, fmt::format("Failed to start {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_name_for_logging));
/// Something bad happened, the backup has not built.
setStatusSafe(backup_id, getBackupStatusFromCurrentException());
sendCurrentExceptionToCoordination(backup_coordination);
on_exception(backup, backup_id, backup_name_for_logging, backup_settings, backup_coordination);
throw;
}
}
void BackupsWorker::doBackup(
BackupMutablePtr & backup,
const std::shared_ptr<ASTBackupQuery> & backup_query,
const OperationID & backup_id,
const String & backup_name_for_logging,
@ -524,147 +553,120 @@ void BackupsWorker::doBackup(
BackupSettings backup_settings,
std::shared_ptr<IBackupCoordination> backup_coordination,
const ContextPtr & context,
ContextMutablePtr mutable_context,
ThreadGroupPtr thread_group,
bool called_async)
ContextMutablePtr mutable_context)
{
SCOPE_EXIT_SAFE(
if (called_async && thread_group)
CurrentThread::detachFromGroupIfNotDetached();
);
bool on_cluster = !backup_query->cluster.empty();
assert(!on_cluster || mutable_context);
try
/// Checks access rights if this is not an ON CLUSTER query.
/// (If this is an ON CLUSTER query, executeDDLQueryOnCluster() will check access rights later.)
auto required_access = getRequiredAccessToBackup(backup_query->elements);
if (!on_cluster)
context->checkAccess(required_access);
ClusterPtr cluster;
if (on_cluster)
{
if (called_async && thread_group)
CurrentThread::attachToGroup(thread_group);
if (called_async)
setThreadName("BackupWorker");
bool on_cluster = !backup_query->cluster.empty();
assert(mutable_context || (!on_cluster && !called_async));
/// Checks access rights if this is not an ON CLUSTER query.
/// (If this is an ON CLUSTER query, executeDDLQueryOnCluster() will check access rights later.)
auto required_access = getRequiredAccessToBackup(backup_query->elements);
if (!on_cluster)
context->checkAccess(required_access);
ClusterPtr cluster;
if (on_cluster)
{
backup_query->cluster = context->getMacros()->expand(backup_query->cluster);
cluster = context->getCluster(backup_query->cluster);
backup_settings.cluster_host_ids = cluster->getHostIDs();
}
/// Make a backup coordination.
if (!backup_coordination)
backup_coordination = makeBackupCoordination(context, backup_settings, /* remote= */ on_cluster);
if (!allow_concurrent_backups && backup_coordination->hasConcurrentBackups(std::ref(num_active_backups)))
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
/// Opens a backup for writing.
BackupFactory::CreateParams backup_create_params;
backup_create_params.open_mode = IBackup::OpenMode::WRITE;
backup_create_params.context = context;
backup_create_params.backup_info = backup_info;
backup_create_params.base_backup_info = backup_settings.base_backup_info;
backup_create_params.compression_method = backup_settings.compression_method;
backup_create_params.compression_level = backup_settings.compression_level;
backup_create_params.password = backup_settings.password;
backup_create_params.s3_storage_class = backup_settings.s3_storage_class;
backup_create_params.is_internal_backup = backup_settings.internal;
backup_create_params.backup_coordination = backup_coordination;
backup_create_params.backup_uuid = backup_settings.backup_uuid;
backup_create_params.deduplicate_files = backup_settings.deduplicate_files;
backup_create_params.allow_s3_native_copy = backup_settings.allow_s3_native_copy;
backup_create_params.use_same_s3_credentials_for_base_backup = backup_settings.use_same_s3_credentials_for_base_backup;
backup_create_params.read_settings = getReadSettingsForBackup(context, backup_settings);
backup_create_params.write_settings = getWriteSettingsForBackup(context);
BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params);
/// Write the backup.
if (on_cluster)
{
DDLQueryOnClusterParams params;
params.cluster = cluster;
params.only_shard_num = backup_settings.shard_num;
params.only_replica_num = backup_settings.replica_num;
params.access_to_check = required_access;
backup_settings.copySettingsToQuery(*backup_query);
// executeDDLQueryOnCluster() will return without waiting for completion
mutable_context->setSetting("distributed_ddl_task_timeout", Field{0});
mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"});
executeDDLQueryOnCluster(backup_query, mutable_context, params);
/// Wait until all the hosts have written their backup entries.
backup_coordination->waitForStage(Stage::COMPLETED);
backup_coordination->setStage(Stage::COMPLETED,"");
}
else
{
backup_query->setCurrentDatabase(context->getCurrentDatabase());
/// Prepare backup entries.
BackupEntries backup_entries;
{
BackupEntriesCollector backup_entries_collector(
backup_query->elements, backup_settings, backup_coordination,
backup_create_params.read_settings, context, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST));
backup_entries = backup_entries_collector.run();
}
/// Write the backup entries to the backup.
buildFileInfosForBackupEntries(backup, backup_entries, backup_create_params.read_settings, backup_coordination, context->getProcessListElement());
writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal, context->getProcessListElement());
/// We have written our backup entries, we need to tell other hosts (they could be waiting for it).
backup_coordination->setStage(Stage::COMPLETED,"");
}
size_t num_files = 0;
UInt64 total_size = 0;
size_t num_entries = 0;
UInt64 uncompressed_size = 0;
UInt64 compressed_size = 0;
/// Finalize backup (write its metadata).
if (!backup_settings.internal)
{
backup->finalizeWriting();
num_files = backup->getNumFiles();
total_size = backup->getTotalSize();
num_entries = backup->getNumEntries();
uncompressed_size = backup->getUncompressedSize();
compressed_size = backup->getCompressedSize();
}
/// Close the backup.
backup.reset();
LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_name_for_logging);
/// NOTE: we need to update metadata again after backup->finalizeWriting(), because backup metadata is written there.
setNumFilesAndSize(backup_id, num_files, total_size, num_entries, uncompressed_size, compressed_size, 0, 0);
/// NOTE: setStatus is called after setNumFilesAndSize in order to have actual information in a backup log record
setStatus(backup_id, BackupStatus::BACKUP_CREATED);
backup_query->cluster = context->getMacros()->expand(backup_query->cluster);
cluster = context->getCluster(backup_query->cluster);
backup_settings.cluster_host_ids = cluster->getHostIDs();
}
catch (...)
/// Make a backup coordination.
if (!backup_coordination)
backup_coordination = makeBackupCoordination(context, backup_settings, /* remote= */ on_cluster);
if (!allow_concurrent_backups && backup_coordination->hasConcurrentBackups(std::ref(num_active_backups)))
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
/// Opens a backup for writing.
BackupFactory::CreateParams backup_create_params;
backup_create_params.open_mode = IBackup::OpenMode::WRITE;
backup_create_params.context = context;
backup_create_params.backup_info = backup_info;
backup_create_params.base_backup_info = backup_settings.base_backup_info;
backup_create_params.compression_method = backup_settings.compression_method;
backup_create_params.compression_level = backup_settings.compression_level;
backup_create_params.password = backup_settings.password;
backup_create_params.s3_storage_class = backup_settings.s3_storage_class;
backup_create_params.is_internal_backup = backup_settings.internal;
backup_create_params.backup_coordination = backup_coordination;
backup_create_params.backup_uuid = backup_settings.backup_uuid;
backup_create_params.deduplicate_files = backup_settings.deduplicate_files;
backup_create_params.allow_s3_native_copy = backup_settings.allow_s3_native_copy;
backup_create_params.use_same_s3_credentials_for_base_backup = backup_settings.use_same_s3_credentials_for_base_backup;
backup_create_params.read_settings = getReadSettingsForBackup(context, backup_settings);
backup_create_params.write_settings = getWriteSettingsForBackup(context);
backup = BackupFactory::instance().createBackup(backup_create_params);
/// Write the backup.
if (on_cluster)
{
/// Something bad happened, the backup has not been built.
if (called_async)
{
tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_name_for_logging));
setStatusSafe(backup_id, getBackupStatusFromCurrentException());
sendCurrentExceptionToCoordination(backup_coordination);
}
else
{
/// setStatus() and sendCurrentExceptionToCoordination() will be called by startMakingBackup().
throw;
}
DDLQueryOnClusterParams params;
params.cluster = cluster;
params.only_shard_num = backup_settings.shard_num;
params.only_replica_num = backup_settings.replica_num;
params.access_to_check = required_access;
backup_settings.copySettingsToQuery(*backup_query);
// executeDDLQueryOnCluster() will return without waiting for completion
mutable_context->setSetting("distributed_ddl_task_timeout", Field{0});
mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"});
executeDDLQueryOnCluster(backup_query, mutable_context, params);
/// Wait until all the hosts have written their backup entries.
backup_coordination->waitForStage(Stage::COMPLETED);
backup_coordination->setStage(Stage::COMPLETED,"");
}
else
{
backup_query->setCurrentDatabase(context->getCurrentDatabase());
/// Prepare backup entries.
BackupEntries backup_entries;
{
BackupEntriesCollector backup_entries_collector(
backup_query->elements, backup_settings, backup_coordination,
backup_create_params.read_settings, context, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST));
backup_entries = backup_entries_collector.run();
}
/// Write the backup entries to the backup.
chassert(backup);
chassert(backup_coordination);
chassert(context);
buildFileInfosForBackupEntries(backup, backup_entries, backup_create_params.read_settings, backup_coordination, context->getProcessListElement());
writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal, context->getProcessListElement());
/// We have written our backup entries, we need to tell other hosts (they could be waiting for it).
backup_coordination->setStage(Stage::COMPLETED,"");
}
size_t num_files = 0;
UInt64 total_size = 0;
size_t num_entries = 0;
UInt64 uncompressed_size = 0;
UInt64 compressed_size = 0;
/// Finalize backup (write its metadata).
if (!backup_settings.internal)
{
backup->finalizeWriting();
num_files = backup->getNumFiles();
total_size = backup->getTotalSize();
num_entries = backup->getNumEntries();
uncompressed_size = backup->getUncompressedSize();
compressed_size = backup->getCompressedSize();
}
/// Close the backup.
backup.reset();
LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_name_for_logging);
/// NOTE: we need to update metadata again after backup->finalizeWriting(), because backup metadata is written there.
setNumFilesAndSize(backup_id, num_files, total_size, num_entries, uncompressed_size, compressed_size, 0, 0);
/// NOTE: setStatus is called after setNumFilesAndSize in order to have actual information in a backup log record
setStatus(backup_id, BackupStatus::BACKUP_CREATED);
}
@ -794,6 +796,9 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
String backup_name_for_logging = backup_info.toStringForLogging();
if (!restore_settings.restore_uuid)
restore_settings.restore_uuid = UUIDHelpers::generateV4();
@ -807,23 +812,38 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
restore_id = toString(*restore_settings.restore_uuid);
std::shared_ptr<IRestoreCoordination> restore_coordination;
if (restore_settings.internal)
/// Called in exception handlers below. This lambda function can be called on a separate thread, so it can't capture local variables by reference.
auto on_exception = [this](const OperationID & restore_id_, const String & backup_name_for_logging_,
const RestoreSettings & restore_settings_, const std::shared_ptr<IRestoreCoordination> & restore_coordination_)
{
/// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination
/// if it's not created here. However, it's better to create the coordination here so that, if an exception
/// is thrown in startRestoring(), the other hosts will know about it.
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ true);
}
/// Something bad happened, some data were not restored.
tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings_.internal ? "internal backup" : "backup"), backup_name_for_logging_));
setStatusSafe(restore_id_, getRestoreStatusFromCurrentException());
sendCurrentExceptionToCoordination(restore_coordination_);
};
try
{
auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
String backup_name_for_logging = backup_info.toStringForLogging();
String base_backup_name;
if (restore_settings.base_backup_info)
base_backup_name = restore_settings.base_backup_info->toStringForLogging();
addInfo(restore_id, backup_name_for_logging, base_backup_name, restore_settings.internal, context->getProcessListElement(), BackupStatus::RESTORING);
addInfo(restore_id,
backup_name_for_logging,
base_backup_name,
context->getCurrentQueryId(),
restore_settings.internal,
context->getProcessListElement(),
BackupStatus::RESTORING);
if (restore_settings.internal)
{
/// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination
/// if it's not created here. However, it's better to create the coordination here so that, if an exception
/// is thrown in startRestoring(), the other hosts will know about it.
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ true);
}
/// Prepare context to use.
ContextMutablePtr context_in_use = context;
@ -844,7 +864,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
/// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously.
auto process_list_element = context_in_use->getProcessListElement();
thread_pool.scheduleOrThrowOnError(
scheduleFromThreadPool<void>(
[this,
restore_query,
restore_id,
@ -853,20 +873,27 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
restore_settings,
restore_coordination,
context_in_use,
thread_group = CurrentThread::getGroup(),
on_exception,
process_list_element_holder = process_list_element ? process_list_element->getProcessListEntry() : nullptr]
{
doRestore(
restore_query,
restore_id,
backup_name_for_logging,
backup_info,
restore_settings,
restore_coordination,
context_in_use,
thread_group,
/* called_async= */ true);
});
try
{
doRestore(
restore_query,
restore_id,
backup_name_for_logging,
backup_info,
restore_settings,
restore_coordination,
context_in_use);
}
catch (...)
{
on_exception(restore_id, backup_name_for_logging, restore_settings, restore_coordination);
}
},
thread_pool,
"RestoreWorker");
}
else
{
@ -877,18 +904,14 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
backup_info,
restore_settings,
restore_coordination,
context_in_use,
nullptr,
/* called_async= */ false);
context_in_use);
}
return restore_id;
}
catch (...)
{
/// Something bad happened, the backup has not been built.
setStatusSafe(restore_id, getRestoreStatusFromCurrentException());
sendCurrentExceptionToCoordination(restore_coordination);
on_exception(restore_id, backup_name_for_logging, restore_settings, restore_coordination);
throw;
}
}
@ -901,133 +924,103 @@ void BackupsWorker::doRestore(
const BackupInfo & backup_info,
RestoreSettings restore_settings,
std::shared_ptr<IRestoreCoordination> restore_coordination,
ContextMutablePtr context,
ThreadGroupPtr thread_group,
bool called_async)
ContextMutablePtr context)
{
SCOPE_EXIT_SAFE(
if (called_async && thread_group)
CurrentThread::detachFromGroupIfNotDetached();
);
/// Open the backup for reading.
BackupFactory::CreateParams backup_open_params;
backup_open_params.open_mode = IBackup::OpenMode::READ;
backup_open_params.context = context;
backup_open_params.backup_info = backup_info;
backup_open_params.base_backup_info = restore_settings.base_backup_info;
backup_open_params.password = restore_settings.password;
backup_open_params.allow_s3_native_copy = restore_settings.allow_s3_native_copy;
backup_open_params.use_same_s3_credentials_for_base_backup = restore_settings.use_same_s3_credentials_for_base_backup;
backup_open_params.read_settings = getReadSettingsForRestore(context);
backup_open_params.write_settings = getWriteSettingsForRestore(context);
BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
try
String current_database = context->getCurrentDatabase();
/// Checks access rights if this is an ON CLUSTER query.
/// (If this isn't an ON CLUSTER query, RestorerFromBackup will check access rights later.)
ClusterPtr cluster;
bool on_cluster = !restore_query->cluster.empty();
if (on_cluster)
{
if (called_async && thread_group)
CurrentThread::attachToGroup(thread_group);
if (called_async)
setThreadName("RestoreWorker");
/// Open the backup for reading.
BackupFactory::CreateParams backup_open_params;
backup_open_params.open_mode = IBackup::OpenMode::READ;
backup_open_params.context = context;
backup_open_params.backup_info = backup_info;
backup_open_params.base_backup_info = restore_settings.base_backup_info;
backup_open_params.password = restore_settings.password;
backup_open_params.allow_s3_native_copy = restore_settings.allow_s3_native_copy;
backup_open_params.use_same_s3_credentials_for_base_backup = restore_settings.use_same_s3_credentials_for_base_backup;
backup_open_params.read_settings = getReadSettingsForRestore(context);
backup_open_params.write_settings = getWriteSettingsForRestore(context);
BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
String current_database = context->getCurrentDatabase();
/// Checks access rights if this is an ON CLUSTER query.
/// (If this isn't an ON CLUSTER query, RestorerFromBackup will check access rights later.)
ClusterPtr cluster;
bool on_cluster = !restore_query->cluster.empty();
if (on_cluster)
{
restore_query->cluster = context->getMacros()->expand(restore_query->cluster);
cluster = context->getCluster(restore_query->cluster);
restore_settings.cluster_host_ids = cluster->getHostIDs();
}
/// Make a restore coordination.
if (!restore_coordination)
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster);
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores)))
throw Exception(
ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
"Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
if (on_cluster)
{
/// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect
/// because different replicas can contain different sets of tables, so the required access rights can differ too.
/// So the right way is to go through the entire cluster and check access for each host.
auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num);
for (const auto * address : addresses)
{
restore_settings.host_id = address->toString();
auto restore_elements = restore_query->elements;
String addr_database = address->default_database.empty() ? current_database : address->default_database;
for (auto & element : restore_elements)
element.setCurrentDatabase(addr_database);
RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context};
dummy_restorer.run(RestorerFromBackup::CHECK_ACCESS_ONLY);
}
}
/// Do RESTORE.
if (on_cluster)
{
DDLQueryOnClusterParams params;
params.cluster = cluster;
params.only_shard_num = restore_settings.shard_num;
params.only_replica_num = restore_settings.replica_num;
restore_settings.copySettingsToQuery(*restore_query);
// executeDDLQueryOnCluster() will return without waiting for completion
context->setSetting("distributed_ddl_task_timeout", Field{0});
context->setSetting("distributed_ddl_output_mode", Field{"none"});
executeDDLQueryOnCluster(restore_query, context, params);
/// Wait until all the hosts have finished restoring.
restore_coordination->waitForStage(Stage::COMPLETED);
restore_coordination->setStage(Stage::COMPLETED,"");
}
else
{
restore_query->setCurrentDatabase(current_database);
/// Restore metadata and prepare data restoring tasks.
DataRestoreTasks data_restore_tasks;
{
RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination,
backup, context};
data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE);
}
/// Execute the data restoring tasks.
restoreTablesData(restore_id, backup, std::move(data_restore_tasks), getThreadPool(ThreadPoolId::RESTORE_TABLES_DATA), context->getProcessListElement());
/// We have restored everything, we need to tell other hosts (they could be waiting for it).
restore_coordination->setStage(Stage::COMPLETED, "");
}
LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_name_for_logging);
setStatus(restore_id, BackupStatus::RESTORED);
restore_query->cluster = context->getMacros()->expand(restore_query->cluster);
cluster = context->getCluster(restore_query->cluster);
restore_settings.cluster_host_ids = cluster->getHostIDs();
}
catch (...)
/// Make a restore coordination.
if (!restore_coordination)
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster);
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores)))
throw Exception(
ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
"Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
if (on_cluster)
{
/// Something bad happened, the backup has not been built.
if (called_async)
/// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect
/// because different replicas can contain different sets of tables, so the required access rights can differ too.
/// So the right way is to go through the entire cluster and check access for each host.
auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num);
for (const auto * address : addresses)
{
tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? "internal backup" : "backup"), backup_name_for_logging));
setStatusSafe(restore_id, getRestoreStatusFromCurrentException());
sendCurrentExceptionToCoordination(restore_coordination);
}
else
{
/// setStatus() and sendCurrentExceptionToCoordination() will be called by startRestoring().
throw;
restore_settings.host_id = address->toString();
auto restore_elements = restore_query->elements;
String addr_database = address->default_database.empty() ? current_database : address->default_database;
for (auto & element : restore_elements)
element.setCurrentDatabase(addr_database);
RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context};
dummy_restorer.run(RestorerFromBackup::CHECK_ACCESS_ONLY);
}
}
/// Do RESTORE.
if (on_cluster)
{
DDLQueryOnClusterParams params;
params.cluster = cluster;
params.only_shard_num = restore_settings.shard_num;
params.only_replica_num = restore_settings.replica_num;
restore_settings.copySettingsToQuery(*restore_query);
// executeDDLQueryOnCluster() will return without waiting for completion
context->setSetting("distributed_ddl_task_timeout", Field{0});
context->setSetting("distributed_ddl_output_mode", Field{"none"});
executeDDLQueryOnCluster(restore_query, context, params);
/// Wait until all the hosts have finished restoring.
restore_coordination->waitForStage(Stage::COMPLETED);
restore_coordination->setStage(Stage::COMPLETED,"");
}
else
{
restore_query->setCurrentDatabase(current_database);
/// Restore metadata and prepare data restoring tasks.
DataRestoreTasks data_restore_tasks;
{
RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination,
backup, context};
data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE);
}
/// Execute the data restoring tasks.
restoreTablesData(restore_id, backup, std::move(data_restore_tasks), getThreadPool(ThreadPoolId::RESTORE_TABLES_DATA), context->getProcessListElement());
/// We have restored everything, we need to tell other hosts (they could be waiting for it).
restore_coordination->setStage(Stage::COMPLETED, "");
}
LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_name_for_logging);
setStatus(restore_id, BackupStatus::RESTORED);
}
@ -1108,13 +1101,15 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr
}
void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, bool internal, QueryStatusPtr process_list_element, BackupStatus status)
void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, const String & query_id,
bool internal, QueryStatusPtr process_list_element, BackupStatus status)
{
ExtendedOperationInfo extended_info;
auto & info = extended_info.info;
info.id = id;
info.name = name;
info.base_backup_name = base_backup_name;
info.query_id = query_id;
info.internal = internal;
info.status = status;
info.start_time = std::chrono::system_clock::now();
@ -1183,7 +1178,7 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw
if (isFailedOrCancelled(status))
{
info.error_message = getCurrentExceptionMessage(false);
info.error_message = getCurrentExceptionMessage(true /*with_stacktrace*/);
info.exception = std::current_exception();
}
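Since the BackupsWorker constructor above now pulls its settings from the server configuration instead of taking them as constructor arguments, the four knobs it reads live under the backups section of the server config. A hypothetical excerpt of config.xml showing those keys with the defaults used by the getBool() calls in the hunk:
<clickhouse>
    <backups>
        <allow_concurrent_backups>true</allow_concurrent_backups>
        <allow_concurrent_restores>true</allow_concurrent_restores>
        <remove_backup_files_after_failure>true</remove_backup_files_after_failure>
        <test_inject_sleep>false</test_inject_sleep>
    </backups>
</clickhouse>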

View File

@ -38,20 +38,15 @@ class ProcessList;
class BackupsWorker
{
public:
BackupsWorker(
ContextMutablePtr global_context,
size_t num_backup_threads,
size_t num_restore_threads,
bool allow_concurrent_backups_,
bool allow_concurrent_restores_,
bool test_inject_sleep_);
BackupsWorker(ContextMutablePtr global_context, size_t num_backup_threads, size_t num_restore_threads);
~BackupsWorker();
/// Waits until all tasks have been completed.
void shutdown();
/// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
/// For asynchronous operations the function usually doesn't throw on failure;
/// call getInfo() with the returned operation id to check for errors.
BackupOperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
/// Waits until the specified backup or restore operation finishes or stops.
@ -75,6 +70,7 @@ private:
BackupOperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context);
void doBackup(
BackupMutablePtr & backup,
const std::shared_ptr<ASTBackupQuery> & backup_query,
const BackupOperationID & backup_id,
const String & backup_name_for_logging,
@ -82,9 +78,7 @@ private:
BackupSettings backup_settings,
std::shared_ptr<IBackupCoordination> backup_coordination,
const ContextPtr & context,
ContextMutablePtr mutable_context,
ThreadGroupPtr thread_group,
bool called_async);
ContextMutablePtr mutable_context);
/// Builds file infos for specified backup entries.
void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination, QueryStatusPtr process_list_element);
@ -101,14 +95,13 @@ private:
const BackupInfo & backup_info,
RestoreSettings restore_settings,
std::shared_ptr<IRestoreCoordination> restore_coordination,
ContextMutablePtr context,
ThreadGroupPtr thread_group,
bool called_async);
ContextMutablePtr context);
/// Run data restoring tasks which insert data to tables.
void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);
void addInfo(const BackupOperationID & id, const String & name, const String & base_backup_name, bool internal, QueryStatusPtr process_list_element, BackupStatus status);
void addInfo(const BackupOperationID & id, const String & name, const String & base_backup_name, const String & query_id,
bool internal, QueryStatusPtr process_list_element, BackupStatus status);
void setStatus(const BackupOperationID & id, BackupStatus status, bool throw_if_error = true);
void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
void setNumFilesAndSize(const BackupOperationID & id, size_t num_files, UInt64 total_size, size_t num_entries,
@ -125,6 +118,7 @@ private:
const bool allow_concurrent_backups;
const bool allow_concurrent_restores;
const bool remove_backup_files_after_failure;
const bool test_inject_sleep;
LoggerPtr log;

View File

@ -117,11 +117,14 @@ public:
/// Puts a new entry to the backup.
virtual void writeFile(const BackupFileInfo & file_info, BackupEntryPtr entry) = 0;
/// Whether it's possible to add new entries to the backup in multiple threads.
virtual bool supportsWritingInMultipleThreads() const = 0;
/// Finalizes writing the backup, should be called after all entries have been successfully written.
virtual void finalizeWriting() = 0;
/// Whether it's possible to add new entries to the backup in multiple threads.
virtual bool supportsWritingInMultipleThreads() const = 0;
/// Try to remove all files copied to the backup. Used after an exception or if the backup was cancelled.
virtual void tryRemoveAllFiles() = 0;
};
using BackupPtr = std::shared_ptr<const IBackup>;
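The reordering above also pins down the expected call sequence on the write side: writeFile() for every entry (possibly from several threads when supportsWritingInMultipleThreads() returns true), finalizeWriting() once everything has succeeded, and tryRemoveAllFiles() only on failure or cancellation. A rough caller sketch under those assumptions (backup and entries are placeholders for what BackupsWorker prepares; the cleanup branch is what the remove_backup_files_after_failure setting gates):
try
{
    for (auto & [file_info, entry] : entries)
        backup->writeFile(file_info, std::move(entry));  /// entries may be written concurrently if supported
    backup->finalizeWriting();                           /// writes backup metadata; call only after all entries succeeded
}
catch (...)
{
    backup->tryRemoveAllFiles();                         /// best-effort cleanup of files already copied to the backup
    throw;
}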

View File

@ -506,6 +506,10 @@ if (TARGET ch_contrib::s2)
dbms_target_link_libraries (PUBLIC ch_contrib::s2)
endif()
if (TARGET ch_contrib::vectorscan)
dbms_target_link_libraries (PRIVATE ch_contrib::vectorscan)
endif()
if (TARGET ch_contrib::brotli)
target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::brotli)
endif()

View File

@ -3,6 +3,7 @@
#include <Common/AsyncTaskExecutor.h>
#include <Common/Epoll.h>
#include <Common/Fiber.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
#include <Common/PoolWithFailoverBase.h>
#include <Client/ConnectionPool.h>

View File

@ -28,7 +28,10 @@ public:
using Entry = PoolBase<Connection>::Entry;
IConnectionPool() = default;
IConnectionPool(String host_, UInt16 port_) : host(host_), port(port_), address(host + ":" + toString(port_)) {}
IConnectionPool(String host_, UInt16 port_, Priority config_priority_)
: host(host_), port(port_), address(host + ":" + toString(port_)), config_priority(config_priority_)
{
}
virtual ~IConnectionPool() = default;
@ -42,12 +45,13 @@ public:
const std::string & getHost() const { return host; }
UInt16 getPort() const { return port; }
const String & getAddress() const { return address; }
virtual Priority getPriority() const { return Priority{1}; }
Priority getConfigPriority() const { return config_priority; }
protected:
const String host;
const UInt16 port = 0;
const String address;
const Priority config_priority;
};
using ConnectionPoolPtr = std::shared_ptr<IConnectionPool>;
@ -61,32 +65,31 @@ public:
using Entry = IConnectionPool::Entry;
using Base = PoolBase<Connection>;
ConnectionPool(unsigned max_connections_,
const String & host_,
UInt16 port_,
const String & default_database_,
const String & user_,
const String & password_,
const String & quota_key_,
const String & cluster_,
const String & cluster_secret_,
const String & client_name_,
Protocol::Compression compression_,
Protocol::Secure secure_,
Priority priority_ = Priority{1})
: IConnectionPool(host_, port_),
Base(max_connections_,
getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")")),
default_database(default_database_),
user(user_),
password(password_),
quota_key(quota_key_),
cluster(cluster_),
cluster_secret(cluster_secret_),
client_name(client_name_),
compression(compression_),
secure(secure_),
priority(priority_)
ConnectionPool(
unsigned max_connections_,
const String & host_,
UInt16 port_,
const String & default_database_,
const String & user_,
const String & password_,
const String & quota_key_,
const String & cluster_,
const String & cluster_secret_,
const String & client_name_,
Protocol::Compression compression_,
Protocol::Secure secure_,
Priority config_priority_ = Priority{1})
: IConnectionPool(host_, port_, config_priority_)
, Base(max_connections_, getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")"))
, default_database(default_database_)
, user(user_)
, password(password_)
, quota_key(quota_key_)
, cluster(cluster_)
, cluster_secret(cluster_secret_)
, client_name(client_name_)
, compression(compression_)
, secure(secure_)
{
}
@ -114,11 +117,6 @@ public:
return host + ":" + toString(port);
}
Priority getPriority() const override
{
return priority;
}
protected:
/** Creates a new object to put in the pool. */
ConnectionPtr allocObject() override
@ -143,7 +141,6 @@ private:
String client_name;
Protocol::Compression compression; /// Whether to compress data when interacting with the server.
Protocol::Secure secure; /// Whether to encrypt data when interacting with the server.
Priority priority; /// priority from <remote_servers>
};
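With priority renamed to config_priority and hoisted into IConnectionPool, callers now pass the <remote_servers> priority straight through the constructor and read it back with getConfigPriority() instead of the removed virtual getPriority(). A hedged construction sketch using the signature shown above (host, credentials and the priority value are placeholders):
#include <Client/ConnectionPool.h>
#include <memory>
static DB::ConnectionPoolPtr makeExamplePool()
{
    return std::make_shared<DB::ConnectionPool>(
        /* max_connections */ 16,
        /* host */ "replica-1.example.net",
        /* port */ 9000,
        /* default_database */ "default",
        /* user */ "default",
        /* password */ "",
        /* quota_key */ "",
        /* cluster */ "",
        /* cluster_secret */ "",
        /* client_name */ "example-client",
        DB::Protocol::Compression::Enable,
        DB::Protocol::Secure::Disable,
        /* config_priority */ DB::Priority{2});  /// read back later via getConfigPriority()
}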
/**

View File

@ -79,14 +79,6 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
return Base::get(max_ignored_errors, fallback_to_stale_replicas, try_get_entry, get_priority);
}
Priority ConnectionPoolWithFailover::getPriority() const
{
return (*std::max_element(nested_pools.begin(), nested_pools.end(), [](const auto & a, const auto & b)
{
return a->getPriority() < b->getPriority();
}))->getPriority();
}
ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const
{
const auto [states, pools, error_decrease_time] = getPoolExtendedStates();
@ -253,13 +245,13 @@ ConnectionPoolWithFailover::tryGetEntry(
}
std::vector<ConnectionPoolWithFailover::Base::ShuffledPool>
ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func)
ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func, bool use_slowdown_count)
{
if (!priority_func)
priority_func = makeGetPriorityFunc(settings);
UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value;
return Base::getShuffledPools(max_ignored_errors, priority_func);
return Base::getShuffledPools(max_ignored_errors, priority_func, use_slowdown_count);
}
}

View File

@ -49,8 +49,6 @@ public:
const Settings & settings,
bool force_connected) override; /// From IConnectionPool
Priority getPriority() const override; /// From IConnectionPool
/** Allocates up to the specified number of connections to work.
* Connections provide access to different replicas of one shard.
*/
@ -83,15 +81,15 @@ public:
struct NestedPoolStatus
{
const Base::NestedPoolPtr pool;
size_t error_count;
size_t slowdown_count;
size_t error_count = 0;
size_t slowdown_count = 0;
std::chrono::seconds estimated_recovery_time;
};
using Status = std::vector<NestedPoolStatus>;
Status getStatus() const;
std::vector<Base::ShuffledPool> getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {});
std::vector<Base::ShuffledPool> getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {}, bool use_slowdown_count = false);
size_t getMaxErrorCup() const { return Base::max_error_cap; }

View File

@ -40,7 +40,8 @@ HedgedConnectionsFactory::HedgedConnectionsFactory(
, max_parallel_replicas(max_parallel_replicas_)
, skip_unavailable_shards(skip_unavailable_shards_)
{
shuffled_pools = pool->getShuffledPools(settings_, priority_func);
shuffled_pools = pool->getShuffledPools(settings_, priority_func, /* use_slowdown_count */ true);
for (const auto & shuffled_pool : shuffled_pools)
replicas.emplace_back(
std::make_unique<ConnectionEstablisherAsync>(shuffled_pool.pool, &timeouts, settings_, log, table_to_check.get()));

View File

@ -320,7 +320,7 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac
ReplicaState & state = getReplicaForReading();
current_connection = state.connection;
if (current_connection == nullptr)
throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "No available replica");
throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "Logical error: no available replica");
Packet packet;
try

View File

@ -5,6 +5,7 @@
#include <variant>
#include <Client/IConnections.h>
#include <Common/FiberStack.h>
#include <Common/Fiber.h>
#include <Common/Epoll.h>
#include <Common/TimerDescriptor.h>

View File

@ -810,7 +810,7 @@ ColumnPtr ColumnArray::filterTuple(const Filter & filt, ssize_t result_size_hint
size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty tuple");
Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
@ -1263,7 +1263,7 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const
size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty tuple");
Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)

View File

@ -1,5 +1,7 @@
#include <Common/Arena.h>
#include <Common/SipHash.h>
#include <Common/NaNUtils.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Common/WeakHash.h>
#include <Columns/ColumnDecimal.h>
@ -26,6 +28,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT;
extern const int NOT_IMPLEMENTED;
}
@ -826,7 +829,8 @@ void ColumnNullable::applyNullMap(const ColumnNullable & other)
void ColumnNullable::checkConsistency() const
{
if (null_map->size() != getNestedColumn().size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of nested column and null map of Nullable column are not equal");
throw Exception(ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT,
"Logical error: Sizes of nested column and null map of Nullable column are not equal");
}
ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const

View File

@ -78,7 +78,7 @@ INSTANTIATE(IPv6)
#undef INSTANTIATE
template <bool inverted, bool column_is_short, typename Container>
size_t extractMaskNumericImpl(
static size_t extractMaskNumericImpl(
PaddedPODArray<UInt8> & mask,
const Container & data,
UInt8 null_value,
@ -97,8 +97,7 @@ size_t extractMaskNumericImpl(
size_t mask_size = mask.size();
size_t data_size = data.size();
size_t i = 0;
for (; i != mask_size && data_index != data_size; ++i)
for (size_t i = 0; i != mask_size && data_index != data_size; ++i)
{
// Change mask only where value is 1.
if (!mask[i])
@ -142,7 +141,7 @@ size_t extractMaskNumericImpl(
}
template <bool inverted, typename NumericType>
bool extractMaskNumeric(
static bool extractMaskNumeric(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
UInt8 null_value,
@ -167,7 +166,7 @@ bool extractMaskNumeric(
}
template <bool inverted>
MaskInfo extractMaskFromConstOrNull(
static MaskInfo extractMaskFromConstOrNull(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & column,
UInt8 null_value,
@ -196,7 +195,7 @@ MaskInfo extractMaskFromConstOrNull(
}
template <bool inverted>
MaskInfo extractMaskImpl(
static MaskInfo extractMaskImpl(
PaddedPODArray<UInt8> & mask,
const ColumnPtr & col,
UInt8 null_value,

View File

@ -58,7 +58,12 @@ void inverseMask(PaddedPODArray<UInt8> & mask, MaskInfo & mask_info);
/// If given column is lazy executed argument (ColumnFunction with isShortCircuitArgument() = true),
/// filter it by mask and then reduce. If inverted is true, we will work with inverted mask.
void maskedExecute(ColumnWithTypeAndName & column, const PaddedPODArray<UInt8> & mask, const MaskInfo & mask_info);
/// mask_info is used for optimization in cases when the mask is all zeros or all ones;
/// in the general case this info is not needed and the argument can be omitted.
void maskedExecute(
ColumnWithTypeAndName & column,
const PaddedPODArray<UInt8> & mask,
const MaskInfo & mask_info = {true, true});
/// If given column is lazy executed argument, reduce it. If empty is true,
/// create an empty column with the execution result type.

View File

@ -21,7 +21,7 @@ static bool sameConstants(const IColumn & a, const IColumn & b)
ColumnWithTypeAndName getLeastSuperColumn(const std::vector<const ColumnWithTypeAndName *> & columns)
{
if (columns.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "No src columns for supercolumn");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no src columns for supercolumn");
ColumnWithTypeAndName result = *columns[0];

View File

@ -57,6 +57,249 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
#endif
}
union CPUInfo
{
UInt32 info[4];
struct Registers
{
UInt32 eax;
UInt32 ebx;
UInt32 ecx;
UInt32 edx;
} registers;
inline explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); }
inline CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); }
};
inline bool haveRDTSCP() noexcept
{
return (CPUInfo(0x80000001).registers.edx >> 27) & 1u;
}
inline bool haveSSE() noexcept
{
return (CPUInfo(0x1).registers.edx >> 25) & 1u;
}
inline bool haveSSE2() noexcept
{
return (CPUInfo(0x1).registers.edx >> 26) & 1u;
}
inline bool haveSSE3() noexcept
{
return CPUInfo(0x1).registers.ecx & 1u;
}
inline bool havePCLMUL() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 1) & 1u;
}
inline bool haveSSSE3() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 9) & 1u;
}
inline bool haveSSE41() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 19) & 1u;
}
inline bool haveSSE42() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 20) & 1u;
}
inline bool haveF16C() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 29) & 1u;
}
inline bool havePOPCNT() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 23) & 1u;
}
inline bool haveAES() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 25) & 1u;
}
inline bool haveXSAVE() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 26) & 1u;
}
inline bool haveOSXSAVE() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 27) & 1u;
}
inline bool haveAVX() noexcept
{
#if defined(__x86_64__)
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
// https://bugs.chromium.org/p/chromium/issues/detail?id=375968
return haveOSXSAVE() // implies haveXSAVE()
&& (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
&& ((CPUInfo(0x1).registers.ecx >> 28) & 1u); // AVX bit
#else
return false;
#endif
}
inline bool haveFMA() noexcept
{
return haveAVX() && ((CPUInfo(0x1).registers.ecx >> 12) & 1u);
}
inline bool haveAVX2() noexcept
{
return haveAVX() && ((CPUInfo(0x7, 0).registers.ebx >> 5) & 1u);
}
inline bool haveBMI1() noexcept
{
return (CPUInfo(0x7, 0).registers.ebx >> 3) & 1u;
}
inline bool haveBMI2() noexcept
{
return (CPUInfo(0x7, 0).registers.ebx >> 8) & 1u;
}
inline bool haveAVX512F() noexcept
{
#if defined(__x86_64__)
// https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
return haveOSXSAVE() // implies haveXSAVE()
&& (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
&& ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS
&& CPUInfo(0x0).registers.eax >= 0x7 // leaf 7 is present
&& ((CPUInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit
#else
return false;
#endif
}
inline bool haveAVX512DQ() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 17) & 1u);
}
inline bool haveRDSEED() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 18) & 1u);
}
inline bool haveADX() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 19) & 1u);
}
inline bool haveAVX512IFMA() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 21) & 1u);
}
inline bool havePCOMMIT() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 22) & 1u);
}
inline bool haveCLFLUSHOPT() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 23) & 1u);
}
inline bool haveCLWB() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 24) & 1u);
}
inline bool haveAVX512PF() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 26) & 1u);
}
inline bool haveAVX512ER() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 27) & 1u);
}
inline bool haveAVX512CD() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 28) & 1u);
}
inline bool haveSHA() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 29) & 1u);
}
inline bool haveAVX512BW() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 30) & 1u);
}
inline bool haveAVX512VL() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 31) & 1u);
}
inline bool havePREFETCHWT1() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ecx >> 0) & 1u);
}
inline bool haveAVX512VBMI() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 1) & 1u);
}
inline bool haveAVX512VBMI2() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 6) & 1u);
}
inline bool haveRDRAND() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x1).registers.ecx >> 30) & 1u);
}
inline bool haveAMX() noexcept
{
#if defined(__x86_64__)
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
return haveOSXSAVE() // implies haveXSAVE()
&& ((our_xgetbv(0) >> 17) & 0x3) == 0x3; // AMX state is enabled by OS
#else
return false;
#endif
}
inline bool haveAMXBF16() noexcept
{
return haveAMX()
&& ((CPUInfo(0x7, 0).registers.edx >> 22) & 1u); // AMX-BF16 bit
}
inline bool haveAMXTILE() noexcept
{
return haveAMX()
&& ((CPUInfo(0x7, 0).registers.edx >> 24) & 1u); // AMX-TILE bit
}
inline bool haveAMXINT8() noexcept
{
return haveAMX()
&& ((CPUInfo(0x7, 0).registers.edx >> 25) & 1u); // AMX-INT8 bit
}
#define CPU_ID_ENUMERATE(OP) \
OP(SSE) \
OP(SSE2) \
@ -98,253 +341,6 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
OP(AMXTILE) \
OP(AMXINT8)
union CPUInfo
{
UInt32 info[4];
struct Registers
{
UInt32 eax;
UInt32 ebx;
UInt32 ecx;
UInt32 edx;
} registers;
inline explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); }
inline CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); }
};
#define DEF_NAME(X) inline bool have##X() noexcept;
CPU_ID_ENUMERATE(DEF_NAME)
#undef DEF_NAME
bool haveRDTSCP() noexcept
{
return (CPUInfo(0x80000001).registers.edx >> 27) & 1u;
}
bool haveSSE() noexcept
{
return (CPUInfo(0x1).registers.edx >> 25) & 1u;
}
bool haveSSE2() noexcept
{
return (CPUInfo(0x1).registers.edx >> 26) & 1u;
}
bool haveSSE3() noexcept
{
return CPUInfo(0x1).registers.ecx & 1u;
}
bool havePCLMUL() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 1) & 1u;
}
bool haveSSSE3() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 9) & 1u;
}
bool haveSSE41() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 19) & 1u;
}
bool haveSSE42() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 20) & 1u;
}
bool haveF16C() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 29) & 1u;
}
bool havePOPCNT() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 23) & 1u;
}
bool haveAES() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 25) & 1u;
}
bool haveXSAVE() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 26) & 1u;
}
bool haveOSXSAVE() noexcept
{
return (CPUInfo(0x1).registers.ecx >> 27) & 1u;
}
bool haveAVX() noexcept
{
#if defined(__x86_64__)
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
// https://bugs.chromium.org/p/chromium/issues/detail?id=375968
return haveOSXSAVE() // implies haveXSAVE()
&& (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
&& ((CPUInfo(0x1).registers.ecx >> 28) & 1u); // AVX bit
#else
return false;
#endif
}
bool haveFMA() noexcept
{
return haveAVX() && ((CPUInfo(0x1).registers.ecx >> 12) & 1u);
}
bool haveAVX2() noexcept
{
return haveAVX() && ((CPUInfo(0x7, 0).registers.ebx >> 5) & 1u);
}
bool haveBMI1() noexcept
{
return (CPUInfo(0x7, 0).registers.ebx >> 3) & 1u;
}
bool haveBMI2() noexcept
{
return (CPUInfo(0x7, 0).registers.ebx >> 8) & 1u;
}
bool haveAVX512F() noexcept
{
#if defined(__x86_64__)
// https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
return haveOSXSAVE() // implies haveXSAVE()
&& (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
&& ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS
&& CPUInfo(0x0).registers.eax >= 0x7 // leaf 7 is present
&& ((CPUInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit
#else
return false;
#endif
}
bool haveAVX512DQ() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 17) & 1u);
}
bool haveRDSEED() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 18) & 1u);
}
bool haveADX() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 19) & 1u);
}
bool haveAVX512IFMA() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 21) & 1u);
}
bool havePCOMMIT() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 22) & 1u);
}
bool haveCLFLUSHOPT() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 23) & 1u);
}
bool haveCLWB() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 24) & 1u);
}
bool haveAVX512PF() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 26) & 1u);
}
bool haveAVX512ER() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 27) & 1u);
}
bool haveAVX512CD() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 28) & 1u);
}
bool haveSHA() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 29) & 1u);
}
bool haveAVX512BW() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 30) & 1u);
}
bool haveAVX512VL() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 31) & 1u);
}
bool havePREFETCHWT1() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ecx >> 0) & 1u);
}
bool haveAVX512VBMI() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 1) & 1u);
}
bool haveAVX512VBMI2() noexcept
{
return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 6) & 1u);
}
bool haveRDRAND() noexcept
{
return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x1).registers.ecx >> 30) & 1u);
}
inline bool haveAMX() noexcept
{
#if defined(__x86_64__)
// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
return haveOSXSAVE() // implies haveXSAVE()
&& ((our_xgetbv(0) >> 17) & 0x3) == 0x3; // AMX state is enabled by OS
#else
return false;
#endif
}
bool haveAMXBF16() noexcept
{
return haveAMX()
&& ((CPUInfo(0x7, 0).registers.edx >> 22) & 1u); // AMX-BF16 bit
}
bool haveAMXTILE() noexcept
{
return haveAMX()
&& ((CPUInfo(0x7, 0).registers.edx >> 24) & 1u); // AMX-TILE bit
}
bool haveAMXINT8() noexcept
{
return haveAMX()
&& ((CPUInfo(0x7, 0).registers.edx >> 25) & 1u); // AMX-INT8 bit
}
struct CPUFlagsCache
{
#define DEF_NAME(X) static inline bool have_##X = have##X();
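For context (not part of the diff): the rest of CPUFlagsCache is truncated in this view. Assuming CPU_ID_ENUMERATE expands the DEF_NAME macro above into cached members such as have_AVX2, and that the sketch sits in the same namespace as CPUFlagsCache, runtime dispatch on a cached flag might look like this (the copy helpers are illustrative):

#include <Common/CpuId.h>
#include <cstddef>
#include <cstring>

static void copyGeneric(char * dst, const char * src, std::size_t n) { std::memcpy(dst, src, n); }
static void copyAVX2(char * dst, const char * src, std::size_t n) { std::memcpy(dst, src, n); /* imagine a vectorized loop here */ }

void copyDispatch(char * dst, const char * src, std::size_t n)
{
    /// have_AVX2 is computed once at static-initialization time via cpuid,
    /// so this branch is a plain boolean test on every call.
    if (CPUFlagsCache::have_AVX2)
        copyAVX2(dst, src, n);
    else
        copyGeneric(dst, src, n);
}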


@ -0,0 +1,320 @@
#include <Common/CgroupsMemoryUsageObserver.h>
#if defined(OS_LINUX)
#include <Common/setThreadName.h>
#include <Common/logger_useful.h>
#include <Common/formatReadable.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/ReadHelpers.h>
#include <base/cgroupsv2.h>
#include <base/sleep.h>
#include <filesystem>
#include <optional>
#include "config.h"
#if USE_JEMALLOC
# include <jemalloc/jemalloc.h>
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_CLOSE_FILE;
extern const int CANNOT_OPEN_FILE;
extern const int FILE_DOESNT_EXIST;
extern const int INCORRECT_DATA;
}
CgroupsMemoryUsageObserver::CgroupsMemoryUsageObserver(std::chrono::seconds wait_time_)
: log(getLogger("CgroupsMemoryUsageObserver"))
, wait_time(wait_time_)
, file(log)
{
LOG_INFO(log, "Initialized cgroups memory limit observer, wait time is {} sec", wait_time.count());
}
CgroupsMemoryUsageObserver::~CgroupsMemoryUsageObserver()
{
stopThread();
}
void CgroupsMemoryUsageObserver::setLimits(uint64_t hard_limit_, uint64_t soft_limit_)
{
if (hard_limit_ == hard_limit && soft_limit_ == soft_limit)
return;
stopThread();
hard_limit = hard_limit_;
soft_limit = soft_limit_;
on_hard_limit = [this, hard_limit_](bool up)
{
if (up)
{
LOG_WARNING(log, "Exceeded hard memory limit ({})", ReadableSize(hard_limit_));
/// Update current usage in memory tracker. Also reset free_memory_in_allocator_arenas to zero though we don't know if they are
/// really zero. Trying to avoid OOM ...
MemoryTracker::setRSS(hard_limit_, 0);
}
else
{
LOG_INFO(log, "Dropped below hard memory limit ({})", ReadableSize(hard_limit_));
}
};
on_soft_limit = [this, soft_limit_](bool up)
{
if (up)
{
LOG_WARNING(log, "Exceeded sort memory limit ({})", ReadableSize(soft_limit_));
#if USE_JEMALLOC
LOG_INFO(log, "Purging jemalloc arenas");
mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".purge", nullptr, nullptr, nullptr, 0);
#endif
/// Reset current usage in memory tracker. Expect zero for free_memory_in_allocator_arenas as we just purged them.
uint64_t current_usage = readMemoryUsage();
MemoryTracker::setRSS(current_usage, 0);
LOG_INFO(log, "Purged jemalloc arenas. Current memory usage is {}", ReadableSize(current_usage));
}
else
{
LOG_INFO(log, "Dropped below soft memory limit ({})", ReadableSize(soft_limit_));
}
};
startThread();
LOG_INFO(log, "Set new limits, soft limit: {}, hard limit: {}", ReadableSize(soft_limit_), ReadableSize(hard_limit_));
}
uint64_t CgroupsMemoryUsageObserver::readMemoryUsage() const
{
return file.readMemoryUsage();
}
namespace
{
/// Caveats:
/// - All of the logic in this file assumes that the current process is the only process in the
/// containing cgroup (or more precisely: the only process with significant memory consumption).
/// If this is not the case, then other processes' memory consumption may affect the internal
/// memory tracker ...
/// - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 has been deprecated for over half a
/// decade and will go away at some point, hierarchical detection is only implemented for v2.
/// - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such
/// systems existed only for a short transition period.
std::optional<std::string> getCgroupsV2FileName()
{
if (!cgroupsV2Enabled())
return {};
if (!cgroupsV2MemoryControllerEnabled())
return {};
String cgroup = cgroupV2OfProcess();
auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);
/// Return the bottom-most nested current memory file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
while (current_cgroup != default_cgroups_mount.parent_path())
{
auto path = current_cgroup / "memory.current";
if (std::filesystem::exists(path))
return {path};
current_cgroup = current_cgroup.parent_path();
}
return {};
}
std::optional<std::string> getCgroupsV1FileName()
{
auto path = default_cgroups_mount / "memory/memory.stat";
if (!std::filesystem::exists(path))
return {};
return {path};
}
std::pair<std::string, CgroupsMemoryUsageObserver::CgroupsVersion> getCgroupsFileName()
{
auto v2_file_name = getCgroupsV2FileName();
if (v2_file_name.has_value())
return {*v2_file_name, CgroupsMemoryUsageObserver::CgroupsVersion::V2};
auto v1_file_name = getCgroupsV1FileName();
if (v1_file_name.has_value())
return {*v1_file_name, CgroupsMemoryUsageObserver::CgroupsVersion::V1};
throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Cannot find cgroups v1 or v2 current memory file");
}
}
CgroupsMemoryUsageObserver::File::File(LoggerPtr log_)
: log(log_)
{
std::tie(file_name, version) = getCgroupsFileName();
LOG_INFO(log, "Will read the current memory usage from '{}' (cgroups version: {})", file_name, (version == CgroupsVersion::V1) ? "v1" : "v2");
fd = ::open(file_name.data(), O_RDONLY);
if (fd == -1)
ErrnoException::throwFromPath(
(errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE,
file_name, "Cannot open file '{}'", file_name);
}
CgroupsMemoryUsageObserver::File::~File()
{
assert(fd != -1);
if (::close(fd) != 0)
{
try
{
ErrnoException::throwFromPath(
ErrorCodes::CANNOT_CLOSE_FILE,
file_name, "Cannot close file '{}'", file_name);
}
catch (const ErrnoException &)
{
tryLogCurrentException(log, __PRETTY_FUNCTION__);
}
}
}
uint64_t CgroupsMemoryUsageObserver::File::readMemoryUsage() const
{
/// Reading from the file descriptor is probably not thread-safe, lock just to be sure
std::lock_guard lock(mutex);
ReadBufferFromFileDescriptor buf(fd);
buf.rewind();
uint64_t mem_usage;
switch (version)
{
case CgroupsVersion::V1:
{
/// Format is
/// kernel 5
/// rss 15
/// [...]
std::string key;
bool found_rss = false;
while (!buf.eof())
{
readStringUntilWhitespace(key, buf);
if (key != "rss")
{
std::string dummy;
readStringUntilNewlineInto(dummy, buf);
continue;
}
assertChar(' ', buf);
readIntText(mem_usage, buf);
assertChar('\n', buf);
found_rss = true;
break;
}
if (!found_rss)
throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot find 'rss' in '{}'", file_name);
}
case CgroupsVersion::V2:
{
readIntText(mem_usage, buf);
break;
}
}
LOG_TRACE(log, "Read current memory usage {} from cgroups", ReadableSize(mem_usage));
return mem_usage;
}
void CgroupsMemoryUsageObserver::startThread()
{
if (!thread.joinable())
{
thread = ThreadFromGlobalPool(&CgroupsMemoryUsageObserver::runThread, this);
LOG_INFO(log, "Started cgroup current memory usage observer thread");
}
}
void CgroupsMemoryUsageObserver::stopThread()
{
{
std::lock_guard lock(thread_mutex);
if (!thread.joinable())
return;
quit = true;
}
cond.notify_one();
thread.join();
LOG_INFO(log, "Stopped cgroup current memory usage observer thread");
}
void CgroupsMemoryUsageObserver::runThread()
{
setThreadName("CgrpMemUsgObsr");
std::unique_lock lock(thread_mutex);
while (true)
{
if (cond.wait_for(lock, wait_time, [this] { return quit; }))
break;
try
{
uint64_t memory_usage = file.readMemoryUsage();
processMemoryUsage(memory_usage);
}
catch (...)
{
tryLogCurrentException(log, __PRETTY_FUNCTION__);
}
}
}
void CgroupsMemoryUsageObserver::processMemoryUsage(uint64_t current_usage)
{
if (current_usage > hard_limit)
{
if (last_usage <= hard_limit)
on_hard_limit(true);
}
else
{
if (last_usage > hard_limit)
on_hard_limit(false);
}
if (current_usage > soft_limit)
{
if (last_usage <= soft_limit)
on_soft_limit(true);
}
else
{
if (last_usage > soft_limit)
on_soft_limit(false);
}
last_usage = current_usage;
}
}
#endif


@ -0,0 +1,92 @@
#pragma once
#include <Common/ThreadPool.h>
#include <atomic>
#include <chrono>
#include <mutex>
namespace DB
{
/// Periodically reads the current memory usage from Linux cgroups.
/// You can specify soft or hard memory limits:
/// - When the soft memory limit is hit, drop jemalloc cache.
/// - When the hard memory limit is hit, update MemoryTracking metric to throw memory exceptions faster.
#if defined(OS_LINUX)
class CgroupsMemoryUsageObserver
{
public:
enum class CgroupsVersion
{
V1,
V2
};
explicit CgroupsMemoryUsageObserver(std::chrono::seconds wait_time_);
~CgroupsMemoryUsageObserver();
void setLimits(uint64_t hard_limit_, uint64_t soft_limit_);
size_t getHardLimit() const { return hard_limit; }
size_t getSoftLimit() const { return soft_limit; }
uint64_t readMemoryUsage() const;
private:
LoggerPtr log;
std::atomic<size_t> hard_limit = 0;
std::atomic<size_t> soft_limit = 0;
const std::chrono::seconds wait_time;
using CallbackFn = std::function<void(bool)>;
CallbackFn on_hard_limit;
CallbackFn on_soft_limit;
uint64_t last_usage = 0;
/// Represents the cgroup virtual file that shows the memory consumption of the process's cgroup.
struct File
{
public:
explicit File(LoggerPtr log_);
~File();
uint64_t readMemoryUsage() const;
private:
LoggerPtr log;
mutable std::mutex mutex;
int fd TSA_GUARDED_BY(mutex) = -1;
CgroupsVersion version;
std::string file_name;
};
File file;
void startThread();
void stopThread();
void runThread();
void processMemoryUsage(uint64_t usage);
std::mutex thread_mutex;
std::condition_variable cond;
ThreadFromGlobalPool thread;
bool quit = false;
};
#else
class CgroupsMemoryUsageObserver
{
public:
explicit CgroupsMemoryUsageObserver(std::chrono::seconds) {}
void setLimits(uint64_t, uint64_t) {}
size_t readMemoryUsage() { return 0; }
size_t getHardLimit() { return 0; }
size_t getSoftLimit() { return 0; }
};
#endif
}
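A minimal usage sketch for the observer declared above (not part of the diff; the 15-second wait time and the byte limits are illustrative):

#include <Common/CgroupsMemoryUsageObserver.h>
#include <chrono>
#include <memory>

std::shared_ptr<DB::CgroupsMemoryUsageObserver> startMemoryObserver()
{
    /// Poll the cgroup memory file every 15 seconds.
    auto observer = std::make_shared<DB::CgroupsMemoryUsageObserver>(std::chrono::seconds(15));
    /// setLimits() installs the callbacks and starts the background thread:
    /// crossing the hard limit corrects the memory tracker, crossing the soft limit purges jemalloc arenas.
    observer->setLimits(/*hard_limit_=*/ 30ULL << 30, /*soft_limit_=*/ 25ULL << 30); /// 30 GiB / 25 GiB
    return observer; /// keep it alive; the destructor stops the observer thread
}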


@ -12,10 +12,10 @@ namespace ErrorCodes
ConcurrencyControl::Slot::~Slot()
{
allocation->release();
static_cast<ConcurrencyControl::Allocation&>(*allocation).release();
}
ConcurrencyControl::Slot::Slot(AllocationPtr && allocation_)
ConcurrencyControl::Slot::Slot(SlotAllocationPtr && allocation_)
: allocation(std::move(allocation_))
{
}
@ -27,7 +27,7 @@ ConcurrencyControl::Allocation::~Allocation()
parent.free(this);
}
[[nodiscard]] ConcurrencyControl::SlotPtr ConcurrencyControl::Allocation::tryAcquire()
[[nodiscard]] AcquiredSlotPtr ConcurrencyControl::Allocation::tryAcquire()
{
SlotCount value = granted.load();
while (value)
@ -35,15 +35,21 @@ ConcurrencyControl::Allocation::~Allocation()
if (granted.compare_exchange_strong(value, value - 1))
{
std::unique_lock lock{mutex};
return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
return AcquiredSlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
}
}
return {}; // avoid unnecessary locking
}
ConcurrencyControl::SlotCount ConcurrencyControl::Allocation::grantedCount() const
SlotCount ConcurrencyControl::Allocation::grantedCount() const
{
return granted;
return granted.load();
}
SlotCount ConcurrencyControl::Allocation::allocatedCount() const
{
std::unique_lock lock{mutex};
return allocated;
}
ConcurrencyControl::Allocation::Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_)
@ -87,7 +93,7 @@ ConcurrencyControl::~ConcurrencyControl()
abort();
}
[[nodiscard]] ConcurrencyControl::AllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max)
[[nodiscard]] SlotAllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max)
{
if (min > max)
throw Exception(ErrorCodes::LOGICAL_ERROR, "ConcurrencyControl: invalid allocation requirements");
@ -100,13 +106,13 @@ ConcurrencyControl::~ConcurrencyControl()
// Create allocation and start waiting if more slots are required
if (granted < max)
return AllocationPtr(new Allocation(*this, max, granted,
return SlotAllocationPtr(new Allocation(*this, max, granted,
waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */)));
else
return AllocationPtr(new Allocation(*this, max, granted));
return SlotAllocationPtr(new Allocation(*this, max, granted));
}
void ConcurrencyControl::setMaxConcurrency(ConcurrencyControl::SlotCount value)
void ConcurrencyControl::setMaxConcurrency(SlotCount value)
{
std::unique_lock lock{mutex};
max_concurrency = std::max<SlotCount>(1, value); // never allow max_concurrency to be zero
@ -162,7 +168,7 @@ void ConcurrencyControl::schedule(std::unique_lock<std::mutex> &)
}
}
ConcurrencyControl::SlotCount ConcurrencyControl::available(std::unique_lock<std::mutex> &) const
SlotCount ConcurrencyControl::available(std::unique_lock<std::mutex> &) const
{
if (cur_concurrency < max_concurrency)
return max_concurrency - cur_concurrency;


@ -7,6 +7,7 @@
#include <base/types.h>
#include <boost/core/noncopyable.hpp>
#include <Common/ISlotControl.h>
namespace DB
{
@ -34,41 +35,35 @@ namespace DB
* Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)`
* because `min` amount of slots is allocated for each query unconditionally.
*/
class ConcurrencyControl : boost::noncopyable
class ConcurrencyControl : public ISlotControl
{
public:
struct Allocation;
using AllocationPtr = std::shared_ptr<Allocation>;
using SlotCount = UInt64;
using Waiters = std::list<Allocation *>;
static constexpr SlotCount Unlimited = std::numeric_limits<SlotCount>::max();
// Scoped guard for acquired slot, see Allocation::tryAcquire()
struct Slot : boost::noncopyable
struct Slot : public IAcquiredSlot
{
~Slot();
~Slot() override;
private:
friend struct Allocation; // for ctor
explicit Slot(AllocationPtr && allocation_);
explicit Slot(SlotAllocationPtr && allocation_);
AllocationPtr allocation;
SlotAllocationPtr allocation;
};
// FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet
using SlotPtr = std::shared_ptr<Slot>;
// Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max)
struct Allocation : std::enable_shared_from_this<Allocation>, boost::noncopyable
struct Allocation : public ISlotAllocation
{
~Allocation();
~Allocation() override;
// Take one already granted slot if available. Lock-free iff there is no granted slot.
[[nodiscard]] SlotPtr tryAcquire();
[[nodiscard]] AcquiredSlotPtr tryAcquire() override;
SlotCount grantedCount() const;
SlotCount grantedCount() const override;
SlotCount allocatedCount() const override;
private:
friend struct Slot; // for release()
@ -94,7 +89,7 @@ public:
ConcurrencyControl & parent;
const SlotCount limit;
std::mutex mutex; // the following values must be accessed under this mutex
mutable std::mutex mutex; // the following values must be accessed under this mutex
SlotCount allocated; // allocated total (including already `released`)
SlotCount released = 0;
@ -103,17 +98,16 @@ public:
const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit
};
public:
ConcurrencyControl();
// WARNING: all Allocation objects MUST be destructed before ConcurrencyControl
// NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries
~ConcurrencyControl();
~ConcurrencyControl() override;
// Allocate at least `min` and at most `max` slots.
// If not all `max` slots were successfully allocated, a subscription for later allocation is created
// Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread.
[[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max);
[[nodiscard]] SlotAllocationPtr allocate(SlotCount min, SlotCount max) override;
void setMaxConcurrency(SlotCount value);
@ -134,7 +128,7 @@ private:
std::mutex mutex;
Waiters waiters;
Waiters::iterator cur_waiter; // round-robin pointer
SlotCount max_concurrency = Unlimited;
SlotCount max_concurrency = UnlimitedSlots;
SlotCount cur_concurrency = 0;
};


@ -262,6 +262,9 @@
M(ActiveTimersInQueryProfiler, "Number of Active thread local timers in QueryProfiler") \
M(RefreshableViews, "Number of materialized views with periodic refreshing (REFRESH)") \
M(RefreshingViews, "Number of materialized views currently executing a refresh") \
M(StorageBufferFlushThreads, "Number of threads for background flushes in StorageBuffer") \
M(StorageBufferFlushThreadsActive, "Number of threads for background flushes in StorageBuffer running a task") \
M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer")
#ifdef APPLY_FOR_EXTERNAL_METRICS
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)


@ -595,6 +595,7 @@
M(713, BROKEN_PROJECTION) \
M(714, UNEXPECTED_CLUSTER) \
M(715, CANNOT_DETECT_FORMAT) \
M(716, CANNOT_FORGET_PARTITION) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \


@ -17,7 +17,7 @@ private:
template <typename T> friend class FiberLocal;
public:
template <typename StackAlloc, typename Fn>
template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn)))
{
}
@ -46,12 +46,6 @@ public:
current_fiber = parent_fiber;
}
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
private:
template <typename Fn>
struct RoutineImpl
@ -80,6 +74,12 @@ private:
Fn fn;
};
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
/// Special wrapper to store data in unique_ptr.
struct DataWrapper
{
@ -146,3 +146,4 @@ private:
T main_instance;
};

src/Common/ISlotControl.h

@ -0,0 +1,76 @@
#pragma once
#include <limits>
#include <memory>
#include <base/types.h>
#include <boost/core/noncopyable.hpp>
namespace DB
{
// Interfaces for abstract "slot" allocation and control.
// Slot is a virtual entity existing in a limited amount (CPUs or memory chunks, etc).
//
// Every slot can be in one of the following states:
// * free: slot is available to be allocated.
// * allocated: slot is allocated to a specific ISlotAllocation.
//
// Allocated slots can be in one of the following states:
// * granted: allocated, but not yet acquired.
// * acquired: a granted slot becomes acquired by using IAcquiredSlot.
//
// Example for CPU (see ConcurrencyControl.h). Every slot represents one CPU in the system.
// Slot allocation is a request to allocate a specific number of CPUs for a specific query.
// An acquired slot is an entity held by a thread as long as it is running. This allows the
// total number of threads in the system to be limited and the distribution process to be controlled.
//
// TODO:
// - for preemption - ability to return granted slot back and reacquire it later.
// - for memory allocations - variable size of slots (in bytes).
/// Number of slots
using SlotCount = UInt64;
/// Unlimited number of slots
constexpr SlotCount UnlimitedSlots = std::numeric_limits<SlotCount>::max();
/// Acquired slot holder. Slot is considered to be acquired as long as the object exists.
class IAcquiredSlot : public std::enable_shared_from_this<IAcquiredSlot>, boost::noncopyable
{
public:
virtual ~IAcquiredSlot() = default;
};
using AcquiredSlotPtr = std::shared_ptr<IAcquiredSlot>;
/// Request for allocation of slots from ISlotControl.
/// Allows for more slots to be acquired and the whole request to be canceled.
class ISlotAllocation : public std::enable_shared_from_this<ISlotAllocation>, boost::noncopyable
{
public:
virtual ~ISlotAllocation() = default;
/// Take one already granted slot if available.
[[nodiscard]] virtual AcquiredSlotPtr tryAcquire() = 0;
/// Returns the number of granted slots for given allocation (i.e. available to be acquired)
virtual SlotCount grantedCount() const = 0;
/// Returns the total number of slots allocated at the moment (acquired and granted)
virtual SlotCount allocatedCount() const = 0;
};
using SlotAllocationPtr = std::shared_ptr<ISlotAllocation>;
class ISlotControl : boost::noncopyable
{
public:
virtual ~ISlotControl() = default;
// Allocate at least `min` and at most `max` slots.
// If not all `max` slots were successfully allocated, a "subscription" for later allocation is created
[[nodiscard]] virtual SlotAllocationPtr allocate(SlotCount min, SlotCount max) = 0;
};
}
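A minimal consumer-side sketch of this interface (not part of the diff; the worker lambda is a stand-in, and any ISlotControl implementation such as ConcurrencyControl can be passed in):

#include <Common/ISlotControl.h>
#include <thread>
#include <vector>

void runWithSlots(DB::ISlotControl & control, DB::SlotCount min_threads, DB::SlotCount max_threads)
{
    /// Ask for at least `min_threads` slots (granted unconditionally, possibly
    /// oversubscribing the limit) and at most `max_threads`.
    DB::SlotAllocationPtr allocation = control.allocate(min_threads, max_threads);

    std::vector<std::thread> threads;
    /// Spawn one thread per currently granted slot; more slots may become granted
    /// later as other allocations release theirs.
    while (DB::AcquiredSlotPtr slot = allocation->tryAcquire())
        threads.emplace_back([slot] { /* do a share of the work while holding the slot */ });

    for (auto & thread : threads)
        thread.join();
    /// Slots are returned when the acquired slot pointers and the allocation are destroyed.
}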


@ -66,7 +66,7 @@ public:
, log(log_)
{
for (size_t i = 0;i < nested_pools.size(); ++i)
shared_pool_states[i].config_priority = nested_pools[i]->getPriority();
shared_pool_states[i].config_priority = nested_pools[i]->getConfigPriority();
}
struct TryResult
@ -133,7 +133,7 @@ protected:
void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const;
std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority);
std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority, bool use_slowdown_count = false);
inline void updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools);
@ -160,7 +160,7 @@ protected:
template <typename TNestedPool>
std::vector<typename PoolWithFailoverBase<TNestedPool>::ShuffledPool>
PoolWithFailoverBase<TNestedPool>::getShuffledPools(
size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority)
size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority, bool use_slowdown_count)
{
/// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates(max_ignored_errors);
@ -175,13 +175,13 @@ PoolWithFailoverBase<TNestedPool>::getShuffledPools(
std::vector<ShuffledPool> shuffled_pools;
shuffled_pools.reserve(nested_pools.size());
for (size_t i = 0; i < nested_pools.size(); ++i)
shuffled_pools.push_back(ShuffledPool{nested_pools[i], &pool_states[i], i, /* error_count = */ 0, /* slowdown_count = */ 0});
shuffled_pools.emplace_back(ShuffledPool{.pool = nested_pools[i], .state = &pool_states[i], .index = i});
::sort(
shuffled_pools.begin(), shuffled_pools.end(),
[](const ShuffledPool & lhs, const ShuffledPool & rhs)
[use_slowdown_count](const ShuffledPool & lhs, const ShuffledPool & rhs)
{
return PoolState::compare(*lhs.state, *rhs.state);
return PoolState::compare(*lhs.state, *rhs.state, use_slowdown_count);
});
return shuffled_pools;
@ -344,10 +344,14 @@ struct PoolWithFailoverBase<TNestedPool>::PoolState
random = rng();
}
static bool compare(const PoolState & lhs, const PoolState & rhs)
static bool compare(const PoolState & lhs, const PoolState & rhs, bool use_slowdown_count)
{
return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
< std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
if (use_slowdown_count)
return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
< std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
else
return std::forward_as_tuple(lhs.error_count, lhs.config_priority, lhs.priority, lhs.random)
< std::forward_as_tuple(rhs.error_count, rhs.config_priority, rhs.priority, rhs.random);
}
private:


@ -632,6 +632,12 @@ The server successfully detected this situation and will download merged part fr
M(InterfacePostgreSQLReceiveBytes, "Number of bytes received through PostgreSQL interfaces") \
\
M(ParallelReplicasUsedCount, "Number of replicas used to execute a query with task-based parallel replicas") \
\
M(KeeperLogsEntryReadFromLatestCache, "Number of log entries in Keeper being read from latest logs cache") \
M(KeeperLogsEntryReadFromCommitCache, "Number of log entries in Keeper being read from commit logs cache") \
M(KeeperLogsEntryReadFromFile, "Number of log entries in Keeper being read directly from the changelog file") \
M(KeeperLogsPrefetchedEntries, "Number of log entries in Keeper being prefetched from the changelog file") \
\
M(ParallelReplicasAvailableCount, "Number of replicas available to execute a query with task-based parallel replicas") \
M(ParallelReplicasUnavailableCount, "Number of replicas which was chosen, but found to be unavailable during query execution with task-based parallel replicas") \


@ -91,7 +91,7 @@ void SensitiveDataMasker::setInstance(std::unique_ptr<SensitiveDataMasker>&& sen
{
if (!sensitive_data_masker_)
throw Exception(ErrorCodes::LOGICAL_ERROR, "The 'sensitive_data_masker' is not set");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: the 'sensitive_data_masker' is not set");
if (sensitive_data_masker_->rulesCount() > 0)
{


@ -209,7 +209,7 @@ public:
{
if (!is_reference_128)
throw DB::Exception(
DB::ErrorCodes::LOGICAL_ERROR, "Can't call get128Reference when is_reference_128 is not set");
DB::ErrorCodes::LOGICAL_ERROR, "Logical error: can't call get128Reference when is_reference_128 is not set");
finalize();
const auto lo = v0 ^ v1 ^ v2 ^ v3;
v1 ^= 0xdd;


@ -448,6 +448,9 @@ toStringEveryLineImpl([[maybe_unused]] bool fatal, const StackTraceRefTriple & s
DB::writePointerHex(frame.physical_addr, out);
}
if (frame.object.has_value())
out << " in " << *frame.object;
callback(out.str());
};
#else


@ -16,6 +16,7 @@
#include <base/find_symbols.h>
#include <base/sort.h>
#include <base/getFQDNOrHostName.h>
#include <Core/ServerUUID.h>
#include "Common/ZooKeeper/IKeeper.h"
#include <Common/DNSResolver.h>
#include <Common/StringUtils/StringUtils.h>
@ -375,7 +376,7 @@ void ZooKeeper::createAncestors(const std::string & path)
}
Coordination::Responses responses;
Coordination::Error code = multiImpl(create_ops, responses);
Coordination::Error code = multiImpl(create_ops, responses, /*check_session_valid*/ false);
if (code == Coordination::Error::ZOK)
return;
@ -638,12 +639,22 @@ Coordination::Error ZooKeeper::trySet(const std::string & path, const std::strin
}
Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses)
Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
if (requests.empty())
return Coordination::Error::ZOK;
auto future_result = asyncTryMultiNoThrow(requests);
std::future<Coordination::MultiResponse> future_result;
if (check_session_valid)
{
Coordination::Requests new_requests = requests;
addCheckSessionOp(new_requests);
future_result = asyncTryMultiNoThrow(new_requests);
}
else
{
future_result = asyncTryMultiNoThrow(requests);
}
if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
{
@ -655,21 +666,35 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests
auto response = future_result.get();
Coordination::Error code = response.error;
responses = response.responses;
if (check_session_valid)
{
if (code != Coordination::Error::ZOK && !Coordination::isHardwareError(code) && getFailedOpIndex(code, responses) == requests.size())
{
impl->finalize(fmt::format("Session was killed: {}", requests.back()->getPath()));
code = Coordination::Error::ZSESSIONMOVED;
}
responses.pop_back();
/// For some reason, for hardware errors we set ZOK codes for all responses.
/// In other cases, if the multi-request status is not ZOK, then the last response status must indicate an error too
chassert(code == Coordination::Error::ZOK || Coordination::isHardwareError(code) || responses.back()->error != Coordination::Error::ZOK);
}
return code;
}
}
Coordination::Responses ZooKeeper::multi(const Coordination::Requests & requests)
Coordination::Responses ZooKeeper::multi(const Coordination::Requests & requests, bool check_session_valid)
{
Coordination::Responses responses;
Coordination::Error code = multiImpl(requests, responses);
Coordination::Error code = multiImpl(requests, responses, check_session_valid);
KeeperMultiException::check(code, requests, responses);
return responses;
}
Coordination::Error ZooKeeper::tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses)
Coordination::Error ZooKeeper::tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
Coordination::Error code = multiImpl(requests, responses);
Coordination::Error code = multiImpl(requests, responses, check_session_valid);
if (code != Coordination::Error::ZOK && !Coordination::isUserError(code))
throw KeeperException(code);
return code;
@ -935,12 +960,40 @@ Coordination::ReconfigResponse ZooKeeper::reconfig(
return future_result.get();
}
ZooKeeperPtr ZooKeeper::create(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr<DB::ZooKeeperLog> zk_log_)
{
auto res = std::shared_ptr<ZooKeeper>(new ZooKeeper(config, config_name, zk_log_));
res->initSession();
return res;
}
ZooKeeperPtr ZooKeeper::startNewSession() const
{
return std::make_shared<ZooKeeper>(args, zk_log);
auto res = std::shared_ptr<ZooKeeper>(new ZooKeeper(args, zk_log));
res->initSession();
return res;
}
void ZooKeeper::initSession()
{
String session_path = fs::path(args.sessions_path) / args.zookeeper_name / toString(DB::ServerUUID::get());
Coordination::Stat stat;
if (trySet(session_path, "", -1, &stat) == Coordination::Error::ZOK)
{
session_node_version = stat.version;
return;
}
createAncestors(session_path);
create(session_path, "", zkutil::CreateMode::Persistent);
session_node_version = 0;
}
void ZooKeeper::addCheckSessionOp(Coordination::Requests & requests) const
{
String session_path = fs::path(args.sessions_path) / args.zookeeper_name / toString(DB::ServerUUID::get());
requests.push_back(zkutil::makeCheckRequest(session_path, session_node_version));
}
bool ZooKeeper::expired()
{
@ -1243,11 +1296,11 @@ std::future<Coordination::MultiResponse> ZooKeeper::asyncMulti(const Coordinatio
return future;
}
Coordination::Error ZooKeeper::tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses)
Coordination::Error ZooKeeper::tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
try
{
return multiImpl(requests, responses);
return multiImpl(requests, responses, check_session_valid);
}
catch (const Coordination::Exception & e)
{


@ -198,11 +198,6 @@ class ZooKeeper
/// ZooKeeperWithFaultInjection wants access to `impl` pointer to reimplement some async functions with faults
friend class DB::ZooKeeperWithFaultInjection;
public:
using Ptr = std::shared_ptr<ZooKeeper>;
using ErrorsList = std::initializer_list<Coordination::Error>;
explicit ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr);
/** Config of the form:
@ -227,10 +222,27 @@ public:
<identity>user:password</identity>
</zookeeper>
*/
ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr<DB::ZooKeeperLog> zk_log_);
ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr);
/// See addCheckSessionOp
void initSession();
public:
using Ptr = std::shared_ptr<ZooKeeper>;
using ErrorsList = std::initializer_list<Coordination::Error>;
std::vector<ShuffleHost> shuffleHosts() const;
static Ptr create(const Poco::Util::AbstractConfiguration & config,
const std::string & config_name,
std::shared_ptr<DB::ZooKeeperLog> zk_log_);
template <typename... Args>
static Ptr createWithoutKillingPreviousSessions(Args &&... args)
{
return std::shared_ptr<ZooKeeper>(new ZooKeeper(std::forward<Args>(args)...));
}
/// Creates a new session with the same parameters. This method can be used for reconnecting
/// after the session has expired.
/// This object remains unchanged, and the new session is returned.
@ -427,13 +439,14 @@ public:
/// Performs several operations in a transaction.
/// Throws on every error.
Coordination::Responses multi(const Coordination::Requests & requests);
/// For check_session_valid see addCheckSessionOp
Coordination::Responses multi(const Coordination::Requests & requests, bool check_session_valid = false);
/// Throws only if some operation has returned an "unexpected" error - an error that would cause
/// the corresponding try- method to throw.
/// On exception, `responses` may or may not be populated.
Coordination::Error tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses);
Coordination::Error tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid = false);
/// Throws nothing (even session expired errors)
Coordination::Error tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses);
Coordination::Error tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid = false);
std::string sync(const std::string & path);
@ -587,6 +600,22 @@ public:
const DB::KeeperFeatureFlags * getKeeperFeatureFlags() const { return impl->getKeeperFeatureFlags(); }
/// Checks that our session was not killed, and allows to avoid applying a request from an old lost session.
/// Imagine a "connection-loss-on-commit" situation like this:
/// - We have written some write requests to the socket and immediately disconnected (e.g. due to "Operation timeout")
/// - The requests were sent, but the destination [Zoo]Keeper host will receive them later (it doesn't know about our requests yet)
/// - We don't know the status of our requests
/// - We connect to another [Zoo]Keeper replica with a new session, and do some reads
/// to find out the status of our requests. We see that they were not committed.
/// - The packets from our old session finally arrive at the destination [Zoo]Keeper host. The requests get processed.
/// - Changes are committed (although, we have already seen that they are not)
///
/// We need a way to reject requests from old sessions somehow.
///
/// So we update the version of /clickhouse/sessions/server_uuid node when starting a new session.
/// And there's an option to check this version when committing something.
void addCheckSessionOp(Coordination::Requests & requests) const;
private:
void init(ZooKeeperArgs args_);
@ -602,7 +631,7 @@ private:
Coordination::Stat * stat,
Coordination::WatchCallbackPtr watch_callback,
Coordination::ListRequestType list_request_type);
Coordination::Error multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses);
Coordination::Error multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid);
Coordination::Error existsImpl(const std::string & path, Coordination::Stat * stat_, Coordination::WatchCallback watch_callback);
Coordination::Error syncImpl(const std::string & path, std::string & returned_path);
@ -654,6 +683,8 @@ private:
std::shared_ptr<DB::ZooKeeperLog> zk_log;
AtomicStopwatch session_uptime;
int32_t session_node_version;
};
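A minimal sketch of committing a multi-request with the new session check (not part of the diff; the znode paths are illustrative and the include path is assumed):

#include <Common/ZooKeeper/ZooKeeper.h>

void commitLogEntry(zkutil::ZooKeeper & zookeeper)
{
    Coordination::Requests ops;
    ops.emplace_back(zkutil::makeCreateRequest("/table/log/log-0000000042", "entry", zkutil::CreateMode::Persistent));
    ops.emplace_back(zkutil::makeSetRequest("/table/metadata_version", "43", -1));

    /// check_session_valid = true appends a Check on /clickhouse/sessions/<server_uuid>,
    /// so a commit arriving from an old, superseded session fails (ZSESSIONMOVED)
    /// instead of being applied behind our back.
    zookeeper.multi(ops, /*check_session_valid=*/ true);
}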


@ -132,6 +132,8 @@ void ZooKeeperArgs::initFromKeeperServerSection(const Poco::Util::AbstractConfig
void ZooKeeperArgs::initFromKeeperSection(const Poco::Util::AbstractConfiguration & config, const std::string & config_name)
{
zookeeper_name = config_name;
Poco::Util::AbstractConfiguration::Keys keys;
config.keys(config_name, keys);
@ -193,6 +195,10 @@ void ZooKeeperArgs::initFromKeeperSection(const Poco::Util::AbstractConfiguratio
{
chroot = config.getString(config_name + "." + key);
}
else if (key == "sessions_path")
{
sessions_path = config.getString(config_name + "." + key);
}
else if (key == "implementation")
{
implementation = config.getString(config_name + "." + key);


@ -29,11 +29,13 @@ struct ZooKeeperArgs
ZooKeeperArgs() = default;
bool operator == (const ZooKeeperArgs &) const = default;
String zookeeper_name = "zookeeper";
String implementation = "zookeeper";
Strings hosts;
String auth_scheme;
String identity;
String chroot;
String sessions_path = "/clickhouse/sessions";
int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS;
int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;


@ -401,9 +401,6 @@ ZooKeeper::ZooKeeper(
keeper_feature_flags.logFlags(log);
ProfileEvents::increment(ProfileEvents::ZooKeeperInit);
/// Avoid stale reads after connecting
sync("/", [](const SyncResponse &){});
}
catch (...)
{


@ -336,14 +336,14 @@ Coordination::Error ZooKeeperWithFaultInjection::tryCreate(const std::string & p
return tryCreate(path, data, mode, path_created);
}
Coordination::Responses ZooKeeperWithFaultInjection::multi(const Coordination::Requests & requests)
Coordination::Responses ZooKeeperWithFaultInjection::multi(const Coordination::Requests & requests, bool check_session_valid)
{
return executeWithFaultSync(
__func__,
!requests.empty() ? requests.front()->getPath() : "",
[&]()
{
auto responses = keeper->multi(requests);
auto responses = keeper->multi(requests, check_session_valid);
if (unlikely(fault_policy))
multiResponseSaveEphemeralNodePaths(requests, responses);
return responses;
@ -420,14 +420,14 @@ void ZooKeeperWithFaultInjection::deleteEphemeralNodeIfContentMatches(
__func__, path, [&]() { return keeper->deleteEphemeralNodeIfContentMatches(path, fast_delete_if_equal_value); });
}
Coordination::Error ZooKeeperWithFaultInjection::tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses)
Coordination::Error ZooKeeperWithFaultInjection::tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
return executeWithFaultSync(
__func__,
!requests.empty() ? requests.front()->getPath() : "",
[&]()
{
auto code = keeper->tryMulti(requests, responses);
auto code = keeper->tryMulti(requests, responses, check_session_valid);
if (unlikely(fault_policy) && code == Coordination::Error::ZOK)
multiResponseSaveEphemeralNodePaths(requests, responses);
return code;
@ -435,11 +435,11 @@ Coordination::Error ZooKeeperWithFaultInjection::tryMulti(const Coordination::Re
}
Coordination::Error
ZooKeeperWithFaultInjection::tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses)
ZooKeeperWithFaultInjection::tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid)
{
try
{
return tryMulti(requests, responses);
return tryMulti(requests, responses, check_session_valid);
}
catch (const Coordination::Exception & e)
{


@ -212,7 +212,7 @@ public:
Coordination::Error tryCreate(const std::string & path, const std::string & data, int32_t mode);
Coordination::Responses multi(const Coordination::Requests & requests);
Coordination::Responses multi(const Coordination::Requests & requests, bool check_session_valid = false);
void createIfNotExists(const std::string & path, const std::string & data);
@ -242,9 +242,9 @@ public:
void deleteEphemeralNodeIfContentMatches(const std::string & path, const std::string & fast_delete_if_equal_value);
Coordination::Error tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses);
Coordination::Error tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid = false);
Coordination::Error tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses);
Coordination::Error tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses, bool check_session_valid = false);
///
/// mirror ZooKeeper interface: Async functions


@ -5,9 +5,9 @@
int main(int argc, char ** argv)
try
{
zkutil::ZooKeeper zookeeper{zkutil::ZooKeeperArgs("localhost:2181")};
auto zookeeper = zkutil::ZooKeeper::createWithoutKillingPreviousSessions(zkutil::ZooKeeperArgs("localhost:2181"));
auto nodes = zookeeper.getChildren("/tmp");
auto nodes = zookeeper->getChildren("/tmp");
if (argc < 2)
{
@ -26,7 +26,7 @@ try
std::vector<std::future<Coordination::GetResponse>> futures;
futures.reserve(nodes.size());
for (auto & node : nodes)
futures.push_back(zookeeper.asyncGet("/tmp/" + node));
futures.push_back(zookeeper->asyncGet("/tmp/" + node));
for (auto & future : futures)
std::cerr << (future.get().data.empty() ? ',' : '.');


@ -16,34 +16,34 @@ try
return 1;
}
ZooKeeper zk{zkutil::ZooKeeperArgs(argv[1])};
auto zk = ZooKeeper::createWithoutKillingPreviousSessions(zkutil::ZooKeeperArgs(argv[1]));
std::cout << "create path" << std::endl;
zk.create("/test", "old", zkutil::CreateMode::Persistent);
zk->create("/test", "old", zkutil::CreateMode::Persistent);
Coordination::Stat stat;
zkutil::EventPtr watch = std::make_shared<Poco::Event>();
std::cout << "get path" << std::endl;
zk.get("/test", &stat, watch);
zk->get("/test", &stat, watch);
std::cout << "set path" << std::endl;
zk.set("/test", "new");
zk->set("/test", "new");
watch->wait();
std::cout << "watch happened" << std::endl;
std::cout << "remove path" << std::endl;
std::cout << "list path" << std::endl;
Strings children = zk.getChildren("/");
Strings children = zk->getChildren("/");
for (const auto & name : children)
std::cerr << "\t" << name << "\n";
zk.remove("/test");
zk->remove("/test");
Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest("/test", "multi1", CreateMode::Persistent));
ops.emplace_back(zkutil::makeSetRequest("/test", "multi2", -1));
ops.emplace_back(zkutil::makeRemoveRequest("/test", -1));
std::cout << "multi" << std::endl;
Coordination::Responses res = zk.multi(ops);
Coordination::Responses res = zk->multi(ops);
std::cout << "path created: " << dynamic_cast<const Coordination::CreateResponse &>(*res[0]).path_created << std::endl;
return 0;


@ -1,8 +1,8 @@
#include <base/getThreadId.h>
#include <base/defines.h> /// THREAD_SANITIZER
#include <Common/checkStackSize.h>
#include <Common/Exception.h>
#include <Common/Fiber.h>
#include <base/getThreadId.h>
#include <base/scope_guard.h>
#include <base/defines.h> /// THREAD_SANITIZER
#include <sys/resource.h>
#include <pthread.h>
#include <unistd.h>
@ -114,10 +114,6 @@ __attribute__((__weak__)) void checkStackSize()
{
using namespace DB;
/// Not implemented for coroutines.
if (Fiber::getCurrentFiber())
return;
if (!stack_address)
max_stack_size = getStackSize(&stack_address);
@ -140,7 +136,7 @@ __attribute__((__weak__)) void checkStackSize()
/// We assume that stack grows towards lower addresses. And that it starts to grow from the end of a chunk of memory of max_stack_size.
if (int_frame_address > int_stack_address + max_stack_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Frame address is greater than stack begin address");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: frame address is greater than stack begin address");
size_t stack_size = int_stack_address + max_stack_size - int_frame_address;
size_t max_stack_size_allowed = static_cast<size_t>(max_stack_size * STACK_SIZE_FREE_RATIO);


@ -1,5 +1,4 @@
#include "getNumberOfPhysicalCPUCores.h"
#include <filesystem>
#if defined(OS_LINUX)
# include <cmath>
@ -8,8 +7,10 @@
#include <boost/algorithm/string/trim.hpp>
#include <boost/algorithm/string/split.hpp>
#include <base/cgroupsv2.h>
#include <base/range.h>
#include <filesystem>
#include <thread>
#include <set>
@ -33,26 +34,15 @@ int32_t readFrom(const std::filesystem::path & filename, int default_value)
uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
{
uint32_t quota_count = default_cpu_count;
std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
/// cgroupsv2
std::ifstream contr_file(default_cgroups_mount / "cgroup.controllers");
if (contr_file.is_open())
if (cgroupsV2Enabled())
{
/// First, we identify the cgroup the process belongs
std::ifstream cgroup_name_file("/proc/self/cgroup");
if (!cgroup_name_file.is_open())
std::string cgroup = cgroupV2OfProcess();
if (cgroup.empty())
return default_cpu_count;
// cgroup_name_file always starts with '0::/' for v2
cgroup_name_file.ignore(4);
std::string cgroup_name;
cgroup_name_file >> cgroup_name;
std::filesystem::path current_cgroup;
if (cgroup_name.empty())
current_cgroup = default_cgroups_mount;
else
current_cgroup = default_cgroups_mount / cgroup_name;
auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);
// Looking for cpu.max in directories from the current cgroup up to the top level.
// The search does not stop at the first match, because a child cgroup could have a greater value than its parent.
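// For reference (not part of this diff): a cgroup v2 "cpu.max" file holds "<quota> <period>",
// e.g. "200000 100000" caps the cgroup at 2 CPUs, while "max 100000" means no quota.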
@ -72,7 +62,7 @@ uint32_t getCGroupLimitedCPUCores(unsigned default_cpu_count)
}
current_cgroup = current_cgroup.parent_path();
}
current_cgroup = default_cgroups_mount / cgroup_name;
current_cgroup = default_cgroups_mount / cgroup;
// Looking for cpuset.cpus.effective in directories from the current cgroup to the top level
while (current_cgroup != default_cgroups_mount.parent_path())
{
@ -179,7 +169,6 @@ unsigned getNumberOfPhysicalCPUCoresImpl()
{
unsigned cores = std::thread::hardware_concurrency(); /// logical cores (with SMT/HyperThreading)
#if defined(__x86_64__) && defined(OS_LINUX)
/// Most x86_64 CPUs have 2-way SMT (Hyper-Threading).
/// Aarch64 and RISC-V don't have SMT so far.

Some files were not shown because too many files have changed in this diff.