Merge branch 'master' into replicated_database_improvements

2024-11-21 23:21:59 +00:00 · 2022-07-28 13:08:30 +02:00 · 2022-07-28 13:08:30 +02:00 · b3732df248
commit b3732df248
parent c7fb6aa6d5 28ef623a42
292 changed files with 6292 additions and 1976 deletions
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -1126,6 +1126,84 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestReleaseDatabaseReplicated0:  # RUN_BY_HASH_NUM=0 of RUN_BY_HASH_TOTAL=2; presumably shard 0 of 2 - confirm in functional_test_check.py
+    needs: [BuilderDebRelease]  # starts only after the BuilderDebRelease job
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs  # appended to $GITHUB_ENV so that the following steps see these variables
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_database_replicated
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (release, DatabaseReplicated)
+          REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=0
+          RUN_BY_HASH_TOTAL=2
+          EOF
+      - name: Download json reports  # no artifact name is given, so all artifacts of the run are downloaded
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository  # wipe and recreate the workspace before the checkout
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test  # runs from a copy of the workspace; $REPO_COPY assumes the workspace directory is named ClickHouse
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()  # run the cleanup even if an earlier step failed or the job was cancelled
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:  # '||:' ignores the error, e.g. when no container is running
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestReleaseDatabaseReplicated1:  # RUN_BY_HASH_NUM=1 of RUN_BY_HASH_TOTAL=2; presumably shard 1 of 2 - confirm in functional_test_check.py
+    needs: [BuilderDebRelease]  # starts only after the BuilderDebRelease job
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs  # appended to $GITHUB_ENV so that the following steps see these variables
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_database_replicated
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (release, DatabaseReplicated)
+          REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=1
+          RUN_BY_HASH_TOTAL=2
+          EOF
+      - name: Download json reports  # no artifact name is given, so all artifacts of the run are downloaded
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository  # wipe and recreate the workspace before the checkout
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test  # runs from a copy of the workspace; $REPO_COPY assumes the workspace directory is named ClickHouse
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()  # run the cleanup even if an earlier step failed or the job was cancelled
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:  # '||:' ignores the error, e.g. when no container is running
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
  FunctionalStatelessTestReleaseS3:
    needs: [BuilderDebRelease]
    runs-on: [self-hosted, func-tester]
@ -1706,43 +1784,6 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
-  FunctionalStatefulTestReleaseDatabaseOrdinary:
-    needs: [BuilderDebRelease]
-    runs-on: [self-hosted, func-tester]
-    steps:
-      - name: Set envs
-        run: |
-          cat >> "$GITHUB_ENV" << 'EOF'
-          TEMP_PATH=${{runner.temp}}/stateful_release_database_ordinary
-          REPORTS_PATH=${{runner.temp}}/reports_dir
-          CHECK_NAME=Stateful tests (release, DatabaseOrdinary)
-          REPO_COPY=${{runner.temp}}/stateful_release_database_ordinary/ClickHouse
-          KILL_TIMEOUT=3600
-          EOF
-      - name: Download json reports
-        uses: actions/download-artifact@v2
-        with:
-          path: ${{ env.REPORTS_PATH }}
-      - name: Clear repository
-        run: |
-          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
-      - name: Check out repository code
-        uses: actions/checkout@v2
-      - name: Functional test
-        run: |
-          sudo rm -fr "$TEMP_PATH"
-          mkdir -p "$TEMP_PATH"
-          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
-          cd "$REPO_COPY/tests/ci"
-          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
-      - name: Cleanup
-        if: always()
-        run: |
-          # shellcheck disable=SC2046
-          docker kill $(docker ps -q) ||:
-          # shellcheck disable=SC2046
-          docker rm -f $(docker ps -a -q) ||:
-          sudo rm -fr "$TEMP_PATH"
  FunctionalStatefulTestAarch64:
    needs: [BuilderDebAarch64]
    runs-on: [self-hosted, func-tester-aarch64]
@ -3063,6 +3104,8 @@ jobs:
      - FunctionalStatelessTestDebug2
      - FunctionalStatelessTestRelease
      - FunctionalStatelessTestReleaseDatabaseOrdinary
+      - FunctionalStatelessTestReleaseDatabaseReplicated0
+      - FunctionalStatelessTestReleaseDatabaseReplicated1
      - FunctionalStatelessTestAarch64
      - FunctionalStatelessTestAsan0
      - FunctionalStatelessTestAsan1
@ -3075,7 +3118,6 @@ jobs:
      - FunctionalStatelessTestUBsan
      - FunctionalStatefulTestDebug
      - FunctionalStatefulTestRelease
-      - FunctionalStatefulTestReleaseDatabaseOrdinary
      - FunctionalStatelessTestReleaseS3
      - FunctionalStatefulTestAarch64
      - FunctionalStatefulTestAsan
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -254,7 +254,7 @@ jobs:
 #################################### ORDINARY BUILDS ####################################
 #########################################################################################
  BuilderDebRelease:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -301,7 +301,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
  BuilderBinRelease:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -345,53 +345,8 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
-  # BuilderBinGCC:
-  #   needs: [DockerHubPush, FastTest]
-  #   runs-on: [self-hosted, builder]
-  #   steps:
-  #     - name: Set envs
-  #       run: |
-  #         cat >> "$GITHUB_ENV" << 'EOF'
-  #         TEMP_PATH=${{runner.temp}}/build_check
-  #         IMAGES_PATH=${{runner.temp}}/images_path
-  #         REPO_COPY=${{runner.temp}}/build_check/ClickHouse
-  #         CACHES_PATH=${{runner.temp}}/../ccaches
-  #         BUILD_NAME=binary_gcc
-  #         EOF
-  #     - name: Download changed images
-  #       uses: actions/download-artifact@v2
-  #       with:
-  #         name: changed_images
-  #         path: ${{ runner.temp }}/images_path
-  #     - name: Clear repository
-  #       run: |
-  #         sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
-  #     - name: Check out repository code
-  #       uses: actions/checkout@v2
-  #     - name: Build
-  #       run: |
-  #         git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-  #         git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
-  #         sudo rm -fr "$TEMP_PATH"
-  #         mkdir -p "$TEMP_PATH"
-  #         cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
-  #         cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
-  #     - name: Upload build URLs to artifacts
-  #       if: ${{ success() || failure() }}
-  #       uses: actions/upload-artifact@v2
-  #       with:
-  #         name: ${{ env.BUILD_URLS }}
-  #         path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
-  #     - name: Cleanup
-  #       if: always()
-  #       run: |
-  #         # shellcheck disable=SC2046
-  #         docker kill $(docker ps -q) ||:
-  #         # shellcheck disable=SC2046
-  #         docker rm -f $(docker ps -a -q) ||:
-  #         sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderDebAarch64:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -438,7 +393,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderDebAsan:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -483,7 +438,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderDebUBsan:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -528,7 +483,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderDebTsan:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -573,7 +528,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderDebMsan:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -618,7 +573,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderDebDebug:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -666,7 +621,7 @@ jobs:
 ##################################### SPECIAL BUILDS #####################################
 ##########################################################################################
  BuilderDebSplitted:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -711,7 +666,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderBinClangTidy:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -756,7 +711,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderBinDarwin:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -801,7 +756,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderBinAarch64:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -846,7 +801,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderBinFreeBSD:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -891,7 +846,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderBinDarwinAarch64:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -936,7 +891,7 @@ jobs:
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
  BuilderBinPPC64:
-    needs: [DockerHubPush, FastTest]
+    needs: [DockerHubPush, FastTest, StyleCheck]
    runs-on: [self-hosted, builder]
    steps:
      - name: Set envs
@ -2974,42 +2929,6 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
-  # UnitTestsReleaseGCC:
-  #   needs: [BuilderBinGCC]
-  #   runs-on: [self-hosted, fuzzer-unit-tester]
-  #   steps:
-  #     - name: Set envs
-  #       run: |
-  #         cat >> "$GITHUB_ENV" << 'EOF'
-  #         TEMP_PATH=${{runner.temp}}/unit_tests_asan
-  #         REPORTS_PATH=${{runner.temp}}/reports_dir
-  #         CHECK_NAME=Unit tests (release-gcc)
-  #         REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse
-  #         EOF
-  #     - name: Download json reports
-  #       uses: actions/download-artifact@v2
-  #       with:
-  #         path: ${{ env.REPORTS_PATH }}
-  #     - name: Clear repository
-  #       run: |
-  #         sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
-  #     - name: Check out repository code
-  #       uses: actions/checkout@v2
-  #     - name: Unit test
-  #       run: |
-  #         sudo rm -fr "$TEMP_PATH"
-  #         mkdir -p "$TEMP_PATH"
-  #         cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
-  #         cd "$REPO_COPY/tests/ci"
-  #         python3 unit_tests_check.py "$CHECK_NAME"
-  #     - name: Cleanup
-  #       if: always()
-  #       run: |
-  #         # shellcheck disable=SC2046
-  #         docker kill $(docker ps -q) ||:
-  #         # shellcheck disable=SC2046
-  #         docker rm -f $(docker ps -a -q) ||:
-  #         sudo rm -fr "$TEMP_PATH"
  UnitTestsTsan:
    needs: [BuilderDebTsan]
    runs-on: [self-hosted, fuzzer-unit-tester]
--- a/contrib/avro
+++ b/contrib/avro
@ -1 +1 @@
-Subproject commit e43c46e87fd32eafdc09471e95344555454c5ef8
+Subproject commit 7832659ec986075d560f930c288e973c64679552
--- a/docker/packager/binary/Dockerfile
+++ b/docker/packager/binary/Dockerfile
@ -119,5 +119,20 @@ ENV GOCACHE=/workdir/
 RUN mkdir /workdir && chmod 777 /workdir
 WORKDIR /workdir

+# FIXME: thread sanitizer is broken in clang-14, we have to build it with clang-13
+# https://github.com/ClickHouse/ClickHouse/pull/39450
+# https://github.com/google/sanitizers/issues/1540
+# https://github.com/google/sanitizers/issues/1552
+# Adds the apt.llvm.org llvm-toolchain-13 source for the codename reported by lsb_release, then installs clang-13 and clang-tidy-13. NOTE(review): [trusted=yes] makes APT skip signature verification for this source.
+RUN export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \
+    && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-13 main" >> \
+        /etc/apt/sources.list.d/clang.list \
+    && apt-get update \
+    && apt-get install \
+        clang-13 \
+        clang-tidy-13 \
+        --yes --no-install-recommends \
+    && apt-get clean
+
 COPY build.sh /
 CMD ["bash", "-c", "/build.sh 2>&1"]
--- a/docker/packager/packager
+++ b/docker/packager/packager
@ -323,6 +323,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--compiler",
        choices=(
+            "clang-13",  # For TSAN builds, see #39450
            "clang-14",
            "clang-14-darwin",
            "clang-14-darwin-aarch64",
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -7,29 +7,26 @@ set -x

 # Thread Fuzzer allows to check more permutations of possible thread scheduling
 # and find more potential issues.
-#
-# But under thread fuzzer, TSan build is too slow and this produces some flaky
-# tests, so for now, as a temporary solution it had been disabled.
-if ! test -f package_folder/clickhouse-server*tsan*.deb; then
-    export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
-    export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
-    export THREAD_FUZZER_SLEEP_TIME_US=100000

-    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
-    export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
-    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
-    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
+export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
+export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
+export THREAD_FUZZER_SLEEP_TIME_US=100000

-    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
+export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
+export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
+export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
+export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
+
+export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
+export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
+export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
+export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
+export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
+
+export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
+export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
+export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000

-    export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
-    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
-    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
-fi

 function install_packages()
 {
--- a/docs/changelogs/v22.6.4.35-stable.md
+++ b/docs/changelogs/v22.6.4.35-stable.md
@ -0,0 +1,36 @@
+---
+sidebar_position: 1
+sidebar_label: 2022
+---
+
+# 2022 Changelog
+
+### ClickHouse release v22.6.4.35-stable FIXME as compared to v22.6.3.35-stable
+
+#### Build/Testing/Packaging Improvement
+* Backported in [#38822](https://github.com/ClickHouse/ClickHouse/issues/38822): - Change `all|noarch` packages to architecture-dependent - Fix some documentation for it - Push aarch64|arm64 packages to artifactory and release assets - Fixes [#36443](https://github.com/ClickHouse/ClickHouse/issues/36443). [#38580](https://github.com/ClickHouse/ClickHouse/pull/38580) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
+
+#### Bug Fix (user-visible misbehavior in official stable or prestable release)
+
+* Backported in [#38242](https://github.com/ClickHouse/ClickHouse/issues/38242): Fix possible crash in `Distributed` async insert in case of removing a replica from config. [#38029](https://github.com/ClickHouse/ClickHouse/pull/38029) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Backported in [#38865](https://github.com/ClickHouse/ClickHouse/issues/38865): Fix s3 seekable reads with parallel read buffer. (Affected memory usage during query). Closes [#38258](https://github.com/ClickHouse/ClickHouse/issues/38258). [#38802](https://github.com/ClickHouse/ClickHouse/pull/38802) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Backported in [#38853](https://github.com/ClickHouse/ClickHouse/issues/38853): Update `simdjson`. This fixes [#38621](https://github.com/ClickHouse/ClickHouse/issues/38621). [#38838](https://github.com/ClickHouse/ClickHouse/pull/38838) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
+* Backported in [#38942](https://github.com/ClickHouse/ClickHouse/issues/38942): - Fix settings profile with seconds unit. [#38896](https://github.com/ClickHouse/ClickHouse/pull/38896) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#39063](https://github.com/ClickHouse/ClickHouse/issues/39063): Any allocations inside OvercommitTracker may lead to deadlock. Logging was not very informative so it's easier just to remove logging. Fixes [#37794](https://github.com/ClickHouse/ClickHouse/issues/37794). [#39030](https://github.com/ClickHouse/ClickHouse/pull/39030) ([Dmitry Novik](https://github.com/novikd)).
+* Backported in [#39077](https://github.com/ClickHouse/ClickHouse/issues/39077): Fix bug in filesystem cache that could happen in some corner case which coincided with cache capacity hitting the limit. Closes [#39066](https://github.com/ClickHouse/ClickHouse/issues/39066). [#39070](https://github.com/ClickHouse/ClickHouse/pull/39070) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Backported in [#39151](https://github.com/ClickHouse/ClickHouse/issues/39151): Fix error `Block structure mismatch` which could happen for INSERT into table with attached MATERIALIZED VIEW and enabled setting `extremes = 1`. Closes [#29759](https://github.com/ClickHouse/ClickHouse/issues/29759) and [#38729](https://github.com/ClickHouse/ClickHouse/issues/38729). [#39125](https://github.com/ClickHouse/ClickHouse/pull/39125) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Backported in [#39275](https://github.com/ClickHouse/ClickHouse/issues/39275): Fixed error `Not found column Type in block` in selects with `PREWHERE` and read-in-order optimizations. [#39157](https://github.com/ClickHouse/ClickHouse/pull/39157) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
+* Backported in [#39371](https://github.com/ClickHouse/ClickHouse/issues/39371): Declare RabbitMQ queue without default arguments `x-max-length` and `x-overflow`. [#39259](https://github.com/ClickHouse/ClickHouse/pull/39259) ([rnbondarenko](https://github.com/rnbondarenko)).
+* Backported in [#39352](https://github.com/ClickHouse/ClickHouse/issues/39352): Fix incorrect fetch postgresql tables query for PostgreSQL database engine. Closes [#33502](https://github.com/ClickHouse/ClickHouse/issues/33502). [#39283](https://github.com/ClickHouse/ClickHouse/pull/39283) ([Kseniia Sumarokova](https://github.com/kssenii)).
+
+#### NO CL CATEGORY
+
+* Backported in [#38685](https://github.com/ClickHouse/ClickHouse/issues/38685):. [#38449](https://github.com/ClickHouse/ClickHouse/pull/38449) ([Maksim Kita](https://github.com/kitaisreal)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Use native Map type for OpenTelemetry attributes [#38814](https://github.com/ClickHouse/ClickHouse/pull/38814) ([Ilya Yatsishin](https://github.com/qoega)).
+* Retry docker buildx commands with progressive sleep in between [#38898](https://github.com/ClickHouse/ClickHouse/pull/38898) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
+* Add docker_server.py running to backport and release CIs [#39011](https://github.com/ClickHouse/ClickHouse/pull/39011) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
+* Fix meilisearch tests [#39110](https://github.com/ClickHouse/ClickHouse/pull/39110) ([Kseniia Sumarokova](https://github.com/kssenii)).
+
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@ -482,9 +482,9 @@ For example:

 ## Projections {#projections}
 Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries.
-
-Projections are an experimental feature. To enable them you must set the [allow_experimental_projection_optimization](../../../operations/settings/settings.md#allow-experimental-projection-optimization) to `1`. See also the [force_optimize_projection](../../../operations/settings/settings.md#force-optimize-projection) setting.
-
+:::note
+When you are implementing projections you should also consider the [force_optimize_projection](../../../operations/settings/settings.md#force-optimize-projection) setting.
+:::
 Projections are not supported in the `SELECT` statements with the [FINAL](../../../sql-reference/statements/select/from.md#select-from-final) modifier.

 ### Projection Query {#projection-query}
--- a/docs/en/interfaces/third-party/gui.md
+++ b/docs/en/interfaces/third-party/gui.md
@ -67,7 +67,7 @@ Features:

 ### Grafana {#grafana}

-[Grafana](https://grafana.com/grafana/plugins/vertamedia-clickhouse-datasource) is a platform for monitoring and visualization.
+[Grafana](https://grafana.com/grafana/plugins/grafana-clickhouse-datasource/) is a platform for monitoring and visualization.

 "Grafana allows you to query, visualize, alert on and understand your metrics no matter where they are stored. Create, explore, and share dashboards with your team and foster a data driven culture. Trusted and loved by the community" &mdash; grafana.com.

--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -197,7 +197,7 @@ Default value: `480` (8 minute).

 Parameter of a task that cleans up garbage from `store/` directory.
 If some subdirectory is not used by clickhouse-server and this directory was not modified for last
-`database_catalog_unused_dir_hide_timeout_sec` seconds, the task will "hide" this directory by 
+`database_catalog_unused_dir_hide_timeout_sec` seconds, the task will "hide" this directory by
 removing all access rights. It also works for directories that clickhouse-server does not
 expect to see inside `store/`. Zero means "immediately".

@ -206,10 +206,10 @@ Default value: `3600` (1 hour).
 ## database_catalog_unused_dir_rm_timeout_sec {#database_catalog_unused_dir_rm_timeout_sec}

 Parameter of a task that cleans up garbage from `store/` directory.
-If some subdirectory is not used by clickhouse-server and it was previousely "hidden" 
-(see [database_catalog_unused_dir_hide_timeout_sec](../../operations/server-configuration-parameters/settings.md#database_catalog_unused_dir_hide_timeout_sec)) 
+If some subdirectory is not used by clickhouse-server and it was previously "hidden"
+(see [database_catalog_unused_dir_hide_timeout_sec](../../operations/server-configuration-parameters/settings.md#database_catalog_unused_dir_hide_timeout_sec))
 and this directory was not modified for last
-`database_catalog_unused_dir_rm_timeout_sec` seconds, the task will remove this directory. 
+`database_catalog_unused_dir_rm_timeout_sec` seconds, the task will remove this directory.
 It also works for directories that clickhouse-server does not
 expect to see inside `store/`. Zero means "never".

@ -731,6 +731,16 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa

 -   [max_server_memory_usage](#max_server_memory_usage)

+## concurrent_threads_soft_limit {#concurrent_threads_soft_limit}
+The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit: if the limit is reached, the query will still get one thread to run.
+
+Possible values:
+-   Positive integer.
+-   0 — No limit.
+-   -1 — The parameter is initialized to the number of logical cores multiplied by 3, which is a good heuristic for CPU-bound tasks.
+
+Default value: `0`.
+
 ## max_concurrent_queries {#max-concurrent-queries}

 The maximum number of simultaneously processed queries.
--- a/docs/en/operations/settings/settings-users.md
+++ b/docs/en/operations/settings/settings-users.md
@ -29,7 +29,7 @@ Structure of the `users` section:
        <profile>profile_name</profile>

        <quota>default</quota>
-        <default_database>default<default_database>
+        <default_database>default</default_database>
        <databases>
            <database_name>
                <table_name>
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -302,18 +302,34 @@ Default value: `ALL`.

 Specifies [JOIN](../../sql-reference/statements/select/join.md) algorithm.

+Several algorithms can be specified, and an available one would be chosen for a particular query based on kind/strictness and table engine.
+
 Possible values:

- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used.
- `partial_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) is used.
- `prefer_partial_merge` — ClickHouse always tries to use `merge` join if possible.
- `auto` — ClickHouse tries to change `hash` join to `merge` join on the fly to avoid out of memory.
+- `default` — `hash` or `direct`, if possible (same as `direct,hash`)

-Default value: `hash`.
+- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.

-When using `hash` algorithm the right part of `JOIN` is uploaded into RAM.
+- `parallel_hash` — a variation of `hash` join that splits the data into buckets and concurrently builds several hash tables instead of one to speed up this process.
+
+When using the `hash` algorithm, the right part of `JOIN` is uploaded into RAM.
+
+- `partial_merge` — a variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.
+
+The `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported).
+
+When using `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.
+
+- `direct` - can be applied when the right storage supports key-value requests.
+
+The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs.
+
+- `auto` — try `hash` join and switch on the fly to another algorithm if the memory limit is violated.
+
+- `full_sorting_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting of the joined tables before joining.
+
+- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.

-When using `partial_merge` algorithm ClickHouse sorts the data and dumps it to the disk. The `merge` algorithm in ClickHouse differs a bit from the classic realization. First ClickHouse sorts the right table by [join key](../../sql-reference/statements/select/join.md#select-join) in blocks and creates min-max index for sorted blocks. Then it sorts parts of left table by `join key` and joins them over right table. The min-max index is also used to skip unneeded right table blocks.

 ## join_any_take_last_row {#settings-join_any_take_last_row}

--- a/docs/en/sql-reference/aggregate-functions/reference/any.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/any.md
@ -4,7 +4,7 @@ sidebar_position: 6

 # any

-Selects the first encountered value.
+Selects the first encountered (non-NULL) value, unless all rows have NULL values in that column.
 The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate.
 To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’.

--- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md
+++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md
@ -5,9 +5,9 @@ sidebar_label: Sources of External Dictionaries

 # Sources of External Dictionaries

-An external dictionary can be connected from many different sources.
+An external dictionary can be connected to ClickHouse from many different sources.

-If dictionary is configured using xml-file, the configuration looks like this:
+If the dictionary is configured using an xml-file, the configuration looks like this:

 ``` xml
 <clickhouse>
@ -24,7 +24,7 @@ If dictionary is configured using xml-file, the configuration looks like this:
 </clickhouse>
 ```

-In case of [DDL-query](../../../sql-reference/statements/create/dictionary.md), equal configuration will looks like:
+In case of [DDL-query](../../../sql-reference/statements/create/dictionary.md), the configuration described above will look like:

 ``` sql
 CREATE DICTIONARY dict_name (...)
@ -96,7 +96,7 @@ Setting fields:
 -   `path` – The absolute path to the file.
 -   `format` – The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported.

-When dictionary with source `FILE` is created via DDL command (`CREATE DICTIONARY ...`), the source file needs to be located in `user_files` directory, to prevent DB users accessing arbitrary file on ClickHouse node.
+When a dictionary with source `FILE` is created via DDL command (`CREATE DICTIONARY ...`), the source file needs to be located in the `user_files` directory to prevent DB users from accessing arbitrary files on the ClickHouse node.

 **See Also**

@ -104,7 +104,7 @@ When dictionary with source `FILE` is created via DDL command (`CREATE DICTIONAR

 ## Executable File

-Working with executable files depends on [how the dictionary is stored in memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts executable file and treats its output as dictionary data.
+Working with executable files depends on [how the dictionary is stored in memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data.

 Example of settings:

@ -120,22 +120,22 @@ Example of settings:

 Setting fields:

-   `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`).
+-   `command` — The absolute path to the executable file, or the file name (if the command's directory is in the `PATH`).
 -   `format` — The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported.
-   `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter.
-   `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter.
-   `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter.
+-   `command_termination_timeout` — The executable script should contain a main read-write loop. After the dictionary is destroyed, the pipe is closed, and the executable file will have `command_termination_timeout` seconds to shut down before ClickHouse sends a SIGTERM signal to the child process. `command_termination_timeout` is specified in seconds. Default value is 10. Optional parameter.
+-   `command_read_timeout` - Timeout for reading data from command stdout in milliseconds. Default value is 10000. Optional parameter.
+-   `command_write_timeout` - Timeout for writing data to command stdin in milliseconds. Default value is 10000. Optional parameter.
 -   `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false.
-   `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder specified by [user_scripts_path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. Optional parameter.
+-   `execute_direct` - If `execute_direct` = `1`, then `command` will be searched for inside the `user_scripts` folder specified by [user_scripts_path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using a whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as an argument for `/bin/sh -c`. Default value is `0`. Optional parameter.
 -   `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`.

-That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node.
+That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node.

 ## Executable Pool

-Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](external-dicts-dict-layout.md#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, `complex_key_direct` layouts.
+Executable pool allows loading data from a pool of processes. This source does not work with dictionary layouts that need to load all data from the source. Executable pool works if the dictionary [is stored](external-dicts-dict-layout.md#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts.

-Executable pool will spawn pool of processes with specified command and keep them running until they exit. The program should read data from STDIN while it is available and output result to STDOUT, and it can wait for next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early.
+Executable pool will spawn a pool of processes with the specified command and keep them running until they exit. The program should read data from STDIN while it is available and output the result to STDOUT. It can wait for the next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data, but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early.

 Example of settings:

@ -555,7 +555,11 @@ Setting fields:
 The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared.
 :::

-MySQL can be connected on a local host via sockets. To do this, set `host` and `socket`.
+:::note
+There is no explicit parameter `secure`. When establishing an SSL connection, security is mandatory.
+:::
+
+MySQL can be connected to on a local host via sockets. To do this, set `host` and `socket`.

 Example of settings:

@ -815,4 +819,4 @@ Setting fields:

 :::note
 The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared.
-:::
+:::
--- a/docs/en/sql-reference/statements/create/table.md
+++ b/docs/en/sql-reference/statements/create/table.md
@ -250,10 +250,12 @@ High compression levels are useful for asymmetric scenarios, like compress once,

 #### DEFLATE_QPL

-`DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library, which has dependency on Intel Hardware:
+`DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply:

-   DEFLATE_QPL is only supported on systems with AVX2/AVX512/IAA.
-   DEFLATE_QPL-compressed data can only be transferred between nodes with AVX2/AVX512/IAA.
+-   DEFLATE_QPL is experimental and can only be used after setting the configuration parameter `allow_experimental_codecs=1`.
+-   DEFLATE_QPL only works if ClickHouse was compiled with support for AVX2 or AVX512 instructions.
+-   DEFLATE_QPL works best if the system has an Intel® IAA (In-Memory Analytics Accelerator) offloading device.
+-   DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with support for AVX2/AVX512.

 ### Specialized Codecs

--- a/docs/en/sql-reference/statements/select/into-outfile.md
+++ b/docs/en/sql-reference/statements/select/into-outfile.md
@ -11,7 +11,7 @@ Compressed files are supported. Compression type is detected by the extension of
 **Syntax**

 ```sql
-SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
+SELECT <expr_list> INTO OUTFILE file_name [AND STDOUT] [COMPRESSION type [LEVEL level]]
 ```

 `file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
@ -23,6 +23,7 @@ SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
 -   This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail.
 -   The query will fail if a file with the same file name already exists.
 -   The default [output format](../../../interfaces/formats.md) is `TabSeparated` (like in the command-line client batch mode). Use [FORMAT](format.md) clause to change it.
+-   If `AND STDOUT` is specified in the query, then the output that is written to the file is also displayed on standard output. If used with compression, the plaintext is displayed on standard output.

 **Example**

--- a/docs/en/sql-reference/statements/select/join.md
+++ b/docs/en/sql-reference/statements/select/join.md
@ -36,7 +36,7 @@ Additional join types available in ClickHouse:
 -   `LEFT ANY JOIN`, `RIGHT ANY JOIN` and `INNER ANY JOIN`, partially (for opposite side of `LEFT` and `RIGHT`) or completely (for `INNER` and `FULL`) disables the cartesian product for standard `JOIN` types.
 -   `ASOF JOIN` and `LEFT ASOF JOIN`, joining sequences with a non-exact match. `ASOF JOIN` usage is described below.

-:::note    
+:::note
 When [join_algorithm](../../../operations/settings/settings.md#settings-join_algorithm) is set to `partial_merge`, `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported).
 :::

@ -64,7 +64,7 @@ Rows are joined if the whole complex condition is met. If the conditions are not

 The `OR` operator inside the `ON` clause works using the hash join algorithm — for each `OR` argument with join keys for `JOIN`, a separate hash table is created, so memory consumption and query execution time grow linearly with an increase in the number of expressions `OR` of the `ON` clause.

-:::note    
+:::note
 If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far.
 :::

@ -83,7 +83,7 @@ Consider `table_1` and `table_2`:
 Query with one join key condition and an additional condition for `table_2`:

 ``` sql
-SELECT name, text FROM table_1 LEFT OUTER JOIN table_2 
+SELECT name, text FROM table_1 LEFT OUTER JOIN table_2
    ON table_1.Id = table_2.Id AND startsWith(table_2.text, 'Text');
 ```

@ -100,7 +100,7 @@ Note that the result contains the row with the name `C` and the empty text colum
 Query with `INNER` type of a join and multiple conditions:

 ``` sql
-SELECT name, text, scores FROM table_1 INNER JOIN table_2 
+SELECT name, text, scores FROM table_1 INNER JOIN table_2
    ON table_1.Id = table_2.Id AND table_2.scores > 10 AND startsWith(table_2.text, 'Text');
 ```

@ -199,7 +199,7 @@ For example, consider the following tables:

 `ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest to the timestamp of the event from `table_1` corresponding to the closest match condition. Equal timestamp values are the closest if available. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can’t be joined.

-:::note    
+:::note
 `ASOF` join is **not** supported in the [Join](../../../engines/table-engines/special/join.md) table engine.
 :::

--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@ -656,8 +656,9 @@ ClickHouse может парсить только базовый формат `Y

 Изменяет поведение операций, выполняемых со строгостью `ANY`.

-:::danger "Внимание"
+:::warning "Внимание"
    Настройка применяется только для операций `JOIN`, выполняемых над таблицами с движком [Join](../../engines/table-engines/special/join.md).
+:::

 Возможные значения:

@ -2112,8 +2113,9 @@ SELECT * FROM test_table

 Устанавливает приоритет ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) для потоков, исполняющих запросы. Планировщик ОС учитывает эти приоритеты при выборе следующего потока для исполнения на доступном ядре CPU.

-:::danger "Предупреждение"
+:::warning "Предупреждение"
    Для использования этой настройки необходимо установить свойство `CAP_SYS_NICE`. Пакет `clickhouse-server` устанавливает его во время инсталляции. Некоторые виртуальные окружения не позволяют установить `CAP_SYS_NICE`. В этом случае, `clickhouse-server` выводит сообщение при запуске.
+:::

 Допустимые значения:

--- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md
+++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md
@ -542,6 +542,7 @@ SOURCE(MYSQL(

 :::info "Примечание"
    Поля `table` или `where` не могут быть использованы вместе с полем `query`. Также обязательно должен быть один из источников данных: `table` или `query`.
+    Явный параметр `secure` отсутствует. Автоматически поддержана работа в обоих случаях: когда установка SSL-соединения необходима и когда нет.

 MySQL можно подключить на локальном хосте через сокеты, для этого необходимо задать `host` и `socket`.

--- a/programs/keeper-converter/KeeperConverter.cpp
+++ b/programs/keeper-converter/KeeperConverter.cpp
@ -39,14 +39,19 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)

    try
    {
-        DB::KeeperStorage storage(500, "", true);
+        auto keeper_context = std::make_shared<KeeperContext>();
+        keeper_context->digest_enabled = true;
+
+        DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false);

        DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as<std::string>(), logger);
+        storage.initializeSystemNodes();
+
        DB::deserializeLogsAndApplyToStorage(storage, options["zookeeper-logs-dir"].as<std::string>(), logger);
        DB::SnapshotMetadataPtr snapshot_meta = std::make_shared<DB::SnapshotMetadata>(storage.getZXID(), 1, std::make_shared<nuraft::cluster_config>());
        DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta);

-        DB::KeeperSnapshotManager manager(options["output-dir"].as<std::string>(), 1);
+        DB::KeeperSnapshotManager manager(options["output-dir"].as<std::string>(), 1, keeper_context);
        auto snp = manager.serializeSnapshotToBuffer(snapshot);
        auto path = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID());
        std::cout << "Snapshot serialized to path:" << path << std::endl;
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -29,6 +29,7 @@
 #include <Common/ClickHouseRevision.h>
 #include <Common/DNSResolver.h>
 #include <Common/CurrentMetrics.h>
+#include <Common/ConcurrencyControl.h>
 #include <Common/Macros.h>
 #include <Common/ShellCommand.h>
 #include <Common/StringUtils/StringUtils.h>
@ -1124,6 +1125,23 @@ int Server::main(const std::vector<std::string> & /*args*/)
            if (config->has("max_partition_size_to_drop"))
                global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop"));

+            if (config->has("concurrent_threads_soft_limit"))
+            {
+                auto concurrent_threads_soft_limit = config->getInt("concurrent_threads_soft_limit", 0);
+                if (concurrent_threads_soft_limit == -1)
+                {
+                    // Based on tests, concurrent_threads_soft_limit is optimal at about 3 times the number of logical CPU cores
+                    constexpr size_t thread_factor = 3;
+                    concurrent_threads_soft_limit = std::thread::hardware_concurrency() * thread_factor;
+                }
+                if (concurrent_threads_soft_limit)
+                    ConcurrencyControl::instance().setMaxConcurrency(concurrent_threads_soft_limit);
+                else
+                    ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited);
+            }
+            else
+                ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited);
+
            if (config->has("max_concurrent_queries"))
                global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0));

--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -269,6 +269,13 @@
    <http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>
    -->

+    <!-- Maximum number of query processing threads to run all queries.
+         Note that this is not a hard limit. If the limit is reached, the query will still get one thread to run.
+         A value of 0 means no limit. If the value is -1, the limit is set to the number of logical cores multiplied by 3,
+         which is a good heuristic for CPU-bound tasks.
+    -->
+    <concurrent_threads_soft_limit>0</concurrent_threads_soft_limit>
+
    <!-- Maximum number of concurrent queries. -->
    <max_concurrent_queries>100</max_concurrent_queries>

@ -604,7 +611,7 @@
             if this setting is true the user B will see all rows, and if this setting is false the user B will see no rows.
             By default this setting is false for compatibility with earlier access configurations. -->
        <users_without_row_policies_can_read_rows>false</users_without_row_policies_can_read_rows>
-        
+
        <!-- By default, for backward compatibility ON CLUSTER queries ignore CLUSTER grant,
             however you can change this behaviour by setting this to true -->
        <on_cluster_queries_require_cluster_grant>false</on_cluster_queries_require_cluster_grant>
--- a/src/Backups/BackupCoordinationLocal.cpp
+++ b/src/Backups/BackupCoordinationLocal.cpp
@ -13,20 +13,20 @@ using FileInfo = IBackupCoordination::FileInfo;
 BackupCoordinationLocal::BackupCoordinationLocal() = default;
 BackupCoordinationLocal::~BackupCoordinationLocal() = default;

-void BackupCoordinationLocal::setStatus(const String &, const String &, const String &)
+void BackupCoordinationLocal::setStage(const String &, const String &, const String &)
 {
 }

-void BackupCoordinationLocal::setErrorStatus(const String &, const Exception &)
+void BackupCoordinationLocal::setError(const String &, const Exception &)
 {
 }

-Strings BackupCoordinationLocal::waitStatus(const Strings &, const String &)
+Strings BackupCoordinationLocal::waitForStage(const Strings &, const String &)
 {
    return {};
 }

-Strings BackupCoordinationLocal::waitStatusFor(const Strings &, const String &, UInt64)
+Strings BackupCoordinationLocal::waitForStage(const Strings &, const String &, std::chrono::milliseconds)
 {
    return {};
 }
--- a/src/Backups/BackupCoordinationLocal.h
+++ b/src/Backups/BackupCoordinationLocal.h
@ -20,10 +20,10 @@ public:
    BackupCoordinationLocal();
    ~BackupCoordinationLocal() override;

-    void setStatus(const String & current_host, const String & new_status, const String & message) override;
-    void setErrorStatus(const String & current_host, const Exception & exception) override;
-    Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
-    Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
+    void setStage(const String & current_host, const String & new_stage, const String & message) override;
+    void setError(const String & current_host, const Exception & exception) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override;

    void addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name,
                                const std::vector<PartNameAndChecksum> & part_names_and_checksums) override;
--- a/src/Backups/BackupCoordinationRemote.cpp
+++ b/src/Backups/BackupCoordinationRemote.cpp
@ -165,55 +165,94 @@ namespace
    constexpr size_t NUM_ATTEMPTS = 10;
 }

-BackupCoordinationRemote::BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_)
+BackupCoordinationRemote::BackupCoordinationRemote(
+    const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_)
    : zookeeper_path(zookeeper_path_)
    , get_zookeeper(get_zookeeper_)
-    , status_sync(zookeeper_path_ + "/status", get_zookeeper_, &Poco::Logger::get("BackupCoordination"))
+    , remove_zk_nodes_in_destructor(remove_zk_nodes_in_destructor_)
 {
    createRootNodes();
+    stage_sync.emplace(
+        zookeeper_path_ + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("BackupCoordination"));
 }

-BackupCoordinationRemote::~BackupCoordinationRemote() = default;
+BackupCoordinationRemote::~BackupCoordinationRemote()
+{
+    try
+    {
+        if (remove_zk_nodes_in_destructor)
+            removeAllNodes();
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+    }
+}
+
+zkutil::ZooKeeperPtr BackupCoordinationRemote::getZooKeeper() const
+{
+    std::lock_guard lock{mutex};
+    return getZooKeeperNoLock();
+}
+
+zkutil::ZooKeeperPtr BackupCoordinationRemote::getZooKeeperNoLock() const
+{
+    if (!zookeeper || zookeeper->expired())
+    {
+        zookeeper = get_zookeeper();
+
+        /// It's possible that we connected to a different [Zoo]Keeper instance,
+        /// so we may read slightly stale state.
+        zookeeper->sync(zookeeper_path);
+    }
+    return zookeeper;
+}

 void BackupCoordinationRemote::createRootNodes()
 {
-    auto zookeeper = get_zookeeper();
-    zookeeper->createAncestors(zookeeper_path);
-    zookeeper->createIfNotExists(zookeeper_path, "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_part_names", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_mutations", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_access", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/file_names", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/file_infos", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/archive_suffixes", "");
+    auto zk = getZooKeeper();
+    zk->createAncestors(zookeeper_path);
+    zk->createIfNotExists(zookeeper_path, "");
+    zk->createIfNotExists(zookeeper_path + "/repl_part_names", "");
+    zk->createIfNotExists(zookeeper_path + "/repl_mutations", "");
+    zk->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
+    zk->createIfNotExists(zookeeper_path + "/repl_access", "");
+    zk->createIfNotExists(zookeeper_path + "/file_names", "");
+    zk->createIfNotExists(zookeeper_path + "/file_infos", "");
+    zk->createIfNotExists(zookeeper_path + "/archive_suffixes", "");
 }

 void BackupCoordinationRemote::removeAllNodes()
 {
-    auto zookeeper = get_zookeeper();
-    zookeeper->removeRecursive(zookeeper_path);
+    /// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore.
+    ///
+    /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
+    /// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination
+    /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part
+    /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that).
+    auto zk = getZooKeeper();
+    zk->removeRecursive(zookeeper_path);
 }


-void BackupCoordinationRemote::setStatus(const String & current_host, const String & new_status, const String & message)
+void BackupCoordinationRemote::setStage(const String & current_host, const String & new_stage, const String & message)
 {
-    status_sync.set(current_host, new_status, message);
+    stage_sync->set(current_host, new_stage, message);
 }

-void BackupCoordinationRemote::setErrorStatus(const String & current_host, const Exception & exception)
+void BackupCoordinationRemote::setError(const String & current_host, const Exception & exception)
 {
-    status_sync.setError(current_host, exception);
+    stage_sync->setError(current_host, exception);
 }

-Strings BackupCoordinationRemote::waitStatus(const Strings & all_hosts, const String & status_to_wait)
+Strings BackupCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait)
 {
-    return status_sync.wait(all_hosts, status_to_wait);
+    return stage_sync->wait(all_hosts, stage_to_wait);
 }

-Strings BackupCoordinationRemote::waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms)
+Strings BackupCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout)
 {
-    return status_sync.waitFor(all_hosts, status_to_wait, timeout_ms);
+    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
 }


@ -229,11 +268,11 @@ void BackupCoordinationRemote::addReplicatedPartNames(
            throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedPartNames() must not be called after preparing");
    }

-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_shared_id);
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");
    path += "/" + escapeForFileName(replica_name);
-    zookeeper->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent);
+    zk->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent);
 }

 Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const
@ -255,11 +294,11 @@ void BackupCoordinationRemote::addReplicatedMutations(
            throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedMutations() must not be called after preparing");
    }

-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_shared_id);
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");
    path += "/" + escapeForFileName(replica_name);
-    zookeeper->create(path, ReplicatedMutations::serialize(mutations, table_name_for_logs), zkutil::CreateMode::Persistent);
+    zk->create(path, ReplicatedMutations::serialize(mutations, table_name_for_logs), zkutil::CreateMode::Persistent);
 }

 std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const
@ -279,11 +318,11 @@ void BackupCoordinationRemote::addReplicatedDataPath(
            throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedDataPath() must not be called after preparing");
    }

-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id);
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");
    path += "/" + escapeForFileName(data_path);
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");
 }

 Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_shared_id) const
@ -300,18 +339,18 @@ void BackupCoordinationRemote::prepareReplicatedTables() const
        return;

    replicated_tables.emplace();
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeperNoLock();

    {
        String path = zookeeper_path + "/repl_part_names";
-        for (const String & escaped_table_shared_id : zookeeper->getChildren(path))
+        for (const String & escaped_table_shared_id : zk->getChildren(path))
        {
            String table_shared_id = unescapeForFileName(escaped_table_shared_id);
            String path2 = path + "/" + escaped_table_shared_id;
-            for (const String & escaped_replica_name : zookeeper->getChildren(path2))
+            for (const String & escaped_replica_name : zk->getChildren(path2))
            {
                String replica_name = unescapeForFileName(escaped_replica_name);
-                auto part_names = ReplicatedPartNames::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name));
+                auto part_names = ReplicatedPartNames::deserialize(zk->get(path2 + "/" + escaped_replica_name));
                replicated_tables->addPartNames(table_shared_id, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums);
            }
        }
@ -319,14 +358,14 @@ void BackupCoordinationRemote::prepareReplicatedTables() const

    {
        String path = zookeeper_path + "/repl_mutations";
-        for (const String & escaped_table_shared_id : zookeeper->getChildren(path))
+        for (const String & escaped_table_shared_id : zk->getChildren(path))
        {
            String table_shared_id = unescapeForFileName(escaped_table_shared_id);
            String path2 = path + "/" + escaped_table_shared_id;
-            for (const String & escaped_replica_name : zookeeper->getChildren(path2))
+            for (const String & escaped_replica_name : zk->getChildren(path2))
            {
                String replica_name = unescapeForFileName(escaped_replica_name);
-                auto mutations = ReplicatedMutations::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name));
+                auto mutations = ReplicatedMutations::deserialize(zk->get(path2 + "/" + escaped_replica_name));
                replicated_tables->addMutations(table_shared_id, mutations.table_name_for_logs, replica_name, mutations.mutations);
            }
        }
@ -334,11 +373,11 @@ void BackupCoordinationRemote::prepareReplicatedTables() const

    {
        String path = zookeeper_path + "/repl_data_paths";
-        for (const String & escaped_table_shared_id : zookeeper->getChildren(path))
+        for (const String & escaped_table_shared_id : zk->getChildren(path))
        {
            String table_shared_id = unescapeForFileName(escaped_table_shared_id);
            String path2 = path + "/" + escaped_table_shared_id;
-            for (const String & escaped_data_path : zookeeper->getChildren(path2))
+            for (const String & escaped_data_path : zk->getChildren(path2))
            {
                String data_path = unescapeForFileName(escaped_data_path);
                replicated_tables->addDataPath(table_shared_id, data_path);
@ -356,13 +395,13 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access
            throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedAccessFilePath() must not be called after preparing");
    }

-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String path = zookeeper_path + "/repl_access/" + escapeForFileName(access_zk_path);
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");
    path += "/" + AccessEntityTypeInfo::get(access_entity_type).name;
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");
    path += "/" + host_id;
-    zookeeper->createIfNotExists(path, file_path);
+    zk->createIfNotExists(path, file_path);
 }

 Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const
@ -378,20 +417,20 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const
        return;

    replicated_access.emplace();
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeperNoLock();

    String path = zookeeper_path + "/repl_access";
-    for (const String & escaped_access_zk_path : zookeeper->getChildren(path))
+    for (const String & escaped_access_zk_path : zk->getChildren(path))
    {
        String access_zk_path = unescapeForFileName(escaped_access_zk_path);
        String path2 = path + "/" + escaped_access_zk_path;
-        for (const String & type_str : zookeeper->getChildren(path2))
+        for (const String & type_str : zk->getChildren(path2))
        {
            AccessEntityType type = AccessEntityTypeInfo::parseType(type_str);
            String path3 = path2 + "/" + type_str;
-            for (const String & host_id : zookeeper->getChildren(path3))
+            for (const String & host_id : zk->getChildren(path3))
            {
-                String file_path = zookeeper->get(path3 + "/" + host_id);
+                String file_path = zk->get(path3 + "/" + host_id);
                replicated_access->addFilePath(access_zk_path, type, host_id, file_path);
            }
        }
@ -401,11 +440,11 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const

 void BackupCoordinationRemote::addFileInfo(const FileInfo & file_info, bool & is_data_file_required)
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();

    String full_path = zookeeper_path + "/file_names/" + escapeForFileName(file_info.file_name);
    String size_and_checksum = serializeSizeAndChecksum(std::pair{file_info.size, file_info.checksum});
-    zookeeper->create(full_path, size_and_checksum, zkutil::CreateMode::Persistent);
+    zk->create(full_path, size_and_checksum, zkutil::CreateMode::Persistent);

    if (!file_info.size)
    {
@ -414,7 +453,7 @@ void BackupCoordinationRemote::addFileInfo(const FileInfo & file_info, bool & is
    }

    full_path = zookeeper_path + "/file_infos/" + size_and_checksum;
-    auto code = zookeeper->tryCreate(full_path, serializeFileInfo(file_info), zkutil::CreateMode::Persistent);
+    auto code = zk->tryCreate(full_path, serializeFileInfo(file_info), zkutil::CreateMode::Persistent);
    if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
        throw zkutil::KeeperException(code, full_path);

@ -426,15 +465,15 @@ void BackupCoordinationRemote::updateFileInfo(const FileInfo & file_info)
    if (!file_info.size)
        return; /// we don't keep FileInfos for empty files, nothing to update

-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String size_and_checksum = serializeSizeAndChecksum(std::pair{file_info.size, file_info.checksum});
    String full_path = zookeeper_path + "/file_infos/" + size_and_checksum;
    for (size_t attempt = 0; attempt < NUM_ATTEMPTS; ++attempt)
    {
        Coordination::Stat stat;
-        auto new_info = deserializeFileInfo(zookeeper->get(full_path, &stat));
+        auto new_info = deserializeFileInfo(zk->get(full_path, &stat));
        new_info.archive_suffix = file_info.archive_suffix;
-        auto code = zookeeper->trySet(full_path, serializeFileInfo(new_info), stat.version);
+        auto code = zk->trySet(full_path, serializeFileInfo(new_info), stat.version);
        if (code == Coordination::Error::ZOK)
            return;
        bool is_last_attempt = (attempt == NUM_ATTEMPTS - 1);
@ -445,16 +484,16 @@ void BackupCoordinationRemote::updateFileInfo(const FileInfo & file_info)

 std::vector<FileInfo> BackupCoordinationRemote::getAllFileInfos() const
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    std::vector<FileInfo> file_infos;
-    Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names");
+    Strings escaped_names = zk->getChildren(zookeeper_path + "/file_names");
    for (const String & escaped_name : escaped_names)
    {
-        String size_and_checksum = zookeeper->get(zookeeper_path + "/file_names/" + escaped_name);
+        String size_and_checksum = zk->get(zookeeper_path + "/file_names/" + escaped_name);
        UInt64 size = deserializeSizeAndChecksum(size_and_checksum).first;
        FileInfo file_info;
        if (size) /// we don't keep FileInfos for empty files
-            file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + size_and_checksum));
+            file_info = deserializeFileInfo(zk->get(zookeeper_path + "/file_infos/" + size_and_checksum));
        file_info.file_name = unescapeForFileName(escaped_name);
        file_infos.emplace_back(std::move(file_info));
    }
@ -463,8 +502,8 @@ std::vector<FileInfo> BackupCoordinationRemote::getAllFileInfos() const

 Strings BackupCoordinationRemote::listFiles(const String & directory, bool recursive) const
 {
-    auto zookeeper = get_zookeeper();
-    Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names");
+    auto zk = getZooKeeper();
+    Strings escaped_names = zk->getChildren(zookeeper_path + "/file_names");

    String prefix = directory;
    if (!prefix.empty() && !prefix.ends_with('/'))
@ -496,8 +535,8 @@ Strings BackupCoordinationRemote::listFiles(const String & directory, bool recur

 bool BackupCoordinationRemote::hasFiles(const String & directory) const
 {
-    auto zookeeper = get_zookeeper();
-    Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names");
+    auto zk = getZooKeeper();
+    Strings escaped_names = zk->getChildren(zookeeper_path + "/file_names");

    String prefix = directory;
    if (!prefix.empty() && !prefix.ends_with('/'))
@ -515,42 +554,42 @@ bool BackupCoordinationRemote::hasFiles(const String & directory) const

 std::optional<FileInfo> BackupCoordinationRemote::getFileInfo(const String & file_name) const
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String size_and_checksum;
-    if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum))
+    if (!zk->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum))
        return std::nullopt;
    UInt64 size = deserializeSizeAndChecksum(size_and_checksum).first;
    FileInfo file_info;
    if (size) /// we don't keep FileInfos for empty files
-        file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + size_and_checksum));
+        file_info = deserializeFileInfo(zk->get(zookeeper_path + "/file_infos/" + size_and_checksum));
    file_info.file_name = file_name;
    return file_info;
 }

 std::optional<FileInfo> BackupCoordinationRemote::getFileInfo(const SizeAndChecksum & size_and_checksum) const
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String file_info_str;
-    if (!zookeeper->tryGet(zookeeper_path + "/file_infos/" + serializeSizeAndChecksum(size_and_checksum), file_info_str))
+    if (!zk->tryGet(zookeeper_path + "/file_infos/" + serializeSizeAndChecksum(size_and_checksum), file_info_str))
        return std::nullopt;
    return deserializeFileInfo(file_info_str);
 }

 std::optional<SizeAndChecksum> BackupCoordinationRemote::getFileSizeAndChecksum(const String & file_name) const
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String size_and_checksum;
-    if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum))
+    if (!zk->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum))
        return std::nullopt;
    return deserializeSizeAndChecksum(size_and_checksum);
 }

 String BackupCoordinationRemote::getNextArchiveSuffix()
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();
    String path = zookeeper_path + "/archive_suffixes/a";
    String path_created;
-    auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::PersistentSequential, path_created);
+    auto code = zk->tryCreate(path, "", zkutil::CreateMode::PersistentSequential, path_created);
    if (code != Coordination::Error::ZOK)
        throw zkutil::KeeperException(code, path);
    return formatArchiveSuffix(extractCounterFromSequentialNodeName(path_created));
@ -558,16 +597,11 @@ String BackupCoordinationRemote::getNextArchiveSuffix()

 Strings BackupCoordinationRemote::getAllArchiveSuffixes() const
 {
-    auto zookeeper = get_zookeeper();
-    Strings node_names = zookeeper->getChildren(zookeeper_path + "/archive_suffixes");
+    auto zk = getZooKeeper();
+    Strings node_names = zk->getChildren(zookeeper_path + "/archive_suffixes");
    for (auto & node_name : node_names)
        node_name = formatArchiveSuffix(extractCounterFromSequentialNodeName(node_name));
    return node_names;
 }

-void BackupCoordinationRemote::drop()
-{
-    removeAllNodes();
-}
-
 }
--- a/src/Backups/BackupCoordinationRemote.h
+++ b/src/Backups/BackupCoordinationRemote.h
@ -3,7 +3,7 @@
 #include <Backups/IBackupCoordination.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedTables.h>
-#include <Backups/BackupCoordinationStatusSync.h>
+#include <Backups/BackupCoordinationStageSync.h>


 namespace DB
@ -13,13 +13,13 @@ namespace DB
 class BackupCoordinationRemote : public IBackupCoordination
 {
 public:
-    BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_);
+    BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_);
    ~BackupCoordinationRemote() override;

-    void setStatus(const String & current_host, const String & new_status, const String & message) override;
-    void setErrorStatus(const String & current_host, const Exception & exception) override;
-    Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
-    Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
+    void setStage(const String & current_host, const String & new_stage, const String & message) override;
+    void setError(const String & current_host, const Exception & exception) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override;

    void addReplicatedPartNames(
        const String & table_shared_id,
@ -56,9 +56,9 @@ public:
    String getNextArchiveSuffix() override;
    Strings getAllArchiveSuffixes() const override;

-    void drop() override;
-
 private:
+    zkutil::ZooKeeperPtr getZooKeeper() const;
+    zkutil::ZooKeeperPtr getZooKeeperNoLock() const;
    void createRootNodes();
    void removeAllNodes();
    void prepareReplicatedTables() const;
@ -66,10 +66,12 @@ private:

    const String zookeeper_path;
    const zkutil::GetZooKeeper get_zookeeper;
+    const bool remove_zk_nodes_in_destructor;

-    BackupCoordinationStatusSync status_sync;
+    std::optional<BackupCoordinationStageSync> stage_sync;

    mutable std::mutex mutex;
+    mutable zkutil::ZooKeeperPtr zookeeper;
    mutable std::optional<BackupCoordinationReplicatedTables> replicated_tables;
    mutable std::optional<BackupCoordinationReplicatedAccess> replicated_access;
 };
--- a/src/Backups/BackupCoordinationStage.cpp
+++ b/src/Backups/BackupCoordinationStage.cpp
@ -0,0 +1,13 @@
+#include <Backups/BackupCoordinationStage.h>
+#include <fmt/format.h>
+
+
+namespace DB
+{
+
+String BackupCoordinationStage::formatGatheringMetadata(size_t pass)
+{
+    return fmt::format("{} ({})", GATHERING_METADATA, pass);
+}
+
+}
--- a/src/Backups/BackupCoordinationStage.h
+++ b/src/Backups/BackupCoordinationStage.h
@ -0,0 +1,41 @@
+#pragma once
+
+#include <base/types.h>
+
+
+namespace DB
+{
+
+namespace BackupCoordinationStage
+{
+    /// Finding all tables and databases which we're going to put to the backup and collecting their metadata.
+    constexpr const char * GATHERING_METADATA = "gathering metadata";
+
+    String formatGatheringMetadata(size_t pass);
+
+    /// Making temporary hard links and preparing backup entries.
+    constexpr const char * EXTRACTING_DATA_FROM_TABLES = "extracting data from tables";
+
+    /// Running special tasks for replicated tables which can also prepare some backup entries.
+    constexpr const char * RUNNING_POST_TASKS = "running post-tasks";
+
+    /// Writing backup entries to the backup and removing temporary hard links.
+    constexpr const char * WRITING_BACKUP = "writing backup";
+
+    /// Finding databases and tables in the backup which we're going to restore.
+    constexpr const char * FINDING_TABLES_IN_BACKUP = "finding tables in backup";
+
+    /// Creating databases or finding them and checking their definitions.
+    constexpr const char * CREATING_DATABASES = "creating databases";
+
+    /// Creating tables or finding them and checking their definition.
+    constexpr const char * CREATING_TABLES = "creating tables";
+
+    /// Inserting restored data to tables.
+    constexpr const char * INSERTING_DATA_TO_TABLES = "inserting data to tables";
+
+    /// Coordination stage meaning that a host finished its work.
+    constexpr const char * COMPLETED = "completed";
+}
+
+}
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
@ -0,0 +1,201 @@
+#include <Backups/BackupCoordinationStageSync.h>
+#include <Common/Exception.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <IO/ReadBufferFromString.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/WriteHelpers.h>
+#include <base/chrono_io.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE;
+}
+
+
+BackupCoordinationStageSync::BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_)
+    : zookeeper_path(zookeeper_path_)
+    , get_zookeeper(get_zookeeper_)
+    , log(log_)
+{
+    createRootNodes();
+}
+
+void BackupCoordinationStageSync::createRootNodes()
+{
+    auto zookeeper = get_zookeeper();
+    zookeeper->createAncestors(zookeeper_path);
+    zookeeper->createIfNotExists(zookeeper_path, "");
+}
+
+void BackupCoordinationStageSync::set(const String & current_host, const String & new_stage, const String & message)
+{
+    auto zookeeper = get_zookeeper();
+
+    /// Make an ephemeral node so the initiator can track if the current host is still working.
+    String alive_node_path = zookeeper_path + "/alive|" + current_host;
+    auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
+    if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
+        throw zkutil::KeeperException(code, alive_node_path);
+
+    zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, "");
+    zookeeper->create(zookeeper_path + "/current|" + current_host + "|" + new_stage, message, zkutil::CreateMode::Persistent);
+}
+
+void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception)
+{
+    auto zookeeper = get_zookeeper();
+    WriteBufferFromOwnString buf;
+    writeStringBinary(current_host, buf);
+    writeException(exception, buf, true);
+    zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str());
+}
+
+Strings BackupCoordinationStageSync::wait(const Strings & all_hosts, const String & stage_to_wait)
+{
+    return waitImpl(all_hosts, stage_to_wait, {});
+}
+
+Strings BackupCoordinationStageSync::waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout)
+{
+    return waitImpl(all_hosts, stage_to_wait, timeout);
+}
+
+namespace
+{
+    struct UnreadyHostState
+    {
+        bool started = false;
+        bool alive = false;
+    };
+}
+
+struct BackupCoordinationStageSync::State
+{
+    Strings results;
+    std::map<String, UnreadyHostState> unready_hosts;
+    std::optional<std::pair<String, Exception>> error;
+    std::optional<String> host_terminated;
+};
+
+BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState(
+    zkutil::ZooKeeperPtr zookeeper, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const
+{
+    std::unordered_set<std::string_view> zk_nodes_set{zk_nodes.begin(), zk_nodes.end()};
+
+    State state;
+    if (zk_nodes_set.contains("error"))
+    {
+        ReadBufferFromOwnString buf{zookeeper->get(zookeeper_path + "/error")};
+        String host;
+        readStringBinary(host, buf);
+        state.error = std::make_pair(host, readException(buf, fmt::format("Got error from {}", host)));
+        return state;
+    }
+
+    for (const auto & host : all_hosts)
+    {
+        if (!zk_nodes_set.contains("current|" + host + "|" + stage_to_wait))
+        {
+            UnreadyHostState unready_host_state;
+            unready_host_state.started = zk_nodes_set.contains("started|" + host);
+            unready_host_state.alive = zk_nodes_set.contains("alive|" + host);
+            state.unready_hosts.emplace(host, unready_host_state);
+            if (!unready_host_state.alive && unready_host_state.started && !state.host_terminated)
+                state.host_terminated = host;
+        }
+    }
+
+    if (state.host_terminated || !state.unready_hosts.empty())
+        return state;
+
+    state.results.reserve(all_hosts.size());
+    for (const auto & host : all_hosts)
+        state.results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
+
+    return state;
+}
+
+Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const
+{
+    if (all_hosts.empty())
+        return {};
+
+    /// Wait until all hosts are ready or an error happens or time is out.
+
+    auto zookeeper = get_zookeeper();
+
+    /// Set by ZooKeeper when the list of zk nodes has changed.
+    auto watch = std::make_shared<Poco::Event>();
+
+    bool use_timeout = timeout.has_value();
+    std::chrono::steady_clock::time_point end_of_timeout;
+    if (use_timeout)
+        end_of_timeout = std::chrono::steady_clock::now() + std::chrono::duration_cast<std::chrono::steady_clock::duration>(*timeout);
+
+    State state;
+
+    String previous_unready_host; /// Used for logging: we don't want to log the same unready host again.
+
+    for (;;)
+    {
+        /// Get zk nodes and subscribe on their changes.
+        Strings zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch);
+
+        /// Read and analyze the current state of zk nodes.
+        state = readCurrentState(zookeeper, zk_nodes, all_hosts, stage_to_wait);
+        if (state.error || state.host_terminated || state.unready_hosts.empty())
+            break; /// Error happened or everything is ready.
+
+        /// Log that we will wait for another host.
+        const auto & unready_host = state.unready_hosts.begin()->first;
+        if (unready_host != previous_unready_host)
+        {
+            LOG_TRACE(log, "Waiting for host {}", unready_host);
+            previous_unready_host = unready_host;
+        }
+
+        /// Wait until the `watch` event is set by ZooKeeper meaning that zk nodes have changed.
+        {
+            if (use_timeout)
+            {
+                auto current_time = std::chrono::steady_clock::now();
+                if ((current_time > end_of_timeout)
+                    || !watch->tryWait(std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - current_time).count()))
+                    break;
+            }
+            else
+            {
+                watch->wait();
+            }
+        }
+    }
+
+    /// Rethrow an error raised originally on another host.
+    if (state.error)
+        state.error->second.rethrow();
+
+    /// Another host terminated without errors.
+    if (state.host_terminated)
+        throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Host {} suddenly stopped working", *state.host_terminated);
+
+    /// Something's unready, timeout is probably not enough.
+    if (!state.unready_hosts.empty())
+    {
+        const auto & [unready_host, unready_host_state] = *state.unready_hosts.begin();
+        throw Exception(
+            ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
+            "Waited for host {} too long (> {}){}",
+            unready_host,
+            to_string(*timeout),
+            unready_host_state.started ? "" : ": Operation didn't start");
+    }
+
+    return state.results;
+}
+
+}
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@ -0,0 +1,39 @@
+#pragma once
+
+#include <Common/ZooKeeper/Common.h>
+
+
+namespace DB
+{
+
+/// Used to coordinate hosts so all hosts would come to a specific stage at around the same time.
+class BackupCoordinationStageSync
+{
+public:
+    BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_);
+
+    /// Sets the stage of the current host and signals other hosts if there were other hosts waiting for that.
+    void set(const String & current_host, const String & new_stage, const String & message);
+    void setError(const String & current_host, const Exception & exception);
+
+    /// Waits until all hosts come to the specified stage.
+    /// The function returns the messages all hosts set when they come to the required stage.
+    Strings wait(const Strings & all_hosts, const String & stage_to_wait);
+
+    /// Almost the same as wait() but this one stops waiting and throws an exception after a specific amount of time.
+    Strings waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout);
+
+private:
+    void createRootNodes();
+
+    struct State;
+    State readCurrentState(zkutil::ZooKeeperPtr zookeeper, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
+
+    Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;
+
+    String zookeeper_path;
+    zkutil::GetZooKeeper get_zookeeper;
+    Poco::Logger * log;
+};
+
+}
--- a/src/Backups/BackupCoordinationStatusSync.cpp
+++ b/src/Backups/BackupCoordinationStatusSync.cpp
@ -1,182 +0,0 @@
-#include <Backups/BackupCoordinationStatusSync.h>
-#include <Common/Exception.h>
-#include <IO/ReadBufferFromString.h>
-#include <IO/ReadHelpers.h>
-#include <IO/WriteBufferFromString.h>
-#include <IO/WriteHelpers.h>
-#include <base/chrono_io.h>
-
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
-    extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE;
-}
-
-
-BackupCoordinationStatusSync::BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_)
-    : zookeeper_path(zookeeper_path_)
-    , get_zookeeper(get_zookeeper_)
-    , log(log_)
-{
-    createRootNodes();
-}
-
-void BackupCoordinationStatusSync::createRootNodes()
-{
-    auto zookeeper = get_zookeeper();
-    zookeeper->createAncestors(zookeeper_path);
-    zookeeper->createIfNotExists(zookeeper_path, "");
-}
-
-void BackupCoordinationStatusSync::set(const String & current_host, const String & new_status, const String & message)
-{
-    auto zookeeper = get_zookeeper();
-    zookeeper->createIfNotExists(zookeeper_path + "/" + current_host + "|" + new_status, message);
-}
-
-void BackupCoordinationStatusSync::setError(const String & current_host, const Exception & exception)
-{
-    auto zookeeper = get_zookeeper();
-
-    Exception exception2 = exception;
-    exception2.addMessage("Host {}", current_host);
-    WriteBufferFromOwnString buf;
-    writeException(exception2, buf, true);
-
-    zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str());
-}
-
-Strings BackupCoordinationStatusSync::wait(const Strings & all_hosts, const String & status_to_wait)
-{
-    return waitImpl(all_hosts, status_to_wait, {});
-}
-
-Strings BackupCoordinationStatusSync::waitFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms)
-{
-    return waitImpl(all_hosts, status_to_wait, timeout_ms);
-}
-
-Strings BackupCoordinationStatusSync::waitImpl(const Strings & all_hosts, const String & status_to_wait, std::optional<UInt64> timeout_ms)
-{
-    if (all_hosts.empty())
-        return {};
-
-    /// Wait for other hosts.
-
-    Strings ready_hosts_results;
-    ready_hosts_results.resize(all_hosts.size());
-
-    std::map<String, std::vector<size_t> /* index in `ready_hosts_results` */> unready_hosts;
-    for (size_t i = 0; i != all_hosts.size(); ++i)
-        unready_hosts[all_hosts[i]].push_back(i);
-
-    std::optional<Exception> error;
-
-    auto zookeeper = get_zookeeper();
-
-    /// Process ZooKeeper's nodes and set `all_hosts_ready` or `unready_host` or `error_message`.
-    auto process_zk_nodes = [&](const Strings & zk_nodes)
-    {
-        for (const String & zk_node : zk_nodes)
-        {
-            if (zk_node.starts_with("remove_watch-"))
-                continue;
-
-            if (zk_node == "error")
-            {
-                ReadBufferFromOwnString buf{zookeeper->get(zookeeper_path + "/error")};
-                error = readException(buf, "", true);
-                break;
-            }
-
-            size_t separator_pos = zk_node.find('|');
-            if (separator_pos == String::npos)
-                throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected zk node {}", zookeeper_path + "/" + zk_node);
-
-            String host = zk_node.substr(0, separator_pos);
-            String status = zk_node.substr(separator_pos + 1);
-
-            auto it = unready_hosts.find(host);
-            if ((it != unready_hosts.end()) && (status == status_to_wait))
-            {
-                String result = zookeeper->get(zookeeper_path + "/" + zk_node);
-                for (size_t i : it->second)
-                    ready_hosts_results[i] = result;
-                unready_hosts.erase(it);
-            }
-        }
-    };
-
-    /// Wait until all hosts are ready or an error happens or time is out.
-    std::atomic<bool> watch_set = false;
-    std::condition_variable watch_triggered_event;
-
-    auto watch_callback = [&](const Coordination::WatchResponse &)
-    {
-        watch_set = false; /// After it's triggered it's not set until we call getChildrenWatch() again.
-        watch_triggered_event.notify_all();
-    };
-
-    auto watch_triggered = [&] { return !watch_set; };
-
-    bool use_timeout = timeout_ms.has_value();
-    std::chrono::milliseconds timeout{timeout_ms.value_or(0)};
-    std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
-    std::chrono::steady_clock::duration elapsed;
-    std::mutex dummy_mutex;
-    String previous_unready_host;
-
-    while (!unready_hosts.empty() && !error)
-    {
-        watch_set = true;
-        Strings nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback);
-        process_zk_nodes(nodes);
-
-        if (!unready_hosts.empty() && !error)
-        {
-            const auto & unready_host = unready_hosts.begin()->first;
-            if (unready_host != previous_unready_host)
-            {
-                LOG_TRACE(log, "Waiting for host {}", unready_host);
-                previous_unready_host = unready_host;
-            }
-
-            std::unique_lock dummy_lock{dummy_mutex};
-            if (use_timeout)
-            {
-                elapsed = std::chrono::steady_clock::now() - start_time;
-                if ((elapsed > timeout) || !watch_triggered_event.wait_for(dummy_lock, timeout - elapsed, watch_triggered))
-                    break;
-            }
-            else
-                watch_triggered_event.wait(dummy_lock, watch_triggered);
-        }
-    }
-
-    if (watch_set)
-    {
-        /// Remove watch by triggering it.
-        zookeeper->create(zookeeper_path + "/remove_watch-", "", zkutil::CreateMode::EphemeralSequential);
-        std::unique_lock dummy_lock{dummy_mutex};
-        watch_triggered_event.wait(dummy_lock, watch_triggered);
-    }
-
-    if (error)
-        error->rethrow();
-
-    if (!unready_hosts.empty())
-    {
-        throw Exception(
-            ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
-            "Waited for host {} too long ({})",
-            unready_hosts.begin()->first,
-            to_string(elapsed));
-    }
-
-    return ready_hosts_results;
-}
-
-}
--- a/src/Backups/BackupCoordinationStatusSync.h
+++ b/src/Backups/BackupCoordinationStatusSync.h
@ -1,37 +0,0 @@
-#pragma once
-
-#include <Common/ZooKeeper/Common.h>
-
-
-namespace DB
-{
-
-/// Used to coordinate hosts so all hosts would come to a specific status at around the same time.
-class BackupCoordinationStatusSync
-{
-public:
-    BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_);
-
-    /// Sets the status of the current host and signal other hosts if there were other hosts waiting for that.
-    void set(const String & current_host, const String & new_status, const String & message);
-    void setError(const String & current_host, const Exception & exception);
-
-    /// Sets the status of the current host and waits until all hosts come to the same status.
-    /// The function returns the messages all hosts set when they come to the required status.
-    Strings wait(const Strings & all_hosts, const String & status_to_wait);
-
-    /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time.
-    Strings waitFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms);
-
-    static constexpr const char * kErrorStatus = "error";
-
-private:
-    void createRootNodes();
-    Strings waitImpl(const Strings & all_hosts, const String & status_to_wait, std::optional<UInt64> timeout_ms);
-
-    String zookeeper_path;
-    zkutil::GetZooKeeper get_zookeeper;
-    Poco::Logger * log;
-};
-
-}
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@ -1,6 +1,7 @@
 #include <Backups/BackupEntriesCollector.h>
 #include <Backups/BackupEntryFromMemory.h>
 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupCoordinationStage.h>
 #include <Backups/BackupUtils.h>
 #include <Backups/DDLAdjustingForBackupVisitor.h>
 #include <Databases/IDatabase.h>
@ -31,20 +32,11 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

+
+namespace Stage = BackupCoordinationStage;
+
 namespace
 {
-    /// Finding all tables and databases which we're going to put to the backup and collecting their metadata.
-    constexpr const char * kGatheringMetadataStatus = "gathering metadata";
-
-    /// Making temporary hard links and prepare backup entries.
-    constexpr const char * kExtractingDataFromTablesStatus = "extracting data from tables";
-
-    /// Running special tasks for replicated tables which can also prepare some backup entries.
-    constexpr const char * kRunningPostTasksStatus = "running post-tasks";
-
-    /// Writing backup entries to the backup and removing temporary hard links.
-    constexpr const char * kWritingBackupStatus = "writing backup";
-
    /// Uppercases the first character of a passed string.
    String toUpperFirst(const String & str)
    {
@ -90,7 +82,8 @@ BackupEntriesCollector::BackupEntriesCollector(
    , backup_settings(backup_settings_)
    , backup_coordination(backup_coordination_)
    , context(context_)
-    , consistent_metadata_snapshot_timeout(context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 300000))
+    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
+    , consistent_metadata_snapshot_timeout(context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000))
    , log(&Poco::Logger::get("BackupEntriesCollector"))
 {
 }
@ -100,7 +93,7 @@ BackupEntriesCollector::~BackupEntriesCollector() = default;
 BackupEntries BackupEntriesCollector::run()
 {
    /// run() can be called onle once.
-    if (!current_status.empty())
+    if (!current_stage.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Already making backup entries");

    /// Find other hosts working along with us to execute this ON CLUSTER query.
@ -123,36 +116,40 @@ BackupEntries BackupEntriesCollector::run()
    makeBackupEntriesForTablesDefs();

    /// Make backup entries for the data of the found tables.
-    setStatus(kExtractingDataFromTablesStatus);
+    setStage(Stage::EXTRACTING_DATA_FROM_TABLES);
    makeBackupEntriesForTablesData();

    /// Run all the tasks added with addPostCollectingTask().
-    setStatus(kRunningPostTasksStatus);
+    setStage(Stage::RUNNING_POST_TASKS);
    runPostTasks();

    /// No more backup entries or tasks are allowed after this point.
-    setStatus(kWritingBackupStatus);
+    setStage(Stage::WRITING_BACKUP);

    return std::move(backup_entries);
 }

-Strings BackupEntriesCollector::setStatus(const String & new_status, const String & message)
+Strings BackupEntriesCollector::setStage(const String & new_stage, const String & message)
 {
-    LOG_TRACE(log, "{}", toUpperFirst(new_status));
-    current_status = new_status;
+    LOG_TRACE(log, "{}", toUpperFirst(new_stage));
+    current_stage = new_stage;

-    backup_coordination->setStatus(backup_settings.host_id, new_status, message);
+    backup_coordination->setStage(backup_settings.host_id, new_stage, message);

-    if (new_status.starts_with(kGatheringMetadataStatus))
+    if (new_stage == Stage::formatGatheringMetadata(1))
    {
-        auto now = std::chrono::steady_clock::now();
-        auto end_of_timeout = std::max(now, consistent_metadata_snapshot_start_time + consistent_metadata_snapshot_timeout);
-        return backup_coordination->waitStatusFor(
-            all_hosts, new_status, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - now).count());
+        return backup_coordination->waitForStage(all_hosts, new_stage, on_cluster_first_sync_timeout);
+    }
+    else if (new_stage.starts_with(Stage::GATHERING_METADATA))
+    {
+        auto current_time = std::chrono::steady_clock::now();
+        auto end_of_timeout = std::max(current_time, consistent_metadata_snapshot_end_time);
+        return backup_coordination->waitForStage(
+            all_hosts, new_stage, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - current_time));
    }
    else
    {
-        return backup_coordination->waitStatus(all_hosts, new_status);
+        return backup_coordination->waitForStage(all_hosts, new_stage);
    }
 }

@ -173,18 +170,18 @@ void BackupEntriesCollector::calculateRootPathInBackup()
 /// Finds databases and tables which we will put to the backup.
 void BackupEntriesCollector::gatherMetadataAndCheckConsistency()
 {
-    consistent_metadata_snapshot_start_time = std::chrono::steady_clock::now();
-    auto end_of_timeout = consistent_metadata_snapshot_start_time + consistent_metadata_snapshot_timeout;
-    setStatus(fmt::format("{} ({})", kGatheringMetadataStatus, 1));
+    setStage(Stage::formatGatheringMetadata(1));
+
+    consistent_metadata_snapshot_end_time = std::chrono::steady_clock::now() + consistent_metadata_snapshot_timeout;

    for (size_t pass = 1;; ++pass)
    {
-        String new_status = fmt::format("{} ({})", kGatheringMetadataStatus, pass + 1);
+        String next_stage = Stage::formatGatheringMetadata(pass + 1);
        std::optional<Exception> inconsistency_error;
        if (tryGatherMetadataAndCompareWithPrevious(inconsistency_error))
        {
            /// Gathered metadata and checked consistency, cool! But we have to check that other hosts cope with that too.
-            auto all_hosts_results = setStatus(new_status, "consistent");
+            auto all_hosts_results = setStage(next_stage, "consistent");

            std::optional<String> host_with_inconsistency;
            std::optional<String> inconsistency_error_on_other_host;
@ -210,13 +207,13 @@ void BackupEntriesCollector::gatherMetadataAndCheckConsistency()
        else
        {
            /// Failed to gather metadata or something wasn't consistent. We'll let other hosts know that and try again.
-            setStatus(new_status, inconsistency_error->displayText());
+            setStage(next_stage, inconsistency_error->displayText());
        }

        /// Two passes is minimum (we need to compare with table names with previous ones to be sure we don't miss anything).
        if (pass >= 2)
        {
-            if (std::chrono::steady_clock::now() > end_of_timeout)
+            if (std::chrono::steady_clock::now() > consistent_metadata_snapshot_end_time)
                inconsistency_error->rethrow();
            else
                LOG_WARNING(log, "{}", inconsistency_error->displayText());
@ -239,6 +236,7 @@ bool BackupEntriesCollector::tryGatherMetadataAndCompareWithPrevious(std::option
        table_infos.clear();
        gatherDatabasesMetadata();
        gatherTablesMetadata();
+        lockTablesForReading();
    }
    catch (Exception & e)
    {
@ -526,12 +524,11 @@ void BackupEntriesCollector::lockTablesForReading()
    for (auto & [table_name, table_info] : table_infos)
    {
        auto storage = table_info.storage;
-        TableLockHolder table_lock;
        if (storage)
        {
            try
            {
-                table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
+                table_info.table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
            }
            catch (Exception & e)
            {
@ -712,7 +709,7 @@ void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableN

 void BackupEntriesCollector::addBackupEntry(const String & file_name, BackupEntryPtr backup_entry)
 {
-    if (current_status == kWritingBackupStatus)
+    if (current_stage == Stage::WRITING_BACKUP)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed");
    backup_entries.emplace_back(file_name, backup_entry);
 }
@ -724,21 +721,21 @@ void BackupEntriesCollector::addBackupEntry(const std::pair<String, BackupEntryP

 void BackupEntriesCollector::addBackupEntries(const BackupEntries & backup_entries_)
 {
-    if (current_status == kWritingBackupStatus)
+    if (current_stage == Stage::WRITING_BACKUP)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of backup entries is not allowed");
    insertAtEnd(backup_entries, backup_entries_);
 }

 void BackupEntriesCollector::addBackupEntries(BackupEntries && backup_entries_)
 {
-    if (current_status == kWritingBackupStatus)
+    if (current_stage == Stage::WRITING_BACKUP)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of backup entries is not allowed");
    insertAtEnd(backup_entries, std::move(backup_entries_));
 }

 void BackupEntriesCollector::addPostTask(std::function<void()> task)
 {
-    if (current_status == kWritingBackupStatus)
+    if (current_stage == Stage::WRITING_BACKUP)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of post tasks is not allowed");
    post_tasks.push(std::move(task));
 }
--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@ -86,12 +86,13 @@ private:

    void runPostTasks();

-    Strings setStatus(const String & new_status, const String & message = "");
+    Strings setStage(const String & new_stage, const String & message = "");

    const ASTBackupQuery::Elements backup_query_elements;
    const BackupSettings backup_settings;
    std::shared_ptr<IBackupCoordination> backup_coordination;
    ContextPtr context;
+    std::chrono::milliseconds on_cluster_first_sync_timeout;
    std::chrono::milliseconds consistent_metadata_snapshot_timeout;
    Poco::Logger * log;

@ -129,8 +130,8 @@ private:
        std::optional<ASTs> partitions;
    };

-    String current_status;
-    std::chrono::steady_clock::time_point consistent_metadata_snapshot_start_time;
+    String current_stage;
+    std::chrono::steady_clock::time_point consistent_metadata_snapshot_end_time;
    std::unordered_map<String, DatabaseInfo> database_infos;
    std::unordered_map<QualifiedTableName, TableInfo> table_infos;
    std::vector<std::pair<String, String>> previous_databases_metadata;
--- a/src/Backups/BackupIO.h
+++ b/src/Backups/BackupIO.h
@ -8,21 +8,22 @@ class SeekableReadBuffer;
 class WriteBuffer;

 /// Represents operations of loading from disk or downloading for reading a backup.
-class IBackupReader /// BackupReaderFile, BackupReaderDisk, BackupReaderS3
+class IBackupReader /// BackupReaderFile, BackupReaderDisk
 {
 public:
    virtual ~IBackupReader() = default;
    virtual bool fileExists(const String & file_name) = 0;
-    virtual size_t getFileSize(const String & file_name) = 0;
+    virtual UInt64 getFileSize(const String & file_name) = 0;
    virtual std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) = 0;
 };

 /// Represents operations of storing to disk or uploading for writing a backup.
-class IBackupWriter /// BackupWriterFile, BackupWriterDisk, BackupWriterS3
+class IBackupWriter /// BackupWriterFile, BackupWriterDisk
 {
 public:
    virtual ~IBackupWriter() = default;
    virtual bool fileExists(const String & file_name) = 0;
+    virtual UInt64 getFileSize(const String & file_name) = 0;
    virtual bool fileContentsEqual(const String & file_name, const String & expected_file_contents) = 0;
    virtual std::unique_ptr<WriteBuffer> writeFile(const String & file_name) = 0;
    virtual void removeFiles(const Strings & file_names) = 0;
--- a/src/Backups/BackupIO_Disk.cpp
+++ b/src/Backups/BackupIO_Disk.cpp
@ -17,7 +17,7 @@ bool BackupReaderDisk::fileExists(const String & file_name)
    return disk->exists(path / file_name);
 }

-size_t BackupReaderDisk::getFileSize(const String & file_name)
+UInt64 BackupReaderDisk::getFileSize(const String & file_name)
 {
    return disk->getFileSize(path / file_name);
 }
@ -38,6 +38,11 @@ bool BackupWriterDisk::fileExists(const String & file_name)
    return disk->exists(path / file_name);
 }

+/// Returns the size reported by the disk for `file_name`, resolved relative to the writer's `path`.
+UInt64 BackupWriterDisk::getFileSize(const String & file_name)
+{
+    const auto full_path = path / file_name;
+    return disk->getFileSize(full_path);
+}
+
 bool BackupWriterDisk::fileContentsEqual(const String & file_name, const String & expected_file_contents)
 {
    if (!disk->exists(path / file_name))
--- a/src/Backups/BackupIO_Disk.h
+++ b/src/Backups/BackupIO_Disk.h
@ -15,7 +15,7 @@ public:
    ~BackupReaderDisk() override;

    bool fileExists(const String & file_name) override;
-    size_t getFileSize(const String & file_name) override;
+    UInt64 getFileSize(const String & file_name) override;
    std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) override;

 private:
@ -30,6 +30,7 @@ public:
    ~BackupWriterDisk() override;

    bool fileExists(const String & file_name) override;
+    UInt64 getFileSize(const String & file_name) override;
    bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override;
    std::unique_ptr<WriteBuffer> writeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
--- a/src/Backups/BackupIO_File.cpp
+++ b/src/Backups/BackupIO_File.cpp
@ -18,7 +18,7 @@ bool BackupReaderFile::fileExists(const String & file_name)
    return fs::exists(path / file_name);
 }

-size_t BackupReaderFile::getFileSize(const String & file_name)
+UInt64 BackupReaderFile::getFileSize(const String & file_name)
 {
    return fs::file_size(path / file_name);
 }
@ -39,6 +39,11 @@ bool BackupWriterFile::fileExists(const String & file_name)
    return fs::exists(path / file_name);
 }

+/// Returns the size reported by the filesystem for `file_name`, resolved relative to the writer's `path`.
+UInt64 BackupWriterFile::getFileSize(const String & file_name)
+{
+    const auto full_path = path / file_name;
+    return fs::file_size(full_path);
+}
+
 bool BackupWriterFile::fileContentsEqual(const String & file_name, const String & expected_file_contents)
 {
    if (!fs::exists(path / file_name))
--- a/src/Backups/BackupIO_File.h
+++ b/src/Backups/BackupIO_File.h
@ -13,7 +13,7 @@ public:
    ~BackupReaderFile() override;

    bool fileExists(const String & file_name) override;
-    size_t getFileSize(const String & file_name) override;
+    UInt64 getFileSize(const String & file_name) override;
    std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) override;

 private:
@ -27,6 +27,7 @@ public:
    ~BackupWriterFile() override;

    bool fileExists(const String & file_name) override;
+    UInt64 getFileSize(const String & file_name) override;
    bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override;
    std::unique_ptr<WriteBuffer> writeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
--- a/src/Backups/BackupImpl.cpp
+++ b/src/Backups/BackupImpl.cpp
@ -219,10 +219,7 @@ void BackupImpl::open(const ContextPtr & context)
 void BackupImpl::close()
 {
    std::lock_guard lock{mutex};
-
-    archive_readers.clear();
-    for (auto & archive_writer : archive_writers)
-        archive_writer = {"", nullptr};
+    closeArchives();

    if (!is_internal_backup && writer && !writing_finalized)
        removeAllFilesAfterFailure();
@ -232,10 +229,29 @@ void BackupImpl::close()
    coordination.reset();
 }

-time_t BackupImpl::getTimestamp() const
+/// Drops all cached archive readers and resets every archive writer slot to an empty name with no writer.
+void BackupImpl::closeArchives()
+{
+    archive_readers.clear();
+
+    /// Writers are reset in place (the container keeps its slots) rather than cleared.
+    for (auto & writer_slot : archive_writers)
+        writer_slot = {"", nullptr};
+}
+
+size_t BackupImpl::getNumFiles() const
 {
    std::lock_guard lock{mutex};
-    return timestamp;
+    return num_files;
+}
+
+/// Returns the current value of `uncompressed_size`, read while holding `mutex`.
+UInt64 BackupImpl::getUncompressedSize() const
+{
+    std::lock_guard guard{mutex};
+    return uncompressed_size;
+}
+
+UInt64 BackupImpl::getCompressedSize() const
+{
+    std::lock_guard lock{mutex};
+    return compressed_size;
 }

 void BackupImpl::writeBackupMetadata()
@ -290,6 +306,7 @@ void BackupImpl::writeBackupMetadata()
            if (info.pos_in_archive != static_cast<size_t>(-1))
                config->setUInt64(prefix + "pos_in_archive", info.pos_in_archive);
        }
+        increaseUncompressedSize(info);
        ++index;
    }

@ -306,6 +323,8 @@ void BackupImpl::writeBackupMetadata()
        out = writer->writeFile(".backup");
    out->write(str.data(), str.size());
    out->finalize();
+
+    increaseUncompressedSize(str.size());
 }

 void BackupImpl::readBackupMetadata()
@ -315,6 +334,7 @@ void BackupImpl::readBackupMetadata()
    {
        if (!reader->fileExists(archive_params.archive_name))
            throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", backup_name);
+        setCompressedSize();
        in = getArchiveReader("")->readFile(".backup");
    }
    else
@ -326,6 +346,7 @@ void BackupImpl::readBackupMetadata()

    String str;
    readStringUntilEOF(str, *in);
+    increaseUncompressedSize(str.size());
    std::istringstream stream(str); // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    Poco::AutoPtr<Poco::Util::XMLConfiguration> config{new Poco::Util::XMLConfiguration()};
    config->load(stream);
@ -382,8 +403,12 @@ void BackupImpl::readBackupMetadata()
            }

            coordination->addFileInfo(info);
+            increaseUncompressedSize(info);
        }
    }
+
+    if (!use_archives)
+        setCompressedSize();
 }

 void BackupImpl::checkBackupDoesntExist() const
@ -750,6 +775,8 @@ void BackupImpl::finalizeWriting()
    {
        LOG_TRACE(log, "Finalizing backup {}", backup_name);
        writeBackupMetadata();
+        closeArchives();
+        setCompressedSize();
        removeLockFile();
        LOG_TRACE(log, "Finalized backup {}", backup_name);
    }
@ -758,12 +785,32 @@ void BackupImpl::finalizeWriting()
 }


+/// Accounts for one more file: bumps `num_files` by 1 and adds `file_size` to `uncompressed_size`.
+void BackupImpl::increaseUncompressedSize(UInt64 file_size)
+{
+    ++num_files;
+    uncompressed_size += file_size;
+}
+
+/// Accounts for the part of `info` that exceeds its base size (`info.size - info.base_size`).
+/// The entry is counted only if `data_file_name` is empty or equal to `file_name`.
+void BackupImpl::increaseUncompressedSize(const FileInfo & info)
+{
+    /// Nothing to count when the entry is not larger than its base.
+    if (info.size <= info.base_size)
+        return;
+
+    const bool data_name_matches = info.data_file_name.empty() || (info.data_file_name == info.file_name);
+    if (data_name_matches)
+        increaseUncompressedSize(info.size - info.base_size);
+}
+
+/// Sets `compressed_size`.
+/// Without archives it is simply `uncompressed_size`; with archives it is the size of the file named
+/// `archive_params.archive_name`, asked from `writer` if it is set and from `reader` otherwise.
+void BackupImpl::setCompressedSize()
+{
+    if (!use_archives)
+    {
+        compressed_size = uncompressed_size;
+        return;
+    }
+
+    const auto & archive_name = archive_params.archive_name;
+    if (writer)
+        compressed_size = writer->getFileSize(archive_name);
+    else
+        compressed_size = reader->getFileSize(archive_name);
+}
+
+
 String BackupImpl::getArchiveNameWithSuffix(const String & suffix) const
 {
    return archive_params.archive_name + (suffix.empty() ? "" : ".") + suffix;
 }

-
 std::shared_ptr<IArchiveReader> BackupImpl::getArchiveReader(const String & suffix) const
 {
    auto it = archive_readers.find(suffix);
@ -796,6 +843,7 @@ std::shared_ptr<IArchiveWriter> BackupImpl::getArchiveWriter(const String & suff
    return new_archive_writer;
 }

+
 void BackupImpl::removeAllFilesAfterFailure()
 {
    if (is_internal_backup)
--- a/src/Backups/BackupImpl.h
+++ b/src/Backups/BackupImpl.h
@ -55,8 +55,11 @@ public:

    const String & getName() const override { return backup_name; }
    OpenMode getOpenMode() const override { return open_mode; }
-    time_t getTimestamp() const override;
+    time_t getTimestamp() const override { return timestamp; }
    UUID getUUID() const override { return *uuid; }
+    size_t getNumFiles() const override;
+    UInt64 getUncompressedSize() const override;
+    UInt64 getCompressedSize() const override;
    Strings listFiles(const String & directory, bool recursive) const override;
    bool hasFiles(const String & directory) const override;
    bool fileExists(const String & file_name) const override;
@ -76,6 +79,7 @@ private:

    void open(const ContextPtr & context);
    void close();
+    void closeArchives();

    /// Writes the file ".backup" containing backup's metadata.
    void writeBackupMetadata();
@ -96,6 +100,13 @@ private:
    std::shared_ptr<IArchiveReader> getArchiveReader(const String & suffix) const;
    std::shared_ptr<IArchiveWriter> getArchiveWriter(const String & suffix);

+    /// Increases `uncompressed_size` by a specific value and `num_files` by 1.
+    void increaseUncompressedSize(UInt64 file_size);
+    void increaseUncompressedSize(const FileInfo & info);
+
+    /// Calculates and sets `compressed_size`.
+    void setCompressedSize();
+
    const String backup_name;
    const ArchiveParams archive_params;
    const bool use_archives;
@ -108,6 +119,9 @@ private:
    mutable std::mutex mutex;
    std::optional<UUID> uuid;
    time_t timestamp = 0;
+    size_t num_files = 0;
+    UInt64 uncompressed_size = 0;
+    UInt64 compressed_size = 0;
    UInt64 version;
    std::optional<BackupInfo> base_backup_info;
    std::shared_ptr<const IBackup> base_backup;
--- a/src/Backups/BackupSettings.cpp
+++ b/src/Backups/BackupSettings.cpp
@ -60,6 +60,7 @@ namespace

 /// List of backup settings except base_backup_name and cluster_host_ids.
 #define LIST_OF_BACKUP_SETTINGS(M) \
+    M(String, id) \
    M(String, compression_method) \
    M(Int64, compression_level) \
    M(String, password) \
--- a/src/Backups/BackupSettings.h
+++ b/src/Backups/BackupSettings.h
@ -11,6 +11,9 @@ class ASTBackupQuery;
 /// Settings specified in the "SETTINGS" clause of a BACKUP query.
 struct BackupSettings
 {
+    /// ID of the backup operation, to identify it in the system.backups table. Auto-generated if not set.
+    String id;
+
    /// Base backup, if it's set an incremental backup will be built. That means only differences made after the base backup will be put
    /// into a new backup.
    std::optional<BackupInfo> base_backup_info;
--- a/src/Backups/BackupStatus.cpp
+++ b/src/Backups/BackupStatus.cpp
@ -15,18 +15,18 @@ std::string_view toString(BackupStatus backup_status)
 {
    switch (backup_status)
    {
-        case BackupStatus::MAKING_BACKUP:
-            return "MAKING_BACKUP";
-        case BackupStatus::BACKUP_COMPLETE:
-            return "BACKUP_COMPLETE";
-        case BackupStatus::FAILED_TO_BACKUP:
-            return "FAILED_TO_BACKUP";
+        case BackupStatus::CREATING_BACKUP:
+            return "CREATING_BACKUP";
+        case BackupStatus::BACKUP_CREATED:
+            return "BACKUP_CREATED";
+        case BackupStatus::BACKUP_FAILED:
+            return "BACKUP_FAILED";
        case BackupStatus::RESTORING:
            return "RESTORING";
        case BackupStatus::RESTORED:
            return "RESTORED";
-        case BackupStatus::FAILED_TO_RESTORE:
-            return "FAILED_TO_RESTORE";
+        case BackupStatus::RESTORE_FAILED:
+            return "RESTORE_FAILED";
        default:
            break;
    }
--- a/src/Backups/BackupStatus.h
+++ b/src/Backups/BackupStatus.h
@ -9,14 +9,14 @@ namespace DB
 enum class BackupStatus
 {
    /// Statuses of making backups
-    MAKING_BACKUP,
-    BACKUP_COMPLETE,
-    FAILED_TO_BACKUP,
+    CREATING_BACKUP,
+    BACKUP_CREATED,
+    BACKUP_FAILED,

    /// Status of restoring
    RESTORING,
    RESTORED,
-    FAILED_TO_RESTORE,
+    RESTORE_FAILED,

    MAX,
 };
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
@ -5,6 +5,7 @@
 #include <Backups/BackupUtils.h>
 #include <Backups/IBackupEntry.h>
 #include <Backups/BackupEntriesCollector.h>
+#include <Backups/BackupCoordinationStage.h>
 #include <Backups/BackupCoordinationRemote.h>
 #include <Backups/BackupCoordinationLocal.h>
 #include <Backups/RestoreCoordinationRemote.h>
@ -18,7 +19,6 @@
 #include <Common/Exception.h>
 #include <Common/Macros.h>
 #include <Common/logger_useful.h>
-#include <Common/scope_guard_safe.h>
 #include <Common/setThreadName.h>


@ -27,28 +27,95 @@ namespace DB

 namespace ErrorCodes
 {
+    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
 }

+using OperationID = BackupsWorker::OperationID;
+namespace Stage = BackupCoordinationStage;
+
 namespace
 {
-    /// Coordination status meaning that a host finished its work.
-    constexpr const char * kCompletedCoordinationStatus = "completed";
-
-    /// Sends information about the current exception to IBackupCoordination or IRestoreCoordination.
-    template <typename CoordinationType>
-    void sendErrorToCoordination(std::shared_ptr<CoordinationType> coordination, const String & current_host)
+    /// Creates the coordination object for a backup.
+    /// An empty `coordination_zk_path` selects BackupCoordinationLocal; otherwise BackupCoordinationRemote is created
+    /// with that path, a getter that obtains ZooKeeper from the global context, and the negation of `is_internal_backup`.
+    std::shared_ptr<IBackupCoordination> makeBackupCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup)
+    {
+        if (coordination_zk_path.empty())
+            return std::make_shared<BackupCoordinationLocal>();
+
+        /// The global context is captured by value so the getter does not depend on the lifetime of `context`.
+        auto zookeeper_getter = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); };
+        return std::make_shared<BackupCoordinationRemote>(coordination_zk_path, zookeeper_getter, !is_internal_backup);
+    }
+
+    /// Creates the coordination object for a restore.
+    /// An empty `coordination_zk_path` selects RestoreCoordinationLocal; otherwise RestoreCoordinationRemote is created
+    /// with that path, a getter that obtains ZooKeeper from the global context, and the negation of `is_internal_backup`.
+    std::shared_ptr<IRestoreCoordination> makeRestoreCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup)
+    {
+        if (coordination_zk_path.empty())
+            return std::make_shared<RestoreCoordinationLocal>();
+
+        /// The global context is captured by value so the getter does not depend on the lifetime of `context`.
+        auto zookeeper_getter = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); };
+        return std::make_shared<RestoreCoordinationRemote>(coordination_zk_path, zookeeper_getter, !is_internal_backup);
+    }
+
+    /// Sends information about an exception to IBackupCoordination or IRestoreCoordination.
+    template <typename CoordinationType>
+    void sendExceptionToCoordination(std::shared_ptr<CoordinationType> coordination, const String & current_host, const Exception & exception)
    {
-        if (!coordination)
-            return;
        try
        {
-            coordination->setErrorStatus(current_host, Exception{getCurrentExceptionCode(), getCurrentExceptionMessage(true, true)});
+            if (coordination)
+                coordination->setError(current_host, exception);
        }
        catch (...)
        {
        }
    }
+
+    /// Sends information about the current exception to IBackupCoordination or IRestoreCoordination.
+    /// Must be called while an exception is being handled (i.e. from inside a `catch` block), because it
+    /// rethrows the in-flight exception to find out its type.
+    /// Delivery is best-effort: a null `coordination` is ignored and a failing setError() is swallowed,
+    /// both by sendExceptionToCoordination().
+    template <typename CoordinationType>
+    void sendCurrentExceptionToCoordination(std::shared_ptr<CoordinationType> coordination, const String & current_host)
+    {
+        try
+        {
+            throw;
+        }
+        catch (const Exception & e)
+        {
+            sendExceptionToCoordination(coordination, current_host, e);
+        }
+        catch (...)
+        {
+            /// Not a DB::Exception: wrap the current exception's code and message into one.
+            /// Go through sendExceptionToCoordination() instead of calling setError() directly, so that
+            /// a null `coordination` is not dereferenced and an error thrown by setError() cannot escape from here.
+            sendExceptionToCoordination(
+                coordination, current_host, Exception{getCurrentExceptionCode(), getCurrentExceptionMessage(true, true)});
+        }
+    }
+
+    /// Returns true for BACKUP_CREATED, BACKUP_FAILED, RESTORED and RESTORE_FAILED; false for any other status.
+    bool isFinalStatus(BackupStatus status)
+    {
+        const bool backup_is_over = (status == BackupStatus::BACKUP_CREATED) || (status == BackupStatus::BACKUP_FAILED);
+        const bool restore_is_over = (status == BackupStatus::RESTORED) || (status == BackupStatus::RESTORE_FAILED);
+        return backup_is_over || restore_is_over;
+    }
+
+    /// Returns true for BACKUP_FAILED and RESTORE_FAILED; false for any other status.
+    bool isErrorStatus(BackupStatus status)
+    {
+        if (status == BackupStatus::BACKUP_FAILED)
+            return true;
+        return status == BackupStatus::RESTORE_FAILED;
+    }
+
+    /// Used to change num_active_backups: yields 1 for CREATING_BACKUP and 0 for any other status.
+    size_t getNumActiveBackupsChange(BackupStatus status)
+    {
+        return (status == BackupStatus::CREATING_BACKUP) ? 1 : 0;
+    }
+
+    /// Used to change num_active_restores: yields 1 for RESTORING and 0 for any other status.
+    size_t getNumActiveRestoresChange(BackupStatus status)
+    {
+        return (status == BackupStatus::RESTORING) ? 1 : 0;
+    }
 }


@ -60,7 +127,8 @@ BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threa
    /// We set max_free_threads = 0 because we don't want to keep any threads if there is no BACKUP or RESTORE query running right now.
 }

-UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context)
+
+OperationID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context)
 {
    const ASTBackupQuery & backup_query = typeid_cast<const ASTBackupQuery &>(*backup_or_restore_query);
    if (backup_query.kind == ASTBackupQuery::Kind::BACKUP)
@ -70,379 +138,525 @@ UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutable
 }


-UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context)
+/// Starts a BACKUP operation described by `query` and returns the ID under which it is registered in `infos`.
+/// If the `async` setting is set, doBackup() is scheduled on `backups_thread_pool`;
+/// otherwise doBackup() is executed in the calling thread before this function returns.
+/// Any exception thrown here (including one rethrown by a synchronous doBackup()) is reported and then rethrown.
+OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context)
 {
    auto backup_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
    auto backup_settings = BackupSettings::fromBackupQuery(*backup_query);
-    auto backup_info = BackupInfo::fromAST(*backup_query->backup_name);
-    bool on_cluster = !backup_query->cluster.empty();

    if (!backup_settings.backup_uuid)
        backup_settings.backup_uuid = UUIDHelpers::generateV4();
-    UUID backup_uuid = *backup_settings.backup_uuid;

-    /// Prepare context to use.
-    ContextPtr context_in_use = context;
-    ContextMutablePtr mutable_context;
-    if (on_cluster || backup_settings.async)
+    /// `backup_id` will be used as a key to the `infos` map, so it should be unique.
+    OperationID backup_id;
+    if (backup_settings.internal)
+        backup_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `backup_id` for internal backup to avoid collision if both internal and non-internal backups are on the same host
+    else if (!backup_settings.id.empty())
+        backup_id = backup_settings.id;
+    else
+        backup_id = toString(*backup_settings.backup_uuid);
+
+    /// Stays null for non-internal backups; doBackup() creates the coordination in that case.
+    std::shared_ptr<IBackupCoordination> backup_coordination;
+    if (backup_settings.internal)
    {
-        /// For ON CLUSTER queries we will need to change some settings.
-        /// For ASYNC queries we have to clone the context anyway.
-        context_in_use = mutable_context = Context::createCopy(context);
+        /// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination
+        /// if it's not created here. However to handle errors better it's better to make a coordination here because this way
+        /// if an exception will be thrown in startMakingBackup() other hosts will know about that.
+        backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal);
    }

-    addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP, backup_settings.internal);
+    try
+    {
+        auto backup_info = BackupInfo::fromAST(*backup_query->backup_name);
+        addInfo(backup_id, backup_info.toString(), backup_settings.internal, BackupStatus::CREATING_BACKUP);

-    auto job = [this,
-                backup_uuid,
+        /// Prepare context to use.
+        ContextPtr context_in_use = context;
+        ContextMutablePtr mutable_context;
+        bool on_cluster = !backup_query->cluster.empty();
+        if (on_cluster || backup_settings.async)
+        {
+            /// For ON CLUSTER queries we will need to change some settings.
+            /// For ASYNC queries we have to clone the context anyway.
+            context_in_use = mutable_context = Context::createCopy(context);
+        }
+
+        if (backup_settings.async)
+        {
+            /// Everything is captured by value because the job can outlive this function call.
+            backups_thread_pool.scheduleOrThrowOnError(
+                [this, backup_query, backup_id, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context]
+                {
+                    doBackup(
+                        backup_query,
+                        backup_id,
+                        backup_settings,
+                        backup_info,
+                        backup_coordination,
+                        context_in_use,
+                        mutable_context,
+                        /* called_async= */ true);
+                });
+        }
+        else
+        {
+            doBackup(
                 backup_query,
+                backup_id,
                 backup_settings,
                 backup_info,
-                on_cluster,
+                backup_coordination,
                 context_in_use,
-                mutable_context](bool async) mutable
+                mutable_context,
+                /* called_async= */ false);
+        }
+
+        return backup_id;
+    }
+    catch (...)
    {
-        std::optional<CurrentThread::QueryScope> query_scope;
-        std::shared_ptr<IBackupCoordination> backup_coordination;
-        SCOPE_EXIT_SAFE(if (backup_coordination && !backup_settings.internal) backup_coordination->drop(););
-
-        try
-        {
-            if (async)
-            {
-                query_scope.emplace(mutable_context);
-                setThreadName("BackupWorker");
-            }
-
-            /// Checks access rights if this is not ON CLUSTER query.
-            /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.)
-            auto required_access = getRequiredAccessToBackup(backup_query->elements);
-            if (!on_cluster)
-                context_in_use->checkAccess(required_access);
-
-            ClusterPtr cluster;
-            if (on_cluster)
-            {
-                backup_query->cluster = context_in_use->getMacros()->expand(backup_query->cluster);
-                cluster = context_in_use->getCluster(backup_query->cluster);
-                backup_settings.cluster_host_ids = cluster->getHostIDs();
-                if (backup_settings.coordination_zk_path.empty())
-                {
-                    String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
-                    backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid);
-                }
-            }
-
-            /// Make a backup coordination.
-            if (!backup_settings.coordination_zk_path.empty())
-            {
-                backup_coordination = std::make_shared<BackupCoordinationRemote>(
-                    backup_settings.coordination_zk_path,
-                    [global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); });
-            }
-            else
-            {
-                backup_coordination = std::make_shared<BackupCoordinationLocal>();
-            }
-
-            /// Opens a backup for writing.
-            BackupFactory::CreateParams backup_create_params;
-            backup_create_params.open_mode = IBackup::OpenMode::WRITE;
-            backup_create_params.context = context_in_use;
-            backup_create_params.backup_info = backup_info;
-            backup_create_params.base_backup_info = backup_settings.base_backup_info;
-            backup_create_params.compression_method = backup_settings.compression_method;
-            backup_create_params.compression_level = backup_settings.compression_level;
-            backup_create_params.password = backup_settings.password;
-            backup_create_params.is_internal_backup = backup_settings.internal;
-            backup_create_params.backup_coordination = backup_coordination;
-            backup_create_params.backup_uuid = backup_uuid;
-            BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params);
-
-            /// Write the backup.
-            if (on_cluster)
-            {
-                DDLQueryOnClusterParams params;
-                params.cluster = cluster;
-                params.only_shard_num = backup_settings.shard_num;
-                params.only_replica_num = backup_settings.replica_num;
-                params.access_to_check = required_access;
-                backup_settings.copySettingsToQuery(*backup_query);
-
-                // executeDDLQueryOnCluster() will return without waiting for completion
-                mutable_context->setSetting("distributed_ddl_task_timeout", Field{0});
-                mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"});
-                executeDDLQueryOnCluster(backup_query, mutable_context, params);
-
-                /// Wait until all the hosts have written their backup entries.
-                auto all_hosts = BackupSettings::Util::filterHostIDs(
-                    backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num);
-                backup_coordination->waitStatus(all_hosts, kCompletedCoordinationStatus);
-            }
-            else
-            {
-                backup_query->setCurrentDatabase(context_in_use->getCurrentDatabase());
-
-                /// Prepare backup entries.
-                BackupEntries backup_entries;
-                {
-                    BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, context_in_use};
-                    backup_entries = backup_entries_collector.run();
-                }
-
-                /// Write the backup entries to the backup.
-                writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool);
-
-                /// We have written our backup entries, we need to tell other hosts (they could be waiting for it).
-                backup_coordination->setStatus(backup_settings.host_id, kCompletedCoordinationStatus, "");
-            }
-
-            /// Finalize backup (write its metadata).
-            if (!backup_settings.internal)
-                backup->finalizeWriting();
-
-            /// Close the backup.
-            backup.reset();
-
-            setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE);
-        }
-        catch (...)
-        {
-            /// Something bad happened, the backup has not built.
-            setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP);
-            sendErrorToCoordination(backup_coordination, backup_settings.host_id);
-            if (!async)
-                throw;
-        }
-    };
-
-    if (backup_settings.async)
-        backups_thread_pool.scheduleOrThrowOnError([job]() mutable { job(true); });
-    else
-        job(false);
-
-    return backup_uuid;
+        /// Something bad happened, the backup has not built.
+        /// This handler is reached for failures in this function and for exceptions rethrown by a synchronous doBackup().
+        /// NOTE(review): it is also reached when addInfo() rejects `backup_id` because the ID is already in use;
+        /// confirm that setStatusSafe() cannot then mark that other, still running operation as failed.
+        /// `backup_coordination` is null here for non-internal backups.
+        setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED);
+        sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id);
+        throw;
+    }
 }


-UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context)
+/// Does the actual work of the BACKUP operation registered under `backup_id`.
+/// `backup_coordination` may be null, in which case a coordination is created here.
+/// `mutable_context` must be set for ON CLUSTER and for asynchronous operations (see the assert below).
+/// If `called_async` is true the function sets up its own query scope and thread name, and reports a failure
+/// itself (log, status, coordination) without rethrowing it. Otherwise the exception is rethrown and
+/// startMakingBackup() reports it.
+void BackupsWorker::doBackup(
+    const std::shared_ptr<ASTBackupQuery> & backup_query,
+    const OperationID & backup_id,
+    BackupSettings backup_settings,
+    const BackupInfo & backup_info,
+    std::shared_ptr<IBackupCoordination> backup_coordination,
+    const ContextPtr & context,
+    ContextMutablePtr mutable_context,
+    bool called_async)
+{
+    std::optional<CurrentThread::QueryScope> query_scope;
+    try
+    {
+        if (called_async)
+        {
+            query_scope.emplace(mutable_context);
+            setThreadName("BackupWorker");
+        }
+
+        bool on_cluster = !backup_query->cluster.empty();
+        assert(mutable_context || (!on_cluster && !called_async));
+
+        /// Checks access rights if this is not ON CLUSTER query.
+        /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.)
+        auto required_access = getRequiredAccessToBackup(backup_query->elements);
+        if (!on_cluster)
+            context->checkAccess(required_access);
+
+        ClusterPtr cluster;
+        if (on_cluster)
+        {
+            backup_query->cluster = context->getMacros()->expand(backup_query->cluster);
+            cluster = context->getCluster(backup_query->cluster);
+            backup_settings.cluster_host_ids = cluster->getHostIDs();
+            if (backup_settings.coordination_zk_path.empty())
+            {
+                String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
+                backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(*backup_settings.backup_uuid);
+            }
+        }
+
+        /// Make a backup coordination.
+        if (!backup_coordination)
+            backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal);
+
+        /// Opens a backup for writing.
+        BackupFactory::CreateParams backup_create_params;
+        backup_create_params.open_mode = IBackup::OpenMode::WRITE;
+        backup_create_params.context = context;
+        backup_create_params.backup_info = backup_info;
+        backup_create_params.base_backup_info = backup_settings.base_backup_info;
+        backup_create_params.compression_method = backup_settings.compression_method;
+        backup_create_params.compression_level = backup_settings.compression_level;
+        backup_create_params.password = backup_settings.password;
+        backup_create_params.is_internal_backup = backup_settings.internal;
+        backup_create_params.backup_coordination = backup_coordination;
+        backup_create_params.backup_uuid = backup_settings.backup_uuid;
+        BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params);
+
+        /// Write the backup.
+        if (on_cluster)
+        {
+            DDLQueryOnClusterParams params;
+            params.cluster = cluster;
+            params.only_shard_num = backup_settings.shard_num;
+            params.only_replica_num = backup_settings.replica_num;
+            params.access_to_check = required_access;
+            backup_settings.copySettingsToQuery(*backup_query);
+
+            // executeDDLQueryOnCluster() will return without waiting for completion
+            mutable_context->setSetting("distributed_ddl_task_timeout", Field{0});
+            mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"});
+            executeDDLQueryOnCluster(backup_query, mutable_context, params);
+
+            /// Wait until all the hosts have written their backup entries.
+            auto all_hosts = BackupSettings::Util::filterHostIDs(
+                backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num);
+            backup_coordination->waitForStage(all_hosts, Stage::COMPLETED);
+        }
+        else
+        {
+            backup_query->setCurrentDatabase(context->getCurrentDatabase());
+
+            /// Prepare backup entries.
+            BackupEntries backup_entries;
+            {
+                BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, context};
+                backup_entries = backup_entries_collector.run();
+            }
+
+            /// Write the backup entries to the backup.
+            writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool);
+
+            /// We have written our backup entries, we need to tell other hosts (they could be waiting for it).
+            backup_coordination->setStage(backup_settings.host_id, Stage::COMPLETED, "");
+        }
+
+        /// These stay zero for internal backups because such backups are not finalized here.
+        size_t num_files = 0;
+        UInt64 uncompressed_size = 0;
+        UInt64 compressed_size = 0;
+
+        /// Finalize backup (write its metadata).
+        if (!backup_settings.internal)
+        {
+            backup->finalizeWriting();
+            num_files = backup->getNumFiles();
+            uncompressed_size = backup->getUncompressedSize();
+            compressed_size = backup->getCompressedSize();
+        }
+
+        /// Close the backup.
+        backup.reset();
+
+        LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString());
+
+        /// Store the counters before publishing the final status: this way nobody can observe BACKUP_CREATED
+        /// with counters which are not filled yet, and a failure to store them can't change an already final status.
+        setNumFilesAndSize(backup_id, num_files, uncompressed_size, compressed_size);
+        setStatus(backup_id, BackupStatus::BACKUP_CREATED);
+    }
+    catch (...)
+    {
+        /// Something bad happened, the backup has not built.
+        if (called_async)
+        {
+            /// Nobody is waiting for this job, so the failure is reported here and the exception is not rethrown.
+            tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_info.toString()));
+            setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED);
+            sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id);
+        }
+        else
+        {
+            /// setStatus() and sendCurrentExceptionToCoordination() will be called by startMakingBackup().
+            throw;
+        }
+    }
+}
+
+
+/// Starts a RESTORE operation described by `query` and returns the ID under which it is registered in `infos`.
+/// If the `async` setting is set, doRestore() is scheduled on a thread pool;
+/// otherwise doRestore() is executed in the calling thread before this function returns.
+/// Any exception thrown here (including one rethrown by a synchronous doRestore()) is reported and then rethrown.
+OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context)
 {
-    UUID restore_uuid = UUIDHelpers::generateV4();
    auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
    auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
-    auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
-    bool on_cluster = !restore_query->cluster.empty();

-    /// Prepare context to use.
-    ContextMutablePtr context_in_use = context;
-    if (restore_settings.async || on_cluster)
+    /// `restore_id` will be used as a key to the `infos` map, so it should be unique.
+    OperationID restore_id;
+    if (restore_settings.internal)
+        restore_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `restore_id` for internal restore to avoid collision if both internal and non-internal restores are on the same host
+    else if (!restore_settings.id.empty())
+        restore_id = restore_settings.id;
+    else
+        restore_id = toString(UUIDHelpers::generateV4());
+
+    /// Stays null for non-internal restores; doRestore() creates the coordination in that case.
+    std::shared_ptr<IRestoreCoordination> restore_coordination;
+    if (restore_settings.internal)
    {
-        /// For ON CLUSTER queries we will need to change some settings.
-        /// For ASYNC queries we have to clone the context anyway.
-        context_in_use = Context::createCopy(context);
+        /// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination
+        /// if it's not created here. However to handle errors better it's better to make a coordination here because this way
+        /// if an exception will be thrown in startRestoring() other hosts will know about that.
+        restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal);
    }

-    addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING, restore_settings.internal);
+    try
+    {
+        auto backup_info = BackupInfo::fromAST(*restore_query->backup_name);
+        addInfo(restore_id, backup_info.toString(), restore_settings.internal, BackupStatus::RESTORING);

-    auto job = [this,
-                restore_uuid,
+        /// Prepare context to use.
+        ContextMutablePtr context_in_use = context;
+        bool on_cluster = !restore_query->cluster.empty();
+        if (restore_settings.async || on_cluster)
+        {
+            /// For ON CLUSTER queries we will need to change some settings.
+            /// For ASYNC queries we have to clone the context anyway.
+            context_in_use = Context::createCopy(context);
+        }
+
+        if (restore_settings.async)
+        {
+            /// Everything is captured by value because the job can outlive this function call.
+            /// NOTE(review): the job is scheduled on `backups_thread_pool` while doRestore() runs its data tasks
+            /// on `restores_thread_pool` - confirm that using the backups pool here is intentional.
+            backups_thread_pool.scheduleOrThrowOnError(
+                [this, restore_query, restore_id, restore_settings, backup_info, restore_coordination, context_in_use] {
+                    doRestore(
+                        restore_query,
+                        restore_id,
+                        restore_settings,
+                        backup_info,
+                        restore_coordination,
+                        context_in_use,
+                        /* called_async= */ true);
+                });
+        }
+        else
+        {
+            doRestore(
                 restore_query,
+                restore_id,
                 restore_settings,
                 backup_info,
-                on_cluster,
-                context_in_use](bool async) mutable
+                restore_coordination,
+                context_in_use,
+                /* called_async= */ false);
+        }
+
+        return restore_id;
+    }
+    catch (...)
    {
-        std::optional<CurrentThread::QueryScope> query_scope;
-        std::shared_ptr<IRestoreCoordination> restore_coordination;
-        SCOPE_EXIT_SAFE(if (restore_coordination && !restore_settings.internal) restore_coordination->drop(););
-
-        try
-        {
-            if (async)
-            {
-                query_scope.emplace(context_in_use);
-                setThreadName("RestoreWorker");
-            }
-
-            /// Open the backup for reading.
-            BackupFactory::CreateParams backup_open_params;
-            backup_open_params.open_mode = IBackup::OpenMode::READ;
-            backup_open_params.context = context_in_use;
-            backup_open_params.backup_info = backup_info;
-            backup_open_params.base_backup_info = restore_settings.base_backup_info;
-            backup_open_params.password = restore_settings.password;
-            BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
-
-            String current_database = context_in_use->getCurrentDatabase();
-
-            /// Checks access rights if this is ON CLUSTER query.
-            /// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.)
-            ClusterPtr cluster;
-            if (on_cluster)
-            {
-                restore_query->cluster = context_in_use->getMacros()->expand(restore_query->cluster);
-                cluster = context_in_use->getCluster(restore_query->cluster);
-                restore_settings.cluster_host_ids = cluster->getHostIDs();
-
-                /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect
-                /// because different replicas can contain different set of tables and so the required access rights can differ too.
-                /// So the right way is pass through the entire cluster and check access for each host.
-                auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num);
-                for (const auto * address : addresses)
-                {
-                    restore_settings.host_id = address->toString();
-                    auto restore_elements = restore_query->elements;
-                    String addr_database = address->default_database.empty() ? current_database : address->default_database;
-                    for (auto & element : restore_elements)
-                        element.setCurrentDatabase(addr_database);
-                    RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context_in_use};
-                    dummy_restorer.run(RestorerFromBackup::CHECK_ACCESS_ONLY);
-                }
-            }
-
-            /// Make a restore coordination.
-            if (on_cluster && restore_settings.coordination_zk_path.empty())
-            {
-                String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
-                restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid);
-            }
-
-            if (!restore_settings.coordination_zk_path.empty())
-            {
-                restore_coordination = std::make_shared<RestoreCoordinationRemote>(
-                    restore_settings.coordination_zk_path,
-                    [global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); });
-            }
-            else
-            {
-                restore_coordination = std::make_shared<RestoreCoordinationLocal>();
-            }
-
-            /// Do RESTORE.
-            if (on_cluster)
-            {
-
-                DDLQueryOnClusterParams params;
-                params.cluster = cluster;
-                params.only_shard_num = restore_settings.shard_num;
-                params.only_replica_num = restore_settings.replica_num;
-                restore_settings.copySettingsToQuery(*restore_query);
-
-                // executeDDLQueryOnCluster() will return without waiting for completion
-                context_in_use->setSetting("distributed_ddl_task_timeout", Field{0});
-                context_in_use->setSetting("distributed_ddl_output_mode", Field{"none"});
-
-                executeDDLQueryOnCluster(restore_query, context_in_use, params);
-
-                /// Wait until all the hosts have written their backup entries.
-                auto all_hosts = BackupSettings::Util::filterHostIDs(
-                    restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
-                restore_coordination->waitStatus(all_hosts, kCompletedCoordinationStatus);
-            }
-            else
-            {
-                restore_query->setCurrentDatabase(current_database);
-
-                /// Restore metadata and prepare data restoring tasks.
-                DataRestoreTasks data_restore_tasks;
-                {
-                    RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination,
-                                                backup, context_in_use};
-                    data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE);
-                }
-
-                /// Execute the data restoring tasks.
-                restoreTablesData(std::move(data_restore_tasks), restores_thread_pool);
-
-                /// We have restored everything, we need to tell other hosts (they could be waiting for it).
-                restore_coordination->setStatus(restore_settings.host_id, kCompletedCoordinationStatus, "");
-            }
-
-            setStatus(restore_uuid, BackupStatus::RESTORED);
-        }
-        catch (...)
-        {
-            /// Something bad happened, the backup has not built.
-            setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE);
-            sendErrorToCoordination(restore_coordination, restore_settings.host_id);
-            if (!async)
-                throw;
-        }
-    };
-
-    if (restore_settings.async)
-        backups_thread_pool.scheduleOrThrowOnError([job]() mutable { job(true); });
-    else
-        job(false);
-
-    return restore_uuid;
+        /// Something bad happened, the restore has failed.
+        /// This handler is reached for failures in this function and for exceptions rethrown by a synchronous doRestore().
+        /// NOTE(review): it is also reached when addInfo() rejects `restore_id` because the ID is already in use;
+        /// confirm that setStatusSafe() cannot then mark that other, still running operation as failed.
+        /// `restore_coordination` is null here for non-internal restores.
+        setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED);
+        sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id);
+        throw;
+    }
 }


-void BackupsWorker::addInfo(const UUID & uuid, const String & backup_name, BackupStatus status, bool internal)
+void BackupsWorker::doRestore(
+    const std::shared_ptr<ASTBackupQuery> & restore_query,
+    const OperationID & restore_id,
+    RestoreSettings restore_settings,
+    const BackupInfo & backup_info,
+    std::shared_ptr<IRestoreCoordination> restore_coordination,
+    ContextMutablePtr context,
+    bool called_async)
+{
+    std::optional<CurrentThread::QueryScope> query_scope;
+    try
+    {
+        if (called_async)
+        {
+            query_scope.emplace(context);
+            setThreadName("RestoreWorker");
+        }
+
+        /// Open the backup for reading.
+        BackupFactory::CreateParams backup_open_params;
+        backup_open_params.open_mode = IBackup::OpenMode::READ;
+        backup_open_params.context = context;
+        backup_open_params.backup_info = backup_info;
+        backup_open_params.base_backup_info = restore_settings.base_backup_info;
+        backup_open_params.password = restore_settings.password;
+        BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
+
+        setNumFilesAndSize(restore_id, backup->getNumFiles(), backup->getUncompressedSize(), backup->getCompressedSize());
+
+        String current_database = context->getCurrentDatabase();
+
+        /// Checks access rights if this is ON CLUSTER query.
+        /// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.)
+        ClusterPtr cluster;
+        bool on_cluster = !restore_query->cluster.empty();
+        if (on_cluster)
+        {
+            restore_query->cluster = context->getMacros()->expand(restore_query->cluster);
+            cluster = context->getCluster(restore_query->cluster);
+            restore_settings.cluster_host_ids = cluster->getHostIDs();
+
+            /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect
+            /// because different replicas can contain different set of tables and so the required access rights can differ too.
+            /// So the right way is pass through the entire cluster and check access for each host.
+            auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num);
+            for (const auto * address : addresses)
+            {
+                restore_settings.host_id = address->toString();
+                auto restore_elements = restore_query->elements;
+                String addr_database = address->default_database.empty() ? current_database : address->default_database;
+                for (auto & element : restore_elements)
+                    element.setCurrentDatabase(addr_database);
+                RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context};
+                dummy_restorer.run(RestorerFromBackup::CHECK_ACCESS_ONLY);
+            }
+        }
+
+        /// Make a restore coordination.
+        if (on_cluster && restore_settings.coordination_zk_path.empty())
+        {
+            String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
+            restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(UUIDHelpers::generateV4());
+        }
+
+        if (!restore_coordination)
+            restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal);
+
+        /// Do RESTORE.
+        if (on_cluster)
+        {
+
+            DDLQueryOnClusterParams params;
+            params.cluster = cluster;
+            params.only_shard_num = restore_settings.shard_num;
+            params.only_replica_num = restore_settings.replica_num;
+            restore_settings.copySettingsToQuery(*restore_query);
+
+            // executeDDLQueryOnCluster() will return without waiting for completion
+            context->setSetting("distributed_ddl_task_timeout", Field{0});
+            context->setSetting("distributed_ddl_output_mode", Field{"none"});
+
+            executeDDLQueryOnCluster(restore_query, context, params);
+
+            /// Wait until all the hosts have written their backup entries.
+            auto all_hosts = BackupSettings::Util::filterHostIDs(
+                restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
+            restore_coordination->waitForStage(all_hosts, Stage::COMPLETED);
+        }
+        else
+        {
+            restore_query->setCurrentDatabase(current_database);
+
+            /// Restore metadata and prepare data restoring tasks.
+            DataRestoreTasks data_restore_tasks;
+            {
+                RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination,
+                                            backup, context};
+                data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE);
+            }
+
+            /// Execute the data restoring tasks.
+            restoreTablesData(std::move(data_restore_tasks), restores_thread_pool);
+
+            /// We have restored everything, we need to tell other hosts (they could be waiting for it).
+            restore_coordination->setStage(restore_settings.host_id, Stage::COMPLETED, "");
+        }
+
+        LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString());
+        setStatus(restore_id, BackupStatus::RESTORED);
+    }
+    catch (...)
+    {
+        /// Something bad happened, the restore has failed.
+        if (called_async)
+        {
+            tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()));
+            setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED);
+            sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id);
+        }
+        else
+        {
+            /// setStatus() and sendCurrentExceptionToCoordination() will be called by startRestoring().
+            throw;
+        }
+    }
+}
+
+
+void BackupsWorker::addInfo(const OperationID & id, const String & name, bool internal, BackupStatus status)  /// Registers an operation under `id` with the given initial status; throws BAD_ARGUMENTS if `id` belongs to an unfinished operation.
 {
    Info info;
-    info.uuid = uuid;
-    info.backup_name = backup_name;
-    info.status = status;
-    info.status_changed_time = time(nullptr);
+    info.id = id;
+    info.name = name;
    info.internal = internal;
+    info.status = status;
+    info.start_time = std::chrono::system_clock::now();
+
+    if (isFinalStatus(status))
+        info.end_time = info.start_time;  /// Registered with a final status: the end time equals the start time.
+
    std::lock_guard lock{infos_mutex};
-    infos[uuid] = std::move(info);
+
+    auto it = infos.find(id);
+    if (it != infos.end())
+    {
+        /// Don't allow overwriting an entry whose operation is still in progress; an ID can be reused only after its previous operation has reached a final status.
+        auto current_status = it->second.status;
+        if (!isFinalStatus(current_status))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot start a backup or restore: ID {} is already in use", id);
+    }
+
+    infos[id] = std::move(info);
+
+    num_active_backups += getNumActiveBackupsChange(status);  /// NOTE(review): presumably non-zero only for "in progress" statuses - confirm in the helper.
+    num_active_restores += getNumActiveRestoresChange(status);
 }

-void BackupsWorker::setStatus(const UUID & uuid, BackupStatus status)
+
+/// Changes the status of the operation registered under `id` and wakes up every thread blocked in wait().
+/// For a final status the end time is recorded; for an error status the exception being handled at the moment
+/// is stored (so an error status is expected to be set from inside a `catch` block).
+/// If `id` is unknown: throws LOGICAL_ERROR when `throw_if_error` is true, otherwise does nothing.
+void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw_if_error)
 {
    std::lock_guard lock{infos_mutex};
-    auto & info = infos.at(uuid);
-    info.status = status;
-    info.status_changed_time = time(nullptr);
-
-    if (status == BackupStatus::BACKUP_COMPLETE)
+    auto it = infos.find(id);
+    if (it == infos.end())
    {
-        LOG_INFO(log, "{} {} was created successfully", (info.internal ? "Internal backup" : "Backup"), info.backup_name);
-    }
-    else if (status == BackupStatus::RESTORED)
-    {
-        LOG_INFO(log, "Restored from {} {} successfully", (info.internal ? "internal backup" : "backup"), info.backup_name);
-    }
-    else if ((status == BackupStatus::FAILED_TO_BACKUP) || (status == BackupStatus::FAILED_TO_RESTORE))
-    {
-        String start_of_message;
-        if (status == BackupStatus::FAILED_TO_BACKUP)
-            start_of_message = fmt::format("Failed to create {} {}", (info.internal ? "internal backup" : "backup"), info.backup_name);
+        if (throw_if_error)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
        else
-            start_of_message = fmt::format("Failed to restore from {} {}", (info.internal ? "internal backup" : "backup"), info.backup_name);
-        tryLogCurrentException(log, start_of_message);
+            return;
+    }

+    auto & info = it->second;
+    auto old_status = info.status;
+
+    info.status = status;
+
+    if (isFinalStatus(status))
+        info.end_time = std::chrono::system_clock::now();
+
+    if (isErrorStatus(status))
+    {
        info.error_message = getCurrentExceptionMessage(false);
        info.exception = std::current_exception();
    }
+
+    /// Apply the difference between the contributions of the new and the old status to the counters.
+    num_active_backups += getNumActiveBackupsChange(status) - getNumActiveBackupsChange(old_status);
+    num_active_restores += getNumActiveRestoresChange(status) - getNumActiveRestoresChange(old_status);
+
+    /// wait() sleeps on `status_changed` and re-checks the status only after being woken up,
+    /// so without this notification a waiter could stay blocked after the operation has finished.
+    status_changed.notify_all();
 }


-void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_exception)
+void BackupsWorker::setNumFilesAndSize(const String & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size)  /// Stores the file count and both sizes in the Info registered under `id`.
+{
+    std::lock_guard lock{infos_mutex};
+    auto it = infos.find(id);
+    if (it == infos.end())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);  /// Unlike setStatus() there is no non-throwing mode here.
+
+    auto & info = it->second;
+    info.num_files = num_files;
+    info.uncompressed_size = uncompressed_size;
+    info.compressed_size = compressed_size;
+}
+
+
+void BackupsWorker::wait(const OperationID & id, bool rethrow_exception)  /// Blocks until operation `id` reaches a final status; throws LOGICAL_ERROR if `id` is unknown.
 {
    std::unique_lock lock{infos_mutex};
    status_changed.wait(lock, [&]  /// The predicate runs with infos_mutex held and is re-evaluated each time status_changed wakes this thread up.
    {
-        auto it = infos.find(backup_or_restore_uuid);
+        auto it = infos.find(id);
        if (it == infos.end())
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "BackupsWorker: Unknown UUID {}", toString(backup_or_restore_uuid));
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
        const auto & info = it->second;
        auto current_status = info.status;
-        if (rethrow_exception && ((current_status == BackupStatus::FAILED_TO_BACKUP) || (current_status == BackupStatus::FAILED_TO_RESTORE)))
+        if (rethrow_exception && isErrorStatus(current_status))
            std::rethrow_exception(info.exception);  /// Rethrows the exception stored in the Info, out of wait() itself.
-        return (current_status == BackupStatus::BACKUP_COMPLETE) || (current_status == BackupStatus::RESTORED);
+        return isFinalStatus(current_status);
    });
 }

-BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid) const
+BackupsWorker::Info BackupsWorker::getInfo(const OperationID & id) const  /// Returns a copy of the Info registered under `id`; throws LOGICAL_ERROR if `id` is unknown.
 {
    std::lock_guard lock{infos_mutex};
-    auto it = infos.find(backup_or_restore_uuid);
+    auto it = infos.find(id);
    if (it == infos.end())
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "BackupsWorker: Unknown UUID {}", toString(backup_or_restore_uuid));
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id);
    return it->second;
 }

@ -451,20 +665,24 @@ std::vector<BackupsWorker::Info> BackupsWorker::getAllInfos() const
    std::vector<Info> res_infos;
    std::lock_guard lock{infos_mutex};
    for (const auto & info : infos | boost::adaptors::map_values)
-        res_infos.push_back(info);
+    {
+        if (!info.internal)
+            res_infos.push_back(info);
+    }
    return res_infos;
 }

 void BackupsWorker::shutdown()  /// Waits for both thread pools; the two log messages appear only if something was active on entry.
 {
-    size_t num_active_backups = backups_thread_pool.active();
-    size_t num_active_restores = restores_thread_pool.active();
-    if (!num_active_backups && !num_active_restores)
-        return;
-    LOG_INFO(log, "Waiting for {} backup and {} restore tasks to be finished", num_active_backups, num_active_restores);
+    bool has_active_backups_and_restores = (num_active_backups || num_active_restores);  /// True if at least one backup OR restore is active (despite the "and" in the name).
+    if (has_active_backups_and_restores)
+        LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores);
+
    backups_thread_pool.wait();
    restores_thread_pool.wait();
-    LOG_INFO(log, "All backup and restore tasks have finished");
+
+    if (has_active_backups_and_restores)
+        LOG_INFO(log, "All backup and restore tasks have finished");
 }

 }
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -11,6 +11,13 @@ namespace Poco::Util { class AbstractConfiguration; }

 namespace DB
 {
+class ASTBackupQuery;
+struct BackupSettings;
+struct RestoreSettings;
+struct BackupInfo;
+class IBackupCoordination;
+class IRestoreCoordination;
+
 /// Manager of backups and restores: executes backups and restores' threads in the background.
 /// Keeps information about backups and restores started in this session.
 class BackupsWorker
@ -21,47 +28,75 @@ public:
    /// Waits until all tasks have been completed.
    void shutdown();

-    /// Starts executing a BACKUP or RESTORE query. Returns UUID of the operation.
-    UUID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
+    /// Backup's or restore's operation ID, can be either passed via SETTINGS id=... or be randomly generated UUID.
+    using OperationID = String;
+
+    /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
+    OperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);

    /// Waits until a BACKUP or RESTORE query started by start() is finished.
    /// The function returns immediately if the operation is already finished.
-    void wait(const UUID & backup_or_restore_uuid, bool rethrow_exception = true);
+    void wait(const OperationID & backup_or_restore_id, bool rethrow_exception = true);

    /// Information about executing a BACKUP or RESTORE query started by calling start().
    struct Info
    {
-        UUID uuid;
+        /// Backup's or restore's operation ID, can be either passed via SETTINGS id=... or be randomly generated UUID.
+        OperationID id;

        /// Backup's name, a string like "Disk('backups', 'my_backup')"
-        String backup_name;
+        String name;

-        BackupStatus status;
-        time_t status_changed_time;
-
-        String error_message;
-        std::exception_ptr exception;
-
-        /// Whether this operation is internal, i.e. caused by another BACKUP or RESTORE operation.
-        /// For example BACKUP ON CLUSTER executes an internal BACKUP commands per each node.
+        /// This operation is internal and should not be shown in system.backups
        bool internal = false;
+
+        /// Status of backup or restore operation.
+        BackupStatus status;
+
+        /// Number of files in the backup (including backup's metadata; only unique files are counted).
+        size_t num_files = 0;
+
+        /// Size of all files in the backup (including backup's metadata; only unique files are counted).
+        UInt64 uncompressed_size = 0;
+
+        /// Size of the backup if it's stored as an archive; or the same as `uncompressed_size` if the backup is stored as a folder.
+        UInt64 compressed_size = 0;
+
+        /// Set only if there was an error.
+        std::exception_ptr exception;
+        String error_message;
+
+        std::chrono::system_clock::time_point start_time;  /// Time at which the operation was registered.
+        std::chrono::system_clock::time_point end_time;  /// Time at which the operation reached a final status; left default-constructed until then.
    };

-    Info getInfo(const UUID & backup_or_restore_uuid) const;
+    Info getInfo(const OperationID & id) const;
    std::vector<Info> getAllInfos() const;

 private:
-    UUID startMakingBackup(const ASTPtr & query, const ContextPtr & context);
-    UUID startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    OperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context);

-    void addInfo(const UUID & uuid, const String & backup_name, BackupStatus status, bool internal);
-    void setStatus(const UUID & uuid, BackupStatus status);
+    void doBackup(const std::shared_ptr<ASTBackupQuery> & backup_query, const OperationID & backup_id, BackupSettings backup_settings,
+                  const BackupInfo & backup_info, std::shared_ptr<IBackupCoordination> backup_coordination, const ContextPtr & context,
+                  ContextMutablePtr mutable_context, bool called_async);
+
+    OperationID startRestoring(const ASTPtr & query, ContextMutablePtr context);
+
+    void doRestore(const std::shared_ptr<ASTBackupQuery> & restore_query, const OperationID & restore_id, RestoreSettings restore_settings, const BackupInfo & backup_info,
+                   std::shared_ptr<IRestoreCoordination> restore_coordination, ContextMutablePtr context, bool called_async);
+
+    void addInfo(const OperationID & id, const String & name, bool internal, BackupStatus status);
+    void setStatus(const OperationID & id, BackupStatus status, bool throw_if_error = true);
+    void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
+    void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size);

    ThreadPool backups_thread_pool;
    ThreadPool restores_thread_pool;

-    std::unordered_map<UUID, Info> infos;
+    std::unordered_map<OperationID, Info> infos;
    std::condition_variable status_changed;
+    std::atomic<size_t> num_active_backups = 0;
+    std::atomic<size_t> num_active_restores = 0;
    mutable std::mutex infos_mutex;
    Poco::Logger * log;
 };
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@ -36,6 +36,15 @@ public:
    /// Returns UUID of the backup.
    virtual UUID getUUID() const = 0;

+    /// Returns the number of unique files in the backup.
+    virtual size_t getNumFiles() const = 0;
+
+    /// Returns the total size of unique files in the backup.
+    virtual UInt64 getUncompressedSize() const = 0;
+
+    /// Returns the compressed size of the backup. If the backup is not stored as an archive it returns the same as getUncompressedSize().
+    virtual UInt64 getCompressedSize() const = 0;
+
    /// Returns names of entries stored in a specified directory in the backup.
    /// If `directory` is empty or '/' the function returns entries in the backup's root.
    virtual Strings listFiles(const String & directory, bool recursive = false) const = 0;
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@ -18,11 +18,11 @@ class IBackupCoordination
 public:
    virtual ~IBackupCoordination() = default;

-    /// Sets the current status and waits for other hosts to come to this status too.
-    virtual void setStatus(const String & current_host, const String & new_status, const String & message) = 0;
-    virtual void setErrorStatus(const String & current_host, const Exception & exception) = 0;
-    virtual Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) = 0;
-    virtual Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) = 0;
+    /// Sets the current stage and waits for other hosts to come to this stage too.
+    virtual void setStage(const String & current_host, const String & new_stage, const String & message) = 0;
+    virtual void setError(const String & current_host, const Exception & exception) = 0;
+    virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) = 0;
+    virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;

    struct PartNameAndChecksum
    {
@ -115,9 +115,6 @@ public:

    /// Returns the list of all the archive suffixes which were generated.
    virtual Strings getAllArchiveSuffixes() const = 0;
-
-    /// Removes remotely stored information.
-    virtual void drop() {}
 };

 }
--- a/src/Backups/IRestoreCoordination.h
+++ b/src/Backups/IRestoreCoordination.h
@ -16,11 +16,11 @@ class IRestoreCoordination
 public:
    virtual ~IRestoreCoordination() = default;

-    /// Sets the current status and waits for other hosts to come to this status too.
-    virtual void setStatus(const String & current_host, const String & new_status, const String & message) = 0;
-    virtual void setErrorStatus(const String & current_host, const Exception & exception) = 0;
-    virtual Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) = 0;
-    virtual Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) = 0;
+    /// Sets the current stage and waits for other hosts to come to this stage too.
+    virtual void setStage(const String & current_host, const String & new_stage, const String & message) = 0;
+    virtual void setError(const String & current_host, const Exception & exception) = 0;
+    virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) = 0;
+    virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;

    static constexpr const char * kErrorStatus = "error";

@ -34,9 +34,6 @@ public:
    /// Sets that this replica is going to restore a ReplicatedAccessStorage.
    /// The function returns false if this access storage is being already restored by another replica.
    virtual bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) = 0;
-
-    /// Removes remotely stored information.
-    virtual void drop() {}
 };

 }
--- a/src/Backups/RestoreCoordinationLocal.cpp
+++ b/src/Backups/RestoreCoordinationLocal.cpp
@ -7,20 +7,20 @@ namespace DB
 RestoreCoordinationLocal::RestoreCoordinationLocal() = default;
 RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;

-void RestoreCoordinationLocal::setStatus(const String &, const String &, const String &)
+void RestoreCoordinationLocal::setStage(const String &, const String &, const String &)  /// No-op: all arguments are ignored.
 {
 }

-void RestoreCoordinationLocal::setErrorStatus(const String &, const Exception &)
+void RestoreCoordinationLocal::setError(const String &, const Exception &)  /// No-op: the host and the exception are ignored.
 {
 }

-Strings RestoreCoordinationLocal::waitStatus(const Strings &, const String &)
+Strings RestoreCoordinationLocal::waitForStage(const Strings &, const String &)  /// Does not wait: ignores its arguments and returns an empty list.
 {
    return {};
 }

-Strings RestoreCoordinationLocal::waitStatusFor(const Strings &, const String &, UInt64)
+Strings RestoreCoordinationLocal::waitForStage(const Strings &, const String &, std::chrono::milliseconds)  /// Does not wait: the timeout is ignored too and an empty list is returned.
 {
    return {};
 }
--- a/src/Backups/RestoreCoordinationLocal.h
+++ b/src/Backups/RestoreCoordinationLocal.h
@ -18,11 +18,11 @@ public:
    RestoreCoordinationLocal();
    ~RestoreCoordinationLocal() override;

-    /// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts.
-    void setStatus(const String & current_host, const String & new_status, const String & message) override;
-    void setErrorStatus(const String & current_host, const Exception & exception) override;
-    Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
-    Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
+    /// Sets the current stage and waits for other hosts to come to this stage too.
+    void setStage(const String & current_host, const String & new_stage, const String & message) override;
+    void setError(const String & current_host, const Exception & exception) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
--- a/src/Backups/RestoreCoordinationRemote.cpp
+++ b/src/Backups/RestoreCoordinationRemote.cpp
@ -6,57 +6,86 @@
 namespace DB
 {

-RestoreCoordinationRemote::RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_)
+RestoreCoordinationRemote::RestoreCoordinationRemote(
+    const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_)  /// The flag decides whether the destructor removes the nodes at zookeeper_path.
    : zookeeper_path(zookeeper_path_)
    , get_zookeeper(get_zookeeper_)
-    , status_sync(zookeeper_path_ + "/status", get_zookeeper_, &Poco::Logger::get("RestoreCoordination"))
+    , remove_zk_nodes_in_destructor(remove_zk_nodes_in_destructor_)
 {
    createRootNodes();  /// The coordination nodes are created before stage_sync is constructed.
+
+    stage_sync.emplace(  /// stage_sync obtains ZooKeeper through getZooKeeper(), i.e. it shares the session cached in this object.
+        zookeeper_path_ + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("RestoreCoordination"));
 }

-RestoreCoordinationRemote::~RestoreCoordinationRemote() = default;
+RestoreCoordinationRemote::~RestoreCoordinationRemote()  /// Removes the coordination nodes when remove_zk_nodes_in_destructor is set.
+{
+    try
+    {
+        if (remove_zk_nodes_in_destructor)
+            removeAllNodes();
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);  /// A failed cleanup is only logged so that no exception escapes the destructor.
+    }
+}
+
+zkutil::ZooKeeperPtr RestoreCoordinationRemote::getZooKeeper() const  /// Returns the cached ZooKeeper session, replacing it via get_zookeeper() if it is missing or expired.
+{
+    std::lock_guard lock{mutex};  /// Guards the cached `zookeeper` pointer.
+    if (!zookeeper || zookeeper->expired())
+    {
+        zookeeper = get_zookeeper();
+
+        /// It's possible that we connected to different [Zoo]Keeper instance
+        /// so we may read a bit stale state.
+        zookeeper->sync(zookeeper_path);
+    }
+    return zookeeper;
+}

 void RestoreCoordinationRemote::createRootNodes()  /// Creates zookeeper_path (with its ancestors) and the three "*_acquired" child nodes via createIfNotExists().
 {
-    auto zookeeper = get_zookeeper();
-    zookeeper->createAncestors(zookeeper_path);
-    zookeeper->createIfNotExists(zookeeper_path, "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", "");
-    zookeeper->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", "");
+    auto zk = getZooKeeper();
+    zk->createAncestors(zookeeper_path);
+    zk->createIfNotExists(zookeeper_path, "");
+    zk->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", "");
+    zk->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", "");
+    zk->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", "");
 }


-void RestoreCoordinationRemote::setStatus(const String & current_host, const String & new_status, const String & message)
+void RestoreCoordinationRemote::setStage(const String & current_host, const String & new_stage, const String & message)
 {
-    status_sync.set(current_host, new_status, message);
+    stage_sync->set(current_host, new_stage, message);  /// Delegates to stage_sync, which the constructor always emplaces.
 }

-void RestoreCoordinationRemote::setErrorStatus(const String & current_host, const Exception & exception)
+void RestoreCoordinationRemote::setError(const String & current_host, const Exception & exception)
 {
-    status_sync.setError(current_host, exception);
+    stage_sync->setError(current_host, exception);  /// Delegates to stage_sync.
 }

-Strings RestoreCoordinationRemote::waitStatus(const Strings & all_hosts, const String & status_to_wait)
+Strings RestoreCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait)
 {
-    return status_sync.wait(all_hosts, status_to_wait);
+    return stage_sync->wait(all_hosts, stage_to_wait);  /// Delegates to stage_sync and returns its result unchanged.
 }

-Strings RestoreCoordinationRemote::waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms)
+Strings RestoreCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout)
 {
-    return status_sync.waitFor(all_hosts, status_to_wait, timeout_ms);
+    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);  /// Timed variant: delegates to stage_sync->waitFor().
 }


 bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();

    String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
-    zookeeper->createIfNotExists(path, "");
+    zk->createIfNotExists(path, "");

    path += "/" + escapeForFileName(table_name);
-    auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent);
+    auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
    if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
        throw zkutil::KeeperException(code, path);

@ -65,10 +94,10 @@ bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const S

 bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();

    String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
-    auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent);
+    auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
    if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
        throw zkutil::KeeperException(code, path);

@ -77,10 +106,10 @@ bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const St

 bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
 {
-    auto zookeeper = get_zookeeper();
+    auto zk = getZooKeeper();

    String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
-    auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent);
+    auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
    if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
        throw zkutil::KeeperException(code, path);

@ -89,13 +118,15 @@ bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & ac

 void RestoreCoordinationRemote::removeAllNodes()  /// Recursively removes zookeeper_path and everything below it.
 {
-    auto zookeeper = get_zookeeper();
-    zookeeper->removeRecursive(zookeeper_path);
-}
+    /// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore.
+    ///
+    /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
+    /// while some hosts are still restoring something. Removing all the nodes will remove the parent node of the restore coordination
+    /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part
+    /// of their restore work before that.

-void RestoreCoordinationRemote::drop()
-{
-    removeAllNodes();
+    auto zk = getZooKeeper();
+    zk->removeRecursive(zookeeper_path);
 }

 }
--- a/src/Backups/RestoreCoordinationRemote.h
+++ b/src/Backups/RestoreCoordinationRemote.h
@ -1,7 +1,7 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
-#include <Backups/BackupCoordinationStatusSync.h>
+#include <Backups/BackupCoordinationStageSync.h>


 namespace DB
@ -11,14 +11,14 @@ namespace DB
 class RestoreCoordinationRemote : public IRestoreCoordination
 {
 public:
-    RestoreCoordinationRemote(const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper);
+    RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_);
    ~RestoreCoordinationRemote() override;

-    /// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts.
-    void setStatus(const String & current_host, const String & new_status, const String & message) override;
-    void setErrorStatus(const String & current_host, const Exception & exception) override;
-    Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
-    Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
+    /// Sets the current stage and waits for other hosts to come to this stage too.
+    void setStage(const String & current_host, const String & new_stage, const String & message) override;
+    void setError(const String & current_host, const Exception & exception) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override;
+    Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -31,10 +31,8 @@ public:
    /// The function returns false if this access storage is being already restored by another replica.
    bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override;

-    /// Removes remotely stored information.
-    void drop() override;
-
 private:
+    zkutil::ZooKeeperPtr getZooKeeper() const;
    void createRootNodes();
    void removeAllNodes();

@ -42,7 +40,12 @@ private:

    const String zookeeper_path;
    const zkutil::GetZooKeeper get_zookeeper;
-    BackupCoordinationStatusSync status_sync;
+    const bool remove_zk_nodes_in_destructor;
+
+    std::optional<BackupCoordinationStageSync> stage_sync;
+
+    mutable std::mutex mutex;
+    mutable zkutil::ZooKeeperPtr zookeeper;
 };

 }
--- a/src/Backups/RestoreSettings.cpp
+++ b/src/Backups/RestoreSettings.cpp
@ -143,6 +143,7 @@ namespace

 /// List of restore settings except base_backup_name and cluster_host_ids.
 #define LIST_OF_RESTORE_SETTINGS(M) \
+    M(String, id) \
    M(String, password) \
    M(Bool, structure_only) \
    M(RestoreTableCreationMode, create_table) \
--- a/src/Backups/RestoreSettings.h
+++ b/src/Backups/RestoreSettings.h
@ -41,6 +41,9 @@ using RestoreUDFCreationMode = RestoreAccessCreationMode;
 /// Settings specified in the "SETTINGS" clause of a RESTORE query.
 struct RestoreSettings
 {
+    /// ID of the restore operation, to identify it in the system.backups table. Auto-generated if not set.
+    String id;
+
    /// Base backup, with this setting we can override the location of the base backup while restoring.
    /// Any incremental backup keeps inside the information about its base backup, so using this setting is optional.
    std::optional<BackupInfo> base_backup_info;
--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@ -1,5 +1,6 @@
 #include <Backups/RestorerFromBackup.h>
 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupCoordinationStage.h>
 #include <Backups/BackupSettings.h>
 #include <Backups/IBackup.h>
 #include <Backups/IBackupEntry.h>
@ -38,20 +39,10 @@ namespace ErrorCodes
 }


+namespace Stage = BackupCoordinationStage;
+
 namespace
 {
-    /// Finding databases and tables in the backup which we're going to restore.
-    constexpr const char * kFindingTablesInBackupStatus = "finding tables in backup";
-
-    /// Creating databases or finding them and checking their definitions.
-    constexpr const char * kCreatingDatabasesStatus = "creating databases";
-
-    /// Creating tables or finding them and checking their definition.
-    constexpr const char * kCreatingTablesStatus = "creating tables";
-
-    /// Inserting restored data to tables.
-    constexpr const char * kInsertingDataToTablesStatus = "inserting data to tables";
-
    /// Uppercases the first character of a passed string.
    String toUpperFirst(const String & str)
    {
@ -102,6 +93,7 @@ RestorerFromBackup::RestorerFromBackup(
    , restore_coordination(restore_coordination_)
    , backup(backup_)
    , context(context_)
+    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000))
    , log(&Poco::Logger::get("RestorerFromBackup"))
 {
@ -112,7 +104,7 @@ RestorerFromBackup::~RestorerFromBackup() = default;
 RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode)
 {
    /// run() can be called only once.
-    if (!current_status.empty())
+    if (!current_stage.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");

    /// Find other hosts working along with us to execute this ON CLUSTER query.
@ -126,7 +118,7 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode)
    findRootPathsInBackup();

    /// Find all the databases and tables which we will read from the backup.
-    setStatus(kFindingTablesInBackupStatus);
+    setStage(Stage::FINDING_TABLES_IN_BACKUP);
    findDatabasesAndTablesInBackup();

    /// Check access rights.
@ -136,27 +128,31 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode)
        return {};

    /// Create databases using the create queries read from the backup.
-    setStatus(kCreatingDatabasesStatus);
+    setStage(Stage::CREATING_DATABASES);
    createDatabases();

    /// Create tables using the create queries read from the backup.
-    setStatus(kCreatingTablesStatus);
+    setStage(Stage::CREATING_TABLES);
    createTables();

    /// All that's left is to insert data into tables.
    /// No more data restoring tasks are allowed after this point.
-    setStatus(kInsertingDataToTablesStatus);
+    setStage(Stage::INSERTING_DATA_TO_TABLES);
    return getDataRestoreTasks();
 }

-void RestorerFromBackup::setStatus(const String & new_status, const String & message)
+void RestorerFromBackup::setStage(const String & new_stage, const String & message) /// Records the stage locally, then (if a coordination object is set) publishes it for this host and calls waitForStage() for `all_hosts`.
 {
-    LOG_TRACE(log, "{}", toUpperFirst(new_status));
-    current_status = new_status;
+    LOG_TRACE(log, "{}", toUpperFirst(new_stage)); /// The stage name itself is the log message, with its first character uppercased.
+    current_stage = new_stage; /// Read by run() and addDataRestoreTask(s)() to reject calls made at the wrong time.
+
    if (restore_coordination)
    {
-        restore_coordination->setStatus(restore_settings.host_id, new_status, message);
-        restore_coordination->waitStatus(all_hosts, new_status);
+        restore_coordination->setStage(restore_settings.host_id, new_stage, message);
+        if (new_stage == Stage::FINDING_TABLES_IN_BACKUP) /// Appears to be the first stage run() sets, hence the dedicated `backups.on_cluster_first_sync_timeout`.
+            restore_coordination->waitForStage(all_hosts, new_stage, on_cluster_first_sync_timeout);
+        else
+            restore_coordination->waitForStage(all_hosts, new_stage); /// No explicit timeout is passed for the later stages.
    }
 }

@ -814,14 +810,14 @@ std::vector<QualifiedTableName> RestorerFromBackup::findTablesWithoutDependencie

 void RestorerFromBackup::addDataRestoreTask(DataRestoreTask && new_task)
 {
-    if (current_status == kInsertingDataToTablesStatus)
+    if (current_stage == Stage::INSERTING_DATA_TO_TABLES) /// run() sets this stage right before returning the task list, so the list must not grow afterwards.
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed");
    data_restore_tasks.push_back(std::move(new_task));
 }

 void RestorerFromBackup::addDataRestoreTasks(DataRestoreTasks && new_tasks)
 {
-    if (current_status == kInsertingDataToTablesStatus)
+    if (current_stage == Stage::INSERTING_DATA_TO_TABLES) /// Same guard as in addDataRestoreTask(): no new tasks once run() has reached its last stage.
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed");
    insertAtEnd(data_restore_tasks, std::move(new_tasks));
 }
--- a/src/Backups/RestorerFromBackup.h
+++ b/src/Backups/RestorerFromBackup.h
@ -73,6 +73,7 @@ private:
    std::shared_ptr<IRestoreCoordination> restore_coordination;
    BackupPtr backup;
    ContextMutablePtr context;
+    std::chrono::milliseconds on_cluster_first_sync_timeout;
    std::chrono::milliseconds create_table_timeout;
    Poco::Logger * log;

@ -100,7 +101,7 @@ private:

    DataRestoreTasks getDataRestoreTasks();

-    void setStatus(const String & new_status, const String & message = "");
+    void setStage(const String & new_stage, const String & message = "");

    struct DatabaseInfo
    {
@ -124,7 +125,7 @@ private:

    std::vector<QualifiedTableName> findTablesWithoutDependencies() const;

-    String current_status;
+    String current_stage;
    std::unordered_map<String, DatabaseInfo> database_infos;
    std::map<QualifiedTableName, TableInfo> table_infos;
    std::vector<DataRestoreTask> data_restore_tasks;
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -69,6 +69,7 @@
 #include <IO/CompressionMethod.h>
 #include <Client/InternalTextLogs.h>
 #include <boost/algorithm/string/replace.hpp>
+#include <IO/ForkWriteBuffer.h>


 namespace fs = std::filesystem;
@ -403,7 +404,6 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
        return;

    processed_rows += block.rows();
-
    /// Even if all blocks are empty, we still need to initialize the output stream to write empty resultset.
    initOutputFormat(block, parsed_query);

@ -414,7 +414,7 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
        return;

    /// If results are written INTO OUTFILE, we can avoid clearing progress to avoid flicker.
-    if (need_render_progress && (stdout_is_a_tty || is_interactive) && !select_into_file)
+    if (need_render_progress && (stdout_is_a_tty || is_interactive) && (!select_into_file || select_into_file_and_stdout))
        progress_indication.clearProgressOutput();

    try
@ -434,7 +434,7 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
    /// Restore progress bar after data block.
    if (need_render_progress && (stdout_is_a_tty || is_interactive))
    {
-        if (select_into_file)
+        if (select_into_file && !select_into_file_and_stdout)
            std::cerr << "\r";
        progress_indication.writeProgress();
    }
@ -511,7 +511,7 @@ try
        String current_format = format;

        select_into_file = false;
-
+        select_into_file_and_stdout = false;
        /// The query can specify output format or output file.
        if (const auto * query_with_output = dynamic_cast<const ASTQueryWithOutput *>(parsed_query.get()))
        {
@ -554,6 +554,13 @@ try
                    compression_level
                );

+                if (query_with_output->is_into_outfile_with_stdout)
+                {
+                    select_into_file_and_stdout = true;
+                    out_file_buf = std::make_unique<ForkWriteBuffer>(std::vector<WriteBufferPtr>{std::move(out_file_buf),
+                            std::make_shared<WriteBufferFromFileDescriptor>(STDOUT_FILENO)});
+                }
+
                // We are writing to file, so default format is the same as in non-interactive mode.
                if (is_interactive && is_default_format)
                    current_format = "TabSeparated";
@ -578,7 +585,7 @@ try

        /// It is not clear how to write progress intermixed with data with parallel formatting.
        /// It may increase code complexity significantly.
-        if (!need_render_progress || select_into_file)
+        if (!need_render_progress || (select_into_file && !select_into_file_and_stdout))
            output_format = global_context->getOutputFormatParallelIfPossible(
                current_format, out_file_buf ? *out_file_buf : *out_buf, block);
        else
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@ -181,6 +181,7 @@ protected:

    String format; /// Query results output format.
    bool select_into_file = false; /// If writing result INTO OUTFILE. It affects progress rendering.
+    bool select_into_file_and_stdout = false; /// If writing result INTO OUTFILE AND STDOUT. It affects progress rendering.
    bool is_default_format = true; /// false, if format is set in the config or command line.
    size_t format_max_block_size = 0; /// Max block size for console output.
    String insert_format; /// Format of INSERT data that is read from stdin in batch mode.
--- a/src/Client/QueryFuzzer.cpp
+++ b/src/Client/QueryFuzzer.cpp
@ -329,9 +329,9 @@ void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def)
        case 0:
        {
            const auto r = fuzz_rand() % 3;
-            def.frame_type = r == 0 ? WindowFrame::FrameType::Rows
-                : r == 1 ? WindowFrame::FrameType::Range
-                    : WindowFrame::FrameType::Groups;
+            def.frame_type = r == 0 ? WindowFrame::FrameType::ROWS
+                : r == 1 ? WindowFrame::FrameType::RANGE
+                    : WindowFrame::FrameType::GROUPS;
            break;
        }
        case 1:
@ -385,7 +385,7 @@ void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def)
            break;
    }

-    if (def.frame_type == WindowFrame::FrameType::Range
+    if (def.frame_type == WindowFrame::FrameType::RANGE
        && def.frame_begin_type == WindowFrame::BoundaryType::Unbounded
        && def.frame_begin_preceding
        && def.frame_end_type == WindowFrame::BoundaryType::Current)
--- a/src/Common/ConcurrencyControl.h
+++ b/src/Common/ConcurrencyControl.h
@ -0,0 +1,266 @@
+#pragma once
+
+#include <base/types.h>
+#include <boost/core/noncopyable.hpp>
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdlib>
+#include <limits>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <utility>
+
+#include <Common/Exception.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+}
+
+/*
+ * Controls how many threads can be allocated for a query (or another activity).
+ * There is a limited amount of slots for threads. It can be set with `setMaxConcurrency(limit)`.
+ *
+ * Lifecycle of a slot: free -> granted -> acquired -> free.
+ * free: slot is available to be allocated by any query.
+ * granted: slot is allocated by specific query, but not yet acquired by any thread.
+ * acquired: slot is allocated by specific query and acquired by a thread.
+ *
+ * USAGE:
+ *   1. Create an allocation for a query:
+ *      `auto slots = ConcurrencyControl::instance().allocate(min, max);`
+ *      It will allocate at least `min` and at most `max` slots.
+ *      Note that `min` slots are granted immediately, but other `max - min` may be granted later.
+ *   2. For every thread a slot has to be acquired from that allocation:
+ *      `while (auto slot = slots->tryAcquire()) createYourThread([slot = std::move(slot)] { ... });`
+ *      This snippet can be used at query startup and for upscaling later.
+ * (both functions are non-blocking)
+ *
+ * Released slots are distributed between waiting allocations in a round-robin manner to provide fairness.
+ * Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)`
+ * because `min` amount of slots is allocated for each query unconditionally.
+ */
+class ConcurrencyControl : boost::noncopyable // slot accounting shared by every Allocation created from this object; counters and the waiter list are guarded by `mutex`
+{
+public:
+    struct Allocation;
+    using AllocationPtr = std::shared_ptr<Allocation>;
+    using SlotCount = UInt64;
+    using Waiters = std::list<Allocation *>; // non-owning pointers; entries are erased by schedule() (fully granted) or free() (allocation destroyed)
+
+    static constexpr SlotCount Unlimited = std::numeric_limits<SlotCount>::max(); // initial value of max_concurrency
+
+    // Scoped guard for acquired slot, see Allocation::tryAcquire(). Destroying the guard gives the slot back through Allocation::release().
+    struct Slot : boost::noncopyable
+    {
+        ~Slot()
+        {
+            allocation->release(); // decrements the parent's counter and bumps the allocation's `released`
+        }
+
+    private:
+        friend struct Allocation; // for ctor
+
+        explicit Slot(AllocationPtr && allocation_)
+            : allocation(std::move(allocation_))
+        {}
+
+        AllocationPtr allocation; // owning reference: the Allocation cannot be destroyed while any of its slots is held
+    };
+
+    // FIXME: should be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet
+    using SlotPtr = std::shared_ptr<Slot>;
+
+    // Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max)
+    struct Allocation : std::enable_shared_from_this<Allocation>, boost::noncopyable
+    {
+        ~Allocation()
+        {
+            // We have to lock parent's mutex to avoid race with grant()
+            // NOTE: shortcut can be added, but it requires Allocation::mutex lock even to check if shortcut is possible
+            parent.free(this); // returns not-yet-released slots to the parent and removes this allocation from the waiter list
+        }
+
+        // Take one already granted slot if available. Lock-free iff there is no granted slot. Returns an empty pointer when nothing is granted.
+        [[nodiscard]] SlotPtr tryAcquire()
+        {
+            SlotCount value = granted.load();
+            while (value)
+            {
+                if (granted.compare_exchange_strong(value, value - 1)) // on failure `value` is reloaded with the current count and the loop re-checks it
+                {
+                    std::unique_lock lock{mutex};
+                    return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
+                }
+            }
+            return {}; // avoid unnecessary locking
+        }
+
+        SlotCount grantedCount() const // number of slots granted but not yet acquired at the moment of the call
+        {
+            return granted;
+        }
+
+    private:
+        friend struct Slot; // for release()
+        friend class ConcurrencyControl; // for grant(), free() and ctor
+
+        Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_ = {}) // `waiter_` is only meaningful when granted_ < limit_
+            : parent(parent_)
+            , limit(limit_)
+            , allocated(granted_)
+            , granted(granted_)
+            , waiter(waiter_)
+        {
+            if (allocated < limit)
+                *waiter = this; // fill in the nullptr placeholder that ConcurrencyControl::allocate() inserted into the waiter list
+        }
+
+        auto cancel() // called from ConcurrencyControl::free(); returns {slots not yet released, waiter iterator if still waiting for more}
+        {
+            std::unique_lock lock{mutex};
+            return std::pair{allocated - released,
+                allocated < limit ?
+                    std::optional<Waiters::iterator>(waiter) :
+                    std::optional<Waiters::iterator>()};
+        }
+
+        // Grant single slot to allocation, returns true iff more slot(s) are required. Called from ConcurrencyControl::schedule().
+        bool grant()
+        {
+            std::unique_lock lock{mutex};
+            granted++;
+            allocated++;
+            return allocated < limit;
+        }
+
+        // Release one slot and grant it to other allocation if required
+        void release()
+        {
+            parent.release(1); // the parent's counter is decremented (and waiters rescheduled) before the local bookkeeping below
+            std::unique_lock lock{mutex};
+            released++;
+            if (released > allocated)
+                abort(); // invariant violated: more slots released than were ever allocated
+        }
+
+        ConcurrencyControl & parent;
+        const SlotCount limit; // the `max` passed to ConcurrencyControl::allocate()
+
+        std::mutex mutex; // the following values must be accessed under this mutex
+        SlotCount allocated; // allocated total (including already `released`)
+        SlotCount released = 0;
+
+        std::atomic<SlotCount> granted; // allocated, but not yet acquired
+
+        const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit
+    };
+
+public:
+    ConcurrencyControl()
+        : cur_waiter(waiters.end())
+    {}
+
+    // WARNING: all Allocation objects MUST be destructed before ConcurrencyControl
+    // NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries
+    ~ConcurrencyControl()
+    {
+        if (!waiters.empty())
+            abort(); // an Allocation is still registered as a waiter, i.e. it outlived this object
+    }
+
+    // Allocate at least `min` and at most `max` slots.
+    // If not all `max` slots were successfully allocated, a subscription for later allocation is created
+    // Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread. Throws LOGICAL_ERROR if min > max.
+    [[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max)
+    {
+        if (min > max)
+            throw DB::Exception("ConcurrencyControl: invalid allocation requirements", DB::ErrorCodes::LOGICAL_ERROR);
+
+        std::unique_lock lock{mutex};
+
+        // Acquire as many slots as we can, but not lower than `min`
+        SlotCount granted = std::max(min, std::min(max, available(lock)));
+        cur_concurrency += granted; // may exceed max_concurrency, because `min` is granted unconditionally
+
+        // Create allocation and start waiting if more slots are required
+        if (granted < max)
+            return AllocationPtr(new Allocation(*this, max, granted,
+                waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */)));
+        else
+            return AllocationPtr(new Allocation(*this, max, granted));
+    }
+
+    void setMaxConcurrency(SlotCount value)
+    {
+        std::unique_lock lock{mutex};
+        max_concurrency = std::max<SlotCount>(1, value); // never allow max_concurrency to be zero
+        schedule(lock); // a higher limit may allow pending waiters to be granted right away
+    }
+
+    static ConcurrencyControl & instance()
+    {
+        static ConcurrencyControl result; // function-local static: constructed on first use
+        return result;
+    }
+
+private:
+    friend struct Allocation; // for free() and release()
+
+    void free(Allocation * allocation)
+    {
+        // Allocation is allowed to be canceled even if there are:
+        //  - `amount`: granted slots (acquired slots are not possible, because Slot holds AllocationPtr)
+        //  - `waiter`: active waiting for more slots to be allocated
+        // Thus Allocation destruction may require the following lock, to avoid race conditions
+        std::unique_lock lock{mutex};
+        auto [amount, waiter] = allocation->cancel();
+
+        cur_concurrency -= amount;
+        if (waiter)
+        {
+            if (cur_waiter == *waiter)
+                cur_waiter = waiters.erase(*waiter); // keep the round-robin pointer valid: erase() returns the element after the erased one
+            else
+                waiters.erase(*waiter);
+        }
+        schedule(lock); // the slots just returned may be handed to other waiters
+    }
+
+    void release(SlotCount amount)
+    {
+        std::unique_lock lock{mutex};
+        cur_concurrency -= amount;
+        schedule(lock);
+    }
+
+    // Round-robin scheduling of available slots among waiting allocations. The lock parameter documents that `mutex` must be held by the caller.
+    void schedule(std::unique_lock<std::mutex> &)
+    {
+        while (cur_concurrency < max_concurrency && !waiters.empty())
+        {
+            cur_concurrency++; // account for the slot before handing it out
+            if (cur_waiter == waiters.end())
+                cur_waiter = waiters.begin(); // wrap around
+            Allocation * allocation = *cur_waiter;
+            if (allocation->grant())
+                ++cur_waiter; // still needs more slots: move on to the next waiter
+            else
+                cur_waiter = waiters.erase(cur_waiter); // last required slot has just been granted -- stop waiting
+        }
+    }
+
+    SlotCount available(std::unique_lock<std::mutex> &) // free slots under the current limit; the caller must hold `mutex`
+    {
+        if (cur_concurrency < max_concurrency)
+            return max_concurrency - cur_concurrency;
+        else
+            return 0; // at the limit or oversubscribed
+    }
+
+    std::mutex mutex;
+    Waiters waiters;
+    Waiters::iterator cur_waiter; // round-robin pointer
+    SlotCount max_concurrency = Unlimited;
+    SlotCount cur_concurrency = 0;
+};
--- a/src/Common/MemoryTrackerBlockerInThread.cpp
+++ b/src/Common/MemoryTrackerBlockerInThread.cpp
@ -3,12 +3,18 @@
 // MemoryTrackerBlockerInThread
 thread_local uint64_t MemoryTrackerBlockerInThread::counter = 0;
 thread_local VariableContext MemoryTrackerBlockerInThread::level = VariableContext::Global;
+
 MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread(VariableContext level_)
    : previous_level(level) /// Saves the thread-local level that was active before this blocker was created.
 {
    ++counter; /// Thread-local count of live blockers in this thread.
    level = level_;
 }
+
+MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread() : MemoryTrackerBlockerInThread(VariableContext::User) /// Default constructor: delegates to the level-taking constructor with VariableContext::User.
+{
+}
+
 MemoryTrackerBlockerInThread::~MemoryTrackerBlockerInThread()
 {
    --counter;
--- a/src/Common/MemoryTrackerBlockerInThread.h
+++ b/src/Common/MemoryTrackerBlockerInThread.h
@ -11,9 +11,12 @@ private:
    static thread_local VariableContext level;

    VariableContext previous_level;
-public:
+
    /// level_ - block in level and above
-    explicit MemoryTrackerBlockerInThread(VariableContext level_ = VariableContext::User);
+    explicit MemoryTrackerBlockerInThread(VariableContext level_);
+
+public:
+    explicit MemoryTrackerBlockerInThread();
    ~MemoryTrackerBlockerInThread();

    MemoryTrackerBlockerInThread(const MemoryTrackerBlockerInThread &) = delete;
@ -23,4 +26,6 @@ public:
    {
        return counter > 0 && current_level >= level;
    }
+
+    friend class MemoryTracker;
 };
--- a/src/Common/SystemLogBase.cpp
+++ b/src/Common/SystemLogBase.cpp
@ -79,7 +79,7 @@ void SystemLogBase<LogElement>::add(const LogElement & element)
    /// The size of allocation can be in order of a few megabytes.
    /// But this should not be accounted for query memory usage.
    /// Otherwise the tests like 01017_uniqCombined_memory_usage.sql will be flaky.
-    MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global);
+    MemoryTrackerBlockerInThread temporarily_disable_memory_tracker;

    /// Should not log messages under mutex.
    bool queue_is_half_full = false;
--- a/src/Common/TLDListsHolder.cpp
+++ b/src/Common/TLDListsHolder.cpp
@ -15,20 +15,31 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

+constexpr size_t StringHashTablePadRequirement = 8;
+
 /// TLDList
 TLDList::TLDList(size_t size)
    : tld_container(size)
-    , pool(std::make_unique<Arena>(10 << 20))
-{}
-bool TLDList::insert(StringRef host)
+    , memory_pool(std::make_unique<Arena>())
 {
-    bool inserted;
-    tld_container.emplace(DB::ArenaKeyHolder{host, *pool}, inserted);
-    return inserted;
+    /// StringHashTable requires keys to be padded to 8 bytes,
+    /// and Arena (memory_pool here) satisfies this,
+    /// since it has 15 bytes of padding on the right.
+    ///
+    /// However, StringHashTable may reference byte -1 of the key,
+    /// so left padding is also required (hence this allocation made before any key is stored):
+    memory_pool->alignedAlloc(StringHashTablePadRequirement, StringHashTablePadRequirement);
 }
-bool TLDList::has(StringRef host) const
+void TLDList::insert(const String & host, TLDType type) /// Stores `host` in memory_pool and maps the stored key to `type`.
 {
-    return tld_container.has(host);
+    StringRef owned_host{memory_pool->insert(host.data(), host.size()), host.size()}; /// The key points into the arena rather than into the caller's string.
+    tld_container[owned_host] = type;
+}
+TLDType TLDList::lookup(StringRef host) const /// Returns the type stored for `host`, or TLD_NONE when it is not in the list.
+{
+    if (auto it = tld_container.find(host); it != nullptr)
+        return it->getMapped();
+    return TLDType::TLD_NONE;
 }

 /// TLDListsHolder
@ -57,32 +68,44 @@ void TLDListsHolder::parseConfig(const std::string & top_level_domains_path, con

 size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path)
 {
-    std::unordered_set<std::string> tld_list_tmp;
+    std::unordered_map<std::string, TLDType> tld_list_tmp;

    ReadBufferFromFile in(path);
-    String line;
+    String buffer;
    while (!in.eof())
    {
-        readEscapedStringUntilEOL(line, in);
+        readEscapedStringUntilEOL(buffer, in);
        if (!in.eof())
            ++in.position();
+        std::string_view line(buffer);
        /// Skip comments
-        if (line.size() > 2 && line[0] == '/' && line[1] == '/')
+        if (line.starts_with("//"))
            continue;
-        line = trim(line, [](char c) { return std::isspace(c); });
+        line = line.substr(0, line.rend() - std::find_if_not(line.rbegin(), line.rend(), ::isspace));
        /// Skip empty line
        if (line.empty())
            continue;
-        tld_list_tmp.emplace(line);
+        /// Validate special symbols.
+        if (line.starts_with("*."))
+        {
+            line = line.substr(2);
+            tld_list_tmp.emplace(line, TLDType::TLD_ANY);
+        }
+        else if (line[0] == '!')
+        {
+            line = line.substr(1);
+            tld_list_tmp.emplace(line, TLDType::TLD_EXCLUDE);
+        }
+        else
+            tld_list_tmp.emplace(line, TLDType::TLD_REGULAR);
    }
    if (!in.eof())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Not all list had been read", name);

    TLDList tld_list(tld_list_tmp.size());
-    for (const auto & host : tld_list_tmp)
+    for (const auto & [host, type] : tld_list_tmp)
    {
-        StringRef host_ref{host.data(), host.size()};
-        tld_list.insert(host_ref);
+        tld_list.insert(host, type);
    }

    size_t tld_list_size = tld_list.size();
--- a/src/Common/TLDListsHolder.h
+++ b/src/Common/TLDListsHolder.h
@ -2,7 +2,7 @@

 #include <base/defines.h>
 #include <base/StringRef.h>
-#include <Common/HashTable/StringHashSet.h>
+#include <Common/HashTable/StringHashMap.h>
 #include <Common/Arena.h>
 #include <Poco/Util/AbstractConfiguration.h>
 #include <mutex>
@ -12,25 +12,35 @@
 namespace DB
 {

+enum TLDType
+{
+    /// Does not exist marker: returned by TLDList::lookup() when the host is not in the list
+    TLD_NONE,
+    /// For regular lines (no special prefix)
+    TLD_REGULAR,
+    /// For asterisk (*): lines that start with "*." (the prefix is stripped before the entry is stored)
+    TLD_ANY,
+    /// For exclamation mark (!): lines that start with "!" (the prefix is stripped before the entry is stored)
+    TLD_EXCLUDE,
+};
+
 /// Custom TLD List
 ///
-/// Unlike tldLookup (which uses gperf) this one uses plain StringHashSet.
+/// Unlike tldLookup (which uses gperf) this one uses plain StringHashMap.
 class TLDList
 {
 public:
-    using Container = StringHashSet<>;
+    using Container = StringHashMap<TLDType>; /// Maps a TLD string to the kind of rule it came from.

    explicit TLDList(size_t size);

-    /// Return true if the tld_container does not contains such element.
-    bool insert(StringRef host);
-    /// Check is there such TLD
-    bool has(StringRef host) const;
+    void insert(const String & host, TLDType type); /// Stores `host` in memory_pool and maps it to `type`.
+    TLDType lookup(StringRef host) const; /// Returns the stored type, or TLD_NONE if `host` is not in the list.
    size_t size() const { return tld_container.size(); }

 private:
    Container tld_container;
-    std::unique_ptr<Arena> pool;
+    std::unique_ptr<Arena> memory_pool; /// Backing storage for the keys referenced by tld_container.
 };

 class TLDListsHolder
@ -48,6 +58,11 @@ public:
    /// - "//" -- comment,
    /// - empty lines will be ignored.
    ///
+    /// Treats the following special symbols:
+    /// - "*"
+    /// - "!"
+    ///
+    /// Format : https://github.com/publicsuffix/list/wiki/Format
    /// Example: https://publicsuffix.org/list/public_suffix_list.dat
    ///
    /// Return size of the list.
--- a/src/Common/ZooKeeper/IKeeper.h
+++ b/src/Common/ZooKeeper/IKeeper.h
@ -2,6 +2,7 @@

 #include <base/types.h>
 #include <Common/Exception.h>
+#include <Coordination/KeeperConstants.h>

 #include <vector>
 #include <memory>
@ -57,6 +58,8 @@ struct Stat
    int32_t dataLength{0}; /// NOLINT
    int32_t numChildren{0}; /// NOLINT
    int64_t pzxid{0};
+
+    bool operator==(const Stat &) const = default;
 };

 enum class Error : int32_t
@ -109,7 +112,6 @@ bool isUserError(Error code);

 const char * errorMessage(Error code);

-
 struct Request;
 using RequestPtr = std::shared_ptr<Request>;
 using Requests = std::vector<RequestPtr>;
@ -516,6 +518,8 @@ public:
        const Requests & requests,
        MultiCallback callback) = 0;

+    virtual DB::KeeperApiVersion getApiVersion() = 0;
+
    /// Expire session and finish all pending requests
    virtual void finalize(const String & reason) = 0;
 };
--- a/src/Common/ZooKeeper/TestKeeper.h
+++ b/src/Common/ZooKeeper/TestKeeper.h
@ -90,6 +90,11 @@ public:

    void finalize(const String & reason) override;

+    DB::KeeperApiVersion getApiVersion() override /// TestKeeper always reports the ZOOKEEPER_COMPATIBLE API version.
+    {
+        return KeeperApiVersion::ZOOKEEPER_COMPATIBLE;
+    }
+
    struct Node
    {
        String data;
--- a/src/Common/ZooKeeper/ZooKeeper.cpp
+++ b/src/Common/ZooKeeper/ZooKeeper.cpp
@ -337,17 +337,17 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings
    }
 }

-Strings ZooKeeper::getChildren(const std::string & path, Coordination::Stat * stat, const EventPtr & watch)
+Strings ZooKeeper::getChildren(const std::string & path, Coordination::Stat * stat, const EventPtr & watch, Coordination::ListRequestType list_request_type) /// `list_request_type` is forwarded unchanged to tryGetChildren(); the header defaults it to ListRequestType::ALL.
 {
    Strings res;
-    check(tryGetChildren(path, res, stat, watch), path);
+    check(tryGetChildren(path, res, stat, watch, list_request_type), path); /// NOTE(review): check() presumably throws for a non-OK error code -- confirm.
    return res;
 }

-Strings ZooKeeper::getChildrenWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback)
+Strings ZooKeeper::getChildrenWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback, Coordination::ListRequestType list_request_type) /// Same as getChildren(), but takes a watch callback; `list_request_type` is forwarded unchanged to tryGetChildrenWatch().
 {
    Strings res;
-    check(tryGetChildrenWatch(path, res, stat, watch_callback), path);
+    check(tryGetChildrenWatch(path, res, stat, watch_callback, list_request_type), path); /// NOTE(review): check() presumably throws for a non-OK error code -- confirm.
    return res;
 }

@ -540,7 +540,6 @@ Coordination::Error ZooKeeper::getImpl(const std::string & path, std::string & r
    }
 }

-
 std::string ZooKeeper::get(const std::string & path, Coordination::Stat * stat, const EventPtr & watch)
 {
    Coordination::Error code = Coordination::Error::ZOK;
@ -904,6 +903,11 @@ bool ZooKeeper::expired()
    return impl->isExpired();
 }

+DB::KeeperApiVersion ZooKeeper::getApiVersion() /// Thin wrapper: returns whatever the underlying IKeeper implementation reports.
+{
+    return impl->getApiVersion();
+}
+
 Int64 ZooKeeper::getClientID()
 {
    return impl->getSessionID();
--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@ -127,6 +127,8 @@ public:
    /// Returns true, if the session has expired.
    bool expired();

+    DB::KeeperApiVersion getApiVersion();
+
    /// Create a znode.
    /// Throw an exception if something went wrong.
    std::string create(const std::string & path, const std::string & data, int32_t mode);
@ -184,11 +186,13 @@ public:

    Strings getChildren(const std::string & path,
                        Coordination::Stat * stat = nullptr,
-                        const EventPtr & watch = nullptr);
+                        const EventPtr & watch = nullptr,
+                        Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL);

    Strings getChildrenWatch(const std::string & path,
                             Coordination::Stat * stat,
-                             Coordination::WatchCallback watch_callback);
+                             Coordination::WatchCallback watch_callback,
+                             Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL);

    /// Doesn't throw in the following cases:
    /// * The node doesn't exist.
--- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp
@ -724,7 +724,10 @@ void ZooKeeperResponse::fillLogElements(LogElements & elems, size_t idx) const
    assert(!elem.xid || elem.xid == xid);
    elem.xid = xid;
    int32_t response_op = tryGetOpNum();
-    assert(!elem.op_num || elem.op_num == response_op || response_op < 0);
+
+    [[maybe_unused]] const bool is_filtered_list = elem.op_num == static_cast<int32_t>(Coordination::OpNum::FilteredList)
+        && response_op == static_cast<int32_t>(Coordination::OpNum::List);
+    assert(!elem.op_num || elem.op_num == response_op || is_filtered_list || response_op < 0);
    elem.op_num = response_op;

    elem.zxid = zxid;
@ -892,6 +895,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory()
    registerZooKeeperRequest<OpNum::SessionID, ZooKeeperSessionIDRequest>(*this);
    registerZooKeeperRequest<OpNum::GetACL, ZooKeeperGetACLRequest>(*this);
    registerZooKeeperRequest<OpNum::SetACL, ZooKeeperSetACLRequest>(*this);
+    registerZooKeeperRequest<OpNum::FilteredList, ZooKeeperFilteredListRequest>(*this);
 }

 }
--- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp
@ -24,6 +24,7 @@ static const std::unordered_set<int32_t> VALID_OPERATIONS =
    static_cast<int32_t>(OpNum::SessionID),
    static_cast<int32_t>(OpNum::SetACL),
    static_cast<int32_t>(OpNum::GetACL),
+    static_cast<int32_t>(OpNum::FilteredList),
 };

 std::string toString(OpNum op_num)
--- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp
@ -6,6 +6,7 @@
 #include <Common/ZooKeeper/ZooKeeperIO.h>
 #include <IO/WriteHelpers.h>
 #include <IO/ReadHelpers.h>
+#include <IO/ReadBufferFromString.h>
 #include <IO/Operators.h>
 #include <IO/WriteBufferFromString.h>
 #include <Common/logger_useful.h>
@ -352,6 +353,8 @@ ZooKeeper::ZooKeeper(
    send_thread = ThreadFromGlobalPool([this] { sendThread(); });
    receive_thread = ThreadFromGlobalPool([this] { receiveThread(); });

+    initApiVersion();
+
    ProfileEvents::increment(ProfileEvents::ZooKeeperInit);
 }

@ -1057,6 +1060,44 @@ void ZooKeeper::pushRequest(RequestInfo && info)
    ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions);
 }

+KeeperApiVersion ZooKeeper::getApiVersion() /// Returns the cached value; initApiVersion() is the only visible place that assigns it.
+{
+    return keeper_api_version;
+}
+
+void ZooKeeper::initApiVersion() /// Reads `keeper_api_version_path` once and caches the parsed value in `keeper_api_version`; on timeout or error the field is left unchanged.
+{
+    auto promise = std::make_shared<std::promise<Coordination::GetResponse>>(); /// Shared so the callback can still fulfil it after this function has returned on timeout.
+    auto future = promise->get_future();
+
+    auto callback = [promise](const Coordination::GetResponse & response) mutable
+    {
+        promise->set_value(response);
+    };
+
+    get(keeper_api_version_path, std::move(callback), {}); /// The last argument is left empty (presumably the watch -- confirm against get()'s signature).
+    if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) /// Wait at most `operation_timeout` for the response.
+    {
+        LOG_TRACE(log, "Failed to get API version: timeout");
+        return;
+    }
+
+    auto response = future.get();
+
+    if (response.error != Coordination::Error::ZOK) /// Any non-OK answer is treated as "version unknown"; the error code itself is not logged.
+    {
+        LOG_TRACE(log, "Failed to get API version");
+        return;
+    }
+
+    uint8_t keeper_version{0};
+    DB::ReadBufferFromOwnString buf(response.data);
+    DB::readIntText(keeper_version, buf); /// NOTE(review): presumably throws on malformed node data; this runs inside the ZooKeeper constructor -- confirm that is acceptable.
+    keeper_api_version = static_cast<DB::KeeperApiVersion>(keeper_version); /// No range check: an unknown number is stored as-is.
+    LOG_TRACE(log, "Detected server's API version: {}", keeper_api_version);
+}
+
+
 void ZooKeeper::executeGenericRequest(
    const ZooKeeperRequestPtr & request,
    ResponseCallback callback)
@ -1172,14 +1213,27 @@ void ZooKeeper::list(
    ListCallback callback,
    WatchCallback watch)
 {
-    ZooKeeperFilteredListRequest request;
-    request.path = path;
-    request.list_request_type = list_request_type;
+    std::shared_ptr<ZooKeeperListRequest> request{nullptr};
+    if (keeper_api_version < Coordination::KeeperApiVersion::WITH_FILTERED_LIST)
+    {
+        if (list_request_type != ListRequestType::ALL)
+            throw Exception("Filtered list request type cannot be used because it's not supported by the server", Error::ZBADARGUMENTS);
+
+        request = std::make_shared<ZooKeeperListRequest>();
+    }
+    else
+    {
+        auto filtered_list_request = std::make_shared<ZooKeeperFilteredListRequest>();
+        filtered_list_request->list_request_type = list_request_type;
+        request = std::move(filtered_list_request);
+    }
+
+    request->path = path;

    RequestInfo request_info;
-    request_info.request = std::make_shared<ZooKeeperListRequest>(std::move(request));
    request_info.callback = [callback](const Response & response) { callback(dynamic_cast<const ListResponse &>(response)); };
    request_info.watch = watch;
+    request_info.request = std::move(request);

    pushRequest(std::move(request_info));
    ProfileEvents::increment(ProfileEvents::ZooKeeperList);
--- a/src/Common/ZooKeeper/ZooKeeperImpl.h
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.h
@ -7,6 +7,7 @@
 #include <Common/ThreadPool.h>
 #include <Common/ZooKeeper/IKeeper.h>
 #include <Common/ZooKeeper/ZooKeeperCommon.h>
+#include <Coordination/KeeperConstants.h>

 #include <IO/ReadBuffer.h>
 #include <IO/WriteBuffer.h>
@ -181,6 +182,8 @@ public:
        const Requests & requests,
        MultiCallback callback) override;

+    DB::KeeperApiVersion getApiVersion() override;
+
    /// Without forcefully invalidating (finalizing) ZooKeeper session before
    /// establishing a new one, there was a possibility that server is using
    /// two ZooKeeper sessions simultaneously in different parts of code.
@ -275,8 +278,12 @@ private:

    void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false);

+    void initApiVersion();
+
    CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession};
    std::shared_ptr<ZooKeeperLog> zk_log;
+
+    DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::ZOOKEEPER_COMPATIBLE};
 };

 }
--- a/src/Common/logger_useful.h
+++ b/src/Common/logger_useful.h
@ -14,8 +14,10 @@ namespace
    template <typename T, typename... Ts> constexpr auto firstArg(T && x, Ts &&...) { return std::forward<T>(x); }
    /// For implicit conversion of fmt::basic_runtime<> to char* for std::string ctor
    template <typename T, typename... Ts> constexpr auto firstArg(fmt::basic_runtime<T> && data, Ts &&...) { return data.str.data(); }
-}

+    /// Lets LOG_IMPL accept the logger either as a plain pointer or as an atomic pointer:
+    /// both overloads hand back a raw pointer that the macro then dereferences.
+    [[maybe_unused]] const ::Poco::Logger * getLogger(const ::Poco::Logger * logger) { return logger; }
+    [[maybe_unused]] const ::Poco::Logger * getLogger(const std::atomic<::Poco::Logger *> & logger) { return logger.load(); }
+}

 /// Logs a message to a specified logger with that level.
 /// If more than one argument is provided,
@ -25,20 +27,21 @@ namespace

 #define LOG_IMPL(logger, priority, PRIORITY, ...) do                              \
 {                                                                                 \
-    const bool is_clients_log = (DB::CurrentThread::getGroup() != nullptr) &&     \
+    auto _logger = ::getLogger(logger);                                           \
+    const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) &&    \
        (DB::CurrentThread::getGroup()->client_logs_level >= (priority));         \
-    if ((logger)->is((PRIORITY)) || is_clients_log)                               \
+    if (_logger->is((PRIORITY)) || _is_clients_log)                               \
    {                                                                             \
        std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
-        if (auto channel = (logger)->getChannel())                                \
+        if (auto _channel = _logger->getChannel())                                \
        {                                                                         \
            std::string file_function;                                            \
            file_function += __FILE__;                                            \
            file_function += "; ";                                                \
            file_function += __PRETTY_FUNCTION__;                                 \
-            Poco::Message poco_message((logger)->name(), formatted_message,       \
+            Poco::Message poco_message(_logger->name(), formatted_message,        \
                                 (PRIORITY), file_function.c_str(), __LINE__);    \
-            channel->log(poco_message);                                           \
+            _channel->log(poco_message);                                          \
        }                                                                         \
    }                                                                             \
 } while (false)
--- a/src/Common/tests/gtest_concurrency_control.cpp
+++ b/src/Common/tests/gtest_concurrency_control.cpp
@ -0,0 +1,289 @@
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <functional>
+#include <list>
+#include <mutex>
+#include <random>
+#include <thread>
+#include <vector>
+#include <pcg_random.hpp>
+
+#include <base/types.h>
+#include <base/sleep.h>
+#include <Common/ConcurrencyControl.h>
+#include <Common/randomSeed.h>
+
+/// Test helper: owns a ConcurrencyControl instance configured with the requested slot limit.
+struct ConcurrencyControlTest
+{
+    /// Without an argument the number of slots is ConcurrencyControl::Unlimited.
+    explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited)
+    {
+        cc.setMaxConcurrency(limit);
+    }
+
+    ConcurrencyControl cc;
+};
+
+/// With no concurrency limit every one of the requested slots can be acquired.
+TEST(ConcurrencyControl, Unlimited)
+{
+    ConcurrencyControlTest t; // unlimited number of slots
+    auto slots = t.cc.allocate(0, 100500);
+    std::vector<ConcurrencyControl::SlotPtr> acquired;
+    while (auto slot = slots->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    // ASSERT_EQ reports the actual count on failure, unlike ASSERT_TRUE(a == b)
+    ASSERT_EQ(acquired.size(), 100500U);
+}
+
+/// Allocations competing for a single slot must be served in the order they were created.
+TEST(ConcurrencyControl, Fifo)
+{
+    constexpr int allocation_count = 42;
+    ConcurrencyControlTest t(1); // one slot shared by all allocations
+
+    std::vector<ConcurrencyControl::AllocationPtr> queue;
+    queue.reserve(allocation_count);
+    for (int n = 0; n < allocation_count; ++n)
+        queue.emplace_back(t.cc.allocate(0, 1));
+
+    for (int expected = 0; expected < allocation_count; ++expected)
+    {
+        ConcurrencyControl::SlotPtr held;
+        for (int candidate = 0; candidate < allocation_count; ++candidate)
+        {
+            auto slot = queue[candidate]->tryAcquire();
+            if (candidate != expected)
+            {
+                ASSERT_TRUE(!slot);
+                continue;
+            }
+            // Only the allocation whose turn it is may obtain the slot
+            ASSERT_TRUE(slot);
+            held = std::move(slot);
+        }
+        held.reset(); // giving the slot back passes it on to the next allocation
+    }
+}
+
+/// Ten allocations of (min 1, max 2) against a limit of 10: the first five get both slots,
+/// the remaining five get only one slot each.
+TEST(ConcurrencyControl, Oversubscription)
+{
+    ConcurrencyControlTest t(10);
+
+    std::vector<ConcurrencyControl::AllocationPtr> requests;
+    requests.reserve(10);
+    for (int n = 0; n < 10; ++n)
+        requests.emplace_back(t.cc.allocate(1, 2));
+
+    std::vector<ConcurrencyControl::SlotPtr> held;
+
+    // These requests fit into the limit and receive their maximum
+    for (size_t idx = 0; idx < 5; ++idx)
+    {
+        auto & request = requests[idx];
+
+        auto first = request->tryAcquire();
+        ASSERT_TRUE(first);
+        held.push_back(std::move(first));
+
+        auto second = request->tryAcquire();
+        ASSERT_TRUE(second);
+        held.push_back(std::move(second));
+
+        ASSERT_TRUE(!request->tryAcquire());
+    }
+
+    // The limit is used up by now: the rest receive their minimum only
+    for (size_t idx = 5; idx < 10; ++idx)
+    {
+        auto & request = requests[idx];
+
+        auto only = request->tryAcquire();
+        ASSERT_TRUE(only);
+        held.push_back(std::move(only));
+
+        ASSERT_TRUE(!request->tryAcquire());
+    }
+}
+
+/// Slots granted to allocations that are destroyed without acquiring them must become available again.
+TEST(ConcurrencyControl, ReleaseUnacquiredSlots)
+{
+    ConcurrencyControlTest t(10);
+    {
+        std::vector<ConcurrencyControl::AllocationPtr> allocations;
+        allocations.reserve(10);
+        for (int i = 0; i < 10; i++)
+            allocations.emplace_back(t.cc.allocate(1, 2));
+        // Do not acquire - just destroy allocations with granted slots
+    }
+    // Check that slots were actually released
+    auto allocation = t.cc.allocate(0, 20);
+    std::vector<ConcurrencyControl::SlotPtr> acquired;
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    // ASSERT_EQ reports the actual count on failure, unlike ASSERT_TRUE(a == b)
+    ASSERT_EQ(acquired.size(), 10U);
+}
+
+/// An allocation that wants more slots (20) than the limit (10) is destroyed while still incomplete;
+/// each repetition must again be able to acquire all 10 slots.
+TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation)
+{
+    ConcurrencyControlTest t(10);
+    for (int i = 0; i < 3; i++)
+    {
+        auto allocation = t.cc.allocate(5, 20);
+        std::vector<ConcurrencyControl::SlotPtr> acquired;
+        while (auto slot = allocation->tryAcquire())
+            acquired.emplace_back(std::move(slot));
+        // ASSERT_EQ reports the actual count on failure, unlike ASSERT_TRUE(a == b)
+        ASSERT_EQ(acquired.size(), 10U);
+    }
+}
+
+/// The allocation pointer is reset while its slots are still held; each repetition must
+/// still be able to acquire all 10 slots.
+TEST(ConcurrencyControl, DestroyAllocationBeforeSlots)
+{
+    ConcurrencyControlTest t(10);
+    for (int i = 0; i < 3; i++)
+    {
+        std::vector<ConcurrencyControl::SlotPtr> acquired;
+        auto allocation = t.cc.allocate(5, 20);
+        while (auto slot = allocation->tryAcquire())
+            acquired.emplace_back(std::move(slot));
+        // ASSERT_EQ reports the actual count on failure, unlike ASSERT_TRUE(a == b)
+        ASSERT_EQ(acquired.size(), 10U);
+        allocation.reset(); // slots are still acquired (they should actually hold allocation)
+    }
+}
+
+/// A single allocation asking for 10 slots under a limit of 3: slots it releases are granted
+/// back to it until all 10 have been handed out. The trailing comments list which of the
+/// ten slots are held after each step.
+TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation)
+{
+    ConcurrencyControlTest t(3);
+    auto allocation = t.cc.allocate(0, 10);
+    std::list<ConcurrencyControl::SlotPtr> acquired;
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    // ASSERT_EQ reports the actual count on failure, unlike ASSERT_TRUE(a == b)
+    ASSERT_EQ(acquired.size(), 3U); // 0 1 2
+    acquired.clear();
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 3U); // 3 4 5
+    acquired.pop_back();
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 3U); // 3 4 6
+    acquired.pop_front();
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 3U); // 4 6 7
+    acquired.clear();
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 2U); // 8 9
+}
+
+/// Three allocations created while all slots are taken must each receive exactly one slot
+/// per round once the slots are freed.
+TEST(ConcurrencyControl, FairGranting)
+{
+    ConcurrencyControlTest t(3);
+
+    // Take all three slots first; the allocations below are created while nothing is free
+    auto blocker = t.cc.allocate(3, 3);
+    auto first = t.cc.allocate(0, 10);
+    auto second = t.cc.allocate(0, 10);
+    auto third = t.cc.allocate(0, 10);
+    blocker.reset();
+
+    for (int round = 0; round != 10; ++round)
+    {
+        // All three slots stay held until the end of the round
+        auto slot_a = first->tryAcquire();
+        ASSERT_TRUE(slot_a);
+        ASSERT_TRUE(!first->tryAcquire());
+
+        auto slot_b = second->tryAcquire();
+        ASSERT_TRUE(slot_b);
+        ASSERT_TRUE(!second->tryAcquire());
+
+        auto slot_c = third->tryAcquire();
+        ASSERT_TRUE(slot_c);
+        ASSERT_TRUE(!third->tryAcquire());
+    }
+}
+
+/// Changing the concurrency limit at runtime: raising it grants more slots to waiting
+/// allocations, lowering it reduces how many can be re-acquired after release.
+TEST(ConcurrencyControl, SetSlotCount)
+{
+    ConcurrencyControlTest t(10);
+    auto allocation = t.cc.allocate(5, 30);
+    std::vector<ConcurrencyControl::SlotPtr> acquired;
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    // ASSERT_EQ reports the actual count on failure, unlike ASSERT_TRUE(a == b)
+    ASSERT_EQ(acquired.size(), 10U);
+
+    t.cc.setMaxConcurrency(15);
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 15U);
+
+    t.cc.setMaxConcurrency(5);
+    acquired.clear();
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 5U);
+
+    // Check that newly added slots are equally distributed over waiting allocations
+    std::vector<ConcurrencyControl::SlotPtr> acquired2;
+    auto allocation2 = t.cc.allocate(0, 30);
+    ASSERT_TRUE(!allocation->tryAcquire());
+    t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one
+    while (auto slot = allocation->tryAcquire())
+        acquired.emplace_back(std::move(slot));
+    while (auto slot = allocation2->tryAcquire())
+        acquired2.emplace_back(std::move(slot));
+    ASSERT_EQ(acquired.size(), 10U);
+    ASSERT_EQ(acquired2.size(), 5U);
+}
+
+/// Stress test: runs many emulated queries from several threads, each of which spawns worker
+/// threads as slots become available, while the concurrency limit is being changed concurrently.
+/// There are no assertions; the test passes if everything completes.
+TEST(ConcurrencyControl, MultipleThreads)
+{
+    constexpr int cfg_total_queries = 1000; // total amount of queries to run
+    constexpr int cfg_work_us = 49; // max microseconds per single work
+    constexpr int cfg_concurrent_queries = 8; // do not run more than specified number of concurrent queries
+    constexpr int cfg_max_threads = 4; // max amount of threads a query is allowed to have
+    constexpr int cfg_max_concurrency = 16; // concurrency control limit (must be >3)
+
+    ConcurrencyControlTest t(cfg_max_concurrency);
+
+    // Emulates one query: acquires at least one and at most `max_threads` slots and runs a thread per slot
+    auto run_query = [&] (size_t max_threads)
+    {
+        ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads);
+        std::mutex threads_mutex;
+        std::vector<std::thread> threads;
+        threads.reserve(max_threads);
+
+        // Starts a new thread for every slot that can be acquired right now.
+        // Called both from here and from the worker threads themselves.
+        std::function<void()> spawn_threads = [&] ()
+        {
+            while (auto slot = slots->tryAcquire())
+            {
+                std::unique_lock lock{threads_mutex};
+                threads.emplace_back([&, slot = std::move(slot)]
+                {
+                    pcg64 rng(randomSeed());
+                    std::uniform_int_distribution<size_t> distribution(1, cfg_work_us);
+                    size_t steps = distribution(rng);
+                    for (size_t step = 0; step < steps; step++)
+                    {
+                        sleepForMicroseconds(distribution(rng)); // emulate work
+                        spawn_threads(); // upscale
+                    }
+                });
+            }
+        };
+
+        spawn_threads();
+
+        // graceful shutdown of a query
+        for (size_t thread_num = 0; ; thread_num++)
+        {
+            std::unique_lock lock{threads_mutex};
+            if (thread_num >= threads.size())
+                break;
+            if (threads[thread_num].joinable())
+            {
+                auto & thread = threads[thread_num];
+                lock.unlock(); // to avoid deadlock if thread we are going to join starts spawning threads
+                thread.join();
+            }
+        }
+        // NOTE: No races: all concurrent spawn_threads() calls are done from `threads`, but they're already joined.
+    };
+
+    pcg64 rng(randomSeed());
+    std::uniform_int_distribution<size_t> max_threads_distribution(1, cfg_max_threads);
+    std::vector<std::thread> queries;
+    std::atomic<int> started = 0; // queries started in total
+    std::atomic<int> finished = 0; // queries finished in total
+    while (started < cfg_total_queries)
+    {
+        // The total is checked here as well: otherwise the concurrency window alone would let
+        // this loop start more than cfg_total_queries queries on its last pass.
+        while (started < cfg_total_queries && started < finished + cfg_concurrent_queries)
+        {
+            queries.emplace_back([&, max_threads = max_threads_distribution(rng)]
+            {
+                run_query(max_threads);
+                finished++;
+            });
+            started++;
+        }
+        sleepForMicroseconds(5); // wait some queries to finish
+        t.cc.setMaxConcurrency(cfg_max_concurrency - started % 3); // emulate configuration updates
+    }
+
+    for (auto & query : queries)
+        query.join();
+}
--- a/src/Common/tests/gtest_wide_integer.cpp
+++ b/src/Common/tests/gtest_wide_integer.cpp
@ -61,8 +61,11 @@ GTEST_TEST(WideInteger, Conversions)
    ASSERT_EQ(zero, minus_one);

    zero += minus_one;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ASSERT_EQ(0, memcmp(&zero, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
+#else
    ASSERT_EQ(0, memcmp(&zero, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
-
+#endif
    zero += 2;
    ASSERT_EQ(zero, 0);

@ -156,8 +159,11 @@ GTEST_TEST(WideInteger, Arithmetic)
    ASSERT_EQ(zero, minus_one);

    zero += minus_one;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ASSERT_EQ(0, memcmp(&zero, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
+#else
    ASSERT_EQ(0, memcmp(&zero, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero)));
-
+#endif
    zero += 2;
    ASSERT_EQ(zero, 0);

@ -236,8 +242,12 @@ GTEST_TEST(WideInteger, Shift)
    Int128 x = 1;

    auto y = x << 64;
-    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", sizeof(Int128)));

+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", sizeof(Int128)));
+#else
+    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", sizeof(Int128)));
+#endif
    auto z = y << 11;
    ASSERT_EQ(toString(z), "37778931862957161709568");

@ -250,8 +260,11 @@ GTEST_TEST(WideInteger, Shift)
    x = -1;
    y = x << 16;

+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\xFF\xFF\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
+#else
    ASSERT_EQ(0, memcmp(&y, "\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
-
+#endif
    y >>= 16;
    ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));

@ -259,10 +272,18 @@ GTEST_TEST(WideInteger, Shift)
    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));

    y >>= 32;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
+#else
    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128)));
+#endif

    y <<= 64;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\x00\x00\x00\x00", sizeof(Int128)));
+#else
    ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF", sizeof(Int128)));
+#endif
 }


--- a/src/Compression/CompressionCodecDeflateQpl.cpp
+++ b/src/Compression/CompressionCodecDeflateQpl.cpp
@ -28,8 +28,8 @@ DeflateQplJobHWPool & DeflateQplJobHWPool::instance()
 }

 DeflateQplJobHWPool::DeflateQplJobHWPool()
-    :random_engine(std::random_device()())
-    ,distribution(0, MAX_HW_JOB_NUMBER-1)
+    : random_engine(std::random_device()())
+    , distribution(0, MAX_HW_JOB_NUMBER - 1)
 {
    Poco::Logger * log = &Poco::Logger::get("DeflateQplJobHWPool");
    UInt32 job_size = 0;
@ -73,7 +73,7 @@ DeflateQplJobHWPool::~DeflateQplJobHWPool()
    job_pool_ready = false;
 }

-qpl_job * DeflateQplJobHWPool::acquireJob(UInt32 &job_id)
+qpl_job * DeflateQplJobHWPool::acquireJob(UInt32 & job_id)
 {
    if (isJobPoolReady())
    {
@ -141,7 +141,7 @@ HardwareCodecDeflateQpl::~HardwareCodecDeflateQpl()
 Int32 HardwareCodecDeflateQpl::doCompressData(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) const
 {
    UInt32 job_id = 0;
-    qpl_job* job_ptr = nullptr;
+    qpl_job * job_ptr = nullptr;
    UInt32 compressed_size = 0;
    if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id)))
    {
@ -330,10 +330,9 @@ void SoftwareCodecDeflateQpl::doDecompressData(const char * source, UInt32 sourc
            "Execution of DeflateQpl software fallback codec failed. (Details: qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", status);
 }

-//CompressionCodecDeflateQpl
 CompressionCodecDeflateQpl::CompressionCodecDeflateQpl()
-    :hw_codec(std::make_unique<HardwareCodecDeflateQpl>())
-    ,sw_codec(std::make_unique<SoftwareCodecDeflateQpl>())
+    : hw_codec(std::make_unique<HardwareCodecDeflateQpl>())
+    , sw_codec(std::make_unique<SoftwareCodecDeflateQpl>())
 {
    setCodecDescription("DEFLATE_QPL");
 }
--- a/src/Compression/CompressionCodecDeflateQpl.h
+++ b/src/Compression/CompressionCodecDeflateQpl.h
@ -1,8 +1,9 @@
 #pragma once

 #include <Compression/ICompressionCodec.h>
-#include <qpl/qpl.h>
+#include <map>
 #include <random>
+#include <qpl/qpl.h>

 namespace Poco
 {
@ -18,20 +19,16 @@ class DeflateQplJobHWPool
 {
 public:
    DeflateQplJobHWPool();
-
    ~DeflateQplJobHWPool();

-    qpl_job * acquireJob(UInt32 &job_id);
-
-    static void releaseJob(UInt32 job_id);
-
-    static const bool & isJobPoolReady() { return job_pool_ready; }
-
    static DeflateQplJobHWPool & instance();

+    qpl_job * acquireJob(UInt32 & job_id);
+    static void releaseJob(UInt32 job_id);
+    static const bool & isJobPoolReady() { return job_pool_ready; }
+
 private:
    static bool tryLockJob(UInt32 index);
-
    static void unLockJob(UInt32 index);

    /// Maximum jobs running in parallel supported by IAA hardware
@ -39,9 +36,9 @@ private:
    /// Entire buffer for storing all job objects
    static std::unique_ptr<uint8_t[]> hw_jobs_buffer;
    /// Job pool for storing all job object pointers
-    static std::array<qpl_job *, DeflateQplJobHWPool::MAX_HW_JOB_NUMBER> hw_job_ptr_pool;
+    static std::array<qpl_job *, MAX_HW_JOB_NUMBER> hw_job_ptr_pool;
    /// Locks for accessing each job object pointers
-    static std::array<std::atomic_bool, DeflateQplJobHWPool::MAX_HW_JOB_NUMBER> hw_job_ptr_locks;
+    static std::array<std::atomic_bool, MAX_HW_JOB_NUMBER> hw_job_ptr_locks;
    static bool job_pool_ready;
    std::mt19937 random_engine;
    std::uniform_int_distribution<int> distribution;
@ -57,23 +54,25 @@ public:
 private:
    qpl_job * sw_job = nullptr;
    std::unique_ptr<uint8_t[]> sw_buffer;
+
    qpl_job * getJobCodecPtr();
 };

 class HardwareCodecDeflateQpl
 {
 public:
-    /// RET_ERROR stands for hardware codec fail,need fallback to software codec.
+    /// RET_ERROR stands for hardware codec fail, needs fallback to software codec.
    static constexpr Int32 RET_ERROR = -1;

    HardwareCodecDeflateQpl();
    ~HardwareCodecDeflateQpl();
+
    Int32 doCompressData(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) const;

-    ///Submit job request to the IAA hardware and then busy waiting till it complete.
+    /// Submit a job request to the IAA hardware and then busy-wait until it completes.
    Int32 doDecompressDataSynchronous(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size);

-    ///Submit job request to the IAA hardware and return immediately. IAA hardware will process decompression jobs automatically.
+    /// Submit job request to the IAA hardware and return immediately. IAA hardware will process decompression jobs automatically.
    Int32 doDecompressDataAsynchronous(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size);

    /// Flush result for all previous requests which means busy waiting till all the jobs in "decomp_async_job_map" are finished.
@ -96,23 +95,19 @@ public:
    void updateHash(SipHash & hash) const override;

 protected:
-    bool isCompression() const override
-    {
-        return true;
-    }
-
-    bool isGenericCompression() const override
-    {
-        return true;
-    }
+    bool isCompression() const override { return true; }
+    bool isGenericCompression() const override { return true; }
+    bool isExperimental() const override { return true; }

    UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
    void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
-    ///Flush result for previous asynchronous decompression requests on asynchronous mode.
+
+    /// Flush result for previous asynchronous decompression requests on asynchronous mode.
    void flushAsynchronousDecompressRequests() override;

 private:
    UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
+
    std::unique_ptr<HardwareCodecDeflateQpl> hw_codec;
    std::unique_ptr<SoftwareCodecDeflateQpl> sw_codec;
 };
--- a/src/Coordination/CoordinationSettings.cpp
+++ b/src/Coordination/CoordinationSettings.cpp
@ -36,7 +36,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco
 }


-const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr";
+const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv";

 KeeperConfigurationAndSettings::KeeperConfigurationAndSettings()
    : server_id(NOT_EXIST)
--- a/src/Coordination/FourLetterCommand.cpp
+++ b/src/Coordination/FourLetterCommand.cpp
@ -2,6 +2,7 @@

 #include <Coordination/KeeperDispatcher.h>
 #include <Server/KeeperTCPHandler.h>
+#include <Common/ZooKeeper/IKeeper.h>
 #include <Common/logger_useful.h>
 #include <Poco/Environment.h>
 #include <Poco/Path.h>
@ -132,6 +133,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat
        FourLetterCommandPtr recovery_command = std::make_shared<RecoveryCommand>(keeper_dispatcher);
        factory.registerCommand(recovery_command);

+        FourLetterCommandPtr api_version_command = std::make_shared<ApiVersionCommand>(keeper_dispatcher);
+        factory.registerCommand(api_version_command);
+
        factory.initializeAllowList(keeper_dispatcher);
        factory.setInitialize(true);
    }
@ -463,4 +467,9 @@ String RecoveryCommand::run()
    return "ok";
 }

+/// Replies with the numeric value of the API version this build supports, as text.
+String ApiVersionCommand::run()
+{
+    const auto version_number = static_cast<uint8_t>(Coordination::current_keeper_api_version);
+    return toString(version_number);
+}
+
 }
--- a/src/Coordination/FourLetterCommand.h
+++ b/src/Coordination/FourLetterCommand.h
@ -315,4 +315,16 @@ struct RecoveryCommand : public IFourLetterCommand
    String run() override;
    ~RecoveryCommand() override = default;
 };
+
+/// Four-letter command "apiv".
+/// Its run() is defined in FourLetterCommand.cpp and returns current_keeper_api_version as a number in text form.
+struct ApiVersionCommand : public IFourLetterCommand
+{
+    explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_)
+        : IFourLetterCommand(keeper_dispatcher_)
+    {
+    }
+
+    String name() override { return "apiv"; }
+    String run() override;
+    ~ApiVersionCommand() override = default;
+};
 }
--- a/src/Coordination/KeeperConstants.h
+++ b/src/Coordination/KeeperConstants.h
@ -0,0 +1,25 @@
+#pragma once
+
+#include <IO/WriteHelpers.h>
+
+namespace DB
+{
+
+/// Keeper API versions. The numeric values are what is written to and read from the
+/// api_version node, and versions are compared with relational operators, so the
+/// values are spelled out explicitly.
+enum class KeeperApiVersion : uint8_t
+{
+    ZOOKEEPER_COMPATIBLE = 0,
+    WITH_FILTERED_LIST = 1,
+};
+
+/// The API version implemented by this build.
+inline constexpr auto current_keeper_api_version = KeeperApiVersion::WITH_FILTERED_LIST;
+
+/// Root of the system subtree and the node under it that holds the API version.
+const std::string keeper_system_path = "/keeper";
+const std::string keeper_api_version_path = keeper_system_path + "/api_version";
+
+/// Children of the system path together with the data they hold.
+/// The path is a string_view into keeper_api_version_path, which is defined above and
+/// therefore initialized first within a translation unit.
+using PathWithData = std::pair<std::string_view, std::string>;
+const std::vector<PathWithData> child_system_paths_with_data
+{
+    {keeper_api_version_path, toString(static_cast<uint8_t>(current_keeper_api_version))}
+};
+
+}
--- a/src/Coordination/KeeperContext.h
+++ b/src/Coordination/KeeperContext.h
@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+namespace DB
+{
+
+/// Settings and state that KeeperServer creates once and hands to its state machine
+/// through a shared pointer.
+struct KeeperContext
+{
+    enum class Phase : uint8_t
+    {
+        INIT,
+        RUNNING
+    };
+
+    /// Starts as INIT; KeeperServer::startup() switches it to RUNNING after launching the Raft server.
+    Phase server_state{Phase::INIT};
+
+    /// The defaults below are the in-class defaults only: KeeperServer overwrites both fields
+    /// from the "keeper_server.ignore_system_path_on_startup" and "keeper_server.digest_enabled"
+    /// config keys, each of which falls back to false when absent.
+    bool ignore_system_path_on_startup{false};
+    bool digest_enabled{true};
+};
+
+using KeeperContextPtr = std::shared_ptr<KeeperContext>;
+
+}
--- a/src/Coordination/KeeperServer.cpp
+++ b/src/Coordination/KeeperServer.cpp
@ -106,20 +106,31 @@ KeeperServer::KeeperServer(
    SnapshotsQueue & snapshots_queue_)
    : server_id(configuration_and_settings_->server_id)
    , coordination_settings(configuration_and_settings_->coordination_settings)
-    , state_machine(nuraft::cs_new<KeeperStateMachine>(
-          responses_queue_,
-          snapshots_queue_,
-          configuration_and_settings_->snapshot_storage_path,
-          coordination_settings,
-          checkAndGetSuperdigest(configuration_and_settings_->super_digest),
-          config.getBool("keeper_server.digest_enabled", false)))
-    , state_manager(nuraft::cs_new<KeeperStateManager>(
-          server_id, "keeper_server", configuration_and_settings_->log_storage_path, configuration_and_settings_->state_file_path, config, coordination_settings))
    , log(&Poco::Logger::get("KeeperServer"))
    , is_recovering(config.has("keeper_server.force_recovery") && config.getBool("keeper_server.force_recovery"))
+    , keeper_context{std::make_shared<KeeperContext>()}
 {
    if (coordination_settings->quorum_reads)
        LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower.");
+
+    keeper_context->digest_enabled = config.getBool("keeper_server.digest_enabled", false);
+    keeper_context->ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false);
+
+    state_machine = nuraft::cs_new<KeeperStateMachine>(
+        responses_queue_,
+        snapshots_queue_,
+        configuration_and_settings_->snapshot_storage_path,
+        coordination_settings,
+        keeper_context,
+        checkAndGetSuperdigest(configuration_and_settings_->super_digest));
+
+    state_manager = nuraft::cs_new<KeeperStateManager>(
+        server_id,
+        "keeper_server",
+        configuration_and_settings_->log_storage_path,
+        configuration_and_settings_->state_file_path,
+        config,
+        coordination_settings);
 }

 /**
@ -341,6 +352,8 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
    last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config;

    launchRaftServer(enable_ipv6);
+
+    keeper_context->server_state = KeeperContext::Phase::RUNNING;
 }

 void KeeperServer::shutdownRaftServer()
--- a/src/Coordination/KeeperServer.h
+++ b/src/Coordination/KeeperServer.h
@ -9,6 +9,7 @@
 #include <libnuraft/raft_server.hxx>
 #include <Poco/Util/AbstractConfiguration.h>
 #include <Coordination/Keeper4LWInfo.h>
+#include <Coordination/KeeperContext.h>

 namespace DB
 {
@ -61,6 +62,8 @@ private:

    std::atomic_bool is_recovering = false;

+    std::shared_ptr<KeeperContext> keeper_context;
+
 public:
    KeeperServer(
        const KeeperConfigurationAndSettingsPtr & settings_,
--- a/src/Coordination/KeeperSnapshotManager.cpp
+++ b/src/Coordination/KeeperSnapshotManager.cpp
@ -13,6 +13,8 @@
 #include <filesystem>
 #include <memory>
 #include <Common/logger_useful.h>
+#include "Coordination/KeeperContext.h"
+#include <Coordination/KeeperConstants.h>

 namespace DB
 {
@ -144,8 +146,34 @@ namespace
    }
 }

+namespace
+{

-void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out)
+/// Outcome of comparing a node path against a reference path.
+enum class PathMatchResult
+{
+    NOT_MATCH,
+    EXACT,
+    IS_CHILD
+};
+
+/// Tells whether `path` is the same node as `match_to` (EXACT), lies somewhere below it
+/// (IS_CHILD), or is unrelated (NOT_MATCH). A sibling that merely shares a textual prefix,
+/// such as "/keeperx" against "/keeper", is NOT_MATCH.
+/// A single trailing '/' on either argument is ignored, so the root path "/" works as
+/// `match_to` and "/keeper/" is treated the same as "/keeper".
+PathMatchResult matchPath(std::string_view path, std::string_view match_to)
+{
+    using enum PathMatchResult;
+
+    /// Normalize both sides; without this "/a" is not recognized as a child of "/",
+    /// and "/keeper/" would be reported as a child of "/keeper".
+    if (path.ends_with('/'))
+        path.remove_suffix(1);
+
+    if (match_to.ends_with('/'))
+        match_to.remove_suffix(1);
+
+    auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end());
+
+    /// `match_to` was not consumed completely, so it is not a prefix of `path`.
+    if (second_it != match_to.end())
+        return NOT_MATCH;
+
+    if (first_it == path.end())
+        return EXACT;
+
+    /// The remainder must start a new path component; otherwise the prefix ended mid-name.
+    return *first_it == '/' ? IS_CHILD : NOT_MATCH;
+}
+
+}
+
+void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
 {
    writeBinary(static_cast<uint8_t>(snapshot.version), out);
    serializeSnapshotMetadata(snapshot.snapshot_meta, out);
@ -153,7 +181,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
    if (snapshot.version >= SnapshotVersion::V5)
    {
        writeBinary(snapshot.zxid, out);
-        if (snapshot.storage->digest_enabled)
+        if (keeper_context->digest_enabled)
        {
            writeBinary(static_cast<uint8_t>(KeeperStorage::CURRENT_DIGEST_VERSION), out);
            writeBinary(snapshot.nodes_digest, out);
@ -182,12 +210,21 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
    }

    /// Serialize data tree
-    writeBinary(snapshot.snapshot_container_size, out);
+    writeBinary(snapshot.snapshot_container_size - child_system_paths_with_data.size(), out);
    size_t counter = 0;
    for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++counter)
    {
        const auto & path = it->key;
+
+        // write only the root system path because of digest
+        if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD)
+        {
+            ++it;
+            continue;
+        }
+
        const auto & node = it->value;
+
        /// Benign race condition possible while taking snapshot: NuRaft decide to create snapshot at some log id
        /// and only after some time we lock storage and enable snapshot mode. So snapshot_container_size can be
        /// slightly bigger than required.
@ -241,7 +278,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
    }
 }

-void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in)
+void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context)
 {
    uint8_t version;
    readBinary(version, in);
@ -252,7 +289,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
    deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in);
    KeeperStorage & storage = *deserialization_result.storage;

-    bool recalculate_digest = storage.digest_enabled;
+    bool recalculate_digest = keeper_context->digest_enabled;
    if (version >= SnapshotVersion::V5)
    {
        readBinary(storage.zxid, in);
@ -316,19 +353,55 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
    if (recalculate_digest)
        storage.nodes_digest = 0;

-    size_t current_size = 0;
-    while (current_size < snapshot_container_size)
+    const auto is_node_empty = [](const auto & node)
+    {
+        return node.getData().empty() && node.stat == Coordination::Stat{};
+    };
+
+    for (size_t nodes_read = 0; nodes_read < snapshot_container_size; ++nodes_read)
    {
        std::string path;
        readBinary(path, in);
        KeeperStorage::Node node{};
        readNode(node, in, current_version, storage.acl_map);
+
+        using enum PathMatchResult;
+        auto match_result = matchPath(path, keeper_system_path);
+
+        const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path);
+        if (match_result == IS_CHILD)
+        {
+            if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT)
+            {
+                LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg);
+                continue;
+            }
+            else
+                throw Exception(
+                    ErrorCodes::LOGICAL_ERROR,
+                    "{}. Ignoring it can lead to data loss. "
+                    "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true",
+                    error_msg);
+        }
+        else if (match_result == EXACT && !is_node_empty(node))
+        {
+            if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT)
+            {
+                LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg);
+                node = KeeperStorage::Node{};
+            }
+            else
+                throw Exception(
+                    ErrorCodes::LOGICAL_ERROR,
+                    "{}. Ignoring it can lead to data loss. "
+                    "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true",
+                    error_msg);
+        }
+
        storage.container.insertOrReplace(path, node);
        if (node.stat.ephemeralOwner != 0)
            storage.ephemerals[node.stat.ephemeralOwner].insert(path);

-        current_size++;
-
        if (recalculate_digest)
            storage.nodes_digest += node.getDigest(path);
    }
@ -451,16 +524,16 @@ KeeperStorageSnapshot::~KeeperStorageSnapshot()
 KeeperSnapshotManager::KeeperSnapshotManager(
    const std::string & snapshots_path_,
    size_t snapshots_to_keep_,
+    const KeeperContextPtr & keeper_context_,
    bool compress_snapshots_zstd_,
    const std::string & superdigest_,
-    size_t storage_tick_time_,
-    const bool digest_enabled_)
+    size_t storage_tick_time_)
    : snapshots_path(snapshots_path_)
    , snapshots_to_keep(snapshots_to_keep_)
    , compress_snapshots_zstd(compress_snapshots_zstd_)
    , superdigest(superdigest_)
    , storage_tick_time(storage_tick_time_)
-    , digest_enabled(digest_enabled_)
+    , keeper_context(keeper_context_)
 {
    namespace fs = std::filesystem;

@ -554,7 +627,7 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::serializeSnapshotToBuffer(con
    else
        compressed_writer = std::make_unique<CompressedWriteBuffer>(*writer);

-    KeeperStorageSnapshot::serialize(snapshot, *compressed_writer);
+    KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context);
    compressed_writer->finalize();
    return buffer_raw_ptr->getBuffer();
 }
@ -583,8 +656,9 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff
        compressed_reader = std::make_unique<CompressedReadBuffer>(*reader);

    SnapshotDeserializationResult result;
-    result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest, digest_enabled);
-    KeeperStorageSnapshot::deserialize(result, *compressed_reader);
+    result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest, keeper_context, /* initialize_system_nodes */ false);
+    KeeperStorageSnapshot::deserialize(result, *compressed_reader, keeper_context);
+    result.storage->initializeSystemNodes();
    return result;
 }

@ -629,7 +703,7 @@ std::pair<std::string, std::error_code> KeeperSnapshotManager::serializeSnapshot
    else
        compressed_writer = std::make_unique<CompressedWriteBuffer>(*writer);

-    KeeperStorageSnapshot::serialize(snapshot, *compressed_writer);
+    KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context);
    compressed_writer->finalize();
    compressed_writer->sync();

--- a/src/Coordination/KeeperSnapshotManager.h
+++ b/src/Coordination/KeeperSnapshotManager.h
@ -5,6 +5,7 @@
 #include <IO/ReadBuffer.h>
 #include <IO/WriteBuffer.h>
 #include <libnuraft/nuraft.hxx>
+#include <Coordination/KeeperContext.h>

 namespace DB
 {
@ -55,9 +56,9 @@ public:

    ~KeeperStorageSnapshot();

-    static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out);
+    static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context);

-    static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in);
+    static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context);

    KeeperStorage * storage;

@ -99,10 +100,10 @@ public:
    KeeperSnapshotManager(
        const std::string & snapshots_path_,
        size_t snapshots_to_keep_,
+        const KeeperContextPtr & keeper_context_,
        bool compress_snapshots_zstd_ = true,
        const std::string & superdigest_ = "",
-        size_t storage_tick_time_ = 500,
-        bool digest_enabled_ = true);
+        size_t storage_tick_time_ = 500);

    /// Restore storage from latest available snapshot
    SnapshotDeserializationResult restoreFromLatestSnapshot();
@ -168,7 +169,8 @@ private:
    const std::string superdigest;
    /// Storage sessions timeout check interval (also for deserializatopn)
    size_t storage_tick_time;
-    const bool digest_enabled;
+
+    KeeperContextPtr keeper_context;
 };

 /// Keeper create snapshots in background thread. KeeperStateMachine just create
--- a/src/Coordination/KeeperStateMachine.cpp
+++ b/src/Coordination/KeeperStateMachine.cpp
@ -41,22 +41,22 @@ KeeperStateMachine::KeeperStateMachine(
    SnapshotsQueue & snapshots_queue_,
    const std::string & snapshots_path_,
    const CoordinationSettingsPtr & coordination_settings_,
-    const std::string & superdigest_,
-    const bool digest_enabled_)
+    const KeeperContextPtr & keeper_context_,
+    const std::string & superdigest_)
    : coordination_settings(coordination_settings_)
    , snapshot_manager(
          snapshots_path_,
          coordination_settings->snapshots_to_keep,
+          keeper_context_,
          coordination_settings->compress_snapshots_with_zstd_format,
          superdigest_,
-          coordination_settings->dead_session_check_period_ms.totalMilliseconds(),
-          digest_enabled_)
+          coordination_settings->dead_session_check_period_ms.totalMilliseconds())
    , responses_queue(responses_queue_)
    , snapshots_queue(snapshots_queue_)
    , last_committed_idx(0)
    , log(&Poco::Logger::get("KeeperStateMachine"))
    , superdigest(superdigest_)
-    , digest_enabled(digest_enabled_)
+    , keeper_context(keeper_context_)
 {
 }

@ -109,7 +109,7 @@ void KeeperStateMachine::init()

    if (!storage)
        storage = std::make_unique<KeeperStorage>(
-            coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, digest_enabled);
+            coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context);
 }

 namespace
@ -204,7 +204,7 @@ void KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
        true /* check_acl */,
        request_for_session.digest);

-    if (digest_enabled && request_for_session.digest)
+    if (keeper_context->digest_enabled && request_for_session.digest)
        assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false);
 }

@ -253,10 +253,8 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
                    response_for_session.session_id);
            }

-        if (digest_enabled && request_for_session.digest)
-        {
+        if (keeper_context->digest_enabled && request_for_session.digest)
            assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true);
-        }
    }

    ProfileEvents::increment(ProfileEvents::KeeperCommits);
--- a/src/Coordination/KeeperStateMachine.h
+++ b/src/Coordination/KeeperStateMachine.h
@ -6,6 +6,7 @@
 #include <libnuraft/nuraft.hxx>
 #include <Common/ConcurrentBoundedQueue.h>
 #include <Common/logger_useful.h>
+#include <Coordination/KeeperContext.h>


 namespace DB
@ -24,8 +25,8 @@ public:
        SnapshotsQueue & snapshots_queue_,
        const std::string & snapshots_path_,
        const CoordinationSettingsPtr & coordination_settings_,
-        const std::string & superdigest_ = "",
-        bool digest_enabled_ = true);
+        const KeeperContextPtr & keeper_context_,
+        const std::string & superdigest_ = "");

    /// Read state from the latest snapshot
    void init();
@ -140,7 +141,7 @@ private:
    /// Special part of ACL system -- superdigest specified in server config.
    const std::string superdigest;

-    const bool digest_enabled;
+    KeeperContextPtr keeper_context;
 };

 }
--- a/src/Coordination/KeeperStorage.cpp
+++ b/src/Coordination/KeeperStorage.cpp
@ -15,6 +15,7 @@
 #include <Common/logger_useful.h>
 #include <Common/setThreadName.h>
 #include <Coordination/pathUtils.h>
+#include <Coordination/KeeperConstants.h>
 #include <sstream>
 #include <iomanip>
 #include <mutex>
@ -226,12 +227,67 @@ void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other)
    cached_digest = other.cached_digest;
 }

-KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const bool digest_enabled_)
-    : session_expiry_queue(tick_time_ms), digest_enabled(digest_enabled_), superdigest(superdigest_)
+KeeperStorage::KeeperStorage(
+    int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, const bool initialize_system_nodes)
+    : session_expiry_queue(tick_time_ms), keeper_context(keeper_context_), superdigest(superdigest_)
 {
    Node root_node;
    container.insert("/", root_node);
-    nodes_digest += root_node.getDigest("/");
+    addDigest(root_node, "/");
+
+    if (initialize_system_nodes)
+        initializeSystemNodes();
+}
+
+/// Inserts the internal Keeper nodes into the container: the root system node
+/// (`keeper_system_path`) if it is not there yet, and every entry of
+/// `child_system_paths_with_data` below it.
+///
+/// Must be called exactly once per storage; a second call throws LOGICAL_ERROR.
+/// On success `initialized` is set, which preprocessRequest/processRequest require.
+void KeeperStorage::initializeSystemNodes()
+{
+    if (initialized)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes initialized twice");
+
+    // insert root system path if it isn't already inserted
+    if (container.find(keeper_system_path) == container.end())
+    {
+        Node system_node;
+        container.insert(keeper_system_path, system_node);
+        // store digest for the empty node because we won't update
+        // its stats
+        addDigest(system_node, keeper_system_path);
+
+        // update root and the digest based on it
+        auto current_root_it = container.find("/");
+        assert(current_root_it != container.end());
+        // the root's stat changes below, so its old digest contribution has to be
+        // removed first and the new one added after the update
+        removeDigest(current_root_it->value, "/");
+        auto updated_root_it = container.updateValue(
+            "/",
+            [](auto & node)
+            {
+                ++node.stat.numChildren;
+                node.addChild(getBaseName(keeper_system_path));
+            }
+        );
+        addDigest(updated_root_it->value, "/");
+    }
+
+    // insert child system nodes
+    for (const auto & [path, data] : child_system_paths_with_data)
+    {
+        // every child system node has to live below the root system path;
+        // check the path actually being inserted, not a fixed constant
+        assert(path.starts_with(keeper_system_path));
+        Node child_system_node;
+        child_system_node.setData(data);
+        auto [map_key, _] = container.insert(std::string{path}, child_system_node);
+        /// Take child path from key owned by map.
+        auto child_path = getBaseName(map_key->getKey());
+        container.updateValue(
+            parentPath(StringRef(path)),
+            [child_path](auto & parent)
+            {
+                // don't update stats so digest is okay
+                parent.addChild(child_path);
+            }
+        );
+    }
+
+    initialized = true;
+}

 template <class... Ts>
@ -610,7 +666,7 @@ struct KeeperStorageRequestProcessor
    explicit KeeperStorageRequestProcessor(const Coordination::ZooKeeperRequestPtr & zk_request_) : zk_request(zk_request_) { }
    virtual Coordination::ZooKeeperResponsePtr process(KeeperStorage & storage, int64_t zxid) const = 0;
    virtual std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & /*storage*/, int64_t /*zxid*/, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const
+    preprocess(KeeperStorage & /*storage*/, int64_t /*zxid*/, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const
    {
        return {};
    }
@ -658,21 +714,34 @@ struct KeeperStorageSyncRequestProcessor final : public KeeperStorageRequestProc
 namespace
 {

-    Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_local)
+Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_local)
+{
+    if (is_local)
    {
-        if (is_local)
-        {
-            auto node_it = storage.container.find(path);
-            if (node_it == storage.container.end())
-                return {};
+        auto node_it = storage.container.find(path);
+        if (node_it == storage.container.end())
+            return {};

-            return storage.acl_map.convertNumber(node_it->value.acl_id);
-        }
-
-        return storage.uncommitted_state.getACLs(path);
+        return storage.acl_map.convertNumber(node_it->value.acl_id);
    }

+    return storage.uncommitted_state.getACLs(path);
 }
+
+/// Reacts to a request that tries to modify a node inside the internal Keeper path.
+///
+/// While the server is still in the INIT phase and 'ignore_system_path_on_startup'
+/// is not set, the attempt is fatal and a LOGICAL_ERROR exception is thrown.
+/// In every other case the attempt is only logged and the caller decides what to
+/// return to the client.
+///
+/// `error_msg` is a ready-made description of the attempt; callers build it from
+/// the requested path, so it must be treated as data and never as a format string.
+void handleSystemNodeModification(const KeeperContext & keeper_context, std::string_view error_msg)
+{
+    if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup)
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR,
+            "{}. Ignoring it can lead to data loss. "
+            "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.",
+            error_msg);
+
+    /// Log through a constant format string: the message embeds a client-supplied path,
+    /// and a '{' or '}' in it would otherwise be parsed as a replacement field.
+    LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "{}", error_msg);
+}
+
+}
+
 bool KeeperStorage::checkACL(StringRef path, int32_t permission, int64_t session_id, bool is_local)
 {
    const auto node_acls = getNodeACLs(*this, path, is_local);
@ -726,7 +795,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
    }

    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override
    {
        Coordination::ZooKeeperCreateRequest & request = dynamic_cast<Coordination::ZooKeeperCreateRequest &>(*zk_request);

@ -752,6 +821,14 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
            path_created += seq_num_str.str();
        }

+        if (path_created.starts_with(keeper_system_path))
+        {
+            auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created);
+
+            handleSystemNodeModification(keeper_context, error_msg);
+            return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
+        }
+
        if (storage.uncommitted_state.getNode(path_created))
            return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}};

@ -839,10 +916,13 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce
    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;

    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
    {
        Coordination::ZooKeeperGetRequest & request = dynamic_cast<Coordination::ZooKeeperGetRequest &>(*zk_request);

+        if (request.path == Coordination::keeper_api_version_path)
+            return {};
+
        if (!storage.uncommitted_state.getNode(request.path))
            return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};

@ -905,12 +985,20 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr

    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & digest) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & digest, const KeeperContext & keeper_context) const override
    {
        Coordination::ZooKeeperRemoveRequest & request = dynamic_cast<Coordination::ZooKeeperRemoveRequest &>(*zk_request);

        std::vector<KeeperStorage::Delta> new_deltas;

+        if (request.path.starts_with(keeper_system_path))
+        {
+            auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path);
+
+            handleSystemNodeModification(keeper_context, error_msg);
+            return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
+        }
+
        const auto update_parent_pzxid = [&]()
        {
            auto parent_path = parentPath(request.path);
@ -987,7 +1075,7 @@ struct KeeperStorageExistsRequestProcessor final : public KeeperStorageRequestPr
    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;

    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
    {
        Coordination::ZooKeeperExistsRequest & request = dynamic_cast<Coordination::ZooKeeperExistsRequest &>(*zk_request);

@ -1051,12 +1139,20 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce

    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t time, uint64_t & digest) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override
    {
        Coordination::ZooKeeperSetRequest & request = dynamic_cast<Coordination::ZooKeeperSetRequest &>(*zk_request);

        std::vector<KeeperStorage::Delta> new_deltas;

+        if (request.path.starts_with(keeper_system_path))
+        {
+            auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);
+
+            handleSystemNodeModification(keeper_context, error_msg);
+            return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
+        }
+
        if (!storage.uncommitted_state.getNode(request.path))
            return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};

@ -1135,7 +1231,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc

    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
    {
        Coordination::ZooKeeperListRequest & request = dynamic_cast<Coordination::ZooKeeperListRequest &>(*zk_request);

@ -1187,7 +1283,9 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc

                auto list_request_type = ALL;
                if (auto * filtered_list = dynamic_cast<Coordination::ZooKeeperFilteredListRequest *>(&request))
+                {
                    list_request_type = filtered_list->list_request_type;
+                }

                if (list_request_type == ALL)
                    return true;
@ -1234,7 +1332,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro

    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
    {
        Coordination::ZooKeeperCheckRequest & request = dynamic_cast<Coordination::ZooKeeperCheckRequest &>(*zk_request);

@ -1312,10 +1410,18 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr
    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;

    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & digest) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & digest, const KeeperContext & keeper_context) const override
    {
        Coordination::ZooKeeperSetACLRequest & request = dynamic_cast<Coordination::ZooKeeperSetACLRequest &>(*zk_request);

+        if (request.path.starts_with(keeper_system_path))
+        {
+            auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);
+
+            handleSystemNodeModification(keeper_context, error_msg);
+            return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
+        }
+
        auto & uncommitted_state = storage.uncommitted_state;
        if (!uncommitted_state.getNode(request.path))
            return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}};
@ -1386,7 +1492,7 @@ struct KeeperStorageGetACLRequestProcessor final : public KeeperStorageRequestPr
    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;

    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
    {
        Coordination::ZooKeeperGetACLRequest & request = dynamic_cast<Coordination::ZooKeeperGetACLRequest &>(*zk_request);

@ -1483,14 +1589,14 @@ struct KeeperStorageMultiRequestProcessor final : public KeeperStorageRequestPro
    }

    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override
    {
        std::vector<Coordination::Error> response_errors;
        response_errors.reserve(concrete_requests.size());
        uint64_t current_digest = digest;
        for (size_t i = 0; i < concrete_requests.size(); ++i)
        {
-            auto new_deltas = concrete_requests[i]->preprocess(storage, zxid, session_id, time, current_digest);
+            auto new_deltas = concrete_requests[i]->preprocess(storage, zxid, session_id, time, current_digest, keeper_context);

            if (!new_deltas.empty())
            {
@ -1609,7 +1715,7 @@ struct KeeperStorageAuthRequestProcessor final : public KeeperStorageRequestProc
 {
    using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor;
    std::vector<KeeperStorage::Delta>
-    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & /*digest*/) const override
+    preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override
    {
        Coordination::ZooKeeperAuthRequest & auth_request = dynamic_cast<Coordination::ZooKeeperAuthRequest &>(*zk_request);
        Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse();
@ -1725,7 +1831,7 @@ KeeperStorageRequestProcessorsFactory::KeeperStorageRequestProcessorsFactory()

 UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vector<Delta> & new_deltas) const
 {
-    if (!digest_enabled)
+    if (!keeper_context->digest_enabled)
        return current_digest;

    std::unordered_map<std::string, std::shared_ptr<Node>> updated_nodes;
@ -1792,6 +1898,9 @@ void KeeperStorage::preprocessRequest(
    bool check_acl,
    std::optional<Digest> digest)
 {
+    if (!initialized)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes are not initialized");
+
    int64_t last_zxid = getNextZXID() - 1;

    if (uncommitted_transactions.empty())
@ -1816,7 +1925,7 @@ void KeeperStorage::preprocessRequest(
    TransactionInfo transaction{.zxid = new_last_zxid};
    uint64_t new_digest = getNodesDigest(false).value;
    SCOPE_EXIT({
-        if (digest_enabled)
+        if (keeper_context->digest_enabled)
            // if the version of digest we got from the leader is the same as the one this instances has, we can simply copy the value
            // and just check the digest on the commit
            // a mistake can happen while applying the changes to the uncommitted_state so for now let's just recalculate the digest here also
@ -1867,7 +1976,7 @@ void KeeperStorage::preprocessRequest(
        return;
    }

-    new_deltas = request_processor->preprocess(*this, transaction.zxid, session_id, time, new_digest);
+    new_deltas = request_processor->preprocess(*this, transaction.zxid, session_id, time, new_digest, *keeper_context);
 }

 KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(
@ -1877,6 +1986,9 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(
    bool check_acl,
    bool is_local)
 {
+    if (!initialized)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes are not initialized");
+
    if (new_last_zxid)
    {
        if (uncommitted_transactions.empty())
@ -1968,8 +2080,10 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(
        {
            if (response->error == Coordination::Error::ZOK)
            {
-                auto & watches_type
-                    = zk_request->getOpNum() == Coordination::OpNum::List || zk_request->getOpNum() == Coordination::OpNum::SimpleList
+                static constexpr std::array list_requests{
+                    Coordination::OpNum::List, Coordination::OpNum::SimpleList, Coordination::OpNum::FilteredList};
+
+                auto & watches_type = std::find(list_requests.begin(), list_requests.end(), zk_request->getOpNum()) != list_requests.end()
                    ? list_watches
                    : watches;

@ -2011,7 +2125,7 @@ void KeeperStorage::rollbackRequest(int64_t rollback_zxid)

 KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const
 {
-    if (!digest_enabled)
+    if (!keeper_context->digest_enabled)
        return {.version = DigestVersion::NO_DIGEST};

    if (committed || uncommitted_transactions.empty())
@ -2022,13 +2136,13 @@ KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const

 void KeeperStorage::removeDigest(const Node & node, const std::string_view path)
 {
-    if (digest_enabled)
+    if (keeper_context->digest_enabled)
        nodes_digest -= node.getDigest(path);
 }

 void KeeperStorage::addDigest(const Node & node, const std::string_view path)
 {
-    if (digest_enabled)
+    if (keeper_context->digest_enabled)
    {
        node.invalidateDigestCache();
        nodes_digest += node.getDigest(path);
--- a/src/Coordination/KeeperStorage.h
+++ b/src/Coordination/KeeperStorage.h
@ -9,6 +9,7 @@
 #include <Common/ConcurrentBoundedQueue.h>
 #include <Common/ZooKeeper/IKeeper.h>
 #include <Common/ZooKeeper/ZooKeeperCommon.h>
+#include <Coordination/KeeperContext.h>

 #include <absl/container/flat_hash_set.h>

@ -336,11 +337,15 @@ public:

    Digest getNodesDigest(bool committed) const;

-    const bool digest_enabled;
+    KeeperContextPtr keeper_context;

    const String superdigest;

-    KeeperStorage(int64_t tick_time_ms, const String & superdigest_, bool digest_enabled_);
+    bool initialized{false};
+
+    KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, bool initialize_system_nodes = true);
+
+    void initializeSystemNodes();

    /// Allocate new session id with the specified timeouts
    int64_t getSessionID(int64_t session_timeout_ms)
--- a/src/Coordination/tests/gtest_coordination.cpp
+++ b/src/Coordination/tests/gtest_coordination.cpp
@ -2,6 +2,7 @@
 #include <gtest/gtest.h>
 #include "Common/ZooKeeper/IKeeper.h"

+#include "Coordination/KeeperContext.h"
 #include "Coordination/KeeperStorage.h"
 #include "Core/Defines.h"
 #include "IO/WriteHelpers.h"
@ -63,7 +64,10 @@ struct CompressionParam
 };

 class CoordinationTest : public ::testing::TestWithParam<CompressionParam>
-{};
+{
+protected:
+    DB::KeeperContextPtr keeper_context = std::make_shared<DB::KeeperContext>();
+};

 TEST_P(CoordinationTest, BuildTest)
 {
@ -1083,9 +1087,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)
 {
    auto params = GetParam();
    ChangelogDirTest test("./snapshots");
-    DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
+    DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);

-    DB::KeeperStorage storage(500, "", true);
+    DB::KeeperStorage storage(500, "", keeper_context);
    addNode(storage, "/hello", "world", 1);
    addNode(storage, "/hello/somepath", "somedata", 3);
    storage.session_id_counter = 5;
@ -1099,7 +1103,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)

    EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 2);
    EXPECT_EQ(snapshot.session_id, 7);
-    EXPECT_EQ(snapshot.snapshot_container_size, 3);
+    EXPECT_EQ(snapshot.snapshot_container_size, 5);
    EXPECT_EQ(snapshot.session_and_timeout.size(), 2);

    auto buf = manager.serializeSnapshotToBuffer(snapshot);
@ -1111,8 +1115,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)

    auto [restored_storage, snapshot_meta, _] = manager.deserializeSnapshotFromBuffer(debuf);

-    EXPECT_EQ(restored_storage->container.size(), 3);
-    EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1);
+    EXPECT_EQ(restored_storage->container.size(), 5);
+    EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2);
    EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1);
    EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0);

@ -1131,9 +1135,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)
 {
    auto params = GetParam();
    ChangelogDirTest test("./snapshots");
-    DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
+    DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);

-    DB::KeeperStorage storage(500, "", true);
+    DB::KeeperStorage storage(500, "", keeper_context);
    storage.getSessionID(130);

    for (size_t i = 0; i < 50; ++i)
@ -1143,14 +1147,14 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)

    DB::KeeperStorageSnapshot snapshot(&storage, 50);
    EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 50);
-    EXPECT_EQ(snapshot.snapshot_container_size, 51);
+    EXPECT_EQ(snapshot.snapshot_container_size, 53);

    for (size_t i = 50; i < 100; ++i)
    {
        addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
    }

-    EXPECT_EQ(storage.container.size(), 101);
+    EXPECT_EQ(storage.container.size(), 103);

    auto buf = manager.serializeSnapshotToBuffer(snapshot);
    manager.serializeSnapshotBufferToDisk(*buf, 50);
@ -1160,7 +1164,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)
    auto debuf = manager.deserializeSnapshotBufferFromDisk(50);
    auto [restored_storage, meta, _] = manager.deserializeSnapshotFromBuffer(debuf);

-    EXPECT_EQ(restored_storage->container.size(), 51);
+    EXPECT_EQ(restored_storage->container.size(), 53);
    for (size_t i = 0; i < 50; ++i)
    {
        EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i));
@ -1172,9 +1176,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots)
 {
    auto params = GetParam();
    ChangelogDirTest test("./snapshots");
-    DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
+    DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);

-    DB::KeeperStorage storage(500, "", true);
+    DB::KeeperStorage storage(500, "", keeper_context);
    storage.getSessionID(130);

    for (size_t j = 1; j <= 5; ++j)
@ -1199,7 +1203,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots)

    auto [restored_storage, meta, _] = manager.restoreFromLatestSnapshot();

-    EXPECT_EQ(restored_storage->container.size(), 251);
+    EXPECT_EQ(restored_storage->container.size(), 253);

    for (size_t i = 0; i < 250; ++i)
    {
@ -1211,8 +1215,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode)
 {
    auto params = GetParam();
    ChangelogDirTest test("./snapshots");
-    DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
-    DB::KeeperStorage storage(500, "", true);
+    DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
+    DB::KeeperStorage storage(500, "", keeper_context);
    for (size_t i = 0; i < 50; ++i)
    {
        addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
@ -1233,16 +1237,16 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode)
            if (i % 2 == 0)
                storage.container.erase("/hello_" + std::to_string(i));
        }
-        EXPECT_EQ(storage.container.size(), 26);
-        EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 102);
+        EXPECT_EQ(storage.container.size(), 28);
+        EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 104);
        EXPECT_EQ(storage.container.snapshotSizeWithVersion().second, 1);
        auto buf = manager.serializeSnapshotToBuffer(snapshot);
        manager.serializeSnapshotBufferToDisk(*buf, 50);
    }
    EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin" + params.extension));
-    EXPECT_EQ(storage.container.size(), 26);
+    EXPECT_EQ(storage.container.size(), 28);
    storage.clearGarbageAfterSnapshot();
-    EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 26);
+    EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 28);
    for (size_t i = 0; i < 50; ++i)
    {
        if (i % 2 != 0)
@ -1264,8 +1268,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotBroken)
 {
    auto params = GetParam();
    ChangelogDirTest test("./snapshots");
-    DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
-    DB::KeeperStorage storage(500, "", true);
+    DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);
+    DB::KeeperStorage storage(500, "", keeper_context);
    for (size_t i = 0; i < 50; ++i)
    {
        addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
@ -1304,7 +1308,7 @@ nuraft::ptr<nuraft::log_entry> getLogEntryFromZKRequest(size_t term, int64_t ses
    return nuraft::cs_new<nuraft::log_entry>(term, buffer);
 }

-void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression)
+void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression, Coordination::KeeperContextPtr keeper_context)
 {
    using namespace Coordination;
    using namespace DB;
@ -1314,7 +1318,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint

    ResponsesQueue queue(std::numeric_limits<size_t>::max());
    SnapshotsQueue snapshots_queue{1};
-    auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
+    auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings, keeper_context);
    state_machine->init();
    DB::KeeperLogStore changelog("./logs", settings->rotate_log_storage_interval, true, enable_compression);
    changelog.init(state_machine->last_commit_index() + 1, settings->reserved_log_items);
@ -1355,7 +1359,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
    }

    SnapshotsQueue snapshots_queue1{1};
-    auto restore_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue1, "./snapshots", settings);
+    auto restore_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue1, "./snapshots", settings, keeper_context);
    restore_machine->init();
    EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance);

@ -1397,63 +1401,63 @@ TEST_P(CoordinationTest, TestStateMachineAndLogStore)
        settings->snapshot_distance = 10;
        settings->reserved_log_items = 10;
        settings->rotate_log_storage_interval = 10;
-        testLogAndStateMachine(settings, 37, params.enable_compression);
+        testLogAndStateMachine(settings, 37, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 10;
        settings->reserved_log_items = 10;
        settings->rotate_log_storage_interval = 10;
-        testLogAndStateMachine(settings, 11, params.enable_compression);
+        testLogAndStateMachine(settings, 11, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 10;
        settings->reserved_log_items = 10;
        settings->rotate_log_storage_interval = 10;
-        testLogAndStateMachine(settings, 40, params.enable_compression);
+        testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 10;
        settings->reserved_log_items = 20;
        settings->rotate_log_storage_interval = 30;
-        testLogAndStateMachine(settings, 40, params.enable_compression);
+        testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 10;
        settings->reserved_log_items = 0;
        settings->rotate_log_storage_interval = 10;
-        testLogAndStateMachine(settings, 40, params.enable_compression);
+        testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 1;
        settings->reserved_log_items = 1;
        settings->rotate_log_storage_interval = 32;
-        testLogAndStateMachine(settings, 32, params.enable_compression);
+        testLogAndStateMachine(settings, 32, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 10;
        settings->reserved_log_items = 7;
        settings->rotate_log_storage_interval = 1;
-        testLogAndStateMachine(settings, 33, params.enable_compression);
+        testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 37;
        settings->reserved_log_items = 1000;
        settings->rotate_log_storage_interval = 5000;
-        testLogAndStateMachine(settings, 33, params.enable_compression);
+        testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context);
    }
    {
        CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
        settings->snapshot_distance = 37;
        settings->reserved_log_items = 1000;
        settings->rotate_log_storage_interval = 5000;
-        testLogAndStateMachine(settings, 45, params.enable_compression);
+        testLogAndStateMachine(settings, 45, params.enable_compression, keeper_context);
    }
 }

@ -1467,7 +1471,7 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)

    ResponsesQueue queue(std::numeric_limits<size_t>::max());
    SnapshotsQueue snapshots_queue{1};
-    auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings);
+    auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, "./snapshots", settings, keeper_context);
    state_machine->init();

    std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>();
@ -1634,9 +1638,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions)
    auto params = GetParam();

    ChangelogDirTest test("./snapshots");
-    DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
+    DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);

-    DB::KeeperStorage storage(500, "", true);
+    DB::KeeperStorage storage(500, "", keeper_context);
    addNode(storage, "/hello", "world", 1);
    addNode(storage, "/hello/somepath", "somedata", 3);
    storage.session_id_counter = 5;
@ -1652,14 +1656,14 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions)
    manager.serializeSnapshotBufferToDisk(*buf, 2);
    EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin" + params.extension));

-    DB::KeeperSnapshotManager new_manager("./snapshots", 3, !params.enable_compression);
+    DB::KeeperSnapshotManager new_manager("./snapshots", 3, keeper_context, !params.enable_compression);

    auto debuf = new_manager.deserializeSnapshotBufferFromDisk(2);

    auto [restored_storage, snapshot_meta, _] = new_manager.deserializeSnapshotFromBuffer(debuf);

-    EXPECT_EQ(restored_storage->container.size(), 3);
-    EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1);
+    EXPECT_EQ(restored_storage->container.size(), 5);
+    EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2);
    EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1);
    EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0);

@ -1786,9 +1790,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotEqual)
    std::optional<UInt128> snapshot_hash;
    for (size_t i = 0; i < 15; ++i)
    {
-        DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
+        DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression);

-        DB::KeeperStorage storage(500, "", true);
+        DB::KeeperStorage storage(500, "", keeper_context);
        addNode(storage, "/hello", "");
        for (size_t j = 0; j < 5000; ++j)
        {
@ -1859,7 +1863,7 @@ TEST_P(CoordinationTest, TestUncommittedStateBasicCrud)
    using namespace DB;
    using namespace Coordination;

-    DB::KeeperStorage storage{500, "", true};
+    DB::KeeperStorage storage{500, "", keeper_context};

    constexpr std::string_view path = "/test";

@ -1976,32 +1980,35 @@ TEST_P(CoordinationTest, TestListRequestTypes)
    using namespace DB;
    using namespace Coordination;

-    KeeperStorage storage{500, "", true};
+    KeeperStorage storage{500, "", keeper_context};

    int64_t zxid = 0;

-    static constexpr std::string_view path = "/test";
+    static constexpr std::string_view test_path = "/list_request_type/node";

-    const auto create_path = [&](bool is_ephemeral)
+    const auto create_path = [&](const auto & path, bool is_ephemeral, bool is_sequential = true)
    {
        const auto create_request = std::make_shared<ZooKeeperCreateRequest>();
        int new_zxid = ++zxid;
        create_request->path = path;
-        create_request->is_sequential = true;
+        create_request->is_sequential = is_sequential;
        create_request->is_ephemeral = is_ephemeral;
        storage.preprocessRequest(create_request, 1, 0, new_zxid);
        auto responses = storage.processRequest(create_request, 1, new_zxid);

        EXPECT_GE(responses.size(), 1);
+        EXPECT_EQ(responses[0].response->error, Coordination::Error::ZOK) << "Failed to create " << path;
        const auto & create_response = dynamic_cast<ZooKeeperCreateResponse &>(*responses[0].response);
        return create_response.path_created;
    };

+    create_path(parentPath(StringRef{test_path}).toString(), false, false);
+
    static constexpr size_t persistent_num = 5;
    std::unordered_set<std::string> expected_persistent_children;
    for (size_t i = 0; i < persistent_num; ++i)
    {
-        expected_persistent_children.insert(getBaseName(create_path(false)).toString());
+        expected_persistent_children.insert(getBaseName(create_path(test_path, false)).toString());
    }
    ASSERT_EQ(expected_persistent_children.size(), persistent_num);

@ -2009,7 +2016,7 @@ TEST_P(CoordinationTest, TestListRequestTypes)
    std::unordered_set<std::string> expected_ephemeral_children;
    for (size_t i = 0; i < ephemeral_num; ++i)
    {
-        expected_ephemeral_children.insert(getBaseName(create_path(true)).toString());
+        expected_ephemeral_children.insert(getBaseName(create_path(test_path, true)).toString());
    }
    ASSERT_EQ(expected_ephemeral_children.size(), ephemeral_num);

@ -2017,7 +2024,7 @@ TEST_P(CoordinationTest, TestListRequestTypes)
    {
        const auto list_request = std::make_shared<ZooKeeperFilteredListRequest>();
        int new_zxid = ++zxid;
-        list_request->path = parentPath(StringRef{path}).toString();
+        list_request->path = parentPath(StringRef{test_path}).toString();
        list_request->list_request_type = list_request_type;
        storage.preprocessRequest(list_request, 1, 0, new_zxid);
        auto responses = storage.processRequest(list_request, 1, new_zxid);
@ -2120,6 +2127,20 @@ TEST_P(CoordinationTest, TestDurableState)
    }
 }

+TEST_P(CoordinationTest, TestCurrentApiVersion)
+{
+    using namespace Coordination;
+    KeeperStorage storage{500, "", keeper_context};
+    auto request = std::make_shared<ZooKeeperGetRequest>();
+    request->path = DB::keeper_api_version_path;
+    auto responses = storage.processRequest(request, 0, std::nullopt, true, true);
+    const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
+    uint8_t keeper_version{0};
+    DB::ReadBufferFromOwnString buf(get_response.data);
+    DB::readIntText(keeper_version, buf);
+    EXPECT_EQ(keeper_version, static_cast<uint8_t>(current_keeper_api_version));
+}
+
 INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
    CoordinationTest,
    ::testing::ValuesIn(std::initializer_list<CompressionParam>{
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -359,7 +359,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, max_bytes_in_join, 0, "Maximum size of the hash table for JOIN (in number of bytes in memory).", 0) \
    M(OverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \
    M(Bool, join_any_take_last_row, false, "When disabled (default) ANY JOIN will take the first found row for a key. When enabled, it will take the last row seen if there are multiple rows for the same key.", IMPORTANT) \
-    M(JoinAlgorithm, join_algorithm, JoinAlgorithm::HASH, "Specify join algorithm: 'auto', 'hash', 'partial_merge', 'prefer_partial_merge', 'parallel_hash'. 'auto' tries to change HashJoin to MergeJoin on the fly to avoid out of memory.", 0) \
+    M(JoinAlgorithm, join_algorithm, JoinAlgorithm::DEFAULT, "Specify join algorithm.", 0) \
    M(UInt64, default_max_bytes_in_join, 1000000000, "Maximum size of right-side table if limit is required but max_bytes_in_join is not set.", 0) \
    M(UInt64, partial_merge_join_left_table_buffer_bytes, 0, "If not 0 group left table blocks in bigger ones for left-side table in partial merge join. It uses up to 2x of specified memory per joining thread.", 0) \
    M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \
@ -464,6 +464,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(Bool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \
    M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \
    M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \
+    M(Bool, allow_experimental_lightweight_delete, false, "Enable lightweight DELETE mutations for mergetree tables. Work in progress", 0) \
    M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \
    M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \
    M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \
@ -697,9 +698,11 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \
    M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \
    M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \
-    M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \
-    M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \
-    M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \
+    M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \
+    M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format Protobuf", 0) \
+    M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format CapnProto", 0) \
+    M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \
+    M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \
    M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \
    M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \
    M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
--- a/src/Core/SettingsEnums.cpp
+++ b/src/Core/SettingsEnums.cpp
@ -31,7 +31,8 @@ IMPLEMENT_SETTING_ENUM(JoinStrictness, ErrorCodes::UNKNOWN_JOIN,


 IMPLEMENT_SETTING_MULTI_ENUM(JoinAlgorithm, ErrorCodes::UNKNOWN_JOIN,
-    {{"auto",                 JoinAlgorithm::AUTO},
+    {{"default",              JoinAlgorithm::DEFAULT},
+     {"auto",                 JoinAlgorithm::AUTO},
     {"hash",                 JoinAlgorithm::HASH},
     {"partial_merge",        JoinAlgorithm::PARTIAL_MERGE},
     {"prefer_partial_merge", JoinAlgorithm::PREFER_PARTIAL_MERGE},
--- a/src/Core/SettingsEnums.h
+++ b/src/Core/SettingsEnums.h
@ -38,7 +38,8 @@ DECLARE_SETTING_ENUM(JoinStrictness)

 enum class JoinAlgorithm
 {
-    AUTO = 0,
+    DEFAULT = 0,
+    AUTO,
    HASH,
    PARTIAL_MERGE,
    PREFER_PARTIAL_MERGE,
--- a/src/Core/SortDescription.h
+++ b/src/Core/SortDescription.h
@ -28,7 +28,9 @@ struct FillColumnDescription
    /// All missed values in range [FROM, TO) will be filled
    /// Range [FROM, TO) respects sorting direction
    Field fill_from;        /// Fill value >= FILL_FROM
+    DataTypePtr fill_from_type;
    Field fill_to;          /// Fill value + STEP < FILL_TO
+    DataTypePtr fill_to_type;
    Field fill_step;        /// Default = +1 or -1 according to direction
    std::optional<IntervalKind> step_kind;

--- a/src/Disks/DiskLocal.cpp
+++ b/src/Disks/DiskLocal.cpp
@ -68,6 +68,8 @@ static void loadDiskLocalConfig(const String & name,
            throw Exception("Disk path can not be empty. Disk " + name, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
        if (path.back() != '/')
            throw Exception("Disk path must end with /. Disk " + name, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
+        if (path == context->getPath())
+            throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Disk path ('{}') cannot be equal to <path>. Use <default> disk instead.", path);
    }

    bool has_space_ratio = config.has(config_prefix + ".keep_free_space_ratio");
--- a/src/Formats/CapnProtoUtils.cpp
+++ b/src/Formats/CapnProtoUtils.cpp
@ -29,6 +29,7 @@ namespace ErrorCodes
    extern const int UNKNOWN_EXCEPTION;
    extern const int INCORRECT_DATA;
    extern const int CAPN_PROTO_BAD_TYPE;
+    extern const int BAD_ARGUMENTS;
 }

 capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info)
@ -450,7 +451,7 @@ static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_
    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums");
 }

-static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type)
+static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields)
 {
    switch (capnp_type.which())
    {
@ -483,24 +484,44 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type)
        case capnp::schema::Type::LIST:
        {
            auto list_schema = capnp_type.asList();
-            auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType());
+            auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields);
+            if (!nested_type)
+                return nullptr;
            return std::make_shared<DataTypeArray>(nested_type);
        }
        case capnp::schema::Type::STRUCT:
        {
            auto struct_schema = capnp_type.asStruct();

+
+            if (struct_schema.getFields().size() == 0)
+            {
+                if (skip_unsupported_fields)
+                    return nullptr;
+                throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported");
+            }
+
            /// Check if it can be Nullable.
            if (checkIfStructIsNamedUnion(struct_schema))
            {
                auto fields = struct_schema.getUnionFields();
                if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid()))
+                {
+                    if (skip_unsupported_fields)
+                        return nullptr;
                    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported");
+                }
                auto value_type = fields[0].getType().isVoid() ? fields[1].getType() : fields[0].getType();
                if (value_type.isStruct() || value_type.isList())
+                {
+                    if (skip_unsupported_fields)
+                        return nullptr;
                    throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable");
+                }

-                auto nested_type = getDataTypeFromCapnProtoType(value_type);
+                auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields);
+                if (!nested_type)
+                    return nullptr;
                return std::make_shared<DataTypeNullable>(nested_type);
            }

@ -512,17 +533,26 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type)
            Names nested_names;
            for (auto field : struct_schema.getNonUnionFields())
            {
+                auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields);
+                if (!nested_type)
+                    continue;
                nested_names.push_back(field.getProto().getName());
-                nested_types.push_back(getDataTypeFromCapnProtoType(field.getType()));
+                nested_types.push_back(nested_type);
            }
+            if (nested_types.empty())
+                return nullptr;
            return std::make_shared<DataTypeTuple>(std::move(nested_types), std::move(nested_names));
        }
        default:
+        {
+            if (skip_unsupported_fields)
+                return nullptr;
            throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type));
+        }
    }
 }

-NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema)
+NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields)
 {
    if (checkIfStructContainsUnnamedUnion(schema))
        throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported");
@ -531,9 +561,13 @@ NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema)
    for (auto field : schema.getNonUnionFields())
    {
        auto name = field.getProto().getName();
-        auto type = getDataTypeFromCapnProtoType(field.getType());
-        names_and_types.emplace_back(name, type);
+        auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields);
+        if (type)
+            names_and_types.emplace_back(name, type);
    }
+    if (names_and_types.empty())
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types");
+
    return names_and_types;
 }

--- a/Show More
+++ b/Show More