merge with other working branch

This commit is contained in:
FArthur-cmd 2022-07-11 23:02:58 +03:00
commit 783e6e398c
275 changed files with 8981 additions and 2041 deletions

View File

@ -1,6 +1,6 @@
---
name: Bug report
about: Wrong behaviour (visible to users) in official ClickHouse release.
about: Wrong behavior (visible to users) in the official ClickHouse release.
title: ''
labels: 'potential bug'
assignees: ''

View File

@ -350,6 +350,36 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
############################################################################################
##################################### Docker images #######################################
############################################################################################
DockerServerImages:
needs:
- BuilderDebRelease
- BuilderDebAarch64
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
- name: Check out repository code
uses: actions/checkout@v2
with:
fetch-depth: 0 # It MUST BE THE SAME for all dependencies and the job itself
- name: Check docker clickhouse/clickhouse-server building
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 docker_server.py --release-type head --no-push
python3 docker_server.py --release-type head --no-push --no-ubuntu \
--image-repo clickhouse/clickhouse-keeper --image-path docker/keeper
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
############################################################################################
##################################### BUILD REPORTER #######################################
############################################################################################
BuilderReport:
@ -560,6 +590,7 @@ jobs:
FinishCheck:
needs:
- DockerHubPush
- DockerServerImages
- BuilderReport
- FunctionalStatelessTestAsan
- FunctionalStatefulTestDebug

View File

@ -427,6 +427,36 @@ jobs:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
############################################################################################
##################################### Docker images #######################################
############################################################################################
DockerServerImages:
needs:
- BuilderDebRelease
- BuilderDebAarch64
runs-on: [self-hosted, style-checker]
steps:
- name: Clear repository
run: |
sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
- name: Check out repository code
uses: actions/checkout@v2
with:
fetch-depth: 0 # It MUST BE THE SAME for all dependencies and the job itself
- name: Check docker clickhouse/clickhouse-server building
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 docker_server.py --release-type head --no-push
python3 docker_server.py --release-type head --no-push --no-ubuntu \
--image-repo clickhouse/clickhouse-keeper --image-path docker/keeper
- name: Cleanup
if: always()
run: |
# shellcheck disable=SC2046
docker kill $(docker ps -q) ||:
# shellcheck disable=SC2046
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr "$TEMP_PATH"
############################################################################################
##################################### BUILD REPORTER #######################################
############################################################################################
BuilderReport:
@ -1815,6 +1845,7 @@ jobs:
FinishCheck:
needs:
- DockerHubPush
- DockerServerImages
- BuilderReport
- FunctionalStatelessTestDebug0
- FunctionalStatelessTestDebug1

View File

@ -93,6 +93,7 @@
# define NO_SANITIZE_ADDRESS __attribute__((__no_sanitize__("address")))
# define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread")))
# define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED __attribute__((__always_inline__, __no_sanitize__("undefined")))
# define DISABLE_SANITIZER_INSTRUMENTATION __attribute__((disable_sanitizer_instrumentation))
#else /// It does not work in GCC. GCC 7 cannot recognize this attribute and GCC 8 simply ignores it.
# define NO_SANITIZE_UNDEFINED
# define NO_SANITIZE_ADDRESS

View File

@ -164,7 +164,6 @@ if (ENABLE_ANNOY)
add_contrib(annoy-cmake annoy)
target_compile_definitions(_annoy PUBLIC ENABLE_ANNOY)
endif()
# Put all targets defined here and in subdirectories under "contrib/<immediate-subdir>" folders in GUI-based IDEs.
# Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear
# in "contrib/..." as originally planned, so we workaround this by fixing FOLDER properties of all targets manually,

2
contrib/annoy vendored

@ -1 +1 @@
Subproject commit 301ff04e2213abaa7cbe30041b9b576c968bd994
Subproject commit f2ae13120a2d2a6b35ee27ea7f275782541fdd75

View File

@ -1,9 +1,9 @@
set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy")
set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src")
set(ANNOY_INCLUDE_DIR "${ANNOY_PROJECT_DIR}/src")
add_library(_annoy ${ANNOY_SOURCE_DIR}/mman.h)
target_include_directories(_annoy SYSTEM PUBLIC ${ANNOY_INCLUDE_DIR})
add_library(_annoy ${ANNOY_SOURCE_DIR}/annoylib.h)
target_include_directories(_annoy SYSTEM PUBLIC ${ANNOY_SOURCE_DIR})
set_target_properties(_annoy PROPERTIES LINKER_LANGUAGE CXX)
add_library(ch_contrib::annoy ALIAS _annoy)

View File

@ -2,9 +2,9 @@
## What is ClickHouse?
ClickHouse is an open-source column-oriented database management system that allows the generation of analytical data reports in real-time.
We are the creators of the popular open-source column-oriented DBMS (columnar database management system) for online analytical processing (OLAP) which allows users to generate analytical reports using SQL queries in real-time.
ClickHouse manages extremely large volumes of data. It currently powers [Yandex.Metrica](https://metrica.yandex.com/), the worlds [second-largest](http://w3techs.com/technologies/overview/traffic_analysis/all) web analytics platform, with over 13 trillion database records and over 20 billion events a day, generating customized reports on-the-fly, directly from non-aggregated data. This system was successfully implemented at [CERNs LHCb experiment](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) to store and process metadata on 10bn events with over 1000 attributes per event registered in 2011.
ClickHouse works 100-1000x faster than traditional database management systems, and processes hundreds of millions to over a billion rows and tens of gigabytes of data per server per second. With a widespread user base around the globe, the technology has received praise for its reliability, ease of use, and fault tolerance.
For more information and documentation see https://clickhouse.com/.
@ -52,7 +52,10 @@ You can expose your ClickHouse running in docker by [mapping a particular port](
```bash
docker run -d -p 18123:8123 -p19000:9000 --name some-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server
echo 'SELECT version()' | curl 'http://localhost:18123/' --data-binary @-
20.12.3.3
```
```response
22.6.3.35
```
or by allowing the container to use [host ports directly](https://docs.docker.com/network/host/) using `--network=host` (which also allows achieving better network performance):
@ -60,7 +63,10 @@ or by allowing the container to use [host ports directly](https://docs.docker.co
```bash
docker run -d --network=host --name some-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server
echo 'SELECT version()' | curl 'http://localhost:8123/' --data-binary @-
20.12.3.3
```
```response
22.6.3.35
```
### Volumes

View File

@ -36,6 +36,9 @@ ERROR_LOG_DIR=""
if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi
FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)"
# There could be many disks declared in config
readarray -t FILESYSTEM_CACHE_PATHS < <(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key='storage_configuration.disks.*.data_cache_path' || true)
CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
CLICKHOUSE_DB="${CLICKHOUSE_DB:-}"
@ -46,7 +49,8 @@ for dir in "$DATA_DIR" \
"$LOG_DIR" \
"$TMP_DIR" \
"$USER_PATH" \
"$FORMAT_SCHEMA_PATH"
"$FORMAT_SCHEMA_PATH" \
"${FILESYSTEM_CACHE_PATHS[@]}"
do
# check if variable not empty
[ -z "$dir" ] && continue

View File

@ -47,7 +47,8 @@ function configure()
# we mount tests folder from repo to /usr/share
ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
ln -s /usr/share/clickhouse-test/ci/download_previous_release.py /usr/bin/download_previous_release
ln -s /usr/share/clickhouse-test/ci/download_release_packets.py /usr/bin/download_release_packets
ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag
# avoid too slow startup
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
@ -228,7 +229,7 @@ clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_
# Sanitizer asserts
grep -Fa "==================" /var/log/clickhouse-server/stderr.log | grep -v "in query:" >> /test_output/tmp
grep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" /test_output/tmp > /dev/null \
zgrep -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
&& echo -e 'Sanitizer assert (in stderr.log)\tFAIL' >> /test_output/test_results.tsv \
|| echo -e 'No sanitizer asserts\tOK' >> /test_output/test_results.tsv
rm -f /test_output/tmp
@ -267,16 +268,31 @@ zgrep -Fa " received signal " /test_output/gdb.log > /dev/null \
echo -e "Backward compatibility check\n"
echo "Get previous release tag"
previous_release_tag=$(clickhouse-client --query="SELECT version()" | get_previous_release_tag)
echo $previous_release_tag
echo "Clone previous release repository"
git clone https://github.com/ClickHouse/ClickHouse.git --no-tags --progress --branch=$previous_release_tag --no-recurse-submodules --depth=1 previous_release_repository
echo "Download previous release server"
mkdir previous_release_package_folder
clickhouse-client --query="SELECT version()" | download_previous_release && echo -e 'Download script exit code\tOK' >> /test_output/test_results.tsv \
echo $previous_release_tag | download_release_packets && echo -e 'Download script exit code\tOK' >> /test_output/test_results.tsv \
|| echo -e 'Download script failed\tFAIL' >> /test_output/test_results.tsv
stop
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.clean.log
if [ "$(ls -A previous_release_package_folder/clickhouse-common-static_*.deb && ls -A previous_release_package_folder/clickhouse-server_*.deb)" ]
# Check if we cloned previous release repository successfully
if ! [ "$(ls -A previous_release_repository/tests/queries)" ]
then
echo -e "Backward compatibility check: Failed to clone previous release tests\tFAIL" >> /test_output/test_results.tsv
elif ! [ "$(ls -A previous_release_package_folder/clickhouse-common-static_*.deb && ls -A previous_release_package_folder/clickhouse-server_*.deb)" ]
then
echo -e "Backward compatibility check: Failed to download previous release packets\tFAIL" >> /test_output/test_results.tsv
else
echo -e "Successfully cloned previous release tests\tOK" >> /test_output/test_results.tsv
echo -e "Successfully downloaded previous release packets\tOK" >> /test_output/test_results.tsv
# Uninstall current packages
@ -310,7 +326,7 @@ then
mkdir tmp_stress_output
./stress --backward-compatibility-check --output-folder tmp_stress_output --global-time-limit=1200 \
./stress --test-cmd="/usr/bin/clickhouse-test --queries=\"previous_release_repository/tests/queries\"" --backward-compatibility-check --output-folder tmp_stress_output --global-time-limit=1200 \
&& echo -e 'Backward compatibility check: Test script exit code\tOK' >> /test_output/test_results.tsv \
|| echo -e 'Backward compatibility check: Test script failed\tFAIL' >> /test_output/test_results.tsv
rm -rf tmp_stress_output
@ -336,6 +352,8 @@ then
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.backward.clean.log
# Error messages (we should ignore some errors)
# FIXME https://github.com/ClickHouse/ClickHouse/issues/38629 ("pp.proj, errno: 21")
# FIXME https://github.com/ClickHouse/ClickHouse/issues/38643 ("Unknown index: idx.")
echo "Check for Error messages in server log:"
zgrep -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
-e "Code: 236. DB::Exception: Cancelled mutating parts" \
@ -358,6 +376,8 @@ then
-e "and a merge is impossible: we didn't find" \
-e "found in queue and some source parts for it was lost" \
-e "is lost forever." \
-e "pp.proj, errno: 21" \
-e "Unknown index: idx." \
/var/log/clickhouse-server/clickhouse-server.backward.clean.log | zgrep -Fa "<Error>" > /test_output/bc_check_error_messages.txt \
&& echo -e 'Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)\tFAIL' >> /test_output/test_results.tsv \
|| echo -e 'Backward compatibility check: No Error messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv
@ -368,7 +388,7 @@ then
# Sanitizer asserts
zgrep -Fa "==================" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
zgrep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" /test_output/tmp > /dev/null \
zgrep -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
&& echo -e 'Backward compatibility check: Sanitizer assert (in stderr.log)\tFAIL' >> /test_output/test_results.tsv \
|| echo -e 'Backward compatibility check: No sanitizer asserts\tOK' >> /test_output/test_results.tsv
rm -f /test_output/tmp
@ -400,8 +420,6 @@ then
# Remove file bc_check_fatal_messages.txt if it's empty
[ -s /test_output/bc_check_fatal_messages.txt ] || rm /test_output/bc_check_fatal_messages.txt
else
echo -e "Backward compatibility check: Failed to download previous release packets\tFAIL" >> /test_output/test_results.tsv
fi
tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:

View File

@ -28,18 +28,20 @@ def get_options(i, backward_compatibility_check):
if i % 2 == 1:
options.append(" --database=test_{}".format(i))
if i % 5 == 1:
if i % 3 == 1:
client_options.append("join_use_nulls=1")
if i % 15 == 1:
client_options.append("join_algorithm='parallel_hash'")
if i % 15 == 6:
client_options.append("join_algorithm='partial_merge'")
if i % 15 == 11:
client_options.append("join_algorithm='auto'")
client_options.append("max_rows_in_join=1000")
if i % 2 == 1:
join_alg_num = i // 2
if join_alg_num % 4 == 0:
client_options.append("join_algorithm='parallel_hash'")
if join_alg_num % 4 == 1:
client_options.append("join_algorithm='partial_merge'")
if join_alg_num % 4 == 2:
client_options.append("join_algorithm='full_sorting_merge'")
if join_alg_num % 4 == 3:
client_options.append("join_algorithm='auto'")
client_options.append('max_rows_in_join=1000')
if i == 13:
client_options.append("memory_tracker_fault_probability=0.001")

View File

@ -0,0 +1,34 @@
---
sidebar_position: 1
sidebar_label: 2022
---
# 2022 Changelog
### ClickHouse release v22.6.3.35-stable FIXME as compared to v22.6.2.12-stable
#### Bug Fix
* Backported in [#38812](https://github.com/ClickHouse/ClickHouse/issues/38812): Fix crash when executing GRANT ALL ON *.* with ON CLUSTER. It was broken in https://github.com/ClickHouse/ClickHouse/pull/35767. This closes [#38618](https://github.com/ClickHouse/ClickHouse/issues/38618). [#38674](https://github.com/ClickHouse/ClickHouse/pull/38674) ([Vitaly Baranov](https://github.com/vitlibar)).
#### Build/Testing/Packaging Improvement
* Backported in [#38883](https://github.com/ClickHouse/ClickHouse/issues/38883): Add `clickhouse-diagnostics` binary to the packages. [#38647](https://github.com/ClickHouse/ClickHouse/pull/38647) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
#### Bug Fix (user-visible misbehavior in official stable or prestable release)
* Backported in [#38690](https://github.com/ClickHouse/ClickHouse/issues/38690): Fix incorrect columns order in subqueries of UNION (in case of duplicated columns in subselects may produce incorrect result). [#37887](https://github.com/ClickHouse/ClickHouse/pull/37887) ([Azat Khuzhin](https://github.com/azat)).
* Backported in [#38500](https://github.com/ClickHouse/ClickHouse/issues/38500): Do not allow recursive usage of OvercommitTracker during logging. Fixes [#37794](https://github.com/ClickHouse/ClickHouse/issues/37794) cc @tavplubix @davenger. [#38246](https://github.com/ClickHouse/ClickHouse/pull/38246) ([Dmitry Novik](https://github.com/novikd)).
* Backported in [#38712](https://github.com/ClickHouse/ClickHouse/issues/38712): Fix incorrect result of distributed queries with `DISTINCT` and `LIMIT`. Fixes [#38282](https://github.com/ClickHouse/ClickHouse/issues/38282). [#38371](https://github.com/ClickHouse/ClickHouse/pull/38371) ([Anton Popov](https://github.com/CurtizJ)).
* Backported in [#38594](https://github.com/ClickHouse/ClickHouse/issues/38594): Fix parts removal (will be left forever if they had not been removed on server shutdown) after incorrect server shutdown. [#38486](https://github.com/ClickHouse/ClickHouse/pull/38486) ([Azat Khuzhin](https://github.com/azat)).
* Backported in [#38597](https://github.com/ClickHouse/ClickHouse/issues/38597): Fix table creation to avoid replication issues with pre-22.4 replicas. [#38541](https://github.com/ClickHouse/ClickHouse/pull/38541) ([Raúl Marín](https://github.com/Algunenano)).
* Backported in [#38687](https://github.com/ClickHouse/ClickHouse/issues/38687): Now it's possible to start a clickhouse-server and attach/detach tables even for tables with the incorrect values of IPv4/IPv6 representation. Proper fix for issue [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#38590](https://github.com/ClickHouse/ClickHouse/pull/38590) ([alesapin](https://github.com/alesapin)).
* Backported in [#38665](https://github.com/ClickHouse/ClickHouse/issues/38665): Adapt some more nodes to avoid issues with pre-22.4 replicas. [#38627](https://github.com/ClickHouse/ClickHouse/pull/38627) ([Raúl Marín](https://github.com/Algunenano)).
* Backported in [#38778](https://github.com/ClickHouse/ClickHouse/issues/38778): `rankCorr` function will work correctly if some arguments are NaNs. This closes [#38396](https://github.com/ClickHouse/ClickHouse/issues/38396). [#38722](https://github.com/ClickHouse/ClickHouse/pull/38722) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Backported in [#38782](https://github.com/ClickHouse/ClickHouse/issues/38782): Fix use-after-free for Map combinator that leads to incorrect result. [#38748](https://github.com/ClickHouse/ClickHouse/pull/38748) ([Azat Khuzhin](https://github.com/azat)).
#### NOT FOR CHANGELOG / INSIGNIFICANT
* Fix test for system table count in diag tool [#38236](https://github.com/ClickHouse/ClickHouse/pull/38236) ([Dale McDiarmid](https://github.com/gingerwizard)).
* Integration tests volume [#38291](https://github.com/ClickHouse/ClickHouse/pull/38291) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Update docker-compose to try get rid of v1 errors [#38394](https://github.com/ClickHouse/ClickHouse/pull/38394) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).
* Fix backports diff [#38703](https://github.com/ClickHouse/ClickHouse/pull/38703) ([Mikhail f. Shiryaev](https://github.com/Felixoid)).

View File

@ -5,34 +5,6 @@ sidebar_label: Build on Linux for AARCH64 (ARM64)
# How to Build ClickHouse on Linux for AARCH64 (ARM64) Architecture
This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on another Linux machine with AARCH64 CPU architecture.
This is intended for continuous integration checks that run on Linux servers.
If you use AArch64 machine and want to build ClickHouse for AArch64, build as usual.
The cross-build for AARCH64 is based on the [Build instructions](../development/build.md), follow them first.
## Install Clang-14 or newer
Follow the instructions from https://apt.llvm.org/ for your Ubuntu or Debian setup or do
```
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
```
## Install Cross-Compilation Toolset {#install-cross-compilation-toolset}
``` bash
cd ClickHouse
mkdir -p build-aarch64/cmake/toolchain/linux-aarch64
wget 'https://developer.arm.com/-/media/Files/downloads/gnu-a/8.3-2019.03/binrel/gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz?revision=2e88a73f-d233-4f96-b1f4-d8b36e9bb0b9&la=en' -O gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz
tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build-aarch64/cmake/toolchain/linux-aarch64 --strip-components=1
```
## Build ClickHouse {#build-clickhouse}
``` bash
cd ClickHouse
mkdir build-arm64
CC=clang-14 CXX=clang++-14 cmake . -Bbuild-arm64 -DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-aarch64.cmake
ninja -C build-arm64
```
The resulting binary will run only on Linux with the AARCH64 CPU architecture.
If you use x86_64 machine and want cross-compile for AArch64, add the following flag to `cmake`: `-DCMAKE_TOOLCHAIN_FILE=cmake/linux/toolchain-aarch64.cmake`

View File

@ -25,6 +25,7 @@ sudo apt-get install clang-14
Let's remember the path where we install `cctools` as ${CCTOOLS}
``` bash
export CCTOOLS=$(cd ~/cctools && pwd)
mkdir ${CCTOOLS}
cd ${CCTOOLS}
@ -43,10 +44,8 @@ make install
Also, we need to download macOS X SDK into the working tree.
``` bash
cd ClickHouse
wget 'https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.15.sdk.tar.xz'
mkdir -p build-darwin/cmake/toolchain/darwin-x86_64
tar xJf MacOSX10.15.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --strip-components=1
cd ClickHouse/cmake/toolchain/darwin-x86_64
curl -L 'https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.15.sdk.tar.xz' | tar xJ --strip-components=1
```
## Build ClickHouse {#build-clickhouse}
@ -55,7 +54,7 @@ tar xJf MacOSX10.15.sdk.tar.xz -C build-darwin/cmake/toolchain/darwin-x86_64 --s
cd ClickHouse
mkdir build-darwin
cd build-darwin
CC=clang-13 CXX=clang++-13 cmake -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/aarch64-apple-darwin-ar -DCMAKE_INSTALL_NAME_TOOL=${CCTOOLS}/bin/aarch64-apple-darwin-install_name_tool -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/aarch64-apple-darwin-ranlib -DLINKER_NAME=${CCTOOLS}/bin/aarch64-apple-darwin-ld -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake ..
CC=clang-14 CXX=clang++-14 cmake -DCMAKE_AR:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ar -DCMAKE_INSTALL_NAME_TOOL=${CCTOOLS}/bin/x86_64-apple-darwin-install_name_tool -DCMAKE_RANLIB:FILEPATH=${CCTOOLS}/bin/x86_64-apple-darwin-ranlib -DLINKER_NAME=${CCTOOLS}/bin/x86_64-apple-darwin-ld -DCMAKE_TOOLCHAIN_FILE=cmake/darwin/toolchain-x86_64.cmake ..
ninja
```

View File

@ -17,10 +17,12 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
...
) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause'])
SETTINGS
[connection_pool_size=16, ]
[connection_max_tries=3, ]
[connection_wait_timeout=5, ] /* 0 -- do not wait */
[connection_auto_close=true ]
[ connection_pool_size=16, ]
[ connection_max_tries=3, ]
[ connection_wait_timeout=5, ]
[ connection_auto_close=true, ]
[ connect_timeout=10, ]
[ read_write_timeout=300 ]
;
```
@ -144,6 +146,36 @@ Possible values:
Default value: `16`.
### connection_wait_timeout {#connection-wait-timeout}
Timeout (in seconds) for waiting for a free connection (in case there are already `connection_pool_size` active connections); 0 — do not wait.
Possible values:
- Positive integer.
Default value: `5`.
### connect_timeout {#connect-timeout}
Connect timeout (in seconds).
Possible values:
- Positive integer.
Default value: `10`.
### read_write_timeout {#read-write-timeout}
Read/write timeout (in seconds).
Possible values:
- Positive integer.
Default value: `300`.
## See Also {#see-also}
- [The mysql table function](../../../sql-reference/table-functions/mysql.md)

View File

@ -0,0 +1,75 @@
# Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex}
The main task that indexes help to solve is to find the nearest neighbors for multidimensional data. An example of such a problem could be similar pictures or texts, for which the problem is reduced to finding the nearest [embeddings](https://cloud.google.com/architecture/overview-extracting-and-serving-feature-embeddings-for-machine-learning). They can be created from data using [UDF](../../../sql-reference/functions/index.md#executable-user-defined-functions).
## Indexes Structure
Approximate Nearest Neighbor Search Indexes (`ANNIndexes`) are similar to skip indexes. They are constructed over some granules and determine which of them should be skipped. Compared to skip indices, ANN indices use their results not only to skip some group of granules, but also to select particular granules from a set of granules.
`ANNIndexes` are designed to speed up two types of queries:
- ###### Type 1: Where
``` sql
SELECT *
FROM table_name
WHERE DistanceFunction(Column, TargetVector) < Value
LIMIT N
```
- ###### Type 2: Order by
``` sql
SELECT *
FROM table_name [WHERE ...]
ORDER BY DistanceFunction(Column, TargetVector)
LIMIT N
```
In these queries, `DistanceFunction` is selected from tuples of distance functions. `TargetVector` is a known embedding (something like `(0.1, 0.1, ... )`). `Value` - a float value that will bound the neighbourhood.
!!! note "Note"
ANNIndexes can't speed up a query that satisfies both types, and they work only for Tuples. All queries must have a LIMIT, as the algorithms are used to find nearest neighbors and need a specific number of them.
Both types of queries are handled the same way. The indexes get `n` neighbors (where `n` is taken from the `LIMIT` section) and work with them. In an `ORDER BY` query they remember the numbers of all parts of the granule that contain at least one of the neighbors. In a `WHERE` query they remember only those parts that satisfy the requirements.
###### Create table with ANNIndex
```
CREATE TABLE t
(
`id` Int64,
`number` Tuple(Float32, Float32, Float32),
INDEX x number TYPE annoy GRANULARITY N
)
ENGINE = MergeTree
ORDER BY id;
```
Number of granules in granularity should be large. With greater `GRANULARITY` indexes remember the data structure better. But some indexes can't be built if they don't have enough data, so this granule will always participate in the query. For more information, see the description of indexes.
As the indexes are built only during insertions into table, `INSERT` and `OPTIMIZE` queries are slower than for ordinary table. At this stage indexes remember all the information about the given data. ANNIndexes should be used if you have immutable or rarely changed data and many read requests.
You can create your table with index which uses certain algorithm. Now only indices based on the following algorithms are supported:
# Index list
- [Annoy](../../../engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)
# Annoy {#annoy}
Implementation of the algorithm was taken from [this repository](https://github.com/spotify/annoy).
Short description of the algorithm:
The algorithm recursively divides all space in half with random linear surfaces (lines in 2D, planes in 3D, etc.). Thus it builds a tree of polyhedrons and the points they contain. Repeating the operation several times for greater accuracy, it creates a forest.
To find the K nearest neighbors, it goes down through the trees and fills a buffer of the closest points using a priority queue of polyhedrons. Next, it sorts the buffer and returns the nearest K points.
__Example__:
```sql
CREATE TABLE t
(
id Int64,
number Tuple(Float32, Float32, Float32),
INDEX x number TYPE annoy(T) GRANULARITY N
)
ENGINE = MergeTree
ORDER BY id;
```
Parameter `T` is the number of trees which the algorithm will create. The bigger it is, the slower it works (approximately linearly) in both `CREATE` and `SELECT` requests, but the better accuracy you get (adjusted for randomness).
In the `SELECT` in the settings (`ann_index_params`) you can specify the size of the internal buffer (more details in the description above or in the [original repository](https://github.com/spotify/annoy)).
This parameter may help you to adjust the trade-off between query speed and accuracy.

View File

@ -480,7 +480,9 @@ For example:
- `NOT startsWith(s, 'test')`
:::
In addition to skip indices, there are also [Approximate Nearest Neighbor Search Indexes](../../../engines/table-engines/mergetree-family/replication.md).
## Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex}
In addition to skip indices, there are also [Approximate Nearest Neighbor Search Indexes](../../../engines/table-engines/mergetree-family/annindexes.md).
## Projections {#projections}
Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries.
@ -1033,4 +1035,7 @@ Examples of working configurations can be found in integration tests directory (
- `_part_uuid` — Unique part identifier (if enabled MergeTree setting `assign_part_uuids`).
- `_partition_value` — Values (a tuple) of a `partition by` expression.
- `_sample_factor` — Sample factor (from the query).

View File

@ -131,7 +131,7 @@ Integer value in the `UInt8`, `UInt16`, `UInt32`, `UInt64` or `UInt256` data typ
Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers.
The behavior of functions for negative agruments and for the [NaN and Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions.
The behavior of functions for negative arguments and for the [NaN and Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions.
**Example**

View File

@ -8,11 +8,11 @@ sidebar_label: INDEX
The following operations are available:
- `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - Adds index description to tables metadata.
- `ALTER TABLE [db].table_name [ON CLUSTER cluster] ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - Adds index description to the table's metadata.
- `ALTER TABLE [db].name DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk.
- `ALTER TABLE [db].table_name [ON CLUSTER cluster] DROP INDEX name` - Removes index description from the table's metadata and deletes index files from disk.
- `ALTER TABLE [db.]table MATERIALIZE INDEX name [IN PARTITION partition_name]` - Rebuilds the secondary index `name` for the specified `partition_name`. Implemented as a [mutation](../../../../sql-reference/statements/alter/index.md#mutations). If `IN PARTITION` part is omitted then it rebuilds the index for the whole table data.
- `ALTER TABLE [db.]table_name [ON CLUSTER cluster] MATERIALIZE INDEX name [IN PARTITION partition_name]` - Rebuilds the secondary index `name` for the specified `partition_name`. Implemented as a [mutation](../../../../sql-reference/statements/alter/index.md#mutations). If `IN PARTITION` part is omitted then it rebuilds the index for the whole table data.
The first two commands are lightweight in a sense that they only change metadata or remove files.

View File

@ -27,7 +27,7 @@ The following operations with [partitions](../../../engines/table-engines/merget
## DETACH PARTITION\|PART
``` sql
ALTER TABLE table_name DETACH PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] DETACH PARTITION|PART partition_expr
```
Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#alter_attach-partition) query.
@ -48,7 +48,7 @@ This query is replicated it moves the data to the `detached` directory on al
## DROP PARTITION\|PART
``` sql
ALTER TABLE table_name DROP PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] DROP PARTITION|PART partition_expr
```
Deletes the specified partition from the table. This query tags the partition as inactive and deletes data completely, approximately in 10 minutes.
@ -67,7 +67,7 @@ ALTER TABLE mt DROP PART 'all_4_4_0';
## DROP DETACHED PARTITION\|PART
``` sql
ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] DROP DETACHED PARTITION|PART partition_expr
```
Removes the specified part or all parts of the specified partition from `detached`.
@ -76,7 +76,7 @@ Read more about setting the partition expression in a section [How to specify th
## ATTACH PARTITION\|PART
``` sql
ALTER TABLE table_name ATTACH PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] ATTACH PARTITION|PART partition_expr
```
Adds data to the table from the `detached` directory. It is possible to add data for an entire partition or for a separate part. Examples:
@ -99,7 +99,7 @@ You can put data to the `detached` directory on one replica and use the `ALTER .
## ATTACH PARTITION FROM
``` sql
ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1
ALTER TABLE table2 [ON CLUSTER cluster] ATTACH PARTITION partition_expr FROM table1
```
This query copies the data partition from `table1` to `table2`.
@ -113,7 +113,7 @@ For the query to run successfully, the following conditions must be met:
## REPLACE PARTITION
``` sql
ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1
ALTER TABLE table2 [ON CLUSTER cluster] REPLACE PARTITION partition_expr FROM table1
```
This query copies the data partition from the `table1` to `table2` and replaces existing partition in the `table2`. Note that data wont be deleted from `table1`.
@ -126,7 +126,7 @@ For the query to run successfully, the following conditions must be met:
## MOVE PARTITION TO TABLE
``` sql
ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest
ALTER TABLE table_source [ON CLUSTER cluster] MOVE PARTITION partition_expr TO TABLE table_dest
```
This query moves the data partition from the `table_source` to `table_dest` with deleting the data from `table_source`.
@ -141,7 +141,7 @@ For the query to run successfully, the following conditions must be met:
## CLEAR COLUMN IN PARTITION
``` sql
ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] CLEAR COLUMN column_name IN PARTITION partition_expr
```
Resets all values in the specified column in a partition. If the `DEFAULT` clause was determined when creating a table, this query sets the column value to a specified default value.
@ -155,7 +155,7 @@ ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902
## FREEZE PARTITION
``` sql
ALTER TABLE table_name FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name']
ALTER TABLE table_name [ON CLUSTER cluster] FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name']
```
This query creates a local backup of a specified partition. If the `PARTITION` clause is omitted, the query creates the backup of all partitions at once.
@ -197,7 +197,7 @@ For more information about backups and restoring data, see the [Data Backup](../
## UNFREEZE PARTITION
``` sql
ALTER TABLE 'table_name' UNFREEZE [PARTITION 'part_expr'] WITH NAME 'backup_name'
ALTER TABLE table_name [ON CLUSTER cluster] UNFREEZE [PARTITION 'part_expr'] WITH NAME 'backup_name'
```
Removes `freezed` partitions with the specified name from the disk. If the `PARTITION` clause is omitted, the query removes the backup of all partitions at once.
@ -205,7 +205,7 @@ Removes `freezed` partitions with the specified name from the disk. If the `PART
## CLEAR INDEX IN PARTITION
``` sql
ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] CLEAR INDEX index_name IN PARTITION partition_expr
```
The query works similar to `CLEAR COLUMN`, but it resets an index instead of a column data.
@ -213,7 +213,7 @@ The query works similar to `CLEAR COLUMN`, but it resets an index instead of a c
## FETCH PARTITION|PART
``` sql
ALTER TABLE table_name FETCH PARTITION|PART partition_expr FROM 'path-in-zookeeper'
ALTER TABLE table_name [ON CLUSTER cluster] FETCH PARTITION|PART partition_expr FROM 'path-in-zookeeper'
```
Downloads a partition from another server. This query only works for the replicated tables.
@ -250,7 +250,7 @@ Although the query is called `ALTER TABLE`, it does not change the table structu
Moves partitions or data parts to another volume or disk for `MergeTree`-engine tables. See [Using Multiple Block Devices for Data Storage](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes).
``` sql
ALTER TABLE table_name MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name'
ALTER TABLE table_name [ON CLUSTER cluster] MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name'
```
The `ALTER TABLE t MOVE` query:
@ -273,7 +273,7 @@ Manipulates data in the specifies partition matching the specified filtering exp
Syntax:
``` sql
ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr
ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr
```
### Example
@ -293,7 +293,7 @@ Deletes data in the specifies partition matching the specified filtering express
Syntax:
``` sql
ALTER TABLE [db.]table DELETE [IN PARTITION partition_id] WHERE filter_expr
ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE [IN PARTITION partition_id] WHERE filter_expr
```
### Example

View File

@ -6,7 +6,7 @@ sidebar_label: UPDATE
# ALTER TABLE … UPDATE Statements
``` sql
ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr
ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] WHERE filter_expr
```
Manipulates data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).

View File

@ -15,7 +15,7 @@ The `EXCHANGE` query is supported by the [Atomic](../../engines/database-engines
**Syntax**
```sql
EXCHANGE TABLES|DICTIONARIES [db0.]name_A AND [db1.]name_B
EXCHANGE TABLES|DICTIONARIES [db0.]name_A AND [db1.]name_B [ON CLUSTER cluster]
```
## EXCHANGE TABLES
@ -25,7 +25,7 @@ Exchanges the names of two tables.
**Syntax**
```sql
EXCHANGE TABLES [db0.]table_A AND [db1.]table_B
EXCHANGE TABLES [db0.]table_A AND [db1.]table_B [ON CLUSTER cluster]
```
## EXCHANGE DICTIONARIES
@ -35,7 +35,7 @@ Exchanges the names of two dictionaries.
**Syntax**
```sql
EXCHANGE DICTIONARIES [db0.]dict_A AND [db1.]dict_B
EXCHANGE DICTIONARIES [db0.]dict_A AND [db1.]dict_B [ON CLUSTER cluster]
```
**See Also**

View File

@ -25,7 +25,7 @@ SELECT [DISTINCT [ON (column1, column2, ...)]] expr_list
[LIMIT [n, ]m] [WITH TIES]
[SETTINGS ...]
[UNION ...]
[INTO OUTFILE filename [COMPRESSION type] ]
[INTO OUTFILE filename [COMPRESSION type [LEVEL level]] ]
[FORMAT format]
```

View File

@ -6,16 +6,18 @@ sidebar_label: INTO OUTFILE
`INTO OUTFILE` clause redirects the result of a `SELECT` query to a file on the **client** side.
Compressed files are supported. Compression type is detected by the extension of the file name (mode `'auto'` is used by default). Or it can be explicitly specified in a `COMPRESSION` clause.
Compressed files are supported. Compression type is detected by the extension of the file name (mode `'auto'` is used by default). Or it can be explicitly specified in a `COMPRESSION` clause. The compression level for a certain compression type can be specified in a `LEVEL` clause.
**Syntax**
```sql
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type]
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
```
`file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
`level` is a numeric literal. Positive integers in following ranges are supported: `1-12` for `lz4` type, `1-22` for `zstd` type and `1-9` for other compression types.
## Implementation Details
- This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail.

View File

@ -41,7 +41,7 @@ INSERT INTO FUNCTION url('http://127.0.0.1:8123/?query=INSERT+INTO+test_table+FO
SELECT * FROM test_table;
```
## Globs in URL {globs-in-url}
## Globs in URL
Patterns in curly brackets `{ }` are used to generate a set of shards or to specify failover addresses. Supported pattern types and examples see in the description of the [remote](remote.md#globs-in-addresses) function.
Character `|` inside patterns is used to specify failover addresses. They are iterated in the same order as listed in the pattern. The number of generated addresses is limited by [glob_expansion_max_elements](../../operations/settings/settings.md#glob_expansion_max_elements) setting.

View File

@ -70,13 +70,29 @@ https://dev.mysql.com/doc/refman/8.0/en/window-functions-frames.html
```text
aggregate_function (column_name)
OVER ([PARTITION BY grouping_column] [ORDER BY sorting_column]
[ROWS or RANGE expression_to_bounds_of_frame])
OVER ([[PARTITION BY grouping_column] [ORDER BY sorting_column]
[ROWS or RANGE expression_to_bound_rows_within_the_group]] | [window_name])
FROM table_name
WINDOW window_name as ([PARTITION BY grouping_column] [ORDER BY sorting_column])
```
- `PARTITION BY` - defines how to break a resultset into groups.
- `ORDER BY` - defines how to order rows inside the group during the calculation of aggregate_function.
- `ROWS or RANGE` - defines bounds of a frame, aggregate_function is calculated within a frame.
- `WINDOW` - allows reusing a window definition with multiple expressions.
### Functions
These functions can be used only as a window function.
`row_number()` - Number the current row within its partition starting from 1.
`first_value(x)` - Return the first non-NULL value evaluated within its ordered frame.
`last_value(x)` - Return the last non-NULL value evaluated within its ordered frame.
`nth_value(x, offset)` - Return the first non-NULL value evaluated against the nth row (offset) in its ordered frame.
`rank()` - Rank the current row within its partition with gaps.
`dense_rank()` - Rank the current row within its partition without gaps.
`lagInFrame(x)` - Return a value evaluated at the row that is at a specified physical offset row before the current row within the ordered frame.
`leadInFrame(x)` - Return a value evaluated at the row that is offset rows after the current row within the ordered frame.
```text
PARTITION
@ -101,7 +117,8 @@ aggregate_function (column_name)
CREATE TABLE wf_partition
(
`part_key` UInt64,
`value` UInt64
`value` UInt64,
`order` UInt64
)
ENGINE = Memory;
@ -271,6 +288,89 @@ ORDER BY
│ 1 │ 4 │ 4 │ [3,4,5] │
│ 1 │ 5 │ 5 │ [4,5] │
└──────────┴───────┴───────┴──────────────┘
-- row_number does not respect the frame, so rn_1 = rn_2 = rn_3 != rn_4
SELECT
part_key,
value,
order,
groupArray(value) OVER w1 AS frame_values,
row_number() OVER w1 AS rn_1,
sum(1) OVER w1 AS rn_2,
row_number() OVER w2 AS rn_3,
sum(1) OVER w2 AS rn_4
FROM wf_frame
WINDOW
w1 AS (PARTITION BY part_key ORDER BY order DESC),
w2 AS (PARTITION BY part_key ORDER BY order DESC
Rows BETWEEN 1 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┬─rn_1─┬─rn_2─┬─rn_3─┬─rn_4─┐
│ 1 │ 1 │ 1 │ [5,4,3,2,1] │ 5 │ 5 │ 5 │ 2 │
│ 1 │ 2 │ 2 │ [5,4,3,2] │ 4 │ 4 │ 4 │ 2 │
│ 1 │ 3 │ 3 │ [5,4,3] │ 3 │ 3 │ 3 │ 2 │
│ 1 │ 4 │ 4 │ [5,4] │ 2 │ 2 │ 2 │ 2 │
│ 1 │ 5 │ 5 │ [5] │ 1 │ 1 │ 1 │ 1 │
└──────────┴───────┴───────┴──────────────┴──────┴──────┴──────┴──────┘
-- first_value and last_value respect the frame
SELECT
groupArray(value) OVER w1 AS frame_values_1,
first_value(value) OVER w1 AS first_value_1,
last_value(value) OVER w1 AS last_value_1,
groupArray(value) OVER w2 AS frame_values_2,
first_value(value) OVER w2 AS first_value_2,
last_value(value) OVER w2 AS last_value_2
FROM wf_frame
WINDOW
w1 AS (PARTITION BY part_key ORDER BY order ASC),
w2 AS (PARTITION BY part_key ORDER BY order ASC Rows BETWEEN 1 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC;
┌─frame_values_1─┬─first_value_1─┬─last_value_1─┬─frame_values_2─┬─first_value_2─┬─last_value_2─┐
│ [1] │ 1 │ 1 │ [1] │ 1 │ 1 │
│ [1,2] │ 1 │ 2 │ [1,2] │ 1 │ 2 │
│ [1,2,3] │ 1 │ 3 │ [2,3] │ 2 │ 3 │
│ [1,2,3,4] │ 1 │ 4 │ [3,4] │ 3 │ 4 │
│ [1,2,3,4,5] │ 1 │ 5 │ [4,5] │ 4 │ 5 │
└────────────────┴───────────────┴──────────────┴────────────────┴───────────────┴──────────────┘
-- second value within the frame
SELECT
groupArray(value) OVER w1 AS frame_values_1,
nth_value(value, 2) OVER w1 AS second_value
FROM wf_frame
WINDOW w1 AS (PARTITION BY part_key ORDER BY order ASC Rows BETWEEN 3 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC
┌─frame_values_1─┬─second_value─┐
│ [1] │ 0 │
│ [1,2] │ 2 │
│ [1,2,3] │ 2 │
│ [1,2,3,4] │ 2 │
│ [2,3,4,5] │ 3 │
└────────────────┴──────────────┘
-- second value within the frame + Null for missing values
SELECT
groupArray(value) OVER w1 AS frame_values_1,
nth_value(toNullable(value), 2) OVER w1 AS second_value
FROM wf_frame
WINDOW w1 AS (PARTITION BY part_key ORDER BY order ASC Rows BETWEEN 3 PRECEDING AND CURRENT ROW)
ORDER BY
part_key ASC,
value ASC
┌─frame_values_1─┬─second_value─┐
│ [1] │ ᴺᵁᴸᴸ │
│ [1,2] │ 2 │
│ [1,2,3] │ 2 │
│ [1,2,3,4] │ 2 │
│ [2,3,4,5] │ 3 │
└────────────────┴──────────────┘
```
## Real world examples

View File

@ -17,10 +17,12 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
...
) ENGINE = MySQL('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause'])
SETTINGS
[connection_pool_size=16, ]
[connection_max_tries=3, ]
[connection_wait_timeout=5, ] /* 0 -- не ждать */
[connection_auto_close=true ]
[ connection_pool_size=16, ]
[ connection_max_tries=3, ]
[ connection_wait_timeout=5, ]
[ connection_auto_close=true, ]
[ connect_timeout=10, ]
[ read_write_timeout=300 ]
;
```
@ -144,7 +146,37 @@ SELECT * FROM mysql_table
Значение по умолчанию: `16`.
### connection_wait_timeout {#connection-wait-timeout}
Задает таймаут (в секундах) ожидания свободного подключения (в случае, если уже есть активные подключения connection_pool_size), 0 - не ждать.
Возможные значения:
- Положительное целое число.
Значение по умолчанию: `5`.
### connect_timeout {#connect-timeout}
Задает таймаут ожидания подключения (в секундах).
Возможные значения:
- Положительное целое число.
Значение по умолчанию: `10`.
### read_write_timeout {#read-write-timeout}
Задает таймаут ожидания ввода/вывода (в секундах).
Возможные значения:
- Положительное целое число.
Значение по умолчанию: `300`.
## См. также {#see-also}
- [Табличная функция mysql](../../../engines/table-engines/integrations/mysql.md)
- [Использование MySQL в качестве источника для внешнего словаря](../../../engines/table-engines/integrations/mysql.md#dicts-external_dicts_dict_sources-mysql)
- [Табличная функция mysql](../../../sql-reference/table-functions/mysql.md)
- [Использование MySQL в качестве источника для внешнего словаря](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql)

View File

@ -75,8 +75,9 @@ DROP COLUMN [IF EXISTS] name
Запрос удаляет данные из файловой системы. Так как это представляет собой удаление целых файлов, запрос выполняется почти мгновенно.
:::danger "Предупреждение"
:::warning "Предупреждение"
Вы не можете удалить столбец, используемый в [материализованном представлении](../../../sql-reference/statements/create/view.md#materialized). В противном случае будет ошибка.
:::
Пример:

View File

@ -9,9 +9,9 @@ sidebar_label: "Манипуляции с индексами"
Добавить или удалить индекс можно с помощью операций
``` sql
ALTER TABLE [db.]name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]
ALTER TABLE [db.]name DROP INDEX name
ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name
ALTER TABLE [db.]table_name [ON CLUSTER cluster] ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]
ALTER TABLE [db.]table_name [ON CLUSTER cluster] DROP INDEX name
ALTER TABLE [db.]table_name [ON CLUSTER cluster] MATERIALIZE INDEX name IN PARTITION partition_name
```
Поддерживается только таблицами семейства `*MergeTree`.

View File

@ -25,7 +25,7 @@ sidebar_label: PARTITION
## DETACH PARTITION\|PART {#alter_detach-partition}
``` sql
ALTER TABLE table_name DETACH PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] DETACH PARTITION|PART partition_expr
```
Перемещает заданную партицию в директорию `detached`. Сервер не будет знать об этой партиции до тех пор, пока вы не выполните запрос [ATTACH](#alter_attach-partition).
@ -46,7 +46,7 @@ ALTER TABLE mt DETACH PART 'all_2_2_0';
## DROP PARTITION\|PART {#alter_drop-partition}
``` sql
ALTER TABLE table_name DROP PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] DROP PARTITION|PART partition_expr
```
Удаляет партицию. Партиция помечается как неактивная и будет полностью удалена примерно через 10 минут.
@ -65,7 +65,7 @@ ALTER TABLE mt DROP PART 'all_4_4_0';
## DROP DETACHED PARTITION\|PART {#alter_drop-detached}
``` sql
ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] DROP DETACHED PARTITION|PART partition_expr
```
Удаляет из `detached` кусок или все куски, принадлежащие партиции.
@ -74,7 +74,7 @@ ALTER TABLE table_name DROP DETACHED PARTITION|PART partition_expr
## ATTACH PARTITION\|PART {#alter_attach-partition}
``` sql
ALTER TABLE table_name ATTACH PARTITION|PART partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] ATTACH PARTITION|PART partition_expr
```
Добавляет данные в таблицу из директории `detached`. Можно добавить данные как для целой партиции, так и для отдельного куска. Примеры:
@ -97,7 +97,7 @@ ALTER TABLE visits ATTACH PART 201901_2_2_0;
## ATTACH PARTITION FROM {#alter_attach-partition-from}
``` sql
ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1
ALTER TABLE table2 [ON CLUSTER cluster] ATTACH PARTITION partition_expr FROM table1
```
Копирует партицию из таблицы `table1` в таблицу `table2`.
@ -113,7 +113,7 @@ ALTER TABLE table2 ATTACH PARTITION partition_expr FROM table1
## REPLACE PARTITION {#alter_replace-partition}
``` sql
ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1
ALTER TABLE table2 [ON CLUSTER cluster] REPLACE PARTITION partition_expr FROM table1
```
Копирует партицию из таблицы `table1` в таблицу `table2` с заменой существующих данных в `table2`. Данные из `table1` не удаляются.
@ -128,7 +128,7 @@ ALTER TABLE table2 REPLACE PARTITION partition_expr FROM table1
## MOVE PARTITION TO TABLE {#alter_move_to_table-partition}
``` sql
ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest
ALTER TABLE table_source [ON CLUSTER cluster] MOVE PARTITION partition_expr TO TABLE table_dest
```
Перемещает партицию из таблицы `table_source` в таблицу `table_dest` (добавляет к существующим данным в `table_dest`) с удалением данных из таблицы `table_source`.
@ -143,7 +143,7 @@ ALTER TABLE table_source MOVE PARTITION partition_expr TO TABLE table_dest
## CLEAR COLUMN IN PARTITION {#alter_clear-column-partition}
``` sql
ALTER TABLE table_name CLEAR COLUMN column_name IN PARTITION partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] CLEAR COLUMN column_name IN PARTITION partition_expr
```
Сбрасывает все значения в столбце для заданной партиции. Если для столбца определено значение по умолчанию (в секции `DEFAULT`), то будет выставлено это значение.
@ -157,7 +157,7 @@ ALTER TABLE visits CLEAR COLUMN hour in PARTITION 201902
## CLEAR INDEX IN PARTITION {#alter_clear-index-partition}
``` sql
ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr
ALTER TABLE table_name [ON CLUSTER cluster] CLEAR INDEX index_name IN PARTITION partition_expr
```
Работает как `CLEAR COLUMN`, но сбрасывает индексы вместо данных в столбцах.
@ -165,7 +165,7 @@ ALTER TABLE table_name CLEAR INDEX index_name IN PARTITION partition_expr
## FREEZE PARTITION {#alter_freeze-partition}
``` sql
ALTER TABLE table_name FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name']
ALTER TABLE table_name [ON CLUSTER cluster] FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name']
```
Создаёт резервную копию для заданной партиции. Если выражение `PARTITION` опущено, резервные копии будут созданы для всех партиций.
@ -205,7 +205,7 @@ ALTER TABLE table_name FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name
## UNFREEZE PARTITION {#alter_unfreeze-partition}
``` sql
ALTER TABLE 'table_name' UNFREEZE [PARTITION 'part_expr'] WITH NAME 'backup_name'
ALTER TABLE table_name [ON CLUSTER cluster] UNFREEZE [PARTITION 'part_expr'] WITH NAME 'backup_name'
```
Удаляет с диска "замороженные" партиции с указанным именем. Если секция `PARTITION` опущена, запрос удаляет резервную копию всех партиций сразу.
@ -213,7 +213,7 @@ ALTER TABLE 'table_name' UNFREEZE [PARTITION 'part_expr'] WITH NAME 'backup_name
## FETCH PARTITION\|PART {#alter_fetch-partition}
``` sql
ALTER TABLE table_name FETCH PARTITION|PART partition_expr FROM 'path-in-zookeeper'
ALTER TABLE table_name [ON CLUSTER cluster] FETCH PARTITION|PART partition_expr FROM 'path-in-zookeeper'
```
Загружает партицию с другого сервера. Этот запрос работает только для реплицированных таблиц.
@ -250,7 +250,7 @@ ALTER TABLE users ATTACH PART 201901_2_2_0;
Перемещает партицию или кусок данных на другой том или диск для таблиц с движком `MergeTree`. Смотрите [Хранение данных таблицы на нескольких блочных устройствах](../../statements/alter/index.md#table_engine-mergetree-multiple-volumes).
``` sql
ALTER TABLE table_name MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name'
ALTER TABLE table_name [ON CLUSTER cluster] MOVE PARTITION|PART partition_expr TO DISK|VOLUME 'disk_name'
```
Запрос `ALTER TABLE t MOVE`:
@ -273,7 +273,7 @@ ALTER TABLE hits MOVE PARTITION '2019-09-01' TO DISK 'fast_ssd'
Синтаксис:
``` sql
ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr
ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] [IN PARTITION partition_id] WHERE filter_expr
```
### Пример
@ -293,7 +293,7 @@ ALTER TABLE mt UPDATE x = x + 1 IN PARTITION 2 WHERE p = 2;
Синтаксис:
``` sql
ALTER TABLE [db.]table DELETE [IN PARTITION partition_id] WHERE filter_expr
ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE [IN PARTITION partition_id] WHERE filter_expr
```
### Пример

View File

@ -6,7 +6,7 @@ sidebar_label: UPDATE
# ALTER TABLE … UPDATE {#alter-table-update-statements}
``` sql
ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr
ALTER TABLE [db.]table [ON CLUSTER cluster] UPDATE column1 = expr1 [, ...] WHERE filter_expr
```
Манипулирует данными, соответствующими заданному выражению фильтрации. Реализовано как [мутация](../../../sql-reference/statements/alter/index.md#mutations).

View File

@ -14,7 +14,7 @@ sidebar_label: EXCHANGE
**Синтаксис**
```sql
EXCHANGE TABLES|DICTIONARIES [db0.]name_A AND [db1.]name_B
EXCHANGE TABLES|DICTIONARIES [db0.]name_A AND [db1.]name_B [ON CLUSTER cluster]
```
## EXCHANGE TABLES {#exchange_tables}
@ -24,7 +24,7 @@ EXCHANGE TABLES|DICTIONARIES [db0.]name_A AND [db1.]name_B
**Синтаксис**
```sql
EXCHANGE TABLES [db0.]table_A AND [db1.]table_B
EXCHANGE TABLES [db0.]table_A AND [db1.]table_B [ON CLUSTER cluster]
```
## EXCHANGE DICTIONARIES {#exchange_dictionaries}
@ -34,7 +34,7 @@ EXCHANGE TABLES [db0.]table_A AND [db1.]table_B
**Синтаксис**
```sql
EXCHANGE DICTIONARIES [db0.]dict_A AND [db1.]dict_B
EXCHANGE DICTIONARIES [db0.]dict_A AND [db1.]dict_B [ON CLUSTER cluster]
```
**Смотрите также**

View File

@ -24,7 +24,7 @@ SELECT [DISTINCT [ON (column1, column2, ...)]] expr_list
[LIMIT [n, ]m] [WITH TIES]
[SETTINGS ...]
[UNION ALL ...]
[INTO OUTFILE filename [COMPRESSION type] ]
[INTO OUTFILE filename [COMPRESSION type [LEVEL level]] ]
[FORMAT format]
```

View File

@ -6,16 +6,18 @@ sidebar_label: INTO OUTFILE
Секция `INTO OUTFILE` перенаправляет результат запроса `SELECT` в файл на стороне **клиента**.
Поддерживаются сжатые файлы. Формат сжатия определяется по расширению файла (по умолчанию используется режим `'auto'`), либо он может быть задан явно в секции `COMPRESSION`.
Поддерживаются сжатые файлы. Формат сжатия определяется по расширению файла (по умолчанию используется режим `'auto'`), либо он может быть задан явно в секции `COMPRESSION`. Уровень сжатия для конкретного алгоритма может быть задан в секции `LEVEL`.
**Синтаксис**
```sql
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type]
SELECT <expr_list> INTO OUTFILE file_name [COMPRESSION type [LEVEL level]]
```
`file_name` и `type` задаются в виде строковых литералов. Поддерживаются форматы сжатия: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`.
`level` задается в виде числового литерала. Поддерживаются положительные значения в следующих диапазонах: `1-12` для формата `lz4`, `1-22` для формата `zstd` и `1-9` для остальных форматов.
## Детали реализации {#implementation-details}
- Эта функция доступна только в следующих интерфейсах: [клиент командной строки](../../../interfaces/cli.md) и [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Таким образом, запрос, отправленный через [HTTP интерфейс](../../../interfaces/http.md) вернет ошибку.

View File

@ -1,6 +1,11 @@
#include <iostream>
#include <string>
#include <vector>
#include <memory>
#include <queue>
#include <boost/program_options.hpp>
#include <boost/algorithm/string.hpp>
#include <Poco/Logger.h>
#include <Poco/ConsoleChannel.h>
#include <Poco/FormattingChannel.h>
@ -11,7 +16,9 @@
#include <Common/ZooKeeper/ZooKeeperNodeCache.h>
#include <Common/Config/ConfigProcessor.h>
#include <Common/Exception.h>
#include <Common/parseGlobs.h>
#include <re2/re2.h>
static void setupLogging(const std::string & log_level)
{
@ -23,7 +30,57 @@ static void setupLogging(const std::string & log_level)
Poco::Logger::root().setLevel(log_level);
}
static std::string extractFromConfig(
/// Collects all config values whose dotted key path matches a glob pattern
/// (wildcards: '*', '?', '{...}' — translated to a regexp by DB::makeRegexpPatternFromGlobs).
/// Performs a BFS over the configuration tree starting from the literal prefix of the
/// pattern (the part before the first wildcard character); only leaf nodes (keys with
/// no children) are matched against the pattern.
/// If try_get is true, missing/empty values are silently skipped; otherwise
/// getString throws for a matched leaf without a value.
/// NOTE(review): function name has a typo ("extact"); not renamed here because the
/// caller is outside this hunk.
static std::vector<std::string> extactFromConfigAccordingToGlobs(DB::ConfigurationPtr configuration, const std::string & pattern, bool try_get)
{
/// Literal prefix of the pattern — the BFS root; trim stray dots at the edges.
auto pattern_prefix = pattern.substr(0, pattern.find_first_of("*?{"));
boost::algorithm::trim_if(pattern_prefix, [](char s){ return s == '.'; });
auto matcher = std::make_unique<re2::RE2>(DB::makeRegexpPatternFromGlobs(pattern));
std::vector<std::string> result;
std::queue<std::string> working_queue;
working_queue.emplace(pattern_prefix);
while (!working_queue.empty())
{
auto node = working_queue.front();
working_queue.pop();
/// Disclose one more layer: list the direct children of this node.
Poco::Util::AbstractConfiguration::Keys keys;
configuration->keys(node, keys);
/// This is a leaf (no child keys) — test it against the glob pattern.
if (keys.empty())
{
if (!re2::RE2::FullMatch(node, *matcher))
continue;
if (try_get)
{
/// Best-effort mode: skip leaves without a value instead of throwing.
auto value = configuration->getString(node, "");
if (!value.empty())
result.emplace_back(value);
}
else
{
result.emplace_back(configuration->getString(node));
}
continue;
}
/// Not a leaf: enqueue children as dotted paths "node.key" for the next BFS layer.
for (const auto & key : keys)
working_queue.emplace(fmt::format("{}.{}", node, key));
}
return result;
}
static std::vector<std::string> extractFromConfig(
const std::string & config_path, const std::string & key, bool process_zk_includes, bool try_get = false)
{
DB::ConfigProcessor processor(config_path, /* throw_on_bad_incl = */ false, /* log_to_console = */ false);
@ -38,10 +95,15 @@ static std::string extractFromConfig(
config_xml = processor.processConfig(&has_zk_includes, &zk_node_cache);
}
DB::ConfigurationPtr configuration(new Poco::Util::XMLConfiguration(config_xml));
// do not throw exception if not found
/// Check if a key has globs.
if (key.find_first_of("*?{") != std::string::npos)
return extactFromConfigAccordingToGlobs(configuration, key, try_get);
/// Do not throw exception if not found.
if (try_get)
return configuration->getString(key, "");
return configuration->getString(key);
return {configuration->getString(key, "")};
return {configuration->getString(key)};
}
#pragma GCC diagnostic ignored "-Wunused-function"
@ -91,7 +153,8 @@ int mainEntryClickHouseExtractFromConfig(int argc, char ** argv)
po::notify(options);
setupLogging(log_level);
std::cout << extractFromConfig(config_path, key, process_zk_includes, try_get) << std::endl;
for (const auto & value : extractFromConfig(config_path, key, process_zk_includes, try_get))
std::cout << value << std::endl;
}
catch (...)
{

View File

@ -49,6 +49,12 @@ std::shared_ptr<KeeperDispatcher> TinyContext::getKeeperDispatcher() const
return keeper_dispatcher;
}
/// Returns the current KeeperDispatcher under the mutex without initializing it.
/// NOTE(review): presumably returns nullptr when the dispatcher has not been
/// initialized yet — confirm against initializeKeeperDispatcher/getKeeperDispatcher.
std::shared_ptr<KeeperDispatcher> TinyContext::tryGetKeeperDispatcher() const
{
std::lock_guard lock(keeper_dispatcher_mutex);
return keeper_dispatcher;
}
void TinyContext::shutdownKeeperDispatcher() const
{
std::lock_guard lock(keeper_dispatcher_mutex);

View File

@ -14,6 +14,7 @@ class TinyContext: public std::enable_shared_from_this<TinyContext>
{
public:
std::shared_ptr<KeeperDispatcher> getKeeperDispatcher() const;
std::shared_ptr<KeeperDispatcher> tryGetKeeperDispatcher() const;
void initializeKeeperDispatcher(bool start_async) const;
void shutdownKeeperDispatcher() const;
void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config);

View File

@ -321,7 +321,7 @@ AccessRestorerFromBackup::AccessRestorerFromBackup(
AccessRestorerFromBackup::~AccessRestorerFromBackup() = default;
void AccessRestorerFromBackup::addDataPath(const String & data_path, const QualifiedTableName & table_name_for_logs)
void AccessRestorerFromBackup::addDataPath(const String & data_path)
{
if (!data_paths.emplace(data_path).second)
return;
@ -334,8 +334,8 @@ void AccessRestorerFromBackup::addDataPath(const String & data_path, const Quali
for (const String & filename : filenames)
{
if (!filename.starts_with("access") || !filename.ends_with(".txt"))
throw Exception(ErrorCodes::CANNOT_RESTORE_TABLE, "Cannot restore table {}: File name {} doesn't match the wildcard \"access*.txt\"",
table_name_for_logs.getFullName(), String{data_path_in_backup_fs / filename});
throw Exception(ErrorCodes::CANNOT_RESTORE_TABLE, "File name {} doesn't match the wildcard \"access*.txt\"",
String{data_path_in_backup_fs / filename});
}
::sort(filenames.begin(), filenames.end());

View File

@ -17,7 +17,6 @@ using BackupPtr = std::shared_ptr<const IBackup>;
class IBackupEntry;
using BackupEntryPtr = std::shared_ptr<const IBackupEntry>;
struct RestoreSettings;
struct QualifiedTableName;
/// Makes a backup of access entities of a specified type.
@ -36,7 +35,7 @@ public:
~AccessRestorerFromBackup();
/// Adds a data path to loads access entities from.
void addDataPath(const String & data_path, const QualifiedTableName & table_name_for_logs);
void addDataPath(const String & data_path);
/// Checks that the current user can do restoring.
AccessRightsElements getRequiredAccess() const;

View File

@ -531,6 +531,9 @@ void IAccessStorage::backup(BackupEntriesCollector & backup_entries_collector, c
auto entities = readAllWithIDs(type);
boost::range::remove_erase_if(entities, [](const std::pair<UUID, AccessEntityPtr> & x) { return !x.second->isBackupAllowed(); });
if (entities.empty())
return;
auto backup_entry = makeBackupEntryForAccess(
entities,
data_path_in_backup,

View File

@ -627,6 +627,9 @@ void ReplicatedAccessStorage::backup(BackupEntriesCollector & backup_entries_col
auto entities = readAllWithIDs(type);
boost::range::remove_erase_if(entities, [](const std::pair<UUID, AccessEntityPtr> & x) { return !x.second->isBackupAllowed(); });
if (entities.empty())
return;
auto backup_entry_with_path = makeBackupEntryForAccess(
entities,
data_path_in_backup,
@ -634,21 +637,18 @@ void ReplicatedAccessStorage::backup(BackupEntriesCollector & backup_entries_col
backup_entries_collector.getContext()->getAccessControl());
auto backup_coordination = backup_entries_collector.getBackupCoordination();
backup_coordination->addReplicatedAccessPath(zookeeper_path, backup_entry_with_path.first);
String current_host_id = backup_entries_collector.getBackupSettings().host_id;
backup_coordination->setReplicatedAccessHost(zookeeper_path, current_host_id);
backup_coordination->addReplicatedAccessFilePath(zookeeper_path, type, current_host_id, backup_entry_with_path.first);
backup_entries_collector.addPostTask(
[backup_entry = backup_entry_with_path.second,
zookeeper_path = zookeeper_path,
type,
current_host_id,
&backup_entries_collector,
backup_coordination]
{
if (current_host_id != backup_coordination->getReplicatedAccessHost(zookeeper_path))
return;
for (const String & path : backup_coordination->getReplicatedAccessPaths(zookeeper_path))
for (const String & path : backup_coordination->getReplicatedAccessFilePaths(zookeeper_path, type, current_host_id))
backup_entries_collector.addBackupEntry(path, backup_entry);
});
}

View File

@ -1,392 +0,0 @@
#include <Backups/BackupCoordinationHelpers.h>
#include <Storages/MergeTree/MergeTreePartInfo.h>
#include <Common/Exception.h>
#include <Common/escapeForFileName.h>
#include <IO/ReadHelpers.h>
#include <base/chrono_io.h>
#include <boost/range/adaptor/map.hpp>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_BACKUP_TABLE;
extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE;
extern const int LOGICAL_ERROR;
}
namespace
{
/// Comparator for replica names stored as shared_ptr<const String>:
/// orders by the pointed-to string value, not by pointer address.
struct LessReplicaName
{
bool operator()(const std::shared_ptr<const String> & left, const std::shared_ptr<const String> & right) { return *left < *right; }
};
}
/// Per-table index of merge-tree parts collected from all replicas.
/// For each partition it keeps a set of non-overlapping block ranges (keyed by max_block),
/// which lets us (a) drop parts that are fully covered by a bigger part and
/// (b) detect genuinely intersecting parts, which is an error worth investigating.
class BackupCoordinationReplicatedPartNames::CoveredPartsFinder
{
public:
explicit CoveredPartsFinder(const String & table_name_for_logs_) : table_name_for_logs(table_name_for_logs_) {}
/// Parses `new_part_name` and registers the part; `replica_name` is kept only for diagnostics.
void addPartName(const String & new_part_name, const std::shared_ptr<const String> & replica_name)
{
addPartName(MergeTreePartInfo::fromPartName(new_part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING), replica_name);
}
/// Inserts `new_part_info` into the partition's range map, replacing any parts it covers.
/// Throws CANNOT_BACKUP_TABLE if the new part partially intersects an existing one.
void addPartName(MergeTreePartInfo && new_part_info, const std::shared_ptr<const String> & replica_name)
{
auto new_min_block = new_part_info.min_block;
auto new_max_block = new_part_info.max_block;
auto & parts = partitions[new_part_info.partition_id];
/// Find the first part with max_block >= `part_info.min_block`.
auto first_it = parts.lower_bound(new_min_block);
if (first_it == parts.end())
{
/// All max_blocks < part_info.min_block, so we can safely add the `part_info` to the list of parts.
parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name});
return;
}
{
/// part_info.min_block <= current_info.max_block
const auto & part = first_it->second;
if (new_max_block < part.info.min_block)
{
/// (prev_info.max_block < part_info.min_block) AND (part_info.max_block < current_info.min_block),
/// so we can safely add the `part_info` to the list of parts.
parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name});
return;
}
/// (part_info.min_block <= current_info.max_block) AND (part_info.max_block >= current_info.min_block), parts intersect.
if (part.info.contains(new_part_info))
{
/// `part_info` is already contained in another part.
return;
}
}
/// Probably `part_info` is going to replace multiple parts, find the range of parts to replace.
auto last_it = first_it;
while (last_it != parts.end())
{
const auto & part = last_it->second;
if (part.info.min_block > new_max_block)
break;
if (!new_part_info.contains(part.info))
{
/// Partial overlap without full containment means the replicas disagree about block ranges.
throw Exception(
ErrorCodes::CANNOT_BACKUP_TABLE,
"Intersected parts detected in the table {}: {} on replica {} and {} on replica {}. It should be investigated",
table_name_for_logs,
part.info.getPartName(),
*part.replica_name,
new_part_info.getPartName(),
*replica_name);
}
++last_it;
}
/// `part_info` will replace multiple parts [first_it..last_it)
parts.erase(first_it, last_it);
parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name});
}
/// Returns true if the named part is strictly covered by another (different) registered part.
bool isCoveredByAnotherPart(const String & part_name) const
{
return isCoveredByAnotherPart(MergeTreePartInfo::fromPartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING));
}
bool isCoveredByAnotherPart(const MergeTreePartInfo & part_info) const
{
auto partition_it = partitions.find(part_info.partition_id);
if (partition_it == partitions.end())
return false;
const auto & parts = partition_it->second;
/// Find the first part with max_block >= `part_info.min_block`.
auto it_part = parts.lower_bound(part_info.min_block);
if (it_part == parts.end())
{
/// All max_blocks < part_info.min_block, so there is no parts covering `part_info`.
return false;
}
/// part_info.min_block <= current_info.max_block
const auto & existing_part = it_part->second;
if (part_info.max_block < existing_part.info.min_block)
{
/// (prev_info.max_block < part_info.min_block) AND (part_info.max_block < current_info.min_block),
/// so there is no parts covering `part_info`.
return false;
}
/// (part_info.min_block <= current_info.max_block) AND (part_info.max_block >= current_info.min_block), parts intersect.
if (existing_part.info == part_info)
{
/// It's the same part, it's kind of covers itself, but we check in this function whether a part is covered by another part.
return false;
}
/// Check if `part_info` is covered by `current_info`.
return existing_part.info.contains(part_info);
}
private:
struct PartInfo
{
MergeTreePartInfo info;
std::shared_ptr<const String> replica_name; /// Replica that announced this part; used in error messages only.
};
using Parts = std::map<Int64 /* max_block */, PartInfo>;
std::unordered_map<String, Parts> partitions; /// partition_id -> disjoint block ranges registered so far.
const String table_name_for_logs;
};
/// Defined out of line: the header only forward-declares CoveredPartsFinder,
/// so the std::unique_ptr<CoveredPartsFinder> member needs the complete type here.
BackupCoordinationReplicatedPartNames::BackupCoordinationReplicatedPartNames() = default;
BackupCoordinationReplicatedPartNames::~BackupCoordinationReplicatedPartNames() = default;
/// Records the parts a given replica is going to put into the backup.
/// Must not be called after getPartNames() (which freezes the distribution of parts).
/// Throws CANNOT_BACKUP_TABLE if two replicas announce the same part name with different checksums.
void BackupCoordinationReplicatedPartNames::addPartNames(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<PartNameAndChecksum> & part_names_and_checksums)
{
if (part_names_prepared)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after getPartNames()");
auto & table_info = table_infos[table_shared_id];
if (!table_info.covered_parts_finder)
table_info.covered_parts_finder = std::make_unique<CoveredPartsFinder>(table_name_for_logs);
/// One shared String per call; every per-part entry below points at the same object.
auto replica_name_ptr = std::make_shared<String>(replica_name);
for (const auto & part_name_and_checksum : part_names_and_checksums)
{
const auto & part_name = part_name_and_checksum.part_name;
const auto & checksum = part_name_and_checksum.checksum;
auto it = table_info.parts_replicas.find(part_name);
if (it == table_info.parts_replicas.end())
{
/// First time we see this part name: remember its checksum.
it = table_info.parts_replicas.emplace(part_name, PartReplicas{}).first;
it->second.checksum = checksum;
}
else
{
/// The part was already announced by another replica: checksums must agree.
const auto & other = it->second;
if (other.checksum != checksum)
{
const String & other_replica_name = **other.replica_names.begin();
throw Exception(
ErrorCodes::CANNOT_BACKUP_TABLE,
"Table {} on replica {} has part {} which is different from the part on replica {}. Must be the same",
table_name_for_logs,
replica_name,
part_name,
other_replica_name);
}
}
auto & replica_names = it->second.replica_names;
/// `replica_names` should be ordered because we need this vector to be in the same order on every replica.
replica_names.insert(
std::upper_bound(replica_names.begin(), replica_names.end(), replica_name_ptr, LessReplicaName{}), replica_name_ptr);
table_info.covered_parts_finder->addPartName(part_name, replica_name_ptr);
}
}
/// Returns the deduplicated list of parts that `replica_name` should back up for the table,
/// finalizing the part distribution on first call.
Strings BackupCoordinationReplicatedPartNames::getPartNames(const String & table_shared_id, const String & replica_name) const
{
    /// Freeze the assignment of parts to replicas before any lookup.
    preparePartNames();

    auto table_it = table_infos.find(table_shared_id);
    if (table_it == table_infos.end())
        return {};

    const auto & parts_by_replica = table_it->second.replicas_parts;
    auto replica_it = parts_by_replica.find(replica_name);
    return (replica_it == parts_by_replica.end()) ? Strings{} : replica_it->second;
}
/// Assigns each non-covered part to exactly one replica using a deterministic round-robin
/// over the part's (sorted) replica list. Because `table_infos`, `parts_replicas` and
/// `replica_names` are kept in the same order on every replica, each host computes the
/// same assignment independently. Fills the `mutable` member `replicas_parts`, which is
/// why this method can be const.
void BackupCoordinationReplicatedPartNames::preparePartNames() const
{
if (part_names_prepared)
return;
size_t counter = 0;
for (const auto & table_info : table_infos | boost::adaptors::map_values)
{
for (const auto & [part_name, part_replicas] : table_info.parts_replicas)
{
/// Skip parts that are fully contained in a bigger part; the covering part carries the data.
if (table_info.covered_parts_finder->isCoveredByAnotherPart(part_name))
continue;
size_t chosen_index = (counter++) % part_replicas.replica_names.size();
const auto & chosen_replica_name = *part_replicas.replica_names[chosen_index];
table_info.replicas_parts[chosen_replica_name].push_back(part_name);
}
}
part_names_prepared = true;
}
/// Helps to wait until all hosts come to a specified stage.
/// The constructor eagerly creates the coordination node in ZooKeeper (see createRootNodes()).
BackupCoordinationStatusSync::BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_)
: zookeeper_path(zookeeper_path_)
, get_zookeeper(get_zookeeper_)
, log(log_)
{
createRootNodes();
}
/// Ensures `zookeeper_path` (and its ancestors) exist; idempotent, so safe to call from
/// multiple hosts concurrently.
void BackupCoordinationStatusSync::createRootNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->createAncestors(zookeeper_path);
zookeeper->createIfNotExists(zookeeper_path, "");
}
/// Publishes this host's status without waiting for any other host (fire-and-forget).
void BackupCoordinationStatusSync::set(const String & current_host, const String & new_status, const String & message)
{
    setImpl(current_host, new_status, message, /* all_hosts = */ {}, /* timeout_ms = */ {});
}
/// Publishes this host's status, then blocks (no timeout) until every host in `all_hosts`
/// reaches `new_status`; returns their messages.
Strings BackupCoordinationStatusSync::setAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts)
{
    return setImpl(current_host, new_status, message, all_hosts, /* timeout_ms = */ {});
}
/// Same as setAndWait(), but gives up (and setImpl() throws) after `timeout_ms` milliseconds.
Strings BackupCoordinationStatusSync::setAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms)
{
    const std::optional<UInt64> timeout{timeout_ms};
    return setImpl(current_host, new_status, message, all_hosts, timeout);
}
/// Publishes `current_host`'s status as a ZooKeeper node "<host>|<status>" (value = message),
/// then, if `all_hosts` is non-empty, waits until every host has published `new_status`.
/// Waiting uses a ZooKeeper children watch plus a condition variable; a watch fires at most
/// once per getChildrenWatch() call, so the loop re-arms it on each iteration.
/// Returns the per-host messages in the order of `all_hosts`.
/// Throws FAILED_TO_SYNC_BACKUP_OR_RESTORE if any host reports an error status or the
/// optional timeout expires.
Strings BackupCoordinationStatusSync::setImpl(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, const std::optional<UInt64> & timeout_ms)
{
/// Put new status to ZooKeeper.
auto zookeeper = get_zookeeper();
zookeeper->createIfNotExists(zookeeper_path + "/" + current_host + "|" + new_status, message);
if (all_hosts.empty() || (new_status == kErrorStatus))
return {};
if ((all_hosts.size() == 1) && (all_hosts.front() == current_host))
return {message};
/// Wait for other hosts.
Strings ready_hosts_results;
ready_hosts_results.resize(all_hosts.size());
/// A host may appear several times in `all_hosts`; map it to all its result slots.
std::map<String, std::vector<size_t> /* index in `ready_hosts_results` */> unready_hosts;
for (size_t i = 0; i != all_hosts.size(); ++i)
unready_hosts[all_hosts[i]].push_back(i);
std::optional<String> host_with_error;
std::optional<String> error_message;
/// Process ZooKeeper's nodes and set `all_hosts_ready` or `unready_host` or `error_message`.
auto process_zk_nodes = [&](const Strings & zk_nodes)
{
for (const String & zk_node : zk_nodes)
{
if (zk_node.starts_with("remove_watch-"))
continue;
size_t separator_pos = zk_node.find('|');
if (separator_pos == String::npos)
throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected zk node {}", zookeeper_path + "/" + zk_node);
String host = zk_node.substr(0, separator_pos);
String status = zk_node.substr(separator_pos + 1);
if (status == kErrorStatus)
{
host_with_error = host;
error_message = zookeeper->get(zookeeper_path + "/" + zk_node);
return;
}
auto it = unready_hosts.find(host);
if ((it != unready_hosts.end()) && (status == new_status))
{
String result = zookeeper->get(zookeeper_path + "/" + zk_node);
for (size_t i : it->second)
ready_hosts_results[i] = result;
unready_hosts.erase(it);
}
}
};
/// Wait until all hosts are ready or an error happens or time is out.
std::atomic<bool> watch_set = false;
std::condition_variable watch_triggered_event;
auto watch_callback = [&](const Coordination::WatchResponse &)
{
watch_set = false; /// After it's triggered it's not set until we call getChildrenWatch() again.
watch_triggered_event.notify_all();
};
auto watch_triggered = [&] { return !watch_set; };
bool use_timeout = timeout_ms.has_value();
std::chrono::milliseconds timeout{timeout_ms.value_or(0)};
std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
std::chrono::steady_clock::duration elapsed;
/// The mutex protects nothing; it only satisfies condition_variable's interface.
std::mutex dummy_mutex;
while (!unready_hosts.empty() && !error_message)
{
watch_set = true;
Strings nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback);
process_zk_nodes(nodes);
if (!unready_hosts.empty() && !error_message)
{
LOG_TRACE(log, "Waiting for host {}", unready_hosts.begin()->first);
std::unique_lock dummy_lock{dummy_mutex};
if (use_timeout)
{
elapsed = std::chrono::steady_clock::now() - start_time;
if ((elapsed > timeout) || !watch_triggered_event.wait_for(dummy_lock, timeout - elapsed, watch_triggered))
break;
}
else
watch_triggered_event.wait(dummy_lock, watch_triggered);
}
}
/// The watch callback captures locals by reference, so it must fire before we return.
if (watch_set)
{
/// Remove watch by triggering it.
zookeeper->create(zookeeper_path + "/remove_watch-", "", zkutil::CreateMode::EphemeralSequential);
std::unique_lock dummy_lock{dummy_mutex};
watch_triggered_event.wait(dummy_lock, watch_triggered);
}
if (error_message)
throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Error occurred on host {}: {}", *host_with_error, *error_message);
if (!unready_hosts.empty())
{
/// `elapsed` is always assigned before the `break` that leads here.
throw Exception(
ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
"Waited for host {} too long ({})",
unready_hosts.begin()->first,
to_string(elapsed));
}
return ready_hosts_results;
}
}

View File

@ -1,81 +0,0 @@
#pragma once
#include <Backups/IBackupCoordination.h>
#include <Backups/IRestoreCoordination.h>
#include <Common/ZooKeeper/Common.h>
#include <map>
#include <unordered_map>
namespace DB
{
/// Helper designed to be used in an implementation of the IBackupCoordination interface in the part related to replicated tables.
class BackupCoordinationReplicatedPartNames
{
public:
BackupCoordinationReplicatedPartNames();
~BackupCoordinationReplicatedPartNames();
using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
/// Adds part names which a specified replica of a replicated table is going to put to the backup.
/// Multiple replicas of the replicated table call this function and then the added part names can be returned by call of the function
/// getPartNames().
/// Checksums are used only to control that parts under the same names on different replicas are the same.
void addPartNames(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<PartNameAndChecksum> & part_names_and_checksums);
/// Returns the names of the parts which a specified replica of a replicated table should put to the backup.
/// This is the same list as it was added by call of the function addPartNames() but without duplications and without
/// parts covered by another parts.
Strings getPartNames(const String & table_shared_id, const String & replica_name) const;
private:
/// Assigns every non-covered part to exactly one replica; called lazily by getPartNames().
void preparePartNames() const;
class CoveredPartsFinder;
/// All replicas that announced a part under the same name, plus the part's checksum.
struct PartReplicas
{
std::vector<std::shared_ptr<const String>> replica_names;
UInt128 checksum;
};
struct TableInfo
{
std::map<String /* part_name */, PartReplicas> parts_replicas; /// Should be ordered because we need this map to be in the same order on every replica.
mutable std::unordered_map<String /* replica_name */, Strings> replicas_parts; /// Filled lazily by preparePartNames(), hence mutable.
std::unique_ptr<CoveredPartsFinder> covered_parts_finder;
};
std::map<String /* table_shared_id */, TableInfo> table_infos; /// Should be ordered because we need this map to be in the same order on every replica.
mutable bool part_names_prepared = false; /// Set once by preparePartNames(); addPartNames() is forbidden afterwards.
};
/// Helps to wait until all hosts come to a specified stage.
/// Each host publishes its status as a ZooKeeper node "<host>|<status>" under `zookeeper_path`.
class BackupCoordinationStatusSync
{
public:
BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_);
/// Publishes a status without waiting for other hosts.
void set(const String & current_host, const String & new_status, const String & message);
/// Publishes a status and waits until all of `all_hosts` reach it; returns their messages.
Strings setAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts);
/// Same as setAndWait() but throws if `timeout_ms` milliseconds pass before all hosts are ready.
Strings setAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms);
/// Special status value signalling that a host failed; waiters rethrow its message.
static constexpr const char * kErrorStatus = "error";
private:
void createRootNodes();
Strings setImpl(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, const std::optional<UInt64> & timeout_ms);
String zookeeper_path;
zkutil::GetZooKeeper get_zookeeper;
Poco::Logger * log;
};
}

View File

@ -17,12 +17,16 @@ void BackupCoordinationLocal::setStatus(const String &, const String &, const St
{
}
Strings BackupCoordinationLocal::setStatusAndWait(const String &, const String &, const String &, const Strings &)
void BackupCoordinationLocal::setErrorStatus(const String &, const Exception &)
{
}
Strings BackupCoordinationLocal::waitStatus(const Strings &, const String &)
{
return {};
}
Strings BackupCoordinationLocal::setStatusAndWaitFor(const String &, const String &, const String &, const Strings &, UInt64)
Strings BackupCoordinationLocal::waitStatusFor(const Strings &, const String &, UInt64)
{
return {};
}
@ -30,60 +34,52 @@ Strings BackupCoordinationLocal::setStatusAndWaitFor(const String &, const Strin
void BackupCoordinationLocal::addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector<PartNameAndChecksum> & part_names_and_checksums)
{
std::lock_guard lock{mutex};
replicated_part_names.addPartNames(table_shared_id, table_name_for_logs, replica_name, part_names_and_checksums);
replicated_tables.addPartNames(table_shared_id, table_name_for_logs, replica_name, part_names_and_checksums);
}
Strings BackupCoordinationLocal::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const
{
std::lock_guard lock{mutex};
return replicated_part_names.getPartNames(table_shared_id, replica_name);
return replicated_tables.getPartNames(table_shared_id, replica_name);
}
void BackupCoordinationLocal::addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector<MutationInfo> & mutations)
{
std::lock_guard lock{mutex};
replicated_tables.addMutations(table_shared_id, table_name_for_logs, replica_name, mutations);
}
std::vector<IBackupCoordination::MutationInfo> BackupCoordinationLocal::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const
{
std::lock_guard lock{mutex};
return replicated_tables.getMutations(table_shared_id, replica_name);
}
void BackupCoordinationLocal::addReplicatedDataPath(const String & table_shared_id, const String & data_path)
{
std::lock_guard lock{mutex};
replicated_data_paths[table_shared_id].push_back(data_path);
replicated_tables.addDataPath(table_shared_id, data_path);
}
Strings BackupCoordinationLocal::getReplicatedDataPaths(const String & table_shared_id) const
{
std::lock_guard lock{mutex};
auto it = replicated_data_paths.find(table_shared_id);
if (it == replicated_data_paths.end())
return {};
return it->second;
return replicated_tables.getDataPaths(table_shared_id);
}
void BackupCoordinationLocal::addReplicatedAccessPath(const String & access_zk_path, const String & file_path)
void BackupCoordinationLocal::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path)
{
std::lock_guard lock{mutex};
replicated_access_paths[access_zk_path].push_back(file_path);
replicated_access.addFilePath(access_zk_path, access_entity_type, host_id, file_path);
}
Strings BackupCoordinationLocal::getReplicatedAccessPaths(const String & access_zk_path) const
Strings BackupCoordinationLocal::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const
{
std::lock_guard lock{mutex};
auto it = replicated_access_paths.find(access_zk_path);
if (it == replicated_access_paths.end())
return {};
return it->second;
}
void BackupCoordinationLocal::setReplicatedAccessHost(const String & access_zk_path, const String & host_id)
{
std::lock_guard lock{mutex};
replicated_access_hosts[access_zk_path] = host_id;
}
String BackupCoordinationLocal::getReplicatedAccessHost(const String & access_zk_path) const
{
std::lock_guard lock{mutex};
auto it = replicated_access_hosts.find(access_zk_path);
if (it == replicated_access_hosts.end())
return {};
return it->second;
return replicated_access.getFilePaths(access_zk_path, access_entity_type, host_id);
}

View File

@ -1,7 +1,8 @@
#pragma once
#include <Backups/IBackupCoordination.h>
#include <Backups/BackupCoordinationHelpers.h>
#include <Backups/BackupCoordinationReplicatedAccess.h>
#include <Backups/BackupCoordinationReplicatedTables.h>
#include <base/defines.h>
#include <map>
#include <mutex>
@ -12,7 +13,7 @@ namespace Poco { class Logger; }
namespace DB
{
/// Stores backup contents information in memory.
/// Implementation of the IBackupCoordination interface performing coordination in memory.
class BackupCoordinationLocal : public IBackupCoordination
{
public:
@ -20,21 +21,23 @@ public:
~BackupCoordinationLocal() override;
void setStatus(const String & current_host, const String & new_status, const String & message) override;
Strings setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts) override;
Strings setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms) override;
void setErrorStatus(const String & current_host, const Exception & exception) override;
Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
void addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name,
const std::vector<PartNameAndChecksum> & part_names_and_checksums) override;
Strings getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const override;
void addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name,
const std::vector<MutationInfo> & mutations) override;
std::vector<MutationInfo> getReplicatedMutations(const String & table_shared_id, const String & replica_name) const override;
void addReplicatedDataPath(const String & table_shared_id, const String & data_path) override;
Strings getReplicatedDataPaths(const String & table_shared_id) const override;
void addReplicatedAccessPath(const String & access_zk_path, const String & file_path) override;
Strings getReplicatedAccessPaths(const String & access_zk_path) const override;
void setReplicatedAccessHost(const String & access_zk_path, const String & host_id) override;
String getReplicatedAccessHost(const String & access_zk_path) const override;
void addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path) override;
Strings getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const override;
void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override;
void updateFileInfo(const FileInfo & file_info) override;
@ -52,15 +55,12 @@ public:
private:
mutable std::mutex mutex;
BackupCoordinationReplicatedPartNames replicated_part_names TSA_GUARDED_BY(mutex);
std::unordered_map<String, Strings> replicated_data_paths TSA_GUARDED_BY(mutex);
std::unordered_map<String, Strings> replicated_access_paths TSA_GUARDED_BY(mutex);
std::unordered_map<String, String> replicated_access_hosts TSA_GUARDED_BY(mutex);
BackupCoordinationReplicatedTables replicated_tables TSA_GUARDED_BY(mutex);
BackupCoordinationReplicatedAccess replicated_access TSA_GUARDED_BY(mutex);
std::map<String /* file_name */, SizeAndChecksum> file_names TSA_GUARDED_BY(mutex); /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0.
std::map<SizeAndChecksum, FileInfo> file_infos TSA_GUARDED_BY(mutex); /// Information about files. Without empty files.
Strings archive_suffixes TSA_GUARDED_BY(mutex);
size_t current_archive_suffix TSA_GUARDED_BY(mutex) = 0;
};
}

View File

@ -1,4 +1,5 @@
#include <Backups/BackupCoordinationDistributed.h>
#include <Backups/BackupCoordinationRemote.h>
#include <Access/Common/AccessEntityType.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
@ -27,6 +28,7 @@ namespace
using SizeAndChecksum = IBackupCoordination::SizeAndChecksum;
using FileInfo = IBackupCoordination::FileInfo;
using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
using MutationInfo = IBackupCoordination::MutationInfo;
struct ReplicatedPartNames
{
@ -63,6 +65,41 @@ namespace
}
};
struct ReplicatedMutations
{
std::vector<MutationInfo> mutations;
String table_name_for_logs;
static String serialize(const std::vector<MutationInfo> & mutations_, const String & table_name_for_logs_)
{
WriteBufferFromOwnString out;
writeBinary(mutations_.size(), out);
for (const auto & mutation : mutations_)
{
writeBinary(mutation.id, out);
writeBinary(mutation.entry, out);
}
writeBinary(table_name_for_logs_, out);
return out.str();
}
static ReplicatedMutations deserialize(const String & str)
{
ReadBufferFromString in{str};
ReplicatedMutations res;
size_t num;
readBinary(num, in);
res.mutations.resize(num);
for (size_t i = 0; i != num; ++i)
{
readBinary(res.mutations[i].id, in);
readBinary(res.mutations[i].entry, in);
}
readBinary(res.table_name_for_logs, in);
return res;
}
};
String serializeFileInfo(const FileInfo & info)
{
WriteBufferFromOwnString out;
@ -128,7 +165,7 @@ namespace
constexpr size_t NUM_ATTEMPTS = 10;
}
BackupCoordinationDistributed::BackupCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_)
BackupCoordinationRemote::BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_)
: zookeeper_path(zookeeper_path_)
, get_zookeeper(get_zookeeper_)
, status_sync(zookeeper_path_ + "/status", get_zookeeper_, &Poco::Logger::get("BackupCoordination"))
@ -136,46 +173,51 @@ BackupCoordinationDistributed::BackupCoordinationDistributed(const String & zook
createRootNodes();
}
BackupCoordinationDistributed::~BackupCoordinationDistributed() = default;
BackupCoordinationRemote::~BackupCoordinationRemote() = default;
void BackupCoordinationDistributed::createRootNodes()
void BackupCoordinationRemote::createRootNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->createAncestors(zookeeper_path);
zookeeper->createIfNotExists(zookeeper_path, "");
zookeeper->createIfNotExists(zookeeper_path + "/repl_part_names", "");
zookeeper->createIfNotExists(zookeeper_path + "/repl_mutations", "");
zookeeper->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
zookeeper->createIfNotExists(zookeeper_path + "/repl_access_host", "");
zookeeper->createIfNotExists(zookeeper_path + "/repl_access_paths", "");
zookeeper->createIfNotExists(zookeeper_path + "/repl_access", "");
zookeeper->createIfNotExists(zookeeper_path + "/file_names", "");
zookeeper->createIfNotExists(zookeeper_path + "/file_infos", "");
zookeeper->createIfNotExists(zookeeper_path + "/archive_suffixes", "");
}
void BackupCoordinationDistributed::removeAllNodes()
void BackupCoordinationRemote::removeAllNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->removeRecursive(zookeeper_path);
}
void BackupCoordinationDistributed::setStatus(const String & current_host, const String & new_status, const String & message)
void BackupCoordinationRemote::setStatus(const String & current_host, const String & new_status, const String & message)
{
status_sync.set(current_host, new_status, message);
}
Strings BackupCoordinationDistributed::setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts)
void BackupCoordinationRemote::setErrorStatus(const String & current_host, const Exception & exception)
{
return status_sync.setAndWait(current_host, new_status, message, all_hosts);
status_sync.setError(current_host, exception);
}
Strings BackupCoordinationDistributed::setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms)
Strings BackupCoordinationRemote::waitStatus(const Strings & all_hosts, const String & status_to_wait)
{
return status_sync.setAndWaitFor(current_host, new_status, message, all_hosts, timeout_ms);
return status_sync.wait(all_hosts, status_to_wait);
}
Strings BackupCoordinationRemote::waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms)
{
return status_sync.waitFor(all_hosts, status_to_wait, timeout_ms);
}
void BackupCoordinationDistributed::addReplicatedPartNames(
void BackupCoordinationRemote::addReplicatedPartNames(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
@ -183,8 +225,8 @@ void BackupCoordinationDistributed::addReplicatedPartNames(
{
{
std::lock_guard lock{mutex};
if (replicated_part_names)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after getPartNames()");
if (replicated_tables)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedPartNames() must not be called after preparing");
}
auto zookeeper = get_zookeeper();
@ -194,17 +236,49 @@ void BackupCoordinationDistributed::addReplicatedPartNames(
zookeeper->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent);
}
Strings BackupCoordinationDistributed::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const
Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const
{
std::lock_guard lock{mutex};
prepareReplicatedPartNames();
return replicated_part_names->getPartNames(table_shared_id, replica_name);
prepareReplicatedTables();
return replicated_tables->getPartNames(table_shared_id, replica_name);
}
void BackupCoordinationRemote::addReplicatedMutations(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<MutationInfo> & mutations)
{
{
std::lock_guard lock{mutex};
if (replicated_tables)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedMutations() must not be called after preparing");
}
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_shared_id);
zookeeper->createIfNotExists(path, "");
path += "/" + escapeForFileName(replica_name);
zookeeper->create(path, ReplicatedMutations::serialize(mutations, table_name_for_logs), zkutil::CreateMode::Persistent);
}
std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const
{
std::lock_guard lock{mutex};
prepareReplicatedTables();
return replicated_tables->getMutations(table_shared_id, replica_name);
}
void BackupCoordinationDistributed::addReplicatedDataPath(
void BackupCoordinationRemote::addReplicatedDataPath(
const String & table_shared_id, const String & data_path)
{
{
std::lock_guard lock{mutex};
if (replicated_tables)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedDataPath() must not be called after preparing");
}
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id);
zookeeper->createIfNotExists(path, "");
@ -212,83 +286,120 @@ void BackupCoordinationDistributed::addReplicatedDataPath(
zookeeper->createIfNotExists(path, "");
}
Strings BackupCoordinationDistributed::getReplicatedDataPaths(const String & table_shared_id) const
Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_shared_id) const
{
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id);
Strings children = zookeeper->getChildren(path);
Strings data_paths;
data_paths.reserve(children.size());
for (const String & child : children)
data_paths.push_back(unescapeForFileName(child));
return data_paths;
std::lock_guard lock{mutex};
prepareReplicatedTables();
return replicated_tables->getDataPaths(table_shared_id);
}
void BackupCoordinationDistributed::prepareReplicatedPartNames() const
void BackupCoordinationRemote::prepareReplicatedTables() const
{
if (replicated_part_names)
if (replicated_tables)
return;
replicated_part_names.emplace();
replicated_tables.emplace();
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_part_names";
for (const String & escaped_table_zk_path : zookeeper->getChildren(path))
{
String table_zk_path = unescapeForFileName(escaped_table_zk_path);
String path2 = path + "/" + escaped_table_zk_path;
for (const String & escaped_replica_name : zookeeper->getChildren(path2))
String path = zookeeper_path + "/repl_part_names";
for (const String & escaped_table_shared_id : zookeeper->getChildren(path))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto part_names = ReplicatedPartNames::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name));
replicated_part_names->addPartNames(table_zk_path, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums);
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_replica_name : zookeeper->getChildren(path2))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto part_names = ReplicatedPartNames::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name));
replicated_tables->addPartNames(table_shared_id, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums);
}
}
}
{
String path = zookeeper_path + "/repl_mutations";
for (const String & escaped_table_shared_id : zookeeper->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_replica_name : zookeeper->getChildren(path2))
{
String replica_name = unescapeForFileName(escaped_replica_name);
auto mutations = ReplicatedMutations::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name));
replicated_tables->addMutations(table_shared_id, mutations.table_name_for_logs, replica_name, mutations.mutations);
}
}
}
{
String path = zookeeper_path + "/repl_data_paths";
for (const String & escaped_table_shared_id : zookeeper->getChildren(path))
{
String table_shared_id = unescapeForFileName(escaped_table_shared_id);
String path2 = path + "/" + escaped_table_shared_id;
for (const String & escaped_data_path : zookeeper->getChildren(path2))
{
String data_path = unescapeForFileName(escaped_data_path);
replicated_tables->addDataPath(table_shared_id, data_path);
}
}
}
}
void BackupCoordinationDistributed::addReplicatedAccessPath(const String & access_zk_path, const String & file_path)
void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path)
{
{
std::lock_guard lock{mutex};
if (replicated_access)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedAccessFilePath() must not be called after preparing");
}
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_access_paths/" + escapeForFileName(access_zk_path);
String path = zookeeper_path + "/repl_access/" + escapeForFileName(access_zk_path);
zookeeper->createIfNotExists(path, "");
path += "/" + escapeForFileName(file_path);
path += "/" + AccessEntityTypeInfo::get(access_entity_type).name;
zookeeper->createIfNotExists(path, "");
path += "/" + host_id;
zookeeper->createIfNotExists(path, file_path);
}
Strings BackupCoordinationDistributed::getReplicatedAccessPaths(const String & access_zk_path) const
Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const
{
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_access_paths/" + escapeForFileName(access_zk_path);
Strings children = zookeeper->getChildren(path);
Strings file_paths;
file_paths.reserve(children.size());
for (const String & child : children)
file_paths.push_back(unescapeForFileName(child));
return file_paths;
std::lock_guard lock{mutex};
prepareReplicatedAccess();
return replicated_access->getFilePaths(access_zk_path, access_entity_type, host_id);
}
void BackupCoordinationDistributed::setReplicatedAccessHost(const String & access_zk_path, const String & host_id)
void BackupCoordinationRemote::prepareReplicatedAccess() const
{
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_access_host/" + escapeForFileName(access_zk_path);
auto code = zookeeper->tryCreate(path, host_id, zkutil::CreateMode::Persistent);
if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
throw zkutil::KeeperException(code, path);
if (replicated_access)
return;
if (code == Coordination::Error::ZNODEEXISTS)
zookeeper->set(path, host_id);
}
String BackupCoordinationDistributed::getReplicatedAccessHost(const String & access_zk_path) const
{
replicated_access.emplace();
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/repl_access_host/" + escapeForFileName(access_zk_path);
return zookeeper->get(path);
String path = zookeeper_path + "/repl_access";
for (const String & escaped_access_zk_path : zookeeper->getChildren(path))
{
String access_zk_path = unescapeForFileName(escaped_access_zk_path);
String path2 = path + "/" + escaped_access_zk_path;
for (const String & type_str : zookeeper->getChildren(path2))
{
AccessEntityType type = AccessEntityTypeInfo::parseType(type_str);
String path3 = path2 + "/" + type_str;
for (const String & host_id : zookeeper->getChildren(path3))
{
String file_path = zookeeper->get(path3 + "/" + host_id);
replicated_access->addFilePath(access_zk_path, type, host_id, file_path);
}
}
}
}
void BackupCoordinationDistributed::addFileInfo(const FileInfo & file_info, bool & is_data_file_required)
void BackupCoordinationRemote::addFileInfo(const FileInfo & file_info, bool & is_data_file_required)
{
auto zookeeper = get_zookeeper();
@ -310,7 +421,7 @@ void BackupCoordinationDistributed::addFileInfo(const FileInfo & file_info, bool
is_data_file_required = (code == Coordination::Error::ZOK) && (file_info.size > file_info.base_size);
}
void BackupCoordinationDistributed::updateFileInfo(const FileInfo & file_info)
void BackupCoordinationRemote::updateFileInfo(const FileInfo & file_info)
{
if (!file_info.size)
return; /// we don't keep FileInfos for empty files, nothing to update
@ -332,7 +443,7 @@ void BackupCoordinationDistributed::updateFileInfo(const FileInfo & file_info)
}
}
std::vector<FileInfo> BackupCoordinationDistributed::getAllFileInfos() const
std::vector<FileInfo> BackupCoordinationRemote::getAllFileInfos() const
{
auto zookeeper = get_zookeeper();
std::vector<FileInfo> file_infos;
@ -350,7 +461,7 @@ std::vector<FileInfo> BackupCoordinationDistributed::getAllFileInfos() const
return file_infos;
}
Strings BackupCoordinationDistributed::listFiles(const String & directory, bool recursive) const
Strings BackupCoordinationRemote::listFiles(const String & directory, bool recursive) const
{
auto zookeeper = get_zookeeper();
Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names");
@ -383,7 +494,7 @@ Strings BackupCoordinationDistributed::listFiles(const String & directory, bool
return elements;
}
bool BackupCoordinationDistributed::hasFiles(const String & directory) const
bool BackupCoordinationRemote::hasFiles(const String & directory) const
{
auto zookeeper = get_zookeeper();
Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names");
@ -402,7 +513,7 @@ bool BackupCoordinationDistributed::hasFiles(const String & directory) const
return false;
}
std::optional<FileInfo> BackupCoordinationDistributed::getFileInfo(const String & file_name) const
std::optional<FileInfo> BackupCoordinationRemote::getFileInfo(const String & file_name) const
{
auto zookeeper = get_zookeeper();
String size_and_checksum;
@ -416,7 +527,7 @@ std::optional<FileInfo> BackupCoordinationDistributed::getFileInfo(const String
return file_info;
}
std::optional<FileInfo> BackupCoordinationDistributed::getFileInfo(const SizeAndChecksum & size_and_checksum) const
std::optional<FileInfo> BackupCoordinationRemote::getFileInfo(const SizeAndChecksum & size_and_checksum) const
{
auto zookeeper = get_zookeeper();
String file_info_str;
@ -425,7 +536,7 @@ std::optional<FileInfo> BackupCoordinationDistributed::getFileInfo(const SizeAnd
return deserializeFileInfo(file_info_str);
}
std::optional<SizeAndChecksum> BackupCoordinationDistributed::getFileSizeAndChecksum(const String & file_name) const
std::optional<SizeAndChecksum> BackupCoordinationRemote::getFileSizeAndChecksum(const String & file_name) const
{
auto zookeeper = get_zookeeper();
String size_and_checksum;
@ -434,7 +545,7 @@ std::optional<SizeAndChecksum> BackupCoordinationDistributed::getFileSizeAndChec
return deserializeSizeAndChecksum(size_and_checksum);
}
String BackupCoordinationDistributed::getNextArchiveSuffix()
String BackupCoordinationRemote::getNextArchiveSuffix()
{
auto zookeeper = get_zookeeper();
String path = zookeeper_path + "/archive_suffixes/a";
@ -445,7 +556,7 @@ String BackupCoordinationDistributed::getNextArchiveSuffix()
return formatArchiveSuffix(extractCounterFromSequentialNodeName(path_created));
}
Strings BackupCoordinationDistributed::getAllArchiveSuffixes() const
Strings BackupCoordinationRemote::getAllArchiveSuffixes() const
{
auto zookeeper = get_zookeeper();
Strings node_names = zookeeper->getChildren(zookeeper_path + "/archive_suffixes");
@ -454,7 +565,7 @@ Strings BackupCoordinationDistributed::getAllArchiveSuffixes() const
return node_names;
}
void BackupCoordinationDistributed::drop()
void BackupCoordinationRemote::drop()
{
removeAllNodes();
}

View File

@ -1,22 +1,25 @@
#pragma once
#include <Backups/IBackupCoordination.h>
#include <Backups/BackupCoordinationHelpers.h>
#include <Backups/BackupCoordinationReplicatedAccess.h>
#include <Backups/BackupCoordinationReplicatedTables.h>
#include <Backups/BackupCoordinationStatusSync.h>
namespace DB
{
/// Stores backup temporary information in Zookeeper, used to perform BACKUP ON CLUSTER.
class BackupCoordinationDistributed : public IBackupCoordination
/// Implementation of the IBackupCoordination interface performing coordination via ZooKeeper. It's necessary for "BACKUP ON CLUSTER".
class BackupCoordinationRemote : public IBackupCoordination
{
public:
BackupCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_);
~BackupCoordinationDistributed() override;
BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_);
~BackupCoordinationRemote() override;
void setStatus(const String & current_host, const String & new_status, const String & message) override;
Strings setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts) override;
Strings setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms) override;
void setErrorStatus(const String & current_host, const Exception & exception) override;
Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
void addReplicatedPartNames(
const String & table_shared_id,
@ -26,14 +29,19 @@ public:
Strings getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const override;
void addReplicatedMutations(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<MutationInfo> & mutations) override;
std::vector<MutationInfo> getReplicatedMutations(const String & table_shared_id, const String & replica_name) const override;
void addReplicatedDataPath(const String & table_shared_id, const String & data_path) override;
Strings getReplicatedDataPaths(const String & table_shared_id) const override;
void addReplicatedAccessPath(const String & access_zk_path, const String & file_path) override;
Strings getReplicatedAccessPaths(const String & access_zk_path) const override;
void setReplicatedAccessHost(const String & access_zk_path, const String & host_id) override;
String getReplicatedAccessHost(const String & access_zk_path) const override;
void addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path) override;
Strings getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const override;
void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override;
void updateFileInfo(const FileInfo & file_info) override;
@ -53,7 +61,8 @@ public:
private:
void createRootNodes();
void removeAllNodes();
void prepareReplicatedPartNames() const;
void prepareReplicatedTables() const;
void prepareReplicatedAccess() const;
const String zookeeper_path;
const zkutil::GetZooKeeper get_zookeeper;
@ -61,7 +70,8 @@ private:
BackupCoordinationStatusSync status_sync;
mutable std::mutex mutex;
mutable std::optional<BackupCoordinationReplicatedPartNames> replicated_part_names;
mutable std::optional<BackupCoordinationReplicatedTables> replicated_tables;
mutable std::optional<BackupCoordinationReplicatedAccess> replicated_access;
};
}

View File

@ -0,0 +1,33 @@
#include <Backups/BackupCoordinationReplicatedAccess.h>
namespace DB
{
/// Defaulted out of line; the class holds only standard containers (see the header).
BackupCoordinationReplicatedAccess::BackupCoordinationReplicatedAccess() = default;
BackupCoordinationReplicatedAccess::~BackupCoordinationReplicatedAccess() = default;
/// Registers a file path for the given (ZooKeeper path, entity type) group and updates
/// the host chosen to write that group's files.
void BackupCoordinationReplicatedAccess::addFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path)
{
    const auto group_key = std::make_pair(access_zk_path, access_entity_type);
    auto & group = file_paths_by_zk_path[group_key];
    group.file_paths.emplace(file_path);

    /// Deterministic choice of the storing host: taking the maximum host_id gives the same
    /// result no matter which replica repeats the calculation.
    if (group.host_to_store_access < host_id)
        group.host_to_store_access = host_id;
}
/// Returns every file path registered for the (ZooKeeper path, entity type) group,
/// but only if `host_id` is the host chosen to store this group; otherwise empty.
Strings BackupCoordinationReplicatedAccess::getFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const
{
    const auto group_it = file_paths_by_zk_path.find(std::make_pair(access_zk_path, access_entity_type));
    if (group_it == file_paths_by_zk_path.end())
        return {};

    /// Only the designated host writes the access entities; every other host gets nothing.
    const auto & group = group_it->second;
    if (group.host_to_store_access != host_id)
        return {};

    return Strings{group.file_paths.begin(), group.file_paths.end()};
}
}

View File

@ -0,0 +1,49 @@
#pragma once
#include <Core/Types.h>
#include <map>
#include <unordered_set>
namespace DB
{
enum class AccessEntityType;
/// This class is used by hosts to coordinate the access entities of ReplicatedAccessStorage they're writing to a backup.
/// It's designed to make all hosts save the same access entities to the backup even in case the ReplicatedAccessStorage changes
/// while the backup is being produced. This is important to make RESTORE more predictable.
///
/// For example, let's consider three replicas having a ReplicatedAccessStorage on them.
/// This class ensures that the following files in the backup are the same:
/// /shards/1/replicas/1/data/system/users/access01.txt
/// /shards/1/replicas/2/data/system/users/access01.txt
/// /shards/1/replicas/3/data/system/users/access01.txt
///
/// To implement that this class chooses one host to write access entities for all the hosts so in fact all those files
/// in the example above are written by the same host.
class BackupCoordinationReplicatedAccess
{
public:
    BackupCoordinationReplicatedAccess();
    ~BackupCoordinationReplicatedAccess();

    /// Adds a path to access*.txt file keeping access entities of a ReplicatedAccessStorage.
    void addFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path);

    /// Returns all paths added by addFilePath() if `host_id` is a host chosen to store access.
    Strings getFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const;

private:
    /// Entities are grouped by the ZooKeeper path of their storage plus the entity type.
    using ZkPathAndEntityType = std::pair<String, AccessEntityType>;

    struct FilePathsAndHost
    {
        std::unordered_set<String> file_paths; /// All file paths added for this group, from every host.
        String host_to_store_access;           /// The single host chosen to write these files (the max host_id seen by addFilePath()).
    };

    std::map<ZkPathAndEntityType, FilePathsAndHost> file_paths_by_zk_path;
};
}

View File

@ -0,0 +1,335 @@
#include <Backups/BackupCoordinationReplicatedTables.h>
#include <Storages/MergeTree/MergeTreePartInfo.h>
#include <Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h>
#include <Common/Exception.h>
#include <boost/range/adaptor/map.hpp>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_BACKUP_TABLE;
extern const int LOGICAL_ERROR;
}
namespace
{
    /// Orders shared_ptr-wrapped replica names by the string they point to
    /// (a plain shared_ptr comparison would compare pointer addresses instead).
    struct LessReplicaName
    {
        /// `const`: comparators should be callable on a const instance, as standard
        /// algorithms may take the comparator by const reference.
        bool operator()(const std::shared_ptr<const String> & left, const std::shared_ptr<const String> & right) const { return *left < *right; }
    };
}
using MutationInfo = IBackupCoordination::MutationInfo;
/// Detects whether a part is covered (i.e. fully replaced) by another part already registered
/// from any replica. Parts are grouped by partition id; within a partition they are kept in a
/// map ordered by max_block, so each query only has to inspect neighbouring entries.
class BackupCoordinationReplicatedTables::CoveredPartsFinder
{
public:
    explicit CoveredPartsFinder(const String & table_name_for_logs_) : table_name_for_logs(table_name_for_logs_) {}

    /// Inserts a part, dropping it if an existing part contains it, or replacing the existing
    /// parts it contains. Throws CANNOT_BACKUP_TABLE on a partial (non-containing) intersection.
    void addPartInfo(MergeTreePartInfo && new_part_info, const std::shared_ptr<const String> & replica_name)
    {
        auto new_min_block = new_part_info.min_block;
        auto new_max_block = new_part_info.max_block;
        auto & parts = partitions[new_part_info.partition_id];

        /// Find the first part with max_block >= `part_info.min_block`.
        auto first_it = parts.lower_bound(new_min_block);
        if (first_it == parts.end())
        {
            /// All max_blocks < part_info.min_block, so we can safely add the `part_info` to the list of parts.
            parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name});
            return;
        }

        {
            /// part_info.min_block <= current_info.max_block
            const auto & part = first_it->second;
            if (new_max_block < part.info.min_block)
            {
                /// (prev_info.max_block < part_info.min_block) AND (part_info.max_block < current_info.min_block),
                /// so we can safely add the `part_info` to the list of parts.
                parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name});
                return;
            }

            /// (part_info.min_block <= current_info.max_block) AND (part_info.max_block >= current_info.min_block), parts intersect.
            if (part.info.contains(new_part_info))
            {
                /// `part_info` is already contained in another part.
                return;
            }
        }

        /// Probably `part_info` is going to replace multiple parts, find the range of parts to replace.
        auto last_it = first_it;
        while (last_it != parts.end())
        {
            const auto & part = last_it->second;
            if (part.info.min_block > new_max_block)
                break;
            if (!new_part_info.contains(part.info))
            {
                /// Parts overlap but neither contains the other — the data is inconsistent.
                throw Exception(
                    ErrorCodes::CANNOT_BACKUP_TABLE,
                    "Intersected parts detected: {} on replica {} and {} on replica {}",
                    part.info.getPartName(),
                    *part.replica_name,
                    new_part_info.getPartName(),
                    *replica_name);
            }
            ++last_it;
        }

        /// `part_info` will replace multiple parts [first_it..last_it)
        parts.erase(first_it, last_it);
        parts.emplace(new_max_block, PartInfo{std::move(new_part_info), replica_name});
    }

    bool isCoveredByAnotherPart(const String & part_name) const
    {
        return isCoveredByAnotherPart(MergeTreePartInfo::fromPartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING));
    }

    /// True if a *different* part already registered contains `part_info` (a part does not cover itself).
    bool isCoveredByAnotherPart(const MergeTreePartInfo & part_info) const
    {
        auto partition_it = partitions.find(part_info.partition_id);
        if (partition_it == partitions.end())
            return false;

        const auto & parts = partition_it->second;

        /// Find the first part with max_block >= `part_info.min_block`.
        auto it_part = parts.lower_bound(part_info.min_block);
        if (it_part == parts.end())
        {
            /// All max_blocks < part_info.min_block, so there is no parts covering `part_info`.
            return false;
        }

        /// part_info.min_block <= current_info.max_block
        const auto & existing_part = it_part->second;
        if (part_info.max_block < existing_part.info.min_block)
        {
            /// (prev_info.max_block < part_info.min_block) AND (part_info.max_block < current_info.min_block),
            /// so there is no parts covering `part_info`.
            return false;
        }

        /// (part_info.min_block <= current_info.max_block) AND (part_info.max_block >= current_info.min_block), parts intersect.
        if (existing_part.info == part_info)
        {
            /// It's the same part, it's kind of covers itself, but we check in this function whether a part is covered by another part.
            return false;
        }

        /// Check if `part_info` is covered by `current_info`.
        return existing_part.info.contains(part_info);
    }

private:
    struct PartInfo
    {
        MergeTreePartInfo info;
        std::shared_ptr<const String> replica_name; /// Replica the part was registered from; used in error messages.
    };

    using Parts = std::map<Int64 /* max_block */, PartInfo>;
    std::unordered_map<String, Parts> partitions; /// Partition id -> that partition's parts, keyed by max_block.
    const String table_name_for_logs; /// NOTE(review): not referenced in any message within this class — confirm whether it is still needed.
};
/// Defaulted out of line: the destructor needs the complete definition of CoveredPartsFinder
/// (only forward-declared in the header) to destroy the unique_ptr member.
BackupCoordinationReplicatedTables::BackupCoordinationReplicatedTables() = default;
BackupCoordinationReplicatedTables::~BackupCoordinationReplicatedTables() = default;
/// Registers the parts one replica is going to put to the backup. The checksum of each part is
/// kept only to verify that parts with equal names on different replicas are really the same part.
void BackupCoordinationReplicatedTables::addPartNames(
    const String & table_shared_id,
    const String & table_name_for_logs,
    const String & replica_name,
    const std::vector<PartNameAndChecksum> & part_names_and_checksums)
{
    /// After prepare() the collected data is frozen.
    if (prepared)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "addPartNames() must not be called after preparing");

    auto & table_info = table_infos[table_shared_id];
    table_info.table_name_for_logs = table_name_for_logs;

    if (!table_info.covered_parts_finder)
        table_info.covered_parts_finder = std::make_unique<CoveredPartsFinder>(table_name_for_logs);

    /// One shared string per call: all parts added here reference the same replica name.
    auto replica_name_ptr = std::make_shared<String>(replica_name);

    for (const auto & part_name_and_checksum : part_names_and_checksums)
    {
        const auto & part_name = part_name_and_checksum.part_name;
        const auto & checksum = part_name_and_checksum.checksum;

        auto it = table_info.replicas_by_part_name.find(part_name);
        if (it == table_info.replicas_by_part_name.end())
        {
            /// First occurrence of this part name: remember its checksum.
            it = table_info.replicas_by_part_name.emplace(part_name, PartReplicas{}).first;
            it->second.checksum = checksum;
        }
        else
        {
            /// A part with the same name must have the same checksum on every replica.
            const auto & other = it->second;
            if (other.checksum != checksum)
            {
                const String & other_replica_name = **other.replica_names.begin();
                throw Exception(
                    ErrorCodes::CANNOT_BACKUP_TABLE,
                    "Table {} on replica {} has part {} which is different from the part on replica {}. Must be the same",
                    table_name_for_logs,
                    replica_name,
                    part_name,
                    other_replica_name);
            }
        }

        auto & replica_names = it->second.replica_names;

        /// `replica_names` should be ordered because we need this vector to be in the same order on every replica.
        replica_names.insert(
            std::upper_bound(replica_names.begin(), replica_names.end(), replica_name_ptr, LessReplicaName{}), replica_name_ptr);
    }
}
/// Returns the part names the given replica should put to the backup: the parts added via
/// addPartNames(), deduplicated and with covered parts removed, distributed between replicas.
Strings BackupCoordinationReplicatedTables::getPartNames(const String & table_shared_id, const String & replica_name) const
{
    /// Deduplication and covered-part removal happen lazily on the first read.
    prepare();

    auto table_it = table_infos.find(table_shared_id);
    if (table_it == table_infos.end())
        return {};

    const auto & by_replica = table_it->second.part_names_by_replica_name;
    auto replica_it = by_replica.find(replica_name);
    if (replica_it == by_replica.end())
        return {};

    return replica_it->second;
}
void BackupCoordinationReplicatedTables::addMutations(
const String & table_shared_id,
const String & table_name_for_logs,
const String & replica_name,
const std::vector<MutationInfo> & mutations)
{
if (prepared)
throw Exception(ErrorCodes::LOGICAL_ERROR, "addMutations() must not be called after preparing");
auto & table_info = table_infos[table_shared_id];
table_info.table_name_for_logs = table_name_for_logs;
for (const auto & [mutation_id, mutation_entry] : mutations)
table_info.mutations.emplace(mutation_id, mutation_entry);
/// std::max() because the calculation must give the same result being repeated on a different replica.
table_info.replica_name_to_store_mutations = std::max(table_info.replica_name_to_store_mutations, replica_name);
}
std::vector<MutationInfo>
BackupCoordinationReplicatedTables::getMutations(const String & table_shared_id, const String & replica_name) const
{
prepare();
auto it = table_infos.find(table_shared_id);
if (it == table_infos.end())
return {};
const auto & table_info = it->second;
if (table_info.replica_name_to_store_mutations != replica_name)
return {};
std::vector<MutationInfo> res;
for (const auto & [mutation_id, mutation_entry] : table_info.mutations)
res.emplace_back(MutationInfo{mutation_id, mutation_entry});
return res;
}
void BackupCoordinationReplicatedTables::addDataPath(const String & table_shared_id, const String & data_path)
{
auto & table_info = table_infos[table_shared_id];
table_info.data_paths.emplace(data_path);
}
/// Returns every data path registered via addDataPath() for the table; empty if the table is unknown.
Strings BackupCoordinationReplicatedTables::getDataPaths(const String & table_shared_id) const
{
    auto found = table_infos.find(table_shared_id);
    if (found == table_infos.end())
        return {};

    const auto & paths = found->second.data_paths;
    return Strings{paths.begin(), paths.end()};
}
/// Lazily post-processes everything collected by the add*() calls: removes covered parts,
/// distributes the surviving parts between replicas, and drops finished/unrelated mutations.
/// Subsequent calls are no-ops (`prepared` is set at the end).
void BackupCoordinationReplicatedTables::prepare() const
{
    if (prepared)
        return;

    size_t counter = 0;
    for (const auto & table_info : table_infos | boost::adaptors::map_values)
    {
        try
        {
            /// Remove parts covered by other parts.
            for (const auto & [part_name, part_replicas] : table_info.replicas_by_part_name)
            {
                auto part_info = MergeTreePartInfo::fromPartName(part_name, MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING);

                /// Track the minimum data version per partition; used below to filter mutations.
                auto & min_data_versions_by_partition = table_info.min_data_versions_by_partition;
                auto it2 = min_data_versions_by_partition.find(part_info.partition_id);
                if (it2 == min_data_versions_by_partition.end())
                    min_data_versions_by_partition[part_info.partition_id] = part_info.getDataVersion();
                else
                    it2->second = std::min(it2->second, part_info.getDataVersion());

                table_info.covered_parts_finder->addPartInfo(std::move(part_info), part_replicas.replica_names[0]);
            }

            for (const auto & [part_name, part_replicas] : table_info.replicas_by_part_name)
            {
                if (table_info.covered_parts_finder->isCoveredByAnotherPart(part_name))
                    continue;
                /// Round-robin assignment of each surviving part to one of the replicas that have it.
                size_t chosen_index = (counter++) % part_replicas.replica_names.size();
                const auto & chosen_replica_name = *part_replicas.replica_names[chosen_index];
                table_info.part_names_by_replica_name[chosen_replica_name].push_back(part_name);
            }

            /// Remove finished or unrelated mutations.
            std::unordered_map<String, String> unfinished_mutations;
            for (const auto & [mutation_id, mutation_entry_str] : table_info.mutations)
            {
                auto mutation_entry = ReplicatedMergeTreeMutationEntry::parse(mutation_entry_str, mutation_id);
                std::map<String, Int64> new_block_numbers;
                for (const auto & [partition_id, block_number] : mutation_entry.block_numbers)
                {
                    /// Keep the block number only if some backed-up part in this partition precedes it.
                    auto it = table_info.min_data_versions_by_partition.find(partition_id);
                    if ((it != table_info.min_data_versions_by_partition.end()) && (it->second < block_number))
                        new_block_numbers[partition_id] = block_number;
                }
                mutation_entry.block_numbers = std::move(new_block_numbers);
                if (!mutation_entry.block_numbers.empty())
                    unfinished_mutations[mutation_id] = mutation_entry.toString();
            }
            table_info.mutations = unfinished_mutations;
        }
        catch (Exception & e)
        {
            e.addMessage("While checking data of table {}", table_info.table_name_for_logs);
            throw;
        }
    }

    prepared = true;
}
}

View File

@ -0,0 +1,103 @@
#pragma once
#include <Backups/IBackupCoordination.h>
#include <map>
#include <memory>
#include <unordered_map>
#include <unordered_set>
namespace DB
{
/// Replicas use this class to coordinate how they write replicated tables to a backup.
/// "BACKUP ON CLUSTER" can be executed on multiple hosts and parts of replicated tables on those hosts could be slightly different
/// at any specific moment. This class is designed so that inside the backup all replicas would contain all the parts
/// no matter if the replication queues of those tables are fast or slow.
/// This is important to make RESTORE more correct and not dependent on random factors such as how fast the replicas doing
/// RESTORE are relative to each other, or how many replicas there will be when RESTORE is executed.
///
/// Example 1: Let's consider two replicas of a table, and let the first replica contain part all_1_1_0 and the second replica contain
/// all_2_2_0. The files in the backup will look like this:
/// /shards/1/replicas/1/data/mydb/mytable/all_1_1_0
/// /shards/1/replicas/1/data/mydb/mytable/all_2_2_0
/// /shards/1/replicas/2/data/mydb/mytable/all_1_1_0
/// /shards/1/replicas/2/data/mydb/mytable/all_2_2_0
///
/// Example 2: Let's consider two replicas again, and let the first replica contain parts all_1_1_0 and all_2_2_0 and
/// the second replica contain part all_1_2_1 (i.e. the second replica have those parts merged).
/// In this case the files in the backup will look like this:
/// /shards/1/replicas/1/data/mydb/mytable/all_1_2_1
/// /shards/1/replicas/2/data/mydb/mytable/all_1_2_1
class BackupCoordinationReplicatedTables
{
public:
    BackupCoordinationReplicatedTables();
    ~BackupCoordinationReplicatedTables();

    using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;

    /// Adds part names which a specified replica of a replicated table is going to put to the backup.
    /// Multiple replicas of the replicated table call this function and then the added part names can be returned by call of the function
    /// getPartNames().
    /// Checksums are used only to control that parts under the same names on different replicas are the same.
    void addPartNames(
        const String & table_shared_id,
        const String & table_name_for_logs,
        const String & replica_name,
        const std::vector<PartNameAndChecksum> & part_names_and_checksums);

    /// Returns the names of the parts which a specified replica of a replicated table should put to the backup.
    /// This is the same list as it was added by call of the function addPartNames() but without duplications and without
    /// parts covered by another parts.
    Strings getPartNames(const String & table_shared_id, const String & replica_name) const;

    using MutationInfo = IBackupCoordination::MutationInfo;

    /// Adds information about mutations of a replicated table.
    void addMutations(
        const String & table_shared_id,
        const String & table_name_for_logs,
        const String & replica_name,
        const std::vector<MutationInfo> & mutations);

    /// Returns all mutations of a replicated table which are not finished for some data parts added by addPartNames().
    std::vector<MutationInfo> getMutations(const String & table_shared_id, const String & replica_name) const;

    /// Adds a data path in backup for a replicated table.
    /// Multiple replicas of the replicated table call this function and then all the added paths can be returned by call of the function
    /// getDataPaths().
    void addDataPath(const String & table_shared_id, const String & data_path);

    /// Returns all the data paths in backup added for a replicated table (see also addDataPath()).
    Strings getDataPaths(const String & table_shared_id) const;

private:
    /// Lazily post-processes the collected information: deduplication, covered-part removal, mutation filtering.
    void prepare() const;

    class CoveredPartsFinder;

    /// The replicas which reported a part with the same name, plus that part's checksum.
    struct PartReplicas
    {
        std::vector<std::shared_ptr<const String>> replica_names;
        UInt128 checksum;
    };

    struct TableInfo
    {
        String table_name_for_logs;
        std::map<String /* part_name */, PartReplicas> replicas_by_part_name; /// Should be ordered because we need this map to be in the same order on every replica.
        mutable std::unordered_map<String /* replica_name> */, Strings> part_names_by_replica_name; /// Filled by prepare().
        std::unique_ptr<CoveredPartsFinder> covered_parts_finder;
        mutable std::unordered_map<String, Int64> min_data_versions_by_partition; /// Filled by prepare(); used to filter mutations.
        mutable std::unordered_map<String, String> mutations; /// mutation_id -> serialized entry; shrunk by prepare().
        String replica_name_to_store_mutations; /// The single replica chosen to write mutations (the max replica name seen).
        std::unordered_set<String> data_paths;
    };

    std::map<String /* table_shared_id */, TableInfo> table_infos; /// Should be ordered because we need this map to be in the same order on every replica.
    mutable bool prepared = false; /// Set by prepare(); the add*() functions must not be called afterwards.
};
}

View File

@ -0,0 +1,182 @@
#include <Backups/BackupCoordinationStatusSync.h>
#include <Common/Exception.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <base/chrono_io.h>
namespace DB
{
namespace ErrorCodes
{
extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE;
}
/// Stores the coordination path and ZooKeeper accessor, then makes sure
/// the root coordination node (and its ancestors) exist in ZooKeeper.
BackupCoordinationStatusSync::BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_)
    : zookeeper_path(zookeeper_path_)
    , get_zookeeper(get_zookeeper_)
    , log(log_)
{
    createRootNodes();
}
void BackupCoordinationStatusSync::createRootNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->createAncestors(zookeeper_path);
zookeeper->createIfNotExists(zookeeper_path, "");
}
void BackupCoordinationStatusSync::set(const String & current_host, const String & new_status, const String & message)
{
auto zookeeper = get_zookeeper();
zookeeper->createIfNotExists(zookeeper_path + "/" + current_host + "|" + new_status, message);
}
void BackupCoordinationStatusSync::setError(const String & current_host, const Exception & exception)
{
auto zookeeper = get_zookeeper();
Exception exception2 = exception;
exception2.addMessage("Host {}", current_host);
WriteBufferFromOwnString buf;
writeException(exception2, buf, true);
zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str());
}
/// Waits (with no time limit) until every host reaches `status_to_wait`.
Strings BackupCoordinationStatusSync::wait(const Strings & all_hosts, const String & status_to_wait)
{
    return waitImpl(all_hosts, status_to_wait, std::nullopt);
}
/// Same as wait(), but gives up (and throws) after `timeout_ms` milliseconds.
Strings BackupCoordinationStatusSync::waitFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms)
{
    return waitImpl(all_hosts, status_to_wait, std::optional<UInt64>{timeout_ms});
}
/// Core waiting routine shared by wait() and waitFor().
/// Blocks until every host in `all_hosts` has created its "<host>|<status_to_wait>" node under
/// `zookeeper_path`, an "error" node appears (the error is rethrown), or — when `timeout_ms`
/// is set — the time runs out (FAILED_TO_SYNC_BACKUP_OR_RESTORE is thrown).
/// Returns the node contents (the messages the hosts passed to set()) in the order of `all_hosts`.
Strings BackupCoordinationStatusSync::waitImpl(const Strings & all_hosts, const String & status_to_wait, std::optional<UInt64> timeout_ms)
{
    if (all_hosts.empty())
        return {};

    /// Wait for other hosts.

    Strings ready_hosts_results;
    ready_hosts_results.resize(all_hosts.size());

    /// The same host may occur several times in `all_hosts`, so for each host we keep
    /// every index it occupies and fill all the corresponding result slots at once.
    std::map<String, std::vector<size_t> /* index in `ready_hosts_results` */> unready_hosts;
    for (size_t i = 0; i != all_hosts.size(); ++i)
        unready_hosts[all_hosts[i]].push_back(i);

    std::optional<Exception> error;

    auto zookeeper = get_zookeeper();

    /// Scans the children of `zookeeper_path`: moves hosts which reached `status_to_wait`
    /// out of `unready_hosts` (recording their messages), or sets `error` if an "error"
    /// node (written by setError()) is found.
    auto process_zk_nodes = [&](const Strings & zk_nodes)
    {
        for (const String & zk_node : zk_nodes)
        {
            /// "remove_watch-" nodes are created below only to trigger a leftover watch; skip them.
            if (zk_node.starts_with("remove_watch-"))
                continue;

            if (zk_node == "error")
            {
                ReadBufferFromOwnString buf{zookeeper->get(zookeeper_path + "/error")};
                error = readException(buf, "", true);
                break;
            }

            /// Status nodes have the form "<host>|<status>" (see set()).
            size_t separator_pos = zk_node.find('|');
            if (separator_pos == String::npos)
                throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected zk node {}", zookeeper_path + "/" + zk_node);
            String host = zk_node.substr(0, separator_pos);
            String status = zk_node.substr(separator_pos + 1);
            auto it = unready_hosts.find(host);
            if ((it != unready_hosts.end()) && (status == status_to_wait))
            {
                String result = zookeeper->get(zookeeper_path + "/" + zk_node);
                for (size_t i : it->second)
                    ready_hosts_results[i] = result;
                unready_hosts.erase(it);
            }
        }
    };

    /// Wait until all hosts are ready or an error happens or time is out.
    /// ZooKeeper watches are one-shot, so `watch_set` tracks whether a watch is currently armed.
    std::atomic<bool> watch_set = false;
    std::condition_variable watch_triggered_event;

    auto watch_callback = [&](const Coordination::WatchResponse &)
    {
        watch_set = false; /// After it's triggered it's not set until we call getChildrenWatch() again.
        watch_triggered_event.notify_all();
    };

    auto watch_triggered = [&] { return !watch_set; };

    bool use_timeout = timeout_ms.has_value();
    std::chrono::milliseconds timeout{timeout_ms.value_or(0)};
    std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
    std::chrono::steady_clock::duration elapsed;
    /// The condition variable is driven by `watch_set`, not by data guarded by this mutex;
    /// the mutex exists only because std::condition_variable requires one.
    std::mutex dummy_mutex;
    String previous_unready_host;

    while (!unready_hosts.empty() && !error)
    {
        watch_set = true;
        Strings nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback);
        process_zk_nodes(nodes);

        if (!unready_hosts.empty() && !error)
        {
            /// Log only when the host we're waiting for changes, to avoid log spam on each wakeup.
            const auto & unready_host = unready_hosts.begin()->first;
            if (unready_host != previous_unready_host)
            {
                LOG_TRACE(log, "Waiting for host {}", unready_host);
                previous_unready_host = unready_host;
            }

            std::unique_lock dummy_lock{dummy_mutex};
            if (use_timeout)
            {
                elapsed = std::chrono::steady_clock::now() - start_time;
                if ((elapsed > timeout) || !watch_triggered_event.wait_for(dummy_lock, timeout - elapsed, watch_triggered))
                    break;
            }
            else
                watch_triggered_event.wait(dummy_lock, watch_triggered);
        }
    }

    if (watch_set)
    {
        /// Remove watch by triggering it: creating any child fires the pending getChildrenWatch()
        /// callback, so it cannot fire later against this (soon destroyed) condition variable.
        zookeeper->create(zookeeper_path + "/remove_watch-", "", zkutil::CreateMode::EphemeralSequential);
        std::unique_lock dummy_lock{dummy_mutex};
        watch_triggered_event.wait(dummy_lock, watch_triggered);
    }

    if (error)
        error->rethrow();

    /// NOTE(review): `elapsed` is declared uninitialized above; this branch looks reachable only
    /// after the timed-out `break`, where `elapsed` has been assigned — but a defensive `{}`
    /// initializer would make that obvious. Confirm no other path can reach this throw.
    if (!unready_hosts.empty())
    {
        throw Exception(
            ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
            "Waited for host {} too long ({})",
            unready_hosts.begin()->first,
            to_string(elapsed));
    }

    return ready_hosts_results;
}
}

View File

@ -0,0 +1,37 @@
#pragma once
#include <Common/ZooKeeper/Common.h>
namespace DB
{
/// Used to coordinate hosts so all hosts would come to a specific status at around the same time.
class BackupCoordinationStatusSync
{
public:
    BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_);

    /// Sets the status of the current host and signals other hosts if there were hosts waiting for that.
    void set(const String & current_host, const String & new_status, const String & message);

    /// Reports a failure on the current host by storing the serialized exception under a dedicated
    /// "error" node, so the hosts blocked in wait()/waitFor() can rethrow it.
    void setError(const String & current_host, const Exception & exception);

    /// Waits until all the specified hosts come to the required status.
    /// The function returns the messages all hosts set when they come to the required status.
    Strings wait(const Strings & all_hosts, const String & status_to_wait);

    /// Almost the same as wait() but this one stops waiting and throws an exception after a specific amount of time.
    Strings waitFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms);

    static constexpr const char * kErrorStatus = "error";

private:
    void createRootNodes();
    Strings waitImpl(const Strings & all_hosts, const String & status_to_wait, std::optional<UInt64> timeout_ms);

    String zookeeper_path;           /// Root ZooKeeper node under which per-host status nodes are created.
    zkutil::GetZooKeeper get_zookeeper;
    Poco::Logger * log;
};
}

View File

@ -1,7 +1,6 @@
#include <Backups/BackupEntriesCollector.h>
#include <Backups/BackupEntryFromMemory.h>
#include <Backups/IBackupCoordination.h>
#include <Backups/BackupCoordinationHelpers.h>
#include <Backups/BackupUtils.h>
#include <Backups/DDLAdjustingForBackupVisitor.h>
#include <Databases/IDatabase.h>
@ -46,9 +45,6 @@ namespace
/// Writing backup entries to the backup and removing temporary hard links.
constexpr const char * kWritingBackupStatus = "writing backup";
/// Error status.
constexpr const char * kErrorStatus = BackupCoordinationStatusSync::kErrorStatus;
/// Uppercases the first character of a passed string.
String toUpperFirst(const String & str)
{
@ -103,85 +99,60 @@ BackupEntriesCollector::~BackupEntriesCollector() = default;
BackupEntries BackupEntriesCollector::run()
{
try
{
/// run() can be called onle once.
if (!current_status.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Already making backup entries");
/// run() can be called onle once.
if (!current_status.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Already making backup entries");
/// Find other hosts working along with us to execute this ON CLUSTER query.
all_hosts
= BackupSettings::Util::filterHostIDs(backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num);
/// Find other hosts working along with us to execute this ON CLUSTER query.
all_hosts
= BackupSettings::Util::filterHostIDs(backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num);
/// Do renaming in the create queries according to the renaming config.
renaming_map = makeRenamingMapFromBackupQuery(backup_query_elements);
/// Do renaming in the create queries according to the renaming config.
renaming_map = makeRenamingMapFromBackupQuery(backup_query_elements);
/// Calculate the root path for collecting backup entries, it's either empty or has the format "shards/<shard_num>/replicas/<replica_num>/".
calculateRootPathInBackup();
/// Calculate the root path for collecting backup entries, it's either empty or has the format "shards/<shard_num>/replicas/<replica_num>/".
calculateRootPathInBackup();
/// Find databases and tables which we're going to put to the backup.
gatherMetadataAndCheckConsistency();
/// Find databases and tables which we're going to put to the backup.
gatherMetadataAndCheckConsistency();
/// Make backup entries for the definitions of the found databases.
makeBackupEntriesForDatabasesDefs();
/// Make backup entries for the definitions of the found databases.
makeBackupEntriesForDatabasesDefs();
/// Make backup entries for the definitions of the found tables.
makeBackupEntriesForTablesDefs();
/// Make backup entries for the definitions of the found tables.
makeBackupEntriesForTablesDefs();
/// Make backup entries for the data of the found tables.
setStatus(kExtractingDataFromTablesStatus);
makeBackupEntriesForTablesData();
/// Make backup entries for the data of the found tables.
setStatus(kExtractingDataFromTablesStatus);
makeBackupEntriesForTablesData();
/// Run all the tasks added with addPostCollectingTask().
setStatus(kRunningPostTasksStatus);
runPostTasks();
/// Run all the tasks added with addPostCollectingTask().
setStatus(kRunningPostTasksStatus);
runPostTasks();
/// No more backup entries or tasks are allowed after this point.
setStatus(kWritingBackupStatus);
/// No more backup entries or tasks are allowed after this point.
setStatus(kWritingBackupStatus);
return std::move(backup_entries);
}
catch (...)
{
try
{
setStatus(kErrorStatus, getCurrentExceptionMessage(false));
}
catch (...)
{
}
throw;
}
return std::move(backup_entries);
}
Strings BackupEntriesCollector::setStatus(const String & new_status, const String & message)
{
if (new_status == kErrorStatus)
LOG_TRACE(log, "{}", toUpperFirst(new_status));
current_status = new_status;
backup_coordination->setStatus(backup_settings.host_id, new_status, message);
if (new_status.starts_with(kGatheringMetadataStatus))
{
LOG_ERROR(log, "{} failed with error: {}", toUpperFirst(current_status), message);
backup_coordination->setStatus(backup_settings.host_id, new_status, message);
return {};
auto now = std::chrono::steady_clock::now();
auto end_of_timeout = std::max(now, consistent_metadata_snapshot_start_time + consistent_metadata_snapshot_timeout);
return backup_coordination->waitStatusFor(
all_hosts, new_status, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - now).count());
}
else
{
LOG_TRACE(log, "{}", toUpperFirst(new_status));
current_status = new_status;
if (new_status.starts_with(kGatheringMetadataStatus))
{
auto now = std::chrono::steady_clock::now();
auto end_of_timeout = std::max(now, consistent_metadata_snapshot_start_time + consistent_metadata_snapshot_timeout);
return backup_coordination->setStatusAndWaitFor(
backup_settings.host_id,
new_status,
message,
all_hosts,
std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - now).count());
}
else
{
return backup_coordination->setStatusAndWait(backup_settings.host_id, new_status, message, all_hosts);
}
return backup_coordination->waitStatus(all_hosts, new_status);
}
}
@ -436,46 +407,7 @@ void BackupEntriesCollector::gatherTablesMetadata()
table_infos.clear();
for (const auto & [database_name, database_info] : database_infos)
{
const auto & database = database_info.database;
bool is_temporary_database = (database_name == DatabaseCatalog::TEMPORARY_DATABASE);
auto filter_by_table_name = [database_info = &database_info](const String & table_name)
{
/// We skip inner tables of materialized views.
if (table_name.starts_with(".inner_id."))
return false;
if (database_info->tables.contains(table_name))
return true;
if (database_info->all_tables)
return !database_info->except_table_names.contains(table_name);
return false;
};
auto db_tables = database->getTablesForBackup(filter_by_table_name, context);
std::unordered_set<String> found_table_names;
for (const auto & db_table : db_tables)
{
const auto & create_table_query = db_table.first;
const auto & create = create_table_query->as<const ASTCreateQuery &>();
found_table_names.emplace(create.getTable());
if (is_temporary_database && !create.temporary)
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Got a non-temporary create query for {}", tableNameWithTypeToString(database_name, create.getTable(), false));
if (!is_temporary_database && (create.getDatabase() != database_name))
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Got a create query with unexpected database name {} for {}", backQuoteIfNeed(create.getDatabase()), tableNameWithTypeToString(database_name, create.getTable(), false));
}
/// Check that all tables were found.
for (const auto & [table_name, table_info] : database_info.tables)
{
if (table_info.throw_if_table_not_found && !found_table_names.contains(table_name))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "{} not found", tableNameWithTypeToString(database_name, table_name, true));
}
std::vector<std::pair<ASTPtr, StoragePtr>> db_tables = findTablesInDatabase(database_name);
for (const auto & db_table : db_tables)
{
@ -501,7 +433,7 @@ void BackupEntriesCollector::gatherTablesMetadata()
/// Add information to `table_infos`.
auto & res_table_info = table_infos[QualifiedTableName{database_name, table_name}];
res_table_info.database = database;
res_table_info.database = database_info.database;
res_table_info.storage = storage;
res_table_info.create_table_query = create_table_query;
res_table_info.metadata_path_in_backup = metadata_path_in_backup;
@ -528,6 +460,67 @@ void BackupEntriesCollector::gatherTablesMetadata()
}
}
std::vector<std::pair<ASTPtr, StoragePtr>> BackupEntriesCollector::findTablesInDatabase(const String & database_name) const
{
const auto & database_info = database_infos.at(database_name);
const auto & database = database_info.database;
auto filter_by_table_name = [database_info = &database_info](const String & table_name)
{
/// We skip inner tables of materialized views.
if (table_name.starts_with(".inner_id."))
return false;
if (database_info->tables.contains(table_name))
return true;
if (database_info->all_tables)
return !database_info->except_table_names.contains(table_name);
return false;
};
std::vector<std::pair<ASTPtr, StoragePtr>> db_tables;
try
{
db_tables = database->getTablesForBackup(filter_by_table_name, context);
}
catch (Exception & e)
{
e.addMessage("While collecting tables for backup in database {}", backQuoteIfNeed(database_name));
throw;
}
std::unordered_set<String> found_table_names;
for (const auto & db_table : db_tables)
{
const auto & create_table_query = db_table.first;
const auto & create = create_table_query->as<const ASTCreateQuery &>();
found_table_names.emplace(create.getTable());
if (database_name == DatabaseCatalog::TEMPORARY_DATABASE)
{
if (!create.temporary)
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Got a non-temporary create query for {}", tableNameWithTypeToString(database_name, create.getTable(), false));
}
else
{
if (create.getDatabase() != database_name)
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Got a create query with unexpected database name {} for {}", backQuoteIfNeed(create.getDatabase()), tableNameWithTypeToString(database_name, create.getTable(), false));
}
}
/// Check that all tables were found.
for (const auto & [table_name, table_info] : database_info.tables)
{
if (table_info.throw_if_table_not_found && !found_table_names.contains(table_name))
throw Exception(ErrorCodes::UNKNOWN_TABLE, "{} was not found", tableNameWithTypeToString(database_name, table_name, true));
}
return db_tables;
}
void BackupEntriesCollector::lockTablesForReading()
{
for (auto & [table_name, table_info] : table_infos)
@ -544,7 +537,7 @@ void BackupEntriesCollector::lockTablesForReading()
{
if (e.code() != ErrorCodes::TABLE_IS_DROPPED)
throw;
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "{} is dropped", tableNameWithTypeToString(table_name.database, table_name.table, true));
throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "{} was dropped during scanning", tableNameWithTypeToString(table_name.database, table_name.table, true));
}
}
}
@ -648,7 +641,7 @@ void BackupEntriesCollector::makeBackupEntriesForDatabasesDefs()
if (!database_info.create_database_query)
continue; /// We store CREATE DATABASE queries only if there was BACKUP DATABASE specified.
LOG_TRACE(log, "Adding definition of database {}", backQuoteIfNeed(database_name));
LOG_TRACE(log, "Adding the definition of database {} to backup", backQuoteIfNeed(database_name));
ASTPtr new_create_query = database_info.create_database_query;
adjustCreateQueryForBackup(new_create_query, context->getGlobalContext(), nullptr);
@ -664,7 +657,7 @@ void BackupEntriesCollector::makeBackupEntriesForTablesDefs()
{
for (auto & [table_name, table_info] : table_infos)
{
LOG_TRACE(log, "Adding definition of {}", tableNameWithTypeToString(table_name.database, table_name.table, false));
LOG_TRACE(log, "Adding the definition of {} to backup", tableNameWithTypeToString(table_name.database, table_name.table, false));
ASTPtr new_create_query = table_info.create_table_query;
adjustCreateQueryForBackup(new_create_query, context->getGlobalContext(), &table_info.replicated_table_shared_id);
@ -680,24 +673,40 @@ void BackupEntriesCollector::makeBackupEntriesForTablesData()
if (backup_settings.structure_only)
return;
for (const auto & [table_name, table_info] : table_infos)
for (const auto & table_name : table_infos | boost::adaptors::map_keys)
makeBackupEntriesForTableData(table_name);
}
void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableName & table_name)
{
if (backup_settings.structure_only)
return;
const auto & table_info = table_infos.at(table_name);
const auto & storage = table_info.storage;
const auto & data_path_in_backup = table_info.data_path_in_backup;
if (!storage)
{
const auto & storage = table_info.storage;
const auto & data_path_in_backup = table_info.data_path_in_backup;
if (storage)
{
LOG_TRACE(log, "Adding data of {}", tableNameWithTypeToString(table_name.database, table_name.table, false));
storage->backupData(*this, data_path_in_backup, table_info.partitions);
}
else
{
/// Storage == null means this storage exists on other replicas but it has not been created on this replica yet.
/// If this table is replicated in this case we call IBackupCoordination::addReplicatedDataPath() which will cause
/// other replicas to fill the storage's data in the backup.
/// If this table is not replicated we'll do nothing leaving the storage's data empty in the backup.
if (table_info.replicated_table_shared_id)
backup_coordination->addReplicatedDataPath(*table_info.replicated_table_shared_id, data_path_in_backup);
}
/// If storage == null that means this storage exists on other replicas but it has not been created on this replica yet.
/// If this table is replicated in this case we call IBackupCoordination::addReplicatedDataPath() which will cause
/// other replicas to fill the storage's data in the backup.
/// If this table is not replicated we'll do nothing leaving the storage's data empty in the backup.
if (table_info.replicated_table_shared_id)
backup_coordination->addReplicatedDataPath(*table_info.replicated_table_shared_id, data_path_in_backup);
return;
}
LOG_TRACE(log, "Collecting data of {} for backup", tableNameWithTypeToString(table_name.database, table_name.table, false));
try
{
storage->backupData(*this, data_path_in_backup, table_info.partitions);
}
catch (Exception & e)
{
e.addMessage("While collecting data of {} for backup", tableNameWithTypeToString(table_name.database, table_name.table, false));
throw;
}
}
@ -716,21 +725,21 @@ void BackupEntriesCollector::addBackupEntry(const std::pair<String, BackupEntryP
void BackupEntriesCollector::addBackupEntries(const BackupEntries & backup_entries_)
{
if (current_status == kWritingBackupStatus)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of backup entries is not allowed");
insertAtEnd(backup_entries, backup_entries_);
}
void BackupEntriesCollector::addBackupEntries(BackupEntries && backup_entries_)
{
if (current_status == kWritingBackupStatus)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of backup entries is not allowed");
insertAtEnd(backup_entries, std::move(backup_entries_));
}
void BackupEntriesCollector::addPostTask(std::function<void()> task)
{
if (current_status == kWritingBackupStatus)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding post tasks is not allowed");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of post tasks is not allowed");
post_tasks.push(std::move(task));
}

View File

@ -75,12 +75,15 @@ private:
const std::set<DatabaseAndTableName> & except_table_names);
void gatherTablesMetadata();
std::vector<std::pair<ASTPtr, StoragePtr>> findTablesInDatabase(const String & database_name) const;
void lockTablesForReading();
bool compareWithPrevious(std::optional<Exception> & inconsistency_error);
void makeBackupEntriesForDatabasesDefs();
void makeBackupEntriesForTablesDefs();
void makeBackupEntriesForTablesData();
void makeBackupEntriesForTableData(const QualifiedTableName & table_name);
void runPostTasks();
Strings setStatus(const String & new_status, const String & message = "");

View File

@ -1,9 +1,7 @@
#include <Backups/BackupIO_Disk.h>
#include <Common/Exception.h>
#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFileBase.h>
#include <IO/WriteBufferFromFileBase.h>
#include <Common/logger_useful.h>
namespace DB
@ -49,17 +47,10 @@ std::unique_ptr<WriteBuffer> BackupWriterDisk::writeFile(const String & file_nam
void BackupWriterDisk::removeFilesAfterFailure(const Strings & file_names)
{
try
{
for (const auto & file_name : file_names)
disk->removeFileIfExists(path / file_name);
if (disk->isDirectory(path) && disk->isDirectoryEmpty(path))
disk->removeDirectory(path);
}
catch (...)
{
LOG_WARNING(&Poco::Logger::get("BackupWriterDisk"), "RemoveFilesAfterFailure: {}", getCurrentExceptionMessage(false));
}
for (const auto & file_name : file_names)
disk->removeFileIfExists(path / file_name);
if (disk->isDirectory(path) && disk->isDirectoryEmpty(path))
disk->removeDirectory(path);
}
}

View File

@ -1,8 +1,6 @@
#include <Backups/BackupIO_File.h>
#include <Common/Exception.h>
#include <Disks/IO/createReadBufferFromFileBase.h>
#include <IO/WriteBufferFromFile.h>
#include <Common/logger_useful.h>
namespace fs = std::filesystem;
@ -50,17 +48,10 @@ std::unique_ptr<WriteBuffer> BackupWriterFile::writeFile(const String & file_nam
void BackupWriterFile::removeFilesAfterFailure(const Strings & file_names)
{
try
{
for (const auto & file_name : file_names)
fs::remove(path / file_name);
if (fs::is_directory(path) && fs::is_empty(path))
fs::remove(path);
}
catch (...)
{
LOG_WARNING(&Poco::Logger::get("BackupWriterFile"), "RemoveFilesAfterFailure: {}", getCurrentExceptionMessage(false));
}
for (const auto & file_name : file_names)
fs::remove(path / file_name);
if (fs::is_directory(path) && fs::is_empty(path))
fs::remove(path);
}
}

View File

@ -4,7 +4,7 @@
#include <Backups/BackupIO.h>
#include <Backups/IBackupEntry.h>
#include <Backups/BackupCoordinationLocal.h>
#include <Backups/BackupCoordinationDistributed.h>
#include <Backups/BackupCoordinationRemote.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/hex.h>
#include <Common/quoteString.h>
@ -167,7 +167,14 @@ BackupImpl::BackupImpl(
BackupImpl::~BackupImpl()
{
close();
try
{
close();
}
catch (...)
{
DB::tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
@ -231,10 +238,11 @@ void BackupImpl::close()
archive_writer = {"", nullptr};
if (!is_internal_backup && writer && !writing_finalized)
{
LOG_INFO(log, "Removing all files of backup {} after failure", backup_name);
removeAllFilesAfterFailure();
}
writer.reset();
reader.reset();
coordination.reset();
}
time_t BackupImpl::getTimestamp() const
@ -733,24 +741,33 @@ std::shared_ptr<IArchiveWriter> BackupImpl::getArchiveWriter(const String & suff
void BackupImpl::removeAllFilesAfterFailure()
{
Strings files_to_remove;
if (use_archives)
try
{
files_to_remove.push_back(archive_params.archive_name);
for (const auto & suffix : coordination->getAllArchiveSuffixes())
LOG_INFO(log, "Removing all files of backup {} after failure", backup_name);
Strings files_to_remove;
if (use_archives)
{
String archive_name_with_suffix = getArchiveNameWithSuffix(suffix);
files_to_remove.push_back(std::move(archive_name_with_suffix));
files_to_remove.push_back(archive_params.archive_name);
for (const auto & suffix : coordination->getAllArchiveSuffixes())
{
String archive_name_with_suffix = getArchiveNameWithSuffix(suffix);
files_to_remove.push_back(std::move(archive_name_with_suffix));
}
}
else
{
files_to_remove.push_back(".backup");
for (const auto & file_info : coordination->getAllFileInfos())
files_to_remove.push_back(file_info.data_file_name);
}
writer->removeFilesAfterFailure(files_to_remove);
}
else
catch (...)
{
files_to_remove.push_back(".backup");
for (const auto & file_info : coordination->getAllFileInfos())
files_to_remove.push_back(file_info.data_file_name);
DB::tryLogCurrentException(__PRETTY_FUNCTION__);
}
writer->removeFilesAfterFailure(files_to_remove);
}
}

View File

@ -5,9 +5,9 @@
#include <Backups/BackupUtils.h>
#include <Backups/IBackupEntry.h>
#include <Backups/BackupEntriesCollector.h>
#include <Backups/BackupCoordinationDistributed.h>
#include <Backups/BackupCoordinationRemote.h>
#include <Backups/BackupCoordinationLocal.h>
#include <Backups/RestoreCoordinationDistributed.h>
#include <Backups/RestoreCoordinationRemote.h>
#include <Backups/RestoreCoordinationLocal.h>
#include <Backups/RestoreSettings.h>
#include <Backups/RestorerFromBackup.h>
@ -15,7 +15,6 @@
#include <Interpreters/Context.h>
#include <Interpreters/executeDDLQueryOnCluster.h>
#include <Parsers/ASTBackupQuery.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Common/Exception.h>
#include <Common/Macros.h>
#include <Common/logger_useful.h>
@ -31,6 +30,28 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
namespace
{
/// Coordination status meaning that a host finished its work.
constexpr const char * kCompletedCoordinationStatus = "completed";
/// Sends information about the current exception to IBackupCoordination or IRestoreCoordination.
template <typename CoordinationType>
void sendErrorToCoordination(std::shared_ptr<CoordinationType> coordination, const String & current_host)
{
if (!coordination)
return;
try
{
coordination->setErrorStatus(current_host, Exception{getCurrentExceptionCode(), getCurrentExceptionMessage(true, true)});
}
catch (...)
{
}
}
}
BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threads)
: backups_thread_pool(num_backup_threads, /* max_free_threads = */ 0, num_backup_threads)
, restores_thread_pool(num_restore_threads, /* max_free_threads = */ 0, num_restore_threads)
@ -76,21 +97,19 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c
backup_info,
on_cluster,
context_in_use,
thread_group = CurrentThread::getGroup(),
mutable_context](bool async) mutable
{
SCOPE_EXIT_SAFE(
if (async)
CurrentThread::detachQueryIfNotDetached();
);
std::optional<CurrentThread::QueryScope> query_scope;
std::shared_ptr<IBackupCoordination> backup_coordination;
SCOPE_EXIT_SAFE(if (backup_coordination && !backup_settings.internal) backup_coordination->drop(););
try
{
if (async && thread_group)
CurrentThread::attachTo(thread_group);
if (async)
{
query_scope.emplace(context_in_use);
setThreadName("BackupWorker");
}
/// Checks access rights if this is not ON CLUSTER query.
/// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.)
@ -98,13 +117,6 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c
if (!on_cluster)
context_in_use->checkAccess(required_access);
/// Make a backup coordination.
std::shared_ptr<IBackupCoordination> backup_coordination;
SCOPE_EXIT({
if (backup_coordination && !backup_settings.internal)
backup_coordination->drop();
});
ClusterPtr cluster;
if (on_cluster)
{
@ -118,9 +130,10 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c
}
}
/// Make a backup coordination.
if (!backup_settings.coordination_zk_path.empty())
{
backup_coordination = std::make_shared<BackupCoordinationDistributed>(
backup_coordination = std::make_shared<BackupCoordinationRemote>(
backup_settings.coordination_zk_path,
[global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); });
}
@ -151,26 +164,34 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c
params.only_shard_num = backup_settings.shard_num;
params.only_replica_num = backup_settings.replica_num;
params.access_to_check = required_access;
mutable_context->setSetting("distributed_ddl_task_timeout", -1); // No timeout
mutable_context->setSetting("distributed_ddl_output_mode", Field{"throw"});
backup_settings.copySettingsToQuery(*backup_query);
auto res = executeDDLQueryOnCluster(backup_query, mutable_context, params);
auto on_cluster_io = std::make_shared<BlockIO>(std::move(res));
PullingPipelineExecutor executor(on_cluster_io->pipeline);
Block block;
while (executor.pull(block));
// executeDDLQueryOnCluster() will return without waiting for completion
mutable_context->setSetting("distributed_ddl_task_timeout", Field{0});
mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"});
executeDDLQueryOnCluster(backup_query, mutable_context, params);
/// Wait until all the hosts have written their backup entries.
auto all_hosts = BackupSettings::Util::filterHostIDs(
backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num);
backup_coordination->waitStatus(all_hosts, kCompletedCoordinationStatus);
}
else
{
backup_query->setCurrentDatabase(context_in_use->getCurrentDatabase());
/// Prepare backup entries.
BackupEntries backup_entries;
{
BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, context_in_use};
backup_entries = backup_entries_collector.run();
}
/// Write the backup entries to the backup.
writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool);
/// We have written our backup entries, we need to tell other hosts (they could be waiting for it).
backup_coordination->setStatus(backup_settings.host_id, kCompletedCoordinationStatus, "");
}
/// Finalize backup (write its metadata).
@ -184,7 +205,9 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c
}
catch (...)
{
/// Something bad happened, the backup has not been built.
setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP);
sendErrorToCoordination(backup_coordination, backup_settings.host_id);
if (!async)
throw;
}
@ -224,21 +247,19 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte
restore_settings,
backup_info,
on_cluster,
thread_group = CurrentThread::getGroup(),
context_in_use](bool async) mutable
{
SCOPE_EXIT_SAFE(
if (async)
CurrentThread::detachQueryIfNotDetached();
);
std::optional<CurrentThread::QueryScope> query_scope;
std::shared_ptr<IRestoreCoordination> restore_coordination;
SCOPE_EXIT_SAFE(if (restore_coordination && !restore_settings.internal) restore_coordination->drop(););
try
{
if (async && thread_group)
CurrentThread::attachTo(thread_group);
if (async)
{
query_scope.emplace(context_in_use);
setThreadName("RestoreWorker");
}
/// Open the backup for reading.
BackupFactory::CreateParams backup_open_params;
@ -277,12 +298,6 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte
}
/// Make a restore coordination.
std::shared_ptr<IRestoreCoordination> restore_coordination;
SCOPE_EXIT({
if (restore_coordination && !restore_settings.internal)
restore_coordination->drop();
});
if (on_cluster && restore_settings.coordination_zk_path.empty())
{
String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
@ -291,7 +306,7 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte
if (!restore_settings.coordination_zk_path.empty())
{
restore_coordination = std::make_shared<RestoreCoordinationDistributed>(
restore_coordination = std::make_shared<RestoreCoordinationRemote>(
restore_settings.coordination_zk_path,
[global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); });
}
@ -308,20 +323,24 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte
params.cluster = cluster;
params.only_shard_num = restore_settings.shard_num;
params.only_replica_num = restore_settings.replica_num;
context_in_use->setSetting("distributed_ddl_task_timeout", -1); // No timeout
context_in_use->setSetting("distributed_ddl_output_mode", Field{"throw"});
restore_settings.copySettingsToQuery(*restore_query);
auto res = executeDDLQueryOnCluster(restore_query, context_in_use, params);
auto on_cluster_io = std::make_shared<BlockIO>(std::move(res));
PullingPipelineExecutor executor(on_cluster_io->pipeline);
Block block;
while (executor.pull(block))
;
// executeDDLQueryOnCluster() will return without waiting for completion
context_in_use->setSetting("distributed_ddl_task_timeout", Field{0});
context_in_use->setSetting("distributed_ddl_output_mode", Field{"none"});
executeDDLQueryOnCluster(restore_query, context_in_use, params);
/// Wait until all the hosts have written their backup entries.
auto all_hosts = BackupSettings::Util::filterHostIDs(
restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
restore_coordination->waitStatus(all_hosts, kCompletedCoordinationStatus);
}
else
{
restore_query->setCurrentDatabase(current_database);
/// Restore metadata and prepare data restoring tasks.
DataRestoreTasks data_restore_tasks;
{
RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination,
@ -329,14 +348,20 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte
data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE);
}
/// Execute the data restoring tasks.
restoreTablesData(std::move(data_restore_tasks), restores_thread_pool);
/// We have restored everything, we need to tell other hosts (they could be waiting for it).
restore_coordination->setStatus(restore_settings.host_id, kCompletedCoordinationStatus, "");
}
setStatus(restore_uuid, BackupStatus::RESTORED);
}
catch (...)
{
/// Something bad happened, the restore has not been completed.
setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE);
sendErrorToCoordination(restore_coordination, restore_settings.host_id);
if (!async)
throw;
}

View File

@ -6,17 +6,23 @@
namespace DB
{
class Exception;
enum class AccessEntityType;
/// Keeps information about files contained in a backup.
/// Replicas use this class to coordinate what they're writing to a backup while executing BACKUP ON CLUSTER.
/// There are two implementations of this interface: BackupCoordinationLocal and BackupCoordinationRemote.
/// BackupCoordinationLocal is used while executing BACKUP without ON CLUSTER and performs coordination in memory.
/// BackupCoordinationRemote is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
class IBackupCoordination
{
public:
virtual ~IBackupCoordination() = default;
/// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts.
/// Sets the current status and waits for other hosts to come to this status too.
virtual void setStatus(const String & current_host, const String & new_status, const String & message) = 0;
virtual Strings setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & other_hosts) = 0;
virtual Strings setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & other_hosts, UInt64 timeout_ms) = 0;
virtual void setErrorStatus(const String & current_host, const Exception & exception) = 0;
virtual Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) = 0;
virtual Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) = 0;
struct PartNameAndChecksum
{
@ -36,6 +42,18 @@ public:
/// parts covered by another parts.
virtual Strings getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const = 0;
struct MutationInfo
{
String id;
String entry;
};
/// Adds information about mutations of a replicated table.
virtual void addReplicatedMutations(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector<MutationInfo> & mutations) = 0;
/// Returns all mutations of a replicated table which are not finished for some data parts added by addReplicatedPartNames().
virtual std::vector<MutationInfo> getReplicatedMutations(const String & table_shared_id, const String & replica_name) const = 0;
/// Adds a data path in backup for a replicated table.
/// Multiple replicas of the replicated table call this function and then all the added paths can be returned by call of the function
/// getReplicatedDataPaths().
@ -45,12 +63,8 @@ public:
virtual Strings getReplicatedDataPaths(const String & table_shared_id) const = 0;
/// Adds a path to access.txt file keeping access entities of a ReplicatedAccessStorage.
virtual void addReplicatedAccessPath(const String & access_zk_path, const String & file_path) = 0;
virtual Strings getReplicatedAccessPaths(const String & access_zk_path) const = 0;
/// Sets the host id of a host storing access entities of a ReplicatedAccessStorage to backup.
virtual void setReplicatedAccessHost(const String & access_zk_path, const String & host) = 0;
virtual String getReplicatedAccessHost(const String & access_zk_path) const = 0;
virtual void addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id, const String & file_path) = 0;
virtual Strings getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const = 0;
struct FileInfo
{

View File

@ -1,37 +0,0 @@
#include <Backups/IBackupEntriesBatch.h>
#include <IO/SeekableReadBuffer.h>
namespace DB
{
/// Adapter exposing a single element of an IBackupEntriesBatch as a standalone IBackupEntry.
/// Every accessor is forwarded to the owning batch together with the stored element index.
class IBackupEntriesBatch::BackupEntryFromBatch : public IBackupEntry
{
public:
    BackupEntryFromBatch(const std::shared_ptr<IBackupEntriesBatch> & owner_, size_t entry_index_)
        : owner(owner_), entry_index(entry_index_)
    {
        assert(owner); /// An adapter is never created without a batch.
    }

    /// Forwarded accessors: the batch owns the actual data.
    UInt64 getSize() const override { return owner->getSize(entry_index); }
    std::optional<UInt128> getChecksum() const override { return owner->getChecksum(entry_index); }
    std::unique_ptr<SeekableReadBuffer> getReadBuffer() const override { return owner->getReadBuffer(entry_index); }

private:
    const std::shared_ptr<IBackupEntriesBatch> owner;
    const size_t entry_index;
};
BackupEntries IBackupEntriesBatch::getBackupEntries()
{
    /// Build one adapter entry per stored name; each adapter refers back to this batch by index.
    const size_t count = entry_names.size();
    BackupEntries result;
    result.reserve(count);
    for (size_t pos = 0; pos < count; ++pos)
        result.emplace_back(entry_names[pos], std::make_unique<BackupEntryFromBatch>(shared_from_this(), pos));
    return result;
}
}

View File

@ -1,29 +0,0 @@
#pragma once
#include <Backups/IBackupEntry.h>
#include <mutex>
namespace DB
{
/// Helper class designed to generate multiple backup entries from one source.
/// Derived classes implement per-index accessors for a fixed list of entry names
/// given at construction time; getBackupEntries() wraps each index into an IBackupEntry.
class IBackupEntriesBatch : public std::enable_shared_from_this<IBackupEntriesBatch>
{
public:
/// Returns one IBackupEntry per name passed to the constructor; each entry forwards to this batch.
BackupEntries getBackupEntries();
virtual ~IBackupEntriesBatch() = default;
protected:
/// Protected: instances are created only through derived classes.
IBackupEntriesBatch(const Strings & entry_names_) : entry_names(entry_names_) {}
/// Per-entry data accessors implemented by derived classes; `index` refers into `entry_names`.
virtual std::unique_ptr<SeekableReadBuffer> getReadBuffer(size_t index) = 0;
virtual UInt64 getSize(size_t index) = 0;
/// By default an entry has no checksum (empty optional).
virtual std::optional<UInt128> getChecksum(size_t) { return {}; }
private:
class BackupEntryFromBatch;
const Strings entry_names;
};
}

View File

@ -0,0 +1,77 @@
#include <Backups/IBackupEntriesLazyBatch.h>
#include <Common/Exception.h>
#include <IO/SeekableReadBuffer.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// Proxy entry for one element of the lazy batch. The real IBackupEntry is obtained
/// from the batch on first access (triggering generate() if it hasn't run yet) and cached.
class IBackupEntriesLazyBatch::BackupEntryFromBatch : public IBackupEntry
{
public:
BackupEntryFromBatch(const std::shared_ptr<IBackupEntriesLazyBatch> & batch_, size_t index_) : batch(batch_), index(index_) { }
/// All accessors delegate to the lazily resolved underlying entry.
UInt64 getSize() const override { return getInternalBackupEntry()->getSize(); }
std::optional<UInt128> getChecksum() const override { return getInternalBackupEntry()->getChecksum(); }
std::unique_ptr<SeekableReadBuffer> getReadBuffer() const override { return getInternalBackupEntry()->getReadBuffer(); }
private:
/// Resolves (and caches) the underlying entry. `mutex` guards the cache; the nested call
/// batch->generateIfNecessary() takes the batch's own mutex (lock order: entry mutex first).
BackupEntryPtr getInternalBackupEntry() const
{
std::lock_guard lock{mutex};
if (!entry)
{
batch->generateIfNecessary();
entry = batch->entries[index].second;
}
return entry;
}
const std::shared_ptr<IBackupEntriesLazyBatch> batch;
const size_t index;
mutable std::mutex mutex; /// Guards `entry`.
mutable BackupEntryPtr entry; /// Cached underlying entry, filled on first access.
};
BackupEntries IBackupEntriesLazyBatch::getBackupEntries()
{
    /// Wrap every element of this batch into a proxy entry; generate() runs only
    /// when one of the proxies is first accessed.
    const size_t total = getSize();
    BackupEntries proxies;
    proxies.reserve(total);
    for (size_t pos = 0; pos < total; ++pos)
        proxies.emplace_back(getName(pos), std::make_unique<BackupEntryFromBatch>(shared_from_this(), pos));
    return proxies;
}
/// Runs generate() at most once (thread-safely, under `mutex`) and validates the result:
/// the generated entries must match getSize() and each getName(i) exactly.
void IBackupEntriesLazyBatch::generateIfNecessary()
{
std::lock_guard lock{mutex};
if (generated)
return;
entries = generate();
/// A mismatch between the declared and the generated entries is a bug in the derived class.
if (entries.size() != getSize())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup entries were generated incorrectly");
for (size_t i = 0; i != entries.size(); ++i)
{
if (entries[i].first != getName(i))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup entries were generated incorrectly");
}
/// Set only after successful validation, so a failed generation is retried on the next call.
generated = true;
}
IBackupEntriesLazyBatch::~IBackupEntriesLazyBatch() = default;
}

View File

@ -0,0 +1,30 @@
#pragma once
#include <Backups/IBackupEntry.h>
#include <mutex>
namespace DB
{
/// Helper class designed to generate multiple backup entries from one source.
/// The (potentially expensive) generate() call is postponed until an entry is first read.
class IBackupEntriesLazyBatch : public std::enable_shared_from_this<IBackupEntriesLazyBatch>
{
public:
/// Returns proxy entries; the real entries are produced by generate() on first access.
BackupEntries getBackupEntries();
virtual ~IBackupEntriesLazyBatch();
protected:
/// The number of entries and their names must be known up front, without generating.
virtual size_t getSize() const = 0;
virtual const String & getName(size_t i) const = 0;
/// Produces the actual entries; must return exactly getSize() entries named getName(i).
virtual BackupEntries generate() = 0;
private:
/// Calls generate() once under `mutex` and validates its result.
void generateIfNecessary();
class BackupEntryFromBatch;
std::mutex mutex;
BackupEntries entries;
bool generated = false;
};
}

View File

@ -5,18 +5,24 @@
namespace DB
{
using DatabaseAndTableName = std::pair<String, String>;
class Exception;
/// Keeps information about files contained in a backup.
/// Replicas use this class to coordinate what they're reading from a backup while executing RESTORE ON CLUSTER.
/// There are two implementations of this interface: RestoreCoordinationLocal and RestoreCoordinationRemote.
/// RestoreCoordinationLocal is used while executing RESTORE without ON CLUSTER and performs coordination in memory.
/// RestoreCoordinationRemote is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
class IRestoreCoordination
{
public:
virtual ~IRestoreCoordination() = default;
/// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts.
/// Sets the current status and waits for other hosts to come to this status too.
virtual void setStatus(const String & current_host, const String & new_status, const String & message) = 0;
virtual Strings setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & other_hosts) = 0;
virtual Strings setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & other_hosts, UInt64 timeout_ms) = 0;
virtual void setErrorStatus(const String & current_host, const Exception & exception) = 0;
virtual Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) = 0;
virtual Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) = 0;
static constexpr const char * kErrorStatus = "error";
/// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0;

View File

@ -11,12 +11,16 @@ void RestoreCoordinationLocal::setStatus(const String &, const String &, const S
{
}
Strings RestoreCoordinationLocal::setStatusAndWait(const String &, const String &, const String &, const Strings &)
void RestoreCoordinationLocal::setErrorStatus(const String &, const Exception &)
{
}
Strings RestoreCoordinationLocal::waitStatus(const Strings &, const String &)
{
return {};
}
Strings RestoreCoordinationLocal::setStatusAndWaitFor(const String &, const String &, const String &, const Strings &, UInt64)
Strings RestoreCoordinationLocal::waitStatusFor(const Strings &, const String &, UInt64)
{
return {};
}

View File

@ -11,6 +11,7 @@ namespace Poco { class Logger; }
namespace DB
{
/// Implementation of the IRestoreCoordination interface performing coordination in memory.
class RestoreCoordinationLocal : public IRestoreCoordination
{
public:
@ -19,8 +20,9 @@ public:
/// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts.
void setStatus(const String & current_host, const String & new_status, const String & message) override;
Strings setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts) override;
Strings setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms) override;
void setErrorStatus(const String & current_host, const Exception & exception) override;
Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
/// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;

View File

@ -1,4 +1,4 @@
#include <Backups/RestoreCoordinationDistributed.h>
#include <Backups/RestoreCoordinationRemote.h>
#include <Common/ZooKeeper/KeeperException.h>
#include <Common/escapeForFileName.h>
@ -6,7 +6,7 @@
namespace DB
{
RestoreCoordinationDistributed::RestoreCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_)
RestoreCoordinationRemote::RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_)
: zookeeper_path(zookeeper_path_)
, get_zookeeper(get_zookeeper_)
, status_sync(zookeeper_path_ + "/status", get_zookeeper_, &Poco::Logger::get("RestoreCoordination"))
@ -14,9 +14,9 @@ RestoreCoordinationDistributed::RestoreCoordinationDistributed(const String & zo
createRootNodes();
}
RestoreCoordinationDistributed::~RestoreCoordinationDistributed() = default;
RestoreCoordinationRemote::~RestoreCoordinationRemote() = default;
void RestoreCoordinationDistributed::createRootNodes()
void RestoreCoordinationRemote::createRootNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->createAncestors(zookeeper_path);
@ -26,22 +26,29 @@ void RestoreCoordinationDistributed::createRootNodes()
zookeeper->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", "");
}
void RestoreCoordinationDistributed::setStatus(const String & current_host, const String & new_status, const String & message)
void RestoreCoordinationRemote::setStatus(const String & current_host, const String & new_status, const String & message)
{
status_sync.set(current_host, new_status, message);
}
Strings RestoreCoordinationDistributed::setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts)
void RestoreCoordinationRemote::setErrorStatus(const String & current_host, const Exception & exception)
{
return status_sync.setAndWait(current_host, new_status, message, all_hosts);
status_sync.setError(current_host, exception);
}
Strings RestoreCoordinationDistributed::setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms)
Strings RestoreCoordinationRemote::waitStatus(const Strings & all_hosts, const String & status_to_wait)
{
return status_sync.setAndWaitFor(current_host, new_status, message, all_hosts, timeout_ms);
return status_sync.wait(all_hosts, status_to_wait);
}
bool RestoreCoordinationDistributed::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
Strings RestoreCoordinationRemote::waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms)
{
return status_sync.waitFor(all_hosts, status_to_wait, timeout_ms);
}
bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
{
auto zookeeper = get_zookeeper();
@ -56,7 +63,7 @@ bool RestoreCoordinationDistributed::acquireCreatingTableInReplicatedDatabase(co
return (code == Coordination::Error::ZOK);
}
bool RestoreCoordinationDistributed::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
{
auto zookeeper = get_zookeeper();
@ -68,7 +75,7 @@ bool RestoreCoordinationDistributed::acquireInsertingDataIntoReplicatedTable(con
return (code == Coordination::Error::ZOK);
}
bool RestoreCoordinationDistributed::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
{
auto zookeeper = get_zookeeper();
@ -80,13 +87,13 @@ bool RestoreCoordinationDistributed::acquireReplicatedAccessStorage(const String
return (code == Coordination::Error::ZOK);
}
void RestoreCoordinationDistributed::removeAllNodes()
void RestoreCoordinationRemote::removeAllNodes()
{
auto zookeeper = get_zookeeper();
zookeeper->removeRecursive(zookeeper_path);
}
void RestoreCoordinationDistributed::drop()
void RestoreCoordinationRemote::drop()
{
removeAllNodes();
}

View File

@ -1,23 +1,24 @@
#pragma once
#include <Backups/IRestoreCoordination.h>
#include <Backups/BackupCoordinationHelpers.h>
#include <Backups/BackupCoordinationStatusSync.h>
namespace DB
{
/// Stores restore temporary information in Zookeeper, used to perform RESTORE ON CLUSTER.
class RestoreCoordinationDistributed : public IRestoreCoordination
/// Implementation of the IRestoreCoordination interface performing coordination via ZooKeeper. It's necessary for "RESTORE ON CLUSTER".
class RestoreCoordinationRemote : public IRestoreCoordination
{
public:
RestoreCoordinationDistributed(const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper);
~RestoreCoordinationDistributed() override;
RestoreCoordinationRemote(const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper);
~RestoreCoordinationRemote() override;
/// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts.
void setStatus(const String & current_host, const String & new_status, const String & message) override;
Strings setStatusAndWait(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts) override;
Strings setStatusAndWaitFor(const String & current_host, const String & new_status, const String & message, const Strings & all_hosts, UInt64 timeout_ms) override;
void setErrorStatus(const String & current_host, const Exception & exception) override;
Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override;
Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override;
/// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;

View File

@ -1,6 +1,5 @@
#include <Backups/RestorerFromBackup.h>
#include <Backups/IRestoreCoordination.h>
#include <Backups/BackupCoordinationHelpers.h>
#include <Backups/BackupSettings.h>
#include <Backups/IBackup.h>
#include <Backups/IBackupEntry.h>
@ -53,9 +52,6 @@ namespace
/// Inserting restored data to tables.
constexpr const char * kInsertingDataToTablesStatus = "inserting data to tables";
/// Error status.
constexpr const char * kErrorStatus = BackupCoordinationStatusSync::kErrorStatus;
/// Uppercases the first character of a passed string.
String toUpperFirst(const String & str)
{
@ -115,73 +111,52 @@ RestorerFromBackup::~RestorerFromBackup() = default;
RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode)
{
try
{
/// run() can be called only once.
if (!current_status.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");
/// run() can be called only once.
if (!current_status.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");
/// Find other hosts working along with us to execute this ON CLUSTER query.
all_hosts = BackupSettings::Util::filterHostIDs(
restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
/// Find other hosts working along with us to execute this ON CLUSTER query.
all_hosts = BackupSettings::Util::filterHostIDs(
restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
/// Do renaming in the create queries according to the renaming config.
renaming_map = makeRenamingMapFromBackupQuery(restore_query_elements);
/// Do renaming in the create queries according to the renaming config.
renaming_map = makeRenamingMapFromBackupQuery(restore_query_elements);
/// Calculate the root path in the backup for restoring, it's either empty or has the format "shards/<shard_num>/replicas/<replica_num>/".
findRootPathsInBackup();
/// Calculate the root path in the backup for restoring, it's either empty or has the format "shards/<shard_num>/replicas/<replica_num>/".
findRootPathsInBackup();
/// Find all the databases and tables which we will read from the backup.
setStatus(kFindingTablesInBackupStatus);
findDatabasesAndTablesInBackup();
/// Find all the databases and tables which we will read from the backup.
setStatus(kFindingTablesInBackupStatus);
findDatabasesAndTablesInBackup();
/// Check access rights.
checkAccessForObjectsFoundInBackup();
/// Check access rights.
checkAccessForObjectsFoundInBackup();
if (mode == Mode::CHECK_ACCESS_ONLY)
return {};
if (mode == Mode::CHECK_ACCESS_ONLY)
return {};
/// Create databases using the create queries read from the backup.
setStatus(kCreatingDatabasesStatus);
createDatabases();
/// Create databases using the create queries read from the backup.
setStatus(kCreatingDatabasesStatus);
createDatabases();
/// Create tables using the create queries read from the backup.
setStatus(kCreatingTablesStatus);
createTables();
/// Create tables using the create queries read from the backup.
setStatus(kCreatingTablesStatus);
createTables();
/// All what's left is to insert data to tables.
/// No more data restoring tasks are allowed after this point.
setStatus(kInsertingDataToTablesStatus);
return getDataRestoreTasks();
}
catch (...)
{
try
{
/// Other hosts should know that we've encountered an error.
setStatus(kErrorStatus, getCurrentExceptionMessage(false));
}
catch (...)
{
}
throw;
}
/// All what's left is to insert data to tables.
/// No more data restoring tasks are allowed after this point.
setStatus(kInsertingDataToTablesStatus);
return getDataRestoreTasks();
}
void RestorerFromBackup::setStatus(const String & new_status, const String & message)
{
if (new_status == kErrorStatus)
LOG_TRACE(log, "{}", toUpperFirst(new_status));
current_status = new_status;
if (restore_coordination)
{
LOG_ERROR(log, "{} failed with {}", toUpperFirst(current_status), message);
if (restore_coordination)
restore_coordination->setStatus(restore_settings.host_id, new_status, message);
}
else
{
LOG_TRACE(log, "{}", toUpperFirst(new_status));
current_status = new_status;
if (restore_coordination)
restore_coordination->setStatusAndWait(restore_settings.host_id, new_status, message, all_hosts);
restore_coordination->setStatus(restore_settings.host_id, new_status, message);
restore_coordination->waitStatus(all_hosts, new_status);
}
}
@ -381,11 +356,23 @@ void RestorerFromBackup::findTableInBackup(const QualifiedTableName & table_name
insertAtEnd(*res_table_info.partitions, *partitions);
}
/// Special handling for ACL-related system tables.
if (!restore_settings.structure_only && isSystemAccessTableName(table_name))
{
if (!access_restorer)
access_restorer = std::make_unique<AccessRestorerFromBackup>(backup, restore_settings);
access_restorer->addDataPath(data_path_in_backup, table_name);
try
{
/// addDataPath() will parse access*.txt files and extract access entities from them.
/// We need to do that early because we need those access entities to check access.
access_restorer->addDataPath(data_path_in_backup);
}
catch (Exception & e)
{
e.addMessage("While parsing data of {} from backup", tableNameWithTypeToString(table_name.database, table_name.table, false));
throw;
}
}
}
@ -563,33 +550,57 @@ void RestorerFromBackup::checkAccessForObjectsFoundInBackup() const
required_access = AccessRights{required_access}.getElements();
context->checkAccess(required_access);
}
void RestorerFromBackup::createDatabases()
{
for (const auto & [database_name, database_info] : database_infos)
for (const auto & database_name : database_infos | boost::adaptors::map_keys)
{
bool need_create_database = (restore_settings.create_database != RestoreDatabaseCreationMode::kMustExist);
if (database_info.is_predefined_database)
need_create_database = false; /// Predefined databases always exist.
createDatabase(database_name);
checkDatabase(database_name);
}
}
if (need_create_database)
{
/// Execute CREATE DATABASE query.
auto create_database_query = database_info.create_database_query;
if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists)
{
create_database_query = create_database_query->clone();
create_database_query->as<ASTCreateQuery &>().if_not_exists = true;
}
LOG_TRACE(log, "Creating database {}: {}", backQuoteIfNeed(database_name), serializeAST(*create_database_query));
InterpreterCreateQuery interpreter{create_database_query, context};
interpreter.setInternal(true);
interpreter.execute();
}
/// Creates one database restored from the backup, unless it is predefined or the
/// settings say databases must already exist. Errors are annotated with the database name.
void RestorerFromBackup::createDatabase(const String & database_name) const
{
    if (restore_settings.create_database == RestoreDatabaseCreationMode::kMustExist)
        return;

    /// Predefined databases always exist.
    const auto & database_info = database_infos.at(database_name);
    if (database_info.is_predefined_database)
        return;

    auto create_database_query = database_info.create_database_query;
    /// The IF NOT EXISTS flag for CREATE DATABASE must be driven by the `create_database`
    /// setting; the original code checked the unrelated `create_table` setting (copy-paste bug).
    if (restore_settings.create_database == RestoreDatabaseCreationMode::kCreateIfNotExists)
    {
        create_database_query = create_database_query->clone();
        create_database_query->as<ASTCreateQuery &>().if_not_exists = true;
    }
    LOG_TRACE(log, "Creating database {}: {}", backQuoteIfNeed(database_name), serializeAST(*create_database_query));
    try
    {
        /// Execute CREATE DATABASE query.
        InterpreterCreateQuery interpreter{create_database_query, context};
        interpreter.setInternal(true);
        interpreter.execute();
    }
    catch (Exception & e)
    {
        e.addMessage("While creating database {}", backQuoteIfNeed(database_name));
        throw;
    }
}
void RestorerFromBackup::checkDatabase(const String & database_name)
{
auto & database_info = database_infos.at(database_name);
try
{
DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name);
database_info.database = database;
if (!restore_settings.allow_different_database_def && !database_info.is_predefined_database)
{
@ -601,14 +612,18 @@ void RestorerFromBackup::createDatabases()
{
throw Exception(
ErrorCodes::CANNOT_RESTORE_DATABASE,
"The database {} has a different definition: {} "
"The database has a different definition: {} "
"comparing to its definition in the backup: {}",
backQuoteIfNeed(database_name),
serializeAST(*create_database_query),
serializeAST(*expected_create_query));
}
}
}
catch (Exception & e)
{
e.addMessage("While checking database {}", backQuoteIfNeed(database_name));
throw;
}
}
void RestorerFromBackup::createTables()
@ -622,82 +637,123 @@ void RestorerFromBackup::createTables()
for (const auto & table_name : tables_to_create)
{
auto & table_info = table_infos.at(table_name);
createTable(table_name);
checkTable(table_name);
insertDataToTable(table_name);
}
}
}
DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_name.database);
void RestorerFromBackup::createTable(const QualifiedTableName & table_name)
{
if (restore_settings.create_table == RestoreTableCreationMode::kMustExist)
return;
bool need_create_table = (restore_settings.create_table != RestoreTableCreationMode::kMustExist);
if (table_info.is_predefined_table)
need_create_table = false; /// Predefined tables always exist.
/// Predefined tables always exist.
auto & table_info = table_infos.at(table_name);
if (table_info.is_predefined_table)
return;
if (need_create_table)
auto create_table_query = table_info.create_table_query;
if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists)
{
create_table_query = create_table_query->clone();
create_table_query->as<ASTCreateQuery &>().if_not_exists = true;
}
LOG_TRACE(
log, "Creating {}: {}", tableNameWithTypeToString(table_name.database, table_name.table, false), serializeAST(*create_table_query));
try
{
DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_name.database);
table_info.database = database;
/// Execute CREATE TABLE query (we call IDatabase::createTableRestoredFromBackup() to allow the database to do some
/// database-specific things).
database->createTableRestoredFromBackup(
create_table_query,
context,
restore_coordination,
std::chrono::duration_cast<std::chrono::milliseconds>(create_table_timeout).count());
}
catch (Exception & e)
{
e.addMessage("While creating {}", tableNameWithTypeToString(table_name.database, table_name.table, false));
throw;
}
}
void RestorerFromBackup::checkTable(const QualifiedTableName & table_name)
{
auto & table_info = table_infos.at(table_name);
auto database = table_info.database;
try
{
if (!database)
{
database = DatabaseCatalog::instance().getDatabase(table_name.database);
table_info.database = database;
}
auto resolved_id = (table_name.database == DatabaseCatalog::TEMPORARY_DATABASE)
? context->resolveStorageID(StorageID{"", table_name.table}, Context::ResolveExternal)
: context->resolveStorageID(StorageID{table_name.database, table_name.table}, Context::ResolveGlobal);
StoragePtr storage = database->getTable(resolved_id.table_name, context);
table_info.storage = storage;
table_info.table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
if (!restore_settings.allow_different_table_def && !table_info.is_predefined_table)
{
ASTPtr create_table_query = database->getCreateTableQuery(resolved_id.table_name, context);
adjustCreateQueryForBackup(create_table_query, context->getGlobalContext(), nullptr);
ASTPtr expected_create_query = table_info.create_table_query;
if (serializeAST(*create_table_query) != serializeAST(*expected_create_query))
{
auto create_table_query = table_info.create_table_query;
if (restore_settings.create_table == RestoreTableCreationMode::kCreateIfNotExists)
{
create_table_query = create_table_query->clone();
create_table_query->as<ASTCreateQuery &>().if_not_exists = true;
}
LOG_TRACE(
log,
"Creating {}: {}",
tableNameWithTypeToString(table_name.database, table_name.table, false),
serializeAST(*create_table_query));
/// Execute CREATE TABLE query (we call IDatabase::createTableRestoredFromBackup() to allow the database to do some
/// database-specific things).
database->createTableRestoredFromBackup(
create_table_query,
context,
restore_coordination,
std::chrono::duration_cast<std::chrono::milliseconds>(create_table_timeout).count());
}
table_info.created = true;
auto resolved_id = (table_name.database == DatabaseCatalog::TEMPORARY_DATABASE)
? context->resolveStorageID(StorageID{"", table_name.table}, Context::ResolveExternal)
: context->resolveStorageID(StorageID{table_name.database, table_name.table}, Context::ResolveGlobal);
auto storage = database->getTable(resolved_id.table_name, context);
table_info.storage = storage;
table_info.table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout);
if (!restore_settings.allow_different_table_def && !table_info.is_predefined_table)
{
ASTPtr create_table_query = database->getCreateTableQuery(resolved_id.table_name, context);
adjustCreateQueryForBackup(create_table_query, context->getGlobalContext(), nullptr);
ASTPtr expected_create_query = table_info.create_table_query;
if (serializeAST(*create_table_query) != serializeAST(*expected_create_query))
{
throw Exception(
ErrorCodes::CANNOT_RESTORE_TABLE,
"{} has a different definition: {} "
"comparing to its definition in the backup: {}",
tableNameWithTypeToString(table_name.database, table_name.table, true),
serializeAST(*create_table_query),
serializeAST(*expected_create_query));
}
}
if (!restore_settings.structure_only)
{
const auto & data_path_in_backup = table_info.data_path_in_backup;
const auto & partitions = table_info.partitions;
if (partitions && !storage->supportsBackupPartition())
{
throw Exception(
ErrorCodes::CANNOT_RESTORE_TABLE,
"Table engine {} doesn't support partitions, cannot restore {}",
storage->getName(),
tableNameWithTypeToString(table_name.database, table_name.table, false));
}
storage->restoreDataFromBackup(*this, data_path_in_backup, partitions);
throw Exception(
ErrorCodes::CANNOT_RESTORE_TABLE,
"The table has a different definition: {} "
"comparing to its definition in the backup: {}",
serializeAST(*create_table_query),
serializeAST(*expected_create_query));
}
}
}
catch (Exception & e)
{
e.addMessage("While checking {}", tableNameWithTypeToString(table_name.database, table_name.table, false));
throw;
}
}
/// Restores the data of a single table from the backup.
/// Does nothing when the restore is structure-only (restore_settings.structure_only).
/// Precondition: `table_name` is present in `table_infos` and its `storage` has
/// already been resolved by an earlier step — TODO confirm against the caller.
void RestorerFromBackup::insertDataToTable(const QualifiedTableName & table_name)
{
if (restore_settings.structure_only)
return;
auto & table_info = table_infos.at(table_name);
auto storage = table_info.storage;
try
{
const auto & data_path_in_backup = table_info.data_path_in_backup;
const auto & partitions = table_info.partitions;
/// A partition list was requested but this engine cannot restore partition-wise.
if (partitions && !storage->supportsBackupPartition())
{
throw Exception(
ErrorCodes::CANNOT_RESTORE_TABLE,
"Table engine {} doesn't support partitions",
storage->getName());
}
storage->restoreDataFromBackup(*this, data_path_in_backup, partitions);
}
catch (Exception & e)
{
/// Attach the table name to the exception for easier diagnostics, then rethrow.
e.addMessage("While restoring data of {}", tableNameWithTypeToString(table_name.database, table_name.table, false));
throw;
}
}
/// Returns the list of tables without dependencies or those which dependencies have been created before.
@ -708,7 +764,7 @@ std::vector<QualifiedTableName> RestorerFromBackup::findTablesWithoutDependencie
for (const auto & [key, table_info] : table_infos)
{
if (table_info.created)
if (table_info.storage)
continue;
/// Found a table which is not created yet.
@ -719,7 +775,7 @@ std::vector<QualifiedTableName> RestorerFromBackup::findTablesWithoutDependencie
for (const auto & dependency : table_info.dependencies)
{
auto it = table_infos.find(dependency);
if ((it != table_infos.end()) && !it->second.created)
if ((it != table_infos.end()) && !it->second.storage)
{
all_dependencies_met = false;
break;
@ -740,7 +796,7 @@ std::vector<QualifiedTableName> RestorerFromBackup::findTablesWithoutDependencie
std::vector<QualifiedTableName> tables_with_cyclic_dependencies;
for (const auto & [key, table_info] : table_infos)
{
if (!table_info.created)
if (!table_info.storage)
tables_with_cyclic_dependencies.push_back(key);
}
@ -759,14 +815,14 @@ std::vector<QualifiedTableName> RestorerFromBackup::findTablesWithoutDependencie
void RestorerFromBackup::addDataRestoreTask(DataRestoreTask && new_task)
{
if (current_status == kInsertingDataToTablesStatus)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding data-restoring tasks is not allowed");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed");
data_restore_tasks.push_back(std::move(new_task));
}
void RestorerFromBackup::addDataRestoreTasks(DataRestoreTasks && new_tasks)
{
if (current_status == kInsertingDataToTablesStatus)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding data-restoring tasks is not allowed");
throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed");
insertAtEnd(data_restore_tasks, std::move(new_tasks));
}

View File

@ -15,10 +15,13 @@ class IBackup;
using BackupPtr = std::shared_ptr<const IBackup>;
class IRestoreCoordination;
struct StorageID;
class IDatabase;
using DatabasePtr = std::shared_ptr<IDatabase>;
class AccessRestorerFromBackup;
struct IAccessEntity;
using AccessEntityPtr = std::shared_ptr<const IAccessEntity>;
/// Restores the definition of databases and tables and prepares tasks to restore the data of the tables.
class RestorerFromBackup : private boost::noncopyable
{
@ -87,7 +90,13 @@ private:
void checkAccessForObjectsFoundInBackup() const;
void createDatabases();
void createDatabase(const String & database_name) const;
void checkDatabase(const String & database_name);
void createTables();
void createTable(const QualifiedTableName & table_name);
void checkTable(const QualifiedTableName & table_name);
void insertDataToTable(const QualifiedTableName & table_name);
DataRestoreTasks getDataRestoreTasks();
@ -97,6 +106,7 @@ private:
{
ASTPtr create_database_query;
bool is_predefined_database = false;
DatabasePtr database;
};
struct TableInfo
@ -107,7 +117,7 @@ private:
bool has_data = false;
std::filesystem::path data_path_in_backup;
std::optional<ASTs> partitions;
bool created = false;
DatabasePtr database;
StoragePtr storage;
TableLockHolder table_lock;
};

View File

@ -553,6 +553,7 @@ if (TARGET ch_contrib::rapidjson)
endif()
dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing)
if (TARGET ch_contrib::annoy AND ENABLE_ANNOY)
dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
endif()

View File

@ -524,17 +524,35 @@ try
const auto & out_file_node = query_with_output->out_file->as<ASTLiteral &>();
out_file = out_file_node.value.safeGet<std::string>();
std::string compression_method;
std::string compression_method_string;
if (query_with_output->compression)
{
const auto & compression_method_node = query_with_output->compression->as<ASTLiteral &>();
compression_method = compression_method_node.value.safeGet<std::string>();
compression_method_string = compression_method_node.value.safeGet<std::string>();
}
CompressionMethod compression_method = chooseCompressionMethod(out_file, compression_method_string);
UInt64 compression_level = 3;
if (query_with_output->compression_level)
{
const auto & compression_level_node = query_with_output->compression_level->as<ASTLiteral &>();
bool res = compression_level_node.value.tryGet<UInt64>(compression_level);
auto range = getCompressionLevelRange(compression_method);
if (!res || compression_level < range.first || compression_level > range.second)
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Invalid compression level, must be positive integer in range {}-{}",
range.first,
range.second);
}
out_file_buf = wrapWriteBufferWithCompressionMethod(
std::make_unique<WriteBufferFromFile>(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT),
chooseCompressionMethod(out_file, compression_method),
/* compression level = */ 3
compression_method,
compression_level
);
// We are writing to file, so default format is the same as in non-interactive mode.

View File

@ -104,7 +104,7 @@ public:
Field operator[](size_t n) const override { return DecimalField(data[n], scale); }
void get(size_t n, Field & res) const override { res = (*this)[n]; }
bool getBool(size_t n) const override { return bool(data[n].value); }
Int64 getInt(size_t n) const override { return Int64(data[n].value) * scale; }
Int64 getInt(size_t n) const override { return Int64(data[n].value); }
UInt64 get64(size_t n) const override;
bool isDefaultAt(size_t n) const override { return data[n].value == 0; }

View File

@ -21,7 +21,7 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
extern const int DUPLICATE_COLUMN;
extern const int NUMBER_OF_DIMENSIONS_MISMATHED;
extern const int NUMBER_OF_DIMENSIONS_MISMATCHED;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
}
@ -298,7 +298,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
value_dim = column_dim;
if (value_dim != column_dim)
throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATHED,
throw Exception(ErrorCodes::NUMBER_OF_DIMENSIONS_MISMATCHED,
"Dimension of types mismatched between inserted value and column. "
"Dimension of value: {}. Dimension of column: {}",
value_dim, column_dim);

View File

@ -4,6 +4,18 @@
#include <Common/CurrentMemoryTracker.h>
#ifdef MEMORY_TRACKER_DEBUG_CHECKS
thread_local bool memory_tracker_always_throw_logical_error_on_allocation = false;
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
}
namespace
{
@ -23,64 +35,66 @@ MemoryTracker * getMemoryTracker()
}
namespace CurrentMemoryTracker
{
using DB::current_thread;
namespace
void CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded)
{
void allocImpl(Int64 size, bool throw_if_memory_exceeded)
#ifdef MEMORY_TRACKER_DEBUG_CHECKS
if (unlikely(memory_tracker_always_throw_logical_error_on_allocation))
{
if (auto * memory_tracker = getMemoryTracker())
{
if (current_thread)
{
current_thread->untracked_memory += size;
memory_tracker_always_throw_logical_error_on_allocation = false;
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Memory tracker: allocations not allowed.");
}
#endif
if (current_thread->untracked_memory > current_thread->untracked_memory_limit)
{
/// Zero untracked before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes
/// more. It could be useful to enlarge Exception message in rethrow logic.
Int64 tmp = current_thread->untracked_memory;
current_thread->untracked_memory = 0;
memory_tracker->allocImpl(tmp, throw_if_memory_exceeded);
}
}
/// total_memory_tracker only, ignore untracked_memory
else
if (auto * memory_tracker = getMemoryTracker())
{
if (current_thread)
{
current_thread->untracked_memory += size;
if (current_thread->untracked_memory > current_thread->untracked_memory_limit)
{
memory_tracker->allocImpl(size, throw_if_memory_exceeded);
/// Zero untracked before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes
/// more. It could be useful to enlarge Exception message in rethrow logic.
Int64 tmp = current_thread->untracked_memory;
current_thread->untracked_memory = 0;
memory_tracker->allocImpl(tmp, throw_if_memory_exceeded);
}
}
/// total_memory_tracker only, ignore untracked_memory
else
{
memory_tracker->allocImpl(size, throw_if_memory_exceeded);
}
}
}
void check()
void CurrentMemoryTracker::check()
{
if (auto * memory_tracker = getMemoryTracker())
memory_tracker->allocImpl(0, true);
}
void alloc(Int64 size)
void CurrentMemoryTracker::alloc(Int64 size)
{
bool throw_if_memory_exceeded = true;
allocImpl(size, throw_if_memory_exceeded);
}
void allocNoThrow(Int64 size)
void CurrentMemoryTracker::allocNoThrow(Int64 size)
{
bool throw_if_memory_exceeded = false;
allocImpl(size, throw_if_memory_exceeded);
}
void realloc(Int64 old_size, Int64 new_size)
void CurrentMemoryTracker::realloc(Int64 old_size, Int64 new_size)
{
Int64 addition = new_size - old_size;
addition > 0 ? alloc(addition) : free(-addition);
}
void free(Int64 size)
void CurrentMemoryTracker::free(Int64 size)
{
if (auto * memory_tracker = getMemoryTracker())
{
@ -101,4 +115,3 @@ void free(Int64 size)
}
}
}

View File

@ -3,11 +3,17 @@
#include <base/types.h>
/// Convenience methods, that use current thread's memory_tracker if it is available.
namespace CurrentMemoryTracker
struct CurrentMemoryTracker
{
void alloc(Int64 size);
void allocNoThrow(Int64 size);
void realloc(Int64 old_size, Int64 new_size);
void free(Int64 size);
void check();
}
/// Call the following functions before calling of corresponding operations with memory allocators.
static void alloc(Int64 size);
static void allocNoThrow(Int64 size);
static void realloc(Int64 old_size, Int64 new_size);
/// This function should be called after memory deallocation.
static void free(Int64 size);
static void check();
private:
static void allocImpl(Int64 size, bool throw_if_memory_exceeded);
};

View File

@ -93,6 +93,8 @@
M(CacheFileSegments, "Number of existing cache file segments") \
M(CacheDetachedFileSegments, "Number of existing detached cache file segments") \
M(S3Requests, "S3 requests") \
M(KeeperAliveConnections, "Number of alive connections") \
M(KeeperOutstandingRequets, "Number of outstanding requests") \
namespace CurrentMetrics
{

View File

@ -613,7 +613,7 @@
M(642, CANNOT_PACK_ARCHIVE) \
M(643, CANNOT_UNPACK_ARCHIVE) \
M(644, REMOTE_FS_OBJECT_CACHE_ERROR) \
M(645, NUMBER_OF_DIMENSIONS_MISMATHED) \
M(645, NUMBER_OF_DIMENSIONS_MISMATCHED) \
M(646, CANNOT_BACKUP_DATABASE) \
M(647, CANNOT_BACKUP_TABLE) \
M(648, WRONG_DDL_RENAMING_SETTINGS) \

View File

@ -7,6 +7,7 @@ namespace DB
namespace ErrorCodes
{
extern const int SYNTAX_ERROR;
extern const int BAD_ARGUMENTS;
}
Int32 IntervalKind::toAvgSeconds() const
@ -15,7 +16,7 @@ Int32 IntervalKind::toAvgSeconds() const
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond: return 0; /// fractional parts of seconds have 0 seconds
case IntervalKind::Millisecond: return 0;
case IntervalKind::Second: return 1;
case IntervalKind::Minute: return 60;
case IntervalKind::Hour: return 3600;
@ -28,6 +29,51 @@ Int32 IntervalKind::toAvgSeconds() const
__builtin_unreachable();
}
/// Returns the exact length of one interval of this kind, in seconds.
/// Returned as Float64 because sub-second kinds yield fractional values.
/// Only fixed-length kinds are supported; Month/Quarter/Year (variable length)
/// fall into `default` and raise BAD_ARGUMENTS.
Float64 IntervalKind::toSeconds() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
return 0.000000001;
case IntervalKind::Microsecond:
return 0.000001;
case IntervalKind::Millisecond:
return 0.001;
case IntervalKind::Second:
return 1;
case IntervalKind::Minute:
return 60;
case IntervalKind::Hour:
return 3600;
case IntervalKind::Day:
return 86400;
case IntervalKind::Week:
return 604800;
default:
/// Month, Quarter and Year do not have a precise number of seconds.
throw Exception("Not possible to get precise number of seconds in non-precise interval", ErrorCodes::BAD_ARGUMENTS);
}
/// Every path above returns or throws; this silences "control reaches end" warnings.
__builtin_unreachable();
}
/// Returns whether this interval kind has a fixed duration in seconds.
/// Nanosecond..Week are fixed-length; Month/Quarter/Year vary with the calendar.
bool IntervalKind::isFixedLength() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond:
case IntervalKind::Second:
case IntervalKind::Minute:
case IntervalKind::Hour:
case IntervalKind::Day:
case IntervalKind::Week: return true;
case IntervalKind::Month:
case IntervalKind::Quarter:
case IntervalKind::Year: return false;
}
/// The switch above covers every enum value; unreachable fallthrough guard.
__builtin_unreachable();
}
IntervalKind IntervalKind::fromAvgSeconds(Int64 num_seconds)
{
if (num_seconds)

View File

@ -33,10 +33,17 @@ struct IntervalKind
/// For `Month`, `Quarter` and `Year` the function returns an average number of seconds.
Int32 toAvgSeconds() const;
/// Returns exact number of seconds in one interval.
/// For `Month`, `Quarter` and `Year` the function raises an error.
Float64 toSeconds() const;
/// Chooses an interval kind based on number of seconds.
/// For example, `IntervalKind::fromAvgSeconds(3600)` returns `IntervalKind::Hour`.
static IntervalKind fromAvgSeconds(Int64 num_seconds);
/// Returns whether IntervalKind has a fixed number of seconds (e.g. Day) or non-fixed(e.g. Month)
bool isFixedLength() const;
/// Returns an uppercased version of what `toString()` returns.
const char * toKeyword() const;

View File

@ -19,10 +19,6 @@
#include <string>
#ifdef MEMORY_TRACKER_DEBUG_CHECKS
thread_local bool memory_tracker_always_throw_logical_error_on_allocation = false;
#endif
namespace
{
@ -95,7 +91,7 @@ MemoryTracker::MemoryTracker(MemoryTracker * parent_, VariableContext level_) :
MemoryTracker::~MemoryTracker()
{
if ((level == VariableContext::Process || level == VariableContext::User) && peak)
if ((level == VariableContext::Process || level == VariableContext::User) && peak && log_peak_memory_usage_in_destructor)
{
try
{
@ -109,8 +105,9 @@ MemoryTracker::~MemoryTracker()
}
void MemoryTracker::logPeakMemoryUsage() const
void MemoryTracker::logPeakMemoryUsage()
{
log_peak_memory_usage_in_destructor = false;
const auto * description = description_ptr.load(std::memory_order_relaxed);
LOG_DEBUG(&Poco::Logger::get("MemoryTracker"),
"Peak memory usage{}: {}.", (description ? " " + std::string(description) : ""), ReadableSize(peak));
@ -169,14 +166,6 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
}
}
#ifdef MEMORY_TRACKER_DEBUG_CHECKS
if (unlikely(memory_tracker_always_throw_logical_error_on_allocation))
{
memory_tracker_always_throw_logical_error_on_allocation = false;
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Memory tracker: allocations not allowed.");
}
#endif
std::bernoulli_distribution fault(fault_probability);
if (unlikely(fault_probability && fault(thread_local_rng)) && memoryTrackerCanThrow(level, true) && throw_if_memory_exceeded)
{
@ -270,16 +259,12 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT
level == VariableContext::Process ? this : query_tracker);
}
void MemoryTracker::alloc(Int64 size)
void MemoryTracker::adjustWithUntrackedMemory(Int64 untracked_memory)
{
bool throw_if_memory_exceeded = true;
allocImpl(size, throw_if_memory_exceeded);
}
void MemoryTracker::allocNoThrow(Int64 size)
{
bool throw_if_memory_exceeded = false;
allocImpl(size, throw_if_memory_exceeded);
if (untracked_memory > 0)
allocImpl(untracked_memory, /*throw_if_memory_exceeded*/ false);
else
free(-untracked_memory);
}
bool MemoryTracker::updatePeak(Int64 will_be, bool log_memory_usage)

View File

@ -78,11 +78,17 @@ private:
std::atomic<OvercommitTracker *> overcommit_tracker = nullptr;
bool log_peak_memory_usage_in_destructor = true;
bool updatePeak(Int64 will_be, bool log_memory_usage);
void logMemoryUsage(Int64 current) const;
void setOrRaiseProfilerLimit(Int64 value);
/// allocImpl(...) and free(...) should not be used directly
friend struct CurrentMemoryTracker;
void allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker = nullptr);
void free(Int64 size);
public:
static constexpr auto USAGE_EVENT_NAME = "MemoryTrackerUsage";
@ -94,26 +100,7 @@ public:
VariableContext level;
/** Call the following functions before calling of corresponding operations with memory allocators.
*/
void alloc(Int64 size);
void allocNoThrow(Int64 size);
void allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryTracker * query_tracker = nullptr);
void realloc(Int64 old_size, Int64 new_size)
{
Int64 addition = new_size - old_size;
if (addition > 0)
alloc(addition);
else
free(-addition);
}
/** This function should be called after memory deallocation.
*/
void free(Int64 size);
void adjustWithUntrackedMemory(Int64 untracked_memory);
Int64 get() const
{
@ -216,7 +203,7 @@ public:
void set(Int64 to);
/// Prints info about peak memory consumption into log.
void logPeakMemoryUsage() const;
void logPeakMemoryUsage();
};
extern MemoryTracker total_memory_tracker;

View File

@ -25,21 +25,6 @@ OvercommitTracker::OvercommitTracker(std::mutex & global_mutex_)
, allow_release(true)
{}
#define LOG_DEBUG_SAFE(...) \
do { \
OvercommitTrackerBlockerInThread blocker; \
try \
{ \
ALLOW_ALLOCATIONS_IN_SCOPE; \
LOG_DEBUG(__VA_ARGS__); \
} \
catch (...) \
{ \
if (fprintf(stderr, "Allocation failed during writing to log in OvercommitTracker\n") != -1) \
; \
} \
} while (false)
OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int64 amount)
{
DENY_ALLOCATIONS_IN_SCOPE;
@ -95,7 +80,6 @@ OvercommitResult OvercommitTracker::needToStopQuery(MemoryTracker * tracker, Int
});
auto wait_end_time = std::chrono::system_clock::now();
ProfileEvents::increment(ProfileEvents::MemoryOvercommitWaitTimeMicroseconds, (wait_end_time - wait_start_time) / 1us);
LOG_DEBUG_SAFE(getLogger(), "Memory was{} freed within timeout", (timeout ? " not" : ""));
required_memory -= amount;
bool still_need = !(id < id_to_release); // True if thread wasn't released
@ -140,8 +124,6 @@ void OvercommitTracker::onQueryStop(MemoryTracker * tracker)
std::lock_guard lk(overcommit_m);
if (picked_tracker == tracker)
{
LOG_DEBUG_SAFE(getLogger(), "Picked query stopped");
reset();
cv.notify_all();
}
@ -167,7 +149,6 @@ void UserOvercommitTracker::pickQueryToExcludeImpl()
// At this moment query list must be read only.
// This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery.
auto & queries = user_process_list->queries;
LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", queries.size());
for (auto const & query : queries)
{
if (query.second->isKilled())
@ -178,15 +159,12 @@ void UserOvercommitTracker::pickQueryToExcludeImpl()
continue;
auto ratio = memory_tracker->getOvercommitRatio();
LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit);
if (ratio.soft_limit != 0 && current_ratio < ratio)
{
query_tracker = memory_tracker;
current_ratio = ratio;
}
}
LOG_DEBUG_SAFE(logger, "Selected to stop query with overcommit ratio {}/{}",
current_ratio.committed, current_ratio.soft_limit);
picked_tracker = query_tracker;
}
@ -201,7 +179,6 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl()
OvercommitRatio current_ratio{0, 0};
// At this moment query list must be read only.
// This is guaranteed by locking global_mutex in OvercommitTracker::needToStopQuery.
LOG_DEBUG_SAFE(logger, "Trying to choose query to stop from {} queries", process_list->size());
for (auto const & query : process_list->processes)
{
if (query.isKilled())
@ -217,15 +194,12 @@ void GlobalOvercommitTracker::pickQueryToExcludeImpl()
if (!memory_tracker)
continue;
auto ratio = memory_tracker->getOvercommitRatio(user_soft_limit);
LOG_DEBUG_SAFE(logger, "Query has ratio {}/{}", ratio.committed, ratio.soft_limit);
if (current_ratio < ratio)
{
query_tracker = memory_tracker;
current_ratio = ratio;
}
}
LOG_DEBUG_SAFE(logger, "Selected to stop query with overcommit ratio {}/{}",
current_ratio.committed, current_ratio.soft_limit);
picked_tracker = query_tracker;
}

View File

@ -86,8 +86,6 @@ protected:
// overcommit tracker is in SELECTED state.
MemoryTracker * picked_tracker;
virtual Poco::Logger * getLogger() = 0;
private:
void pickQueryToExclude()
@ -145,10 +143,8 @@ struct UserOvercommitTracker : OvercommitTracker
protected:
void pickQueryToExcludeImpl() override;
Poco::Logger * getLogger() override final { return logger; }
private:
DB::ProcessListForUser * user_process_list;
Poco::Logger * logger = &Poco::Logger::get("UserOvercommitTracker");
};
struct GlobalOvercommitTracker : OvercommitTracker
@ -160,10 +156,8 @@ struct GlobalOvercommitTracker : OvercommitTracker
protected:
void pickQueryToExcludeImpl() override;
Poco::Logger * getLogger() override final { return logger; }
private:
DB::ProcessList * process_list;
Poco::Logger * logger = &Poco::Logger::get("GlobalOvercommitTracker");
};
// This class is used to disallow tracking during logging to avoid deadlocks.

View File

@ -344,7 +344,20 @@
\
M(ScalarSubqueriesGlobalCacheHit, "Number of times a read from a scalar subquery was done using the global cache") \
M(ScalarSubqueriesLocalCacheHit, "Number of times a read from a scalar subquery was done using the local cache") \
M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely")
M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely") \
M(KeeperPacketsSent, "Packets sent by keeper server") \
M(KeeperPacketsReceived, "Packets received by keeper server") \
M(KeeperRequestTotal, "Total requests number on keeper server") \
M(KeeperLatency, "Keeper latency") \
M(KeeperCommits, "Number of successful commits") \
M(KeeperCommitsFailed, "Number of failed commits") \
M(KeeperSnapshotCreations, "Number of snapshots creations")\
M(KeeperSnapshotCreationsFailed, "Number of failed snapshot creations")\
M(KeeperSnapshotApplys, "Number of snapshot applying")\
M(KeeperSnapshotApplysFailed, "Number of failed snapshot applying")\
M(KeeperReadSnapshot, "Number of snapshot read(serialization)")\
M(KeeperSaveSnapshot, "Number of snapshot save")\
namespace ProfileEvents
{

View File

@ -148,10 +148,7 @@ ThreadStatus::ThreadStatus()
ThreadStatus::~ThreadStatus()
{
if (untracked_memory > 0)
memory_tracker.allocNoThrow(untracked_memory);
else
memory_tracker.free(-untracked_memory);
memory_tracker.adjustWithUntrackedMemory(untracked_memory);
if (thread_group)
{

View File

@ -24,9 +24,7 @@ public:
static void check(Coordination::Error code, const Coordination::Requests & requests, const Coordination::Responses & responses);
KeeperMultiException(Coordination::Error code, const Coordination::Requests & requests, const Coordination::Responses & responses);
private:
static size_t getFailedOpIndex(Coordination::Error code, const Coordination::Responses & responses);
};
size_t getFailedOpIndex(Coordination::Error code, const Coordination::Responses & responses);
}

View File

@ -885,7 +885,7 @@ void ZooKeeper::waitForEphemeralToDisappearIfAny(const std::string & path)
if (!tryGet(path, content, nullptr, eph_node_disappeared))
return;
int32_t timeout_ms = 2 * session_timeout_ms;
int32_t timeout_ms = 3 * session_timeout_ms;
if (!eph_node_disappeared->tryWait(timeout_ms))
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR,
"Ephemeral node {} still exists after {}s, probably it's owned by someone else. "
@ -1227,7 +1227,7 @@ void ZooKeeper::setZooKeeperLog(std::shared_ptr<DB::ZooKeeperLog> zk_log_)
}
size_t KeeperMultiException::getFailedOpIndex(Coordination::Error exception_code, const Coordination::Responses & responses)
size_t getFailedOpIndex(Coordination::Error exception_code, const Coordination::Responses & responses)
{
if (responses.empty())
throw DB::Exception("Responses for multi transaction is empty", DB::ErrorCodes::LOGICAL_ERROR);

View File

@ -134,6 +134,7 @@ struct ZooKeeperWatchResponse final : WatchResponse, ZooKeeperResponse
OpNum getOpNum() const override
{
chassert(false);
throw Exception("OpNum for watch response doesn't exist", Error::ZRUNTIMEINCONSISTENCY);
}

View File

@ -1,5 +1,14 @@
#include <atomic>
#include <Coordination/KeeperConnectionStats.h>
#include <Common/ProfileEvents.h>
namespace ProfileEvents
{
extern const Event KeeperPacketsSent;
extern const Event KeeperPacketsReceived;
extern const Event KeeperRequestTotal;
extern const Event KeeperLatency;
}
namespace DB
{
@ -40,18 +49,22 @@ uint64_t KeeperConnectionStats::getPacketsSent() const
/// Bumps the per-connection received-packet counter (relaxed: it is a plain
/// statistic, no ordering with other memory operations is required) and mirrors
/// the increment into the global KeeperPacketsReceived profile event.
void KeeperConnectionStats::incrementPacketsReceived()
{
packets_received.fetch_add(1, std::memory_order_relaxed);
ProfileEvents::increment(ProfileEvents::KeeperPacketsReceived, 1);
}
/// Bumps the per-connection sent-packet counter (relaxed ordering suffices for
/// a statistic) and mirrors the increment into the global KeeperPacketsSent
/// profile event.
void KeeperConnectionStats::incrementPacketsSent()
{
packets_sent.fetch_add(1, std::memory_order_relaxed);
ProfileEvents::increment(ProfileEvents::KeeperPacketsSent, 1);
}
void KeeperConnectionStats::updateLatency(uint64_t latency_ms)
{
last_latency.store(latency_ms, std::memory_order_relaxed);
total_latency.fetch_add(latency_ms, std::memory_order_relaxed);
ProfileEvents::increment(ProfileEvents::KeeperLatency, latency_ms);
count.fetch_add(1, std::memory_order_relaxed);
ProfileEvents::increment(ProfileEvents::KeeperRequestTotal, 1);
uint64_t prev_val = min_latency.load(std::memory_order_relaxed);
while (prev_val > latency_ms && !min_latency.compare_exchange_weak(prev_val, latency_ms, std::memory_order_relaxed)) {}

View File

@ -7,6 +7,13 @@
#include <Common/hex.h>
#include <filesystem>
#include <Common/checkStackSize.h>
#include <Common/CurrentMetrics.h>
namespace CurrentMetrics
{
extern const Metric KeeperAliveConnections;
extern const Metric KeeperOutstandingRequets;
}
namespace fs = std::filesystem;
@ -57,6 +64,7 @@ void KeeperDispatcher::requestThread()
{
if (requests_queue->tryPop(request, max_wait))
{
CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets);
if (shutdown_called)
break;
@ -78,6 +86,7 @@ void KeeperDispatcher::requestThread()
/// Trying to get batch requests as fast as possible
if (requests_queue->tryPop(request, 1))
{
CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets);
/// Don't append read request into batch, we have to process them separately
if (!coordination_settings->quorum_reads && request.request->isReadRequest())
{
@ -215,7 +224,8 @@ void KeeperDispatcher::setResponse(int64_t session_id, const Coordination::ZooKe
/// Session was disconnected, just skip this response
if (session_response_callback == session_to_response_callback.end())
{
LOG_TEST(log, "Cannot write response xid={}, op={}, session {} disconnected", response->xid, response->getOpNum(), session_id);
LOG_TEST(log, "Cannot write response xid={}, op={}, session {} disconnected",
response->xid, response->xid == Coordination::WATCH_XID ? "Watch" : toString(response->getOpNum()), session_id);
return;
}
@ -225,6 +235,7 @@ void KeeperDispatcher::setResponse(int64_t session_id, const Coordination::ZooKe
if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close)
{
session_to_response_callback.erase(session_response_callback);
CurrentMetrics::sub(CurrentMetrics::KeeperAliveConnections);
}
}
}
@ -259,6 +270,7 @@ bool KeeperDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & requ
{
throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED);
}
CurrentMetrics::add(CurrentMetrics::KeeperOutstandingRequets);
return true;
}
@ -350,6 +362,7 @@ void KeeperDispatcher::shutdown()
/// Set session expired for all pending requests
while (requests_queue && requests_queue->tryPop(request_for_session))
{
CurrentMetrics::sub(CurrentMetrics::KeeperOutstandingRequets);
auto response = request_for_session.request->makeResponse();
response->error = Coordination::Error::ZSESSIONEXPIRED;
setResponse(request_for_session.session_id, response);
@ -358,6 +371,7 @@ void KeeperDispatcher::shutdown()
/// Clear all registered sessions
std::lock_guard lock(session_to_response_callback_mutex);
session_to_response_callback.clear();
CurrentMetrics::set(CurrentMetrics::KeeperAliveConnections, 0);
}
catch (...)
{
@ -382,6 +396,7 @@ void KeeperDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCall
std::lock_guard lock(session_to_response_callback_mutex);
if (!session_to_response_callback.try_emplace(session_id, callback).second)
throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id);
CurrentMetrics::add(CurrentMetrics::KeeperAliveConnections);
}
void KeeperDispatcher::sessionCleanerTask()
@ -414,6 +429,7 @@ void KeeperDispatcher::sessionCleanerTask()
std::lock_guard lock(push_request_mutex);
if (!requests_queue->push(std::move(request_info)))
LOG_INFO(log, "Cannot push close request to queue while cleaning outdated sessions");
CurrentMetrics::add(CurrentMetrics::KeeperOutstandingRequets);
}
/// Remove session from registered sessions
@ -437,7 +453,10 @@ void KeeperDispatcher::finishSession(int64_t session_id)
std::lock_guard lock(session_to_response_callback_mutex);
auto session_it = session_to_response_callback.find(session_id);
if (session_it != session_to_response_callback.end())
{
session_to_response_callback.erase(session_it);
CurrentMetrics::sub(CurrentMetrics::KeeperAliveConnections);
}
}
void KeeperDispatcher::addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error)
@ -520,6 +539,7 @@ int64_t KeeperDispatcher::getSessionID(int64_t session_timeout_ms)
std::lock_guard lock(push_request_mutex);
if (!requests_queue->tryPush(std::move(request_info), session_timeout_ms))
throw Exception("Cannot push session id request to queue within session timeout", ErrorCodes::TIMEOUT_EXCEEDED);
CurrentMetrics::add(CurrentMetrics::KeeperOutstandingRequets);
}
if (future.wait_for(std::chrono::milliseconds(session_timeout_ms)) != std::future_status::ready)

View File

@ -8,8 +8,21 @@
#include <sys/mman.h>
#include "Common/ZooKeeper/ZooKeeperCommon.h"
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/ProfileEvents.h>
#include "Coordination/KeeperStorage.h"
namespace ProfileEvents
{
extern const Event KeeperCommits;
extern const Event KeeperCommitsFailed;
extern const Event KeeperSnapshotCreations;
extern const Event KeeperSnapshotCreationsFailed;
extern const Event KeeperSnapshotApplys;
extern const Event KeeperSnapshotApplysFailed;
extern const Event KeeperReadSnapshot;
extern const Event KeeperSaveSnapshot;
}
namespace DB
{
@ -219,7 +232,10 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms);
response->session_id = session_id;
if (!responses_queue.push(response_for_session))
{
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push response with session id {} into responses queue", session_id);
}
}
}
else
@ -229,10 +245,13 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
request_for_session.request, request_for_session.session_id, request_for_session.zxid);
for (auto & response_for_session : responses_for_sessions)
if (!responses_queue.push(response_for_session))
{
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
throw Exception(
ErrorCodes::SYSTEM_ERROR,
"Could not push response with session id {} into responses queue",
response_for_session.session_id);
}
if (digest_enabled && request_for_session.digest)
{
@ -240,6 +259,7 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
}
}
ProfileEvents::increment(ProfileEvents::KeeperCommits);
last_committed_idx = log_idx;
return nullptr;
}
@ -251,11 +271,14 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{ /// save snapshot into memory
std::lock_guard lock(snapshots_lock);
if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx())
{
ProfileEvents::increment(ProfileEvents::KeeperSnapshotApplysFailed);
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Required to apply snapshot with last log index {}, but our last log index is {}",
s.get_last_log_idx(),
latest_snapshot_meta->get_last_log_idx());
}
latest_snapshot_ptr = latest_snapshot_buf;
}
@ -268,6 +291,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
cluster_config = snapshot_deserialization_result.cluster_config;
}
ProfileEvents::increment(ProfileEvents::KeeperSnapshotApplys);
last_committed_idx = s.get_last_log_idx();
return true;
}
@ -335,6 +359,7 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
}
latest_snapshot_path = path;
latest_snapshot_meta = snapshot->snapshot_meta;
ProfileEvents::increment(ProfileEvents::KeeperSnapshotCreations);
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), path);
}
@ -350,6 +375,7 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
}
catch (...)
{
ProfileEvents::increment(ProfileEvents::KeeperSnapshotCreationsFailed);
LOG_TRACE(log, "Exception happened during snapshot");
tryLogCurrentException(log);
ret = false;
@ -383,6 +409,7 @@ void KeeperStateMachine::save_logical_snp_obj(
latest_snapshot_meta = cloned_meta;
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), result_path);
obj_id++;
ProfileEvents::increment(ProfileEvents::KeeperSaveSnapshot);
}
catch (...)
{
@ -444,6 +471,7 @@ int KeeperStateMachine::read_logical_snp_obj(
return -1;
}
is_last_obj = true;
ProfileEvents::increment(ProfileEvents::KeeperReadSnapshot);
return 1;
}

View File

@ -36,7 +36,8 @@ IMPLEMENT_SETTING_MULTI_ENUM(JoinAlgorithm, ErrorCodes::UNKNOWN_JOIN,
{"partial_merge", JoinAlgorithm::PARTIAL_MERGE},
{"prefer_partial_merge", JoinAlgorithm::PREFER_PARTIAL_MERGE},
{"parallel_hash", JoinAlgorithm::PARALLEL_HASH},
{"direct", JoinAlgorithm::DIRECT}})
{"direct", JoinAlgorithm::DIRECT},
{"full_sorting_merge", JoinAlgorithm::FULL_SORTING_MERGE}})
IMPLEMENT_SETTING_ENUM(TotalsMode, ErrorCodes::UNKNOWN_TOTALS_MODE,

View File

@ -44,6 +44,7 @@ enum class JoinAlgorithm
PREFER_PARTIAL_MERGE,
PARALLEL_HASH,
DIRECT,
FULL_SORTING_MERGE,
};
DECLARE_SETTING_MULTI_ENUM(JoinAlgorithm)

View File

@ -192,7 +192,8 @@ namespace
}
template <>
SettingFieldSeconds::SettingFieldTimespan(const Field & f) : SettingFieldTimespan(float64AsSecondsToTimespan(fieldToNumber<Float64>(f)))
SettingFieldSeconds::SettingFieldTimespan(const Field & f)
: SettingFieldTimespan(Poco::Timespan{float64AsSecondsToTimespan(fieldToNumber<Float64>(f))})
{
}

View File

@ -142,9 +142,12 @@ struct SortCursorImpl
bool isLast() const { return pos + 1 >= rows; }
bool isLast(size_t size) const { return pos + size >= rows; }
bool isValid() const { return pos < rows; }
void next() { ++pos; }
void next(size_t size) { pos += size; }
size_t getSize() const { return rows; }
size_t rowsLeft() const { return rows - pos; }
/// Prevent using pos instead of getRow()
private:

View File

@ -395,7 +395,12 @@ private:
#if defined(SANITIZER)
extern "C" void __sanitizer_set_death_callback(void (*)());
static void sanitizerDeathCallback()
/// Sanitizers may not expect some function calls from death callback.
/// Let's try to disable instrumentation to avoid possible issues.
/// However, this callback may call other functions that are still instrumented.
/// We can try [[clang::always_inline]] attribute for statements in future (available in clang-15)
/// See https://github.com/google/sanitizers/issues/1543 and https://github.com/google/sanitizers/issues/1549.
static DISABLE_SANITIZER_INSTRUMENTATION void sanitizerDeathCallback()
{
DENY_ALLOCATIONS_IN_SCOPE;
/// Also need to send data via pipe. Otherwise it may lead to deadlocks or failures in printing diagnostic info.

View File

@ -71,6 +71,7 @@ public:
DiskType getType() const override { return delegate->getType(); }
bool isRemote() const override { return delegate->isRemote(); }
bool supportZeroCopyReplication() const override { return delegate->supportZeroCopyReplication(); }
bool supportParallelWrite() const override { return delegate->supportParallelWrite(); }
void onFreeze(const String & path) override;
SyncGuardPtr getDirectorySyncGuard(const String & path) const override;
void shutdown() override;

View File

@ -46,7 +46,7 @@ public:
size_t getNumberOfArguments() const override { return 3; }
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
@ -65,26 +65,30 @@ public:
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
{
const ColumnPtr & column_haystack = arguments[0].column;
const ColumnPtr & num_ptr = arguments[1].column;
const ColumnPtr & arr_ptr = arguments[2].column;
const ColumnPtr & haystack_ptr = arguments[0].column;
const ColumnPtr & edit_distance_ptr = arguments[1].column;
const ColumnPtr & needles_ptr = arguments[2].column;
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
assert(col_haystack_vector); // getReturnTypeImpl() checks the data type
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*haystack_ptr);
const ColumnConst * col_haystack_const = checkAndGetColumnConst<ColumnString>(&*haystack_ptr);
assert(static_cast<bool>(col_haystack_vector) ^ static_cast<bool>(col_haystack_const));
UInt32 edit_distance = 0;
if (const auto * col_const_uint8 = checkAndGetColumnConst<ColumnUInt8>(num_ptr.get()))
if (const auto * col_const_uint8 = checkAndGetColumnConst<ColumnUInt8>(edit_distance_ptr.get()))
edit_distance = col_const_uint8->getValue<UInt8>();
else if (const auto * col_const_uint16 = checkAndGetColumnConst<ColumnUInt16>(num_ptr.get()))
else if (const auto * col_const_uint16 = checkAndGetColumnConst<ColumnUInt16>(edit_distance_ptr.get()))
edit_distance = col_const_uint16->getValue<UInt16>();
else if (const auto * col_const_uint32 = checkAndGetColumnConst<ColumnUInt32>(num_ptr.get()))
else if (const auto * col_const_uint32 = checkAndGetColumnConst<ColumnUInt32>(edit_distance_ptr.get()))
edit_distance = col_const_uint32->getValue<UInt32>();
else
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The number is not const or does not fit in UInt32", arguments[1].column->getName());
const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
if (!col_const_arr)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {}. The array is not const", arguments[2].column->getName());
const ColumnArray * col_needles_vector = checkAndGetColumn<ColumnArray>(needles_ptr.get());
const ColumnConst * col_needles_const = checkAndGetColumnConst<ColumnArray>(needles_ptr.get());
assert(static_cast<bool>(col_needles_vector) ^ static_cast<bool>(col_needles_const));
if (col_haystack_const && col_needles_vector)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Function '{}' doesn't support search with non-constant needles in constant haystack", name);
using ResultType = typename Impl::ResultType;
auto col_res = ColumnVector<ResultType>::create();
@ -92,12 +96,30 @@ public:
auto & vec_res = col_res->getData();
auto & offsets_res = col_offsets->getData();
// the implementations are responsible for resizing the output column
/// the implementations are responsible for resizing the output column
Array needles_arr = col_const_arr->getValue<Array>();
Impl::vectorConstant(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needles_arr, vec_res, offsets_res, edit_distance,
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
if (col_needles_const)
{
Impl::vectorConstant(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
col_needles_const->getValue<Array>(),
vec_res, offsets_res,
edit_distance,
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
}
else
{
Impl::vectorVector(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
col_needles_vector->getData(), col_needles_vector->getOffsets(),
vec_res, offsets_res,
edit_distance,
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
}
// the combination of const haystack + const needle is not implemented because
// useDefaultImplementationForConstants() == true makes upper layers convert both to
// non-const columns
if constexpr (Impl::is_column_array)
return ColumnArray::create(std::move(col_res), std::move(col_offsets));

Some files were not shown because too many files have changed in this diff Show More