Merge branch 'master' into pufit/fix-on-cluster-user

# Conflicts: # src/Core/ServerSettings.cpp # src/Interpreters/executeDDLQueryOnCluster.cpp
2024-12-03 13:02:00 +00:00 · 2024-11-08 02:51:53 -05:00 · 2024-11-08 02:51:53 -05:00 · 23f9f24b3c
commit 23f9f24b3c
parent a91af7030c a15e5b72af
458 changed files with 10924 additions and 4952 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -332,7 +332,7 @@
 	url = https://github.com/ClickHouse/usearch.git
 [submodule "contrib/SimSIMD"]
 	path = contrib/SimSIMD
-	url = https://github.com/ashvardanian/SimSIMD.git
+	url = https://github.com/ClickHouse/SimSIMD.git
 [submodule "contrib/FP16"]
 	path = contrib/FP16
 	url = https://github.com/Maratyszcza/FP16.git
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -488,6 +488,7 @@
 * Remove `is_deterministic` field from the `system.functions` table. [#66630](https://github.com/ClickHouse/ClickHouse/pull/66630) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
 * Function `tuple` will now try to construct named tuples in query (controlled by `enable_named_columns_in_function_tuple`). Introduce function `tupleNames` to extract names from tuples. [#54881](https://github.com/ClickHouse/ClickHouse/pull/54881) ([Amos Bird](https://github.com/amosbird)).
 * Change how deduplication for Materialized Views works. Fixed a lot of cases like: - on destination table: data is split for 2 or more blocks and that blocks is considered as duplicate when that block is inserted in parallel. - on MV destination table: the equal blocks are deduplicated, that happens when MV often produces equal data as a result for different input data due to performing aggregation. - on MV destination table: the equal blocks which comes from different MV are deduplicated. [#61601](https://github.com/ClickHouse/ClickHouse/pull/61601) ([Sema Checherinda](https://github.com/CheSema)).
+* Functions `bitShiftLeft` and `bitShitfRight` return an error for out of bounds shift positions [#65838](https://github.com/ClickHouse/ClickHouse/pull/65838) ([Pablo Marcos](https://github.com/pamarcos)).

 #### New Feature
 * Add `ASOF JOIN` support for `full_sorting_join` algorithm. [#55051](https://github.com/ClickHouse/ClickHouse/pull/55051) ([vdimir](https://github.com/vdimir)).
@ -599,7 +600,6 @@
 * Functions `bitTest`, `bitTestAll`, and `bitTestAny` now return an error if the specified bit index is out-of-bounds [#65818](https://github.com/ClickHouse/ClickHouse/pull/65818) ([Pablo Marcos](https://github.com/pamarcos)).
 * Setting `join_any_take_last_row` is supported in any query with hash join. [#65820](https://github.com/ClickHouse/ClickHouse/pull/65820) ([vdimir](https://github.com/vdimir)).
 * Better handling of join conditions involving `IS NULL` checks (for example `ON (a = b AND (a IS NOT NULL) AND (b IS NOT NULL) ) OR ( (a IS NULL) AND (b IS NULL) )` is rewritten to `ON a <=> b`), fix incorrect optimization when condition other then `IS NULL` are present. [#65835](https://github.com/ClickHouse/ClickHouse/pull/65835) ([vdimir](https://github.com/vdimir)).
-* Functions `bitShiftLeft` and `bitShitfRight` return an error for out of bounds shift positions [#65838](https://github.com/ClickHouse/ClickHouse/pull/65838) ([Pablo Marcos](https://github.com/pamarcos)).
 * Fix growing memory usage in S3Queue. [#65839](https://github.com/ClickHouse/ClickHouse/pull/65839) ([Kseniia Sumarokova](https://github.com/kssenii)).
 * Fix tie handling in `arrayAUC` to match sklearn. [#65840](https://github.com/ClickHouse/ClickHouse/pull/65840) ([gabrielmcg44](https://github.com/gabrielmcg44)).
 * Fix possible issues with MySQL server protocol TLS connections. [#65917](https://github.com/ClickHouse/ClickHouse/pull/65917) ([Azat Khuzhin](https://github.com/azat)).
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -88,6 +88,7 @@ string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
 list(REVERSE CMAKE_FIND_LIBRARY_SUFFIXES)

 option (ENABLE_FUZZING "Fuzzy testing using libfuzzer" OFF)
+option (ENABLE_FUZZER_TEST "Build testing fuzzers in order to test libFuzzer functionality" OFF)

 if (ENABLE_FUZZING)
    # Also set WITH_COVERAGE=1 for better fuzzing process
--- a/SECURITY.md
+++ b/SECURITY.md
@ -14,9 +14,10 @@ The following versions of ClickHouse server are currently supported with securit

 | Version | Supported |
 |:-|:-|
+| 24.10 | ✔️ |
 | 24.9 | ✔️ |
 | 24.8 | ✔️ |
-| 24.7 | ✔️ |
+| 24.7 | ❌ |
 | 24.6 | ❌ |
 | 24.5 | ❌ |
 | 24.4 | ❌ |
--- a/base/base/StringRef.h
+++ b/base/base/StringRef.h
@ -86,7 +86,7 @@ using StringRefs = std::vector<StringRef>;
  * For more information, see hash_map_string_2.cpp
  */

-inline bool compare8(const char * p1, const char * p2)
+inline bool compare16(const char * p1, const char * p2)
 {
    return 0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1)),
@ -115,7 +115,7 @@ inline bool compare64(const char * p1, const char * p2)

 #elif defined(__aarch64__) && defined(__ARM_NEON)

-inline bool compare8(const char * p1, const char * p2)
+inline bool compare16(const char * p1, const char * p2)
 {
    uint64_t mask = getNibbleMask(vceqq_u8(
            vld1q_u8(reinterpret_cast<const unsigned char *>(p1)), vld1q_u8(reinterpret_cast<const unsigned char *>(p2))));
@ -185,13 +185,22 @@ inline bool memequalWide(const char * p1, const char * p2, size_t size)

    switch (size / 16) // NOLINT(bugprone-switch-missing-default-case)
    {
-        case 3: if (!compare8(p1 + 32, p2 + 32)) return false; [[fallthrough]];
-        case 2: if (!compare8(p1 + 16, p2 + 16)) return false; [[fallthrough]];
-        case 1: if (!compare8(p1, p2)) return false; [[fallthrough]];
+        case 3:
+            if (!compare16(p1 + 32, p2 + 32))
+                return false;
+            [[fallthrough]];
+        case 2:
+            if (!compare16(p1 + 16, p2 + 16))
+                return false;
+            [[fallthrough]];
+        case 1:
+            if (!compare16(p1, p2))
+                return false;
+            [[fallthrough]];
        default: ;
    }

-    return compare8(p1 + size - 16, p2 + size - 16);
+    return compare16(p1 + size - 16, p2 + size - 16);
 }

 #endif
--- a/base/base/chrono_io.h
+++ b/base/base/chrono_io.h
@ -4,6 +4,7 @@
 #include <string>
 #include <sstream>
 #include <cctz/time_zone.h>
+#include <fmt/core.h>


 inline std::string to_string(const std::time_t & time)
@ -11,18 +12,6 @@ inline std::string to_string(const std::time_t & time)
    return cctz::format("%Y-%m-%d %H:%M:%S", std::chrono::system_clock::from_time_t(time), cctz::local_time_zone());
 }

-template <typename Clock, typename Duration = typename Clock::duration>
-std::string to_string(const std::chrono::time_point<Clock, Duration> & tp)
-{
-    // Don't use DateLUT because it shows weird characters for
-    // TimePoint::max(). I wish we could use C++20 format, but it's not
-    // there yet.
-    // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp));
-
-    auto in_time_t = std::chrono::system_clock::to_time_t(tp);
-    return to_string(in_time_t);
-}
-
 template <typename Rep, typename Period = std::ratio<1>>
 std::string to_string(const std::chrono::duration<Rep, Period> & duration)
 {
@ -33,6 +22,20 @@ std::string to_string(const std::chrono::duration<Rep, Period> & duration)
    return std::to_string(seconds_as_double.count()) + "s";
 }

+template <typename Clock, typename Duration = typename Clock::duration>
+std::string to_string(const std::chrono::time_point<Clock, Duration> & tp)
+{
+    // Don't use DateLUT because it shows weird characters for
+    // TimePoint::max(). I wish we could use C++20 format, but it's not
+    // there yet.
+    // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp));
+
+    if constexpr (std::is_same_v<Clock, std::chrono::system_clock>)
+        return to_string(std::chrono::system_clock::to_time_t(tp));
+    else
+        return to_string(tp.time_since_epoch());
+}
+
 template <typename Clock, typename Duration = typename Clock::duration>
 std::ostream & operator<<(std::ostream & o, const std::chrono::time_point<Clock, Duration> & tp)
 {
@ -44,3 +47,23 @@ std::ostream & operator<<(std::ostream & o, const std::chrono::duration<Rep, Per
 {
    return o << to_string(duration);
 }
+
+template <typename Clock, typename Duration>
+struct fmt::formatter<std::chrono::time_point<Clock, Duration>> : fmt::formatter<std::string>
+{
+    template <typename FormatCtx>
+    auto format(const std::chrono::time_point<Clock, Duration> & tp, FormatCtx & ctx) const
+    {
+        return fmt::formatter<std::string>::format(::to_string(tp), ctx);
+    }
+};
+
+template <typename Rep, typename Period>
+struct fmt::formatter<std::chrono::duration<Rep, Period>> : fmt::formatter<std::string>
+{
+    template <typename FormatCtx>
+    auto format(const std::chrono::duration<Rep, Period> & duration, FormatCtx & ctx) const
+    {
+        return fmt::formatter<std::string>::format(::to_string(duration), ctx);
+    }
+};
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@ -1 +1 @@
-Subproject commit ff51434d90c66f916e94ff05b24530b127aa4cff
+Subproject commit ee3c9c9c00b51645f62a1a9e99611b78c0052a21
--- a/contrib/SimSIMD-cmake/CMakeLists.txt
+++ b/contrib/SimSIMD-cmake/CMakeLists.txt
@ -1,4 +1,8 @@
-set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
-
-add_library(_simsimd INTERFACE)
-target_include_directories(_simsimd SYSTEM INTERFACE "${SIMSIMD_PROJECT_DIR}/include")
+# See contrib/usearch-cmake/CMakeLists.txt, why only enabled on x86
+if (ARCH_AMD64)
+    set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
+    set(SIMSIMD_SRCS ${SIMSIMD_PROJECT_DIR}/c/lib.c)
+    add_library(_simsimd ${SIMSIMD_SRCS})
+    target_include_directories(_simsimd SYSTEM PUBLIC "${SIMSIMD_PROJECT_DIR}/include")
+    target_compile_definitions(_simsimd PUBLIC SIMSIMD_DYNAMIC_DISPATCH)
+endif()
--- a/contrib/arrow
+++ b/contrib/arrow
@ -1 +1 @@
-Subproject commit 5cfccd8ea65f33d4517e7409815d761c7650b45d
+Subproject commit 6e2574f5013a005c050c9a7787d341aef09d0063
--- a/contrib/arrow-cmake/CMakeLists.txt
+++ b/contrib/arrow-cmake/CMakeLists.txt
@ -213,13 +213,19 @@ target_include_directories(_orc SYSTEM PRIVATE
 set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow")

 # arrow/cpp/src/arrow/CMakeLists.txt (ARROW_SRCS + ARROW_COMPUTE + ARROW_IPC)
+# find . \( -iname \*.cc -o -iname \*.cpp -o -iname \*.c \) | sort | awk '{print "\"${LIBRARY_DIR}" substr($1,2) "\"" }' | grep -v 'test.cc' | grep -v 'json' | grep -v 'flight' \|
+# grep -v 'csv' | grep -v 'acero' | grep -v 'dataset' | grep -v 'testing' | grep -v 'gpu' | grep -v 'engine' | grep -v 'filesystem' | grep -v 'benchmark.cc'
 set(ARROW_SRCS
+        "${LIBRARY_DIR}/adapters/orc/adapter.cc"
+        "${LIBRARY_DIR}/adapters/orc/options.cc"
+        "${LIBRARY_DIR}/adapters/orc/util.cc"
        "${LIBRARY_DIR}/array/array_base.cc"
        "${LIBRARY_DIR}/array/array_binary.cc"
        "${LIBRARY_DIR}/array/array_decimal.cc"
        "${LIBRARY_DIR}/array/array_dict.cc"
        "${LIBRARY_DIR}/array/array_nested.cc"
        "${LIBRARY_DIR}/array/array_primitive.cc"
+        "${LIBRARY_DIR}/array/array_run_end.cc"
        "${LIBRARY_DIR}/array/builder_adaptive.cc"
        "${LIBRARY_DIR}/array/builder_base.cc"
        "${LIBRARY_DIR}/array/builder_binary.cc"
@ -227,124 +233,26 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/array/builder_dict.cc"
        "${LIBRARY_DIR}/array/builder_nested.cc"
        "${LIBRARY_DIR}/array/builder_primitive.cc"
-        "${LIBRARY_DIR}/array/builder_union.cc"
        "${LIBRARY_DIR}/array/builder_run_end.cc"
-        "${LIBRARY_DIR}/array/array_run_end.cc"
+        "${LIBRARY_DIR}/array/builder_union.cc"
        "${LIBRARY_DIR}/array/concatenate.cc"
        "${LIBRARY_DIR}/array/data.cc"
        "${LIBRARY_DIR}/array/diff.cc"
        "${LIBRARY_DIR}/array/util.cc"
        "${LIBRARY_DIR}/array/validate.cc"
-        "${LIBRARY_DIR}/builder.cc"
        "${LIBRARY_DIR}/buffer.cc"
-        "${LIBRARY_DIR}/chunked_array.cc"
-        "${LIBRARY_DIR}/chunk_resolver.cc"
-        "${LIBRARY_DIR}/compare.cc"
-        "${LIBRARY_DIR}/config.cc"
-        "${LIBRARY_DIR}/datum.cc"
-        "${LIBRARY_DIR}/device.cc"
-        "${LIBRARY_DIR}/extension_type.cc"
-        "${LIBRARY_DIR}/memory_pool.cc"
-        "${LIBRARY_DIR}/pretty_print.cc"
-        "${LIBRARY_DIR}/record_batch.cc"
-        "${LIBRARY_DIR}/result.cc"
-        "${LIBRARY_DIR}/scalar.cc"
-        "${LIBRARY_DIR}/sparse_tensor.cc"
-        "${LIBRARY_DIR}/status.cc"
-        "${LIBRARY_DIR}/table.cc"
-        "${LIBRARY_DIR}/table_builder.cc"
-        "${LIBRARY_DIR}/tensor.cc"
-        "${LIBRARY_DIR}/tensor/coo_converter.cc"
-        "${LIBRARY_DIR}/tensor/csf_converter.cc"
-        "${LIBRARY_DIR}/tensor/csx_converter.cc"
-        "${LIBRARY_DIR}/type.cc"
-        "${LIBRARY_DIR}/visitor.cc"
+        "${LIBRARY_DIR}/builder.cc"
        "${LIBRARY_DIR}/c/bridge.cc"
-        "${LIBRARY_DIR}/io/buffered.cc"
-        "${LIBRARY_DIR}/io/caching.cc"
-        "${LIBRARY_DIR}/io/compressed.cc"
-        "${LIBRARY_DIR}/io/file.cc"
-        "${LIBRARY_DIR}/io/hdfs.cc"
-        "${LIBRARY_DIR}/io/hdfs_internal.cc"
-        "${LIBRARY_DIR}/io/interfaces.cc"
-        "${LIBRARY_DIR}/io/memory.cc"
-        "${LIBRARY_DIR}/io/slow.cc"
-        "${LIBRARY_DIR}/io/stdio.cc"
-        "${LIBRARY_DIR}/io/transform.cc"
-        "${LIBRARY_DIR}/util/async_util.cc"
-        "${LIBRARY_DIR}/util/basic_decimal.cc"
-        "${LIBRARY_DIR}/util/bit_block_counter.cc"
-        "${LIBRARY_DIR}/util/bit_run_reader.cc"
-        "${LIBRARY_DIR}/util/bit_util.cc"
-        "${LIBRARY_DIR}/util/bitmap.cc"
-        "${LIBRARY_DIR}/util/bitmap_builders.cc"
-        "${LIBRARY_DIR}/util/bitmap_ops.cc"
-        "${LIBRARY_DIR}/util/bpacking.cc"
-        "${LIBRARY_DIR}/util/cancel.cc"
-        "${LIBRARY_DIR}/util/compression.cc"
-        "${LIBRARY_DIR}/util/counting_semaphore.cc"
-        "${LIBRARY_DIR}/util/cpu_info.cc"
-        "${LIBRARY_DIR}/util/decimal.cc"
-        "${LIBRARY_DIR}/util/delimiting.cc"
-        "${LIBRARY_DIR}/util/formatting.cc"
-        "${LIBRARY_DIR}/util/future.cc"
-        "${LIBRARY_DIR}/util/int_util.cc"
-        "${LIBRARY_DIR}/util/io_util.cc"
-        "${LIBRARY_DIR}/util/logging.cc"
-        "${LIBRARY_DIR}/util/key_value_metadata.cc"
-        "${LIBRARY_DIR}/util/memory.cc"
-        "${LIBRARY_DIR}/util/mutex.cc"
-        "${LIBRARY_DIR}/util/string.cc"
-        "${LIBRARY_DIR}/util/string_builder.cc"
-        "${LIBRARY_DIR}/util/task_group.cc"
-        "${LIBRARY_DIR}/util/tdigest.cc"
-        "${LIBRARY_DIR}/util/thread_pool.cc"
-        "${LIBRARY_DIR}/util/time.cc"
-        "${LIBRARY_DIR}/util/trie.cc"
-        "${LIBRARY_DIR}/util/unreachable.cc"
-        "${LIBRARY_DIR}/util/uri.cc"
-        "${LIBRARY_DIR}/util/utf8.cc"
-        "${LIBRARY_DIR}/util/value_parsing.cc"
-        "${LIBRARY_DIR}/util/byte_size.cc"
-        "${LIBRARY_DIR}/util/debug.cc"
-        "${LIBRARY_DIR}/util/tracing.cc"
-        "${LIBRARY_DIR}/util/atfork_internal.cc"
-        "${LIBRARY_DIR}/util/crc32.cc"
-        "${LIBRARY_DIR}/util/hashing.cc"
-        "${LIBRARY_DIR}/util/ree_util.cc"
-        "${LIBRARY_DIR}/util/union_util.cc"
-        "${LIBRARY_DIR}/vendored/base64.cpp"
-        "${LIBRARY_DIR}/vendored/datetime/tz.cpp"
-        "${LIBRARY_DIR}/vendored/musl/strptime.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriFile.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriParse.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c"
-        "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc"
-
+        "${LIBRARY_DIR}/c/dlpack.cc"
+        "${LIBRARY_DIR}/chunk_resolver.cc"
+        "${LIBRARY_DIR}/chunked_array.cc"
+        "${LIBRARY_DIR}/compare.cc"
        "${LIBRARY_DIR}/compute/api_aggregate.cc"
        "${LIBRARY_DIR}/compute/api_scalar.cc"
        "${LIBRARY_DIR}/compute/api_vector.cc"
        "${LIBRARY_DIR}/compute/cast.cc"
        "${LIBRARY_DIR}/compute/exec.cc"
+        "${LIBRARY_DIR}/compute/expression.cc"
        "${LIBRARY_DIR}/compute/function.cc"
        "${LIBRARY_DIR}/compute/function_internal.cc"
        "${LIBRARY_DIR}/compute/kernel.cc"
@ -355,6 +263,7 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc"
        "${LIBRARY_DIR}/compute/kernels/codegen_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/hash_aggregate.cc"
+        "${LIBRARY_DIR}/compute/kernels/ree_util_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/row_encoder.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc"
@ -382,30 +291,139 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_hash.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_nested.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_pairwise.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_rank.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_replace.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_run_end_encode.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection.cc"
-        "${LIBRARY_DIR}/compute/kernels/vector_sort.cc"
-        "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection_filter_internal.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection_take_internal.cc"
-        "${LIBRARY_DIR}/compute/light_array.cc"
-        "${LIBRARY_DIR}/compute/registry.cc"
-        "${LIBRARY_DIR}/compute/expression.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_sort.cc"
+        "${LIBRARY_DIR}/compute/key_hash_internal.cc"
+        "${LIBRARY_DIR}/compute/key_map_internal.cc"
+        "${LIBRARY_DIR}/compute/light_array_internal.cc"
        "${LIBRARY_DIR}/compute/ordering.cc"
+        "${LIBRARY_DIR}/compute/registry.cc"
        "${LIBRARY_DIR}/compute/row/compare_internal.cc"
        "${LIBRARY_DIR}/compute/row/encode_internal.cc"
        "${LIBRARY_DIR}/compute/row/grouper.cc"
        "${LIBRARY_DIR}/compute/row/row_internal.cc"
-
+        "${LIBRARY_DIR}/compute/util.cc"
+        "${LIBRARY_DIR}/config.cc"
+        "${LIBRARY_DIR}/datum.cc"
+        "${LIBRARY_DIR}/device.cc"
+        "${LIBRARY_DIR}/extension_type.cc"
+        "${LIBRARY_DIR}/integration/c_data_integration_internal.cc"
+        "${LIBRARY_DIR}/io/buffered.cc"
+        "${LIBRARY_DIR}/io/caching.cc"
+        "${LIBRARY_DIR}/io/compressed.cc"
+        "${LIBRARY_DIR}/io/file.cc"
+        "${LIBRARY_DIR}/io/hdfs.cc"
+        "${LIBRARY_DIR}/io/hdfs_internal.cc"
+        "${LIBRARY_DIR}/io/interfaces.cc"
+        "${LIBRARY_DIR}/io/memory.cc"
+        "${LIBRARY_DIR}/io/slow.cc"
+        "${LIBRARY_DIR}/io/stdio.cc"
+        "${LIBRARY_DIR}/io/transform.cc"
        "${LIBRARY_DIR}/ipc/dictionary.cc"
        "${LIBRARY_DIR}/ipc/feather.cc"
+        "${LIBRARY_DIR}/ipc/file_to_stream.cc"
        "${LIBRARY_DIR}/ipc/message.cc"
        "${LIBRARY_DIR}/ipc/metadata_internal.cc"
        "${LIBRARY_DIR}/ipc/options.cc"
        "${LIBRARY_DIR}/ipc/reader.cc"
+        "${LIBRARY_DIR}/ipc/stream_to_file.cc"
        "${LIBRARY_DIR}/ipc/writer.cc"
+        "${LIBRARY_DIR}/memory_pool.cc"
+        "${LIBRARY_DIR}/pretty_print.cc"
+        "${LIBRARY_DIR}/record_batch.cc"
+        "${LIBRARY_DIR}/result.cc"
+        "${LIBRARY_DIR}/scalar.cc"
+        "${LIBRARY_DIR}/sparse_tensor.cc"
+        "${LIBRARY_DIR}/status.cc"
+        "${LIBRARY_DIR}/table.cc"
+        "${LIBRARY_DIR}/table_builder.cc"
+        "${LIBRARY_DIR}/tensor.cc"
+        "${LIBRARY_DIR}/tensor/coo_converter.cc"
+        "${LIBRARY_DIR}/tensor/csf_converter.cc"
+        "${LIBRARY_DIR}/tensor/csx_converter.cc"
+        "${LIBRARY_DIR}/type.cc"
+        "${LIBRARY_DIR}/type_traits.cc"
+        "${LIBRARY_DIR}/util/align_util.cc"
+        "${LIBRARY_DIR}/util/async_util.cc"
+        "${LIBRARY_DIR}/util/atfork_internal.cc"
+        "${LIBRARY_DIR}/util/basic_decimal.cc"
+        "${LIBRARY_DIR}/util/bit_block_counter.cc"
+        "${LIBRARY_DIR}/util/bit_run_reader.cc"
+        "${LIBRARY_DIR}/util/bit_util.cc"
+        "${LIBRARY_DIR}/util/bitmap.cc"
+        "${LIBRARY_DIR}/util/bitmap_builders.cc"
+        "${LIBRARY_DIR}/util/bitmap_ops.cc"
+        "${LIBRARY_DIR}/util/bpacking.cc"
+        "${LIBRARY_DIR}/util/byte_size.cc"
+        "${LIBRARY_DIR}/util/cancel.cc"
+        "${LIBRARY_DIR}/util/compression.cc"
+        "${LIBRARY_DIR}/util/counting_semaphore.cc"
+        "${LIBRARY_DIR}/util/cpu_info.cc"
+        "${LIBRARY_DIR}/util/crc32.cc"
+        "${LIBRARY_DIR}/util/debug.cc"
+        "${LIBRARY_DIR}/util/decimal.cc"
+        "${LIBRARY_DIR}/util/delimiting.cc"
+        "${LIBRARY_DIR}/util/dict_util.cc"
+        "${LIBRARY_DIR}/util/float16.cc"
+        "${LIBRARY_DIR}/util/formatting.cc"
+        "${LIBRARY_DIR}/util/future.cc"
+        "${LIBRARY_DIR}/util/hashing.cc"
+        "${LIBRARY_DIR}/util/int_util.cc"
+        "${LIBRARY_DIR}/util/io_util.cc"
+        "${LIBRARY_DIR}/util/key_value_metadata.cc"
+        "${LIBRARY_DIR}/util/list_util.cc"
+        "${LIBRARY_DIR}/util/logging.cc"
+        "${LIBRARY_DIR}/util/memory.cc"
+        "${LIBRARY_DIR}/util/mutex.cc"
+        "${LIBRARY_DIR}/util/ree_util.cc"
+        "${LIBRARY_DIR}/util/string.cc"
+        "${LIBRARY_DIR}/util/string_builder.cc"
+        "${LIBRARY_DIR}/util/task_group.cc"
+        "${LIBRARY_DIR}/util/tdigest.cc"
+        "${LIBRARY_DIR}/util/thread_pool.cc"
+        "${LIBRARY_DIR}/util/time.cc"
+        "${LIBRARY_DIR}/util/tracing.cc"
+        "${LIBRARY_DIR}/util/trie.cc"
+        "${LIBRARY_DIR}/util/union_util.cc"
+        "${LIBRARY_DIR}/util/unreachable.cc"
+        "${LIBRARY_DIR}/util/uri.cc"
+        "${LIBRARY_DIR}/util/utf8.cc"
+        "${LIBRARY_DIR}/util/value_parsing.cc"
+        "${LIBRARY_DIR}/vendored/base64.cpp"
+        "${LIBRARY_DIR}/vendored/datetime/tz.cpp"
+        "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc"
+        "${LIBRARY_DIR}/vendored/musl/strptime.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriFile.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriParse.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c"
+        "${LIBRARY_DIR}/visitor.cc"

        "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc"
        "${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc"
@ -465,22 +483,38 @@ set(PARQUET_SRCS
        "${LIBRARY_DIR}/arrow/schema.cc"
        "${LIBRARY_DIR}/arrow/schema_internal.cc"
        "${LIBRARY_DIR}/arrow/writer.cc"
+        "${LIBRARY_DIR}/benchmark_util.cc"
        "${LIBRARY_DIR}/bloom_filter.cc"
+        "${LIBRARY_DIR}/bloom_filter_reader.cc"
        "${LIBRARY_DIR}/column_reader.cc"
        "${LIBRARY_DIR}/column_scanner.cc"
        "${LIBRARY_DIR}/column_writer.cc"
        "${LIBRARY_DIR}/encoding.cc"
+        "${LIBRARY_DIR}/encryption/crypto_factory.cc"
        "${LIBRARY_DIR}/encryption/encryption.cc"
        "${LIBRARY_DIR}/encryption/encryption_internal.cc"
+        "${LIBRARY_DIR}/encryption/encryption_internal_nossl.cc"
+        "${LIBRARY_DIR}/encryption/file_key_unwrapper.cc"
+        "${LIBRARY_DIR}/encryption/file_key_wrapper.cc"
+        "${LIBRARY_DIR}/encryption/file_system_key_material_store.cc"
        "${LIBRARY_DIR}/encryption/internal_file_decryptor.cc"
        "${LIBRARY_DIR}/encryption/internal_file_encryptor.cc"
+        "${LIBRARY_DIR}/encryption/key_material.cc"
+        "${LIBRARY_DIR}/encryption/key_metadata.cc"
+        "${LIBRARY_DIR}/encryption/key_toolkit.cc"
+        "${LIBRARY_DIR}/encryption/key_toolkit_internal.cc"
+        "${LIBRARY_DIR}/encryption/kms_client.cc"
+        "${LIBRARY_DIR}/encryption/local_wrap_kms_client.cc"
+        "${LIBRARY_DIR}/encryption/openssl_internal.cc"
        "${LIBRARY_DIR}/exception.cc"
        "${LIBRARY_DIR}/file_reader.cc"
        "${LIBRARY_DIR}/file_writer.cc"
-        "${LIBRARY_DIR}/page_index.cc"
-        "${LIBRARY_DIR}/level_conversion.cc"
        "${LIBRARY_DIR}/level_comparison.cc"
+        "${LIBRARY_DIR}/level_comparison_avx2.cc"
+        "${LIBRARY_DIR}/level_conversion.cc"
+        "${LIBRARY_DIR}/level_conversion_bmi2.cc"
        "${LIBRARY_DIR}/metadata.cc"
+        "${LIBRARY_DIR}/page_index.cc"
        "${LIBRARY_DIR}/platform.cc"
        "${LIBRARY_DIR}/printer.cc"
        "${LIBRARY_DIR}/properties.cc"
@ -489,7 +523,6 @@ set(PARQUET_SRCS
        "${LIBRARY_DIR}/stream_reader.cc"
        "${LIBRARY_DIR}/stream_writer.cc"
        "${LIBRARY_DIR}/types.cc"
-        "${LIBRARY_DIR}/bloom_filter_reader.cc"
        "${LIBRARY_DIR}/xxhasher.cc"

        "${GEN_LIBRARY_DIR}/parquet_constants.cpp"
@ -520,6 +553,9 @@ endif ()
 add_definitions(-DPARQUET_THRIFT_VERSION_MAJOR=0)
 add_definitions(-DPARQUET_THRIFT_VERSION_MINOR=16)

+# As per https://github.com/apache/arrow/pull/35672 you need to enable it explicitly.
+add_definitions(-DARROW_ENABLE_THREADING)
+
 # === tools

 set(TOOLS_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/tools/parquet")
--- a/contrib/flatbuffers
+++ b/contrib/flatbuffers
@ -1 +1 @@
-Subproject commit eb3f827948241ce0e701516f16cd67324802bce9
+Subproject commit 0100f6a5779831fa7a651e4b67ef389a8752bd9b
--- a/contrib/krb5
+++ b/contrib/krb5
@ -1 +1 @@
-Subproject commit 71b06c2276009ae649c7703019f3b4605f66fd3d
+Subproject commit c5b4b994c18db86933255907a97eee5993fd18fe
--- a/contrib/usearch
+++ b/contrib/usearch
@ -1 +1 @@
-Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48
+Subproject commit 7efe8b710c9831bfe06573b1df0fad001b04a2b5
--- a/contrib/usearch-cmake/CMakeLists.txt
+++ b/contrib/usearch-cmake/CMakeLists.txt
@ -6,12 +6,63 @@ target_include_directories(_usearch SYSTEM INTERFACE ${USEARCH_PROJECT_DIR}/incl
 target_link_libraries(_usearch INTERFACE _fp16)
 target_compile_definitions(_usearch INTERFACE USEARCH_USE_FP16LIB)

-# target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD)
-# ^^ simsimd is not enabled at the moment. Reasons:
-# - Vectorization is important for raw scans but not so much for HNSW. We use usearch only for HNSW.
-# - Simsimd does compile-time dispatch (choice of SIMD kernels determined by capabilities of the build machine) or dynamic dispatch (SIMD
-#   kernels chosen at runtime based on cpuid instruction). Since current builds are limited to SSE 4.2 (x86) and NEON (ARM), the speedup of
-#   the former would be moderate compared to AVX-512 / SVE. The latter is at the moment too fragile with respect to portability across x86
-#   and ARM machines ... certain conbinations of quantizations / distance functions / SIMD instructions are not implemented at the moment.
+# Only x86 for now. On ARM, the linker goes down in flames. To make SimSIMD compile, I had to remove a macro checks in SimSIMD
+# for AVX512 (x86, worked nicely) and __ARM_BF16_FORMAT_ALTERNATIVE. It is probably because of that.
+if (ARCH_AMD64)
+    target_link_libraries(_usearch INTERFACE _simsimd)
+    target_compile_definitions(_usearch INTERFACE USEARCH_USE_SIMSIMD)
+
+    target_compile_definitions(_usearch INTERFACE USEARCH_CAN_COMPILE_FLOAT16)
+    target_compile_definitions(_usearch INTERFACE USEARCH_CAN_COMPILE_BF16)
+endif ()

 add_library(ch_contrib::usearch ALIAS _usearch)
+
+
+# Cf. https://github.com/llvm/llvm-project/issues/107810 (though it is not 100% the same stack)
+#
+# LLVM ERROR: Cannot select: 0x7996e7a73150: f32,ch = load<(load (s16) from %ir.22, !tbaa !54231), anyext from bf16> 0x79961cb737c0, 0x7996e7a1a500, undef:i64, ./contrib/SimSIMD/include/simsimd/dot.h:215:1
+#   0x7996e7a1a500: i64 = add 0x79961e770d00, Constant:i64<-16>, ./contrib/SimSIMD/include/simsimd/dot.h:215:1
+#     0x79961e770d00: i64,ch = CopyFromReg 0x79961cb737c0, Register:i64 %4, ./contrib/SimSIMD/include/simsimd/dot.h:215:1
+#       0x7996e7a1ae10: i64 = Register %4
+#     0x7996e7a1b5f0: i64 = Constant<-16>
+#   0x7996e7a1a730: i64 = undef
+# In function: _ZL23simsimd_dot_bf16_serialPKu6__bf16S0_yPd
+# PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
+# Stack dump:
+# 0.      Running pass 'Function Pass Manager' on module 'src/libdbms.a(MergeTreeIndexVectorSimilarity.cpp.o at 2312737440)'.
+# 1.      Running pass 'AArch64 Instruction Selection' on function '@_ZL23simsimd_dot_bf16_serialPKu6__bf16S0_yPd'
+#  #0 0x00007999e83a63bf llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xda63bf)
+#  #1 0x00007999e83a44f9 llvm::sys::RunSignalHandlers() (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xda44f9)
+#  #2 0x00007999e83a6b00 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xda6b00)
+#  #3 0x00007999e6e45320 (/lib/x86_64-linux-gnu/libc.so.6+0x45320)
+#  #4 0x00007999e6e9eb1c pthread_kill (/lib/x86_64-linux-gnu/libc.so.6+0x9eb1c)
+#  #5 0x00007999e6e4526e raise (/lib/x86_64-linux-gnu/libc.so.6+0x4526e)
+#  #6 0x00007999e6e288ff abort (/lib/x86_64-linux-gnu/libc.so.6+0x288ff)
+#  #7 0x00007999e82fe0c2 llvm::report_fatal_error(llvm::Twine const&, bool) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xcfe0c2)
+#  #8 0x00007999e8c2f8e3 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x162f8e3)
+#  #9 0x00007999e8c2ed76 llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x162ed76)
+# #10 0x00007999ea1adbcb (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x2badbcb)
+# #11 0x00007999e8c2611f llvm::SelectionDAGISel::DoInstructionSelection() (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x162611f)
+# #12 0x00007999e8c25790 llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x1625790)
+# #13 0x00007999e8c248de llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x16248de)
+# #14 0x00007999e8c22934 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x1622934)
+# #15 0x00007999e87826b9 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x11826b9)
+# #16 0x00007999e84f7772 llvm::FPPassManager::runOnFunction(llvm::Function&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xef7772)
+# #17 0x00007999e84fd2f4 llvm::FPPassManager::runOnModule(llvm::Module&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xefd2f4)
+# #18 0x00007999e84f7e9f llvm::legacy::PassManagerImpl::run(llvm::Module&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xef7e9f)
+# #19 0x00007999e99f7d61 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f7d61)
+# #20 0x00007999e99f8c91 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f8c91)
+# #21 0x00007999e99f8b10 llvm::lto::thinBackend(llvm::lto::Config const&, unsigned int, std::function<llvm::Expected<std::unique_ptr<llvm::CachedFileStream, std::default_delete<llvm::CachedFileStream>>> (unsigned int, llvm::Twine const&)>, llvm::Module&, llvm::ModuleSummaryIndex const&, llvm::DenseMap<llvm::StringRef, std::unordered_set<unsigned long, std::hash<unsigned long>, std::equal_to<unsigned long>, std::allocator<unsigned long>>, llvm::DenseMapInfo<llvm::StringRef, void
+# >, llvm::detail::DenseMapPair<llvm::StringRef, std::unordered_set<unsigned long, std::hash<unsigned long>, std::equal_to<unsigned long>, std::allocator<unsigned long>>>> const&, llvm::DenseMap<unsigned long, llvm::GlobalValueSummary*, llvm::DenseMapInfo<unsigned long, void>, llvm::detail::DenseMapPair<unsigned long, llvm::GlobalValueSummary*>> const&, llvm::MapVector<llvm::StringRef, llvm::BitcodeModule, llvm::DenseMap<llvm::StringRef, unsigned int, llvm::DenseMapInfo<llvm::S
+# tringRef, void>, llvm::detail::DenseMapPair<llvm::StringRef, unsigned int>>, llvm::SmallVector<std::pair<llvm::StringRef, llvm::BitcodeModule>, 0u>>*, std::vector<unsigned char, std::allocator<unsigned char>> const&) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f8b10)
+# #22 0x00007999e99f248d (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f248d)
+# #23 0x00007999e99f1cd6 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0x23f1cd6)
+# #24 0x00007999e82c9beb (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xcc9beb)
+# #25 0x00007999e834ebe3 llvm::ThreadPool::processTasks(llvm::ThreadPoolTaskGroup*) (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xd4ebe3)
+# #26 0x00007999e834f704 (/usr/lib/llvm-18/bin/../lib/libLLVM.so.18.1+0xd4f704)
+# #27 0x00007999e6e9ca94 (/lib/x86_64-linux-gnu/libc.so.6+0x9ca94)
+# #28 0x00007999e6f29c3c (/lib/x86_64-linux-gnu/libc.so.6+0x129c3c)
+# clang++-18: error: unable to execute command: Aborted (core dumped)
+# clang++-18: error: linker command failed due to signal (use -v to see invocation)
+# ^[[A^Cninja: build stopped: interrupted by user.
--- a/docker/keeper/Dockerfile
+++ b/docker/keeper/Dockerfile
@ -1,7 +1,7 @@
 # The Dockerfile.ubuntu exists for the tests/ci/docker_server.py script
 # If the image is built from Dockerfile.alpine, then the `-alpine` suffix is added automatically,
 # so the only purpose of Dockerfile.ubuntu is to push `latest`, `head` and so on w/o suffixes
-FROM ubuntu:20.04 AS glibc-donor
+FROM ubuntu:22.04 AS glibc-donor
 ARG TARGETARCH

 RUN arch=${TARGETARCH:-amd64} \
@ -9,7 +9,11 @@ RUN arch=${TARGETARCH:-amd64} \
        amd64) rarch=x86_64 ;; \
        arm64) rarch=aarch64 ;; \
    esac \
-    && ln -s "${rarch}-linux-gnu" /lib/linux-gnu
+    && ln -s "${rarch}-linux-gnu" /lib/linux-gnu \
+    && case $arch in \
+        amd64) ln /lib/linux-gnu/ld-linux-x86-64.so.2 /lib/linux-gnu/ld-2.35.so ;; \
+        arm64) ln /lib/linux-gnu/ld-linux-aarch64.so.1 /lib/linux-gnu/ld-2.35.so ;; \
+    esac


 FROM alpine
@ -20,7 +24,7 @@ ENV LANG=en_US.UTF-8 \
    TZ=UTC \
    CLICKHOUSE_CONFIG=/etc/clickhouse-server/config.xml

-COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.31.so /lib/
+COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.35.so /lib/
 COPY --from=glibc-donor /etc/nsswitch.conf /etc/
 COPY entrypoint.sh /entrypoint.sh

@ -34,7 +38,7 @@ RUN arch=${TARGETARCH:-amd64} \
 # lts / testing / prestable / etc
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="24.9.2.42"
+ARG VERSION="24.10.1.2812"
 ARG PACKAGES="clickhouse-keeper"
 ARG DIRECT_DOWNLOAD_URLS=""

--- a/docker/server/Dockerfile.alpine
+++ b/docker/server/Dockerfile.alpine
@ -35,7 +35,7 @@ RUN arch=${TARGETARCH:-amd64} \
 # lts / testing / prestable / etc
 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
-ARG VERSION="24.9.2.42"
+ARG VERSION="24.10.1.2812"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
 ARG DIRECT_DOWNLOAD_URLS=""

--- a/docker/server/Dockerfile.ubuntu
+++ b/docker/server/Dockerfile.ubuntu
@ -1,4 +1,4 @@
-FROM ubuntu:20.04
+FROM ubuntu:22.04

 # see https://github.com/moby/moby/issues/4032#issuecomment-192327844
 # It could be removed after we move on a version 23:04+
@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list

 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION="24.9.2.42"
+ARG VERSION="24.10.1.2812"
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

 #docker-official-library:off
--- a/docker/server/README.md
+++ b/docker/server/README.md
@ -20,6 +20,7 @@ For more information and documentation see https://clickhouse.com/.

 -	The amd64 image requires support for [SSE3 instructions](https://en.wikipedia.org/wiki/SSE3). Virtually all x86 CPUs after 2005 support SSE3.
 -	The arm64 image requires support for the [ARMv8.2-A architecture](https://en.wikipedia.org/wiki/AArch64#ARMv8.2-A) and additionally the Load-Acquire RCpc register. The register is optional in version ARMv8.2-A and mandatory in [ARMv8.3-A](https://en.wikipedia.org/wiki/AArch64#ARMv8.3-A). Supported in Graviton >=2, Azure and GCP instances. Examples for unsupported devices are Raspberry Pi 4 (ARMv8.0-A) and Jetson AGX Xavier/Orin (ARMv8.2-A).
+-	Since the Clickhouse 24.11 Ubuntu images started using `ubuntu:22.04` as its base image. It requires docker version >= `20.10.10` containing [patch](https://github.com/moby/moby/commit/977283509f75303bc6612665a04abf76ff1d2468). As a workaround you could use `docker run [--privileged | --security-opt seccomp=unconfined]` instead, however that has security implications.

 ## How to use this image

--- a/docker/test/libfuzzer/Dockerfile
+++ b/docker/test/libfuzzer/Dockerfile
@ -33,8 +33,6 @@ RUN apt-get update \
 COPY requirements.txt /
 RUN pip3 install --no-cache-dir -r /requirements.txt

-ENV FUZZER_ARGS="-max_total_time=60"
-
 SHELL ["/bin/bash", "-c"]

 # docker run --network=host --volume <workspace>:/workspace -e PR_TO_TEST=<> -e SHA_TO_TEST=<> clickhouse/libfuzzer
--- a/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile
+++ b/docker/test/stateless/clickhouse-statelest-test-runner.Dockerfile
@ -1,16 +0,0 @@
-# Since right now we can't set volumes to the docker during build, we split building container in stages:
-# 1. build base container
-# 2. run base conatiner with mounted volumes
-# 3. commit container as image
-FROM ubuntu:20.04 as clickhouse-test-runner-base
-
-# A volume where directory with clickhouse packages to be mounted,
-# for later installing.
-VOLUME /packages
-
-CMD apt-get update ;\
-    DEBIAN_FRONTEND=noninteractive \
-    apt install -y /packages/clickhouse-common-static_*.deb \
-        /packages/clickhouse-client_*.deb \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/*
--- a/docs/changelogs/v24.10.1.2812-stable.md
+++ b/docs/changelogs/v24.10.1.2812-stable.md
--- a/docs/changelogs/v24.3.13.40-lts.md
+++ b/docs/changelogs/v24.3.13.40-lts.md
@ -0,0 +1,31 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v24.3.13.40-lts (7acabd77389) FIXME as compared to v24.3.12.75-lts (7cb5dff8019)
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+* Backported in [#63976](https://github.com/ClickHouse/ClickHouse/issues/63976): Fix intersect parts when restart after drop range. [#63202](https://github.com/ClickHouse/ClickHouse/pull/63202) ([Han Fei](https://github.com/hanfei1991)).
+* Backported in [#71482](https://github.com/ClickHouse/ClickHouse/issues/71482): Fix `Content-Encoding` not sent in some compressed responses. [#64802](https://github.com/ClickHouse/ClickHouse/issues/64802). [#68975](https://github.com/ClickHouse/ClickHouse/pull/68975) ([Konstantin Bogdanov](https://github.com/thevar1able)).
+* Backported in [#70451](https://github.com/ClickHouse/ClickHouse/issues/70451): Fix vrash during insertion into FixedString column in PostgreSQL engine. [#69584](https://github.com/ClickHouse/ClickHouse/pull/69584) ([Pavel Kruglov](https://github.com/Avogar)).
+* Backported in [#70619](https://github.com/ClickHouse/ClickHouse/issues/70619): Fix server segfault on creating a materialized view with two selects and an `INTERSECT`, e.g. `CREATE MATERIALIZED VIEW v0 AS (SELECT 1) INTERSECT (SELECT 1);`. [#70264](https://github.com/ClickHouse/ClickHouse/pull/70264) ([Konstantin Bogdanov](https://github.com/thevar1able)).
+* Backported in [#70877](https://github.com/ClickHouse/ClickHouse/issues/70877): Fix table creation with `CREATE ... AS table_function()` with database `Replicated` and unavailable table function source on secondary replica. [#70511](https://github.com/ClickHouse/ClickHouse/pull/70511) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Backported in [#70571](https://github.com/ClickHouse/ClickHouse/issues/70571): Ignore all output on async insert with `wait_for_async_insert=1`. Closes [#62644](https://github.com/ClickHouse/ClickHouse/issues/62644). [#70530](https://github.com/ClickHouse/ClickHouse/pull/70530) ([Konstantin Bogdanov](https://github.com/thevar1able)).
+* Backported in [#71146](https://github.com/ClickHouse/ClickHouse/issues/71146): Ignore frozen_metadata.txt while traversing shadow directory from system.remote_data_paths. [#70590](https://github.com/ClickHouse/ClickHouse/pull/70590) ([Aleksei Filatov](https://github.com/aalexfvk)).
+* Backported in [#70682](https://github.com/ClickHouse/ClickHouse/issues/70682): Fix creation of stateful window functions on misaligned memory. [#70631](https://github.com/ClickHouse/ClickHouse/pull/70631) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#71113](https://github.com/ClickHouse/ClickHouse/issues/71113): Fix a crash and a leak in AggregateFunctionGroupArraySorted. [#70820](https://github.com/ClickHouse/ClickHouse/pull/70820) ([Michael Kolupaev](https://github.com/al13n321)).
+* Backported in [#70990](https://github.com/ClickHouse/ClickHouse/issues/70990): Fix a logical error due to negative zeros in the two-level hash table. This closes [#70973](https://github.com/ClickHouse/ClickHouse/issues/70973). [#70979](https://github.com/ClickHouse/ClickHouse/pull/70979) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
+* Backported in [#71246](https://github.com/ClickHouse/ClickHouse/issues/71246): Fixed named sessions not being closed and hanging on forever under certain circumstances. [#70998](https://github.com/ClickHouse/ClickHouse/pull/70998) ([Márcio Martins](https://github.com/marcio-absmartly)).
+* Backported in [#71371](https://github.com/ClickHouse/ClickHouse/issues/71371): Add try/catch to data parts destructors to avoid terminate. [#71364](https://github.com/ClickHouse/ClickHouse/pull/71364) ([alesapin](https://github.com/alesapin)).
+* Backported in [#71594](https://github.com/ClickHouse/ClickHouse/issues/71594): Prevent crash in SortCursor with 0 columns (old analyzer). [#71494](https://github.com/ClickHouse/ClickHouse/pull/71494) ([Raúl Marín](https://github.com/Algunenano)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Backported in [#71022](https://github.com/ClickHouse/ClickHouse/issues/71022): Fix dropping of file cache in CHECK query in case of enabled transactions. [#69256](https://github.com/ClickHouse/ClickHouse/pull/69256) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#70384](https://github.com/ClickHouse/ClickHouse/issues/70384): CI: Enable Integration Tests for backport PRs. [#70329](https://github.com/ClickHouse/ClickHouse/pull/70329) ([Max Kainov](https://github.com/maxknv)).
+* Backported in [#70538](https://github.com/ClickHouse/ClickHouse/issues/70538): Remove slow poll() logs in keeper. [#70508](https://github.com/ClickHouse/ClickHouse/pull/70508) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#70971](https://github.com/ClickHouse/ClickHouse/issues/70971): Limiting logging some lines about configs. [#70879](https://github.com/ClickHouse/ClickHouse/pull/70879) ([Yarik Briukhovetskyi](https://github.com/yariks5s)).
+
--- a/docs/changelogs/v24.8.6.70-lts.md
+++ b/docs/changelogs/v24.8.6.70-lts.md
--- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
+++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
@ -4,9 +4,13 @@ sidebar_position: 50
 sidebar_label: EmbeddedRocksDB
 ---

+import CloudNotSupportedBadge from '@theme/badges/CloudNotSupportedBadge';
+
 # EmbeddedRocksDB Engine

-This engine allows integrating ClickHouse with [rocksdb](http://rocksdb.org/).
+<CloudNotSupportedBadge />
+
+This engine allows integrating ClickHouse with [RocksDB](http://rocksdb.org/).

 ## Creating a Table {#creating-a-table}

--- a/docs/en/getting-started/index.md
+++ b/docs/en/getting-started/index.md
@ -23,6 +23,7 @@ functions in ClickHouse. The sample datasets include:
 - The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables
 - The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data
 - The [Laion dataset](../getting-started/example-datasets/laion.md) has an example of [Approximate nearest neighbor search indexes](../engines/table-engines/mergetree-family/annindexes.md) usage
+- The [TPC-H](../getting-started/example-datasets/tpch.md), [TPC-DS](../getting-started/example-datasets/tpcds.md), and [Star Schema (SSB)](../getting-started/example-datasets/star-schema.md) industry benchmarks for analytics databases
 - [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset 
 - [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3
 - [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) shows how to generate random data if none of the above fit your needs.
--- a/docs/en/interfaces/cli.md
+++ b/docs/en/interfaces/cli.md
@ -190,6 +190,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
 - `--config-file` – The name of the configuration file.
 - `--secure` – If specified, will connect to server over secure connection (TLS). You might need to configure your CA certificates in the [configuration file](#configuration_files). The available configuration settings are the same as for [server-side TLS configuration](../operations/server-configuration-parameters/settings.md#openssl).
 - `--history_file` — Path to a file containing command history.
+- `--history_max_entries` — Maximum number of entries in the history file. Default value: 1 000 000.
 - `--param_<name>` — Value for a [query with parameters](#cli-queries-with-parameters).
 - `--hardware-utilization` — Print hardware utilization information in progress bar.
 - `--print-profile-events` – Print `ProfileEvents` packets.
--- a/docs/en/interfaces/prometheus.md
+++ b/docs/en/interfaces/prometheus.md
@ -9,7 +9,7 @@ sidebar_label: Prometheus protocols
 ## Exposing metrics {#expose}

 :::note
-ClickHouse Cloud does not currently support connecting to Prometheus. To be notified when this feature is supported, please contact support@clickhouse.com.
+If you are using ClickHouse Cloud, you can expose metrics to Prometheus using the [Prometheus Integration](/en/integrations/prometheus).
 :::

 ClickHouse can expose its own metrics for scraping from Prometheus:
--- a/docs/en/operations/_troubleshooting.md
+++ b/docs/en/operations/_troubleshooting.md
@ -65,6 +65,34 @@ sudo rm -f /etc/yum.repos.d/clickhouse.repo

 After that follow the [install guide](../getting-started/install.md#from-rpm-packages)

+### You Can't Run Docker Container
+
+You are running a simple `docker run clickhouse/clickhouse-server` and it crashes with a stack trace similar to following:
+
+```
+$ docker run -it clickhouse/clickhouse-server
+........
+2024.11.06 21:04:48.912036 [ 1 ] {} <Information> SentryWriter: Sending crash reports is disabled
+Poco::Exception. Code: 1000, e.code() = 0, System exception: cannot start thread, Stack trace (when copying this message, always include the lines below):
+
+0. Poco::ThreadImpl::startImpl(Poco::SharedPtr<Poco::Runnable, Poco::ReferenceCounter, Poco::ReleasePolicy<Poco::Runnable>>) @ 0x00000000157c7b34
+1. Poco::Thread::start(Poco::Runnable&) @ 0x00000000157c8a0e
+2. BaseDaemon::initializeTerminationAndSignalProcessing() @ 0x000000000d267a14
+3. BaseDaemon::initialize(Poco::Util::Application&) @ 0x000000000d2652cb
+4. DB::Server::initialize(Poco::Util::Application&) @ 0x000000000d128b38
+5. Poco::Util::Application::run() @ 0x000000001581cfda
+6. DB::Server::run() @ 0x000000000d1288f0
+7. Poco::Util::ServerApplication::run(int, char**) @ 0x0000000015825e27
+8. mainEntryClickHouseServer(int, char**) @ 0x000000000d125b38
+9. main @ 0x0000000007ea4eee
+10. ? @ 0x00007f67ff946d90
+11. ? @ 0x00007f67ff946e40
+12. _start @ 0x00000000062e802e
+ (version 24.10.1.2812 (official build))
+```
+
+The reason is an old docker daemon with version lower than `20.10.10`. A way to fix it either upgrading it, or running `docker run [--privileged | --security-opt seccomp=unconfined]`. The latter has security implications.
+
 ## Connecting to the Server {#troubleshooting-accepts-no-connections}

 Possible issues:
--- a/docs/en/operations/query-cache.md
+++ b/docs/en/operations/query-cache.md
@ -25,9 +25,10 @@ Query caches can generally be viewed as transactionally consistent or inconsiste
  slowly enough that the database only needs to compute the report once (represented by the first `SELECT` query). Further queries can be
  served directly from the query cache. In this example, a reasonable validity period could be 30 min.

-Transactionally inconsistent caching is traditionally provided by client tools or proxy packages interacting with the database. As a result,
-the same caching logic and configuration is often duplicated. With ClickHouse's query cache, the caching logic moves to the server side.
-This reduces maintenance effort and avoids redundancy.
+Transactionally inconsistent caching is traditionally provided by client tools or proxy packages (e.g.
+[chproxy](https://www.chproxy.org/configuration/caching/)) interacting with the database. As a result, the same caching logic and
+configuration is often duplicated. With ClickHouse's query cache, the caching logic moves to the server side. This reduces maintenance
+effort and avoids redundancy.

 ## Configuration Settings and Usage

@ -138,7 +139,10 @@ is only cached if the query runs longer than 5 seconds. It is also possible to s
 cached - for that use setting [query_cache_min_query_runs](settings/settings.md#query-cache-min-query-runs).

 Entries in the query cache become stale after a certain time period (time-to-live). By default, this period is 60 seconds but a different
-value can be specified at session, profile or query level using setting [query_cache_ttl](settings/settings.md#query-cache-ttl).
+value can be specified at session, profile or query level using setting [query_cache_ttl](settings/settings.md#query-cache-ttl). The query
+cache evicts entries "lazily", i.e. when an entry becomes stale, it is not immediately removed from the cache. Instead, when a new entry
+is to be inserted into the query cache, the database checks whether the cache has enough free space for the new entry. If this is not the
+case, the database tries to remove all stale entries. If the cache still has not enough free space, the new entry is not inserted.

 Entries in the query cache are compressed by default. This reduces the overall memory consumption at the cost of slower writes into / reads
 from the query cache. To disable compression, use setting [query_cache_compress_entries](settings/settings.md#query-cache-compress-entries).
@ -188,14 +192,9 @@ Also, results of queries with non-deterministic functions are not cached by defa
 To force caching of results of queries with non-deterministic functions regardless, use setting
 [query_cache_nondeterministic_function_handling](settings/settings.md#query-cache-nondeterministic-function-handling).

-Results of queries that involve system tables, e.g. `system.processes` or `information_schema.tables`, are not cached by default. To force
-caching of results of queries with system tables regardless, use setting
-[query_cache_system_table_handling](settings/settings.md#query-cache-system-table-handling).
-
-:::note
-Prior to ClickHouse v23.11, setting 'query_cache_store_results_of_queries_with_nondeterministic_functions = 0 / 1' controlled whether
-results of queries with non-deterministic results were cached. In newer ClickHouse versions, this setting is obsolete and has no effect.
-:::
+Results of queries that involve system tables (e.g. [system.processes](system-tables/processes.md)` or
+[information_schema.tables](system-tables/information_schema.md)) are not cached by default. To force caching of results of queries with
+system tables regardless, use setting [query_cache_system_table_handling](settings/settings.md#query-cache-system-table-handling).

 Finally, entries in the query cache are not shared between users due to security reasons. For example, user A must not be able to bypass a
 row policy on a table by running the same query as another user B for whom no such policy exists. However, if necessary, cache entries can
--- a/docs/en/operations/system-tables/grants.md
+++ b/docs/en/operations/system-tables/grants.md
@ -19,7 +19,7 @@ Columns:
 - `column` ([Nullable](../../sql-reference/data-types/nullable.md)([String](../../sql-reference/data-types/string.md))) — Name of a column to which access is granted.

 - `is_partial_revoke` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Logical value. It shows whether some privileges have been revoked. Possible values:
- `0` — The row describes a partial revoke.
- `1` — The row describes a grant.
+- `0` — The row describes a grant.
+- `1` — The row describes a partial revoke.

 - `grant_option` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Permission is granted `WITH GRANT OPTION`, see [GRANT](../../sql-reference/statements/grant.md#granting-privilege-syntax).
--- a/docs/en/sql-reference/aggregate-functions/reference/anylast.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/anylast.md
@ -17,7 +17,7 @@ anyLast(column) [RESPECT NULLS]
 - `column`: The column name. 

 :::note
-Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the first value passed, regardless of whether it is `NULL` or not.
+Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the last value passed, regardless of whether it is `NULL` or not.
 :::

 **Returned value**
@ -40,4 +40,4 @@ SELECT anyLast(city) FROM any_last_nulls;
 ┌─anyLast(city)─┐
 │ Valencia      │
 └───────────────┘
-```
+```
--- a/docs/en/sql-reference/data-types/dynamic.md
+++ b/docs/en/sql-reference/data-types/dynamic.md
@ -512,6 +512,8 @@ The result of operator `<` for values `d1` with underlying type `T1` and `d2` wi
 - If `T1 = T2 = T`, the result will be `d1.T < d2.T` (underlying values will be compared).
 - If `T1 != T2`, the result will be `T1 < T2` (type names will be compared).

+By default `Dynamic` type is not allowed in `GROUP BY`/`ORDER BY` keys, if you want to use it consider its special comparison rule and enable `allow_suspicious_types_in_group_by`/`allow_suspicious_types_in_order_by` settings.
+
 Examples:
 ```sql
 CREATE TABLE test (d Dynamic) ENGINE=Memory;
@ -535,7 +537,7 @@ SELECT d, dynamicType(d) FROM test;
 ```

 ```sql
-SELECT d, dynamicType(d) FROM test ORDER BY d;
+SELECT d, dynamicType(d) FROM test ORDER BY d SETTINGS allow_suspicious_types_in_order_by=1;
 ```

 ```sql
@ -557,7 +559,7 @@ Example:
 ```sql
 CREATE TABLE test (d Dynamic) ENGINE=Memory;
 INSERT INTO test VALUES (1::UInt32), (1::Int64), (100::UInt32), (100::Int64);
-SELECT d, dynamicType(d) FROM test ORDER by d;
+SELECT d, dynamicType(d) FROM test ORDER BY d SETTINGS allow_suspicious_types_in_order_by=1;
 ```

 ```text
@ -570,7 +572,7 @@ SELECT d, dynamicType(d) FROM test ORDER by d;
 ```

 ```sql
-SELECT d, dynamicType(d) FROM test GROUP by d;
+SELECT d, dynamicType(d) FROM test GROUP by d SETTINGS allow_suspicious_types_in_group_by=1;
 ```

 ```text
@ -582,7 +584,7 @@ SELECT d, dynamicType(d) FROM test GROUP by d;
 └─────┴────────────────┘
 ```

-**Note**: the described comparison rule is not applied during execution of comparison functions like `<`/`>`/`=` and others because of [special work](#using-dynamic-type-in-functions) of functions with `Dynamic` type
+**Note:** the described comparison rule is not applied during execution of comparison functions like `<`/`>`/`=` and others because of [special work](#using-dynamic-type-in-functions) of functions with `Dynamic` type

 ## Reaching the limit in number of different data types stored inside Dynamic

--- a/docs/en/sql-reference/data-types/newjson.md
+++ b/docs/en/sql-reference/data-types/newjson.md
@ -58,10 +58,10 @@ SELECT json FROM test;
 └───────────────────────────────────┘
 ```

-Using CAST from 'String':
+Using CAST from `String`:

 ```sql
-SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json;
+SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON AS json;
 ```

 ```text
@ -70,7 +70,47 @@ SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::JSON as json
 └────────────────────────────────────────────────┘
 ```

-CAST from `JSON`, named `Tuple`, `Map` and `Object('json')` to `JSON` type will be supported later.
+Using CAST from `Tuple`:
+
+```sql
+SELECT (tuple(42 AS b) AS a, [1, 2, 3] AS c, 'Hello, World!' AS d)::JSON AS json;
+```
+
+```text
+┌─json───────────────────────────────────────────┐
+│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │
+└────────────────────────────────────────────────┘
+```
+
+Using CAST from `Map`:
+
+```sql
+SELECT map('a', map('b', 42), 'c', [1,2,3], 'd', 'Hello, World!')::JSON AS json;
+```
+
+```text
+┌─json───────────────────────────────────────────┐
+│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │
+└────────────────────────────────────────────────┘
+```
+
+Using CAST from deprecated `Object('json')`:
+
+```sql
+ SELECT '{"a" : {"b" : 42},"c" : [1, 2, 3], "d" : "Hello, World!"}'::Object('json')::JSON AS json;
+ ```
+
+```text
+┌─json───────────────────────────────────────────┐
+│ {"a":{"b":42},"c":[1,2,3],"d":"Hello, World!"} │
+└────────────────────────────────────────────────┘
+```
+
+:::note
+CAST from `Tuple`/`Map`/`Object('json')` to `JSON` is implemented via serializing the column into `String` column containing JSON objects and deserializing it back to `JSON` type column. 
+:::
+
+CAST between `JSON` types with different arguments will be supported later.

 ## Reading JSON paths as subcolumns

@ -630,6 +670,28 @@ SELECT arrayJoin(distinctJSONPathsAndTypes(json)) FROM s3('s3://clickhouse-publi
 └─arrayJoin(distinctJSONPathsAndTypes(json))──────────────────┘
 ```

+## ALTER MODIFY COLUMN to JSON type
+
+It's possible to alter an existing table and change the type of the column to the new `JSON` type. Right now only alter from `String` type is supported.
+
+**Example**
+
+```sql
+CREATE TABLE test (json String) ENGINE=MergeTree ORDeR BY tuple();
+INSERT INTO test VALUES ('{"a" : 42}'), ('{"a" : 43, "b" : "Hello"}'), ('{"a" : 44, "b" : [1, 2, 3]}')), ('{"c" : "2020-01-01"}');
+ALTER TABLE test MODIFY COLUMN json JSON;
+SELECT json, json.a, json.b, json.c FROM test;
+```
+
+```text
+┌─json─────────────────────────┬─json.a─┬─json.b──┬─json.c─────┐
+│ {"a":"42"}                   │ 42     │ ᴺᵁᴸᴸ    │ ᴺᵁᴸᴸ       │
+│ {"a":"43","b":"Hello"}       │ 43     │ Hello   │ ᴺᵁᴸᴸ       │
+│ {"a":"44","b":["1","2","3"]} │ 44     │ [1,2,3] │ ᴺᵁᴸᴸ       │
+│ {"c":"2020-01-01"}           │ ᴺᵁᴸᴸ   │ ᴺᵁᴸᴸ    │ 2020-01-01 │
+└──────────────────────────────┴────────┴─────────┴────────────┘
+```
+
 ## Tips for better usage of the JSON type

 Before creating `JSON` column and loading data into it, consider the following tips:
--- a/docs/en/sql-reference/data-types/variant.md
+++ b/docs/en/sql-reference/data-types/variant.md
@ -441,6 +441,8 @@ SELECT v, variantType(v) FROM test ORDER by v;
 └─────┴────────────────┘
 ```

+**Note** by default `Variant` type is not allowed in `GROUP BY`/`ORDER BY` keys, if you want to use it consider its special comparison rule and enable `allow_suspicious_types_in_group_by`/`allow_suspicious_types_in_order_by` settings.
+
 ## JSONExtract functions with Variant

 All `JSONExtract*` functions support `Variant` type:
--- a/docs/en/sql-reference/statements/create/view.md
+++ b/docs/en/sql-reference/statements/create/view.md
@ -55,7 +55,7 @@ SELECT * FROM view(column1=value1, column2=value2 ...)
 ## Materialized View

 ``` sql
-CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE]
+CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE]
 [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }]
 AS SELECT ...
 [COMMENT 'comment']
--- a/docs/en/sql-reference/statements/grant.md
+++ b/docs/en/sql-reference/statements/grant.md
@ -117,6 +117,7 @@ GRANT SELECT ON db*.* TO john -- correct
 GRANT SELECT ON *.my_table TO john -- wrong
 GRANT SELECT ON foo*bar TO john -- wrong
 GRANT SELECT ON *suffix TO john -- wrong
+GRANT SELECT(foo) ON db.table* TO john -- wrong
 ```

 ## Privileges
@ -242,10 +243,13 @@ Hierarchy of privileges:
    - `HDFS`
    - `HIVE`
    - `JDBC`
+    - `KAFKA`
    - `MONGO`
    - `MYSQL`
+    - `NATS`
    - `ODBC`
    - `POSTGRES`
+    - `RABBITMQ`
    - `REDIS`
    - `REMOTE`
    - `S3`
@ -524,10 +528,13 @@ Allows using external data sources. Applies to [table engines](../../engines/tab
    - `HDFS`. Level: `GLOBAL`
    - `HIVE`. Level: `GLOBAL`
    - `JDBC`. Level: `GLOBAL`
+    - `KAFKA`. Level: `GLOBAL`
    - `MONGO`. Level: `GLOBAL`
    - `MYSQL`. Level: `GLOBAL`
+    - `NATS`. Level: `GLOBAL`
    - `ODBC`. Level: `GLOBAL`
    - `POSTGRES`. Level: `GLOBAL`
+    - `RABBITMQ`. Level: `GLOBAL`
    - `REDIS`. Level: `GLOBAL`
    - `REMOTE`. Level: `GLOBAL`
    - `S3`. Level: `GLOBAL`
--- a/docs/en/sql-reference/statements/select/order-by.md
+++ b/docs/en/sql-reference/statements/select/order-by.md
@ -291,7 +291,7 @@ All missed values of `expr` column will be filled sequentially and other columns
 To fill multiple columns, add `WITH FILL` modifier with optional parameters after each field name in `ORDER BY` section.

 ``` sql
-ORDER BY expr [WITH FILL] [FROM const_expr] [TO const_expr] [STEP const_numeric_expr], ... exprN [WITH FILL] [FROM expr] [TO expr] [STEP numeric_expr]
+ORDER BY expr [WITH FILL] [FROM const_expr] [TO const_expr] [STEP const_numeric_expr] [STALENESS const_numeric_expr], ... exprN [WITH FILL] [FROM expr] [TO expr] [STEP numeric_expr] [STALENESS numeric_expr]
 [INTERPOLATE [(col [AS expr], ... colN [AS exprN])]]
 ```

@ -300,6 +300,7 @@ When `FROM const_expr` not defined sequence of filling use minimal `expr` field
 When `TO const_expr` not defined sequence of filling use maximum `expr` field value from `ORDER BY`.
 When `STEP const_numeric_expr` defined then `const_numeric_expr` interprets `as is` for numeric types, as `days` for Date type, as `seconds` for DateTime type. It also supports [INTERVAL](https://clickhouse.com/docs/en/sql-reference/data-types/special-data-types/interval/) data type representing time and date intervals.
 When `STEP const_numeric_expr` omitted then sequence of filling use `1.0` for numeric type, `1 day` for Date type and `1 second` for DateTime type.
+When `STALENESS const_numeric_expr` is defined, the query will generate rows until the difference from the previous row in the original data exceeds `const_numeric_expr`.
 `INTERPOLATE` can be applied to columns not participating in `ORDER BY WITH FILL`. Such columns are filled based on previous fields values by applying `expr`. If `expr` is not present will repeat previous value. Omitted list will result in including all allowed columns.

 Example of a query without `WITH FILL`:
@ -497,6 +498,64 @@ Result:
 └────────────┴────────────┴──────────┘
 ```

+Example of a query without `STALENESS`:
+
+``` sql
+SELECT number as key, 5 * number value, 'original' AS source
+FROM numbers(16) WHERE key % 5 == 0
+ORDER BY key WITH FILL;
+```
+
+Result:
+
+``` text
+    ┌─key─┬─value─┬─source───┐
+ 1. │   0 │     0 │ original │
+ 2. │   1 │     0 │          │
+ 3. │   2 │     0 │          │
+ 4. │   3 │     0 │          │
+ 5. │   4 │     0 │          │
+ 6. │   5 │    25 │ original │
+ 7. │   6 │     0 │          │
+ 8. │   7 │     0 │          │
+ 9. │   8 │     0 │          │
+10. │   9 │     0 │          │
+11. │  10 │    50 │ original │
+12. │  11 │     0 │          │
+13. │  12 │     0 │          │
+14. │  13 │     0 │          │
+15. │  14 │     0 │          │
+16. │  15 │    75 │ original │
+    └─────┴───────┴──────────┘
+```
+
+Same query after applying `STALENESS 3`:
+
+``` sql
+SELECT number as key, 5 * number value, 'original' AS source
+FROM numbers(16) WHERE key % 5 == 0
+ORDER BY key WITH FILL STALENESS 3;
+```
+
+Result:
+
+``` text
+    ┌─key─┬─value─┬─source───┐
+ 1. │   0 │     0 │ original │
+ 2. │   1 │     0 │          │
+ 3. │   2 │     0 │          │
+ 4. │   5 │    25 │ original │
+ 5. │   6 │     0 │          │
+ 6. │   7 │     0 │          │
+ 7. │  10 │    50 │ original │
+ 8. │  11 │     0 │          │
+ 9. │  12 │     0 │          │
+10. │  15 │    75 │ original │
+11. │  16 │     0 │          │
+12. │  17 │     0 │          │
+    └─────┴───────┴──────────┘
+```
+
 Example of a query without `INTERPOLATE`:

 ``` sql
--- a/docs/ru/getting-started/install.md
+++ b/docs/ru/getting-started/install.md
@ -95,7 +95,7 @@ sudo yum install -y clickhouse-server clickhouse-client
 sudo systemctl enable clickhouse-server
 sudo systemctl start clickhouse-server
 sudo systemctl status clickhouse-server
-clickhouse-client # илм "clickhouse-client --password" если установлен пароль
+clickhouse-client # или "clickhouse-client --password" если установлен пароль
 ```

 Для использования наиболее свежих версий нужно заменить `stable` на `testing` (рекомендуется для тестовых окружений). Также иногда доступен `prestable`.
--- a/docs/ru/sql-reference/statements/create/view.md
+++ b/docs/ru/sql-reference/statements/create/view.md
@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...)
 ## Материализованные представления {#materialized}

 ``` sql
-CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] 
+CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] 
 [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }] 
 AS SELECT ...
 ```
--- a/docs/ru/sql-reference/statements/grant.md
+++ b/docs/ru/sql-reference/statements/grant.md
@ -192,14 +192,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
    - `addressToSymbol`
    - `demangle`
 - [SOURCES](#grant-sources)
+    - `AZURE`
    - `FILE`
-    - `URL`
-    - `REMOTE`
-    - `MYSQL`
-    - `ODBC`
-    - `JDBC`
    - `HDFS`
+    - `HIVE`
+    - `JDBC`
+    - `KAFKA`
+    - `MONGO`
+    - `MYSQL`
+    - `NATS`
+    - `ODBC`
+    - `POSTGRES`
+    - `RABBITMQ`
+    - `REDIS`
+    - `REMOTE`
    - `S3`
+    - `SQLITE`
+    - `URL`
 - [dictGet](#grant-dictget)

 Примеры того, как трактуется данная иерархия:
@ -461,14 +470,23 @@ GRANT INSERT(x,y) ON db.table TO john
 Разрешает использовать внешние источники данных. Применяется к [движкам таблиц](../../engines/table-engines/index.md) и [табличным функциям](../table-functions/index.md#table-functions).

 - `SOURCES`. Уровень: `GROUP`
+    - `AZURE`. Уровень: `GLOBAL`
    - `FILE`. Уровень: `GLOBAL`
-    - `URL`. Уровень: `GLOBAL`
-    - `REMOTE`. Уровень: `GLOBAL`
-    - `MYSQL`. Уровень: `GLOBAL`
-    - `ODBC`. Уровень: `GLOBAL`
-    - `JDBC`. Уровень: `GLOBAL`
    - `HDFS`. Уровень: `GLOBAL`
+    - `HIVE`. Уровень: `GLOBAL`
+    - `JDBC`. Уровень: `GLOBAL`
+    - `KAFKA`. Уровень: `GLOBAL`
+    - `MONGO`. Уровень: `GLOBAL`
+    - `MYSQL`. Уровень: `GLOBAL`
+    - `NATS`. Уровень: `GLOBAL`
+    - `ODBC`. Уровень: `GLOBAL`
+    - `POSTGRES`. Уровень: `GLOBAL`
+    - `RABBITMQ`. Уровень: `GLOBAL`
+    - `REDIS`. Уровень: `GLOBAL`
+    - `REMOTE`. Уровень: `GLOBAL`
    - `S3`. Уровень: `GLOBAL`
+    - `SQLITE`. Уровень: `GLOBAL`
+    - `URL`. Уровень: `GLOBAL`

 Привилегия `SOURCES` разрешает использование всех источников. Также вы можете присвоить привилегию для каждого источника отдельно. Для использования источников необходимы дополнительные привилегии.

--- a/docs/zh/sql-reference/statements/create/view.md
+++ b/docs/zh/sql-reference/statements/create/view.md
@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...)
 ## Materialized {#materialized}

 ``` sql
-CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
+CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
 ```

 物化视图存储由相应的[SELECT](../../../sql-reference/statements/select/index.md)管理.
--- a/docs/zh/sql-reference/statements/grant.md
+++ b/docs/zh/sql-reference/statements/grant.md
@ -170,14 +170,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
    -   `addressToSymbol`
    -   `demangle`
 -   [SOURCES](#grant-sources)
+    -   `AZURE`
    -   `FILE`
-    -   `URL`
-    -   `REMOTE`
-    -   `YSQL`
-    -   `ODBC`
-    -   `JDBC`
    -   `HDFS`
+    -   `HIVE`
+    -   `JDBC`
+    -   `KAFKA`
+    -   `MONGO`
+    -   `MYSQL`
+    -   `NATS`
+    -   `ODBC`
+    -   `POSTGRES`
+    -   `RABBITMQ`
+    -   `REDIS`
+    -   `REMOTE`
    -   `S3`
+    -   `SQLITE`
+    -   `URL`
 -   [dictGet](#grant-dictget)

 如何对待该层级的示例：
@ -428,14 +437,23 @@ GRANT INSERT(x,y) ON db.table TO john
 允许在 [table engines](../../engines/table-engines/index.md) 和 [table functions](../../sql-reference/table-functions/index.md#table-functions)中使用外部数据源。

 -   `SOURCES`. 级别: `GROUP`
+    -   `AZURE`. 级别: `GLOBAL`
    -   `FILE`. 级别: `GLOBAL`
-    -   `URL`. 级别: `GLOBAL`
-    -   `REMOTE`. 级别: `GLOBAL`
-    -   `YSQL`. 级别: `GLOBAL`
-    -   `ODBC`. 级别: `GLOBAL`
-    -   `JDBC`. 级别: `GLOBAL`
    -   `HDFS`. 级别: `GLOBAL`
+    -   `HIVE`. 级别: `GLOBAL`
+    -   `JDBC`. 级别: `GLOBAL`
+    -   `KAFKA`. 级别: `GLOBAL`
+    -   `MONGO`. 级别: `GLOBAL`
+    -   `MYSQL`. 级别: `GLOBAL`
+    -   `NATS`. 级别: `GLOBAL`
+    -   `ODBC`. 级别: `GLOBAL`
+    -   `POSTGRES`. 级别: `GLOBAL`
+    -   `RABBITMQ`. 级别: `GLOBAL`
+    -   `REDIS`. 级别: `GLOBAL`
+    -   `REMOTE`. 级别: `GLOBAL`
    -   `S3`. 级别: `GLOBAL`
+    -   `SQLITE`. 级别: `GLOBAL`
+    -   `URL`. 级别: `GLOBAL`

 `SOURCES` 权限允许使用所有数据源。当然也可以单独对每个数据源进行授权。要使用数据源时，还需要额外的权限。

--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -192,6 +192,10 @@ void Client::parseConnectionsCredentials(Poco::Util::AbstractConfiguration & con
                history_file = home_path + "/" + history_file.substr(1);
            config.setString("history_file", history_file);
        }
+        if (config.has(prefix + ".history_max_entries"))
+        {
+            config.setUInt("history_max_entries", history_max_entries);
+        }
        if (config.has(prefix + ".accept-invalid-certificate"))
            config.setBool("accept-invalid-certificate", config.getBool(prefix + ".accept-invalid-certificate"));
    }
--- a/programs/disks/DisksApp.cpp
+++ b/programs/disks/DisksApp.cpp
@ -236,6 +236,7 @@ void DisksApp::runInteractiveReplxx()
    ReplxxLineReader lr(
        suggest,
        history_file,
+        history_max_entries,
        /* multiline= */ false,
        /* ignore_shell_suspend= */ false,
        query_extenders,
@ -398,6 +399,8 @@ void DisksApp::initializeHistoryFile()
                throw;
        }
    }
+
+    history_max_entries = config().getUInt("history-max-entries", 1000000);
 }

 void DisksApp::init(const std::vector<String> & common_arguments)
--- a/programs/disks/DisksApp.h
+++ b/programs/disks/DisksApp.h
@ -62,6 +62,8 @@ private:

    // Fields responsible for the REPL work
    String history_file;
+    UInt32 history_max_entries = 0; /// Maximum number of entries in the history file. Needs to be initialized to 0 since we don't have a proper constructor. Worry not, actual value is set within the initializeHistoryFile method.
+
    LineReader::Suggest suggest;
    static LineReader::Patterns query_extenders;
    static LineReader::Patterns query_delimiters;
--- a/programs/keeper-client/KeeperClient.cpp
+++ b/programs/keeper-client/KeeperClient.cpp
@ -243,6 +243,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */)
        }
    }

+    history_max_entries = config().getUInt("history-max-entries", 1000000);
+
    String default_log_level;
    if (config().has("query"))
        /// We don't want to see any information log in query mode, unless it was set explicitly
@ -319,6 +321,7 @@ void KeeperClient::runInteractiveReplxx()
    ReplxxLineReader lr(
        suggest,
        history_file,
+        history_max_entries,
        /* multiline= */ false,
        /* ignore_shell_suspend= */ false,
        query_extenders,
--- a/programs/keeper-client/KeeperClient.h
+++ b/programs/keeper-client/KeeperClient.h
@ -59,6 +59,8 @@ protected:
    std::vector<String> getCompletions(const String & prefix) const;

    String history_file;
+    UInt32 history_max_entries; /// Maximum number of entries in the history file.
+
    LineReader::Suggest suggest;

    zkutil::ZooKeeperArgs zk_args;
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -1353,9 +1353,11 @@ try
    }

    FailPointInjection::enableFromGlobalConfig(config());
+#endif

    memory_worker.start();

+#if defined(OS_LINUX)
    int default_oom_score = 0;

 #if !defined(NDEBUG)
--- a/src/Access/AccessControl.cpp
+++ b/src/Access/AccessControl.cpp
@ -608,7 +608,7 @@ AuthResult AccessControl::authenticate(const Credentials & credentials, const Po
    }
    catch (...)
    {
-        tryLogCurrentException(getLogger(), "from: " + address.toString() + ", user: " + credentials.getUserName()  + ": Authentication failed");
+        tryLogCurrentException(getLogger(), "from: " + address.toString() + ", user: " + credentials.getUserName()  + ": Authentication failed", LogsLevel::information);

        WriteBufferFromOwnString message;
        message << credentials.getUserName() << ": Authentication failed: password is incorrect, or there is no user with such name.";
@ -622,8 +622,9 @@ AuthResult AccessControl::authenticate(const Credentials & credentials, const Po
                << "and deleting this file will reset the password.\n"
                << "See also /etc/clickhouse-server/users.xml on the server where ClickHouse is installed.\n\n";

-        /// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons,
-        /// only the log will show the exact reason.
+        /// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons.
+        /// Only the log ((*), above) will show the exact reason. Note that (*) logs at information level instead of the default error level as
+        /// authentication failures are not an unusual event.
        throw Exception(PreformattedMessage{message.str(),
                                            "{}: Authentication failed: password is incorrect, or there is no user with such name",
                                            std::vector<std::string>{credentials.getUserName()}},
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@ -243,6 +243,9 @@ enum class AccessType : uint8_t
    M(S3, "", GLOBAL, SOURCES) \
    M(HIVE, "", GLOBAL, SOURCES) \
    M(AZURE, "", GLOBAL, SOURCES) \
+    M(KAFKA, "", GLOBAL, SOURCES) \
+    M(NATS, "", GLOBAL, SOURCES) \
+    M(RABBITMQ, "", GLOBAL, SOURCES) \
    M(SOURCES, "", GROUP, ALL) \
    \
    M(CLUSTER, "", GLOBAL, ALL) /* ON CLUSTER queries */ \
--- a/src/Access/ContextAccess.cpp
+++ b/src/Access/ContextAccess.cpp
@ -52,7 +52,10 @@ namespace
        {AccessType::HDFS, "HDFS"},
        {AccessType::S3, "S3"},
        {AccessType::HIVE, "Hive"},
-        {AccessType::AZURE, "AzureBlobStorage"}
+        {AccessType::AZURE, "AzureBlobStorage"},
+        {AccessType::KAFKA, "Kafka"},
+        {AccessType::NATS, "NATS"},
+        {AccessType::RABBITMQ, "RabbitMQ"}
    };


--- a/src/Access/Credentials.h
+++ b/src/Access/Credentials.h
@ -15,6 +15,9 @@ public:
    explicit Credentials() = default;
    explicit Credentials(const String & user_name_);

+    Credentials(const Credentials &) = default;
+    Credentials(Credentials &&) = default;
+
    virtual ~Credentials() = default;

    const String & getUserName() const;
--- a/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp
+++ b/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp
@ -387,7 +387,7 @@ template <typename Value, bool return_float, bool interpolated>
 using FuncQuantileExactWeighted = AggregateFunctionQuantile<
    Value,
    QuantileExactWeighted<Value, interpolated>,
-    NameQuantileExactWeighted,
+    std::conditional_t<interpolated, NameQuantileExactWeightedInterpolated, NameQuantileExactWeighted>,
    true,
    std::conditional_t<return_float, Float64, void>,
    false,
@ -396,7 +396,7 @@ template <typename Value, bool return_float, bool interpolated>
 using FuncQuantilesExactWeighted = AggregateFunctionQuantile<
    Value,
    QuantileExactWeighted<Value, interpolated>,
-    NameQuantilesExactWeighted,
+    std::conditional_t<interpolated, NameQuantilesExactWeightedInterpolated, NameQuantilesExactWeighted>,
    true,
    std::conditional_t<return_float, Float64, void>,
    true,
--- a/src/AggregateFunctions/fuzzers/CMakeLists.txt
+++ b/src/AggregateFunctions/fuzzers/CMakeLists.txt
@ -1,2 +1,2 @@
 clickhouse_add_executable(aggregate_function_state_deserialization_fuzzer aggregate_function_state_deserialization_fuzzer.cpp ${SRCS})
-target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE clickhouse_aggregate_functions)
+target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE clickhouse_aggregate_functions dbms)
--- a/src/Analyzer/QueryNode.h
+++ b/src/Analyzer/QueryNode.h
@ -602,9 +602,21 @@ public:
        return projection_columns;
    }

+    /// Returns true if query node is resolved, false otherwise
+    bool isResolved() const
+    {
+        return !projection_columns.empty();
+    }
+
    /// Resolve query node projection columns
    void resolveProjectionColumns(NamesAndTypes projection_columns_value);

+    /// Clear query node projection columns
+    void clearProjectionColumns()
+    {
+        projection_columns.clear();
+    }
+
    /// Remove unused projection columns
    void removeUnusedProjectionColumns(const std::unordered_set<std::string> & used_projection_columns);

--- a/src/Analyzer/QueryTreeBuilder.cpp
+++ b/src/Analyzer/QueryTreeBuilder.cpp
@ -498,6 +498,8 @@ QueryTreeNodePtr QueryTreeBuilder::buildSortList(const ASTPtr & order_by_express
            sort_node->getFillTo() = buildExpression(order_by_element.getFillTo(), context);
        if (order_by_element.getFillStep())
            sort_node->getFillStep() = buildExpression(order_by_element.getFillStep(), context);
+        if (order_by_element.getFillStaleness())
+            sort_node->getFillStaleness() = buildExpression(order_by_element.getFillStaleness(), context);

        list_node->getNodes().push_back(std::move(sort_node));
    }
--- a/src/Analyzer/Resolve/QueryAnalyzer.cpp
+++ b/src/Analyzer/Resolve/QueryAnalyzer.cpp
@ -103,6 +103,8 @@ namespace Setting
    extern const SettingsBool single_join_prefer_left_table;
    extern const SettingsBool transform_null_in;
    extern const SettingsUInt64 use_structure_from_insertion_table_in_table_functions;
+    extern const SettingsBool allow_suspicious_types_in_group_by;
+    extern const SettingsBool allow_suspicious_types_in_order_by;
    extern const SettingsBool use_concurrency_control;
 }

@ -437,8 +439,13 @@ ProjectionName QueryAnalyzer::calculateWindowProjectionName(const QueryTreeNodeP
    return buffer.str();
 }

-ProjectionName QueryAnalyzer::calculateSortColumnProjectionName(const QueryTreeNodePtr & sort_column_node, const ProjectionName & sort_expression_projection_name,
-    const ProjectionName & fill_from_expression_projection_name, const ProjectionName & fill_to_expression_projection_name, const ProjectionName & fill_step_expression_projection_name)
+ProjectionName QueryAnalyzer::calculateSortColumnProjectionName(
+    const QueryTreeNodePtr & sort_column_node,
+    const ProjectionName & sort_expression_projection_name,
+    const ProjectionName & fill_from_expression_projection_name,
+    const ProjectionName & fill_to_expression_projection_name,
+    const ProjectionName & fill_step_expression_projection_name,
+    const ProjectionName & fill_staleness_expression_projection_name)
 {
    auto & sort_node_typed = sort_column_node->as<SortNode &>();

@ -468,6 +475,9 @@ ProjectionName QueryAnalyzer::calculateSortColumnProjectionName(const QueryTreeN

        if (sort_node_typed.hasFillStep())
            sort_column_projection_name_buffer << " STEP " << fill_step_expression_projection_name;
+
+        if (sort_node_typed.hasFillStaleness())
+            sort_column_projection_name_buffer << " STALENESS " << fill_staleness_expression_projection_name;
    }

    return sort_column_projection_name_buffer.str();
@ -2958,27 +2968,29 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
            /// Replace storage with values storage of insertion block
            if (StoragePtr storage = scope.context->getViewSource())
            {
-                QueryTreeNodePtr table_expression;
-                /// Process possibly nested sub-selects
-                for (auto * query_node = in_second_argument->as<QueryNode>(); query_node; query_node = table_expression->as<QueryNode>())
-                    table_expression = extractLeftTableExpression(query_node->getJoinTree());
+                QueryTreeNodePtr table_expression = in_second_argument;

-                if (table_expression)
+                /// Process possibly nested sub-selects
+                while (table_expression)
                {
-                    if (auto * query_table_node = table_expression->as<TableNode>())
-                    {
-                        if (query_table_node->getStorageID().getFullNameNotQuoted() == storage->getStorageID().getFullNameNotQuoted())
-                        {
-                            auto replacement_table_expression = std::make_shared<TableNode>(storage, scope.context);
-                            if (std::optional<TableExpressionModifiers> table_expression_modifiers = query_table_node->getTableExpressionModifiers())
-                                replacement_table_expression->setTableExpressionModifiers(*table_expression_modifiers);
-                            in_second_argument = in_second_argument->cloneAndReplace(table_expression, std::move(replacement_table_expression));
-                        }
-                    }
+                    if (auto * query_node = table_expression->as<QueryNode>())
+                        table_expression = extractLeftTableExpression(query_node->getJoinTree());
+                    else if (auto * union_node = table_expression->as<UnionNode>())
+                        table_expression = union_node->getQueries().getNodes().at(0);
+                    else
+                        break;
+                }
+
+                TableNode * table_expression_table_node = table_expression ? table_expression->as<TableNode>() : nullptr;
+
+                if (table_expression_table_node &&
+                    table_expression_table_node->getStorageID().getFullNameNotQuoted() == storage->getStorageID().getFullNameNotQuoted())
+                {
+                    auto replacement_table_expression_table_node = table_expression_table_node->clone();
+                    replacement_table_expression_table_node->as<TableNode &>().updateStorage(storage, scope.context);
+                    in_second_argument = in_second_argument->cloneAndReplace(table_expression, std::move(replacement_table_expression_table_node));
                }
            }
-
-            resolveExpressionNode(in_second_argument, scope, false /*allow_lambda_expression*/, true /*allow_table_expression*/);
        }

        /// Edge case when the first argument of IN is scalar subquery.
@ -3998,6 +4010,7 @@ ProjectionNames QueryAnalyzer::resolveSortNodeList(QueryTreeNodePtr & sort_node_
    ProjectionNames fill_from_expression_projection_names;
    ProjectionNames fill_to_expression_projection_names;
    ProjectionNames fill_step_expression_projection_names;
+    ProjectionNames fill_staleness_expression_projection_names;

    auto & sort_node_list_typed = sort_node_list->as<ListNode &>();
    for (auto & node : sort_node_list_typed.getNodes())
@ -4019,6 +4032,8 @@ ProjectionNames QueryAnalyzer::resolveSortNodeList(QueryTreeNodePtr & sort_node_
            sort_node.getExpression() = sort_column_list_node->getNodes().front();
        }

+        validateSortingKeyType(sort_node.getExpression()->getResultType(), scope);
+
        size_t sort_expression_projection_names_size = sort_expression_projection_names.size();
        if (sort_expression_projection_names_size != 1)
            throw Exception(ErrorCodes::LOGICAL_ERROR,
@ -4088,11 +4103,38 @@ ProjectionNames QueryAnalyzer::resolveSortNodeList(QueryTreeNodePtr & sort_node_
                    fill_step_expression_projection_names_size);
        }

+        if (sort_node.hasFillStaleness())
+        {
+            fill_staleness_expression_projection_names = resolveExpressionNode(sort_node.getFillStaleness(), scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/);
+
+            const auto * constant_node = sort_node.getFillStaleness()->as<ConstantNode>();
+            if (!constant_node)
+                throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION,
+                    "Sort FILL STALENESS expression must be constant with numeric or interval type. Actual {}. In scope {}",
+                    sort_node.getFillStaleness()->formatASTForErrorMessage(),
+                    scope.scope_node->formatASTForErrorMessage());
+
+            bool is_number = isColumnedAsNumber(constant_node->getResultType());
+            bool is_interval = WhichDataType(constant_node->getResultType()).isInterval();
+            if (!is_number && !is_interval)
+                throw Exception(ErrorCodes::INVALID_WITH_FILL_EXPRESSION,
+                    "Sort FILL STALENESS expression must be constant with numeric or interval type. Actual {}. In scope {}",
+                    sort_node.getFillStaleness()->formatASTForErrorMessage(),
+                    scope.scope_node->formatASTForErrorMessage());
+
+            size_t fill_staleness_expression_projection_names_size = fill_staleness_expression_projection_names.size();
+            if (fill_staleness_expression_projection_names_size != 1)
+                throw Exception(ErrorCodes::LOGICAL_ERROR,
+                    "Sort FILL STALENESS expression expected 1 projection name. Actual {}",
+                    fill_staleness_expression_projection_names_size);
+        }
+
        auto sort_column_projection_name = calculateSortColumnProjectionName(node,
            sort_expression_projection_names[0],
            fill_from_expression_projection_names.empty() ? "" : fill_from_expression_projection_names.front(),
            fill_to_expression_projection_names.empty() ? "" : fill_to_expression_projection_names.front(),
-            fill_step_expression_projection_names.empty() ? "" : fill_step_expression_projection_names.front());
+            fill_step_expression_projection_names.empty() ? "" : fill_step_expression_projection_names.front(),
+            fill_staleness_expression_projection_names.empty() ? "" : fill_staleness_expression_projection_names.front());

        result_projection_names.push_back(std::move(sort_column_projection_name));

@ -4100,11 +4142,32 @@ ProjectionNames QueryAnalyzer::resolveSortNodeList(QueryTreeNodePtr & sort_node_
        fill_from_expression_projection_names.clear();
        fill_to_expression_projection_names.clear();
        fill_step_expression_projection_names.clear();
+        fill_staleness_expression_projection_names.clear();
    }

    return result_projection_names;
 }

+void QueryAnalyzer::validateSortingKeyType(const DataTypePtr & sorting_key_type, const IdentifierResolveScope & scope) const
+{
+    if (scope.context->getSettingsRef()[Setting::allow_suspicious_types_in_order_by])
+        return;
+
+    auto check = [](const IDataType & type)
+    {
+        if (isDynamic(type) || isVariant(type))
+            throw Exception(
+                ErrorCodes::ILLEGAL_COLUMN,
+                "Data types Variant/Dynamic are not allowed in ORDER BY keys, because it can lead to unexpected results. "
+                "Consider using a subcolumn with a specific data type instead (for example 'column.Int64' or 'json.some.path.:Int64' if "
+                "its a JSON path subcolumn) or casting this column to a specific data type. "
+                "Set setting allow_suspicious_types_in_order_by = 1 in order to allow it");
+    };
+
+    check(*sorting_key_type);
+    sorting_key_type->forEachChild(check);
+}
+
 namespace
 {

@ -4144,11 +4207,12 @@ void QueryAnalyzer::resolveGroupByNode(QueryNode & query_node_typed, IdentifierR
            expandTuplesInList(group_by_list);
        }

-        if (scope.group_by_use_nulls)
+        for (const auto & grouping_set : query_node_typed.getGroupBy().getNodes())
        {
-            for (const auto & grouping_set : query_node_typed.getGroupBy().getNodes())
+            for (const auto & group_by_elem : grouping_set->as<ListNode>()->getNodes())
            {
-                for (const auto & group_by_elem : grouping_set->as<ListNode>()->getNodes())
+                validateGroupByKeyType(group_by_elem->getResultType(), scope);
+                if (scope.group_by_use_nulls)
                    scope.nullable_group_by_keys.insert(group_by_elem);
            }
        }
@ -4164,14 +4228,37 @@ void QueryAnalyzer::resolveGroupByNode(QueryNode & query_node_typed, IdentifierR
        auto & group_by_list = query_node_typed.getGroupBy().getNodes();
        expandTuplesInList(group_by_list);

-        if (scope.group_by_use_nulls)
+        for (const auto & group_by_elem : query_node_typed.getGroupBy().getNodes())
        {
-            for (const auto & group_by_elem : query_node_typed.getGroupBy().getNodes())
+            validateGroupByKeyType(group_by_elem->getResultType(), scope);
+            if (scope.group_by_use_nulls)
                scope.nullable_group_by_keys.insert(group_by_elem);
        }
    }
 }

+/** Validate data types of GROUP BY key.
+  */
+void QueryAnalyzer::validateGroupByKeyType(const DataTypePtr & group_by_key_type, const IdentifierResolveScope & scope) const
+{
+    if (scope.context->getSettingsRef()[Setting::allow_suspicious_types_in_group_by])
+        return;
+
+    auto check = [](const IDataType & type)
+    {
+        if (isDynamic(type) || isVariant(type))
+            throw Exception(
+                ErrorCodes::ILLEGAL_COLUMN,
+                "Data types Variant/Dynamic are not allowed in GROUP BY keys, because it can lead to unexpected results. "
+                "Consider using a subcolumn with a specific data type instead (for example 'column.Int64' or 'json.some.path.:Int64' if "
+                "its a JSON path subcolumn) or casting this column to a specific data type. "
+                "Set setting allow_suspicious_types_in_group_by = 1 in order to allow it");
+    };
+
+    check(*group_by_key_type);
+    group_by_key_type->forEachChild(check);
+}
+
 /** Resolve interpolate columns nodes list.
  */
 void QueryAnalyzer::resolveInterpolateColumnsNodeList(QueryTreeNodePtr & interpolate_node_list, IdentifierResolveScope & scope)
@ -5310,6 +5397,16 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier

    auto & query_node_typed = query_node->as<QueryNode &>();

+    /** It is unsafe to call resolveQuery on already resolved query node, because during identifier resolution process
+      * we replace identifiers with expressions without aliases, also at the end of resolveQuery all aliases from all nodes will be removed.
+      * For subsequent resolveQuery executions it is possible to have wrong projection header, because for nodes
+      * with aliases projection name is alias.
+      *
+      * If for client it is necessary to resolve query node after clone, client must clear projection columns from query node before resolve.
+      */
+    if (query_node_typed.isResolved())
+        return;
+
    if (query_node_typed.isCTE())
        ctes_in_resolve_process.insert(query_node_typed.getCTEName());

@ -5448,16 +5545,13 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
      */
    scope.use_identifier_lookup_to_result_cache = false;

-    if (query_node_typed.getJoinTree())
-    {
-        TableExpressionsAliasVisitor table_expressions_visitor(scope);
-        table_expressions_visitor.visit(query_node_typed.getJoinTree());
+    TableExpressionsAliasVisitor table_expressions_visitor(scope);
+    table_expressions_visitor.visit(query_node_typed.getJoinTree());

-        initializeQueryJoinTreeNode(query_node_typed.getJoinTree(), scope);
-        scope.aliases.alias_name_to_table_expression_node.clear();
+    initializeQueryJoinTreeNode(query_node_typed.getJoinTree(), scope);
+    scope.aliases.alias_name_to_table_expression_node.clear();

-        resolveQueryJoinTreeNode(query_node_typed.getJoinTree(), scope, visitor);
-    }
+    resolveQueryJoinTreeNode(query_node_typed.getJoinTree(), scope, visitor);

    if (!scope.group_by_use_nulls)
        scope.use_identifier_lookup_to_result_cache = true;
@ -5675,6 +5769,9 @@ void QueryAnalyzer::resolveUnion(const QueryTreeNodePtr & union_node, Identifier
 {
    auto & union_node_typed = union_node->as<UnionNode &>();

+    if (union_node_typed.isResolved())
+        return;
+
    if (union_node_typed.isCTE())
        ctes_in_resolve_process.insert(union_node_typed.getCTEName());

--- a/src/Analyzer/Resolve/QueryAnalyzer.h
+++ b/src/Analyzer/Resolve/QueryAnalyzer.h
@ -140,7 +140,8 @@ private:
        const ProjectionName & sort_expression_projection_name,
        const ProjectionName & fill_from_expression_projection_name,
        const ProjectionName & fill_to_expression_projection_name,
-        const ProjectionName & fill_step_expression_projection_name);
+        const ProjectionName & fill_step_expression_projection_name,
+        const ProjectionName & fill_staleness_expression_projection_name);

    QueryTreeNodePtr tryGetLambdaFromSQLUserDefinedFunctions(const std::string & function_name, ContextPtr context);

@ -219,8 +220,12 @@ private:

    ProjectionNames resolveSortNodeList(QueryTreeNodePtr & sort_node_list, IdentifierResolveScope & scope);

+    void validateSortingKeyType(const DataTypePtr & sorting_key_type, const IdentifierResolveScope & scope) const;
+
    void resolveGroupByNode(QueryNode & query_node_typed, IdentifierResolveScope & scope);

+    void validateGroupByKeyType(const DataTypePtr & group_by_key_type, const IdentifierResolveScope & scope) const;
+
    void resolveInterpolateColumnsNodeList(QueryTreeNodePtr & interpolate_node_list, IdentifierResolveScope & scope);

    void resolveWindowNodeList(QueryTreeNodePtr & window_node_list, IdentifierResolveScope & scope);
--- a/src/Analyzer/SortNode.cpp
+++ b/src/Analyzer/SortNode.cpp
@ -69,6 +69,12 @@ void SortNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, si
        buffer << '\n' << std::string(indent + 2, ' ') << "FILL STEP\n";
        getFillStep()->dumpTreeImpl(buffer, format_state, indent + 4);
    }
+
+    if (hasFillStaleness())
+    {
+        buffer << '\n' << std::string(indent + 2, ' ') << "FILL STALENESS\n";
+        getFillStaleness()->dumpTreeImpl(buffer, format_state, indent + 4);
+    }
 }

 bool SortNode::isEqualImpl(const IQueryTreeNode & rhs, CompareOptions) const
@ -132,6 +138,8 @@ ASTPtr SortNode::toASTImpl(const ConvertToASTOptions & options) const
        result->setFillTo(getFillTo()->toAST(options));
    if (hasFillStep())
        result->setFillStep(getFillStep()->toAST(options));
+    if (hasFillStaleness())
+        result->setFillStaleness(getFillStaleness()->toAST(options));

    return result;
 }
--- a/src/Analyzer/SortNode.h
+++ b/src/Analyzer/SortNode.h
@ -105,6 +105,24 @@ public:
        return children[fill_step_child_index];
    }

+    /// Returns true if sort node has fill staleness, false otherwise
+    bool hasFillStaleness() const
+    {
+        return children[fill_staleness_child_index] != nullptr;
+    }
+
+    /// Get fill staleness
+    const QueryTreeNodePtr & getFillStaleness() const
+    {
+        return children[fill_staleness_child_index];
+    }
+
+    /// Get fill staleness
+    QueryTreeNodePtr & getFillStaleness()
+    {
+        return children[fill_staleness_child_index];
+    }
+
    /// Get collator
    const std::shared_ptr<Collator> & getCollator() const
    {
@ -144,7 +162,8 @@ private:
    static constexpr size_t fill_from_child_index = 1;
    static constexpr size_t fill_to_child_index = 2;
    static constexpr size_t fill_step_child_index = 3;
-    static constexpr size_t children_size = fill_step_child_index + 1;
+    static constexpr size_t fill_staleness_child_index = 4;
+    static constexpr size_t children_size = fill_staleness_child_index + 1;

    SortDirection sort_direction = SortDirection::ASCENDING;
    std::optional<SortDirection> nulls_sort_direction;
--- a/src/Analyzer/UnionNode.cpp
+++ b/src/Analyzer/UnionNode.cpp
@ -35,6 +35,7 @@ namespace ErrorCodes
 {
    extern const int TYPE_MISMATCH;
    extern const int BAD_ARGUMENTS;
+    extern const int LOGICAL_ERROR;
 }

 UnionNode::UnionNode(ContextMutablePtr context_, SelectUnionMode union_mode_)
@ -50,6 +51,26 @@ UnionNode::UnionNode(ContextMutablePtr context_, SelectUnionMode union_mode_)
    children[queries_child_index] = std::make_shared<ListNode>();
 }

+bool UnionNode::isResolved() const
+{
+    for (const auto & query_node : getQueries().getNodes())
+    {
+        bool is_resolved = false;
+
+        if (auto * query_node_typed = query_node->as<QueryNode>())
+            is_resolved = query_node_typed->isResolved();
+        else if (auto * union_node_typed = query_node->as<UnionNode>())
+            is_resolved = union_node_typed->isResolved();
+        else
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected query tree node type in UNION node");
+
+        if (!is_resolved)
+            return false;
+    }
+
+    return true;
+}
+
 NamesAndTypes UnionNode::computeProjectionColumns() const
 {
    if (recursive_cte_table)
--- a/src/Analyzer/UnionNode.h
+++ b/src/Analyzer/UnionNode.h
@ -163,6 +163,9 @@ public:
        return children[queries_child_index];
    }

+    /// Returns true if union node is resolved, false otherwise
+    bool isResolved() const;
+
    /// Compute union node projection columns
    NamesAndTypes computeProjectionColumns() const;

--- a/src/Backups/BackupConcurrencyCheck.cpp
+++ b/src/Backups/BackupConcurrencyCheck.cpp
@ -0,0 +1,135 @@
+#include <Backups/BackupConcurrencyCheck.h>
+
+#include <Common/Exception.h>
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CONCURRENT_ACCESS_NOT_SUPPORTED;
+}
+
+
+BackupConcurrencyCheck::BackupConcurrencyCheck(
+    const UUID & backup_or_restore_uuid_,
+    bool is_restore_,
+    bool on_cluster_,
+    bool allow_concurrency_,
+    BackupConcurrencyCounters & counters_)
+    : is_restore(is_restore_), backup_or_restore_uuid(backup_or_restore_uuid_), on_cluster(on_cluster_), counters(counters_)
+{
+    std::lock_guard lock{counters.mutex};
+
+    if (!allow_concurrency_)
+    {
+        bool found_concurrent_operation = false;
+        if (is_restore)
+        {
+            size_t num_local_restores = counters.local_restores;
+            size_t num_on_cluster_restores = counters.on_cluster_restores.size();
+            if (on_cluster)
+            {
+                if (!counters.on_cluster_restores.contains(backup_or_restore_uuid))
+                    ++num_on_cluster_restores;
+            }
+            else
+            {
+                ++num_local_restores;
+            }
+            found_concurrent_operation = (num_local_restores + num_on_cluster_restores > 1);
+        }
+        else
+        {
+            size_t num_local_backups = counters.local_backups;
+            size_t num_on_cluster_backups = counters.on_cluster_backups.size();
+            if (on_cluster)
+            {
+                if (!counters.on_cluster_backups.contains(backup_or_restore_uuid))
+                    ++num_on_cluster_backups;
+            }
+            else
+            {
+                ++num_local_backups;
+            }
+            found_concurrent_operation = (num_local_backups + num_on_cluster_backups > 1);
+        }
+
+        if (found_concurrent_operation)
+            throwConcurrentOperationNotAllowed(is_restore);
+    }
+
+    if (on_cluster)
+    {
+        if (is_restore)
+            ++counters.on_cluster_restores[backup_or_restore_uuid];
+        else
+            ++counters.on_cluster_backups[backup_or_restore_uuid];
+    }
+    else
+    {
+        if (is_restore)
+            ++counters.local_restores;
+        else
+            ++counters.local_backups;
+    }
+}
+
+
+BackupConcurrencyCheck::~BackupConcurrencyCheck()
+{
+    std::lock_guard lock{counters.mutex};
+
+    if (on_cluster)
+    {
+        if (is_restore)
+        {
+            auto it = counters.on_cluster_restores.find(backup_or_restore_uuid);
+            if (it != counters.on_cluster_restores.end())
+            {
+                if (!--it->second)
+                    counters.on_cluster_restores.erase(it);
+            }
+        }
+        else
+        {
+            auto it = counters.on_cluster_backups.find(backup_or_restore_uuid);
+            if (it != counters.on_cluster_backups.end())
+            {
+                if (!--it->second)
+                    counters.on_cluster_backups.erase(it);
+            }
+        }
+    }
+    else
+    {
+        if (is_restore)
+            --counters.local_restores;
+        else
+            --counters.local_backups;
+    }
+}
+
+
+void BackupConcurrencyCheck::throwConcurrentOperationNotAllowed(bool is_restore)
+{
+    throw Exception(
+        ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
+        "Concurrent {} are not allowed, turn on setting '{}'",
+        is_restore ? "restores" : "backups",
+        is_restore ? "allow_concurrent_restores" : "allow_concurrent_backups");
+}
+
+
+BackupConcurrencyCounters::BackupConcurrencyCounters() = default;
+
+
+BackupConcurrencyCounters::~BackupConcurrencyCounters()
+{
+    if (local_backups > 0 || local_restores > 0 || !on_cluster_backups.empty() || !on_cluster_restores.empty())
+        LOG_ERROR(getLogger(__PRETTY_FUNCTION__), "Some backups or restores are processing");
+}
+
+}
--- a/src/Backups/BackupConcurrencyCheck.h
+++ b/src/Backups/BackupConcurrencyCheck.h
@ -0,0 +1,55 @@
+#pragma once
+
+#include <Core/UUID.h>
+#include <base/scope_guard.h>
+#include <mutex>
+#include <unordered_map>
+
+
+namespace DB
+{
+class BackupConcurrencyCounters;
+
+/// Local checker for concurrent BACKUP or RESTORE operations.
+/// This class is used by implementations of IBackupCoordination and IRestoreCoordination
+/// to throw an exception if concurrent backups or restores are not allowed.
+class BackupConcurrencyCheck
+{
+public:
+    /// Checks concurrency of a BACKUP operation or a RESTORE operation.
+    /// Keep a constructed instance of BackupConcurrencyCheck until the operation is done.
+    BackupConcurrencyCheck(
+        const UUID & backup_or_restore_uuid_,
+        bool is_restore_,
+        bool on_cluster_,
+        bool allow_concurrency_,
+        BackupConcurrencyCounters & counters_);
+
+    ~BackupConcurrencyCheck();
+
+    [[noreturn]] static void throwConcurrentOperationNotAllowed(bool is_restore);
+
+private:
+    const bool is_restore;
+    const UUID backup_or_restore_uuid;
+    const bool on_cluster;
+    BackupConcurrencyCounters & counters;
+};
+
+
+class BackupConcurrencyCounters
+{
+public:
+    BackupConcurrencyCounters();
+    ~BackupConcurrencyCounters();
+
+private:
+    friend class BackupConcurrencyCheck;
+    size_t local_backups TSA_GUARDED_BY(mutex) = 0;
+    size_t local_restores TSA_GUARDED_BY(mutex) = 0;
+    std::unordered_map<UUID /* backup_uuid */, size_t /* num_refs */> on_cluster_backups TSA_GUARDED_BY(mutex);
+    std::unordered_map<UUID /* restore_uuid */, size_t /* num_refs */> on_cluster_restores TSA_GUARDED_BY(mutex);
+    std::mutex mutex;
+};
+
+}
--- a/src/Backups/BackupCoordinationCleaner.cpp
+++ b/src/Backups/BackupCoordinationCleaner.cpp
@ -0,0 +1,64 @@
+#include <Backups/BackupCoordinationCleaner.h>
+
+
+namespace DB
+{
+
+BackupCoordinationCleaner::BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_)
+    : zookeeper_path(zookeeper_path_), with_retries(with_retries_), log(log_)
+{
+}
+
+void BackupCoordinationCleaner::cleanup()
+{
+    tryRemoveAllNodes(/* throw_if_error = */ true, /* retries_kind = */ WithRetries::kNormal);
+}
+
+bool BackupCoordinationCleaner::tryCleanupAfterError() noexcept
+{
+    return tryRemoveAllNodes(/* throw_if_error = */ false, /* retries_kind = */ WithRetries::kNormal);
+}
+
+bool BackupCoordinationCleaner::tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind)
+{
+    {
+        std::lock_guard lock{mutex};
+        if (cleanup_result.succeeded)
+            return true;
+        if (cleanup_result.exception)
+        {
+            if (throw_if_error)
+                std::rethrow_exception(cleanup_result.exception);
+            return false;
+        }
+    }
+
+    try
+    {
+        LOG_TRACE(log, "Removing nodes from ZooKeeper");
+        auto holder = with_retries.createRetriesControlHolder("removeAllNodes", retries_kind);
+        holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zookeeper);
+            zookeeper->removeRecursive(zookeeper_path);
+        });
+
+        std::lock_guard lock{mutex};
+        cleanup_result.succeeded = true;
+        return true;
+    }
+    catch (...)
+    {
+        LOG_TRACE(log, "Caught exception while removing nodes from ZooKeeper for this restore: {}",
+                  getCurrentExceptionMessage(/* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true));
+
+        std::lock_guard lock{mutex};
+        cleanup_result.exception = std::current_exception();
+
+        if (throw_if_error)
+            throw;
+        return false;
+    }
+}
+
+}
--- a/src/Backups/BackupCoordinationCleaner.h
+++ b/src/Backups/BackupCoordinationCleaner.h
@ -0,0 +1,40 @@
+#pragma once
+
+#include <Backups/WithRetries.h>
+
+
+namespace DB
+{
+
+/// Removes all the nodes from ZooKeeper used to coordinate a BACKUP ON CLUSTER operation or
+/// a RESTORE ON CLUSTER operation (successful or not).
+/// This class is used by BackupCoordinationOnCluster and RestoreCoordinationOnCluster to cleanup.
+class BackupCoordinationCleaner
+{
+public:
+    BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_);
+
+    void cleanup();
+    bool tryCleanupAfterError() noexcept;
+
+private:
+    bool tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind);
+
+    const String zookeeper_path;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const LoggerPtr log;
+
+    struct CleanupResult
+    {
+        bool succeeded = false;
+        std::exception_ptr exception;
+    };
+    CleanupResult cleanup_result TSA_GUARDED_BY(mutex);
+
+    std::mutex mutex;
+};
+
+}
--- a/src/Backups/BackupCoordinationLocal.cpp
+++ b/src/Backups/BackupCoordinationLocal.cpp
@ -1,5 +1,7 @@
 #include <Backups/BackupCoordinationLocal.h>
+
 #include <Common/Exception.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/logger_useful.h>
 #include <Common/quoteString.h>
 #include <fmt/format.h>
@ -8,27 +10,20 @@
 namespace DB
 {

-BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_)
-    : log(getLogger("BackupCoordinationLocal")), file_infos(plain_backup_)
+BackupCoordinationLocal::BackupCoordinationLocal(
+    const UUID & backup_uuid_,
+    bool is_plain_backup_,
+    bool allow_concurrent_backup_,
+    BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("BackupCoordinationLocal"))
+    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ false, allow_concurrent_backup_, concurrency_counters_)
+    , file_infos(is_plain_backup_)
 {
 }

 BackupCoordinationLocal::~BackupCoordinationLocal() = default;

-void BackupCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void BackupCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings BackupCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings BackupCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+ZooKeeperRetriesInfo BackupCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
    return {};
 }
@ -135,15 +130,4 @@ bool BackupCoordinationLocal::startWritingFile(size_t data_file_index)
    return writing_files.emplace(data_file_index).second;
 }

-
-bool BackupCoordinationLocal::hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const
-{
-    if (num_active_backups > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_backups={}", num_active_backups);
-        return true;
-    }
-    return false;
-}
-
 }
--- a/src/Backups/BackupCoordinationLocal.h
+++ b/src/Backups/BackupCoordinationLocal.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedSQLObjects.h>
@ -21,13 +22,21 @@ namespace DB
 class BackupCoordinationLocal : public IBackupCoordination
 {
 public:
-    explicit BackupCoordinationLocal(bool plain_backup_);
+    explicit BackupCoordinationLocal(
+        const UUID & backup_uuid_,
+        bool is_plain_backup_,
+        bool allow_concurrent_backup_,
+        BackupConcurrencyCounters & concurrency_counters_);
+
    ~BackupCoordinationLocal() override;

-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String &, const String &, bool) override { return {}; }
+    void setBackupQueryWasSentToOtherHosts() override {}
+    bool trySetError(std::exception_ptr) override { return true; }
+    void finish() override {}
+    bool tryFinishAfterError() noexcept override { return true; }
+    void waitForOtherHostsToFinish() override {}
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }

    void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name,
                                const std::vector<PartNameAndChecksum> & part_names_and_checksums) override;
@ -54,17 +63,18 @@ public:
    BackupFileInfos getFileInfosForAllHosts() const override;
    bool startWritingFile(size_t data_file_index) override;

-    bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    LoggerPtr const log;
+    BackupConcurrencyCheck concurrency_check;

-    BackupCoordinationReplicatedTables TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
-    BackupCoordinationReplicatedAccess TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
-    BackupCoordinationReplicatedSQLObjects TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
-    BackupCoordinationFileInfos TSA_GUARDED_BY(file_infos_mutex) file_infos;
+    BackupCoordinationReplicatedTables replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
+    BackupCoordinationReplicatedAccess replicated_access TSA_GUARDED_BY(replicated_access_mutex);
+    BackupCoordinationReplicatedSQLObjects replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex);
+    BackupCoordinationFileInfos file_infos TSA_GUARDED_BY(file_infos_mutex);
    BackupCoordinationKeeperMapTables keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex);
-    std::unordered_set<size_t> TSA_GUARDED_BY(writing_files_mutex) writing_files;
+    std::unordered_set<size_t> writing_files TSA_GUARDED_BY(writing_files_mutex);

    mutable std::mutex replicated_tables_mutex;
    mutable std::mutex replicated_access_mutex;
--- a/src/Backups/BackupCoordinationOnCluster.cpp
+++ b/src/Backups/BackupCoordinationOnCluster.cpp
@ -1,7 +1,4 @@
-#include <Backups/BackupCoordinationRemote.h>
-
-#include <base/hex.h>
-#include <boost/algorithm/string/split.hpp>
+#include <Backups/BackupCoordinationOnCluster.h>

 #include <Access/Common/AccessEntityType.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
@ -26,8 +23,6 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

-namespace Stage = BackupCoordinationStage;
-
 namespace
 {
    using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
@ -149,144 +144,152 @@ namespace
    };
 }

-size_t BackupCoordinationRemote::findCurrentHostIndex(const Strings & all_hosts, const String & current_host)
+Strings BackupCoordinationOnCluster::excludeInitiator(const Strings & all_hosts)
+{
+    Strings all_hosts_without_initiator = all_hosts;
+    bool has_initiator = (std::erase(all_hosts_without_initiator, kInitiator) > 0);
+    chassert(has_initiator);
+    return all_hosts_without_initiator;
+}
+
+size_t BackupCoordinationOnCluster::findCurrentHostIndex(const String & current_host, const Strings & all_hosts)
 {
    auto it = std::find(all_hosts.begin(), all_hosts.end(), current_host);
    if (it == all_hosts.end())
-        return 0;
+        return all_hosts.size();
    return it - all_hosts.begin();
 }

-BackupCoordinationRemote::BackupCoordinationRemote(
-    zkutil::GetZooKeeper get_zookeeper_,
+
+BackupCoordinationOnCluster::BackupCoordinationOnCluster(
+    const UUID & backup_uuid_,
+    bool is_plain_backup_,
    const String & root_zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_,
    const BackupKeeperSettings & keeper_settings_,
-    const String & backup_uuid_,
-    const Strings & all_hosts_,
    const String & current_host_,
-    bool plain_backup_,
-    bool is_internal_,
+    const Strings & all_hosts_,
+    bool allow_concurrent_backup_,
+    BackupConcurrencyCounters & concurrency_counters_,
+    ThreadPoolCallbackRunnerUnsafe<void> schedule_,
    QueryStatusPtr process_list_element_)
    : root_zookeeper_path(root_zookeeper_path_)
-    , zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_)
+    , zookeeper_path(root_zookeeper_path_ + "/backup-" + toString(backup_uuid_))
    , keeper_settings(keeper_settings_)
    , backup_uuid(backup_uuid_)
    , all_hosts(all_hosts_)
+    , all_hosts_without_initiator(excludeInitiator(all_hosts))
    , current_host(current_host_)
-    , current_host_index(findCurrentHostIndex(all_hosts, current_host))
-    , plain_backup(plain_backup_)
-    , is_internal(is_internal_)
-    , log(getLogger("BackupCoordinationRemote"))
-    , with_retries(
-        log,
-        get_zookeeper_,
-        keeper_settings,
-        process_list_element_,
-        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
-        (WithRetries::FaultyKeeper & zk)
-        {
-            /// Recreate this ephemeral node to signal that we are alive.
-            if (my_is_internal)
-            {
-                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
-
-                /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
-                zk->tryRemove(alive_node_path);
-
-                zk->createAncestors(alive_node_path);
-                zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            }
-        })
+    , current_host_index(findCurrentHostIndex(current_host, all_hosts))
+    , plain_backup(is_plain_backup_)
+    , log(getLogger("BackupCoordinationOnCluster"))
+    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
+    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ true, allow_concurrent_backup_, concurrency_counters_)
+    , stage_sync(/* is_restore = */ false, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_backup_, with_retries, schedule_, process_list_element_, log)
+    , cleaner(zookeeper_path, with_retries, log)
 {
    createRootNodes();
-
-    stage_sync.emplace(
-        zookeeper_path,
-        with_retries,
-        log);
 }

-BackupCoordinationRemote::~BackupCoordinationRemote()
+BackupCoordinationOnCluster::~BackupCoordinationOnCluster()
 {
-    try
-    {
-        if (!is_internal)
-            removeAllNodes();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
+    tryFinishImpl();
 }

-void BackupCoordinationRemote::createRootNodes()
+void BackupCoordinationOnCluster::createRootNodes()
 {
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
+    auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization);
    holder.retries_ctl.retryLoop(
    [&, &zk = holder.faulty_zookeeper]()
    {
        with_retries.renewZooKeeper(zk);

        zk->createAncestors(zookeeper_path);
-
-        Coordination::Requests ops;
-        Coordination::Responses responses;
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_part_names", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_mutations", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_data_paths", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/file_infos", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/writing_files", "", zkutil::CreateMode::Persistent));
-        zk->tryMulti(ops, responses);
+        zk->createIfNotExists(zookeeper_path, "");
+        zk->createIfNotExists(zookeeper_path + "/repl_part_names", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_mutations", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_access", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_sql_objects", "");
+        zk->createIfNotExists(zookeeper_path + "/keeper_map_tables", "");
+        zk->createIfNotExists(zookeeper_path + "/file_infos", "");
+        zk->createIfNotExists(zookeeper_path + "/writing_files", "");
    });
 }

-void BackupCoordinationRemote::removeAllNodes()
+Strings BackupCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
 {
-    auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
-    holder.retries_ctl.retryLoop(
-    [&, &zk = holder.faulty_zookeeper]()
+    stage_sync.setStage(new_stage, message);
+
+    if (!sync)
+        return {};
+
+    return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
+}
+
+void BackupCoordinationOnCluster::setBackupQueryWasSentToOtherHosts()
+{
+    backup_query_was_sent_to_other_hosts = true;
+}
+
+bool BackupCoordinationOnCluster::trySetError(std::exception_ptr exception)
+{
+    return stage_sync.trySetError(exception);
+}
+
+void BackupCoordinationOnCluster::finish()
+{
+    bool other_hosts_also_finished = false;
+    stage_sync.finish(other_hosts_also_finished);
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
+        cleaner.cleanup();
+}
+
+bool BackupCoordinationOnCluster::tryFinishAfterError() noexcept
+{
+    return tryFinishImpl();
+}
+
+bool BackupCoordinationOnCluster::tryFinishImpl() noexcept
+{
+    bool other_hosts_also_finished = false;
+    if (!stage_sync.tryFinishAfterError(other_hosts_also_finished))
+        return false;
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
    {
-        /// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore.
-        ///
-        /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
-        /// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination
-        /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part
-        /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that).
-        with_retries.renewZooKeeper(zk);
-        zk->removeRecursive(zookeeper_path);
-    });
+        if (!cleaner.tryCleanupAfterError())
+            return false;
+    }
+
+    return true;
 }

-
-void BackupCoordinationRemote::setStage(const String & new_stage, const String & message)
+void BackupCoordinationOnCluster::waitForOtherHostsToFinish()
 {
-    if (is_internal)
-        stage_sync->set(current_host, new_stage, message);
-    else
-        stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true);
+    if ((current_host != kInitiator) || !backup_query_was_sent_to_other_hosts)
+        return;
+    stage_sync.waitForOtherHostsToFinish();
 }

-void BackupCoordinationRemote::setError(const Exception & exception)
+bool BackupCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
 {
-    stage_sync->setError(current_host, exception);
+    if (current_host != kInitiator)
+        return false;
+    if (!backup_query_was_sent_to_other_hosts)
+        return true;
+    return stage_sync.tryWaitForOtherHostsToFinishAfterError();
 }

-Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait)
+ZooKeeperRetriesInfo BackupCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
 {
-    return stage_sync->wait(all_hosts, stage_to_wait);
+    return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing,
+                                static_cast<UInt64>(keeper_settings.retry_initial_backoff_ms.count()),
+                                static_cast<UInt64>(keeper_settings.retry_max_backoff_ms.count())};
 }

-Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout)
-{
-    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
-}
-
-
-void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
+void BackupCoordinationOnCluster::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
 {
    {
        auto holder = with_retries.createRetriesControlHolder(logging_name + "::create");
@ -301,7 +304,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
    if (value.empty())
        return;

-    size_t max_part_size = keeper_settings.keeper_value_max_size;
+    size_t max_part_size = keeper_settings.value_max_size;
    if (!max_part_size)
        max_part_size = value.size();

@ -324,7 +327,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
    }
 }

-String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const
+String BackupCoordinationOnCluster::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const
 {
    Strings part_names;

@ -357,7 +360,7 @@ String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str
 }


-void BackupCoordinationRemote::addReplicatedPartNames(
+void BackupCoordinationOnCluster::addReplicatedPartNames(
    const String & table_zk_path,
    const String & table_name_for_logs,
    const String & replica_name,
@ -381,14 +384,14 @@ void BackupCoordinationRemote::addReplicatedPartNames(
    });
 }

-Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const
+Strings BackupCoordinationOnCluster::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
    return replicated_tables->getPartNames(table_zk_path, replica_name);
 }

-void BackupCoordinationRemote::addReplicatedMutations(
+void BackupCoordinationOnCluster::addReplicatedMutations(
    const String & table_zk_path,
    const String & table_name_for_logs,
    const String & replica_name,
@ -412,7 +415,7 @@ void BackupCoordinationRemote::addReplicatedMutations(
        });
 }

-std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const
+std::vector<IBackupCoordination::MutationInfo> BackupCoordinationOnCluster::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
@ -420,7 +423,7 @@ std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getRepl
 }


-void BackupCoordinationRemote::addReplicatedDataPath(
+void BackupCoordinationOnCluster::addReplicatedDataPath(
    const String & table_zk_path, const String & data_path)
 {
    {
@ -441,7 +444,7 @@ void BackupCoordinationRemote::addReplicatedDataPath(
    });
 }

-Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk_path) const
+Strings BackupCoordinationOnCluster::getReplicatedDataPaths(const String & table_zk_path) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
@ -449,7 +452,7 @@ Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk
 }


-void BackupCoordinationRemote::prepareReplicatedTables() const
+void BackupCoordinationOnCluster::prepareReplicatedTables() const
 {
    if (replicated_tables)
        return;
@ -536,7 +539,7 @@ void BackupCoordinationRemote::prepareReplicatedTables() const
        replicated_tables->addDataPath(std::move(data_paths));
 }

-void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
+void BackupCoordinationOnCluster::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
 {
    {
        std::lock_guard lock{replicated_access_mutex};
@ -558,14 +561,14 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access
    });
 }

-Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
+Strings BackupCoordinationOnCluster::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
 {
    std::lock_guard lock{replicated_access_mutex};
    prepareReplicatedAccess();
    return replicated_access->getFilePaths(access_zk_path, access_entity_type, current_host);
 }

-void BackupCoordinationRemote::prepareReplicatedAccess() const
+void BackupCoordinationOnCluster::prepareReplicatedAccess() const
 {
    if (replicated_access)
        return;
@ -601,7 +604,7 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const
        replicated_access->addFilePath(std::move(file_path));
 }

-void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
+void BackupCoordinationOnCluster::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
 {
    {
        std::lock_guard lock{replicated_sql_objects_mutex};
@ -631,14 +634,14 @@ void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_
    });
 }

-Strings BackupCoordinationRemote::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
+Strings BackupCoordinationOnCluster::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
 {
    std::lock_guard lock{replicated_sql_objects_mutex};
    prepareReplicatedSQLObjects();
    return replicated_sql_objects->getDirectories(loader_zk_path, object_type, current_host);
 }

-void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
+void BackupCoordinationOnCluster::prepareReplicatedSQLObjects() const
 {
    if (replicated_sql_objects)
        return;
@ -674,7 +677,7 @@ void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
        replicated_sql_objects->addDirectory(std::move(directory));
 }

-void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup)
+void BackupCoordinationOnCluster::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup)
 {
    {
        std::lock_guard lock{keeper_map_tables_mutex};
@ -695,7 +698,7 @@ void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_
    });
 }

-void BackupCoordinationRemote::prepareKeeperMapTables() const
+void BackupCoordinationOnCluster::prepareKeeperMapTables() const
 {
    if (keeper_map_tables)
        return;
@ -740,7 +743,7 @@ void BackupCoordinationRemote::prepareKeeperMapTables() const

 }

-String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zookeeper_root_path) const
+String BackupCoordinationOnCluster::getKeeperMapDataPath(const String & table_zookeeper_root_path) const
 {
    std::lock_guard lock(keeper_map_tables_mutex);
    prepareKeeperMapTables();
@ -748,7 +751,7 @@ String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zooke
 }


-void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
+void BackupCoordinationOnCluster::addFileInfos(BackupFileInfos && file_infos_)
 {
    {
        std::lock_guard lock{file_infos_mutex};
@ -761,21 +764,21 @@ void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
    serializeToMultipleZooKeeperNodes(zookeeper_path + "/file_infos/" + current_host, file_infos_str, "addFileInfos");
 }

-BackupFileInfos BackupCoordinationRemote::getFileInfos() const
+BackupFileInfos BackupCoordinationOnCluster::getFileInfos() const
 {
    std::lock_guard lock{file_infos_mutex};
    prepareFileInfos();
    return file_infos->getFileInfos(current_host);
 }

-BackupFileInfos BackupCoordinationRemote::getFileInfosForAllHosts() const
+BackupFileInfos BackupCoordinationOnCluster::getFileInfosForAllHosts() const
 {
    std::lock_guard lock{file_infos_mutex};
    prepareFileInfos();
    return file_infos->getFileInfosForAllHosts();
 }

-void BackupCoordinationRemote::prepareFileInfos() const
+void BackupCoordinationOnCluster::prepareFileInfos() const
 {
    if (file_infos)
        return;
@ -801,7 +804,7 @@ void BackupCoordinationRemote::prepareFileInfos() const
    }
 }

-bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
+bool BackupCoordinationOnCluster::startWritingFile(size_t data_file_index)
 {
    {
        /// Check if this host is already writing this file.
@ -842,66 +845,4 @@ bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
    }
 }

-bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic<size_t> &) const
-{
-    /// If its internal concurrency will be checked for the base backup
-    if (is_internal)
-        return false;
-
-    std::string backup_stage_path = zookeeper_path + "/stage";
-
-    bool result = false;
-
-    auto holder = with_retries.createRetriesControlHolder("getAllArchiveSuffixes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-    {
-        with_retries.renewZooKeeper(zk);
-
-        if (!zk->exists(root_zookeeper_path))
-            zk->createAncestors(root_zookeeper_path);
-
-        for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-        {
-            Coordination::Stat stat;
-            zk->get(root_zookeeper_path, &stat);
-            Strings existing_backup_paths = zk->getChildren(root_zookeeper_path);
-
-            for (const auto & existing_backup_path : existing_backup_paths)
-            {
-                if (startsWith(existing_backup_path, "restore-"))
-                    continue;
-
-                String existing_backup_uuid = existing_backup_path;
-                existing_backup_uuid.erase(0, String("backup-").size());
-
-                if (existing_backup_uuid == toString(backup_uuid))
-                    continue;
-
-                String status;
-                if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status))
-                {
-                    /// Check if some other backup is in progress
-                    if (status == Stage::SCHEDULED_TO_START)
-                    {
-                        LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid));
-                        result = true;
-                        return;
-                    }
-                }
-            }
-
-            zk->createIfNotExists(backup_stage_path, "");
-            auto code = zk->trySet(backup_stage_path, Stage::SCHEDULED_TO_START, stat.version);
-            if (code == Coordination::Error::ZOK)
-                break;
-            bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-            if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                throw zkutil::KeeperException::fromPath(code, backup_stage_path);
-        }
-    });
-
-    return result;
-}
-
 }
--- a/src/Backups/BackupCoordinationOnCluster.h
+++ b/src/Backups/BackupCoordinationOnCluster.h
@ -1,6 +1,8 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
+#include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedSQLObjects.h>
@ -13,32 +15,35 @@
 namespace DB
 {

-/// We try to store data to zookeeper several times due to possible version conflicts.
-constexpr size_t MAX_ZOOKEEPER_ATTEMPTS = 10;
-
 /// Implementation of the IBackupCoordination interface performing coordination via ZooKeeper. It's necessary for "BACKUP ON CLUSTER".
-class BackupCoordinationRemote : public IBackupCoordination
+class BackupCoordinationOnCluster : public IBackupCoordination
 {
 public:
-    using BackupKeeperSettings = WithRetries::KeeperSettings;
+    /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;

-    BackupCoordinationRemote(
-        zkutil::GetZooKeeper get_zookeeper_,
+    BackupCoordinationOnCluster(
+        const UUID & backup_uuid_,
+        bool is_plain_backup_,
        const String & root_zookeeper_path_,
+        zkutil::GetZooKeeper get_zookeeper_,
        const BackupKeeperSettings & keeper_settings_,
-        const String & backup_uuid_,
-        const Strings & all_hosts_,
        const String & current_host_,
-        bool plain_backup_,
-        bool is_internal_,
+        const Strings & all_hosts_,
+        bool allow_concurrent_backup_,
+        BackupConcurrencyCounters & concurrency_counters_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_);

-    ~BackupCoordinationRemote() override;
+    ~BackupCoordinationOnCluster() override;

-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String & new_stage, const String & message, bool sync) override;
+    void setBackupQueryWasSentToOtherHosts() override;
+    bool trySetError(std::exception_ptr exception) override;
+    void finish() override;
+    bool tryFinishAfterError() noexcept override;
+    void waitForOtherHostsToFinish() override;
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;

    void addReplicatedPartNames(
        const String & table_zk_path,
@ -73,13 +78,14 @@ public:
    BackupFileInfos getFileInfosForAllHosts() const override;
    bool startWritingFile(size_t data_file_index) override;

-    bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

-    static size_t findCurrentHostIndex(const Strings & all_hosts, const String & current_host);
+    static Strings excludeInitiator(const Strings & all_hosts);
+    static size_t findCurrentHostIndex(const String & current_host, const Strings & all_hosts);

 private:
    void createRootNodes();
-    void removeAllNodes();
+    bool tryFinishImpl() noexcept;

    void serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name);
    String deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const;
@ -96,26 +102,27 @@ private:
    const String root_zookeeper_path;
    const String zookeeper_path;
    const BackupKeeperSettings keeper_settings;
-    const String backup_uuid;
+    const UUID backup_uuid;
    const Strings all_hosts;
+    const Strings all_hosts_without_initiator;
    const String current_host;
    const size_t current_host_index;
    const bool plain_backup;
-    const bool is_internal;
    LoggerPtr const log;

-    /// The order of these two fields matters, because stage_sync holds a reference to with_retries object
-    mutable WithRetries with_retries;
-    std::optional<BackupCoordinationStageSync> stage_sync;
+    const WithRetries with_retries;
+    BackupConcurrencyCheck concurrency_check;
+    BackupCoordinationStageSync stage_sync;
+    BackupCoordinationCleaner cleaner;
+    std::atomic<bool> backup_query_was_sent_to_other_hosts = false;

-    mutable std::optional<BackupCoordinationReplicatedTables> TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
-    mutable std::optional<BackupCoordinationReplicatedAccess> TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
-    mutable std::optional<BackupCoordinationReplicatedSQLObjects> TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
-    mutable std::optional<BackupCoordinationFileInfos> TSA_GUARDED_BY(file_infos_mutex) file_infos;
+    mutable std::optional<BackupCoordinationReplicatedTables> replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
+    mutable std::optional<BackupCoordinationReplicatedAccess> replicated_access TSA_GUARDED_BY(replicated_access_mutex);
+    mutable std::optional<BackupCoordinationReplicatedSQLObjects> replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex);
+    mutable std::optional<BackupCoordinationFileInfos> file_infos TSA_GUARDED_BY(file_infos_mutex);
    mutable std::optional<BackupCoordinationKeeperMapTables> keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex);
-    std::unordered_set<size_t> TSA_GUARDED_BY(writing_files_mutex) writing_files;
+    std::unordered_set<size_t> writing_files TSA_GUARDED_BY(writing_files_mutex);

-    mutable std::mutex zookeeper_mutex;
    mutable std::mutex replicated_tables_mutex;
    mutable std::mutex replicated_access_mutex;
    mutable std::mutex replicated_sql_objects_mutex;
--- a/src/Backups/BackupCoordinationStage.h
+++ b/src/Backups/BackupCoordinationStage.h
@ -8,10 +8,6 @@ namespace DB

 namespace BackupCoordinationStage
 {
-    /// This stage is set after concurrency check so ensure we dont start other backup/restores
-    /// when concurrent backup/restores are not allowed
-    constexpr const char * SCHEDULED_TO_START = "scheduled to start";
-
    /// Finding all tables and databases which we're going to put to the backup and collecting their metadata.
    constexpr const char * GATHERING_METADATA = "gathering metadata";

@ -46,10 +42,6 @@ namespace BackupCoordinationStage

    /// Coordination stage meaning that a host finished its work.
    constexpr const char * COMPLETED = "completed";
-
-    /// Coordination stage meaning that backup/restore has failed due to an error
-    /// Check '/error' for the error message
-    constexpr const char * ERROR = "error";
 }

 }
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@ -10,33 +10,193 @@ class BackupCoordinationStageSync
 {
 public:
    BackupCoordinationStageSync(
-        const String & root_zookeeper_path_,
-        WithRetries & with_retries_,
+        bool is_restore_,                    /// true if this is a RESTORE ON CLUSTER command, false if this is a BACKUP ON CLUSTER command
+        const String & zookeeper_path_,      /// path to the "stage" folder in ZooKeeper
+        const String & current_host_,        /// the current host, or an empty string if it's the initiator of the BACKUP/RESTORE ON CLUSTER command
+        const Strings & all_hosts_,          /// all the hosts (including the initiator and the current host) performing the BACKUP/RESTORE ON CLUSTER command
+        bool allow_concurrency_,             /// whether it's allowed to have concurrent backups or restores.
+        const WithRetries & with_retries_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
+        QueryStatusPtr process_list_element_,
        LoggerPtr log_);

+    ~BackupCoordinationStageSync();
+
    /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
-    void set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts = false);
-    void setError(const String & current_host, const Exception & exception);
+    void setStage(const String & stage, const String & stage_result = {});

-    /// Sets the stage of the current host and waits until all hosts come to the same stage.
-    /// The function returns the messages all hosts set when they come to the required stage.
-    Strings wait(const Strings & all_hosts, const String & stage_to_wait);
+    /// Waits until all the specified hosts come to the specified stage.
+    /// The function returns the results which specified hosts set when they came to the required stage.
+    /// If it doesn't happen before the timeout then the function will stop waiting and throw an exception.
+    Strings waitForHostsToReachStage(const String & stage_to_wait, const Strings & hosts, std::optional<std::chrono::milliseconds> timeout = {}) const;

-    /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time.
-    Strings waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout);
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    void waitForOtherHostsToFinish() const;
+
+    /// Lets other host know that the current host has finished its work.
+    void finish(bool & other_hosts_also_finished);
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(std::exception_ptr exception) noexcept;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stops waiting if some host encounters an error or gets cancelled.
+    bool tryWaitForOtherHostsToFinishAfterError() const noexcept;
+
+    /// Lets other host know that the current host has finished its work (as a part of error-handling process).
+    bool tryFinishAfterError(bool & other_hosts_also_finished) noexcept;
+
+    /// Returns a printable name of a specific host. For empty host the function returns "initiator".
+    static String getHostDesc(const String & host);
+    static String getHostsDesc(const Strings & hosts);

 private:
+    /// Initializes the original state. It will be updated then with readCurrentState().
+    void initializeState();
+
+    /// Creates the root node in ZooKeeper.
    void createRootNodes();

-    struct State;
-    State readCurrentState(WithRetries::RetriesControlHolder & retries_control_holder, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
+    /// Atomically creates both 'start' and 'alive' nodes and also checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void createStartAndAliveNodes();
+    void createStartAndAliveNodes(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);

-    Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;
+    /// Deserialize the version of a node stored in the 'start' node.
+    int parseStartNode(const String & start_node_contents, const String & host) const;

-    String zookeeper_path;
-    /// A reference to the field of parent object - BackupCoordinationRemote or RestoreCoordinationRemote
-    WithRetries & with_retries;
-    LoggerPtr log;
+    /// Recreates the 'alive' node if it doesn't exist. It's an ephemeral node so it's removed automatically after disconnections.
+    void createAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void checkConcurrency(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Watching thread periodically reads the current state from ZooKeeper and recreates the 'alive' node.
+    void startWatchingThread();
+    void stopWatchingThread();
+    void watchingThread();
+
+    /// Reads the current state from ZooKeeper without throwing exceptions.
+    void readCurrentState(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+    String getStageNodePath(const String & stage) const;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(const Exception & exception);
+    void setError(const Exception & exception);
+
+    /// Deserializes an error stored in the error node.
+    static std::pair<std::exception_ptr, String> parseErrorNode(const String & error_node_contents);
+
+    /// Reset the `connected` flag for each host.
+    void resetConnectedFlag();
+
+    /// Checks if the current query is cancelled, and if so then the function sets the `cancelled` flag in the current state.
+    void checkIfQueryCancelled();
+
+    /// Checks if the current state contains an error, and if so then the function passes this error to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfError();
+
+    /// Checks if some host was disconnected for too long, and if so then the function generates an error and pass it to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfDisconnectedTooLong();
+
+    /// Used by waitForHostsToReachStage() to check if everything is ready to return.
+    bool checkIfHostsReachStage(const Strings & hosts, const String & stage_to_wait, bool time_is_out, std::optional<std::chrono::milliseconds> timeout, Strings & results) const TSA_REQUIRES(mutex);
+
+    /// Creates the 'finish' node.
+    bool tryFinishImpl();
+    bool tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind);
+    void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Returns the version used by the initiator.
+    int getInitiatorVersion() const;
+
+    /// Waits until all the other hosts finish their work.
+    bool tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::seconds> timeout) const;
+    bool checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::milliseconds> timeout) const TSA_REQUIRES(mutex);
+
+    const bool is_restore;
+    const String operation_name;
+    const String current_host;
+    const String current_host_desc;
+    const Strings all_hosts;
+    const bool allow_concurrency;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const ThreadPoolCallbackRunnerUnsafe<void> schedule;
+    const QueryStatusPtr process_list_element;
+    const LoggerPtr log;
+
+    const std::chrono::seconds failure_after_host_disconnected_for_seconds;
+    const std::chrono::seconds finish_timeout_after_error;
+    const std::chrono::milliseconds sync_period_ms;
+    const size_t max_attempts_after_bad_version;
+
+    /// Paths in ZooKeeper.
+    const std::filesystem::path zookeeper_path;
+    const String root_zookeeper_path;
+    const String operation_node_path;
+    const String operation_node_name;
+    const String stage_node_path;
+    const String start_node_path;
+    const String finish_node_path;
+    const String num_hosts_node_path;
+    const String alive_node_path;
+    const String alive_tracker_node_path;
+    const String error_node_path;
+
+    std::shared_ptr<Poco::Event> zk_nodes_changed;
+
+    /// We store list of previously found ZooKeeper nodes to show better logging messages.
+    Strings zk_nodes;
+
+    /// Information about one host read from ZooKeeper.
+    struct HostInfo
+    {
+        String host;
+        bool started = false;
+        bool connected = false;
+        bool finished = false;
+        int version = 1;
+        std::map<String /* stage */, String /* result */> stages = {}; /// std::map because we need to compare states
+        std::exception_ptr exception = nullptr;
+
+        std::chrono::time_point<std::chrono::system_clock> last_connection_time = {};
+        std::chrono::time_point<std::chrono::steady_clock> last_connection_time_monotonic = {};
+
+        bool operator ==(const HostInfo & other) const;
+        bool operator !=(const HostInfo & other) const;
+    };
+
+    /// Information about all the host participating in the current BACKUP or RESTORE operation.
+    struct State
+    {
+        std::map<String /* host */, HostInfo> hosts; /// std::map because we need to compare states
+        std::optional<String> host_with_error;
+        bool cancelled = false;
+
+        bool operator ==(const State & other) const;
+        bool operator !=(const State & other) const;
+    };
+
+    State state TSA_GUARDED_BY(mutex);
+    mutable std::condition_variable state_changed;
+
+    std::future<void> watching_thread_future;
+    std::atomic<bool> should_stop_watching_thread = false;
+
+    struct FinishResult
+    {
+        bool succeeded = false;
+        std::exception_ptr exception;
+        bool other_hosts_also_finished = false;
+    };
+    FinishResult finish_result TSA_GUARDED_BY(mutex);
+
+    mutable std::mutex mutex;
 };

 }
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@ -102,7 +102,6 @@ BackupEntriesCollector::BackupEntriesCollector(
    , read_settings(read_settings_)
    , context(context_)
    , process_list_element(context->getProcessListElement())
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , collect_metadata_timeout(context->getConfigRef().getUInt64(
          "backups.collect_metadata_timeout", context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000)))
    , attempts_to_collect_metadata_before_sleep(context->getConfigRef().getUInt("backups.attempts_to_collect_metadata_before_sleep", 2))
@ -176,21 +175,7 @@ Strings BackupEntriesCollector::setStage(const String & new_stage, const String
    checkIsQueryCancelled();

    current_stage = new_stage;
-    backup_coordination->setStage(new_stage, message);
-
-    if (new_stage == Stage::formatGatheringMetadata(0))
-    {
-        return backup_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-    }
-    if (new_stage.starts_with(Stage::GATHERING_METADATA))
-    {
-        auto current_time = std::chrono::steady_clock::now();
-        auto end_of_timeout = std::max(current_time, collect_metadata_end_time);
-        return backup_coordination->waitForStage(
-            new_stage, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - current_time));
-    }
-
-    return backup_coordination->waitForStage(new_stage);
+    return backup_coordination->setStage(new_stage, message, /* sync = */ true);
 }

 void BackupEntriesCollector::checkIsQueryCancelled() const
--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@ -111,10 +111,6 @@ private:
    ContextPtr context;
    QueryStatusPtr process_list_element;

-    /// The time a BACKUP ON CLUSTER or RESTORE ON CLUSTER command will wait until all the nodes receive the BACKUP (or RESTORE) query and start working.
-    /// This setting is similar to `distributed_ddl_task_timeout`.
-    const std::chrono::milliseconds on_cluster_first_sync_timeout;
-
    /// The time a BACKUP command will try to collect the metadata of tables & databases.
    const std::chrono::milliseconds collect_metadata_timeout;

--- a/src/Backups/BackupIO.h
+++ b/src/Backups/BackupIO.h
@ -5,6 +5,7 @@

 namespace DB
 {
+
 class IDisk;
 using DiskPtr = std::shared_ptr<IDisk>;
 class SeekableReadBuffer;
@ -63,9 +64,13 @@ public:

    virtual void copyFile(const String & destination, const String & source, size_t size) = 0;

+    /// Removes a file written to the backup, if it still exists.
    virtual void removeFile(const String & file_name) = 0;
    virtual void removeFiles(const Strings & file_names) = 0;

+    /// Removes the backup folder if it's empty or contains empty subfolders.
+    virtual void removeEmptyDirectories() = 0;
+
    virtual const ReadSettings & getReadSettings() const = 0;
    virtual const WriteSettings & getWriteSettings() const = 0;
    virtual size_t getWriteBufferSize() const = 0;
--- a/src/Backups/BackupIO_AzureBlobStorage.h
+++ b/src/Backups/BackupIO_AzureBlobStorage.h
@ -81,6 +81,7 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override {}

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
--- a/src/Backups/BackupIO_Disk.cpp
+++ b/src/Backups/BackupIO_Disk.cpp
@ -91,16 +91,36 @@ std::unique_ptr<WriteBuffer> BackupWriterDisk::writeFile(const String & file_nam
 void BackupWriterDisk::removeFile(const String & file_name)
 {
    disk->removeFileIfExists(root_path / file_name);
-    if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path))
-        disk->removeDirectory(root_path);
 }

 void BackupWriterDisk::removeFiles(const Strings & file_names)
 {
    for (const auto & file_name : file_names)
        disk->removeFileIfExists(root_path / file_name);
-    if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path))
-        disk->removeDirectory(root_path);
+}
+
+void BackupWriterDisk::removeEmptyDirectories()
+{
+    removeEmptyDirectoriesImpl(root_path);
+}
+
+void BackupWriterDisk::removeEmptyDirectoriesImpl(const fs::path & current_dir)
+{
+    if (!disk->existsDirectory(current_dir))
+        return;
+
+    if (disk->isDirectoryEmpty(current_dir))
+    {
+        disk->removeDirectory(current_dir);
+        return;
+    }
+
+    /// Backups are not too deep, so recursion is good enough here.
+    for (auto it = disk->iterateDirectory(current_dir); it->isValid(); it->next())
+        removeEmptyDirectoriesImpl(current_dir / it->name());
+
+    if (disk->isDirectoryEmpty(current_dir))
+        disk->removeDirectory(current_dir);
 }

 void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path,
--- a/src/Backups/BackupIO_Disk.h
+++ b/src/Backups/BackupIO_Disk.h
@ -50,9 +50,11 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override;

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
+    void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir);

    const DiskPtr disk;
    const std::filesystem::path root_path;
--- a/src/Backups/BackupIO_File.cpp
+++ b/src/Backups/BackupIO_File.cpp
@ -106,16 +106,36 @@ std::unique_ptr<WriteBuffer> BackupWriterFile::writeFile(const String & file_nam
 void BackupWriterFile::removeFile(const String & file_name)
 {
    (void)fs::remove(root_path / file_name);
-    if (fs::is_directory(root_path) && fs::is_empty(root_path))
-        (void)fs::remove(root_path);
 }

 void BackupWriterFile::removeFiles(const Strings & file_names)
 {
    for (const auto & file_name : file_names)
        (void)fs::remove(root_path / file_name);
-    if (fs::is_directory(root_path) && fs::is_empty(root_path))
-        (void)fs::remove(root_path);
+}
+
+void BackupWriterFile::removeEmptyDirectories()
+{
+    removeEmptyDirectoriesImpl(root_path);
+}
+
+void BackupWriterFile::removeEmptyDirectoriesImpl(const fs::path & current_dir)
+{
+    if (!fs::is_directory(current_dir))
+        return;
+
+    if (fs::is_empty(current_dir))
+    {
+        (void)fs::remove(current_dir);
+        return;
+    }
+
+    /// Backups are not too deep, so recursion is good enough here.
+    for (const auto & it : std::filesystem::directory_iterator{current_dir})
+        removeEmptyDirectoriesImpl(it.path());
+
+    if (fs::is_empty(current_dir))
+        (void)fs::remove(current_dir);
 }

 void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path,
--- a/src/Backups/BackupIO_File.h
+++ b/src/Backups/BackupIO_File.h
@ -42,9 +42,11 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override;

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
+    void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir);

    const std::filesystem::path root_path;
    const DataSourceDescription data_source_description;
--- a/src/Backups/BackupIO_S3.h
+++ b/src/Backups/BackupIO_S3.h
@ -74,6 +74,7 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override {}

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
--- a/src/Backups/BackupImpl.cpp
+++ b/src/Backups/BackupImpl.cpp
@ -147,11 +147,11 @@ BackupImpl::BackupImpl(

 BackupImpl::~BackupImpl()
 {
-    if ((open_mode == OpenMode::WRITE) && !is_internal_backup && !writing_finalized && !std::uncaught_exceptions() && !std::current_exception())
+    if ((open_mode == OpenMode::WRITE) && !writing_finalized && !corrupted)
    {
        /// It is suspicious to destroy BackupImpl without finalization while writing a backup when there is no exception.
-        LOG_ERROR(log, "BackupImpl is not finalized when destructor is called. Stack trace: {}", StackTrace().toString());
-        chassert(false && "BackupImpl is not finalized when destructor is called.");
+        LOG_ERROR(log, "BackupImpl is not finalized or marked as corrupted when destructor is called. Stack trace: {}", StackTrace().toString());
+        chassert(false, "BackupImpl is not finalized or marked as corrupted when destructor is called.");
    }

    try
@ -196,9 +196,6 @@ void BackupImpl::open()

    if (open_mode == OpenMode::READ)
        readBackupMetadata();
-
-    if ((open_mode == OpenMode::WRITE) && base_backup_info)
-        base_backup_uuid = getBaseBackupUnlocked()->getUUID();
 }

 void BackupImpl::close()
@ -280,6 +277,8 @@ std::shared_ptr<const IBackup> BackupImpl::getBaseBackupUnlocked() const
                toString(base_backup->getUUID()),
                (base_backup_uuid ? toString(*base_backup_uuid) : ""));
        }
+
+        base_backup_uuid = base_backup->getUUID();
    }
    return base_backup;
 }
@ -369,7 +368,7 @@ void BackupImpl::writeBackupMetadata()
        if (base_backup_in_use)
        {
            *out << "<base_backup>" << xml << base_backup_info->toString() << "</base_backup>";
-            *out << "<base_backup_uuid>" << toString(*base_backup_uuid) << "</base_backup_uuid>";
+            *out << "<base_backup_uuid>" << getBaseBackupUnlocked()->getUUID() << "</base_backup_uuid>";
        }
    }

@ -594,9 +593,6 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const

 void BackupImpl::removeLockFile()
 {
-    if (is_internal_backup)
-        return; /// Internal backup must not remove the lock file (it's still used by the initiator).
-
    if (checkLockFile(false))
        writer->removeFile(lock_file_name);
 }
@ -989,8 +985,11 @@ void BackupImpl::finalizeWriting()
    if (open_mode != OpenMode::WRITE)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");

+    if (corrupted)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup can't be finalized after an error happened");
+
    if (writing_finalized)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized");
+        return;

    if (!is_internal_backup)
    {
@ -1015,20 +1014,58 @@ void BackupImpl::setCompressedSize()
 }


-void BackupImpl::tryRemoveAllFiles()
+bool BackupImpl::setIsCorrupted() noexcept
 {
-    if (open_mode != OpenMode::WRITE)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");
-
-    if (is_internal_backup)
-        return;
-
    try
    {
-        LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
+        std::lock_guard lock{mutex};
+        if (open_mode != OpenMode::WRITE)
+        {
+            LOG_ERROR(log, "Backup is not opened for writing. Stack trace: {}", StackTrace().toString());
+            chassert(false, "Backup is not opened for writing when setIsCorrupted() is called");
+            return false;
+        }
+
+        if (writing_finalized)
+        {
+            LOG_WARNING(log, "An error happened after the backup was completed successfully, the backup must be correct!");
+            return false;
+        }
+
+        if (corrupted)
+            return true;
+
+        LOG_WARNING(log, "An error happened, the backup won't be completed");
+
        closeArchive(/* finalize= */ false);

+        corrupted = true;
+        return true;
+    }
+    catch (...)
+    {
+        DB::tryLogCurrentException(log, "Caught exception while setting that the backup was corrupted");
+        return false;
+    }
+}
+
+
+bool BackupImpl::tryRemoveAllFiles() noexcept
+{
+    try
+    {
+        std::lock_guard lock{mutex};
+        if (!corrupted)
+        {
+            LOG_ERROR(log, "Backup is not set as corrupted. Stack trace: {}", StackTrace().toString());
+            chassert(false, "Backup is not set as corrupted when tryRemoveAllFiles() is called");
+            return false;
+        }
+
+        LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
+
        Strings files_to_remove;
+
        if (use_archive)
        {
            files_to_remove.push_back(archive_params.archive_name);
@ -1041,14 +1078,17 @@ void BackupImpl::tryRemoveAllFiles()
        }

        if (!checkLockFile(false))
-            return;
+            return false;

        writer->removeFiles(files_to_remove);
        removeLockFile();
+        writer->removeEmptyDirectories();
+        return true;
    }
    catch (...)
    {
-        DB::tryLogCurrentException(__PRETTY_FUNCTION__);
+        DB::tryLogCurrentException(log, "Caught exception while removing files of a corrupted backup");
+        return false;
    }
 }

--- a/src/Backups/BackupImpl.h
+++ b/src/Backups/BackupImpl.h
@ -86,7 +86,8 @@ public:
    void writeFile(const BackupFileInfo & info, BackupEntryPtr entry) override;
    bool supportsWritingInMultipleThreads() const override { return !use_archive; }
    void finalizeWriting() override;
-    void tryRemoveAllFiles() override;
+    bool setIsCorrupted() noexcept override;
+    bool tryRemoveAllFiles() noexcept override;

 private:
    void open();
@ -146,13 +147,14 @@ private:
    int version;
    mutable std::optional<BackupInfo> base_backup_info;
    mutable std::shared_ptr<const IBackup> base_backup;
-    std::optional<UUID> base_backup_uuid;
+    mutable std::optional<UUID> base_backup_uuid;
    std::shared_ptr<IArchiveReader> archive_reader;
    std::shared_ptr<IArchiveWriter> archive_writer;
    String lock_file_name;
    std::atomic<bool> lock_file_before_first_file_checked = false;

    bool writing_finalized = false;
+    bool corrupted = false;
    bool deduplicate_files = true;
    bool use_same_s3_credentials_for_base_backup = false;
    bool use_same_password_for_base_backup = false;
--- a/src/Backups/BackupKeeperSettings.cpp
+++ b/src/Backups/BackupKeeperSettings.cpp
@ -0,0 +1,58 @@
+#include <Backups/BackupKeeperSettings.h>
+
+#include <Core/Settings.h>
+#include <Interpreters/Context.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+
+namespace DB
+{
+
+namespace Setting
+{
+    extern const SettingsUInt64 backup_restore_keeper_max_retries;
+    extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms;
+    extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms;
+    extern const SettingsUInt64 backup_restore_failure_after_host_disconnected_for_seconds;
+    extern const SettingsUInt64 backup_restore_keeper_max_retries_while_initializing;
+    extern const SettingsUInt64 backup_restore_keeper_max_retries_while_handling_error;
+    extern const SettingsUInt64 backup_restore_finish_timeout_after_error_sec;
+    extern const SettingsUInt64 backup_restore_keeper_value_max_size;
+    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi;
+    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread;
+    extern const SettingsFloat backup_restore_keeper_fault_injection_probability;
+    extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed;
+}
+
+BackupKeeperSettings BackupKeeperSettings::fromContext(const ContextPtr & context)
+{
+    BackupKeeperSettings keeper_settings;
+
+    const auto & settings = context->getSettingsRef();
+    const auto & config = context->getConfigRef();
+
+    keeper_settings.max_retries = settings[Setting::backup_restore_keeper_max_retries];
+    keeper_settings.retry_initial_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_initial_backoff_ms]};
+    keeper_settings.retry_max_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_max_backoff_ms]};
+
+    keeper_settings.failure_after_host_disconnected_for_seconds = std::chrono::seconds{settings[Setting::backup_restore_failure_after_host_disconnected_for_seconds]};
+    keeper_settings.max_retries_while_initializing = settings[Setting::backup_restore_keeper_max_retries_while_initializing];
+    keeper_settings.max_retries_while_handling_error = settings[Setting::backup_restore_keeper_max_retries_while_handling_error];
+    keeper_settings.finish_timeout_after_error = std::chrono::seconds(settings[Setting::backup_restore_finish_timeout_after_error_sec]);
+
+    if (config.has("backups.sync_period_ms"))
+        keeper_settings.sync_period_ms = std::chrono::milliseconds{config.getUInt64("backups.sync_period_ms")};
+
+    if (config.has("backups.max_attempts_after_bad_version"))
+        keeper_settings.max_attempts_after_bad_version = config.getUInt64("backups.max_attempts_after_bad_version");
+
+    keeper_settings.value_max_size = settings[Setting::backup_restore_keeper_value_max_size];
+    keeper_settings.batch_size_for_multi = settings[Setting::backup_restore_batch_size_for_keeper_multi];
+    keeper_settings.batch_size_for_multiread = settings[Setting::backup_restore_batch_size_for_keeper_multiread];
+    keeper_settings.fault_injection_probability = settings[Setting::backup_restore_keeper_fault_injection_probability];
+    keeper_settings.fault_injection_seed = settings[Setting::backup_restore_keeper_fault_injection_seed];
+
+    return keeper_settings;
+}
+
+}
--- a/src/Backups/BackupKeeperSettings.h
+++ b/src/Backups/BackupKeeperSettings.h
@ -0,0 +1,64 @@
+#pragma once
+
+#include <Interpreters/Context_fwd.h>
+
+
+namespace DB
+{
+
+/// Settings for [Zoo]Keeper-related works during BACKUP or RESTORE.
+struct BackupKeeperSettings
+{
+    /// Maximum number of retries in the middle of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Should be big enough so the whole operation won't be cancelled in the middle of it because of a temporary ZooKeeper failure.
+    UInt64 max_retries{1000};
+
+    /// Initial backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_initial_backoff_ms{100};
+
+    /// Max backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_max_backoff_ms{5000};
+
+    /// If a host during BACKUP ON CLUSTER or RESTORE ON CLUSTER doesn't recreate its 'alive' node in ZooKeeper
+    /// for this amount of time then the whole backup or restore is considered as failed.
+    /// Should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure.
+    /// Set to zero to disable (if it's zero and some host crashed then BACKUP ON CLUSTER or RESTORE ON CLUSTER will be waiting
+    /// for the crashed host forever until the operation is explicitly cancelled with KILL QUERY).
+    std::chrono::seconds failure_after_host_disconnected_for_seconds{3600};
+
+    /// Maximum number of retries during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because if the operation is going to fail then it's better if it fails faster.
+    UInt64 max_retries_while_initializing{20};
+
+    /// Maximum number of retries while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because those retries are just for cleanup after the operation has failed already.
+    UInt64 max_retries_while_handling_error{20};
+
+    /// How long the initiator should wait for other host to handle the 'error' node and finish their work.
+    std::chrono::seconds finish_timeout_after_error{180};
+
+    /// How often the "stage" folder in ZooKeeper must be scanned in a background thread to track changes done by other hosts.
+    std::chrono::milliseconds sync_period_ms{5000};
+
+    /// Number of attempts after getting error ZBADVERSION from ZooKeeper.
+    size_t max_attempts_after_bad_version{10};
+
+    /// Maximum size of data of a ZooKeeper's node during backup.
+    UInt64 value_max_size{1048576};
+
+    /// Maximum size of a batch for a multi request.
+    UInt64 batch_size_for_multi{1000};
+
+    /// Maximum size of a batch for a multiread request.
+    UInt64 batch_size_for_multiread{10000};
+
+    /// Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f].
+    Float64 fault_injection_probability{0};
+
+    /// Seed for `fault_injection_probability`: 0 - random seed, otherwise the setting value.
+    UInt64 fault_injection_seed{0};
+
+    static BackupKeeperSettings fromContext(const ContextPtr & context);
+};
+
+}
--- a/src/Backups/BackupSettings.cpp
+++ b/src/Backups/BackupSettings.cpp
@ -74,6 +74,17 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query)
    return res;
 }

+bool BackupSettings::isAsync(const ASTBackupQuery & query)
+{
+    if (query.settings)
+    {
+        const auto * field = query.settings->as<const ASTSetQuery &>().changes.tryGet("async");
+        if (field)
+            return field->safeGet<bool>();
+    }
+    return false; /// `async` is false by default.
+}
+
 void BackupSettings::copySettingsToQuery(ASTBackupQuery & query) const
 {
    auto query_settings = std::make_shared<ASTSetQuery>();
--- a/src/Backups/BackupSettings.h
+++ b/src/Backups/BackupSettings.h
@ -101,6 +101,8 @@ struct BackupSettings
    static BackupSettings fromBackupQuery(const ASTBackupQuery & query);
    void copySettingsToQuery(ASTBackupQuery & query) const;

+    static bool isAsync(const ASTBackupQuery & query);
+
    struct Util
    {
        static std::vector<Strings> clusterHostIDsFromAST(const IAST & ast);
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -23,6 +23,7 @@ using BackupMutablePtr = std::shared_ptr<IBackup>;
 using BackupPtr = std::shared_ptr<const IBackup>;
 class IBackupEntry;
 using BackupEntries = std::vector<std::pair<String, std::shared_ptr<const IBackupEntry>>>;
+class BackupConcurrencyCounters;
 using DataRestoreTasks = std::vector<std::function<void()>>;
 struct ReadSettings;
 class BackupLog;
@ -31,6 +32,10 @@ using ThreadGroupPtr = std::shared_ptr<ThreadGroup>;
 class QueryStatus;
 using QueryStatusPtr = std::shared_ptr<QueryStatus>;
 class ProcessList;
+class Cluster;
+using ClusterPtr = std::shared_ptr<Cluster>;
+class AccessRightsElements;
+struct ZooKeeperRetriesInfo;


 /// Manager of backups and restores: executes backups and restores' threads in the background.
@ -47,18 +52,18 @@ public:
    /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
    /// For asynchronous operations the function throws no exceptions on failure usually,
    /// call getInfo() on a returned operation id to check for errors.
-    BackupOperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
+    std::pair<BackupOperationID, BackupStatus> start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);

    /// Waits until the specified backup or restore operation finishes or stops.
    /// The function returns immediately if the operation is already finished.
-    void wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);
+    BackupStatus wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);

    /// Waits until all running backup and restore operations finish or stop.
    void waitAll();

    /// Cancels the specified backup or restore operation.
    /// The function does nothing if this operation has already finished.
-    void cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);
+    BackupStatus cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);

    /// Cancels all running backup and restore operations.
    void cancelAll(bool wait_ = true);
@ -67,26 +72,32 @@ public:
    std::vector<BackupOperationInfo> getAllInfos() const;

 private:
-    BackupOperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context);
+    std::pair<BackupOperationID, BackupStatus> startMakingBackup(const ASTPtr & query, const ContextPtr & context);
+    struct BackupStarter;
+
+    BackupMutablePtr openBackupForWriting(const BackupInfo & backup_info, const BackupSettings & backup_settings, std::shared_ptr<IBackupCoordination> backup_coordination, const ContextPtr & context) const;

    void doBackup(
-        BackupMutablePtr & backup,
+        BackupMutablePtr backup,
        const std::shared_ptr<ASTBackupQuery> & backup_query,
        const BackupOperationID & backup_id,
        const String & backup_name_for_logging,
-        const BackupInfo & backup_info,
-        BackupSettings backup_settings,
+        const BackupSettings & backup_settings,
        std::shared_ptr<IBackupCoordination> backup_coordination,
-        const ContextPtr & context,
-        ContextMutablePtr mutable_context);
+        ContextMutablePtr context,
+        bool on_cluster,
+        const ClusterPtr & cluster);

    /// Builds file infos for specified backup entries.
    void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination, QueryStatusPtr process_list_element);

    /// Write backup entries to an opened backup.
-    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool internal, QueryStatusPtr process_list_element);
+    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool is_internal_backup, QueryStatusPtr process_list_element);

-    BackupOperationID startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    std::pair<BackupOperationID, BackupStatus> startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    struct RestoreStarter;
+
+    BackupPtr openBackupForReading(const BackupInfo & backup_info, const RestoreSettings & restore_settings, const ContextPtr & context) const;

    void doRestore(
        const std::shared_ptr<ASTBackupQuery> & restore_query,
@ -95,7 +106,17 @@ private:
        const BackupInfo & backup_info,
        RestoreSettings restore_settings,
        std::shared_ptr<IRestoreCoordination> restore_coordination,
-        ContextMutablePtr context);
+        ContextMutablePtr context,
+        bool on_cluster,
+        const ClusterPtr & cluster);
+
+    std::shared_ptr<IBackupCoordination> makeBackupCoordination(bool on_cluster, const BackupSettings & backup_settings, const ContextPtr & context) const;
+    std::shared_ptr<IRestoreCoordination> makeRestoreCoordination(bool on_cluster, const RestoreSettings & restore_settings, const ContextPtr & context) const;
+
+    /// Sends a BACKUP or RESTORE query to other hosts.
+    void sendQueryToOtherHosts(const ASTBackupQuery & backup_or_restore_query, const ClusterPtr & cluster,
+        size_t only_shard_num, size_t only_replica_num, ContextMutablePtr context, const AccessRightsElements & access_to_check,
+        const ZooKeeperRetriesInfo & retries_info) const;

    /// Run data restoring tasks which insert data to tables.
    void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);
@ -139,6 +160,8 @@ private:

    std::shared_ptr<BackupLog> backup_log;
    ProcessList & process_list;
+
+    std::unique_ptr<BackupConcurrencyCounters> concurrency_counters;
 };

 }
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@ -121,8 +121,13 @@ public:
    /// Finalizes writing the backup, should be called after all entries have been successfully written.
    virtual void finalizeWriting() = 0;

-    /// Try to remove all files copied to the backup. Used after an exception or it the backup was cancelled.
-    virtual void tryRemoveAllFiles() = 0;
+    /// Sets that a non-retriable error happened while the backup was being written which means that
+    /// the backup is most likely corrupted and it can't be finalized.
+    /// This function is called while handling an exception or if the backup was cancelled.
+    virtual bool setIsCorrupted() noexcept = 0;
+
+    /// Try to remove all files copied to the backup. Could be used after setIsCorrupted().
+    virtual bool tryRemoveAllFiles() noexcept = 0;
 };

 using BackupPtr = std::shared_ptr<const IBackup>;
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@ -5,26 +5,44 @@

 namespace DB
 {
-class Exception;
 struct BackupFileInfo;
 using BackupFileInfos = std::vector<BackupFileInfo>;
 enum class AccessEntityType : uint8_t;
 enum class UserDefinedSQLObjectType : uint8_t;
+struct ZooKeeperRetriesInfo;

 /// Replicas use this class to coordinate what they're writing to a backup while executing BACKUP ON CLUSTER.
-/// There are two implementation of this interface: BackupCoordinationLocal and BackupCoordinationRemote.
+/// There are two implementation of this interface: BackupCoordinationLocal and BackupCoordinationOnCluster.
 /// BackupCoordinationLocal is used while executing BACKUP without ON CLUSTER and performs coordination in memory.
-/// BackupCoordinationRemote is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
+/// BackupCoordinationOnCluster is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
 class IBackupCoordination
 {
 public:
    virtual ~IBackupCoordination() = default;

    /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;
+
+    /// Sets that the backup query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setBackupQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stops waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;

    struct PartNameAndChecksum
    {
@ -87,9 +105,7 @@ public:
    /// Starts writing a specified file, the function returns false if that file is already being written concurrently.
    virtual bool startWritingFile(size_t data_file_index) = 0;

-    /// This function is used to check if concurrent backups are running
-    /// other than the backup passed to the function
-    virtual bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };

 }
--- a/src/Backups/IRestoreCoordination.h
+++ b/src/Backups/IRestoreCoordination.h
@ -5,26 +5,42 @@

 namespace DB
 {
-class Exception;
 enum class UserDefinedSQLObjectType : uint8_t;
 class ASTCreateQuery;
+struct ZooKeeperRetriesInfo;

 /// Replicas use this class to coordinate what they're reading from a backup while executing RESTORE ON CLUSTER.
-/// There are two implementation of this interface: RestoreCoordinationLocal and RestoreCoordinationRemote.
+/// There are two implementation of this interface: RestoreCoordinationLocal and RestoreCoordinationOnCluster.
 /// RestoreCoordinationLocal is used while executing RESTORE without ON CLUSTER and performs coordination in memory.
-/// RestoreCoordinationRemote is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
+/// RestoreCoordinationOnCluster is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
 class IRestoreCoordination
 {
 public:
    virtual ~IRestoreCoordination() = default;

    /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;

-    static constexpr const char * kErrorStatus = "error";
+    /// Sets that the restore query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setRestoreQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stops waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0;
@ -49,9 +65,7 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    virtual void generateUUIDForTable(ASTCreateQuery & create_query) = 0;

-    /// This function is used to check if concurrent restores are running
-    /// other than the restore passed to the function
-    virtual bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };

 }
--- a/src/Backups/RestoreCoordinationLocal.cpp
+++ b/src/Backups/RestoreCoordinationLocal.cpp
@ -1,32 +1,24 @@
 #include <Backups/RestoreCoordinationLocal.h>
+
 #include <Parsers/ASTCreateQuery.h>
 #include <Parsers/formatAST.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/logger_useful.h>


 namespace DB
 {

-RestoreCoordinationLocal::RestoreCoordinationLocal() : log(getLogger("RestoreCoordinationLocal"))
+RestoreCoordinationLocal::RestoreCoordinationLocal(
+    const UUID & restore_uuid, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("RestoreCoordinationLocal"))
+    , concurrency_check(restore_uuid, /* is_restore = */ true, /* on_cluster = */ false, allow_concurrent_restore_, concurrency_counters_)
 {
 }

 RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;

-void RestoreCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void RestoreCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+ZooKeeperRetriesInfo RestoreCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
    return {};
 }
@ -63,7 +55,7 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
 {
    String query_str = serializeAST(create_query);

-    auto find_in_map = [&]
+    auto find_in_map = [&]() TSA_REQUIRES(mutex)
    {
        auto it = create_query_uuids.find(query_str);
        if (it != create_query_uuids.end())
@ -91,14 +83,4 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
    }
 }

-bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
-{
-    if (num_active_restores > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_restores={}", num_active_restores);
-        return true;
-    }
-    return false;
-}
-
 }
--- a/src/Backups/RestoreCoordinationLocal.h
+++ b/src/Backups/RestoreCoordinationLocal.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
 #include <Parsers/CreateQueryUUIDs.h>
 #include <Common/Logger.h>
 #include <mutex>
@ -12,19 +13,20 @@ namespace DB
 {
 class ASTCreateQuery;

-
 /// Implementation of the IRestoreCoordination interface performing coordination in memory.
 class RestoreCoordinationLocal : public IRestoreCoordination
 {
 public:
-    RestoreCoordinationLocal();
+    RestoreCoordinationLocal(const UUID & restore_uuid_, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_);
    ~RestoreCoordinationLocal() override;

-    /// Sets the current stage and waits for other hosts to come to this stage too.
-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String &, const String &, bool) override { return {}; }
+    void setRestoreQueryWasSentToOtherHosts() override {}
+    bool trySetError(std::exception_ptr) override { return true; }
+    void finish() override {}
+    bool tryFinishAfterError() noexcept override { return true; }
+    void waitForOtherHostsToFinish() override {}
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -49,15 +51,16 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    void generateUUIDForTable(ASTCreateQuery & create_query) override;

-    bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    LoggerPtr const log;
+    BackupConcurrencyCheck concurrency_check;

-    std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases;
-    std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables;
-    std::unordered_map<String, CreateQueryUUIDs> create_query_uuids;
-    std::unordered_set<String /* root_zk_path */> acquired_data_in_keeper_map_tables;
+    std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases TSA_GUARDED_BY(mutex);
+    std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables TSA_GUARDED_BY(mutex);
+    std::unordered_map<String, CreateQueryUUIDs> create_query_uuids TSA_GUARDED_BY(mutex);
+    std::unordered_set<String /* root_zk_path */> acquired_data_in_keeper_map_tables TSA_GUARDED_BY(mutex);

    mutable std::mutex mutex;
 };
--- a/src/Backups/RestoreCoordinationOnCluster.cpp
+++ b/src/Backups/RestoreCoordinationOnCluster.cpp
@ -0,0 +1,318 @@
+#include <Backups/BackupCoordinationOnCluster.h>
+
+#include <Backups/BackupCoordinationStage.h>
+#include <Backups/BackupCoordinationStageSync.h>
+#include <Backups/RestoreCoordinationOnCluster.h>
+#include <Parsers/ASTCreateQuery.h>
+#include <Parsers/CreateQueryUUIDs.h>
+#include <Parsers/formatAST.h>
+#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/escapeForFileName.h>
+
+
+namespace DB
+{
+
+RestoreCoordinationOnCluster::RestoreCoordinationOnCluster(
+    const UUID & restore_uuid_,
+    const String & root_zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_,
+    const BackupKeeperSettings & keeper_settings_,
+    const String & current_host_,
+    const Strings & all_hosts_,
+    bool allow_concurrent_restore_,
+    BackupConcurrencyCounters & concurrency_counters_,
+    ThreadPoolCallbackRunnerUnsafe<void> schedule_,
+    QueryStatusPtr process_list_element_)
+    : root_zookeeper_path(root_zookeeper_path_)
+    , keeper_settings(keeper_settings_)
+    , restore_uuid(restore_uuid_)
+    , zookeeper_path(root_zookeeper_path_ + "/restore-" + toString(restore_uuid_))
+    , all_hosts(all_hosts_)
+    , all_hosts_without_initiator(BackupCoordinationOnCluster::excludeInitiator(all_hosts))
+    , current_host(current_host_)
+    , current_host_index(BackupCoordinationOnCluster::findCurrentHostIndex(current_host, all_hosts))
+    , log(getLogger("RestoreCoordinationOnCluster"))
+    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
+    , concurrency_check(restore_uuid_, /* is_restore = */ true, /* on_cluster = */ true, allow_concurrent_restore_, concurrency_counters_)
+    , stage_sync(/* is_restore = */ true, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_restore_, with_retries, schedule_, process_list_element_, log)
+    , cleaner(zookeeper_path, with_retries, log)
+{
+    createRootNodes();
+}
+
+RestoreCoordinationOnCluster::~RestoreCoordinationOnCluster()
+{
+    tryFinishImpl();
+}
+
+void RestoreCoordinationOnCluster::createRootNodes()
+{
+    auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization);
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            zk->createAncestors(zookeeper_path);
+            zk->createIfNotExists(zookeeper_path, "");
+            zk->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", "");
+            zk->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", "");
+            zk->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", "");
+            zk->createIfNotExists(zookeeper_path + "/repl_sql_objects_acquired", "");
+            zk->createIfNotExists(zookeeper_path + "/keeper_map_tables", "");
+            zk->createIfNotExists(zookeeper_path + "/table_uuids", "");
+        });
+}
+
+Strings RestoreCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
+{
+    stage_sync.setStage(new_stage, message);
+
+    if (!sync)
+        return {};
+
+    return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
+}
+
+void RestoreCoordinationOnCluster::setRestoreQueryWasSentToOtherHosts()
+{
+    restore_query_was_sent_to_other_hosts = true;
+}
+
+bool RestoreCoordinationOnCluster::trySetError(std::exception_ptr exception)
+{
+    return stage_sync.trySetError(exception);
+}
+
+void RestoreCoordinationOnCluster::finish()
+{
+    bool other_hosts_also_finished = false;
+    stage_sync.finish(other_hosts_also_finished);
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !restore_query_was_sent_to_other_hosts))
+        cleaner.cleanup();
+}
+
+bool RestoreCoordinationOnCluster::tryFinishAfterError() noexcept
+{
+    return tryFinishImpl();
+}
+
+bool RestoreCoordinationOnCluster::tryFinishImpl() noexcept
+{
+    bool other_hosts_also_finished = false;
+    if (!stage_sync.tryFinishAfterError(other_hosts_also_finished))
+        return false;
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !restore_query_was_sent_to_other_hosts))
+    {
+        if (!cleaner.tryCleanupAfterError())
+            return false;
+    }
+
+    return true;
+}
+
+void RestoreCoordinationOnCluster::waitForOtherHostsToFinish()
+{
+    if ((current_host != kInitiator) || !restore_query_was_sent_to_other_hosts)
+        return;
+    stage_sync.waitForOtherHostsToFinish();
+}
+
+bool RestoreCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
+{
+    if (current_host != kInitiator)
+        return false;
+    if (!restore_query_was_sent_to_other_hosts)
+        return true;
+    return stage_sync.tryWaitForOtherHostsToFinishAfterError();
+}
+
+ZooKeeperRetriesInfo RestoreCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
+{
+    return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing,
+                                static_cast<UInt64>(keeper_settings.retry_initial_backoff_ms.count()),
+                                static_cast<UInt64>(keeper_settings.retry_max_backoff_ms.count())};
+}
+
+bool RestoreCoordinationOnCluster::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
+{
+    bool result = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
+            zk->createIfNotExists(path, "");
+
+            path += "/" + escapeForFileName(table_name);
+            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
+            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
+                throw zkutil::KeeperException::fromPath(code, path);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                result = true;
+                return;
+            }
+
+            /// We need to check who created that node
+            result = zk->get(path) == toString(current_host_index);
+        });
+    return result;
+}
+
+bool RestoreCoordinationOnCluster::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
+{
+    bool result = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
+            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
+            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
+                throw zkutil::KeeperException::fromPath(code, path);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                result = true;
+                return;
+            }
+
+            /// We need to check who created that node
+            result = zk->get(path) == toString(current_host_index);
+        });
+    return result;
+}
+
+bool RestoreCoordinationOnCluster::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
+{
+    bool result = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
+            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
+            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
+                throw zkutil::KeeperException::fromPath(code, path);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                result = true;
+                return;
+            }
+
+            /// We need to check who created that node
+            result = zk->get(path) == toString(current_host_index);
+        });
+    return result;
+}
+
+bool RestoreCoordinationOnCluster::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
+{
+    bool result = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
+            zk->createIfNotExists(path, "");
+
+            path += "/";
+            switch (object_type)
+            {
+                case UserDefinedSQLObjectType::Function:
+                    path += "functions";
+                    break;
+            }
+
+            auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
+            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
+                throw zkutil::KeeperException::fromPath(code, path);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                result = true;
+                return;
+            }
+
+            /// We need to check who created that node
+            result = zk->get(path) == toString(current_host_index);
+        });
+    return result;
+}
+
+bool RestoreCoordinationOnCluster::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id)
+{
+    bool lock_acquired = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            /// we need to remove leading '/' from root_zk_path
+            auto normalized_root_zk_path = root_zk_path.substr(1);
+            std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path);
+            zk->createAncestors(restore_lock_path);
+            auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                lock_acquired = true;
+                return;
+            }
+
+            if (code == Coordination::Error::ZNODEEXISTS)
+                lock_acquired = table_unique_id == zk->get(restore_lock_path);
+            else
+                zkutil::KeeperException::fromPath(code, restore_lock_path);
+        });
+    return lock_acquired;
+}
+
+void RestoreCoordinationOnCluster::generateUUIDForTable(ASTCreateQuery & create_query)
+{
+    String query_str = serializeAST(create_query);
+    CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true};
+    String new_uuids_str = new_uuids.toString();
+
+    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
+            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);
+
+            if (res == Coordination::Error::ZOK)
+            {
+                new_uuids.copyToQuery(create_query);
+                return;
+            }
+
+            if (res == Coordination::Error::ZNODEEXISTS)
+            {
+                CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query);
+                return;
+            }
+
+            zkutil::KeeperException::fromPath(res, path);
+        });
+}
+
+}
--- a/src/Backups/RestoreCoordinationOnCluster.h
+++ b/src/Backups/RestoreCoordinationOnCluster.h
@ -1,6 +1,8 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
+#include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationStageSync.h>
 #include <Backups/WithRetries.h>

@ -9,28 +11,33 @@ namespace DB
 {

 /// Implementation of the IRestoreCoordination interface performing coordination via ZooKeeper. It's necessary for "RESTORE ON CLUSTER".
-class RestoreCoordinationRemote : public IRestoreCoordination
+class RestoreCoordinationOnCluster : public IRestoreCoordination
 {
 public:
-    using RestoreKeeperSettings = WithRetries::KeeperSettings;
+    /// Empty string as the current host is used to mark the initiator of a RESTORE ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;

-    RestoreCoordinationRemote(
-        zkutil::GetZooKeeper get_zookeeper_,
+    RestoreCoordinationOnCluster(
+        const UUID & restore_uuid_,
        const String & root_zookeeper_path_,
-        const RestoreKeeperSettings & keeper_settings_,
-        const String & restore_uuid_,
-        const Strings & all_hosts_,
+        zkutil::GetZooKeeper get_zookeeper_,
+        const BackupKeeperSettings & keeper_settings_,
        const String & current_host_,
-        bool is_internal_,
+        const Strings & all_hosts_,
+        bool allow_concurrent_restore_,
+        BackupConcurrencyCounters & concurrency_counters_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_);

-    ~RestoreCoordinationRemote() override;
+    ~RestoreCoordinationOnCluster() override;

-    /// Sets the current stage and waits for other hosts to come to this stage too.
-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String & new_stage, const String & message, bool sync) override;
+    void setRestoreQueryWasSentToOtherHosts() override;
+    bool trySetError(std::exception_ptr exception) override;
+    void finish() override;
+    bool tryFinishAfterError() noexcept override;
+    void waitForOtherHostsToFinish() override;
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -55,27 +62,27 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    void generateUUIDForTable(ASTCreateQuery & create_query) override;

-    bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    void createRootNodes();
-    void removeAllNodes();
+    bool tryFinishImpl() noexcept;

-    /// get_zookeeper will provide a zookeeper client without any fault injection
-    const zkutil::GetZooKeeper get_zookeeper;
    const String root_zookeeper_path;
-    const RestoreKeeperSettings keeper_settings;
-    const String restore_uuid;
+    const BackupKeeperSettings keeper_settings;
+    const UUID restore_uuid;
    const String zookeeper_path;
    const Strings all_hosts;
+    const Strings all_hosts_without_initiator;
    const String current_host;
    const size_t current_host_index;
-    const bool is_internal;
    LoggerPtr const log;

-    mutable WithRetries with_retries;
-    std::optional<BackupCoordinationStageSync> stage_sync;
-    mutable std::mutex mutex;
+    const WithRetries with_retries;
+    BackupConcurrencyCheck concurrency_check;
+    BackupCoordinationStageSync stage_sync;
+    BackupCoordinationCleaner cleaner;
+    std::atomic<bool> restore_query_was_sent_to_other_hosts = false;
 };

 }
--- a/src/Backups/RestoreCoordinationRemote.cpp
+++ b/src/Backups/RestoreCoordinationRemote.cpp
@ -1,379 +0,0 @@
-#include <Backups/BackupCoordinationRemote.h>
-#include <Backups/BackupCoordinationStage.h>
-#include <Backups/RestoreCoordinationRemote.h>
-#include <Backups/BackupCoordinationStageSync.h>
-#include <Parsers/ASTCreateQuery.h>
-#include <Parsers/CreateQueryUUIDs.h>
-#include <Parsers/formatAST.h>
-#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
-#include <Common/ZooKeeper/KeeperException.h>
-#include <Common/escapeForFileName.h>
-
-
-namespace DB
-{
-
-namespace Stage = BackupCoordinationStage;
-
-RestoreCoordinationRemote::RestoreCoordinationRemote(
-    zkutil::GetZooKeeper get_zookeeper_,
-    const String & root_zookeeper_path_,
-    const RestoreKeeperSettings & keeper_settings_,
-    const String & restore_uuid_,
-    const Strings & all_hosts_,
-    const String & current_host_,
-    bool is_internal_,
-    QueryStatusPtr process_list_element_)
-    : get_zookeeper(get_zookeeper_)
-    , root_zookeeper_path(root_zookeeper_path_)
-    , keeper_settings(keeper_settings_)
-    , restore_uuid(restore_uuid_)
-    , zookeeper_path(root_zookeeper_path_ + "/restore-" + restore_uuid_)
-    , all_hosts(all_hosts_)
-    , current_host(current_host_)
-    , current_host_index(BackupCoordinationRemote::findCurrentHostIndex(all_hosts, current_host))
-    , is_internal(is_internal_)
-    , log(getLogger("RestoreCoordinationRemote"))
-    , with_retries(
-        log,
-        get_zookeeper_,
-        keeper_settings,
-        process_list_element_,
-        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
-        (WithRetries::FaultyKeeper & zk)
-        {
-            /// Recreate this ephemeral node to signal that we are alive.
-            if (my_is_internal)
-            {
-                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
-
-                /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
-                zk->tryRemove(alive_node_path);
-
-                zk->createAncestors(alive_node_path);
-                zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            }
-        })
-{
-    createRootNodes();
-
-    stage_sync.emplace(
-        zookeeper_path,
-        with_retries,
-        log);
-}
-
-RestoreCoordinationRemote::~RestoreCoordinationRemote()
-{
-    try
-    {
-        if (!is_internal)
-            removeAllNodes();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
-}
-
-void RestoreCoordinationRemote::createRootNodes()
-{
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-            zk->createAncestors(zookeeper_path);
-
-            Coordination::Requests ops;
-            Coordination::Responses responses;
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_databases_tables_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_tables_data_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access_storages_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/table_uuids", "", zkutil::CreateMode::Persistent));
-            zk->tryMulti(ops, responses);
-        });
-}
-
-void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message)
-{
-    if (is_internal)
-        stage_sync->set(current_host, new_stage, message);
-    else
-        stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true);
-}
-
-void RestoreCoordinationRemote::setError(const Exception & exception)
-{
-    stage_sync->setError(current_host, exception);
-}
-
-Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait)
-{
-    return stage_sync->wait(all_hosts, stage_to_wait);
-}
-
-Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout)
-{
-    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
-}
-
-bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
-            zk->createIfNotExists(path, "");
-
-            path += "/" + escapeForFileName(table_name);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
-            zk->createIfNotExists(path, "");
-
-            path += "/";
-            switch (object_type)
-            {
-                case UserDefinedSQLObjectType::Function:
-                    path += "functions";
-                    break;
-            }
-
-            auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result =  zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id)
-{
-    bool lock_acquired = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            /// we need to remove leading '/' from root_zk_path
-            auto normalized_root_zk_path = root_zk_path.substr(1);
-            std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path);
-            zk->createAncestors(restore_lock_path);
-            auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                lock_acquired = true;
-                return;
-            }
-
-            if (code == Coordination::Error::ZNODEEXISTS)
-                lock_acquired = table_unique_id == zk->get(restore_lock_path);
-            else
-                zkutil::KeeperException::fromPath(code, restore_lock_path);
-        });
-    return lock_acquired;
-}
-
-void RestoreCoordinationRemote::generateUUIDForTable(ASTCreateQuery & create_query)
-{
-    String query_str = serializeAST(create_query);
-    CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true};
-    String new_uuids_str = new_uuids.toString();
-
-    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
-            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);
-
-            if (res == Coordination::Error::ZOK)
-            {
-                new_uuids.copyToQuery(create_query);
-                return;
-            }
-
-            if (res == Coordination::Error::ZNODEEXISTS)
-            {
-                CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query);
-                return;
-            }
-
-            zkutil::KeeperException::fromPath(res, path);
-        });
-}
-
-void RestoreCoordinationRemote::removeAllNodes()
-{
-    /// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore.
-    ///
-    /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
-    /// while some hosts are still restoring something. Removing all the nodes will remove the parent node of the restore coordination
-    /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part
-    /// of their restore work before that.
-
-    auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-            zk->removeRecursive(zookeeper_path);
-        });
-}
-
-bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t> &) const
-{
-    /// If its internal concurrency will be checked for the base restore
-    if (is_internal)
-        return false;
-
-    bool result = false;
-    std::string path = zookeeper_path + "/stage";
-
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            if (! zk->exists(root_zookeeper_path))
-                zk->createAncestors(root_zookeeper_path);
-
-            for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-            {
-                Coordination::Stat stat;
-                zk->get(root_zookeeper_path, &stat);
-                Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
-                for (const auto & existing_restore_path : existing_restore_paths)
-                {
-                    if (startsWith(existing_restore_path, "backup-"))
-                        continue;
-
-                    String existing_restore_uuid = existing_restore_path;
-                    existing_restore_uuid.erase(0, String("restore-").size());
-
-                    if (existing_restore_uuid == toString(restore_uuid))
-                        continue;
-
-                    String status;
-                    if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
-                    {
-                        /// Check if some other restore is in progress
-                        if (status == Stage::SCHEDULED_TO_START)
-                        {
-                            LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
-                            result = true;
-                            return;
-                        }
-                    }
-                }
-
-                zk->createIfNotExists(path, "");
-                auto code = zk->trySet(path, Stage::SCHEDULED_TO_START, stat.version);
-                if (code == Coordination::Error::ZOK)
-                    break;
-                bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-                if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                    throw zkutil::KeeperException::fromPath(code, path);
-            }
-        });
-
-    return result;
-}
-
-}
--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@ -100,7 +100,6 @@ RestorerFromBackup::RestorerFromBackup(
    , context(context_)
    , process_list_element(context->getProcessListElement())
    , after_task_callback(after_task_callback_)
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000))
    , log(getLogger("RestorerFromBackup"))
    , tables_dependencies("RestorerFromBackup")
@ -119,12 +118,14 @@ RestorerFromBackup::~RestorerFromBackup()
    }
 }

-void RestorerFromBackup::run(Mode mode)
+void RestorerFromBackup::run(Mode mode_)
 {
    /// run() can be called onle once.
    if (!current_stage.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");

+    mode = mode_;
+
    /// Find other hosts working along with us to execute this ON CLUSTER query.
    all_hosts = BackupSettings::Util::filterHostIDs(
        restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
@ -139,6 +140,7 @@ void RestorerFromBackup::run(Mode mode)
    setStage(Stage::FINDING_TABLES_IN_BACKUP);
    findDatabasesAndTablesInBackup();
    waitFutures();
+    logNumberOfDatabasesAndTablesToRestore();

    /// Check access rights.
    setStage(Stage::CHECKING_ACCESS_RIGHTS);
@ -228,20 +230,8 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa

    if (restore_coordination)
    {
-        restore_coordination->setStage(new_stage, message);
-
-        /// The initiator of a RESTORE ON CLUSTER query waits for other hosts to complete their work (see waitForStage(Stage::COMPLETED) in BackupsWorker::doRestore),
-        /// but other hosts shouldn't wait for each others' completion. (That's simply unnecessary and also
-        /// the initiator may start cleaning up (e.g. removing restore-coordination ZooKeeper nodes) once all other hosts are in Stage::COMPLETED.)
-        bool need_wait = (new_stage != Stage::COMPLETED);
-
-        if (need_wait)
-        {
-            if (new_stage == Stage::FINDING_TABLES_IN_BACKUP)
-                restore_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-            else
-                restore_coordination->waitForStage(new_stage);
-        }
+        /// There is no need to sync Stage::COMPLETED with other hosts because it's the last stage.
+        restore_coordination->setStage(new_stage, message, /* sync = */ (new_stage != Stage::COMPLETED));
    }
 }

@ -384,8 +374,12 @@ void RestorerFromBackup::findDatabasesAndTablesInBackup()
            }
        }
    }
+}

-    LOG_INFO(log, "Will restore {} databases and {} tables", getNumDatabases(), getNumTables());
+void RestorerFromBackup::logNumberOfDatabasesAndTablesToRestore() const
+{
+    std::string_view action = (mode == CHECK_ACCESS_ONLY) ? "check access rights for restoring" : "restore";
+    LOG_INFO(log, "Will {} {} databases and {} tables", action, getNumDatabases(), getNumTables());
 }

 void RestorerFromBackup::findTableInBackup(const QualifiedTableName & table_name_in_backup, bool skip_if_inner_table, const std::optional<ASTs> & partitions)
--- a/src/Backups/RestorerFromBackup.h
+++ b/src/Backups/RestorerFromBackup.h
@ -53,7 +53,7 @@ public:
    using DataRestoreTasks = std::vector<DataRestoreTask>;

    /// Restores the metadata of databases and tables and returns tasks to restore the data of tables.
-    void run(Mode mode);
+    void run(Mode mode_);

    BackupPtr getBackup() const { return backup; }
    const RestoreSettings & getRestoreSettings() const { return restore_settings; }
@ -80,10 +80,10 @@ private:
    ContextMutablePtr context;
    QueryStatusPtr process_list_element;
    std::function<void()> after_task_callback;
-    std::chrono::milliseconds on_cluster_first_sync_timeout;
    std::chrono::milliseconds create_table_timeout;
    LoggerPtr log;

+    Mode mode = Mode::RESTORE;
    Strings all_hosts;
    DDLRenamingMap renaming_map;
    std::vector<std::filesystem::path> root_paths_in_backup;
@ -97,6 +97,7 @@ private:
    void findDatabaseInBackupImpl(const String & database_name_in_backup, const std::set<DatabaseAndTableName> & except_table_names);
    void findEverythingInBackup(const std::set<String> & except_database_names, const std::set<DatabaseAndTableName> & except_table_names);

+    void logNumberOfDatabasesAndTablesToRestore() const;
    size_t getNumDatabases() const;
    size_t getNumTables() const;

--- a/Show More
+++ b/Show More