Merge branch 'master' into disable-fs-cache-background-download-for-metadata-files

2024-11-15 12:14:18 +00:00 · 2024-11-04 13:21:58 +01:00 · 2024-11-04 13:21:58 +01:00 · 9b20146d4b
commit 9b20146d4b
parent ca389d0d71 b618fe03bf
191 changed files with 6020 additions and 2332 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -332,7 +332,7 @@
 	url = https://github.com/ClickHouse/usearch.git
 [submodule "contrib/SimSIMD"]
 	path = contrib/SimSIMD
-	url = https://github.com/ashvardanian/SimSIMD.git
+	url = https://github.com/ClickHouse/SimSIMD.git
 [submodule "contrib/FP16"]
 	path = contrib/FP16
 	url = https://github.com/Maratyszcza/FP16.git
--- a/base/base/chrono_io.h
+++ b/base/base/chrono_io.h
@ -4,6 +4,7 @@
 #include <string>
 #include <sstream>
 #include <cctz/time_zone.h>
+#include <fmt/core.h>


 inline std::string to_string(const std::time_t & time)
@ -11,18 +12,6 @@ inline std::string to_string(const std::time_t & time)
    return cctz::format("%Y-%m-%d %H:%M:%S", std::chrono::system_clock::from_time_t(time), cctz::local_time_zone());
 }

-template <typename Clock, typename Duration = typename Clock::duration>
-std::string to_string(const std::chrono::time_point<Clock, Duration> & tp)
-{
-    // Don't use DateLUT because it shows weird characters for
-    // TimePoint::max(). I wish we could use C++20 format, but it's not
-    // there yet.
-    // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp));
-
-    auto in_time_t = std::chrono::system_clock::to_time_t(tp);
-    return to_string(in_time_t);
-}
-
 template <typename Rep, typename Period = std::ratio<1>>
 std::string to_string(const std::chrono::duration<Rep, Period> & duration)
 {
@ -33,6 +22,20 @@ std::string to_string(const std::chrono::duration<Rep, Period> & duration)
    return std::to_string(seconds_as_double.count()) + "s";
 }

+template <typename Clock, typename Duration = typename Clock::duration>
+std::string to_string(const std::chrono::time_point<Clock, Duration> & tp)
+{
+    // Don't use DateLUT because it shows weird characters for
+    // TimePoint::max(). I wish we could use C++20 format, but it's not
+    // there yet.
+    // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp));
+
+    if constexpr (std::is_same_v<Clock, std::chrono::system_clock>)
+        return to_string(std::chrono::system_clock::to_time_t(tp));
+    else
+        return to_string(tp.time_since_epoch());
+}
+
 template <typename Clock, typename Duration = typename Clock::duration>
 std::ostream & operator<<(std::ostream & o, const std::chrono::time_point<Clock, Duration> & tp)
 {
@ -44,3 +47,23 @@ std::ostream & operator<<(std::ostream & o, const std::chrono::duration<Rep, Per
 {
    return o << to_string(duration);
 }
+
+template <typename Clock, typename Duration>
+struct fmt::formatter<std::chrono::time_point<Clock, Duration>> : fmt::formatter<std::string>
+{
+    template <typename FormatCtx>
+    auto format(const std::chrono::time_point<Clock, Duration> & tp, FormatCtx & ctx) const
+    {
+        return fmt::formatter<std::string>::format(::to_string(tp), ctx);
+    }
+};
+
+template <typename Rep, typename Period>
+struct fmt::formatter<std::chrono::duration<Rep, Period>> : fmt::formatter<std::string>
+{
+    template <typename FormatCtx>
+    auto format(const std::chrono::duration<Rep, Period> & duration, FormatCtx & ctx) const
+    {
+        return fmt::formatter<std::string>::format(::to_string(duration), ctx);
+    }
+};
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@ -1 +1 @@
-Subproject commit ff51434d90c66f916e94ff05b24530b127aa4cff
+Subproject commit 935fef2964bc38e995c5f465b42259a35b8cf0d3
--- a/contrib/arrow
+++ b/contrib/arrow
@ -1 +1 @@
-Subproject commit 5cfccd8ea65f33d4517e7409815d761c7650b45d
+Subproject commit 6e2574f5013a005c050c9a7787d341aef09d0063
--- a/contrib/arrow-cmake/CMakeLists.txt
+++ b/contrib/arrow-cmake/CMakeLists.txt
@ -213,13 +213,19 @@ target_include_directories(_orc SYSTEM PRIVATE
 set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow")

 # arrow/cpp/src/arrow/CMakeLists.txt (ARROW_SRCS + ARROW_COMPUTE + ARROW_IPC)
+# find . \( -iname \*.cc -o -iname \*.cpp -o -iname \*.c \) | sort | awk '{print "\"${LIBRARY_DIR}" substr($1,2) "\"" }' | grep -v 'test.cc' | grep -v 'json' | grep -v 'flight' |
+# grep -v 'csv' | grep -v 'acero' | grep -v 'dataset' | grep -v 'testing' | grep -v 'gpu' | grep -v 'engine' | grep -v 'filesystem' | grep -v 'benchmark.cc'
 set(ARROW_SRCS
+        "${LIBRARY_DIR}/adapters/orc/adapter.cc"
+        "${LIBRARY_DIR}/adapters/orc/options.cc"
+        "${LIBRARY_DIR}/adapters/orc/util.cc"
        "${LIBRARY_DIR}/array/array_base.cc"
        "${LIBRARY_DIR}/array/array_binary.cc"
        "${LIBRARY_DIR}/array/array_decimal.cc"
        "${LIBRARY_DIR}/array/array_dict.cc"
        "${LIBRARY_DIR}/array/array_nested.cc"
        "${LIBRARY_DIR}/array/array_primitive.cc"
+        "${LIBRARY_DIR}/array/array_run_end.cc"
        "${LIBRARY_DIR}/array/builder_adaptive.cc"
        "${LIBRARY_DIR}/array/builder_base.cc"
        "${LIBRARY_DIR}/array/builder_binary.cc"
@ -227,124 +233,26 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/array/builder_dict.cc"
        "${LIBRARY_DIR}/array/builder_nested.cc"
        "${LIBRARY_DIR}/array/builder_primitive.cc"
-        "${LIBRARY_DIR}/array/builder_union.cc"
        "${LIBRARY_DIR}/array/builder_run_end.cc"
-        "${LIBRARY_DIR}/array/array_run_end.cc"
+        "${LIBRARY_DIR}/array/builder_union.cc"
        "${LIBRARY_DIR}/array/concatenate.cc"
        "${LIBRARY_DIR}/array/data.cc"
        "${LIBRARY_DIR}/array/diff.cc"
        "${LIBRARY_DIR}/array/util.cc"
        "${LIBRARY_DIR}/array/validate.cc"
-        "${LIBRARY_DIR}/builder.cc"
        "${LIBRARY_DIR}/buffer.cc"
-        "${LIBRARY_DIR}/chunked_array.cc"
-        "${LIBRARY_DIR}/chunk_resolver.cc"
-        "${LIBRARY_DIR}/compare.cc"
-        "${LIBRARY_DIR}/config.cc"
-        "${LIBRARY_DIR}/datum.cc"
-        "${LIBRARY_DIR}/device.cc"
-        "${LIBRARY_DIR}/extension_type.cc"
-        "${LIBRARY_DIR}/memory_pool.cc"
-        "${LIBRARY_DIR}/pretty_print.cc"
-        "${LIBRARY_DIR}/record_batch.cc"
-        "${LIBRARY_DIR}/result.cc"
-        "${LIBRARY_DIR}/scalar.cc"
-        "${LIBRARY_DIR}/sparse_tensor.cc"
-        "${LIBRARY_DIR}/status.cc"
-        "${LIBRARY_DIR}/table.cc"
-        "${LIBRARY_DIR}/table_builder.cc"
-        "${LIBRARY_DIR}/tensor.cc"
-        "${LIBRARY_DIR}/tensor/coo_converter.cc"
-        "${LIBRARY_DIR}/tensor/csf_converter.cc"
-        "${LIBRARY_DIR}/tensor/csx_converter.cc"
-        "${LIBRARY_DIR}/type.cc"
-        "${LIBRARY_DIR}/visitor.cc"
+        "${LIBRARY_DIR}/builder.cc"
        "${LIBRARY_DIR}/c/bridge.cc"
-        "${LIBRARY_DIR}/io/buffered.cc"
-        "${LIBRARY_DIR}/io/caching.cc"
-        "${LIBRARY_DIR}/io/compressed.cc"
-        "${LIBRARY_DIR}/io/file.cc"
-        "${LIBRARY_DIR}/io/hdfs.cc"
-        "${LIBRARY_DIR}/io/hdfs_internal.cc"
-        "${LIBRARY_DIR}/io/interfaces.cc"
-        "${LIBRARY_DIR}/io/memory.cc"
-        "${LIBRARY_DIR}/io/slow.cc"
-        "${LIBRARY_DIR}/io/stdio.cc"
-        "${LIBRARY_DIR}/io/transform.cc"
-        "${LIBRARY_DIR}/util/async_util.cc"
-        "${LIBRARY_DIR}/util/basic_decimal.cc"
-        "${LIBRARY_DIR}/util/bit_block_counter.cc"
-        "${LIBRARY_DIR}/util/bit_run_reader.cc"
-        "${LIBRARY_DIR}/util/bit_util.cc"
-        "${LIBRARY_DIR}/util/bitmap.cc"
-        "${LIBRARY_DIR}/util/bitmap_builders.cc"
-        "${LIBRARY_DIR}/util/bitmap_ops.cc"
-        "${LIBRARY_DIR}/util/bpacking.cc"
-        "${LIBRARY_DIR}/util/cancel.cc"
-        "${LIBRARY_DIR}/util/compression.cc"
-        "${LIBRARY_DIR}/util/counting_semaphore.cc"
-        "${LIBRARY_DIR}/util/cpu_info.cc"
-        "${LIBRARY_DIR}/util/decimal.cc"
-        "${LIBRARY_DIR}/util/delimiting.cc"
-        "${LIBRARY_DIR}/util/formatting.cc"
-        "${LIBRARY_DIR}/util/future.cc"
-        "${LIBRARY_DIR}/util/int_util.cc"
-        "${LIBRARY_DIR}/util/io_util.cc"
-        "${LIBRARY_DIR}/util/logging.cc"
-        "${LIBRARY_DIR}/util/key_value_metadata.cc"
-        "${LIBRARY_DIR}/util/memory.cc"
-        "${LIBRARY_DIR}/util/mutex.cc"
-        "${LIBRARY_DIR}/util/string.cc"
-        "${LIBRARY_DIR}/util/string_builder.cc"
-        "${LIBRARY_DIR}/util/task_group.cc"
-        "${LIBRARY_DIR}/util/tdigest.cc"
-        "${LIBRARY_DIR}/util/thread_pool.cc"
-        "${LIBRARY_DIR}/util/time.cc"
-        "${LIBRARY_DIR}/util/trie.cc"
-        "${LIBRARY_DIR}/util/unreachable.cc"
-        "${LIBRARY_DIR}/util/uri.cc"
-        "${LIBRARY_DIR}/util/utf8.cc"
-        "${LIBRARY_DIR}/util/value_parsing.cc"
-        "${LIBRARY_DIR}/util/byte_size.cc"
-        "${LIBRARY_DIR}/util/debug.cc"
-        "${LIBRARY_DIR}/util/tracing.cc"
-        "${LIBRARY_DIR}/util/atfork_internal.cc"
-        "${LIBRARY_DIR}/util/crc32.cc"
-        "${LIBRARY_DIR}/util/hashing.cc"
-        "${LIBRARY_DIR}/util/ree_util.cc"
-        "${LIBRARY_DIR}/util/union_util.cc"
-        "${LIBRARY_DIR}/vendored/base64.cpp"
-        "${LIBRARY_DIR}/vendored/datetime/tz.cpp"
-        "${LIBRARY_DIR}/vendored/musl/strptime.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriFile.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriParse.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c"
-        "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c"
-        "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc"
-        "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc"
-
+        "${LIBRARY_DIR}/c/dlpack.cc"
+        "${LIBRARY_DIR}/chunk_resolver.cc"
+        "${LIBRARY_DIR}/chunked_array.cc"
+        "${LIBRARY_DIR}/compare.cc"
        "${LIBRARY_DIR}/compute/api_aggregate.cc"
        "${LIBRARY_DIR}/compute/api_scalar.cc"
        "${LIBRARY_DIR}/compute/api_vector.cc"
        "${LIBRARY_DIR}/compute/cast.cc"
        "${LIBRARY_DIR}/compute/exec.cc"
+        "${LIBRARY_DIR}/compute/expression.cc"
        "${LIBRARY_DIR}/compute/function.cc"
        "${LIBRARY_DIR}/compute/function_internal.cc"
        "${LIBRARY_DIR}/compute/kernel.cc"
@ -355,6 +263,7 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc"
        "${LIBRARY_DIR}/compute/kernels/codegen_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/hash_aggregate.cc"
+        "${LIBRARY_DIR}/compute/kernels/ree_util_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/row_encoder.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc"
        "${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc"
@ -382,30 +291,139 @@ set(ARROW_SRCS
        "${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_hash.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_nested.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_pairwise.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_rank.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_replace.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_run_end_encode.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection.cc"
-        "${LIBRARY_DIR}/compute/kernels/vector_sort.cc"
-        "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection_filter_internal.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc"
        "${LIBRARY_DIR}/compute/kernels/vector_selection_take_internal.cc"
-        "${LIBRARY_DIR}/compute/light_array.cc"
-        "${LIBRARY_DIR}/compute/registry.cc"
-        "${LIBRARY_DIR}/compute/expression.cc"
+        "${LIBRARY_DIR}/compute/kernels/vector_sort.cc"
+        "${LIBRARY_DIR}/compute/key_hash_internal.cc"
+        "${LIBRARY_DIR}/compute/key_map_internal.cc"
+        "${LIBRARY_DIR}/compute/light_array_internal.cc"
        "${LIBRARY_DIR}/compute/ordering.cc"
+        "${LIBRARY_DIR}/compute/registry.cc"
        "${LIBRARY_DIR}/compute/row/compare_internal.cc"
        "${LIBRARY_DIR}/compute/row/encode_internal.cc"
        "${LIBRARY_DIR}/compute/row/grouper.cc"
        "${LIBRARY_DIR}/compute/row/row_internal.cc"
-
+        "${LIBRARY_DIR}/compute/util.cc"
+        "${LIBRARY_DIR}/config.cc"
+        "${LIBRARY_DIR}/datum.cc"
+        "${LIBRARY_DIR}/device.cc"
+        "${LIBRARY_DIR}/extension_type.cc"
+        "${LIBRARY_DIR}/integration/c_data_integration_internal.cc"
+        "${LIBRARY_DIR}/io/buffered.cc"
+        "${LIBRARY_DIR}/io/caching.cc"
+        "${LIBRARY_DIR}/io/compressed.cc"
+        "${LIBRARY_DIR}/io/file.cc"
+        "${LIBRARY_DIR}/io/hdfs.cc"
+        "${LIBRARY_DIR}/io/hdfs_internal.cc"
+        "${LIBRARY_DIR}/io/interfaces.cc"
+        "${LIBRARY_DIR}/io/memory.cc"
+        "${LIBRARY_DIR}/io/slow.cc"
+        "${LIBRARY_DIR}/io/stdio.cc"
+        "${LIBRARY_DIR}/io/transform.cc"
        "${LIBRARY_DIR}/ipc/dictionary.cc"
        "${LIBRARY_DIR}/ipc/feather.cc"
+        "${LIBRARY_DIR}/ipc/file_to_stream.cc"
        "${LIBRARY_DIR}/ipc/message.cc"
        "${LIBRARY_DIR}/ipc/metadata_internal.cc"
        "${LIBRARY_DIR}/ipc/options.cc"
        "${LIBRARY_DIR}/ipc/reader.cc"
+        "${LIBRARY_DIR}/ipc/stream_to_file.cc"
        "${LIBRARY_DIR}/ipc/writer.cc"
+        "${LIBRARY_DIR}/memory_pool.cc"
+        "${LIBRARY_DIR}/pretty_print.cc"
+        "${LIBRARY_DIR}/record_batch.cc"
+        "${LIBRARY_DIR}/result.cc"
+        "${LIBRARY_DIR}/scalar.cc"
+        "${LIBRARY_DIR}/sparse_tensor.cc"
+        "${LIBRARY_DIR}/status.cc"
+        "${LIBRARY_DIR}/table.cc"
+        "${LIBRARY_DIR}/table_builder.cc"
+        "${LIBRARY_DIR}/tensor.cc"
+        "${LIBRARY_DIR}/tensor/coo_converter.cc"
+        "${LIBRARY_DIR}/tensor/csf_converter.cc"
+        "${LIBRARY_DIR}/tensor/csx_converter.cc"
+        "${LIBRARY_DIR}/type.cc"
+        "${LIBRARY_DIR}/type_traits.cc"
+        "${LIBRARY_DIR}/util/align_util.cc"
+        "${LIBRARY_DIR}/util/async_util.cc"
+        "${LIBRARY_DIR}/util/atfork_internal.cc"
+        "${LIBRARY_DIR}/util/basic_decimal.cc"
+        "${LIBRARY_DIR}/util/bit_block_counter.cc"
+        "${LIBRARY_DIR}/util/bit_run_reader.cc"
+        "${LIBRARY_DIR}/util/bit_util.cc"
+        "${LIBRARY_DIR}/util/bitmap.cc"
+        "${LIBRARY_DIR}/util/bitmap_builders.cc"
+        "${LIBRARY_DIR}/util/bitmap_ops.cc"
+        "${LIBRARY_DIR}/util/bpacking.cc"
+        "${LIBRARY_DIR}/util/byte_size.cc"
+        "${LIBRARY_DIR}/util/cancel.cc"
+        "${LIBRARY_DIR}/util/compression.cc"
+        "${LIBRARY_DIR}/util/counting_semaphore.cc"
+        "${LIBRARY_DIR}/util/cpu_info.cc"
+        "${LIBRARY_DIR}/util/crc32.cc"
+        "${LIBRARY_DIR}/util/debug.cc"
+        "${LIBRARY_DIR}/util/decimal.cc"
+        "${LIBRARY_DIR}/util/delimiting.cc"
+        "${LIBRARY_DIR}/util/dict_util.cc"
+        "${LIBRARY_DIR}/util/float16.cc"
+        "${LIBRARY_DIR}/util/formatting.cc"
+        "${LIBRARY_DIR}/util/future.cc"
+        "${LIBRARY_DIR}/util/hashing.cc"
+        "${LIBRARY_DIR}/util/int_util.cc"
+        "${LIBRARY_DIR}/util/io_util.cc"
+        "${LIBRARY_DIR}/util/key_value_metadata.cc"
+        "${LIBRARY_DIR}/util/list_util.cc"
+        "${LIBRARY_DIR}/util/logging.cc"
+        "${LIBRARY_DIR}/util/memory.cc"
+        "${LIBRARY_DIR}/util/mutex.cc"
+        "${LIBRARY_DIR}/util/ree_util.cc"
+        "${LIBRARY_DIR}/util/string.cc"
+        "${LIBRARY_DIR}/util/string_builder.cc"
+        "${LIBRARY_DIR}/util/task_group.cc"
+        "${LIBRARY_DIR}/util/tdigest.cc"
+        "${LIBRARY_DIR}/util/thread_pool.cc"
+        "${LIBRARY_DIR}/util/time.cc"
+        "${LIBRARY_DIR}/util/tracing.cc"
+        "${LIBRARY_DIR}/util/trie.cc"
+        "${LIBRARY_DIR}/util/union_util.cc"
+        "${LIBRARY_DIR}/util/unreachable.cc"
+        "${LIBRARY_DIR}/util/uri.cc"
+        "${LIBRARY_DIR}/util/utf8.cc"
+        "${LIBRARY_DIR}/util/value_parsing.cc"
+        "${LIBRARY_DIR}/vendored/base64.cpp"
+        "${LIBRARY_DIR}/vendored/datetime/tz.cpp"
+        "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc"
+        "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc"
+        "${LIBRARY_DIR}/vendored/musl/strptime.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriFile.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriParse.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c"
+        "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c"
+        "${LIBRARY_DIR}/visitor.cc"

        "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc"
        "${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc"
@ -465,22 +483,38 @@ set(PARQUET_SRCS
        "${LIBRARY_DIR}/arrow/schema.cc"
        "${LIBRARY_DIR}/arrow/schema_internal.cc"
        "${LIBRARY_DIR}/arrow/writer.cc"
+        "${LIBRARY_DIR}/benchmark_util.cc"
        "${LIBRARY_DIR}/bloom_filter.cc"
+        "${LIBRARY_DIR}/bloom_filter_reader.cc"
        "${LIBRARY_DIR}/column_reader.cc"
        "${LIBRARY_DIR}/column_scanner.cc"
        "${LIBRARY_DIR}/column_writer.cc"
        "${LIBRARY_DIR}/encoding.cc"
+        "${LIBRARY_DIR}/encryption/crypto_factory.cc"
        "${LIBRARY_DIR}/encryption/encryption.cc"
        "${LIBRARY_DIR}/encryption/encryption_internal.cc"
+        "${LIBRARY_DIR}/encryption/encryption_internal_nossl.cc"
+        "${LIBRARY_DIR}/encryption/file_key_unwrapper.cc"
+        "${LIBRARY_DIR}/encryption/file_key_wrapper.cc"
+        "${LIBRARY_DIR}/encryption/file_system_key_material_store.cc"
        "${LIBRARY_DIR}/encryption/internal_file_decryptor.cc"
        "${LIBRARY_DIR}/encryption/internal_file_encryptor.cc"
+        "${LIBRARY_DIR}/encryption/key_material.cc"
+        "${LIBRARY_DIR}/encryption/key_metadata.cc"
+        "${LIBRARY_DIR}/encryption/key_toolkit.cc"
+        "${LIBRARY_DIR}/encryption/key_toolkit_internal.cc"
+        "${LIBRARY_DIR}/encryption/kms_client.cc"
+        "${LIBRARY_DIR}/encryption/local_wrap_kms_client.cc"
+        "${LIBRARY_DIR}/encryption/openssl_internal.cc"
        "${LIBRARY_DIR}/exception.cc"
        "${LIBRARY_DIR}/file_reader.cc"
        "${LIBRARY_DIR}/file_writer.cc"
-        "${LIBRARY_DIR}/page_index.cc"
-        "${LIBRARY_DIR}/level_conversion.cc"
        "${LIBRARY_DIR}/level_comparison.cc"
+        "${LIBRARY_DIR}/level_comparison_avx2.cc"
+        "${LIBRARY_DIR}/level_conversion.cc"
+        "${LIBRARY_DIR}/level_conversion_bmi2.cc"
        "${LIBRARY_DIR}/metadata.cc"
+        "${LIBRARY_DIR}/page_index.cc"
        "${LIBRARY_DIR}/platform.cc"
        "${LIBRARY_DIR}/printer.cc"
        "${LIBRARY_DIR}/properties.cc"
@ -489,7 +523,6 @@ set(PARQUET_SRCS
        "${LIBRARY_DIR}/stream_reader.cc"
        "${LIBRARY_DIR}/stream_writer.cc"
        "${LIBRARY_DIR}/types.cc"
-        "${LIBRARY_DIR}/bloom_filter_reader.cc"
        "${LIBRARY_DIR}/xxhasher.cc"

        "${GEN_LIBRARY_DIR}/parquet_constants.cpp"
@ -520,6 +553,9 @@ endif ()
 add_definitions(-DPARQUET_THRIFT_VERSION_MAJOR=0)
 add_definitions(-DPARQUET_THRIFT_VERSION_MINOR=16)

+# As per https://github.com/apache/arrow/pull/35672, Arrow threading support must be enabled explicitly via ARROW_ENABLE_THREADING.
+add_definitions(-DARROW_ENABLE_THREADING)
+
 # === tools

 set(TOOLS_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/tools/parquet")
--- a/contrib/flatbuffers
+++ b/contrib/flatbuffers
@ -1 +1 @@
-Subproject commit eb3f827948241ce0e701516f16cd67324802bce9
+Subproject commit 0100f6a5779831fa7a651e4b67ef389a8752bd9b
--- a/contrib/usearch
+++ b/contrib/usearch
@ -1 +1 @@
-Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48
+Subproject commit 53799b84ca9ad708b060d0b1cfa5f039371721cd
--- a/docker/test/style/Dockerfile
+++ b/docker/test/style/Dockerfile
@ -28,7 +28,7 @@ COPY requirements.txt /
 RUN pip3 install --no-cache-dir -r requirements.txt

 RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8
-ENV LC_ALL en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8

 # Architecture of the image when BuildKit/buildx is used
 ARG TARGETARCH
--- a/docker/test/style/requirements.txt
+++ b/docker/test/style/requirements.txt
@ -12,6 +12,7 @@ charset-normalizer==3.3.2
 click==8.1.7
 codespell==2.2.1
 cryptography==43.0.1
+datacompy==0.7.3
 Deprecated==1.2.14
 dill==0.3.8
 flake8==4.0.1
@ -23,6 +24,7 @@ mccabe==0.6.1
 multidict==6.0.5
 mypy==1.8.0
 mypy-extensions==1.0.0
+pandas==2.2.3
 packaging==24.1
 pathspec==0.9.0
 pip==24.1.1
--- a/docs/en/getting-started/index.md
+++ b/docs/en/getting-started/index.md
@ -23,6 +23,7 @@ functions in ClickHouse. The sample datasets include:
 - The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables
 - The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data
 - The [Laion dataset](../getting-started/example-datasets/laion.md) has an example of [Approximate nearest neighbor search indexes](../engines/table-engines/mergetree-family/annindexes.md) usage
+- The [TPC-H](../getting-started/example-datasets/tpch.md), [TPC-DS](../getting-started/example-datasets/tpcds.md), and [Star Schema (SSB)](../getting-started/example-datasets/star-schema.md) industry benchmarks for analytics databases
 - [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset 
 - [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3
 - [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) shows how to generate random data if none of the above fit your needs.
--- a/docs/en/interfaces/cli.md
+++ b/docs/en/interfaces/cli.md
@ -190,6 +190,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
 - `--config-file` – The name of the configuration file.
 - `--secure` – If specified, will connect to server over secure connection (TLS). You might need to configure your CA certificates in the [configuration file](#configuration_files). The available configuration settings are the same as for [server-side TLS configuration](../operations/server-configuration-parameters/settings.md#openssl).
 - `--history_file` — Path to a file containing command history.
+- `--history_max_entries` — Maximum number of entries in the history file. Default value: 1 000 000.
 - `--param_<name>` — Value for a [query with parameters](#cli-queries-with-parameters).
 - `--hardware-utilization` — Print hardware utilization information in progress bar.
 - `--print-profile-events` – Print `ProfileEvents` packets.
--- a/docs/en/operations/system-tables/merge_tree_settings.md
+++ b/docs/en/operations/system-tables/merge_tree_settings.md
@ -18,6 +18,11 @@ Columns:
    - `1` — Current user can’t change the setting.
 - `type` ([String](../../sql-reference/data-types/string.md)) — Setting type (implementation specific string value).
 - `is_obsolete` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) - Shows whether a setting is obsolete.
+- `tier` ([Enum8](../../sql-reference/data-types/enum.md)) — Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their development and the expectations one might have when using them. Values:
+    - `'Production'` — The feature is stable, safe to use and does not have issues interacting with other **production** features.
+    - `'Beta'` — The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome.
+    - `'Experimental'` — The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time.
+    - `'Obsolete'` — No longer supported. Either it is already removed or it will be removed in future releases.

 **Example**
 ```sql
--- a/docs/en/operations/system-tables/settings.md
+++ b/docs/en/operations/system-tables/settings.md
@ -18,6 +18,11 @@ Columns:
    - `1` — Current user can’t change the setting.
 - `default` ([String](../../sql-reference/data-types/string.md)) — Setting default value.
 - `is_obsolete` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) - Shows whether a setting is obsolete.
+- `tier` ([Enum8](../../sql-reference/data-types/enum.md)) — Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their development and the expectations one might have when using them. Values:
+    - `'Production'` — The feature is stable, safe to use and does not have issues interacting with other **production** features.
+    - `'Beta'` — The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome.
+    - `'Experimental'` — The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time.
+    - `'Obsolete'` — No longer supported. Either it is already removed or it will be removed in future releases.

 **Example**

@ -26,19 +31,99 @@ The following example shows how to get information about settings which name con
 ``` sql
 SELECT *
 FROM system.settings
-WHERE name LIKE '%min_i%'
+WHERE name LIKE '%min_insert_block_size_%'
+FORMAT Vertical
 ```

 ``` text
-┌─name───────────────────────────────────────────────_─value─────_─changed─_─description───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────_─min──_─max──_─readonly─_─type─────────_─default───_─alias_for─_─is_obsolete─┐
-│ min_insert_block_size_rows                         │ 1048449   │       0 │ Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough.                                                                         │ ____ │ ____ │        0 │ UInt64       │ 1048449   │           │           0 │
-│ min_insert_block_size_bytes                        │ 268402944 │       0 │ Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough.                                                                        │ ____ │ ____ │        0 │ UInt64       │ 268402944 │           │           0 │
-│ min_insert_block_size_rows_for_materialized_views  │ 0         │       0 │ Like min_insert_block_size_rows, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_rows)                                           │ ____ │ ____ │        0 │ UInt64       │ 0         │           │           0 │
-│ min_insert_block_size_bytes_for_materialized_views │ 0         │       0 │ Like min_insert_block_size_bytes, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_bytes)                                         │ ____ │ ____ │        0 │ UInt64       │ 0         │           │           0 │
-│ read_backoff_min_interval_between_events_ms        │ 1000      │       0 │ Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time. │ ____ │ ____ │        0 │ Milliseconds │ 1000      │           │           0 │
-└────────────────────────────────────────────────────┴───────────┴─────────┴─────────────────────────────────────────────────────────────────────────────────────────────────────────────────
-──────────────────────────────────────────────────────┴──────┴──────┴──────────┴──────────────┴───────────┴───────────┴─────────────┘
-```
+Row 1:
+──────
+name:        min_insert_block_size_rows
+value:       1048449
+changed:     0
+description: Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
+
+Possible values:
+
+- Positive integer.
+- 0 — Squashing disabled.
+min:         ᴺᵁᴸᴸ
+max:         ᴺᵁᴸᴸ
+readonly:    0
+type:        UInt64
+default:     1048449
+alias_for:   
+is_obsolete: 0
+tier:        Production
+
+Row 2:
+──────
+name:        min_insert_block_size_bytes
+value:       268402944
+changed:     0
+description: Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
+
+Possible values:
+
+- Positive integer.
+- 0 — Squashing disabled.
+min:         ᴺᵁᴸᴸ
+max:         ᴺᵁᴸᴸ
+readonly:    0
+type:        UInt64
+default:     268402944
+alias_for:   
+is_obsolete: 0
+tier:        Production
+
+Row 3:
+──────
+name:        min_insert_block_size_rows_for_materialized_views
+value:       0
+changed:     0
+description: Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage.
+
+Possible values:
+
+- Any positive integer.
+- 0 — Squashing disabled.
+
+**See Also**
+
+- [min_insert_block_size_rows](#min-insert-block-size-rows)
+min:         ᴺᵁᴸᴸ
+max:         ᴺᵁᴸᴸ
+readonly:    0
+type:        UInt64
+default:     0
+alias_for:   
+is_obsolete: 0
+tier:        Production
+
+Row 4:
+──────
+name:        min_insert_block_size_bytes_for_materialized_views
+value:       0
+changed:     0
+description: Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage.
+
+Possible values:
+
+- Any positive integer.
+- 0 — Squashing disabled.
+
+**See also**
+
+- [min_insert_block_size_bytes](#min-insert-block-size-bytes)
+min:         ᴺᵁᴸᴸ
+max:         ᴺᵁᴸᴸ
+readonly:    0
+type:        UInt64
+default:     0
+alias_for:   
+is_obsolete: 0
+tier:        Production
+```

 Using of `WHERE changed` can be useful, for example, when you want to check:

--- a/docs/en/sql-reference/aggregate-functions/reference/anylast.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/anylast.md
@ -17,7 +17,7 @@ anyLast(column) [RESPECT NULLS]
 - `column`: The column name. 

 :::note
-Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the first value passed, regardless of whether it is `NULL` or not.
+Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the last value passed, regardless of whether it is `NULL` or not.
 :::

 **Returned value**
@ -40,4 +40,4 @@ SELECT anyLast(city) FROM any_last_nulls;
 ┌─anyLast(city)─┐
 │ Valencia      │
 └───────────────┘
-```
+```
--- a/docs/en/sql-reference/statements/create/view.md
+++ b/docs/en/sql-reference/statements/create/view.md
@ -55,7 +55,7 @@ SELECT * FROM view(column1=value1, column2=value2 ...)
 ## Materialized View

 ``` sql
-CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE]
+CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE]
 [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }]
 AS SELECT ...
 [COMMENT 'comment']
--- a/docs/en/sql-reference/statements/grant.md
+++ b/docs/en/sql-reference/statements/grant.md
@ -117,6 +117,7 @@ GRANT SELECT ON db*.* TO john -- correct
 GRANT SELECT ON *.my_table TO john -- wrong
 GRANT SELECT ON foo*bar TO john -- wrong
 GRANT SELECT ON *suffix TO john -- wrong
+GRANT SELECT(foo) ON db.table* TO john -- wrong
 ```

 ## Privileges
@ -242,10 +243,13 @@ Hierarchy of privileges:
    - `HDFS`
    - `HIVE`
    - `JDBC`
+    - `KAFKA`
    - `MONGO`
    - `MYSQL`
+    - `NATS`
    - `ODBC`
    - `POSTGRES`
+    - `RABBITMQ`
    - `REDIS`
    - `REMOTE`
    - `S3`
@ -524,10 +528,13 @@ Allows using external data sources. Applies to [table engines](../../engines/tab
    - `HDFS`. Level: `GLOBAL`
    - `HIVE`. Level: `GLOBAL`
    - `JDBC`. Level: `GLOBAL`
+    - `KAFKA`. Level: `GLOBAL`
    - `MONGO`. Level: `GLOBAL`
    - `MYSQL`. Level: `GLOBAL`
+    - `NATS`. Level: `GLOBAL`
    - `ODBC`. Level: `GLOBAL`
    - `POSTGRES`. Level: `GLOBAL`
+    - `RABBITMQ`. Level: `GLOBAL`
    - `REDIS`. Level: `GLOBAL`
    - `REMOTE`. Level: `GLOBAL`
    - `S3`. Level: `GLOBAL`
--- a/docs/en/sql-reference/statements/kill.md
+++ b/docs/en/sql-reference/statements/kill.md
@ -83,7 +83,7 @@ The presence of long-running or incomplete mutations often indicates that a Clic
 - Or manually kill some of these mutations by sending a `KILL` command.

 ``` sql
-KILL MUTATION [ON CLUSTER cluster]
+KILL MUTATION
  WHERE <where expression to SELECT FROM system.mutations query>
  [TEST]
  [FORMAT format]
@ -135,7 +135,6 @@ KILL MUTATION WHERE database = 'default' AND table = 'table'
 -- Cancel the specific mutation:
 KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt'
 ```
-:::tip If you are killing a mutation in ClickHouse Cloud or in a self-managed cluster, then be sure to use the ```ON CLUSTER [cluster-name]``` option, in order to ensure the mutation is killed on all replicas:::

 The query is useful when a mutation is stuck and cannot finish (e.g. if some function in the mutation query throws an exception when applied to the data contained in the table).

--- a/docs/ru/getting-started/install.md
+++ b/docs/ru/getting-started/install.md
@ -95,7 +95,7 @@ sudo yum install -y clickhouse-server clickhouse-client
 sudo systemctl enable clickhouse-server
 sudo systemctl start clickhouse-server
 sudo systemctl status clickhouse-server
-clickhouse-client # илм "clickhouse-client --password" если установлен пароль
+clickhouse-client # или "clickhouse-client --password" если установлен пароль
 ```

 Для использования наиболее свежих версий нужно заменить `stable` на `testing` (рекомендуется для тестовых окружений). Также иногда доступен `prestable`.
--- a/docs/ru/sql-reference/statements/create/view.md
+++ b/docs/ru/sql-reference/statements/create/view.md
@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...)
 ## Материализованные представления {#materialized}

 ``` sql
-CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] 
+CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] 
 [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }] 
 AS SELECT ...
 ```
--- a/docs/ru/sql-reference/statements/grant.md
+++ b/docs/ru/sql-reference/statements/grant.md
@ -192,14 +192,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
    - `addressToSymbol`
    - `demangle`
 - [SOURCES](#grant-sources)
+    - `AZURE`
    - `FILE`
-    - `URL`
-    - `REMOTE`
-    - `MYSQL`
-    - `ODBC`
-    - `JDBC`
    - `HDFS`
+    - `HIVE`
+    - `JDBC`
+    - `KAFKA`
+    - `MONGO`
+    - `MYSQL`
+    - `NATS`
+    - `ODBC`
+    - `POSTGRES`
+    - `RABBITMQ`
+    - `REDIS`
+    - `REMOTE`
    - `S3`
+    - `SQLITE`
+    - `URL`
 - [dictGet](#grant-dictget)

 Примеры того, как трактуется данная иерархия:
@ -461,14 +470,23 @@ GRANT INSERT(x,y) ON db.table TO john
 Разрешает использовать внешние источники данных. Применяется к [движкам таблиц](../../engines/table-engines/index.md) и [табличным функциям](../table-functions/index.md#table-functions).

 - `SOURCES`. Уровень: `GROUP`
+    - `AZURE`. Уровень: `GLOBAL`
    - `FILE`. Уровень: `GLOBAL`
-    - `URL`. Уровень: `GLOBAL`
-    - `REMOTE`. Уровень: `GLOBAL`
-    - `MYSQL`. Уровень: `GLOBAL`
-    - `ODBC`. Уровень: `GLOBAL`
-    - `JDBC`. Уровень: `GLOBAL`
    - `HDFS`. Уровень: `GLOBAL`
+    - `HIVE`. Уровень: `GLOBAL`
+    - `JDBC`. Уровень: `GLOBAL`
+    - `KAFKA`. Уровень: `GLOBAL`
+    - `MONGO`. Уровень: `GLOBAL`
+    - `MYSQL`. Уровень: `GLOBAL`
+    - `NATS`. Уровень: `GLOBAL`
+    - `ODBC`. Уровень: `GLOBAL`
+    - `POSTGRES`. Уровень: `GLOBAL`
+    - `RABBITMQ`. Уровень: `GLOBAL`
+    - `REDIS`. Уровень: `GLOBAL`
+    - `REMOTE`. Уровень: `GLOBAL`
    - `S3`. Уровень: `GLOBAL`
+    - `SQLITE`. Уровень: `GLOBAL`
+    - `URL`. Уровень: `GLOBAL`

 Привилегия `SOURCES` разрешает использование всех источников. Также вы можете присвоить привилегию для каждого источника отдельно. Для использования источников необходимы дополнительные привилегии.

--- a/docs/zh/sql-reference/statements/create/view.md
+++ b/docs/zh/sql-reference/statements/create/view.md
@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...)
 ## Materialized {#materialized}

 ``` sql
-CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
+CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
 ```

 物化视图存储由相应的[SELECT](../../../sql-reference/statements/select/index.md)管理.
--- a/docs/zh/sql-reference/statements/grant.md
+++ b/docs/zh/sql-reference/statements/grant.md
@ -170,14 +170,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION
    -   `addressToSymbol`
    -   `demangle`
 -   [SOURCES](#grant-sources)
+    -   `AZURE`
    -   `FILE`
-    -   `URL`
-    -   `REMOTE`
-    -   `YSQL`
-    -   `ODBC`
-    -   `JDBC`
    -   `HDFS`
+    -   `HIVE`
+    -   `JDBC`
+    -   `KAFKA`
+    -   `MONGO`
+    -   `MYSQL`
+    -   `NATS`
+    -   `ODBC`
+    -   `POSTGRES`
+    -   `RABBITMQ`
+    -   `REDIS`
+    -   `REMOTE`
    -   `S3`
+    -   `SQLITE`
+    -   `URL`
 -   [dictGet](#grant-dictget)

 如何对待该层级的示例：
@ -428,14 +437,23 @@ GRANT INSERT(x,y) ON db.table TO john
 允许在 [table engines](../../engines/table-engines/index.md) 和 [table functions](../../sql-reference/table-functions/index.md#table-functions)中使用外部数据源。

 -   `SOURCES`. 级别: `GROUP`
+    -   `AZURE`. 级别: `GLOBAL`
    -   `FILE`. 级别: `GLOBAL`
-    -   `URL`. 级别: `GLOBAL`
-    -   `REMOTE`. 级别: `GLOBAL`
-    -   `YSQL`. 级别: `GLOBAL`
-    -   `ODBC`. 级别: `GLOBAL`
-    -   `JDBC`. 级别: `GLOBAL`
    -   `HDFS`. 级别: `GLOBAL`
+    -   `HIVE`. 级别: `GLOBAL`
+    -   `JDBC`. 级别: `GLOBAL`
+    -   `KAFKA`. 级别: `GLOBAL`
+    -   `MONGO`. 级别: `GLOBAL`
+    -   `MYSQL`. 级别: `GLOBAL`
+    -   `NATS`. 级别: `GLOBAL`
+    -   `ODBC`. 级别: `GLOBAL`
+    -   `POSTGRES`. 级别: `GLOBAL`
+    -   `RABBITMQ`. 级别: `GLOBAL`
+    -   `REDIS`. 级别: `GLOBAL`
+    -   `REMOTE`. 级别: `GLOBAL`
    -   `S3`. 级别: `GLOBAL`
+    -   `SQLITE`. 级别: `GLOBAL`
+    -   `URL`. 级别: `GLOBAL`

 `SOURCES` 权限允许使用所有数据源。当然也可以单独对每个数据源进行授权。要使用数据源时，还需要额外的权限。

--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -192,6 +192,10 @@ void Client::parseConnectionsCredentials(Poco::Util::AbstractConfiguration & con
                history_file = home_path + "/" + history_file.substr(1);
            config.setString("history_file", history_file);
        }
+        /// Copy the per-connection history limit to the top-level key. The value has to be read
+        /// from the matched connection's section, the same way as the other options handled here.
+        if (config.has(prefix + ".history_max_entries"))
+            config.setUInt("history_max_entries", config.getUInt(prefix + ".history_max_entries"));
        if (config.has(prefix + ".accept-invalid-certificate"))
            config.setBool("accept-invalid-certificate", config.getBool(prefix + ".accept-invalid-certificate"));
    }
--- a/programs/disks/DisksApp.cpp
+++ b/programs/disks/DisksApp.cpp
@ -236,6 +236,7 @@ void DisksApp::runInteractiveReplxx()
    ReplxxLineReader lr(
        suggest,
        history_file,
+        history_max_entries,
        /* multiline= */ false,
        /* ignore_shell_suspend= */ false,
        query_extenders,
@ -398,6 +399,8 @@ void DisksApp::initializeHistoryFile()
                throw;
        }
    }
+
+    history_max_entries = config().getUInt("history-max-entries", 1000000);
 }

 void DisksApp::init(const std::vector<String> & common_arguments)
--- a/programs/disks/DisksApp.h
+++ b/programs/disks/DisksApp.h
@ -62,6 +62,8 @@ private:

    // Fields responsible for the REPL work
    String history_file;
+    UInt32 history_max_entries = 0; /// Maximum number of entries in the history file. Needs to be initialized to 0 since we don't have a proper constructor. Worry not, actual value is set within the initializeHistoryFile method.
+
    LineReader::Suggest suggest;
    static LineReader::Patterns query_extenders;
    static LineReader::Patterns query_delimiters;
--- a/programs/keeper-client/KeeperClient.cpp
+++ b/programs/keeper-client/KeeperClient.cpp
@ -243,6 +243,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */)
        }
    }

+    history_max_entries = config().getUInt("history-max-entries", 1000000);
+
    String default_log_level;
    if (config().has("query"))
        /// We don't want to see any information log in query mode, unless it was set explicitly
@ -319,6 +321,7 @@ void KeeperClient::runInteractiveReplxx()
    ReplxxLineReader lr(
        suggest,
        history_file,
+        history_max_entries,
        /* multiline= */ false,
        /* ignore_shell_suspend= */ false,
        query_extenders,
--- a/programs/keeper-client/KeeperClient.h
+++ b/programs/keeper-client/KeeperClient.h
@ -59,6 +59,8 @@ protected:
    std::vector<String> getCompletions(const String & prefix) const;

    String history_file;
+    UInt32 history_max_entries = 0; /// Maximum number of entries in the history file. The actual value is set in initialize().
+
    LineReader::Suggest suggest;

    zkutil::ZooKeeperArgs zk_args;
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@ -243,6 +243,9 @@ enum class AccessType : uint8_t
    M(S3, "", GLOBAL, SOURCES) \
    M(HIVE, "", GLOBAL, SOURCES) \
    M(AZURE, "", GLOBAL, SOURCES) \
+    M(KAFKA, "", GLOBAL, SOURCES) \
+    M(NATS, "", GLOBAL, SOURCES) \
+    M(RABBITMQ, "", GLOBAL, SOURCES) \
    M(SOURCES, "", GROUP, ALL) \
    \
    M(CLUSTER, "", GLOBAL, ALL) /* ON CLUSTER queries */ \
--- a/src/Access/ContextAccess.cpp
+++ b/src/Access/ContextAccess.cpp
@ -52,7 +52,10 @@ namespace
        {AccessType::HDFS, "HDFS"},
        {AccessType::S3, "S3"},
        {AccessType::HIVE, "Hive"},
-        {AccessType::AZURE, "AzureBlobStorage"}
+        {AccessType::AZURE, "AzureBlobStorage"},
+        {AccessType::KAFKA, "Kafka"},
+        {AccessType::NATS, "NATS"},
+        {AccessType::RABBITMQ, "RabbitMQ"}
    };


--- a/src/Access/Credentials.h
+++ b/src/Access/Credentials.h
@ -15,6 +15,9 @@ public:
    explicit Credentials() = default;
    explicit Credentials(const String & user_name_);

+    Credentials(const Credentials &) = default;
+    Credentials(Credentials &&) = default;
+
    virtual ~Credentials() = default;

    const String & getUserName() const;
--- a/src/Backups/BackupConcurrencyCheck.cpp
+++ b/src/Backups/BackupConcurrencyCheck.cpp
@ -0,0 +1,135 @@
+#include <Backups/BackupConcurrencyCheck.h>
+
+#include <Common/Exception.h>
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CONCURRENT_ACCESS_NOT_SUPPORTED;
+}
+
+
+/// Registers a BACKUP or RESTORE operation in `counters_`.
+/// If `allow_concurrency_` is false and registering this operation would make more than one operation
+/// of the same kind (backup or restore) active, throws an exception and registers nothing.
+BackupConcurrencyCheck::BackupConcurrencyCheck(
+    const UUID & backup_or_restore_uuid_,
+    bool is_restore_,
+    bool on_cluster_,
+    bool allow_concurrency_,
+    BackupConcurrencyCounters & counters_)
+    : is_restore(is_restore_), backup_or_restore_uuid(backup_or_restore_uuid_), on_cluster(on_cluster_), counters(counters_)
+{
+    std::lock_guard lock{counters.mutex};
+
+    /// Backups and restores are counted separately, so pick the counters matching this operation once.
+    auto & local_operations = is_restore ? counters.local_restores : counters.local_backups;
+    auto & on_cluster_operations = is_restore ? counters.on_cluster_restores : counters.on_cluster_backups;
+
+    if (!allow_concurrency_)
+    {
+        /// Count the operations of this kind as if the current one was already registered.
+        size_t num_local = local_operations;
+        size_t num_on_cluster = on_cluster_operations.size();
+
+        /// An on-cluster operation with an already known UUID is the same operation, so it isn't counted twice.
+        if (!on_cluster)
+            ++num_local;
+        else if (!on_cluster_operations.contains(backup_or_restore_uuid))
+            ++num_on_cluster;
+
+        if (num_local + num_on_cluster > 1)
+            throwConcurrentOperationNotAllowed(is_restore);
+    }
+
+    /// The check has passed (or was not required): register the operation.
+    if (on_cluster)
+        ++on_cluster_operations[backup_or_restore_uuid];
+    else
+        ++local_operations;
+}
+
+
+/// Unregisters the operation which was registered by the constructor.
+BackupConcurrencyCheck::~BackupConcurrencyCheck()
+{
+    std::lock_guard lock{counters.mutex};
+
+    if (!on_cluster)
+    {
+        auto & local_operations = is_restore ? counters.local_restores : counters.local_backups;
+        --local_operations;
+        return;
+    }
+
+    /// On-cluster operations are reference-counted per UUID; the entry is dropped with its last reference.
+    auto & on_cluster_operations = is_restore ? counters.on_cluster_restores : counters.on_cluster_backups;
+    auto it = on_cluster_operations.find(backup_or_restore_uuid);
+    if (it == on_cluster_operations.end())
+        return;
+
+    if (--it->second == 0)
+        on_cluster_operations.erase(it);
+}
+
+
+/// Always throws CONCURRENT_ACCESS_NOT_SUPPORTED.
+/// The message names the setting which allows concurrent operations of the given kind.
+void BackupConcurrencyCheck::throwConcurrentOperationNotAllowed(bool is_restore)
+{
+    const char * operations = "backups";
+    const char * setting_name = "allow_concurrent_backups";
+    if (is_restore)
+    {
+        operations = "restores";
+        setting_name = "allow_concurrent_restores";
+    }
+
+    throw Exception(
+        ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
+        "Concurrent {} are not allowed, turn on setting '{}'",
+        operations,
+        setting_name);
+}
+
+
+BackupConcurrencyCounters::BackupConcurrencyCounters() = default;
+
+
+/// Logs an error if some operations are still registered when the counters are destroyed.
+BackupConcurrencyCounters::~BackupConcurrencyCounters()
+{
+    const bool has_local_operations = (local_backups != 0) || (local_restores != 0);
+    const bool has_on_cluster_operations = !(on_cluster_backups.empty() && on_cluster_restores.empty());
+
+    if (has_local_operations || has_on_cluster_operations)
+        LOG_ERROR(getLogger(__PRETTY_FUNCTION__), "Some backups or restores are processing");
+}
+
+}
--- a/src/Backups/BackupConcurrencyCheck.h
+++ b/src/Backups/BackupConcurrencyCheck.h
@ -0,0 +1,55 @@
+#pragma once
+
+#include <Core/UUID.h>
+#include <base/scope_guard.h>
+#include <mutex>
+#include <unordered_map>
+
+
+namespace DB
+{
+class BackupConcurrencyCounters;
+
+/// Local checker for concurrent BACKUP or RESTORE operations.
+/// This class is used by implementations of IBackupCoordination and IRestoreCoordination
+/// to throw an exception if concurrent backups or restores are not allowed.
+class BackupConcurrencyCheck
+{
+public:
+    /// Checks concurrency of a BACKUP operation or a RESTORE operation.
+    /// Keep a constructed instance of BackupConcurrencyCheck until the operation is done.
+    /// If `allow_concurrency_` is false and another operation of the same kind (backup or restore)
+    /// is already registered in `counters_`, the constructor throws CONCURRENT_ACCESS_NOT_SUPPORTED.
+    /// An on-cluster operation registered with the same UUID is not considered another operation.
+    /// Otherwise the operation is registered in `counters_`.
+    BackupConcurrencyCheck(
+        const UUID & backup_or_restore_uuid_,
+        bool is_restore_,
+        bool on_cluster_,
+        bool allow_concurrency_,
+        BackupConcurrencyCounters & counters_);
+
+    /// Unregisters the operation from the counters passed to the constructor.
+    ~BackupConcurrencyCheck();
+
+    /// Throws an exception saying that concurrent backups (or restores, if `is_restore` is set) are not allowed.
+    [[noreturn]] static void throwConcurrentOperationNotAllowed(bool is_restore);
+
+private:
+    const bool is_restore;
+    const UUID backup_or_restore_uuid;
+    const bool on_cluster;
+    /// Counters shared between operations; held by reference and modified under their own mutex.
+    BackupConcurrencyCounters & counters;
+};
+
+
+/// Counters of the BACKUP and RESTORE operations registered by instances of BackupConcurrencyCheck.
+/// The fields are private and protected by `mutex`; BackupConcurrencyCheck is a friend and updates them.
+class BackupConcurrencyCounters
+{
+public:
+    BackupConcurrencyCounters();
+    /// Logs an error if some backups or restores are still registered.
+    ~BackupConcurrencyCounters();
+
+private:
+    friend class BackupConcurrencyCheck;
+    /// Numbers of registered operations which are not ON CLUSTER.
+    size_t local_backups TSA_GUARDED_BY(mutex) = 0;
+    size_t local_restores TSA_GUARDED_BY(mutex) = 0;
+    /// Registered ON CLUSTER operations: one entry per UUID with the number of BackupConcurrencyCheck instances holding it.
+    std::unordered_map<UUID /* backup_uuid */, size_t /* num_refs */> on_cluster_backups TSA_GUARDED_BY(mutex);
+    std::unordered_map<UUID /* restore_uuid */, size_t /* num_refs */> on_cluster_restores TSA_GUARDED_BY(mutex);
+    std::mutex mutex;
+};
+
+}
--- a/src/Backups/BackupCoordinationCleaner.cpp
+++ b/src/Backups/BackupCoordinationCleaner.cpp
@ -0,0 +1,64 @@
+#include <Backups/BackupCoordinationCleaner.h>
+
+
+namespace DB
+{
+
+/// Only stores the arguments; nothing is removed from ZooKeeper here.
+BackupCoordinationCleaner::BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_)
+    : zookeeper_path(zookeeper_path_), with_retries(with_retries_), log(log_)
+{
+}
+
+/// Removes the coordination nodes from ZooKeeper; an error is reported by throwing an exception.
+void BackupCoordinationCleaner::cleanup()
+{
+    tryRemoveAllNodes(/* throw_if_error = */ true, /* retries_kind = */ WithRetries::kNormal);
+}
+
+/// Removes the coordination nodes from ZooKeeper; an error is reported by returning false instead of throwing.
+bool BackupCoordinationCleaner::tryCleanupAfterError() noexcept
+{
+    return tryRemoveAllNodes(/* throw_if_error = */ false, /* retries_kind = */ WithRetries::kNormal);
+}
+
+/// Removes `zookeeper_path` recursively from ZooKeeper and remembers the outcome in `cleanup_result`,
+/// so the removal is not attempted again once it has finished: after a success the function returns true
+/// immediately, and after a failure it reports the stored exception again
+/// (rethrows it if `throw_if_error` is set, otherwise returns false).
+bool BackupCoordinationCleaner::tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind)
+{
+    {
+        /// The mutex protects only `cleanup_result`; it is released before talking to ZooKeeper.
+        std::lock_guard lock{mutex};
+        if (cleanup_result.succeeded)
+            return true;
+        if (cleanup_result.exception)
+        {
+            if (throw_if_error)
+                std::rethrow_exception(cleanup_result.exception);
+            return false;
+        }
+    }
+
+    try
+    {
+        LOG_TRACE(log, "Removing nodes from ZooKeeper");
+        auto holder = with_retries.createRetriesControlHolder("removeAllNodes", retries_kind);
+        holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zookeeper);
+            zookeeper->removeRecursive(zookeeper_path);
+        });
+
+        std::lock_guard lock{mutex};
+        cleanup_result.succeeded = true;
+        return true;
+    }
+    catch (...)
+    {
+        /// This class is used for both BACKUP ON CLUSTER and RESTORE ON CLUSTER, so the message must not name either of them.
+        LOG_TRACE(log, "Caught exception while removing nodes from ZooKeeper: {}",
+                  getCurrentExceptionMessage(/* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true));
+
+        std::lock_guard lock{mutex};
+        cleanup_result.exception = std::current_exception();
+
+        if (throw_if_error)
+            throw;
+        return false;
+    }
+}
+
+}
--- a/src/Backups/BackupCoordinationCleaner.h
+++ b/src/Backups/BackupCoordinationCleaner.h
@ -0,0 +1,40 @@
+#pragma once
+
+#include <Backups/WithRetries.h>
+
+
+namespace DB
+{
+
+/// Removes all the nodes from ZooKeeper used to coordinate a BACKUP ON CLUSTER operation or
+/// a RESTORE ON CLUSTER operation (successful or not).
+/// This class is used by BackupCoordinationOnCluster and RestoreCoordinationOnCluster to cleanup.
+class BackupCoordinationCleaner
+{
+public:
+    /// `zookeeper_path_` is the node which is removed (recursively) by the cleanup.
+    BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_);
+
+    /// Removes the nodes; throws an exception if that fails.
+    void cleanup();
+    /// Removes the nodes; returns false instead of throwing if that fails.
+    bool tryCleanupAfterError() noexcept;
+
+private:
+    /// Does the removal for both functions above and stores the outcome in `cleanup_result`.
+    bool tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind);
+
+    const String zookeeper_path;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const LoggerPtr log;
+
+    /// Outcome of the removal: either `succeeded` is set or `exception` keeps the error of the failed attempt.
+    struct CleanupResult
+    {
+        bool succeeded = false;
+        std::exception_ptr exception;
+    };
+    CleanupResult cleanup_result TSA_GUARDED_BY(mutex);
+
+    std::mutex mutex;
+};
+
+}
--- a/src/Backups/BackupCoordinationLocal.cpp
+++ b/src/Backups/BackupCoordinationLocal.cpp
@ -1,5 +1,7 @@
 #include <Backups/BackupCoordinationLocal.h>
+
 #include <Common/Exception.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/logger_useful.h>
 #include <Common/quoteString.h>
 #include <fmt/format.h>
@ -8,27 +10,20 @@
 namespace DB
 {

-BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_)
-    : log(getLogger("BackupCoordinationLocal")), file_infos(plain_backup_)
+BackupCoordinationLocal::BackupCoordinationLocal(
+    const UUID & backup_uuid_,
+    bool is_plain_backup_,
+    bool allow_concurrent_backup_,
+    BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("BackupCoordinationLocal"))
+    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ false, allow_concurrent_backup_, concurrency_counters_)
+    , file_infos(is_plain_backup_)
 {
 }

 BackupCoordinationLocal::~BackupCoordinationLocal() = default;

-void BackupCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void BackupCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings BackupCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings BackupCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+ZooKeeperRetriesInfo BackupCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
    return {};
 }
@ -135,15 +130,4 @@ bool BackupCoordinationLocal::startWritingFile(size_t data_file_index)
    return writing_files.emplace(data_file_index).second;
 }

-
-bool BackupCoordinationLocal::hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const
-{
-    if (num_active_backups > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_backups={}", num_active_backups);
-        return true;
-    }
-    return false;
-}
-
 }
--- a/src/Backups/BackupCoordinationLocal.h
+++ b/src/Backups/BackupCoordinationLocal.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedSQLObjects.h>
@ -21,13 +22,21 @@ namespace DB
 class BackupCoordinationLocal : public IBackupCoordination
 {
 public:
-    explicit BackupCoordinationLocal(bool plain_backup_);
+    explicit BackupCoordinationLocal(
+        const UUID & backup_uuid_,
+        bool is_plain_backup_,
+        bool allow_concurrent_backup_,
+        BackupConcurrencyCounters & concurrency_counters_);
+
    ~BackupCoordinationLocal() override;

-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String &, const String &, bool) override { return {}; }
+    void setBackupQueryWasSentToOtherHosts() override {}
+    bool trySetError(std::exception_ptr) override { return true; }
+    void finish() override {}
+    bool tryFinishAfterError() noexcept override { return true; }
+    void waitForOtherHostsToFinish() override {}
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }

    void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name,
                                const std::vector<PartNameAndChecksum> & part_names_and_checksums) override;
@ -54,17 +63,18 @@ public:
    BackupFileInfos getFileInfosForAllHosts() const override;
    bool startWritingFile(size_t data_file_index) override;

-    bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    LoggerPtr const log;
+    BackupConcurrencyCheck concurrency_check;

-    BackupCoordinationReplicatedTables TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
-    BackupCoordinationReplicatedAccess TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
-    BackupCoordinationReplicatedSQLObjects TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
-    BackupCoordinationFileInfos TSA_GUARDED_BY(file_infos_mutex) file_infos;
+    BackupCoordinationReplicatedTables replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
+    BackupCoordinationReplicatedAccess replicated_access TSA_GUARDED_BY(replicated_access_mutex);
+    BackupCoordinationReplicatedSQLObjects replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex);
+    BackupCoordinationFileInfos file_infos TSA_GUARDED_BY(file_infos_mutex);
    BackupCoordinationKeeperMapTables keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex);
-    std::unordered_set<size_t> TSA_GUARDED_BY(writing_files_mutex) writing_files;
+    std::unordered_set<size_t> writing_files TSA_GUARDED_BY(writing_files_mutex);

    mutable std::mutex replicated_tables_mutex;
    mutable std::mutex replicated_access_mutex;
--- a/src/Backups/BackupCoordinationOnCluster.cpp
+++ b/src/Backups/BackupCoordinationOnCluster.cpp
@ -1,7 +1,4 @@
-#include <Backups/BackupCoordinationRemote.h>
-
-#include <base/hex.h>
-#include <boost/algorithm/string/split.hpp>
+#include <Backups/BackupCoordinationOnCluster.h>

 #include <Access/Common/AccessEntityType.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
@ -26,8 +23,6 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

-namespace Stage = BackupCoordinationStage;
-
 namespace
 {
    using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
@ -149,144 +144,152 @@ namespace
    };
 }

-size_t BackupCoordinationRemote::findCurrentHostIndex(const Strings & all_hosts, const String & current_host)
+Strings BackupCoordinationOnCluster::excludeInitiator(const Strings & all_hosts)
+{
+    Strings all_hosts_without_initiator = all_hosts;
+    bool has_initiator = (std::erase(all_hosts_without_initiator, kInitiator) > 0);
+    chassert(has_initiator);
+    return all_hosts_without_initiator;
+}
+
+size_t BackupCoordinationOnCluster::findCurrentHostIndex(const String & current_host, const Strings & all_hosts)
 {
    auto it = std::find(all_hosts.begin(), all_hosts.end(), current_host);
    if (it == all_hosts.end())
-        return 0;
+        return all_hosts.size();
    return it - all_hosts.begin();
 }

-BackupCoordinationRemote::BackupCoordinationRemote(
-    zkutil::GetZooKeeper get_zookeeper_,
+
+BackupCoordinationOnCluster::BackupCoordinationOnCluster(
+    const UUID & backup_uuid_,
+    bool is_plain_backup_,
    const String & root_zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_,
    const BackupKeeperSettings & keeper_settings_,
-    const String & backup_uuid_,
-    const Strings & all_hosts_,
    const String & current_host_,
-    bool plain_backup_,
-    bool is_internal_,
+    const Strings & all_hosts_,
+    bool allow_concurrent_backup_,
+    BackupConcurrencyCounters & concurrency_counters_,
+    ThreadPoolCallbackRunnerUnsafe<void> schedule_,
    QueryStatusPtr process_list_element_)
    : root_zookeeper_path(root_zookeeper_path_)
-    , zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_)
+    , zookeeper_path(root_zookeeper_path_ + "/backup-" + toString(backup_uuid_))
    , keeper_settings(keeper_settings_)
    , backup_uuid(backup_uuid_)
    , all_hosts(all_hosts_)
+    , all_hosts_without_initiator(excludeInitiator(all_hosts))
    , current_host(current_host_)
-    , current_host_index(findCurrentHostIndex(all_hosts, current_host))
-    , plain_backup(plain_backup_)
-    , is_internal(is_internal_)
-    , log(getLogger("BackupCoordinationRemote"))
-    , with_retries(
-        log,
-        get_zookeeper_,
-        keeper_settings,
-        process_list_element_,
-        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
-        (WithRetries::FaultyKeeper & zk)
-        {
-            /// Recreate this ephemeral node to signal that we are alive.
-            if (my_is_internal)
-            {
-                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
-
-                /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
-                zk->tryRemove(alive_node_path);
-
-                zk->createAncestors(alive_node_path);
-                zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            }
-        })
+    , current_host_index(findCurrentHostIndex(current_host, all_hosts))
+    , plain_backup(is_plain_backup_)
+    , log(getLogger("BackupCoordinationOnCluster"))
+    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
+    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ true, allow_concurrent_backup_, concurrency_counters_)
+    , stage_sync(/* is_restore = */ false, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_backup_, with_retries, schedule_, process_list_element_, log)
+    , cleaner(zookeeper_path, with_retries, log)
 {
    createRootNodes();
-
-    stage_sync.emplace(
-        zookeeper_path,
-        with_retries,
-        log);
 }

-BackupCoordinationRemote::~BackupCoordinationRemote()
+BackupCoordinationOnCluster::~BackupCoordinationOnCluster()
 {
-    try
-    {
-        if (!is_internal)
-            removeAllNodes();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
+    tryFinishImpl();
 }

-void BackupCoordinationRemote::createRootNodes()
+void BackupCoordinationOnCluster::createRootNodes()
 {
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
+    auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization);
    holder.retries_ctl.retryLoop(
    [&, &zk = holder.faulty_zookeeper]()
    {
        with_retries.renewZooKeeper(zk);

        zk->createAncestors(zookeeper_path);
-
-        Coordination::Requests ops;
-        Coordination::Responses responses;
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_part_names", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_mutations", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_data_paths", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/file_infos", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/writing_files", "", zkutil::CreateMode::Persistent));
-        zk->tryMulti(ops, responses);
+        zk->createIfNotExists(zookeeper_path, "");
+        zk->createIfNotExists(zookeeper_path + "/repl_part_names", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_mutations", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_access", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_sql_objects", "");
+        zk->createIfNotExists(zookeeper_path + "/keeper_map_tables", "");
+        zk->createIfNotExists(zookeeper_path + "/file_infos", "");
+        zk->createIfNotExists(zookeeper_path + "/writing_files", "");
    });
 }

-void BackupCoordinationRemote::removeAllNodes()
+Strings BackupCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
 {
-    auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
-    holder.retries_ctl.retryLoop(
-    [&, &zk = holder.faulty_zookeeper]()
+    stage_sync.setStage(new_stage, message);
+
+    if (!sync)
+        return {};
+
+    return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
+}
+
+void BackupCoordinationOnCluster::setBackupQueryWasSentToOtherHosts()
+{
+    backup_query_was_sent_to_other_hosts = true;
+}
+
+bool BackupCoordinationOnCluster::trySetError(std::exception_ptr exception)
+{
+    return stage_sync.trySetError(exception);
+}
+
+void BackupCoordinationOnCluster::finish()
+{
+    bool other_hosts_also_finished = false;
+    stage_sync.finish(other_hosts_also_finished);
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
+        cleaner.cleanup();
+}
+
+bool BackupCoordinationOnCluster::tryFinishAfterError() noexcept
+{
+    return tryFinishImpl();
+}
+
+bool BackupCoordinationOnCluster::tryFinishImpl() noexcept
+{
+    bool other_hosts_also_finished = false;
+    if (!stage_sync.tryFinishAfterError(other_hosts_also_finished))
+        return false;
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
    {
-        /// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore.
-        ///
-        /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
-        /// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination
-        /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part
-        /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that).
-        with_retries.renewZooKeeper(zk);
-        zk->removeRecursive(zookeeper_path);
-    });
+        if (!cleaner.tryCleanupAfterError())
+            return false;
+    }
+
+    return true;
 }

-
-void BackupCoordinationRemote::setStage(const String & new_stage, const String & message)
+void BackupCoordinationOnCluster::waitForOtherHostsToFinish()
 {
-    if (is_internal)
-        stage_sync->set(current_host, new_stage, message);
-    else
-        stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true);
+    if ((current_host != kInitiator) || !backup_query_was_sent_to_other_hosts)
+        return;
+    stage_sync.waitForOtherHostsToFinish();
 }

-void BackupCoordinationRemote::setError(const Exception & exception)
+bool BackupCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
 {
-    stage_sync->setError(current_host, exception);
+    if (current_host != kInitiator)
+        return false;
+    if (!backup_query_was_sent_to_other_hosts)
+        return true;
+    return stage_sync.tryWaitForOtherHostsToFinishAfterError();
 }

-Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait)
+ZooKeeperRetriesInfo BackupCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
 {
-    return stage_sync->wait(all_hosts, stage_to_wait);
+    return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing,
+                                static_cast<UInt64>(keeper_settings.retry_initial_backoff_ms.count()),
+                                static_cast<UInt64>(keeper_settings.retry_max_backoff_ms.count())};
 }

-Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout)
-{
-    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
-}
-
-
-void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
+void BackupCoordinationOnCluster::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
 {
    {
        auto holder = with_retries.createRetriesControlHolder(logging_name + "::create");
@ -301,7 +304,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
    if (value.empty())
        return;

-    size_t max_part_size = keeper_settings.keeper_value_max_size;
+    size_t max_part_size = keeper_settings.value_max_size;
    if (!max_part_size)
        max_part_size = value.size();

@ -324,7 +327,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
    }
 }

-String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const
+String BackupCoordinationOnCluster::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const
 {
    Strings part_names;

@ -357,7 +360,7 @@ String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str
 }


-void BackupCoordinationRemote::addReplicatedPartNames(
+void BackupCoordinationOnCluster::addReplicatedPartNames(
    const String & table_zk_path,
    const String & table_name_for_logs,
    const String & replica_name,
@ -381,14 +384,14 @@ void BackupCoordinationRemote::addReplicatedPartNames(
    });
 }

-Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const
+Strings BackupCoordinationOnCluster::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
    return replicated_tables->getPartNames(table_zk_path, replica_name);
 }

-void BackupCoordinationRemote::addReplicatedMutations(
+void BackupCoordinationOnCluster::addReplicatedMutations(
    const String & table_zk_path,
    const String & table_name_for_logs,
    const String & replica_name,
@ -412,7 +415,7 @@ void BackupCoordinationRemote::addReplicatedMutations(
        });
 }

-std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const
+std::vector<IBackupCoordination::MutationInfo> BackupCoordinationOnCluster::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
@ -420,7 +423,7 @@ std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getRepl
 }


-void BackupCoordinationRemote::addReplicatedDataPath(
+void BackupCoordinationOnCluster::addReplicatedDataPath(
    const String & table_zk_path, const String & data_path)
 {
    {
@ -441,7 +444,7 @@ void BackupCoordinationRemote::addReplicatedDataPath(
    });
 }

-Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk_path) const
+Strings BackupCoordinationOnCluster::getReplicatedDataPaths(const String & table_zk_path) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
@ -449,7 +452,7 @@ Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk
 }


-void BackupCoordinationRemote::prepareReplicatedTables() const
+void BackupCoordinationOnCluster::prepareReplicatedTables() const
 {
    if (replicated_tables)
        return;
@ -536,7 +539,7 @@ void BackupCoordinationRemote::prepareReplicatedTables() const
        replicated_tables->addDataPath(std::move(data_paths));
 }

-void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
+void BackupCoordinationOnCluster::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
 {
    {
        std::lock_guard lock{replicated_access_mutex};
@ -558,14 +561,14 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access
    });
 }

-Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
+Strings BackupCoordinationOnCluster::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
 {
    std::lock_guard lock{replicated_access_mutex};
    prepareReplicatedAccess();
    return replicated_access->getFilePaths(access_zk_path, access_entity_type, current_host);
 }

-void BackupCoordinationRemote::prepareReplicatedAccess() const
+void BackupCoordinationOnCluster::prepareReplicatedAccess() const
 {
    if (replicated_access)
        return;
@ -601,7 +604,7 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const
        replicated_access->addFilePath(std::move(file_path));
 }

-void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
+void BackupCoordinationOnCluster::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
 {
    {
        std::lock_guard lock{replicated_sql_objects_mutex};
@ -631,14 +634,14 @@ void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_
    });
 }

-Strings BackupCoordinationRemote::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
+Strings BackupCoordinationOnCluster::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
 {
    std::lock_guard lock{replicated_sql_objects_mutex};
    prepareReplicatedSQLObjects();
    return replicated_sql_objects->getDirectories(loader_zk_path, object_type, current_host);
 }

-void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
+void BackupCoordinationOnCluster::prepareReplicatedSQLObjects() const
 {
    if (replicated_sql_objects)
        return;
@ -674,7 +677,7 @@ void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
        replicated_sql_objects->addDirectory(std::move(directory));
 }

-void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup)
+void BackupCoordinationOnCluster::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup)
 {
    {
        std::lock_guard lock{keeper_map_tables_mutex};
@ -695,7 +698,7 @@ void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_
    });
 }

-void BackupCoordinationRemote::prepareKeeperMapTables() const
+void BackupCoordinationOnCluster::prepareKeeperMapTables() const
 {
    if (keeper_map_tables)
        return;
@ -740,7 +743,7 @@ void BackupCoordinationRemote::prepareKeeperMapTables() const

 }

-String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zookeeper_root_path) const
+String BackupCoordinationOnCluster::getKeeperMapDataPath(const String & table_zookeeper_root_path) const
 {
    std::lock_guard lock(keeper_map_tables_mutex);
    prepareKeeperMapTables();
@ -748,7 +751,7 @@ String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zooke
 }


-void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
+void BackupCoordinationOnCluster::addFileInfos(BackupFileInfos && file_infos_)
 {
    {
        std::lock_guard lock{file_infos_mutex};
@ -761,21 +764,21 @@ void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
    serializeToMultipleZooKeeperNodes(zookeeper_path + "/file_infos/" + current_host, file_infos_str, "addFileInfos");
 }

-BackupFileInfos BackupCoordinationRemote::getFileInfos() const
+BackupFileInfos BackupCoordinationOnCluster::getFileInfos() const
 {
    std::lock_guard lock{file_infos_mutex};
    prepareFileInfos();
    return file_infos->getFileInfos(current_host);
 }

-BackupFileInfos BackupCoordinationRemote::getFileInfosForAllHosts() const
+BackupFileInfos BackupCoordinationOnCluster::getFileInfosForAllHosts() const
 {
    std::lock_guard lock{file_infos_mutex};
    prepareFileInfos();
    return file_infos->getFileInfosForAllHosts();
 }

-void BackupCoordinationRemote::prepareFileInfos() const
+void BackupCoordinationOnCluster::prepareFileInfos() const
 {
    if (file_infos)
        return;
@ -801,7 +804,7 @@ void BackupCoordinationRemote::prepareFileInfos() const
    }
 }

-bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
+bool BackupCoordinationOnCluster::startWritingFile(size_t data_file_index)
 {
    {
        /// Check if this host is already writing this file.
@ -842,66 +845,4 @@ bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
    }
 }

-bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic<size_t> &) const
-{
-    /// If its internal concurrency will be checked for the base backup
-    if (is_internal)
-        return false;
-
-    std::string backup_stage_path = zookeeper_path + "/stage";
-
-    bool result = false;
-
-    auto holder = with_retries.createRetriesControlHolder("getAllArchiveSuffixes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-    {
-        with_retries.renewZooKeeper(zk);
-
-        if (!zk->exists(root_zookeeper_path))
-            zk->createAncestors(root_zookeeper_path);
-
-        for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-        {
-            Coordination::Stat stat;
-            zk->get(root_zookeeper_path, &stat);
-            Strings existing_backup_paths = zk->getChildren(root_zookeeper_path);
-
-            for (const auto & existing_backup_path : existing_backup_paths)
-            {
-                if (startsWith(existing_backup_path, "restore-"))
-                    continue;
-
-                String existing_backup_uuid = existing_backup_path;
-                existing_backup_uuid.erase(0, String("backup-").size());
-
-                if (existing_backup_uuid == toString(backup_uuid))
-                    continue;
-
-                String status;
-                if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status))
-                {
-                    /// Check if some other backup is in progress
-                    if (status == Stage::SCHEDULED_TO_START)
-                    {
-                        LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid));
-                        result = true;
-                        return;
-                    }
-                }
-            }
-
-            zk->createIfNotExists(backup_stage_path, "");
-            auto code = zk->trySet(backup_stage_path, Stage::SCHEDULED_TO_START, stat.version);
-            if (code == Coordination::Error::ZOK)
-                break;
-            bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-            if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                throw zkutil::KeeperException::fromPath(code, backup_stage_path);
-        }
-    });
-
-    return result;
-}
-
 }
--- a/src/Backups/BackupCoordinationOnCluster.h
+++ b/src/Backups/BackupCoordinationOnCluster.h
@ -1,6 +1,8 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
+#include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedSQLObjects.h>
@ -13,32 +15,35 @@
 namespace DB
 {

-/// We try to store data to zookeeper several times due to possible version conflicts.
-constexpr size_t MAX_ZOOKEEPER_ATTEMPTS = 10;
-
 /// Implementation of the IBackupCoordination interface performing coordination via ZooKeeper. It's necessary for "BACKUP ON CLUSTER".
-class BackupCoordinationRemote : public IBackupCoordination
+class BackupCoordinationOnCluster : public IBackupCoordination
 {
 public:
-    using BackupKeeperSettings = WithRetries::KeeperSettings;
+    /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;

-    BackupCoordinationRemote(
-        zkutil::GetZooKeeper get_zookeeper_,
+    BackupCoordinationOnCluster(
+        const UUID & backup_uuid_,
+        bool is_plain_backup_,
        const String & root_zookeeper_path_,
+        zkutil::GetZooKeeper get_zookeeper_,
        const BackupKeeperSettings & keeper_settings_,
-        const String & backup_uuid_,
-        const Strings & all_hosts_,
        const String & current_host_,
-        bool plain_backup_,
-        bool is_internal_,
+        const Strings & all_hosts_,
+        bool allow_concurrent_backup_,
+        BackupConcurrencyCounters & concurrency_counters_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_);

-    ~BackupCoordinationRemote() override;
+    ~BackupCoordinationOnCluster() override;

-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String & new_stage, const String & message, bool sync) override;
+    void setBackupQueryWasSentToOtherHosts() override;
+    bool trySetError(std::exception_ptr exception) override;
+    void finish() override;
+    bool tryFinishAfterError() noexcept override;
+    void waitForOtherHostsToFinish() override;
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;

    void addReplicatedPartNames(
        const String & table_zk_path,
@ -73,13 +78,14 @@ public:
    BackupFileInfos getFileInfosForAllHosts() const override;
    bool startWritingFile(size_t data_file_index) override;

-    bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

-    static size_t findCurrentHostIndex(const Strings & all_hosts, const String & current_host);
+    static Strings excludeInitiator(const Strings & all_hosts);
+    static size_t findCurrentHostIndex(const String & current_host, const Strings & all_hosts);

 private:
    void createRootNodes();
-    void removeAllNodes();
+    bool tryFinishImpl() noexcept;

    void serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name);
    String deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const;
@ -96,26 +102,27 @@ private:
    const String root_zookeeper_path;
    const String zookeeper_path;
    const BackupKeeperSettings keeper_settings;
-    const String backup_uuid;
+    const UUID backup_uuid;
    const Strings all_hosts;
+    const Strings all_hosts_without_initiator;
    const String current_host;
    const size_t current_host_index;
    const bool plain_backup;
-    const bool is_internal;
    LoggerPtr const log;

-    /// The order of these two fields matters, because stage_sync holds a reference to with_retries object
-    mutable WithRetries with_retries;
-    std::optional<BackupCoordinationStageSync> stage_sync;
+    const WithRetries with_retries;
+    BackupConcurrencyCheck concurrency_check;
+    BackupCoordinationStageSync stage_sync;
+    BackupCoordinationCleaner cleaner;
+    std::atomic<bool> backup_query_was_sent_to_other_hosts = false;

-    mutable std::optional<BackupCoordinationReplicatedTables> TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
-    mutable std::optional<BackupCoordinationReplicatedAccess> TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
-    mutable std::optional<BackupCoordinationReplicatedSQLObjects> TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
-    mutable std::optional<BackupCoordinationFileInfos> TSA_GUARDED_BY(file_infos_mutex) file_infos;
+    mutable std::optional<BackupCoordinationReplicatedTables> replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
+    mutable std::optional<BackupCoordinationReplicatedAccess> replicated_access TSA_GUARDED_BY(replicated_access_mutex);
+    mutable std::optional<BackupCoordinationReplicatedSQLObjects> replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex);
+    mutable std::optional<BackupCoordinationFileInfos> file_infos TSA_GUARDED_BY(file_infos_mutex);
    mutable std::optional<BackupCoordinationKeeperMapTables> keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex);
-    std::unordered_set<size_t> TSA_GUARDED_BY(writing_files_mutex) writing_files;
+    std::unordered_set<size_t> writing_files TSA_GUARDED_BY(writing_files_mutex);

-    mutable std::mutex zookeeper_mutex;
    mutable std::mutex replicated_tables_mutex;
    mutable std::mutex replicated_access_mutex;
    mutable std::mutex replicated_sql_objects_mutex;
--- a/src/Backups/BackupCoordinationStage.h
+++ b/src/Backups/BackupCoordinationStage.h
@ -8,10 +8,6 @@ namespace DB

 namespace BackupCoordinationStage
 {
-    /// This stage is set after concurrency check so ensure we dont start other backup/restores
-    /// when concurrent backup/restores are not allowed
-    constexpr const char * SCHEDULED_TO_START = "scheduled to start";
-
    /// Finding all tables and databases which we're going to put to the backup and collecting their metadata.
    constexpr const char * GATHERING_METADATA = "gathering metadata";

@ -46,10 +42,6 @@ namespace BackupCoordinationStage

    /// Coordination stage meaning that a host finished its work.
    constexpr const char * COMPLETED = "completed";
-
-    /// Coordination stage meaning that backup/restore has failed due to an error
-    /// Check '/error' for the error message
-    constexpr const char * ERROR = "error";
 }

 }
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@ -10,33 +10,193 @@ class BackupCoordinationStageSync
 {
 public:
    BackupCoordinationStageSync(
-        const String & root_zookeeper_path_,
-        WithRetries & with_retries_,
+        bool is_restore_,                    /// true if this is a RESTORE ON CLUSTER command, false if this is a BACKUP ON CLUSTER command
+        const String & zookeeper_path_,      /// path to the "stage" folder in ZooKeeper
+        const String & current_host_,        /// the current host, or an empty string if it's the initiator of the BACKUP/RESTORE ON CLUSTER command
+        const Strings & all_hosts_,          /// all the hosts (including the initiator and the current host) performing the BACKUP/RESTORE ON CLUSTER command
+        bool allow_concurrency_,             /// whether it's allowed to have concurrent backups or restores.
+        const WithRetries & with_retries_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
+        QueryStatusPtr process_list_element_,
        LoggerPtr log_);

+    ~BackupCoordinationStageSync();
+
    /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
-    void set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts = false);
-    void setError(const String & current_host, const Exception & exception);
+    void setStage(const String & stage, const String & stage_result = {});

-    /// Sets the stage of the current host and waits until all hosts come to the same stage.
-    /// The function returns the messages all hosts set when they come to the required stage.
-    Strings wait(const Strings & all_hosts, const String & stage_to_wait);
+    /// Waits until all the specified hosts come to the specified stage.
+    /// The function returns the results which specified hosts set when they came to the required stage.
+    /// If it doesn't happen before the timeout then the function will stop waiting and throw an exception.
+    Strings waitForHostsToReachStage(const String & stage_to_wait, const Strings & hosts, std::optional<std::chrono::milliseconds> timeout = {}) const;

-    /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time.
-    Strings waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout);
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    void waitForOtherHostsToFinish() const;
+
+    /// Lets other hosts know that the current host has finished its work.
+    void finish(bool & other_hosts_also_finished);
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(std::exception_ptr exception) noexcept;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    bool tryWaitForOtherHostsToFinishAfterError() const noexcept;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    bool tryFinishAfterError(bool & other_hosts_also_finished) noexcept;
+
+    /// Returns a printable name of a specific host. For empty host the function returns "initiator".
+    static String getHostDesc(const String & host);
+    static String getHostsDesc(const Strings & hosts);

 private:
+    /// Initializes the original state. It will be updated then with readCurrentState().
+    void initializeState();
+
+    /// Creates the root node in ZooKeeper.
    void createRootNodes();

-    struct State;
-    State readCurrentState(WithRetries::RetriesControlHolder & retries_control_holder, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
+    /// Atomically creates both 'start' and 'alive' nodes and also checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void createStartAndAliveNodes();
+    void createStartAndAliveNodes(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);

-    Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;
+    /// Deserializes the version of a node stored in the 'start' node.
+    int parseStartNode(const String & start_node_contents, const String & host) const;

-    String zookeeper_path;
-    /// A reference to the field of parent object - BackupCoordinationRemote or RestoreCoordinationRemote
-    WithRetries & with_retries;
-    LoggerPtr log;
+    /// Recreates the 'alive' node if it doesn't exist. It's an ephemeral node so it's removed automatically after disconnections.
+    void createAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void checkConcurrency(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Watching thread periodically reads the current state from ZooKeeper and recreates the 'alive' node.
+    void startWatchingThread();
+    void stopWatchingThread();
+    void watchingThread();
+
+    /// Reads the current state from ZooKeeper without throwing exceptions.
+    void readCurrentState(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+    String getStageNodePath(const String & stage) const;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(const Exception & exception);
+    void setError(const Exception & exception);
+
+    /// Deserializes an error stored in the error node.
+    static std::pair<std::exception_ptr, String> parseErrorNode(const String & error_node_contents);
+
+    /// Resets the `connected` flag for each host.
+    void resetConnectedFlag();
+
+    /// Checks if the current query is cancelled, and if so then the function sets the `cancelled` flag in the current state.
+    void checkIfQueryCancelled();
+
+    /// Checks if the current state contains an error, and if so then the function passes this error to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfError();
+
+    /// Checks if some host was disconnected for too long, and if so then the function generates an error and passes it to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfDisconnectedTooLong();
+
+    /// Used by waitForHostsToReachStage() to check if everything is ready to return.
+    bool checkIfHostsReachStage(const Strings & hosts, const String & stage_to_wait, bool time_is_out, std::optional<std::chrono::milliseconds> timeout, Strings & results) const TSA_REQUIRES(mutex);
+
+    /// Creates the 'finish' node.
+    bool tryFinishImpl();
+    bool tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind);
+    void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Returns the version used by the initiator.
+    int getInitiatorVersion() const;
+
+    /// Waits until all the other hosts finish their work.
+    bool tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::seconds> timeout) const;
+    bool checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::milliseconds> timeout) const TSA_REQUIRES(mutex);
+
+    const bool is_restore;
+    const String operation_name;
+    const String current_host;
+    const String current_host_desc;
+    const Strings all_hosts;
+    const bool allow_concurrency;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const ThreadPoolCallbackRunnerUnsafe<void> schedule;
+    const QueryStatusPtr process_list_element;
+    const LoggerPtr log;
+
+    const std::chrono::seconds failure_after_host_disconnected_for_seconds;
+    const std::chrono::seconds finish_timeout_after_error;
+    const std::chrono::milliseconds sync_period_ms;
+    const size_t max_attempts_after_bad_version;
+
+    /// Paths in ZooKeeper.
+    const std::filesystem::path zookeeper_path;
+    const String root_zookeeper_path;
+    const String operation_node_path;
+    const String operation_node_name;
+    const String stage_node_path;
+    const String start_node_path;
+    const String finish_node_path;
+    const String num_hosts_node_path;
+    const String alive_node_path;
+    const String alive_tracker_node_path;
+    const String error_node_path;
+
+    std::shared_ptr<Poco::Event> zk_nodes_changed;
+
+    /// We store list of previously found ZooKeeper nodes to show better logging messages.
+    Strings zk_nodes;
+
+    /// Information about one host read from ZooKeeper.
+    struct HostInfo
+    {
+        String host;
+        bool started = false;
+        bool connected = false;
+        bool finished = false;
+        int version = 1;
+        std::map<String /* stage */, String /* result */> stages = {}; /// std::map because we need to compare states
+        std::exception_ptr exception = nullptr;
+
+        std::chrono::time_point<std::chrono::system_clock> last_connection_time = {};
+        std::chrono::time_point<std::chrono::steady_clock> last_connection_time_monotonic = {};
+
+        bool operator ==(const HostInfo & other) const;
+        bool operator !=(const HostInfo & other) const;
+    };
+
+    /// Information about all the hosts participating in the current BACKUP or RESTORE operation.
+    struct State
+    {
+        std::map<String /* host */, HostInfo> hosts; /// std::map because we need to compare states
+        std::optional<String> host_with_error;
+        bool cancelled = false;
+
+        bool operator ==(const State & other) const;
+        bool operator !=(const State & other) const;
+    };
+
+    State state TSA_GUARDED_BY(mutex);
+    mutable std::condition_variable state_changed;
+
+    std::future<void> watching_thread_future;
+    std::atomic<bool> should_stop_watching_thread = false;
+
+    struct FinishResult
+    {
+        bool succeeded = false;
+        std::exception_ptr exception;
+        bool other_hosts_also_finished = false;
+    };
+    FinishResult finish_result TSA_GUARDED_BY(mutex);
+
+    mutable std::mutex mutex;
 };

 }
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@ -102,7 +102,6 @@ BackupEntriesCollector::BackupEntriesCollector(
    , read_settings(read_settings_)
    , context(context_)
    , process_list_element(context->getProcessListElement())
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , collect_metadata_timeout(context->getConfigRef().getUInt64(
          "backups.collect_metadata_timeout", context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000)))
    , attempts_to_collect_metadata_before_sleep(context->getConfigRef().getUInt("backups.attempts_to_collect_metadata_before_sleep", 2))
@ -176,21 +175,7 @@ Strings BackupEntriesCollector::setStage(const String & new_stage, const String
    checkIsQueryCancelled();

    current_stage = new_stage;
-    backup_coordination->setStage(new_stage, message);
-
-    if (new_stage == Stage::formatGatheringMetadata(0))
-    {
-        return backup_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-    }
-    if (new_stage.starts_with(Stage::GATHERING_METADATA))
-    {
-        auto current_time = std::chrono::steady_clock::now();
-        auto end_of_timeout = std::max(current_time, collect_metadata_end_time);
-        return backup_coordination->waitForStage(
-            new_stage, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - current_time));
-    }
-
-    return backup_coordination->waitForStage(new_stage);
+    return backup_coordination->setStage(new_stage, message, /* sync = */ true);
 }

 void BackupEntriesCollector::checkIsQueryCancelled() const
--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@ -111,10 +111,6 @@ private:
    ContextPtr context;
    QueryStatusPtr process_list_element;

-    /// The time a BACKUP ON CLUSTER or RESTORE ON CLUSTER command will wait until all the nodes receive the BACKUP (or RESTORE) query and start working.
-    /// This setting is similar to `distributed_ddl_task_timeout`.
-    const std::chrono::milliseconds on_cluster_first_sync_timeout;
-
    /// The time a BACKUP command will try to collect the metadata of tables & databases.
    const std::chrono::milliseconds collect_metadata_timeout;

--- a/src/Backups/BackupIO.h
+++ b/src/Backups/BackupIO.h
@ -5,6 +5,7 @@

 namespace DB
 {
+
 class IDisk;
 using DiskPtr = std::shared_ptr<IDisk>;
 class SeekableReadBuffer;
@ -63,9 +64,13 @@ public:

    virtual void copyFile(const String & destination, const String & source, size_t size) = 0;

+    /// Removes a file written to the backup, if it still exists.
    virtual void removeFile(const String & file_name) = 0;
    virtual void removeFiles(const Strings & file_names) = 0;

+    /// Removes empty subfolders of the backup folder recursively, and the backup folder itself if it ends up empty.
+    virtual void removeEmptyDirectories() = 0;
+
    virtual const ReadSettings & getReadSettings() const = 0;
    virtual const WriteSettings & getWriteSettings() const = 0;
    virtual size_t getWriteBufferSize() const = 0;
--- a/src/Backups/BackupIO_AzureBlobStorage.h
+++ b/src/Backups/BackupIO_AzureBlobStorage.h
@ -81,6 +81,7 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override {}

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
--- a/src/Backups/BackupIO_Disk.cpp
+++ b/src/Backups/BackupIO_Disk.cpp
@ -91,16 +91,36 @@ std::unique_ptr<WriteBuffer> BackupWriterDisk::writeFile(const String & file_nam
 void BackupWriterDisk::removeFile(const String & file_name)
 {
    disk->removeFileIfExists(root_path / file_name);
-    if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path))
-        disk->removeDirectory(root_path);
 }

 void BackupWriterDisk::removeFiles(const Strings & file_names)
 {
    for (const auto & file_name : file_names)
        disk->removeFileIfExists(root_path / file_name);
-    if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path))
-        disk->removeDirectory(root_path);
+}
+
+void BackupWriterDisk::removeEmptyDirectories()
+{
+    removeEmptyDirectoriesImpl(root_path);
+}
+
+void BackupWriterDisk::removeEmptyDirectoriesImpl(const fs::path & current_dir)
+{
+    if (!disk->existsDirectory(current_dir))
+        return;
+
+    if (disk->isDirectoryEmpty(current_dir))
+    {
+        disk->removeDirectory(current_dir);
+        return;
+    }
+
+    /// Backups are not too deep, so recursion is good enough here.
+    for (auto it = disk->iterateDirectory(current_dir); it->isValid(); it->next())
+        removeEmptyDirectoriesImpl(current_dir / it->name());
+
+    if (disk->isDirectoryEmpty(current_dir))
+        disk->removeDirectory(current_dir);
 }

 void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path,
--- a/src/Backups/BackupIO_Disk.h
+++ b/src/Backups/BackupIO_Disk.h
@ -50,9 +50,11 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override;

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
+    void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir);

    const DiskPtr disk;
    const std::filesystem::path root_path;
--- a/src/Backups/BackupIO_File.cpp
+++ b/src/Backups/BackupIO_File.cpp
@ -106,16 +106,36 @@ std::unique_ptr<WriteBuffer> BackupWriterFile::writeFile(const String & file_nam
 void BackupWriterFile::removeFile(const String & file_name)
 {
    (void)fs::remove(root_path / file_name);
-    if (fs::is_directory(root_path) && fs::is_empty(root_path))
-        (void)fs::remove(root_path);
 }

 void BackupWriterFile::removeFiles(const Strings & file_names)
 {
    for (const auto & file_name : file_names)
        (void)fs::remove(root_path / file_name);
-    if (fs::is_directory(root_path) && fs::is_empty(root_path))
-        (void)fs::remove(root_path);
+}
+
+void BackupWriterFile::removeEmptyDirectories()
+{
+    removeEmptyDirectoriesImpl(root_path);
+}
+
+void BackupWriterFile::removeEmptyDirectoriesImpl(const fs::path & current_dir)
+{
+    if (!fs::is_directory(current_dir))
+        return;
+
+    if (fs::is_empty(current_dir))
+    {
+        (void)fs::remove(current_dir);
+        return;
+    }
+
+    /// Backups are not too deep, so recursion is good enough here.
+    for (const auto & it : std::filesystem::directory_iterator{current_dir})
+        removeEmptyDirectoriesImpl(it.path());
+
+    if (fs::is_empty(current_dir))
+        (void)fs::remove(current_dir);
 }

 void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path,
--- a/src/Backups/BackupIO_File.h
+++ b/src/Backups/BackupIO_File.h
@ -42,9 +42,11 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override;

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
+    void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir);

    const std::filesystem::path root_path;
    const DataSourceDescription data_source_description;
--- a/src/Backups/BackupIO_S3.h
+++ b/src/Backups/BackupIO_S3.h
@ -74,6 +74,7 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override {}

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
--- a/src/Backups/BackupImpl.cpp
+++ b/src/Backups/BackupImpl.cpp
@ -147,11 +147,11 @@ BackupImpl::BackupImpl(

 BackupImpl::~BackupImpl()
 {
-    if ((open_mode == OpenMode::WRITE) && !is_internal_backup && !writing_finalized && !std::uncaught_exceptions() && !std::current_exception())
+    if ((open_mode == OpenMode::WRITE) && !writing_finalized && !corrupted)
    {
        /// It is suspicious to destroy BackupImpl without finalization while writing a backup when there is no exception.
-        LOG_ERROR(log, "BackupImpl is not finalized when destructor is called. Stack trace: {}", StackTrace().toString());
-        chassert(false && "BackupImpl is not finalized when destructor is called.");
+        LOG_ERROR(log, "BackupImpl is not finalized or marked as corrupted when destructor is called. Stack trace: {}", StackTrace().toString());
+        chassert(false, "BackupImpl is not finalized or marked as corrupted when destructor is called.");
    }

    try
@ -196,9 +196,6 @@ void BackupImpl::open()

    if (open_mode == OpenMode::READ)
        readBackupMetadata();
-
-    if ((open_mode == OpenMode::WRITE) && base_backup_info)
-        base_backup_uuid = getBaseBackupUnlocked()->getUUID();
 }

 void BackupImpl::close()
@ -280,6 +277,8 @@ std::shared_ptr<const IBackup> BackupImpl::getBaseBackupUnlocked() const
                toString(base_backup->getUUID()),
                (base_backup_uuid ? toString(*base_backup_uuid) : ""));
        }
+
+        base_backup_uuid = base_backup->getUUID();
    }
    return base_backup;
 }
@ -369,7 +368,7 @@ void BackupImpl::writeBackupMetadata()
        if (base_backup_in_use)
        {
            *out << "<base_backup>" << xml << base_backup_info->toString() << "</base_backup>";
-            *out << "<base_backup_uuid>" << toString(*base_backup_uuid) << "</base_backup_uuid>";
+            *out << "<base_backup_uuid>" << getBaseBackupUnlocked()->getUUID() << "</base_backup_uuid>";
        }
    }

@ -594,9 +593,6 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const

 void BackupImpl::removeLockFile()
 {
-    if (is_internal_backup)
-        return; /// Internal backup must not remove the lock file (it's still used by the initiator).
-
    if (checkLockFile(false))
        writer->removeFile(lock_file_name);
 }
@ -989,8 +985,11 @@ void BackupImpl::finalizeWriting()
    if (open_mode != OpenMode::WRITE)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");

+    if (corrupted)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup can't be finalized after an error happened");
+
    if (writing_finalized)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized");
+        return;

    if (!is_internal_backup)
    {
@ -1015,20 +1014,58 @@ void BackupImpl::setCompressedSize()
 }


-void BackupImpl::tryRemoveAllFiles()
+bool BackupImpl::setIsCorrupted() noexcept
 {
-    if (open_mode != OpenMode::WRITE)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");
-
-    if (is_internal_backup)
-        return;
-
    try
    {
-        LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
+        std::lock_guard lock{mutex};
+        if (open_mode != OpenMode::WRITE)
+        {
+            LOG_ERROR(log, "Backup is not opened for writing. Stack trace: {}", StackTrace().toString());
+            chassert(false, "Backup is not opened for writing when setIsCorrupted() is called");
+            return false;
+        }
+
+        if (writing_finalized)
+        {
+            LOG_WARNING(log, "An error happened after the backup was completed successfully, the backup must be correct!");
+            return false;
+        }
+
+        if (corrupted)
+            return true;
+
+        LOG_WARNING(log, "An error happened, the backup won't be completed");
+
        closeArchive(/* finalize= */ false);

+        corrupted = true;
+        return true;
+    }
+    catch (...)
+    {
+        DB::tryLogCurrentException(log, "Caught exception while setting that the backup was corrupted");
+        return false;
+    }
+}
+
+
+bool BackupImpl::tryRemoveAllFiles() noexcept
+{
+    try
+    {
+        std::lock_guard lock{mutex};
+        if (!corrupted)
+        {
+            LOG_ERROR(log, "Backup is not set as corrupted. Stack trace: {}", StackTrace().toString());
+            chassert(false, "Backup is not set as corrupted when tryRemoveAllFiles() is called");
+            return false;
+        }
+
+        LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
+
        Strings files_to_remove;
+
        if (use_archive)
        {
            files_to_remove.push_back(archive_params.archive_name);
@ -1041,14 +1078,17 @@ void BackupImpl::tryRemoveAllFiles()
        }

        if (!checkLockFile(false))
-            return;
+            return false;

        writer->removeFiles(files_to_remove);
        removeLockFile();
+        writer->removeEmptyDirectories();
+        return true;
    }
    catch (...)
    {
-        DB::tryLogCurrentException(__PRETTY_FUNCTION__);
+        DB::tryLogCurrentException(log, "Caught exception while removing files of a corrupted backup");
+        return false;
    }
 }

--- a/src/Backups/BackupImpl.h
+++ b/src/Backups/BackupImpl.h
@ -86,7 +86,8 @@ public:
    void writeFile(const BackupFileInfo & info, BackupEntryPtr entry) override;
    bool supportsWritingInMultipleThreads() const override { return !use_archive; }
    void finalizeWriting() override;
-    void tryRemoveAllFiles() override;
+    bool setIsCorrupted() noexcept override;
+    bool tryRemoveAllFiles() noexcept override;

 private:
    void open();
@ -146,13 +147,14 @@ private:
    int version;
    mutable std::optional<BackupInfo> base_backup_info;
    mutable std::shared_ptr<const IBackup> base_backup;
-    std::optional<UUID> base_backup_uuid;
+    mutable std::optional<UUID> base_backup_uuid;
    std::shared_ptr<IArchiveReader> archive_reader;
    std::shared_ptr<IArchiveWriter> archive_writer;
    String lock_file_name;
    std::atomic<bool> lock_file_before_first_file_checked = false;

    bool writing_finalized = false;
+    bool corrupted = false;
    bool deduplicate_files = true;
    bool use_same_s3_credentials_for_base_backup = false;
    bool use_same_password_for_base_backup = false;
--- a/src/Backups/BackupKeeperSettings.cpp
+++ b/src/Backups/BackupKeeperSettings.cpp
@ -0,0 +1,58 @@
+#include <Backups/BackupKeeperSettings.h>
+
+#include <Core/Settings.h>
+#include <Interpreters/Context.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+
+namespace DB
+{
+
+namespace Setting
+{
+    extern const SettingsUInt64 backup_restore_keeper_max_retries;
+    extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms;
+    extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms;
+    extern const SettingsUInt64 backup_restore_failure_after_host_disconnected_for_seconds;
+    extern const SettingsUInt64 backup_restore_keeper_max_retries_while_initializing;
+    extern const SettingsUInt64 backup_restore_keeper_max_retries_while_handling_error;
+    extern const SettingsUInt64 backup_restore_finish_timeout_after_error_sec;
+    extern const SettingsUInt64 backup_restore_keeper_value_max_size;
+    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi;
+    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread;
+    extern const SettingsFloat backup_restore_keeper_fault_injection_probability;
+    extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed;
+}
+
+BackupKeeperSettings BackupKeeperSettings::fromContext(const ContextPtr & context)
+{
+    BackupKeeperSettings keeper_settings;
+
+    const auto & settings = context->getSettingsRef();
+    const auto & config = context->getConfigRef();
+
+    keeper_settings.max_retries = settings[Setting::backup_restore_keeper_max_retries];
+    keeper_settings.retry_initial_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_initial_backoff_ms]};
+    keeper_settings.retry_max_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_max_backoff_ms]};
+
+    keeper_settings.failure_after_host_disconnected_for_seconds = std::chrono::seconds{settings[Setting::backup_restore_failure_after_host_disconnected_for_seconds]};
+    keeper_settings.max_retries_while_initializing = settings[Setting::backup_restore_keeper_max_retries_while_initializing];
+    keeper_settings.max_retries_while_handling_error = settings[Setting::backup_restore_keeper_max_retries_while_handling_error];
+    keeper_settings.finish_timeout_after_error = std::chrono::seconds(settings[Setting::backup_restore_finish_timeout_after_error_sec]);
+
+    if (config.has("backups.sync_period_ms"))
+        keeper_settings.sync_period_ms = std::chrono::milliseconds{config.getUInt64("backups.sync_period_ms")};
+
+    if (config.has("backups.max_attempts_after_bad_version"))
+        keeper_settings.max_attempts_after_bad_version = config.getUInt64("backups.max_attempts_after_bad_version");
+
+    keeper_settings.value_max_size = settings[Setting::backup_restore_keeper_value_max_size];
+    keeper_settings.batch_size_for_multi = settings[Setting::backup_restore_batch_size_for_keeper_multi];
+    keeper_settings.batch_size_for_multiread = settings[Setting::backup_restore_batch_size_for_keeper_multiread];
+    keeper_settings.fault_injection_probability = settings[Setting::backup_restore_keeper_fault_injection_probability];
+    keeper_settings.fault_injection_seed = settings[Setting::backup_restore_keeper_fault_injection_seed];
+
+    return keeper_settings;
+}
+
+}
--- a/src/Backups/BackupKeeperSettings.h
+++ b/src/Backups/BackupKeeperSettings.h
@ -0,0 +1,64 @@
+#pragma once
+
+#include <Interpreters/Context_fwd.h>
+
+
+namespace DB
+{
+
+/// Settings for [Zoo]Keeper-related work during BACKUP or RESTORE.
+struct BackupKeeperSettings
+{
+    /// Maximum number of retries in the middle of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Should be big enough so the whole operation won't be cancelled in the middle of it because of a temporary ZooKeeper failure.
+    UInt64 max_retries{1000};
+
+    /// Initial backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_initial_backoff_ms{100};
+
+    /// Max backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_max_backoff_ms{5000};
+
+    /// If a host during BACKUP ON CLUSTER or RESTORE ON CLUSTER doesn't recreate its 'alive' node in ZooKeeper
+    /// for this amount of time then the whole backup or restore is considered as failed.
+    /// Should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure.
+    /// Set to zero to disable (if it's zero and some host crashed then BACKUP ON CLUSTER or RESTORE ON CLUSTER will be waiting
+    /// for the crashed host forever until the operation is explicitly cancelled with KILL QUERY).
+    std::chrono::seconds failure_after_host_disconnected_for_seconds{3600};
+
+    /// Maximum number of retries during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because if the operation is going to fail then it's better if it fails faster.
+    UInt64 max_retries_while_initializing{20};
+
+    /// Maximum number of retries while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because those retries are just for cleanup after the operation has failed already.
+    UInt64 max_retries_while_handling_error{20};
+
+    /// How long the initiator should wait for other hosts to handle the 'error' node and finish their work.
+    std::chrono::seconds finish_timeout_after_error{180};
+
+    /// How often the "stage" folder in ZooKeeper must be scanned in a background thread to track changes done by other hosts.
+    std::chrono::milliseconds sync_period_ms{5000};
+
+    /// Number of attempts after getting error ZBADVERSION from ZooKeeper.
+    size_t max_attempts_after_bad_version{10};
+
+    /// Maximum size of data of a ZooKeeper's node during backup.
+    UInt64 value_max_size{1048576};
+
+    /// Maximum size of a batch for a multi request.
+    UInt64 batch_size_for_multi{1000};
+
+    /// Maximum size of a batch for a multiread request.
+    UInt64 batch_size_for_multiread{10000};
+
+    /// Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f].
+    Float64 fault_injection_probability{0};
+
+    /// Seed for `fault_injection_probability`: 0 - random seed, otherwise the setting value.
+    UInt64 fault_injection_seed{0};
+
+    static BackupKeeperSettings fromContext(const ContextPtr & context);
+};
+
+}
--- a/src/Backups/BackupSettings.cpp
+++ b/src/Backups/BackupSettings.cpp
@ -74,6 +74,17 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query)
    return res;
 }

+bool BackupSettings::isAsync(const ASTBackupQuery & query)
+{
+    if (query.settings)
+    {
+        const auto * field = query.settings->as<const ASTSetQuery &>().changes.tryGet("async");
+        if (field)
+            return field->safeGet<bool>();
+    }
+    return false; /// `async` is false by default.
+}
+
 void BackupSettings::copySettingsToQuery(ASTBackupQuery & query) const
 {
    auto query_settings = std::make_shared<ASTSetQuery>();
--- a/src/Backups/BackupSettings.h
+++ b/src/Backups/BackupSettings.h
@ -101,6 +101,8 @@ struct BackupSettings
    static BackupSettings fromBackupQuery(const ASTBackupQuery & query);
    void copySettingsToQuery(ASTBackupQuery & query) const;

+    static bool isAsync(const ASTBackupQuery & query);
+
    struct Util
    {
        static std::vector<Strings> clusterHostIDsFromAST(const IAST & ast);
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -23,6 +23,7 @@ using BackupMutablePtr = std::shared_ptr<IBackup>;
 using BackupPtr = std::shared_ptr<const IBackup>;
 class IBackupEntry;
 using BackupEntries = std::vector<std::pair<String, std::shared_ptr<const IBackupEntry>>>;
+class BackupConcurrencyCounters;
 using DataRestoreTasks = std::vector<std::function<void()>>;
 struct ReadSettings;
 class BackupLog;
@ -31,6 +32,10 @@ using ThreadGroupPtr = std::shared_ptr<ThreadGroup>;
 class QueryStatus;
 using QueryStatusPtr = std::shared_ptr<QueryStatus>;
 class ProcessList;
+class Cluster;
+using ClusterPtr = std::shared_ptr<Cluster>;
+class AccessRightsElements;
+struct ZooKeeperRetriesInfo;


 /// Manager of backups and restores: executes backups and restores' threads in the background.
@ -47,18 +52,18 @@ public:
    /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
    /// For asynchronous operations the function throws no exceptions on failure usually,
    /// call getInfo() on a returned operation id to check for errors.
-    BackupOperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
+    std::pair<BackupOperationID, BackupStatus> start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);

    /// Waits until the specified backup or restore operation finishes or stops.
    /// The function returns immediately if the operation is already finished.
-    void wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);
+    BackupStatus wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);

    /// Waits until all running backup and restore operations finish or stop.
    void waitAll();

    /// Cancels the specified backup or restore operation.
    /// The function does nothing if this operation has already finished.
-    void cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);
+    BackupStatus cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);

    /// Cancels all running backup and restore operations.
    void cancelAll(bool wait_ = true);
@ -67,26 +72,32 @@ public:
    std::vector<BackupOperationInfo> getAllInfos() const;

 private:
-    BackupOperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context);
+    std::pair<BackupOperationID, BackupStatus> startMakingBackup(const ASTPtr & query, const ContextPtr & context);
+    struct BackupStarter;
+
+    BackupMutablePtr openBackupForWriting(const BackupInfo & backup_info, const BackupSettings & backup_settings, std::shared_ptr<IBackupCoordination> backup_coordination, const ContextPtr & context) const;

    void doBackup(
-        BackupMutablePtr & backup,
+        BackupMutablePtr backup,
        const std::shared_ptr<ASTBackupQuery> & backup_query,
        const BackupOperationID & backup_id,
        const String & backup_name_for_logging,
-        const BackupInfo & backup_info,
-        BackupSettings backup_settings,
+        const BackupSettings & backup_settings,
        std::shared_ptr<IBackupCoordination> backup_coordination,
-        const ContextPtr & context,
-        ContextMutablePtr mutable_context);
+        ContextMutablePtr context,
+        bool on_cluster,
+        const ClusterPtr & cluster);

    /// Builds file infos for specified backup entries.
    void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination, QueryStatusPtr process_list_element);

    /// Write backup entries to an opened backup.
-    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool internal, QueryStatusPtr process_list_element);
+    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool is_internal_backup, QueryStatusPtr process_list_element);

-    BackupOperationID startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    std::pair<BackupOperationID, BackupStatus> startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    struct RestoreStarter;
+
+    BackupPtr openBackupForReading(const BackupInfo & backup_info, const RestoreSettings & restore_settings, const ContextPtr & context) const;

    void doRestore(
        const std::shared_ptr<ASTBackupQuery> & restore_query,
@ -95,7 +106,17 @@ private:
        const BackupInfo & backup_info,
        RestoreSettings restore_settings,
        std::shared_ptr<IRestoreCoordination> restore_coordination,
-        ContextMutablePtr context);
+        ContextMutablePtr context,
+        bool on_cluster,
+        const ClusterPtr & cluster);
+
+    std::shared_ptr<IBackupCoordination> makeBackupCoordination(bool on_cluster, const BackupSettings & backup_settings, const ContextPtr & context) const;
+    std::shared_ptr<IRestoreCoordination> makeRestoreCoordination(bool on_cluster, const RestoreSettings & restore_settings, const ContextPtr & context) const;
+
+    /// Sends a BACKUP or RESTORE query to other hosts.
+    void sendQueryToOtherHosts(const ASTBackupQuery & backup_or_restore_query, const ClusterPtr & cluster,
+        size_t only_shard_num, size_t only_replica_num, ContextMutablePtr context, const AccessRightsElements & access_to_check,
+        const ZooKeeperRetriesInfo & retries_info) const;

    /// Run data restoring tasks which insert data to tables.
    void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);
@ -139,6 +160,8 @@ private:

    std::shared_ptr<BackupLog> backup_log;
    ProcessList & process_list;
+
+    std::unique_ptr<BackupConcurrencyCounters> concurrency_counters;
 };

 }
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@ -121,8 +121,13 @@ public:
    /// Finalizes writing the backup, should be called after all entries have been successfully written.
    virtual void finalizeWriting() = 0;

-    /// Try to remove all files copied to the backup. Used after an exception or it the backup was cancelled.
-    virtual void tryRemoveAllFiles() = 0;
+    /// Sets that a non-retriable error happened while the backup was being written which means that
+    /// the backup is most likely corrupted and it can't be finalized.
+    /// This function is called while handling an exception or if the backup was cancelled.
+    virtual bool setIsCorrupted() noexcept = 0;
+
+    /// Try to remove all files copied to the backup. Could be used after setIsCorrupted().
+    virtual bool tryRemoveAllFiles() noexcept = 0;
 };

 using BackupPtr = std::shared_ptr<const IBackup>;
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@ -5,26 +5,44 @@

 namespace DB
 {
-class Exception;
 struct BackupFileInfo;
 using BackupFileInfos = std::vector<BackupFileInfo>;
 enum class AccessEntityType : uint8_t;
 enum class UserDefinedSQLObjectType : uint8_t;
+struct ZooKeeperRetriesInfo;

 /// Replicas use this class to coordinate what they're writing to a backup while executing BACKUP ON CLUSTER.
-/// There are two implementation of this interface: BackupCoordinationLocal and BackupCoordinationRemote.
+/// There are two implementations of this interface: BackupCoordinationLocal and BackupCoordinationOnCluster.
 /// BackupCoordinationLocal is used while executing BACKUP without ON CLUSTER and performs coordination in memory.
-/// BackupCoordinationRemote is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
+/// BackupCoordinationOnCluster is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
 class IBackupCoordination
 {
 public:
    virtual ~IBackupCoordination() = default;

    /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;
+
+    /// Sets that the backup query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setBackupQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;

    struct PartNameAndChecksum
    {
@ -87,9 +105,7 @@ public:
    /// Starts writing a specified file, the function returns false if that file is already being written concurrently.
    virtual bool startWritingFile(size_t data_file_index) = 0;

-    /// This function is used to check if concurrent backups are running
-    /// other than the backup passed to the function
-    virtual bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };

 }
--- a/src/Backups/IRestoreCoordination.h
+++ b/src/Backups/IRestoreCoordination.h
@ -5,26 +5,42 @@

 namespace DB
 {
-class Exception;
 enum class UserDefinedSQLObjectType : uint8_t;
 class ASTCreateQuery;
+struct ZooKeeperRetriesInfo;

 /// Replicas use this class to coordinate what they're reading from a backup while executing RESTORE ON CLUSTER.
-/// There are two implementation of this interface: RestoreCoordinationLocal and RestoreCoordinationRemote.
+/// There are two implementations of this interface: RestoreCoordinationLocal and RestoreCoordinationOnCluster.
 /// RestoreCoordinationLocal is used while executing RESTORE without ON CLUSTER and performs coordination in memory.
-/// RestoreCoordinationRemote is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
+/// RestoreCoordinationOnCluster is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
 class IRestoreCoordination
 {
 public:
    virtual ~IRestoreCoordination() = default;

    /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;

-    static constexpr const char * kErrorStatus = "error";
+    /// Sets that the restore query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setRestoreQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0;
@ -49,9 +65,7 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    virtual void generateUUIDForTable(ASTCreateQuery & create_query) = 0;

-    /// This function is used to check if concurrent restores are running
-    /// other than the restore passed to the function
-    virtual bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };

 }
--- a/src/Backups/RestoreCoordinationLocal.cpp
+++ b/src/Backups/RestoreCoordinationLocal.cpp
@ -1,32 +1,24 @@
 #include <Backups/RestoreCoordinationLocal.h>
+
 #include <Parsers/ASTCreateQuery.h>
 #include <Parsers/formatAST.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/logger_useful.h>


 namespace DB
 {

-RestoreCoordinationLocal::RestoreCoordinationLocal() : log(getLogger("RestoreCoordinationLocal"))
+/// All three arguments are only forwarded to the concurrency check;
+/// the flags passed along with them mark this operation as a restore executed without ON CLUSTER.
+RestoreCoordinationLocal::RestoreCoordinationLocal(
+    const UUID & restore_uuid_, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("RestoreCoordinationLocal"))
+    , concurrency_check(restore_uuid_, /* is_restore = */ true, /* on_cluster = */ false, allow_concurrent_restore_, concurrency_counters_)
 {
 }

 RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;

-void RestoreCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void RestoreCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+/// Returns a value-initialized ZooKeeperRetriesInfo (`{}`).
+/// NOTE(review): presumably the result is never used by a restore without ON CLUSTER - confirm against the callers.
+ZooKeeperRetriesInfo RestoreCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
    return {};
 }
@ -63,7 +55,7 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
 {
    String query_str = serializeAST(create_query);

-    auto find_in_map = [&]
+    auto find_in_map = [&]() TSA_REQUIRES(mutex)
    {
        auto it = create_query_uuids.find(query_str);
        if (it != create_query_uuids.end())
@ -91,14 +83,4 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
    }
 }

-bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
-{
-    if (num_active_restores > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_restores={}", num_active_restores);
-        return true;
-    }
-    return false;
-}
-
 }
--- a/src/Backups/RestoreCoordinationLocal.h
+++ b/src/Backups/RestoreCoordinationLocal.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
 #include <Parsers/CreateQueryUUIDs.h>
 #include <Common/Logger.h>
 #include <mutex>
@ -12,19 +13,20 @@ namespace DB
 {
 class ASTCreateQuery;

-
 /// Implementation of the IRestoreCoordination interface performing coordination in memory.
 class RestoreCoordinationLocal : public IRestoreCoordination
 {
 public:
-    RestoreCoordinationLocal();
+    RestoreCoordinationLocal(const UUID & restore_uuid_, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_);
    ~RestoreCoordinationLocal() override;

-    /// Sets the current stage and waits for other hosts to come to this stage too.
-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String &, const String &, bool) override { return {}; }
+    void setRestoreQueryWasSentToOtherHosts() override {}
+    bool trySetError(std::exception_ptr) override { return true; }
+    void finish() override {}
+    bool tryFinishAfterError() noexcept override { return true; }
+    void waitForOtherHostsToFinish() override {}
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -49,15 +51,16 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    void generateUUIDForTable(ASTCreateQuery & create_query) override;

-    bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    LoggerPtr const log;
+    BackupConcurrencyCheck concurrency_check;

-    std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases;
-    std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables;
-    std::unordered_map<String, CreateQueryUUIDs> create_query_uuids;
-    std::unordered_set<String /* root_zk_path */> acquired_data_in_keeper_map_tables;
+    std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases TSA_GUARDED_BY(mutex);
+    std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables TSA_GUARDED_BY(mutex);
+    std::unordered_map<String, CreateQueryUUIDs> create_query_uuids TSA_GUARDED_BY(mutex);
+    std::unordered_set<String /* root_zk_path */> acquired_data_in_keeper_map_tables TSA_GUARDED_BY(mutex);

    mutable std::mutex mutex;
 };
--- a/src/Backups/RestoreCoordinationOnCluster.cpp
+++ b/src/Backups/RestoreCoordinationOnCluster.cpp
@ -0,0 +1,318 @@
+#include <Backups/BackupCoordinationOnCluster.h>
+
+#include <Backups/BackupCoordinationStage.h>
+#include <Backups/BackupCoordinationStageSync.h>
+#include <Backups/RestoreCoordinationOnCluster.h>
+#include <Parsers/ASTCreateQuery.h>
+#include <Parsers/CreateQueryUUIDs.h>
+#include <Parsers/formatAST.h>
+#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/escapeForFileName.h>
+
+
+namespace DB
+{
+
+/// Initializes the coordination of one RESTORE ON CLUSTER operation for the host `current_host_`.
+/// All the nodes of this restore are kept under `root_zookeeper_path_ + "/restore-" + toString(restore_uuid_)`,
+/// and the stage synchronization uses the "stage" subnode of that path.
+/// The constructor finishes with createRootNodes(), so it already sends requests to ZooKeeper.
+RestoreCoordinationOnCluster::RestoreCoordinationOnCluster(
+    const UUID & restore_uuid_,
+    const String & root_zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_,
+    const BackupKeeperSettings & keeper_settings_,
+    const String & current_host_,
+    const Strings & all_hosts_,
+    bool allow_concurrent_restore_,
+    BackupConcurrencyCounters & concurrency_counters_,
+    ThreadPoolCallbackRunnerUnsafe<void> schedule_,
+    QueryStatusPtr process_list_element_)
+    : root_zookeeper_path(root_zookeeper_path_)
+    , keeper_settings(keeper_settings_)
+    , restore_uuid(restore_uuid_)
+    , zookeeper_path(root_zookeeper_path_ + "/restore-" + toString(restore_uuid_))
+    , all_hosts(all_hosts_)
+    , all_hosts_without_initiator(BackupCoordinationOnCluster::excludeInitiator(all_hosts))
+    , current_host(current_host_)
+    , current_host_index(BackupCoordinationOnCluster::findCurrentHostIndex(current_host, all_hosts))
+    , log(getLogger("RestoreCoordinationOnCluster"))
+    /// The callback passed as the last argument calls sync() for the root path; it keeps its own copy of the path.
+    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
+    , concurrency_check(restore_uuid_, /* is_restore = */ true, /* on_cluster = */ true, allow_concurrent_restore_, concurrency_counters_)
+    , stage_sync(/* is_restore = */ true, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_restore_, with_retries, schedule_, process_list_element_, log)
+    , cleaner(zookeeper_path, with_retries, log)
+{
+    createRootNodes();
+}
+
+/// The destructor goes through the same code path as tryFinishAfterError().
+/// tryFinishImpl() is noexcept and its result is ignored here.
+RestoreCoordinationOnCluster::~RestoreCoordinationOnCluster()
+{
+    tryFinishImpl();
+}
+
+/// Creates the root node of this restore in ZooKeeper together with all its top-level child nodes.
+/// The whole sequence runs inside a retry loop created with WithRetries::kInitialization.
+void RestoreCoordinationOnCluster::createRootNodes()
+{
+    auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization);
+
+    auto create_nodes = [&, &zk = holder.faulty_zookeeper]()
+    {
+        with_retries.renewZooKeeper(zk);
+
+        /// The root node of this restore goes first, its children are created after it.
+        zk->createAncestors(zookeeper_path);
+        zk->createIfNotExists(zookeeper_path, "");
+
+        for (const char * child_node : {"/repl_databases_tables_acquired",
+                                        "/repl_tables_data_acquired",
+                                        "/repl_access_storages_acquired",
+                                        "/repl_sql_objects_acquired",
+                                        "/keeper_map_tables",
+                                        "/table_uuids"})
+        {
+            zk->createIfNotExists(zookeeper_path + child_node, "");
+        }
+    };
+
+    holder.retries_ctl.retryLoop(create_nodes);
+}
+
+/// Sets the stage of the current host. If `sync` is true the function then waits for the hosts listed in
+/// `all_hosts_without_initiator` to reach the same stage and returns what that wait returns;
+/// otherwise it returns an empty list right away.
+Strings RestoreCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
+{
+    stage_sync.setStage(new_stage, message);
+
+    if (sync)
+        return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
+
+    return {};
+}
+
+/// Remembers that the RESTORE query was sent to other hosts.
+/// The flag is read by finish(), tryFinishImpl(), waitForOtherHostsToFinish() and tryWaitForOtherHostsToFinishAfterError().
+void RestoreCoordinationOnCluster::setRestoreQueryWasSentToOtherHosts()
+{
+    restore_query_was_sent_to_other_hosts = true;
+}
+
+/// Hands the exception over to `stage_sync` and returns the result of its trySetError().
+bool RestoreCoordinationOnCluster::trySetError(std::exception_ptr exception)
+{
+    return stage_sync.trySetError(exception);
+}
+
+/// Finishes the work of the current host via `stage_sync`.
+/// The initiator additionally runs the cleaner if either the other hosts have finished too
+/// or the restore query was never sent to them.
+void RestoreCoordinationOnCluster::finish()
+{
+    bool all_other_hosts_finished = false;
+    stage_sync.finish(all_other_hosts_finished);
+
+    const bool is_initiator = (current_host == kInitiator);
+    const bool can_cleanup = all_other_hosts_finished || !restore_query_was_sent_to_other_hosts;
+
+    if (is_initiator && can_cleanup)
+        cleaner.cleanup();
+}
+
+/// Non-throwing counterpart of finish(); all the work is delegated to tryFinishImpl().
+bool RestoreCoordinationOnCluster::tryFinishAfterError() noexcept
+{
+    return tryFinishImpl();
+}
+
+/// Non-throwing variant of finish(): returns false if either `stage_sync` couldn't finish
+/// or the cleanup (attempted only by the initiator, under the same condition as in finish()) failed.
+bool RestoreCoordinationOnCluster::tryFinishImpl() noexcept
+{
+    bool all_other_hosts_finished = false;
+    if (!stage_sync.tryFinishAfterError(all_other_hosts_finished))
+        return false;
+
+    const bool should_cleanup
+        = (current_host == kInitiator) && (all_other_hosts_finished || !restore_query_was_sent_to_other_hosts);
+
+    /// Without a cleanup there is nothing left which could fail.
+    return !should_cleanup || cleaner.tryCleanupAfterError();
+}
+
+/// Only the initiator waits, and only if the restore query was actually sent to other hosts.
+void RestoreCoordinationOnCluster::waitForOtherHostsToFinish()
+{
+    if ((current_host == kInitiator) && restore_query_was_sent_to_other_hosts)
+        stage_sync.waitForOtherHostsToFinish();
+}
+
+/// Non-throwing variant of waitForOtherHostsToFinish().
+/// Returns false on a host which is not the initiator, true if the query was never sent to other hosts,
+/// and the result of the wait performed by `stage_sync` otherwise.
+bool RestoreCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
+{
+    if (current_host != kInitiator)
+        return false;
+
+    return !restore_query_was_sent_to_other_hosts || stage_sync.tryWaitForOtherHostsToFinishAfterError();
+}
+
+/// Builds a ZooKeeperRetriesInfo from the keeper settings: `max_retries_while_initializing`
+/// plus the initial and the maximum backoff converted to plain tick counts.
+ZooKeeperRetriesInfo RestoreCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
+{
+    const auto initial_backoff = static_cast<UInt64>(keeper_settings.retry_initial_backoff_ms.count());
+    const auto max_backoff = static_cast<UInt64>(keeper_settings.retry_max_backoff_ms.count());
+
+    return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing, initial_backoff, max_backoff};
+}
+
+/// Tries to become the host which creates the table `table_name` in the replicated database at `database_zk_path`.
+/// The host which manages to create the table's node (holding its host index) wins; the function returns true
+/// on that host and false on all the others.
+bool RestoreCoordinationOnCluster::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
+{
+    bool acquired = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            const String database_node = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
+            zk->createIfNotExists(database_node, "");
+
+            const String table_node = database_node + "/" + escapeForFileName(table_name);
+            const String this_host = toString(current_host_index);
+            const auto code = zk->tryCreate(table_node, this_host, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+                acquired = true;
+            else if (code == Coordination::Error::ZNODEEXISTS)
+                acquired = (zk->get(table_node) == this_host); /// The node already exists, so check who created it.
+            else
+                throw zkutil::KeeperException::fromPath(code, table_node);
+        });
+    return acquired;
+}
+
+/// Tries to become the host which inserts data into the replicated table at `table_zk_path`.
+/// Returns true on the host whose index is stored in the table's node, false on all the other hosts.
+bool RestoreCoordinationOnCluster::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
+{
+    bool acquired = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            const String node = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
+            const String this_host = toString(current_host_index);
+            const auto code = zk->tryCreate(node, this_host, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+                acquired = true;
+            else if (code == Coordination::Error::ZNODEEXISTS)
+                acquired = (zk->get(node) == this_host); /// The node already exists, so check who created it.
+            else
+                throw zkutil::KeeperException::fromPath(code, node);
+        });
+    return acquired;
+}
+
+/// Tries to become the host which restores the replicated access storage at `access_storage_zk_path`.
+/// Returns true on the host whose index is stored in the storage's node, false on all the other hosts.
+bool RestoreCoordinationOnCluster::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
+{
+    bool acquired = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            const String node = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
+            const String this_host = toString(current_host_index);
+            const auto code = zk->tryCreate(node, this_host, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+                acquired = true;
+            else if (code == Coordination::Error::ZNODEEXISTS)
+                acquired = (zk->get(node) == this_host); /// The node already exists, so check who created it.
+            else
+                throw zkutil::KeeperException::fromPath(code, node);
+        });
+    return acquired;
+}
+
+/// Tries to become the host which restores replicated SQL objects of type `object_type` stored at `loader_zk_path`.
+/// Returns true if the node for these objects belongs to the current host, i.e. the host either has just created it
+/// or created it during an earlier iteration of the retry loop; returns false if another host owns the node.
+/// Throws a KeeperException if creating the node fails with anything but ZNODEEXISTS.
+bool RestoreCoordinationOnCluster::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
+{
+    bool result = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
+            zk->createIfNotExists(path, "");
+
+            path += "/";
+            switch (object_type)
+            {
+                case UserDefinedSQLObjectType::Function:
+                    path += "functions";
+                    break;
+            }
+
+            /// The node must hold the index of the host which created it (the same way the other acquire*() functions
+            /// of this class do it), otherwise the ownership check below could never succeed and a host retrying
+            /// after its own successful create would wrongly decide that another host has acquired the objects.
+            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
+            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
+                throw zkutil::KeeperException::fromPath(code, path);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                result = true;
+                return;
+            }
+
+            /// We need to check who created that node
+            result = zk->get(path) == toString(current_host_index);
+        });
+    return result;
+}
+
+/// Tries to acquire the right to insert data into the KeeperMap storage with the root path `root_zk_path`.
+/// The lock is a node under "keeper_map_tables" which holds `table_unique_id` of the table that acquired it.
+/// Returns true if the lock belongs to the table identified by `table_unique_id`, false if another table holds it.
+/// Throws a KeeperException if creating the lock node fails with anything but ZNODEEXISTS.
+bool RestoreCoordinationOnCluster::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id)
+{
+    bool lock_acquired = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            /// we need to remove leading '/' from root_zk_path
+            /// NOTE(review): substr(1) assumes a non-empty path - confirm that callers never pass an empty one.
+            auto normalized_root_zk_path = root_zk_path.substr(1);
+            std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path);
+            zk->createAncestors(restore_lock_path);
+            auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                lock_acquired = true;
+                return;
+            }
+
+            if (code == Coordination::Error::ZNODEEXISTS)
+                lock_acquired = table_unique_id == zk->get(restore_lock_path);
+            else
+                throw zkutil::KeeperException::fromPath(code, restore_lock_path); /// Any other error must not be lost.
+        });
+    return lock_acquired;
+}
+
+/// Assigns UUIDs to `create_query` so that every host ends up with the same ones for the same query.
+/// The host which manages to create the node "table_uuids/<escaped query text>" publishes its freshly generated UUIDs
+/// there and uses them; any host which finds the node already existing copies the published UUIDs instead.
+/// Throws a KeeperException if creating the node fails with anything but ZNODEEXISTS.
+void RestoreCoordinationOnCluster::generateUUIDForTable(ASTCreateQuery & create_query)
+{
+    String query_str = serializeAST(create_query);
+    CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true};
+    String new_uuids_str = new_uuids.toString();
+
+    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
+            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);
+
+            if (res == Coordination::Error::ZOK)
+            {
+                new_uuids.copyToQuery(create_query);
+                return;
+            }
+
+            if (res == Coordination::Error::ZNODEEXISTS)
+            {
+                CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query);
+                return;
+            }
+
+            /// Any other error must be reported, otherwise the query would be silently left without UUIDs.
+            throw zkutil::KeeperException::fromPath(res, path);
+        });
+}
+
+}
--- a/src/Backups/RestoreCoordinationOnCluster.h
+++ b/src/Backups/RestoreCoordinationOnCluster.h
@ -1,6 +1,8 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
+#include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationStageSync.h>
 #include <Backups/WithRetries.h>

@ -9,28 +11,33 @@ namespace DB
 {

 /// Implementation of the IRestoreCoordination interface performing coordination via ZooKeeper. It's necessary for "RESTORE ON CLUSTER".
-class RestoreCoordinationRemote : public IRestoreCoordination
+class RestoreCoordinationOnCluster : public IRestoreCoordination
 {
 public:
-    using RestoreKeeperSettings = WithRetries::KeeperSettings;
+    /// Empty string as the current host is used to mark the initiator of a RESTORE ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;

-    RestoreCoordinationRemote(
-        zkutil::GetZooKeeper get_zookeeper_,
+    RestoreCoordinationOnCluster(
+        const UUID & restore_uuid_,
        const String & root_zookeeper_path_,
-        const RestoreKeeperSettings & keeper_settings_,
-        const String & restore_uuid_,
-        const Strings & all_hosts_,
+        zkutil::GetZooKeeper get_zookeeper_,
+        const BackupKeeperSettings & keeper_settings_,
        const String & current_host_,
-        bool is_internal_,
+        const Strings & all_hosts_,
+        bool allow_concurrent_restore_,
+        BackupConcurrencyCounters & concurrency_counters_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_);

-    ~RestoreCoordinationRemote() override;
+    ~RestoreCoordinationOnCluster() override;

-    /// Sets the current stage and waits for other hosts to come to this stage too.
-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String & new_stage, const String & message, bool sync) override;
+    void setRestoreQueryWasSentToOtherHosts() override;
+    bool trySetError(std::exception_ptr exception) override;
+    void finish() override;
+    bool tryFinishAfterError() noexcept override;
+    void waitForOtherHostsToFinish() override;
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -55,27 +62,27 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    void generateUUIDForTable(ASTCreateQuery & create_query) override;

-    bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    void createRootNodes();
-    void removeAllNodes();
+    bool tryFinishImpl() noexcept;

-    /// get_zookeeper will provide a zookeeper client without any fault injection
-    const zkutil::GetZooKeeper get_zookeeper;
    const String root_zookeeper_path;
-    const RestoreKeeperSettings keeper_settings;
-    const String restore_uuid;
+    const BackupKeeperSettings keeper_settings;
+    const UUID restore_uuid;
    const String zookeeper_path;
    const Strings all_hosts;
+    const Strings all_hosts_without_initiator;
    const String current_host;
    const size_t current_host_index;
-    const bool is_internal;
    LoggerPtr const log;

-    mutable WithRetries with_retries;
-    std::optional<BackupCoordinationStageSync> stage_sync;
-    mutable std::mutex mutex;
+    const WithRetries with_retries;
+    BackupConcurrencyCheck concurrency_check;
+    BackupCoordinationStageSync stage_sync;
+    BackupCoordinationCleaner cleaner;
+    std::atomic<bool> restore_query_was_sent_to_other_hosts = false;
 };

 }
--- a/src/Backups/RestoreCoordinationRemote.cpp
+++ b/src/Backups/RestoreCoordinationRemote.cpp
@ -1,379 +0,0 @@
-#include <Backups/BackupCoordinationRemote.h>
-#include <Backups/BackupCoordinationStage.h>
-#include <Backups/RestoreCoordinationRemote.h>
-#include <Backups/BackupCoordinationStageSync.h>
-#include <Parsers/ASTCreateQuery.h>
-#include <Parsers/CreateQueryUUIDs.h>
-#include <Parsers/formatAST.h>
-#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
-#include <Common/ZooKeeper/KeeperException.h>
-#include <Common/escapeForFileName.h>
-
-
-namespace DB
-{
-
-namespace Stage = BackupCoordinationStage;
-
-RestoreCoordinationRemote::RestoreCoordinationRemote(
-    zkutil::GetZooKeeper get_zookeeper_,
-    const String & root_zookeeper_path_,
-    const RestoreKeeperSettings & keeper_settings_,
-    const String & restore_uuid_,
-    const Strings & all_hosts_,
-    const String & current_host_,
-    bool is_internal_,
-    QueryStatusPtr process_list_element_)
-    : get_zookeeper(get_zookeeper_)
-    , root_zookeeper_path(root_zookeeper_path_)
-    , keeper_settings(keeper_settings_)
-    , restore_uuid(restore_uuid_)
-    , zookeeper_path(root_zookeeper_path_ + "/restore-" + restore_uuid_)
-    , all_hosts(all_hosts_)
-    , current_host(current_host_)
-    , current_host_index(BackupCoordinationRemote::findCurrentHostIndex(all_hosts, current_host))
-    , is_internal(is_internal_)
-    , log(getLogger("RestoreCoordinationRemote"))
-    , with_retries(
-        log,
-        get_zookeeper_,
-        keeper_settings,
-        process_list_element_,
-        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
-        (WithRetries::FaultyKeeper & zk)
-        {
-            /// Recreate this ephemeral node to signal that we are alive.
-            if (my_is_internal)
-            {
-                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
-
-                /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
-                zk->tryRemove(alive_node_path);
-
-                zk->createAncestors(alive_node_path);
-                zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            }
-        })
-{
-    createRootNodes();
-
-    stage_sync.emplace(
-        zookeeper_path,
-        with_retries,
-        log);
-}
-
-RestoreCoordinationRemote::~RestoreCoordinationRemote()
-{
-    try
-    {
-        if (!is_internal)
-            removeAllNodes();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
-}
-
-void RestoreCoordinationRemote::createRootNodes()
-{
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-            zk->createAncestors(zookeeper_path);
-
-            Coordination::Requests ops;
-            Coordination::Responses responses;
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_databases_tables_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_tables_data_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access_storages_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/table_uuids", "", zkutil::CreateMode::Persistent));
-            zk->tryMulti(ops, responses);
-        });
-}
-
-void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message)
-{
-    if (is_internal)
-        stage_sync->set(current_host, new_stage, message);
-    else
-        stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true);
-}
-
-void RestoreCoordinationRemote::setError(const Exception & exception)
-{
-    stage_sync->setError(current_host, exception);
-}
-
-Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait)
-{
-    return stage_sync->wait(all_hosts, stage_to_wait);
-}
-
-Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout)
-{
-    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
-}
-
-bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
-            zk->createIfNotExists(path, "");
-
-            path += "/" + escapeForFileName(table_name);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
-            zk->createIfNotExists(path, "");
-
-            path += "/";
-            switch (object_type)
-            {
-                case UserDefinedSQLObjectType::Function:
-                    path += "functions";
-                    break;
-            }
-
-            auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result =  zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id)
-{
-    bool lock_acquired = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            /// we need to remove leading '/' from root_zk_path
-            auto normalized_root_zk_path = root_zk_path.substr(1);
-            std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path);
-            zk->createAncestors(restore_lock_path);
-            auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                lock_acquired = true;
-                return;
-            }
-
-            if (code == Coordination::Error::ZNODEEXISTS)
-                lock_acquired = table_unique_id == zk->get(restore_lock_path);
-            else
-                zkutil::KeeperException::fromPath(code, restore_lock_path);
-        });
-    return lock_acquired;
-}
-
-void RestoreCoordinationRemote::generateUUIDForTable(ASTCreateQuery & create_query)
-{
-    String query_str = serializeAST(create_query);
-    CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true};
-    String new_uuids_str = new_uuids.toString();
-
-    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
-            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);
-
-            if (res == Coordination::Error::ZOK)
-            {
-                new_uuids.copyToQuery(create_query);
-                return;
-            }
-
-            if (res == Coordination::Error::ZNODEEXISTS)
-            {
-                CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query);
-                return;
-            }
-
-            zkutil::KeeperException::fromPath(res, path);
-        });
-}
-
-void RestoreCoordinationRemote::removeAllNodes()
-{
-    /// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore.
-    ///
-    /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
-    /// while some hosts are still restoring something. Removing all the nodes will remove the parent node of the restore coordination
-    /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part
-    /// of their restore work before that.
-
-    auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-            zk->removeRecursive(zookeeper_path);
-        });
-}
-
-bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t> &) const
-{
-    /// If its internal concurrency will be checked for the base restore
-    if (is_internal)
-        return false;
-
-    bool result = false;
-    std::string path = zookeeper_path + "/stage";
-
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            if (! zk->exists(root_zookeeper_path))
-                zk->createAncestors(root_zookeeper_path);
-
-            for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-            {
-                Coordination::Stat stat;
-                zk->get(root_zookeeper_path, &stat);
-                Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
-                for (const auto & existing_restore_path : existing_restore_paths)
-                {
-                    if (startsWith(existing_restore_path, "backup-"))
-                        continue;
-
-                    String existing_restore_uuid = existing_restore_path;
-                    existing_restore_uuid.erase(0, String("restore-").size());
-
-                    if (existing_restore_uuid == toString(restore_uuid))
-                        continue;
-
-                    String status;
-                    if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
-                    {
-                        /// Check if some other restore is in progress
-                        if (status == Stage::SCHEDULED_TO_START)
-                        {
-                            LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
-                            result = true;
-                            return;
-                        }
-                    }
-                }
-
-                zk->createIfNotExists(path, "");
-                auto code = zk->trySet(path, Stage::SCHEDULED_TO_START, stat.version);
-                if (code == Coordination::Error::ZOK)
-                    break;
-                bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-                if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                    throw zkutil::KeeperException::fromPath(code, path);
-            }
-        });
-
-    return result;
-}
-
-}
--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@ -100,7 +100,6 @@ RestorerFromBackup::RestorerFromBackup(
    , context(context_)
    , process_list_element(context->getProcessListElement())
    , after_task_callback(after_task_callback_)
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000))
    , log(getLogger("RestorerFromBackup"))
    , tables_dependencies("RestorerFromBackup")
@ -119,12 +118,14 @@ RestorerFromBackup::~RestorerFromBackup()
    }
 }

-void RestorerFromBackup::run(Mode mode)
+void RestorerFromBackup::run(Mode mode_)
 {
    /// run() can be called onle once.
    if (!current_stage.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");

+    mode = mode_;
+
    /// Find other hosts working along with us to execute this ON CLUSTER query.
    all_hosts = BackupSettings::Util::filterHostIDs(
        restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
@ -139,6 +140,7 @@ void RestorerFromBackup::run(Mode mode)
    setStage(Stage::FINDING_TABLES_IN_BACKUP);
    findDatabasesAndTablesInBackup();
    waitFutures();
+    logNumberOfDatabasesAndTablesToRestore();

    /// Check access rights.
    setStage(Stage::CHECKING_ACCESS_RIGHTS);
@ -228,20 +230,8 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa

    if (restore_coordination)
    {
-        restore_coordination->setStage(new_stage, message);
-
-        /// The initiator of a RESTORE ON CLUSTER query waits for other hosts to complete their work (see waitForStage(Stage::COMPLETED) in BackupsWorker::doRestore),
-        /// but other hosts shouldn't wait for each others' completion. (That's simply unnecessary and also
-        /// the initiator may start cleaning up (e.g. removing restore-coordination ZooKeeper nodes) once all other hosts are in Stage::COMPLETED.)
-        bool need_wait = (new_stage != Stage::COMPLETED);
-
-        if (need_wait)
-        {
-            if (new_stage == Stage::FINDING_TABLES_IN_BACKUP)
-                restore_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-            else
-                restore_coordination->waitForStage(new_stage);
-        }
+        /// There is no need to sync Stage::COMPLETED with other hosts because it's the last stage.
+        restore_coordination->setStage(new_stage, message, /* sync = */ (new_stage != Stage::COMPLETED));
    }
 }

@ -384,8 +374,12 @@ void RestorerFromBackup::findDatabasesAndTablesInBackup()
            }
        }
    }
+}

-    LOG_INFO(log, "Will restore {} databases and {} tables", getNumDatabases(), getNumTables());
+void RestorerFromBackup::logNumberOfDatabasesAndTablesToRestore() const
+{
+    std::string_view action = (mode == CHECK_ACCESS_ONLY) ? "check access rights for restoring" : "restore";
+    LOG_INFO(log, "Will {} {} databases and {} tables", action, getNumDatabases(), getNumTables());
 }

 void RestorerFromBackup::findTableInBackup(const QualifiedTableName & table_name_in_backup, bool skip_if_inner_table, const std::optional<ASTs> & partitions)
--- a/src/Backups/RestorerFromBackup.h
+++ b/src/Backups/RestorerFromBackup.h
@ -53,7 +53,7 @@ public:
    using DataRestoreTasks = std::vector<DataRestoreTask>;

    /// Restores the metadata of databases and tables and returns tasks to restore the data of tables.
-    void run(Mode mode);
+    void run(Mode mode_);

    BackupPtr getBackup() const { return backup; }
    const RestoreSettings & getRestoreSettings() const { return restore_settings; }
@ -80,10 +80,10 @@ private:
    ContextMutablePtr context;
    QueryStatusPtr process_list_element;
    std::function<void()> after_task_callback;
-    std::chrono::milliseconds on_cluster_first_sync_timeout;
    std::chrono::milliseconds create_table_timeout;
    LoggerPtr log;

+    Mode mode = Mode::RESTORE;
    Strings all_hosts;
    DDLRenamingMap renaming_map;
    std::vector<std::filesystem::path> root_paths_in_backup;
@ -97,6 +97,7 @@ private:
    void findDatabaseInBackupImpl(const String & database_name_in_backup, const std::set<DatabaseAndTableName> & except_table_names);
    void findEverythingInBackup(const std::set<String> & except_database_names, const std::set<DatabaseAndTableName> & except_table_names);

+    void logNumberOfDatabasesAndTablesToRestore() const;
    size_t getNumDatabases() const;
    size_t getNumTables() const;

--- a/src/Backups/WithRetries.cpp
+++ b/src/Backups/WithRetries.cpp
@ -1,57 +1,34 @@
 #include <Backups/WithRetries.h>
-#include <Core/Settings.h>

 #include <mutex>

+
 namespace DB
 {
-namespace Setting
-{
-    extern const SettingsUInt64 backup_restore_keeper_max_retries;
-    extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms;
-    extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms;
-    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread;
-    extern const SettingsFloat backup_restore_keeper_fault_injection_probability;
-    extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed;
-    extern const SettingsUInt64 backup_restore_keeper_value_max_size;
-    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi;
-}
-
-WithRetries::KeeperSettings WithRetries::KeeperSettings::fromContext(ContextPtr context)
-{
-    return
-    {
-        .keeper_max_retries = context->getSettingsRef()[Setting::backup_restore_keeper_max_retries],
-        .keeper_retry_initial_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_initial_backoff_ms],
-        .keeper_retry_max_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_max_backoff_ms],
-        .batch_size_for_keeper_multiread = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multiread],
-        .keeper_fault_injection_probability = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_probability],
-        .keeper_fault_injection_seed = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_seed],
-        .keeper_value_max_size = context->getSettingsRef()[Setting::backup_restore_keeper_value_max_size],
-        .batch_size_for_keeper_multi = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multi],
-    };
-}

 WithRetries::WithRetries(
-    LoggerPtr log_, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_)
+    LoggerPtr log_, zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_)
    : log(log_)
    , get_zookeeper(get_zookeeper_)
    , settings(settings_)
    , process_list_element(process_list_element_)
    , callback(callback_)
-    , global_zookeeper_retries_info(
-          settings.keeper_max_retries, settings.keeper_retry_initial_backoff_ms, settings.keeper_retry_max_backoff_ms)
 {}

-WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name)
-    : info(parent->global_zookeeper_retries_info)
-    , retries_ctl(name, parent->log, info, parent->process_list_element)
+WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name, Kind kind)
+    : info(  (kind == kInitialization) ? parent->settings.max_retries_while_initializing
+           : (kind == kErrorHandling)  ? parent->settings.max_retries_while_handling_error
+                                       : parent->settings.max_retries,
+           parent->settings.retry_initial_backoff_ms.count(),
+           parent->settings.retry_max_backoff_ms.count())
+    /// We don't use process_list_element while handling an error because the error handling can't be cancellable.
+    , retries_ctl(name, parent->log, info, (kind == kErrorHandling) ? nullptr : parent->process_list_element)
    , faulty_zookeeper(parent->getFaultyZooKeeper())
 {}

-WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name)
+WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name, Kind kind) const
 {
-    return RetriesControlHolder(this, name);
+    return RetriesControlHolder(this, name, kind);
 }

 void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
@ -62,8 +39,8 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
    {
        zookeeper = get_zookeeper();
        my_faulty_zookeeper->setKeeper(zookeeper);
-
-        callback(my_faulty_zookeeper);
+        if (callback)
+            callback(my_faulty_zookeeper);
    }
    else
    {
@ -71,7 +48,7 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
    }
 }

-const WithRetries::KeeperSettings & WithRetries::getKeeperSettings() const
+const BackupKeeperSettings & WithRetries::getKeeperSettings() const
 {
    return settings;
 }
@ -88,8 +65,8 @@ WithRetries::FaultyKeeper WithRetries::getFaultyZooKeeper() const
    /// The reason is that ZooKeeperWithFaultInjection may reset the underlying pointer and there could be a race condition
    /// when the same object is used from multiple threads.
    auto faulty_zookeeper = ZooKeeperWithFaultInjection::createInstance(
-        settings.keeper_fault_injection_probability,
-        settings.keeper_fault_injection_seed,
+        settings.fault_injection_probability,
+        settings.fault_injection_seed,
        current_zookeeper,
        log->name(),
        log);
--- a/src/Backups/WithRetries.h
+++ b/src/Backups/WithRetries.h
@ -1,9 +1,11 @@
 #pragma once

-#include <Common/ZooKeeper/ZooKeeperRetries.h>
+#include <Backups/BackupKeeperSettings.h>
 #include <Common/ZooKeeper/Common.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/ZooKeeper/ZooKeeperWithFaultInjection.h>

+
 namespace DB
 {

@ -15,20 +17,13 @@ class WithRetries
 {
 public:
    using FaultyKeeper = Coordination::ZooKeeperWithFaultInjection::Ptr;
-    using RenewerCallback = std::function<void(FaultyKeeper &)>;
+    using RenewerCallback = std::function<void(FaultyKeeper)>;

-    struct KeeperSettings
+    enum Kind
    {
-        UInt64 keeper_max_retries{0};
-        UInt64 keeper_retry_initial_backoff_ms{0};
-        UInt64 keeper_retry_max_backoff_ms{0};
-        UInt64 batch_size_for_keeper_multiread{10000};
-        Float64 keeper_fault_injection_probability{0};
-        UInt64 keeper_fault_injection_seed{42};
-        UInt64 keeper_value_max_size{1048576};
-        UInt64 batch_size_for_keeper_multi{1000};
-
-        static KeeperSettings fromContext(ContextPtr context);
+        kNormal,
+        kInitialization,
+        kErrorHandling,
    };

    /// For simplicity a separate ZooKeeperRetriesInfo and a faulty [Zoo]Keeper client
@ -48,23 +43,23 @@ public:

    private:
        friend class WithRetries;
-        RetriesControlHolder(const WithRetries * parent, const String & name);
+        RetriesControlHolder(const WithRetries * parent, const String & name, Kind kind);
    };

-    RetriesControlHolder createRetriesControlHolder(const String & name);
-    WithRetries(LoggerPtr log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback);
+    RetriesControlHolder createRetriesControlHolder(const String & name, Kind kind = Kind::kNormal) const;
+    WithRetries(LoggerPtr log, zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback = {});

    /// Used to re-establish new connection inside a retry loop.
    void renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const;

-    const KeeperSettings & getKeeperSettings() const;
+    const BackupKeeperSettings & getKeeperSettings() const;
 private:
    /// This will provide a special wrapper which is useful for testing
    FaultyKeeper getFaultyZooKeeper() const;

    LoggerPtr log;
    zkutil::GetZooKeeper get_zookeeper;
-    KeeperSettings settings;
+    BackupKeeperSettings settings;
    QueryStatusPtr process_list_element;

    /// This callback is called each time when a new [Zoo]Keeper session is created.
@ -76,7 +71,6 @@ private:
    /// it could lead just to a failed backup which could possibly be successful
    /// if there were a little bit more retries.
    RenewerCallback callback;
-    ZooKeeperRetriesInfo global_zookeeper_retries_info;

    /// This is needed only to protect zookeeper object
    mutable std::mutex zookeeper_mutex;
--- a/src/Client/ClientApplicationBase.cpp
+++ b/src/Client/ClientApplicationBase.cpp
@ -167,7 +167,8 @@ void ClientApplicationBase::init(int argc, char ** argv)
        ("query_kind", po::value<std::string>()->default_value("initial_query"), "One of initial_query/secondary_query/no_query")
        ("query_id", po::value<std::string>(), "query_id")

-        ("history_file", po::value<std::string>(), "path to history file")
+        ("history_file", po::value<std::string>(), "Path to a file containing command history.")
+        ("history_max_entries", po::value<UInt32>()->default_value(1000000), "Maximum number of entries in the history file.")

        ("stage", po::value<std::string>()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit")
        ("progress", po::value<ProgressOption>()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::DEFAULT, "default"), "Print progress of queries execution - to TTY: tty|on|1|true|yes; to STDERR non-interactive mode: err; OFF: off|0|false|no; DEFAULT - interactive to TTY, non-interactive is off")
@ -350,6 +351,8 @@ void ClientApplicationBase::init(int argc, char ** argv)
        getClientConfiguration().setBool("highlight", options["highlight"].as<bool>());
    if (options.count("history_file"))
        getClientConfiguration().setString("history_file", options["history_file"].as<std::string>());
+    if (options.count("history_max_entries"))
+        getClientConfiguration().setUInt("history_max_entries", options["history_max_entries"].as<UInt32>());
    if (options.count("interactive"))
        getClientConfiguration().setBool("interactive", true);
    if (options.count("pager"))
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -1454,8 +1454,22 @@ void ClientBase::resetOutput()

    /// Order is important: format, compression, file

-    if (output_format)
-        output_format->finalize();
+    try
+    {
+        if (output_format)
+            output_format->finalize();
+    }
+    catch (...)
+    {
+        /// We need to make sure we continue resetting output_format (will stop threads on parallel output)
+        /// as well as cleaning other output related setup
+        if (!have_error)
+        {
+            client_exception
+                = std::make_unique<Exception>(getCurrentExceptionMessageAndPattern(print_stack_trace), getCurrentExceptionCode());
+            have_error = true;
+        }
+    }
    output_format.reset();

    logs_out_stream.reset();
@ -2651,6 +2665,8 @@ void ClientBase::runInteractive()
        }
    }

+    history_max_entries = getClientConfiguration().getUInt("history_max_entries");
+
    LineReader::Patterns query_extenders = {"\\"};
    LineReader::Patterns query_delimiters = {";", "\\G", "\\G;"};
    char word_break_characters[] = " \t\v\f\a\b\r\n`~!@#$%^&*()-=+[{]}\\|;:'\",<.>/?";
@ -2663,6 +2679,7 @@ void ClientBase::runInteractive()
    ReplxxLineReader lr(
        *suggest,
        history_file,
+        history_max_entries,
        getClientConfiguration().has("multiline"),
        getClientConfiguration().getBool("ignore_shell_suspend", true),
        query_extenders,
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@ -328,6 +328,7 @@ protected:

    String home_path;
    String history_file; /// Path to a file containing command history.
+    UInt32 history_max_entries; /// Maximum number of entries in the history file.

    String current_profile;

--- a/src/Client/ReplxxLineReader.cpp
+++ b/src/Client/ReplxxLineReader.cpp
@ -293,6 +293,7 @@ void ReplxxLineReader::setLastIsDelimiter(bool flag)
 ReplxxLineReader::ReplxxLineReader(
    Suggest & suggest,
    const String & history_file_path_,
+    UInt32 history_max_entries_,
    bool multiline_,
    bool ignore_shell_suspend,
    Patterns extenders_,
@ -313,6 +314,8 @@ ReplxxLineReader::ReplxxLineReader(
 {
    using Replxx = replxx::Replxx;

+    rx.set_max_history_size(static_cast<int>(history_max_entries_));
+
    if (!history_file_path.empty())
    {
        history_file_fd = open(history_file_path.c_str(), O_RDWR);
--- a/src/Client/ReplxxLineReader.h
+++ b/src/Client/ReplxxLineReader.h
@ -14,6 +14,7 @@ public:
    (
        Suggest & suggest,
        const String & history_file_path,
+        UInt32 history_max_entries,
        bool multiline,
        bool ignore_shell_suspend,
        Patterns extenders_,
--- a/src/Common/Exception.cpp
+++ b/src/Common/Exception.cpp
@ -627,7 +627,7 @@ PreformattedMessage getExceptionMessageAndPattern(const Exception & e, bool with
    return PreformattedMessage{stream.str(), e.tryGetMessageFormatString(), e.getMessageFormatStringArgs()};
 }

-std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace)
+std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace, bool check_embedded_stacktrace)
 {
    try
    {
@ -635,7 +635,7 @@ std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace)
    }
    catch (...)
    {
-        return getCurrentExceptionMessage(with_stacktrace);
+        return getCurrentExceptionMessage(with_stacktrace, check_embedded_stacktrace);
    }
 }

--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@ -329,7 +329,7 @@ void tryLogException(std::exception_ptr e, const AtomicLogger & logger, const st

 std::string getExceptionMessage(const Exception & e, bool with_stacktrace, bool check_embedded_stacktrace = false);
 PreformattedMessage getExceptionMessageAndPattern(const Exception & e, bool with_stacktrace, bool check_embedded_stacktrace = false);
-std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace);
+std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace, bool check_embedded_stacktrace = false);


 template <typename T>
--- a/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h
+++ b/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h
@ -492,9 +492,9 @@ public:
            nodes.push_back(impl.semaphore);
        if (impl.branch.queue)
            nodes.push_back(impl.branch.queue);
-        for (auto & [_, branch] : impl.branch.branch.branches)
+        for (auto & [_0, branch] : impl.branch.branch.branches)
        {
-            for (auto & [_, child] : branch.children)
+            for (auto & [_1, child] : branch.children)
                child->addRawPointerNodes(nodes);
        }
    }
--- a/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp
+++ b/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp
@ -48,9 +48,9 @@ ASTPtr normalizeCreateWorkloadEntityQuery(const IAST & create_query)
 /// Returns a type of a workload entity `ptr`
 WorkloadEntityType getEntityType(const ASTPtr & ptr)
 {
-    if (auto * res = typeid_cast<ASTCreateWorkloadQuery *>(ptr.get()))
+    if (auto * res = typeid_cast<ASTCreateWorkloadQuery *>(ptr.get()); res)
        return WorkloadEntityType::Workload;
-    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(ptr.get()))
+    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(ptr.get()); res)
        return WorkloadEntityType::Resource;
    chassert(false);
    return WorkloadEntityType::MAX;
@ -106,7 +106,7 @@ void forEachReference(
        for (const String & resource : resources)
            func(resource, res->getWorkloadName(), ReferenceType::ForResource);
    }
-    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(source_entity.get()))
+    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(source_entity.get()); res)
    {
        // RESOURCE has no references to be validated, we allow mentioned disks to be created later
    }
@ -578,15 +578,15 @@ void WorkloadEntityStorageBase::setAllEntities(const std::vector<std::pair<Strin
            if (!entityEquals(entity, it->second))
            {
                changes.emplace_back(entity_name, entity, it->second); // Update entities that are present in both `new_entities` and `entities`
-                LOG_TRACE(log, "Entity {} was updated", entity_name);
+                LOG_TRACE(log, "Workload entity {} was updated", entity_name);
            }
            else
-                LOG_TRACE(log, "Entity {} is the same", entity_name);
+                LOG_TRACE(log, "Workload entity {} is the same", entity_name);
        }
        else
        {
            changes.emplace_back(entity_name, entity, ASTPtr{}); // Remove entities that are not present in `new_entities`
-            LOG_TRACE(log, "Entity {} was dropped", entity_name);
+            LOG_TRACE(log, "Workload entity {} was dropped", entity_name);
        }
    }
    for (const auto & [entity_name, entity] : new_entities)
@ -594,7 +594,7 @@ void WorkloadEntityStorageBase::setAllEntities(const std::vector<std::pair<Strin
        if (!entities.contains(entity_name))
        {
            changes.emplace_back(entity_name, ASTPtr{}, entity); // Create entities that are only present in `new_entities`
-            LOG_TRACE(log, "Entity {} was created", entity_name);
+            LOG_TRACE(log, "Workload entity {} was created", entity_name);
        }
    }

--- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp
@ -176,6 +176,10 @@ void ZooKeeperArgs::initFromKeeperSection(const Poco::Util::AbstractConfiguratio
        {
            connection_timeout_ms = config.getInt(config_name + "." + key);
        }
+        else if (key == "num_connection_retries")
+        {
+            num_connection_retries = config.getInt(config_name + "." + key);
+        }
        else if (key == "enable_fault_injections_during_startup")
        {
            enable_fault_injections_during_startup = config.getBool(config_name + "." + key);
--- a/src/Common/ZooKeeper/ZooKeeperArgs.h
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.h
@ -39,6 +39,7 @@ struct ZooKeeperArgs
    String sessions_path = "/clickhouse/sessions";
    String client_availability_zone;
    int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS;
+    UInt64 num_connection_retries = 2;
    int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
    int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;
    bool enable_fault_injections_during_startup = false;
--- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp
@ -440,7 +440,9 @@ void ZooKeeper::connect(
    if (nodes.empty())
        throw Exception::fromMessage(Error::ZBADARGUMENTS, "No nodes passed to ZooKeeper constructor");

-    static constexpr size_t num_tries = 3;
+    /// We always have at least one attempt to connect.
+    size_t num_tries = args.num_connection_retries + 1;
+
    bool connected = false;
    bool dns_error = false;

--- a/src/Common/ZooKeeper/ZooKeeperRetries.h
+++ b/src/Common/ZooKeeper/ZooKeeperRetries.h
@ -15,14 +15,15 @@ namespace ErrorCodes

 struct ZooKeeperRetriesInfo
 {
+    ZooKeeperRetriesInfo() = default; /// Leaves every field at its in-class default value.
     ZooKeeperRetriesInfo(UInt64 max_retries_, UInt64 initial_backoff_ms_, UInt64 max_backoff_ms_)
         : max_retries(max_retries_), initial_backoff_ms(std::min(initial_backoff_ms_, max_backoff_ms_)), max_backoff_ms(max_backoff_ms_)
     {
     }
 
-    UInt64 max_retries;
-    UInt64 initial_backoff_ms;
-    UInt64 max_backoff_ms;
+    UInt64 max_retries = 0; /// "max_retries = 0" means only one attempt.
+    UInt64 initial_backoff_ms = 100; /// The three-argument constructor clamps this to at most max_backoff_ms.
+    UInt64 max_backoff_ms = 5000; /// Upper bound for the backoff, which is doubled after each sleep between retries.
 };

 class ZooKeeperRetriesControl
@ -220,6 +221,7 @@ private:
            return false;
        }

+        /// Check if the query was cancelled.
        if (process_list_element)
            process_list_element->checkTimeLimit();

@ -228,6 +230,10 @@ private:
        sleepForMilliseconds(current_backoff_ms);
        current_backoff_ms = std::min(current_backoff_ms * 2, retries_info.max_backoff_ms);

+        /// Check if the query was cancelled again after sleeping.
+        if (process_list_element)
+            process_list_element->checkTimeLimit();
+
        return true;
    }

--- a/src/Core/BaseSettings.cpp
+++ b/src/Core/BaseSettings.cpp
@ -8,6 +8,7 @@ namespace DB
 {
 namespace ErrorCodes
 {
+    extern const int INCORRECT_DATA;
    extern const int UNKNOWN_SETTING;
 }

@ -31,11 +32,19 @@ void BaseSettingsHelpers::writeFlags(Flags flags, WriteBuffer & out)
 }


-BaseSettingsHelpers::Flags BaseSettingsHelpers::readFlags(ReadBuffer & in)
+UInt64 BaseSettingsHelpers::readFlags(ReadBuffer & in)
 {
    UInt64 res;
    readVarUInt(res, in);
-    return static_cast<Flags>(res);
+    return res;
+}
+
+SettingsTierType BaseSettingsHelpers::getTier(UInt64 flags) /// Decodes the settings tier stored in the TIER bits of `flags`.
+{
+    int8_t tier = static_cast<int8_t>(flags & Flags::TIER); /// Keep only the bits covered by the TIER mask; the value is not shifted.
+    if (tier > SettingsTierType::BETA) /// Anything above BETA is rejected with INCORRECT_DATA.
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown tier value: '{}'", tier);
+    return SettingsTierType{tier};
 }


--- a/src/Core/BaseSettings.h
+++ b/src/Core/BaseSettings.h
@ -2,6 +2,7 @@

 #include <unordered_map>
 #include <Core/SettingsFields.h>
+#include <Core/SettingsTierType.h>
 #include <Core/SettingsWriteFormat.h>
 #include <IO/Operators.h>
 #include <base/range.h>
@ -21,6 +22,27 @@ namespace DB
 class ReadBuffer;
 class WriteBuffer;

+struct BaseSettingsHelpers /// Non-template helpers used by the BaseSettings template and the settings traits macros.
+{
+    [[noreturn]] static void throwSettingNotFound(std::string_view name);
+    static void warningSettingNotFound(std::string_view name);
+
+    static void writeString(std::string_view str, WriteBuffer & out);
+    static String readString(ReadBuffer & in);
+
+    enum Flags : UInt64 /// Bit layout of the per-setting flags word.
+    {
+        IMPORTANT = 0x01, /// The setting affects query results and can't be ignored by older versions.
+        CUSTOM = 0x02, /// NOTE(review): presumably marks a custom (user-defined) setting - confirm against the writer side.
+        TIER = 0x0c, /// 0b1100 == 2 bits
+        /// If adding new flags, consider first if Tier might need more bits
+    };
+
+    static SettingsTierType getTier(UInt64 flags); /// Decodes the TIER bits of `flags`; throws INCORRECT_DATA for a value above BETA.
+    static void writeFlags(Flags flags, WriteBuffer & out);
+    static UInt64 readFlags(ReadBuffer & in); /// Reads a varint and returns it as a raw UInt64 (not cast to Flags).
+};
+
 /** Template class to define collections of settings.
  * If you create a new setting, please also add it to ./utils/check-style/check-settings-style
  * for validation
@ -138,7 +160,7 @@ public:
        const char * getTypeName() const;
        const char * getDescription() const;
        bool isCustom() const;
-        bool isObsolete() const;
+        SettingsTierType getTier() const;

        bool operator==(const SettingFieldRef & other) const { return (getName() == other.getName()) && (getValue() == other.getValue()); }
        bool operator!=(const SettingFieldRef & other) const { return !(*this == other); }
@ -225,24 +247,6 @@ private:
    std::conditional_t<Traits::allow_custom_settings, CustomSettingMap, boost::blank> custom_settings_map;
 };

-struct BaseSettingsHelpers
-{
-    [[noreturn]] static void throwSettingNotFound(std::string_view name);
-    static void warningSettingNotFound(std::string_view name);
-
-    static void writeString(std::string_view str, WriteBuffer & out);
-    static String readString(ReadBuffer & in);
-
-    enum Flags : UInt64
-    {
-        IMPORTANT = 0x01,
-        CUSTOM = 0x02,
-        OBSOLETE = 0x04,
-    };
-    static void writeFlags(Flags flags, WriteBuffer & out);
-    static Flags readFlags(ReadBuffer & in);
-};
-
 template <typename TTraits>
 void BaseSettings<TTraits>::set(std::string_view name, const Field & value)
 {
@ -477,7 +481,7 @@ void BaseSettings<TTraits>::read(ReadBuffer & in, SettingsWriteFormat format)
        size_t index = accessor.find(name);

        using Flags = BaseSettingsHelpers::Flags;
-        Flags flags{0};
+        UInt64 flags{0};
        if (format >= SettingsWriteFormat::STRINGS_WITH_FLAGS)
            flags = BaseSettingsHelpers::readFlags(in);
        bool is_important = (flags & Flags::IMPORTANT);
@ -797,14 +801,14 @@ bool BaseSettings<TTraits>::SettingFieldRef::isCustom() const
 }

 template <typename TTraits>
-bool BaseSettings<TTraits>::SettingFieldRef::isObsolete() const
+SettingsTierType BaseSettings<TTraits>::SettingFieldRef::getTier() const
 {
    if constexpr (Traits::allow_custom_settings)
    {
        if (custom_setting)
-            return false;
+            return SettingsTierType::PRODUCTION;
    }
-    return accessor->isObsolete(index);
+    return accessor->getTier(index);
 }

 using AliasMap = std::unordered_map<std::string_view, std::string_view>;
@ -835,8 +839,8 @@ using AliasMap = std::unordered_map<std::string_view, std::string_view>;
            const String & getName(size_t index) const { return field_infos[index].name; } \
            const char * getTypeName(size_t index) const { return field_infos[index].type; } \
            const char * getDescription(size_t index) const { return field_infos[index].description; } \
-            bool isImportant(size_t index) const { return field_infos[index].is_important; } \
-            bool isObsolete(size_t index) const { return field_infos[index].is_obsolete; } \
+            bool isImportant(size_t index) const { return field_infos[index].flags & BaseSettingsHelpers::Flags::IMPORTANT; } \
+            SettingsTierType getTier(size_t index) const { return BaseSettingsHelpers::getTier(field_infos[index].flags); } \
            Field castValueUtil(size_t index, const Field & value) const { return field_infos[index].cast_value_util_function(value); } \
            String valueToStringUtil(size_t index, const Field & value) const { return field_infos[index].value_to_string_util_function(value); } \
            Field stringToValueUtil(size_t index, const String & str) const { return field_infos[index].string_to_value_util_function(str); } \
@ -856,8 +860,7 @@ using AliasMap = std::unordered_map<std::string_view, std::string_view>;
                String name; \
                const char * type; \
                const char * description; \
-                bool is_important; \
-                bool is_obsolete; \
+                UInt64 flags; \
                Field (*cast_value_util_function)(const Field &); \
                String (*value_to_string_util_function)(const Field &); \
                Field (*string_to_value_util_function)(const String &); \
@ -968,8 +971,8 @@ struct DefineAliases
 /// NOLINTNEXTLINE
 #define IMPLEMENT_SETTINGS_TRAITS_(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \
    res.field_infos.emplace_back( \
-        FieldInfo{#NAME, #TYPE, DESCRIPTION, (FLAGS) & IMPORTANT, \
-            static_cast<bool>((FLAGS) & BaseSettingsHelpers::Flags::OBSOLETE), \
+        FieldInfo{#NAME, #TYPE, DESCRIPTION, \
+            static_cast<UInt64>(FLAGS), \
            [](const Field & value) -> Field { return static_cast<Field>(SettingField##TYPE{value}); }, \
            [](const Field & value) -> String { return SettingField##TYPE{value}.toString(); }, \
            [](const String & str) -> Field { SettingField##TYPE temp; temp.parseFromString(str); return static_cast<Field>(temp); }, \
--- a/src/Core/ServerSettings.cpp
+++ b/src/Core/ServerSettings.cpp
@ -192,6 +192,13 @@ namespace DB
    DECLARE(UInt64, parts_killer_pool_size, 128, "Threads for cleanup of shared merge tree outdated threads. Only available in ClickHouse Cloud", 0) \
    DECLARE(UInt64, keeper_multiread_batch_size, 10'000, "Maximum size of batch for MultiRead request to [Zoo]Keeper that support batching. If set to 0, batching is disabled. Available only in ClickHouse Cloud.", 0) \
    DECLARE(Bool, use_legacy_mongodb_integration, true, "Use the legacy MongoDB integration implementation. Note: it's highly recommended to set this option to false, since legacy implementation will be removed in the future. Please submit any issues you encounter with the new implementation.", 0) \
+    \
+    DECLARE(UInt64, prefetch_threadpool_pool_size, 100, "Size of background pool for prefetches for remote object storages", 0) \
+    DECLARE(UInt64, prefetch_threadpool_queue_size, 1000000, "Number of tasks which is possible to push into prefetches pool", 0) \
+    DECLARE(UInt64, load_marks_threadpool_pool_size, 50, "Size of background pool for marks loading", 0) \
+    DECLARE(UInt64, load_marks_threadpool_queue_size, 1000000, "Number of tasks which is possible to push into background pool for marks loading", 0) \
+    DECLARE(UInt64, threadpool_writer_pool_size, 100, "Size of background pool for write requests to object storages", 0) \
+    DECLARE(UInt64, threadpool_writer_queue_size, 1000000, "Number of tasks which is possible to push into background pool for write requests to object storages", 0)

 /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in dumpToSystemServerSettingsColumns below

@ -339,7 +346,7 @@ void ServerSettings::dumpToSystemServerSettingsColumns(ServerSettingColumnsParam
        res_columns[4]->insert(setting.getDescription());
        res_columns[5]->insert(setting.getTypeName());
        res_columns[6]->insert(is_changeable ? changeable_settings_it->second.second : ChangeableWithoutRestart::No);
-        res_columns[7]->insert(setting.isObsolete());
+        res_columns[7]->insert(setting.getTier() == SettingsTierType::OBSOLETE);
    }
 }
 }
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@ -1,7 +1,5 @@
-#include <Columns/ColumnArray.h>
 #include <Columns/ColumnMap.h>
 #include <Core/BaseSettings.h>
-#include <Core/BaseSettingsFwdMacros.h>
 #include <Core/BaseSettingsFwdMacrosImpl.h>
 #include <Core/BaseSettingsProgramOptions.h>
 #include <Core/DistributedCacheProtocol.h>
@ -40,10 +38,17 @@ namespace ErrorCodes
  * Note: as an alternative, we could implement settings to be completely dynamic in the form of the map: String -> Field,
  *  but we are not going to do it, because settings are used everywhere as static struct fields.
  *
-  * `flags` can be either 0 or IMPORTANT.
-  * A setting is "IMPORTANT" if it affects the results of queries and can't be ignored by older versions.
+  * `flags` can include a Tier (BETA | EXPERIMENTAL), optionally combined via bitwise OR with IMPORTANT.
+  * The default (0) means a PRODUCTION ready setting
  *
-  * When adding new or changing existing settings add them to the settings changes history in SettingsChangesHistory.h
+  * A setting is "IMPORTANT" if it affects the results of queries and can't be ignored by older versions.
+  * Tiers:
+  * EXPERIMENTAL: The feature is in active development stage. Mostly for developers or for ClickHouse enthusiasts.
+  * BETA: There are no known bugs in the functionality, but the outcome of using it together with other
+  * features/components is unknown and correctness is not guaranteed.
+  * PRODUCTION (Default): The feature is safe to use along with other features from the PRODUCTION tier.
+  *
+  * When adding new or changing existing settings add them to the settings changes history in SettingsChangesHistory.cpp
  * for tracking settings changes in different versions and for special `compatibility` settings to work correctly.
  */

@ -2660,29 +2665,44 @@ The maximum amount of data consumed by temporary files on disk in bytes for all
 The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running queries. Zero means unlimited.
 )", 0)\
    \
-    DECLARE(UInt64, backup_restore_keeper_max_retries, 20, R"(
-Max retries for keeper operations during backup or restore
+    DECLARE(UInt64, backup_restore_keeper_max_retries, 1000, R"(
+Max retries for [Zoo]Keeper operations in the middle of a BACKUP or RESTORE operation.
+Should be big enough so the whole operation won't fail because of a temporary [Zoo]Keeper failure.
 )", 0) \
    DECLARE(UInt64, backup_restore_keeper_retry_initial_backoff_ms, 100, R"(
 Initial backoff timeout for [Zoo]Keeper operations during backup or restore
 )", 0) \
    DECLARE(UInt64, backup_restore_keeper_retry_max_backoff_ms, 5000, R"(
 Max backoff timeout for [Zoo]Keeper operations during backup or restore
+)", 0) \
+    DECLARE(UInt64, backup_restore_failure_after_host_disconnected_for_seconds, 3600, R"(
+If a host during a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation doesn't recreate its ephemeral 'alive' node in ZooKeeper for this amount of time then the whole backup or restore is considered as failed.
+This value should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure.
+Zero means unlimited.
+)", 0) \
+    DECLARE(UInt64, backup_restore_keeper_max_retries_while_initializing, 20, R"(
+Max retries for [Zoo]Keeper operations during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+)", 0) \
+    DECLARE(UInt64, backup_restore_keeper_max_retries_while_handling_error, 20, R"(
+Max retries for [Zoo]Keeper operations while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+)", 0) \
+    DECLARE(UInt64, backup_restore_finish_timeout_after_error_sec, 180, R"(
+How long the initiator should wait for other host to react to the 'error' node and stop their work on the current BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+)", 0) \
+    DECLARE(UInt64, backup_restore_keeper_value_max_size, 1048576, R"(
+Maximum size of data of a [Zoo]Keeper's node during backup
+)", 0) \
+    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"(
+Maximum size of batch for multi request to [Zoo]Keeper during backup or restore
+)", 0) \
+    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"(
+Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore
 )", 0) \
    DECLARE(Float, backup_restore_keeper_fault_injection_probability, 0.0f, R"(
 Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f]
 )", 0) \
    DECLARE(UInt64, backup_restore_keeper_fault_injection_seed, 0, R"(
 0 - random seed, otherwise the setting value
-)", 0) \
-    DECLARE(UInt64, backup_restore_keeper_value_max_size, 1048576, R"(
-Maximum size of data of a [Zoo]Keeper's node during backup
-)", 0) \
-    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"(
-Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore
-)", 0) \
-    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"(
-Maximum size of batch for multi request to [Zoo]Keeper during backup or restore
 )", 0) \
    DECLARE(UInt64, backup_restore_s3_retry_attempts, 1000, R"(
 Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore.
@ -5106,6 +5126,9 @@ Only in ClickHouse Cloud. A maximum number of unacknowledged in-flight packets i
 )", 0) \
    DECLARE(UInt64, distributed_cache_data_packet_ack_window, DistributedCache::ACK_DATA_PACKET_WINDOW, R"(
 Only in ClickHouse Cloud. A window for sending ACK for DataPacket sequence in a single distributed cache read request
+)", 0) \
+    DECLARE(Bool, distributed_cache_discard_connection_if_unread_data, true, R"(
+Only in ClickHouse Cloud. Discard connection if some data is unread.
 )", 0) \
    DECLARE(Bool, filesystem_cache_enable_background_download_for_metadata_files_in_packed_storage, true, R"(
 Only in ClickHouse Cloud. Wait time to lock cache for space reservation in filesystem cache
@ -5512,90 +5535,102 @@ For testing purposes. Replaces all external table functions to Null to not initi
    DECLARE(Bool, restore_replace_external_dictionary_source_to_null, false, R"(
 Replace external dictionary sources to Null on restore. Useful for testing purposes
 )", 0) \
-    DECLARE(Bool, create_if_not_exists, false, R"(
-Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown.
-)", 0) \
-    DECLARE(Bool, enforce_strict_identifier_format, false, R"(
-If enabled, only allow identifiers containing alphanumeric characters and underscores.
-)", 0) \
-    DECLARE(Bool, mongodb_throw_on_unsupported_query, true, R"(
-If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'.
-)", 0) \
-    \
-    /* ###################################### */ \
-    /* ######## EXPERIMENTAL FEATURES ####### */ \
-    /* ###################################### */ \
-    DECLARE(Bool, allow_experimental_materialized_postgresql_table, false, R"(
-Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental
-)", 0) \
-    DECLARE(Bool, allow_experimental_funnel_functions, false, R"(
-Enable experimental functions for funnel analysis.
-)", 0) \
-    DECLARE(Bool, allow_experimental_nlp_functions, false, R"(
-Enable experimental functions for natural language processing.
-)", 0) \
-    DECLARE(Bool, allow_experimental_hash_functions, false, R"(
-Enable experimental hash functions
-)", 0) \
-    DECLARE(Bool, allow_experimental_object_type, false, R"(
-Allow Object and JSON data types
-)", 0) \
-    DECLARE(Bool, allow_experimental_time_series_table, false, R"(
-Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine.
+        /* Parallel replicas */ \
+    DECLARE(UInt64, allow_experimental_parallel_reading_from_replicas, 0, R"(
+Use up to `max_parallel_replicas` the number of replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure
+)", BETA) ALIAS(enable_parallel_replicas) \
+    DECLARE(NonZeroUInt64, max_parallel_replicas, 1, R"(
+The maximum number of replicas for each shard when executing a query.

 Possible values:

- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled.
- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled.
-)", 0) \
-    DECLARE(Bool, allow_experimental_vector_similarity_index, false, R"(
-Allow experimental vector similarity index
-)", 0) \
-    DECLARE(Bool, allow_experimental_variant_type, false, R"(
-Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md).
-)", 0) \
-    DECLARE(Bool, allow_experimental_dynamic_type, false, R"(
-Allow Dynamic data type
-)", 0) \
-    DECLARE(Bool, allow_experimental_json_type, false, R"(
-Allow JSON data type
-)", 0) \
-    DECLARE(Bool, allow_experimental_codecs, false, R"(
-If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).
-)", 0) \
-    DECLARE(Bool, allow_experimental_shared_set_join, true, R"(
-Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin
-)", 0) \
-    DECLARE(UInt64, max_limit_for_ann_queries, 1'000'000, R"(
-SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes.
-)", 0) \
-    DECLARE(UInt64, hnsw_candidate_list_size_for_search, 256, R"(
-The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'.
-)", 0) \
-    DECLARE(Bool, throw_on_unsupported_query_inside_transaction, true, R"(
-Throw exception if unsupported query is used inside transaction
-)", 0) \
-    DECLARE(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, R"(
-Wait for committed changes to become actually visible in the latest snapshot
-)", 0) \
-    DECLARE(Bool, implicit_transaction, false, R"(
-If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)
-)", 0) \
-    DECLARE(UInt64, grace_hash_join_initial_buckets, 1, R"(
-Initial number of grace hash join buckets
-)", 0) \
-    DECLARE(UInt64, grace_hash_join_max_buckets, 1024, R"(
-Limit on the number of grace hash join buckets
-)", 0) \
-    DECLARE(UInt64, join_to_sort_minimum_perkey_rows, 40, R"(
-The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys
-)", 0) \
-    DECLARE(UInt64, join_to_sort_maximum_table_rows, 10000, R"(
-The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join.
-)", 0) \
-    DECLARE(Bool, allow_experimental_join_right_table_sorting, false, R"(
-If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join.
+- Positive integer.
+
+**Additional Info**
+
+This option will produce different results depending on the settings used.
+
+:::note
+This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details.
+:::
+
+### Parallel processing using `SAMPLE` key
+
+A query may be processed faster if it is executed on several servers in parallel. But the query performance may degrade in the following cases:
+
+- The position of the sampling key in the partitioning key does not allow efficient range scans.
+- Adding a sampling key to the table makes filtering by other columns less efficient.
+- The sampling key is an expression that is expensive to calculate.
+- The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency.
+
+### Parallel processing using [parallel_replicas_custom_key](#parallel_replicas_custom_key)
+
+This setting is useful for any replicated table.
 )", 0) \
+    DECLARE(ParallelReplicasMode, parallel_replicas_mode, ParallelReplicasMode::READ_TASKS, R"(
+Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key.
+)", BETA) \
+    DECLARE(UInt64, parallel_replicas_count, 0, R"(
+This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing.
+)", BETA) \
+    DECLARE(UInt64, parallel_replica_offset, 0, R"(
+This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas.
+)", BETA) \
+    DECLARE(String, parallel_replicas_custom_key, "", R"(
+An arbitrary integer expression that can be used to split work between replicas for a specific table.
+The value can be any integer expression.
+
+Simple expressions using primary keys are preferred.
+
+If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards.
+Otherwise, it will behave same as for `SAMPLE` key, it will use multiple replicas of each shard.
+)", BETA) \
+    DECLARE(UInt64, parallel_replicas_custom_key_range_lower, 0, R"(
+Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`.
+
+When used in conjunction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`.
+
+Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing.
+)", BETA) \
+    DECLARE(UInt64, parallel_replicas_custom_key_range_upper, 0, R"(
+Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it to the max value of the custom key expression.
+
+When used in conjunction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`.
+
+Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing
+)", BETA) \
+    DECLARE(String, cluster_for_parallel_replicas, "", R"(
+Cluster for a shard in which current server is located
+)", BETA) \
+    DECLARE(Bool, parallel_replicas_allow_in_with_subquery, true, R"(
+If true, subquery for IN will be executed on every follower replica.
+)", BETA) \
+    DECLARE(Float, parallel_replicas_single_task_marks_count_multiplier, 2, R"(
+A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas.
+)", BETA) \
+    DECLARE(Bool, parallel_replicas_for_non_replicated_merge_tree, false, R"(
+If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables
+)", BETA) \
+    DECLARE(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, R"(
+Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The max is still limited by 'max_parallel_replicas'
+)", BETA) \
+    DECLARE(Bool, parallel_replicas_prefer_local_join, true, R"(
+If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN.
+)", BETA) \
+    DECLARE(UInt64, parallel_replicas_mark_segment_size, 0, R"(
+Parts virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure in what you're doing. Value should be in range [128; 16384]
+)", BETA) \
+    DECLARE(Bool, parallel_replicas_local_plan, false, R"(
+Build local plan for local replica
+)", BETA) \
+    \
+    DECLARE(Bool, allow_experimental_analyzer, true, R"(
+Allow new query analyzer.
+)", IMPORTANT | BETA) ALIAS(enable_analyzer) \
+    DECLARE(Bool, analyzer_compatibility_join_using_top_level_identifier, false, R"(
+Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather than `t1.b = t2.b`).
+)", BETA) \
+    \
    DECLARE(Timezone, session_timezone, "", R"(
 Sets the implicit time zone of the current session or query.
 The implicit time zone is the time zone applied to values of type DateTime/DateTime64 which have no explicitly specified time zone.
@ -5655,126 +5690,121 @@ This happens due to different parsing pipelines:
 **See also**

 - [timezone](../server-configuration-parameters/settings.md#timezone)
+)", BETA) \
+DECLARE(Bool, create_if_not_exists, false, R"(
+Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown.
+)", 0) \
+    DECLARE(Bool, enforce_strict_identifier_format, false, R"(
+If enabled, only allow identifiers containing alphanumeric characters and underscores.
+)", 0) \
+    DECLARE(Bool, mongodb_throw_on_unsupported_query, true, R"(
+If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'.
+)", 0) \
+    DECLARE(Bool, implicit_select, false, R"(
+Allow writing simple SELECT queries without the leading SELECT keyword, which makes it simple for calculator-style usage, e.g. `1 + 2` becomes a valid query.
 )", 0) \
-    DECLARE(Bool, use_hive_partitioning, false, R"(
-When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`.
-)", 0)\
    \
-    DECLARE(Bool, allow_statistics_optimize, false, R"(
-Allows using statistics to optimize queries
-)", 0) ALIAS(allow_statistic_optimize) \
-    DECLARE(Bool, allow_experimental_statistics, false, R"(
-Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics).
-)", 0) ALIAS(allow_experimental_statistic) \
    \
-    /* Parallel replicas */ \
-    DECLARE(UInt64, allow_experimental_parallel_reading_from_replicas, 0, R"(
-Use up to `max_parallel_replicas` the number of replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure
-)", 0) ALIAS(enable_parallel_replicas) \
-    DECLARE(NonZeroUInt64, max_parallel_replicas, 1, R"(
-The maximum number of replicas for each shard when executing a query.
+    /* ####################################################### */ \
+    /* ########### START OF EXPERIMENTAL FEATURES ############ */ \
+    /* ## ADD PRODUCTION / BETA FEATURES BEFORE THIS BLOCK  ## */ \
+    /* ####################################################### */ \
+    \
+    DECLARE(Bool, allow_experimental_materialized_postgresql_table, false, R"(
+Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_funnel_functions, false, R"(
+Enable experimental functions for funnel analysis.
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_nlp_functions, false, R"(
+Enable experimental functions for natural language processing.
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_hash_functions, false, R"(
+Enable experimental hash functions
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_object_type, false, R"(
+Allow Object and JSON data types
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_time_series_table, false, R"(
+Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine.

 Possible values:

- Positive integer.
-
-**Additional Info**
-
-This options will produce different results depending on the settings used.
-
-:::note
-This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details.
-:::
-
-### Parallel processing using `SAMPLE` key
-
-A query may be processed faster if it is executed on several servers in parallel. But the query performance may degrade in the following cases:
-
- The position of the sampling key in the partitioning key does not allow efficient range scans.
- Adding a sampling key to the table makes filtering by other columns less efficient.
- The sampling key is an expression that is expensive to calculate.
- The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency.
-
-### Parallel processing using [parallel_replicas_custom_key](#parallel_replicas_custom_key)
-
-This setting is useful for any replicated table.
-)", 0) \
-    DECLARE(ParallelReplicasMode, parallel_replicas_mode, ParallelReplicasMode::READ_TASKS, R"(
-Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key.
-)", 0) \
-    DECLARE(UInt64, parallel_replicas_count, 0, R"(
-This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing.
-)", 0) \
-    DECLARE(UInt64, parallel_replica_offset, 0, R"(
-This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas.
-)", 0) \
-    DECLARE(String, parallel_replicas_custom_key, "", R"(
-An arbitrary integer expression that can be used to split work between replicas for a specific table.
-The value can be any integer expression.
-
-Simple expressions using primary keys are preferred.
-
-If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards.
-Otherwise, it will behave same as for `SAMPLE` key, it will use multiple replicas of each shard.
-)", 0) \
-    DECLARE(UInt64, parallel_replicas_custom_key_range_lower, 0, R"(
-Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`.
-
-When used in conjunction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`.
-
-Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing.
-)", 0) \
-    DECLARE(UInt64, parallel_replicas_custom_key_range_upper, 0, R"(
-Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it the max value of the custom key expression.
-
-When used in conjunction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`.
-
-Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing
-)", 0) \
-    DECLARE(String, cluster_for_parallel_replicas, "", R"(
-Cluster for a shard in which current server is located
-)", 0) \
-    DECLARE(Bool, parallel_replicas_allow_in_with_subquery, true, R"(
-If true, subquery for IN will be executed on every follower replica.
-)", 0) \
-    DECLARE(Float, parallel_replicas_single_task_marks_count_multiplier, 2, R"(
-A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas.
-)", 0) \
-    DECLARE(Bool, parallel_replicas_for_non_replicated_merge_tree, false, R"(
-If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables
-)", 0) \
-    DECLARE(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, R"(
-Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The max is still limited by 'max_parallel_replicas'
-)", 0) \
-    DECLARE(Bool, parallel_replicas_prefer_local_join, true, R"(
-If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN.
-)", 0) \
-    DECLARE(UInt64, parallel_replicas_mark_segment_size, 0, R"(
-Parts virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure in what you're doing. Value should be in range [128; 16384]
+- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled.
+- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled.
 )", 0) \
+    DECLARE(Bool, allow_experimental_vector_similarity_index, false, R"(
+Allow experimental vector similarity index
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_variant_type, false, R"(
+Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md).
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_dynamic_type, false, R"(
+Allow Dynamic data type
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_json_type, false, R"(
+Allow JSON data type
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_codecs, false, R"(
+If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_shared_set_join, true, R"(
+Only in ClickHouse Cloud. Allow to create SharedSet and SharedJoin
+)", EXPERIMENTAL) \
+    DECLARE(UInt64, max_limit_for_ann_queries, 1'000'000, R"(
+SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes.
+)", EXPERIMENTAL) \
+    DECLARE(UInt64, hnsw_candidate_list_size_for_search, 256, R"(
+The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'.
+)", EXPERIMENTAL) \
+    DECLARE(Bool, throw_on_unsupported_query_inside_transaction, true, R"(
+Throw exception if unsupported query is used inside transaction
+)", EXPERIMENTAL) \
+    DECLARE(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, R"(
+Wait for committed changes to become actually visible in the latest snapshot
+)", EXPERIMENTAL) \
+    DECLARE(Bool, implicit_transaction, false, R"(
+If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)
+)", EXPERIMENTAL) \
+    DECLARE(UInt64, grace_hash_join_initial_buckets, 1, R"(
+Initial number of grace hash join buckets
+)", EXPERIMENTAL) \
+    DECLARE(UInt64, grace_hash_join_max_buckets, 1024, R"(
+Limit on the number of grace hash join buckets
+)", EXPERIMENTAL) \
+    DECLARE(UInt64, join_to_sort_minimum_perkey_rows, 40, R"(
+The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys
+)", EXPERIMENTAL) \
+    DECLARE(UInt64, join_to_sort_maximum_table_rows, 10000, R"(
+The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join.
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_join_right_table_sorting, false, R"(
+If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join.
+)", EXPERIMENTAL) \
+    DECLARE(Bool, use_hive_partitioning, false, R"(
+When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`.
+)", EXPERIMENTAL)\
+    \
+    DECLARE(Bool, allow_statistics_optimize, false, R"(
+Allows using statistics to optimize queries
+)", EXPERIMENTAL) ALIAS(allow_statistic_optimize) \
+    DECLARE(Bool, allow_experimental_statistics, false, R"(
+Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics).
+)", EXPERIMENTAL) ALIAS(allow_experimental_statistic) \
+    \
    DECLARE(Bool, allow_archive_path_syntax, true, R"(
 File/S3 engines/table function will parse paths with '::' as '\\<archive\\> :: \\<file\\>' if archive has correct extension
-)", 0) \
-    DECLARE(Bool, parallel_replicas_local_plan, false, R"(
-Build local plan for local replica
-)", 0) \
+)", EXPERIMENTAL) \
    \
    DECLARE(Bool, allow_experimental_inverted_index, false, R"(
 If it is set to true, allow to use experimental inverted index.
-)", 0) \
+)", EXPERIMENTAL) \
    DECLARE(Bool, allow_experimental_full_text_index, false, R"(
 If it is set to true, allow to use experimental full-text index.
-)", 0) \
+)", EXPERIMENTAL) \
    \
    DECLARE(Bool, allow_experimental_join_condition, false, R"(
 Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y.
-)", 0) \
-    \
-    DECLARE(Bool, allow_experimental_analyzer, true, R"(
-Allow new query analyzer.
-)", IMPORTANT) ALIAS(enable_analyzer) \
-    DECLARE(Bool, analyzer_compatibility_join_using_top_level_identifier, false, R"(
-Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`).
 )", 0) \
    \
    DECLARE(Bool, allow_experimental_live_view, false, R"(
@ -5787,43 +5817,43 @@ Possible values:
 )", 0) \
    DECLARE(Seconds, live_view_heartbeat_interval, 15, R"(
 The heartbeat interval in seconds to indicate live query is alive.
-)", 0) \
+)", EXPERIMENTAL) \
    DECLARE(UInt64, max_live_view_insert_blocks_before_refresh, 64, R"(
 Limit maximum number of inserted blocks after which mergeable blocks are dropped and query is re-executed.
-)", 0) \
+)", EXPERIMENTAL) \
    \
    DECLARE(Bool, allow_experimental_window_view, false, R"(
 Enable WINDOW VIEW. Not mature enough.
-)", 0) \
+)", EXPERIMENTAL) \
    DECLARE(Seconds, window_view_clean_interval, 60, R"(
 The clean interval of window view in seconds to free outdated data.
-)", 0) \
+)", EXPERIMENTAL) \
    DECLARE(Seconds, window_view_heartbeat_interval, 15, R"(
 The heartbeat interval in seconds to indicate watch query is alive.
-)", 0) \
+)", EXPERIMENTAL) \
    DECLARE(Seconds, wait_for_window_view_fire_signal_timeout, 10, R"(
 Timeout for waiting for window view fire signal in event time processing
-)", 0) \
+)", EXPERIMENTAL) \
    \
    DECLARE(Bool, stop_refreshable_materialized_views_on_startup, false, R"(
 On server startup, prevent scheduling of refreshable materialized views, as if with SYSTEM STOP VIEWS. You can manually start them with SYSTEM START VIEWS or SYSTEM START VIEW \\<name\\> afterwards. Also applies to newly created views. Has no effect on non-refreshable materialized views.
-)", 0) \
+)", EXPERIMENTAL) \
    \
    DECLARE(Bool, allow_experimental_database_materialized_mysql, false, R"(
 Allow to create database with Engine=MaterializedMySQL(...).
-)", 0) \
+)", EXPERIMENTAL) \
    DECLARE(Bool, allow_experimental_database_materialized_postgresql, false, R"(
 Allow to create database with Engine=MaterializedPostgreSQL(...).
-)", 0) \
+)", EXPERIMENTAL) \
    \
    /** Experimental feature for moving data between shards. */ \
    DECLARE(Bool, allow_experimental_query_deduplication, false, R"(
 Experimental data deduplication for SELECT queries based on part UUIDs
-)", 0) \
-    DECLARE(Bool, implicit_select, false, R"(
-Allow writing simple SELECT queries without the leading SELECT keyword, which makes it simple for calculator-style usage, e.g. `1 + 2` becomes a valid query.
-)", 0)
-
+)", EXPERIMENTAL) \
+    \
+    /* ####################################################### */ \
+    /* ############ END OF EXPERIMENTAL FEATURES ############# */ \
+    /* ####################################################### */ \

 // End of COMMON_SETTINGS
 // Please add settings related to formats in Core/FormatFactorySettings.h, move obsolete settings to OBSOLETE_SETTINGS and obsolete format settings to OBSOLETE_FORMAT_SETTINGS.
@ -5902,13 +5932,14 @@ Allow writing simple SELECT queries without the leading SELECT keyword, which ma
    /** The section above is for obsolete settings. Do not add anything there. */
 #endif /// __CLION_IDE__

-
 #define LIST_OF_SETTINGS(M, ALIAS)     \
    COMMON_SETTINGS(M, ALIAS)          \
    OBSOLETE_SETTINGS(M, ALIAS)        \
    FORMAT_FACTORY_SETTINGS(M, ALIAS)  \
    OBSOLETE_FORMAT_SETTINGS(M, ALIAS) \

+// clang-format on
+
 DECLARE_SETTINGS_TRAITS_ALLOW_CUSTOM_SETTINGS(SettingsTraits, LIST_OF_SETTINGS)
 IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS)

@ -6016,7 +6047,7 @@ void SettingsImpl::checkNoSettingNamesAtTopLevel(const Poco::Util::AbstractConfi
    {
        const auto & name = setting.getName();
        bool should_skip_check = name == "max_table_size_to_drop" || name == "max_partition_size_to_drop";
-        if (config.has(name) && !setting.isObsolete() && !should_skip_check)
+        if (config.has(name) && (setting.getTier() != SettingsTierType::OBSOLETE) && !should_skip_check)
        {
            throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "A setting '{}' appeared at top level in config {}."
                " But it is user-level setting that should be located in users.xml inside <profiles> section for specific profile."
@ -6192,7 +6223,7 @@ std::vector<std::string_view> Settings::getChangedAndObsoleteNames() const
    std::vector<std::string_view> setting_names;
    for (const auto & setting : impl->allChanged())
    {
-        if (setting.isObsolete())
+        if (setting.getTier() == SettingsTierType::OBSOLETE)
            setting_names.emplace_back(setting.getName());
    }
    return setting_names;
@ -6241,7 +6272,8 @@ void Settings::dumpToSystemSettingsColumns(MutableColumnsAndConstraints & params
        res_columns[6]->insert(writability == SettingConstraintWritability::CONST);
        res_columns[7]->insert(setting.getTypeName());
        res_columns[8]->insert(setting.getDefaultValueString());
-        res_columns[10]->insert(setting.isObsolete());
+        res_columns[10]->insert(setting.getTier() == SettingsTierType::OBSOLETE);
+        res_columns[11]->insert(setting.getTier());
    };

    const auto & settings_to_aliases = SettingsImpl::Traits::settingsToAliases();
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@ -64,6 +64,15 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
    },
    {"24.11",
        {
+            {"distributed_cache_discard_connection_if_unread_data", true, true, "New setting"},
+            {"filesystem_cache_enable_background_download_for_metadata_files_in_packed_storage", true, true, "New setting"},
+            {"filesystem_cache_enable_background_download_during_fetch", true, true, "New setting"},
+            {"azure_check_objects_after_upload", false, false, "Check each uploaded object in azure blob storage to be sure that upload was successful"},
+            {"backup_restore_keeper_max_retries", 20, 1000, "Should be big enough so the whole operation BACKUP or RESTORE operation won't fail because of a temporary [Zoo]Keeper failure in the middle of it."},
+            {"backup_restore_failure_after_host_disconnected_for_seconds", 0, 3600, "New setting."},
+            {"backup_restore_keeper_max_retries_while_initializing", 0, 20, "New setting."},
+            {"backup_restore_keeper_max_retries_while_handling_error", 0, 20, "New setting."},
+            {"backup_restore_finish_timeout_after_error_sec", 0, 180, "New setting."},
        }
    },
    {"24.10",
@ -112,9 +121,6 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"allow_reorder_prewhere_conditions", false, true, "New setting"},
            {"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
            {"date_time_64_output_format_cut_trailing_zeros_align_to_groups_of_thousands", false, false, "Dynamically trim the trailing zeros of datetime64 values to adjust the output scale to (0, 3, 6), corresponding to 'seconds', 'milliseconds', and 'microseconds'."},
-            {"filesystem_cache_enable_background_download_for_metadata_files_in_packed_storage", true, true, "New setting"},
-            {"filesystem_cache_enable_background_download_during_fetch", true, true, "New setting"},
-            {"azure_check_objects_after_upload", false, false, "Check each uploaded object in azure blob storage to be sure that upload was successful"},
        }
    },
    {"24.9",
--- a/src/Core/SettingsObsoleteMacros.h
+++ b/src/Core/SettingsObsoleteMacros.h
@ -2,8 +2,8 @@

 // clang-format off
 #define MAKE_OBSOLETE(M, TYPE, NAME, DEFAULT) \
-    M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", BaseSettingsHelpers::Flags::OBSOLETE)
+    M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", SettingsTierType::OBSOLETE)

 /// NOTE: ServerSettings::loadSettingsFromConfig() should be updated to include these settings
 #define MAKE_DEPRECATED_BY_SERVER_CONFIG(M, TYPE, NAME, DEFAULT) \
-    M(TYPE, NAME, DEFAULT, "User-level setting is deprecated, and it must be defined in the server configuration instead.", BaseSettingsHelpers::Flags::OBSOLETE)
+    M(TYPE, NAME, DEFAULT, "User-level setting is deprecated, and it must be defined in the server configuration instead.", SettingsTierType::OBSOLETE)
--- a/src/Core/SettingsTierType.cpp
+++ b/src/Core/SettingsTierType.cpp
@ -0,0 +1,19 @@
+#include <Core/SettingsTierType.h>
+#include <DataTypes/DataTypeEnum.h>
+
+namespace DB
+{
+
+std::shared_ptr<DataTypeEnum8> getSettingsTierEnum()
+{
+    return std::make_shared<DataTypeEnum8>( /// Enum8 data type with one named value per SettingsTierType enumerator.
+        DataTypeEnum8::Values
+        {
+            {"Production",      static_cast<Int8>(SettingsTierType::PRODUCTION)}, /// Each entry: display name -> numeric value of the enumerator.
+            {"Obsolete",        static_cast<Int8>(SettingsTierType::OBSOLETE)},
+            {"Experimental",    static_cast<Int8>(SettingsTierType::EXPERIMENTAL)},
+            {"Beta",            static_cast<Int8>(SettingsTierType::BETA)}
+        });
+}
+
+}
--- a/src/Core/SettingsTierType.h
+++ b/src/Core/SettingsTierType.h
@ -0,0 +1,26 @@
+#pragma once
+
+#include <Core/Types.h>
+
+#include <cstdint>
+#include <memory>
+
+namespace DB
+{
+
+template <typename Type>
+class DataTypeEnum;
+using DataTypeEnum8 = DataTypeEnum<Int8>;
+
+// Tier of a setting; getSettingsTierEnum() exposes these values by name. The underlying type is signed for compatibility with DataTypeEnum8.
+enum SettingsTierType : int8_t
+{
+    PRODUCTION = 0b0000, // NOTE(review): every value uses only bits 2-3; presumably the tier is packed into a wider flags value - confirm.
+    OBSOLETE = 0b0100,
+    EXPERIMENTAL = 0b1000,
+    BETA = 0b1100
+};
+
+std::shared_ptr<DataTypeEnum8> getSettingsTierEnum();
+
+}
--- a/src/Core/SortCursor.h
+++ b/src/Core/SortCursor.h
@ -195,6 +195,15 @@ struct SortCursorHelper
        /// The last row of this cursor is no larger than the first row of the other cursor.
        return !derived().greaterAt(rhs.derived(), impl->rows - 1, 0);
    }
+
+    bool ALWAYS_INLINE totallyLess(const SortCursorHelper & rhs) const
+    {
+        if (impl->rows == 0 || rhs.impl->rows == 0) /// If either cursor has no rows, the answer is always false.
+            return false;
+
+        /// The last row of this cursor is strictly less than the first row of the other cursor.
+        return rhs.derived().template greaterAt<false>(derived(), 0, impl->rows - 1); /// <false>: rows that compare equal are not treated as greater (no tie-break on impl->order).
+    }
 };


@ -203,6 +212,7 @@ struct SortCursor : SortCursorHelper<SortCursor>
    using SortCursorHelper<SortCursor>::SortCursorHelper;

    /// The specified row of this cursor is greater than the specified row of another cursor.
+    template <bool consider_order = true>
    bool ALWAYS_INLINE greaterAt(const SortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
    {
 #if USE_EMBEDDED_COMPILER
@ -218,7 +228,10 @@ struct SortCursor : SortCursorHelper<SortCursor>
            if (res < 0)
                return false;

-            return impl->order > rhs.impl->order;
+            if constexpr (consider_order)
+                return impl->order > rhs.impl->order;
+            else
+                return false;
        }
 #endif

@ -235,7 +248,10 @@ struct SortCursor : SortCursorHelper<SortCursor>
                return false;
        }

-        return impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return impl->order > rhs.impl->order;
+        else
+            return false;
    }
 };

@ -245,6 +261,7 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
 {
    using SortCursorHelper<SimpleSortCursor>::SortCursorHelper;

+    template <bool consider_order = true>
    bool ALWAYS_INLINE greaterAt(const SimpleSortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
    {
        int res = 0;
@ -271,7 +288,10 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
        if (res < 0)
            return false;

-        return impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return impl->order > rhs.impl->order;
+        else
+            return false;
    }
 };

@ -280,6 +300,7 @@ struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleCol
 {
    using SortCursorHelper<SpecializedSingleColumnSortCursor>::SortCursorHelper;

+    template <bool consider_order = true>
    bool ALWAYS_INLINE greaterAt(const SortCursorHelper<SpecializedSingleColumnSortCursor> & rhs, size_t lhs_pos, size_t rhs_pos) const
    {
        auto & this_impl = this->impl;
@ -302,7 +323,10 @@ struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleCol
        if (res < 0)
            return false;

-        return this_impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return this_impl->order > rhs.impl->order;
+        else
+            return false;
    }
 };

@ -311,6 +335,7 @@ struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
 {
    using SortCursorHelper<SortCursorWithCollation>::SortCursorHelper;

+    template <bool consider_order = true>
    bool ALWAYS_INLINE greaterAt(const SortCursorWithCollation & rhs, size_t lhs_pos, size_t rhs_pos) const
    {
        for (size_t i = 0; i < impl->sort_columns_size; ++i)
@ -330,7 +355,10 @@ struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
            if (res < 0)
                return false;
        }
-        return impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return impl->order > rhs.impl->order;
+        else
+            return false;
    }
 };

--- a/src/DataTypes/Serializations/ISerialization.cpp
+++ b/src/DataTypes/Serializations/ISerialization.cpp
@ -161,7 +161,7 @@ String getNameForSubstreamPath(
    String stream_name,
    SubstreamIterator begin,
    SubstreamIterator end,
-    bool escape_tuple_delimiter)
+    bool escape_for_file_name)
 {
    using Substream = ISerialization::Substream;

@ -186,7 +186,7 @@ String getNameForSubstreamPath(
            /// Because nested data may be represented not by Array of Tuple,
            /// but by separate Array columns with names in a form of a.b,
            /// and name is encoded as a whole.
-            if (it->type == Substream::TupleElement && escape_tuple_delimiter)
+            if (it->type == Substream::TupleElement && escape_for_file_name)
                stream_name += escapeForFileName(substream_name);
            else
                stream_name += substream_name;
@ -206,7 +206,7 @@ String getNameForSubstreamPath(
        else if (it->type == SubstreamType::ObjectSharedData)
            stream_name += ".object_shared_data";
        else if (it->type == SubstreamType::ObjectTypedPath || it->type == SubstreamType::ObjectDynamicPath)
-            stream_name += "." + it->object_path_name;
+            stream_name += "." + (escape_for_file_name ? escapeForFileName(it->object_path_name) : it->object_path_name);
    }

    return stream_name;
@ -434,6 +434,14 @@ bool ISerialization::isDynamicSubcolumn(const DB::ISerialization::SubstreamPath
    return false;
 }

+bool ISerialization::isLowCardinalityDictionarySubcolumn(const DB::ISerialization::SubstreamPath & path)
+{
+    if (path.empty()) /// An empty path has no last element to inspect.
+        return false;
+
+    return path[path.size() - 1].type == SubstreamType::DictionaryKeys; /// True only when the last element of the path is a DictionaryKeys substream.
+}
+
 ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)
 {
    assert(prefix_len <= path.size());
--- a/src/DataTypes/Serializations/ISerialization.h
+++ b/src/DataTypes/Serializations/ISerialization.h
@ -463,6 +463,8 @@ public:
    /// Returns true if stream with specified path corresponds to dynamic subcolumn.
    static bool isDynamicSubcolumn(const SubstreamPath & path, size_t prefix_len);

+    static bool isLowCardinalityDictionarySubcolumn(const SubstreamPath & path);
+
 protected:
    template <typename State, typename StatePtr>
    State * checkAndGetState(const StatePtr & state) const;
--- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp
+++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp
@ -54,7 +54,7 @@ void SerializationLowCardinality::enumerateStreams(
        .withSerializationInfo(data.serialization_info);

    settings.path.back().data = dict_data;
-    dict_inner_serialization->enumerateStreams(settings, callback, dict_data);
+    callback(settings.path);

    settings.path.back() = Substream::DictionaryIndexes;
    settings.path.back().data = data;
--- a/src/Databases/DatabaseReplicatedWorker.cpp
+++ b/src/Databases/DatabaseReplicatedWorker.cpp
@ -199,13 +199,12 @@ void DatabaseReplicatedDDLWorker::initializeReplication()
    active_node_holder = zkutil::EphemeralNodeHolder::existing(active_path, *active_node_holder_zookeeper);
 }

-String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry)
+String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo &, QueryStatusPtr) /// The two extra parameters are unnamed and unused in this body.
 {
    auto zookeeper = getAndSetZooKeeper(); /// Obtain the ZooKeeper handle used for enqueueing.
    return enqueueQueryImpl(zookeeper, entry, database); /// Delegates the actual enqueueing; its result is returned as is.
 }

-
 bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeout_ms)
 {
    auto zookeeper = getAndSetZooKeeper();
--- a/src/Databases/DatabaseReplicatedWorker.h
+++ b/src/Databases/DatabaseReplicatedWorker.h
@ -24,7 +24,7 @@ class DatabaseReplicatedDDLWorker : public DDLWorker
 public:
    DatabaseReplicatedDDLWorker(DatabaseReplicated * db, ContextPtr context_);

-    String enqueueQuery(DDLLogEntry & entry) override;
+    String enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo &, QueryStatusPtr) override;

    String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, ContextPtr query_context);

--- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
@ -103,15 +103,15 @@ std::unique_ptr<WriteBufferFromFileBase> HDFSObjectStorage::writeObject( /// NOL
            ErrorCodes::UNSUPPORTED_METHOD,
            "HDFS API doesn't support custom attributes/metadata for stored objects");

-    std::string path = object.remote_path;
-    if (path.starts_with("/"))
-        path = path.substr(1);
-    if (!path.starts_with(url))
-        path = fs::path(url) / path;
-
+    auto path = extractObjectKeyFromURL(object);
    /// Single O_WRONLY in libhdfs adds O_TRUNC
    return std::make_unique<WriteBufferFromHDFS>(
-        path, config, settings->replication, patchSettings(write_settings), buf_size,
+        url_without_path,
+        fs::path(data_directory) / path,
+        config,
+        settings->replication,
+        patchSettings(write_settings),
+        buf_size,
        mode == WriteMode::Rewrite ? O_WRONLY : O_WRONLY | O_APPEND);
 }

--- a/src/Functions/FunctionsComparison.h
+++ b/src/Functions/FunctionsComparison.h
@ -1171,7 +1171,7 @@ public:

        if (left_tuple && right_tuple)
        {
-            auto func = FunctionToOverloadResolverAdaptor(std::make_shared<FunctionComparison<Op, Name>>(check_decimal_overflow));
+            auto func = std::make_shared<FunctionToOverloadResolverAdaptor>(std::make_shared<FunctionComparison<Op, Name>>(check_decimal_overflow));

            bool has_nullable = false;
            bool has_null = false;
@ -1181,7 +1181,7 @@ public:
            {
                ColumnsWithTypeAndName args = {{nullptr, left_tuple->getElements()[i], ""},
                                               {nullptr, right_tuple->getElements()[i], ""}};
-                auto element_type = func.build(args)->getResultType();
+                auto element_type = func->build(args)->getResultType();
                has_nullable = has_nullable || element_type->isNullable();
                has_null = has_null || element_type->onlyNull();
            }
--- a/src/Functions/transform.cpp
+++ b/src/Functions/transform.cpp
@ -211,7 +211,7 @@ namespace
            ColumnsWithTypeAndName args = arguments;
            args[0].column = args[0].column->cloneResized(input_rows_count)->convertToFullColumnIfConst();

-            auto impl = FunctionToOverloadResolverAdaptor(std::make_shared<FunctionTransform>()).build(args);
+            auto impl = std::make_shared<FunctionToOverloadResolverAdaptor>(std::make_shared<FunctionTransform>())->build(args);

            return impl->execute(args, result_type, input_rows_count);
        }
--- a/Show More
+++ b/Show More