Merge pull request #70027 from vitlibar/fix-restore-on-cluster-sync

Better error-handling and cancellation of ON CLUSTER backups and restores
2024-11-15 12:14:18 +00:00 · 2024-11-01 17:01:07 +00:00 · 2024-11-01 17:01:07 +00:00 · ae2eeb489d
commit ae2eeb489d
parent 66db317107 b16a18ed66
71 changed files with 4144 additions and 1618 deletions
--- a/base/base/chrono_io.h
+++ b/base/base/chrono_io.h
@ -4,6 +4,7 @@
 #include <string>
 #include <sstream>
 #include <cctz/time_zone.h>
+#include <fmt/core.h>


 inline std::string to_string(const std::time_t & time)
@ -11,18 +12,6 @@ inline std::string to_string(const std::time_t & time)
    return cctz::format("%Y-%m-%d %H:%M:%S", std::chrono::system_clock::from_time_t(time), cctz::local_time_zone());
 }

-template <typename Clock, typename Duration = typename Clock::duration>
-std::string to_string(const std::chrono::time_point<Clock, Duration> & tp)
-{
-    // Don't use DateLUT because it shows weird characters for
-    // TimePoint::max(). I wish we could use C++20 format, but it's not
-    // there yet.
-    // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp));
-
-    auto in_time_t = std::chrono::system_clock::to_time_t(tp);
-    return to_string(in_time_t);
-}
-
 template <typename Rep, typename Period = std::ratio<1>>
 std::string to_string(const std::chrono::duration<Rep, Period> & duration)
 {
@ -33,6 +22,20 @@ std::string to_string(const std::chrono::duration<Rep, Period> & duration)
    return std::to_string(seconds_as_double.count()) + "s";
 }

+template <typename Clock, typename Duration = typename Clock::duration>
+std::string to_string(const std::chrono::time_point<Clock, Duration> & tp)
+{
+    // Don't use DateLUT because it shows weird characters for
+    // TimePoint::max(). I wish we could use C++20 format, but it's not
+    // there yet.
+    // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp));
+    // A system_clock time point is formatted via time_t; for any other clock the duration since its epoch is formatted.
+    if constexpr (std::is_same_v<Clock, std::chrono::system_clock>)
+        return to_string(std::chrono::system_clock::to_time_t(tp));
+    else
+        return to_string(tp.time_since_epoch());
+}
+
 template <typename Clock, typename Duration = typename Clock::duration>
 std::ostream & operator<<(std::ostream & o, const std::chrono::time_point<Clock, Duration> & tp)
 {
@ -44,3 +47,23 @@ std::ostream & operator<<(std::ostream & o, const std::chrono::duration<Rep, Per
 {
    return o << to_string(duration);
 }
+
+template <typename Clock, typename Duration>
+struct fmt::formatter<std::chrono::time_point<Clock, Duration>> : fmt::formatter<std::string> // Lets fmt print time points; derives from the std::string formatter.
+{
+    template <typename FormatCtx>
+    auto format(const std::chrono::time_point<Clock, Duration> & tp, FormatCtx & ctx) const
+    {
+        return fmt::formatter<std::string>::format(::to_string(tp), ctx); // Convert with the global to_string(), then format that string.
+    }
+};
+
+template <typename Rep, typename Period>
+struct fmt::formatter<std::chrono::duration<Rep, Period>> : fmt::formatter<std::string> // Lets fmt print durations; derives from the std::string formatter.
+{
+    template <typename FormatCtx>
+    auto format(const std::chrono::duration<Rep, Period> & duration, FormatCtx & ctx) const
+    {
+        return fmt::formatter<std::string>::format(::to_string(duration), ctx); // Convert with the global to_string(), then format that string.
+    }
+};
--- a/src/Backups/BackupConcurrencyCheck.cpp
+++ b/src/Backups/BackupConcurrencyCheck.cpp
@ -0,0 +1,135 @@
+#include <Backups/BackupConcurrencyCheck.h>
+
+#include <Common/Exception.h>
+#include <Common/logger_useful.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CONCURRENT_ACCESS_NOT_SUPPORTED;
+}
+
+
+BackupConcurrencyCheck::BackupConcurrencyCheck(
+    const UUID & backup_or_restore_uuid_,
+    bool is_restore_,
+    bool on_cluster_,
+    bool allow_concurrency_,
+    BackupConcurrencyCounters & counters_)
+    : is_restore(is_restore_), backup_or_restore_uuid(backup_or_restore_uuid_), on_cluster(on_cluster_), counters(counters_)
+{
+    std::lock_guard lock{counters.mutex};
+    /// If concurrency isn't allowed, count the operations of the same kind (backups or restores) as if this one was already registered.
+    if (!allow_concurrency_)
+    {
+        bool found_concurrent_operation = false;
+        if (is_restore)
+        {
+            size_t num_local_restores = counters.local_restores;
+            size_t num_on_cluster_restores = counters.on_cluster_restores.size();
+            if (on_cluster)
+            {
+                if (!counters.on_cluster_restores.contains(backup_or_restore_uuid)) /// An already registered UUID isn't counted twice.
+                    ++num_on_cluster_restores;
+            }
+            else
+            {
+                ++num_local_restores;
+            }
+            found_concurrent_operation = (num_local_restores + num_on_cluster_restores > 1);
+        }
+        else
+        {
+            size_t num_local_backups = counters.local_backups;
+            size_t num_on_cluster_backups = counters.on_cluster_backups.size();
+            if (on_cluster)
+            {
+                if (!counters.on_cluster_backups.contains(backup_or_restore_uuid)) /// An already registered UUID isn't counted twice.
+                    ++num_on_cluster_backups;
+            }
+            else
+            {
+                ++num_local_backups;
+            }
+            found_concurrent_operation = (num_local_backups + num_on_cluster_backups > 1);
+        }
+        /// Throw before anything is registered in the counters.
+        if (found_concurrent_operation)
+            throwConcurrentOperationNotAllowed(is_restore);
+    }
+    /// Register this operation: ON CLUSTER ones are reference-counted per UUID, local ones are just counted.
+    if (on_cluster)
+    {
+        if (is_restore)
+            ++counters.on_cluster_restores[backup_or_restore_uuid];
+        else
+            ++counters.on_cluster_backups[backup_or_restore_uuid];
+    }
+    else
+    {
+        if (is_restore)
+            ++counters.local_restores;
+        else
+            ++counters.local_backups;
+    }
+}
+
+
+BackupConcurrencyCheck::~BackupConcurrencyCheck()
+{
+    std::lock_guard lock{counters.mutex};
+    /// Undo what the constructor registered in the counters.
+    if (on_cluster)
+    {
+        if (is_restore)
+        {
+            auto it = counters.on_cluster_restores.find(backup_or_restore_uuid);
+            if (it != counters.on_cluster_restores.end())
+            {
+                if (!--it->second) /// Erase the UUID once its reference count drops to zero.
+                    counters.on_cluster_restores.erase(it);
+            }
+        }
+        else
+        {
+            auto it = counters.on_cluster_backups.find(backup_or_restore_uuid);
+            if (it != counters.on_cluster_backups.end())
+            {
+                if (!--it->second) /// Erase the UUID once its reference count drops to zero.
+                    counters.on_cluster_backups.erase(it);
+            }
+        }
+    }
+    else
+    {
+        if (is_restore)
+            --counters.local_restores;
+        else
+            --counters.local_backups;
+    }
+}
+
+
+void BackupConcurrencyCheck::throwConcurrentOperationNotAllowed(bool is_restore)
+{
+    throw Exception( /// Always throws; the message names the kind of operation and the setting which allows running it concurrently.
+        ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
+        "Concurrent {} are not allowed, turn on setting '{}'",
+        is_restore ? "restores" : "backups",
+        is_restore ? "allow_concurrent_restores" : "allow_concurrent_backups");
+}
+
+
+BackupConcurrencyCounters::BackupConcurrencyCounters() = default;
+
+
+BackupConcurrencyCounters::~BackupConcurrencyCounters()
+{
+    if (local_backups > 0 || local_restores > 0 || !on_cluster_backups.empty() || !on_cluster_restores.empty()) /// Something is still registered: only log it.
+        LOG_ERROR(getLogger(__PRETTY_FUNCTION__), "Some backups or restores are processing");
+}
+
+}
--- a/src/Backups/BackupConcurrencyCheck.h
+++ b/src/Backups/BackupConcurrencyCheck.h
@ -0,0 +1,55 @@
+#pragma once
+
+#include <Core/UUID.h>
+#include <base/scope_guard.h>
+#include <mutex>
+#include <unordered_map>
+
+
+namespace DB
+{
+class BackupConcurrencyCounters;
+
+/// Local checker for concurrent BACKUP or RESTORE operations.
+/// This class is used by implementations of IBackupCoordination and IRestoreCoordination
+/// to throw an exception if concurrent backups or restores are not allowed.
+class BackupConcurrencyCheck
+{
+public:
+    /// Checks concurrency of a BACKUP operation or a RESTORE operation.
+    /// Keep a constructed instance of BackupConcurrencyCheck until the operation is done.
+    BackupConcurrencyCheck(
+        const UUID & backup_or_restore_uuid_,
+        bool is_restore_,
+        bool on_cluster_,
+        bool allow_concurrency_,
+        BackupConcurrencyCounters & counters_);
+    /// Unregisters the operation from the counters.
+    ~BackupConcurrencyCheck();
+    /// Throws CONCURRENT_ACCESS_NOT_SUPPORTED with a message naming the setting which allows concurrent backups or restores.
+    [[noreturn]] static void throwConcurrentOperationNotAllowed(bool is_restore);
+
+private:
+    const bool is_restore;
+    const UUID backup_or_restore_uuid;
+    const bool on_cluster;
+    BackupConcurrencyCounters & counters; /// Referenced, not owned; it's accessed again in the destructor.
+};
+
+
+class BackupConcurrencyCounters /// Counts the local and ON CLUSTER backups and restores which are currently registered.
+{
+public:
+    BackupConcurrencyCounters();
+    ~BackupConcurrencyCounters();
+    /// Everything below is private: BackupConcurrencyCheck (declared a friend) uses the counters, locking `mutex` first.
+private:
+    friend class BackupConcurrencyCheck;
+    size_t local_backups TSA_GUARDED_BY(mutex) = 0;
+    size_t local_restores TSA_GUARDED_BY(mutex) = 0;
+    std::unordered_map<UUID /* backup_uuid */, size_t /* num_refs */> on_cluster_backups TSA_GUARDED_BY(mutex);
+    std::unordered_map<UUID /* restore_uuid */, size_t /* num_refs */> on_cluster_restores TSA_GUARDED_BY(mutex);
+    std::mutex mutex;
+};
+
+}
--- a/src/Backups/BackupCoordinationCleaner.cpp
+++ b/src/Backups/BackupCoordinationCleaner.cpp
@ -0,0 +1,64 @@
+#include <Backups/BackupCoordinationCleaner.h>
+
+
+namespace DB
+{
+
+BackupCoordinationCleaner::BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_)
+    : zookeeper_path(zookeeper_path_), with_retries(with_retries_), log(log_) /// Only stores the arguments; nothing is removed here.
+{
+}
+
+void BackupCoordinationCleaner::cleanup() /// Removes the coordination nodes; an exception is thrown if that fails.
+{
+    tryRemoveAllNodes(/* throw_if_error = */ true, /* retries_kind = */ WithRetries::kNormal);
+}
+
+bool BackupCoordinationCleaner::tryCleanupAfterError() noexcept /// Same removal as cleanup(), but a failure is reported as `false`.
+{
+    return tryRemoveAllNodes(/* throw_if_error = */ false, /* retries_kind = */ WithRetries::kNormal);
+}
+
+bool BackupCoordinationCleaner::tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind)
+{   /// Removes `zookeeper_path` recursively. Returns true on success; on failure throws if `throw_if_error`, otherwise returns false.
+    {   /// Reuse the stored result of a previous call, if there is one.
+        std::lock_guard lock{mutex};
+        if (cleanup_result.succeeded)
+            return true;
+        if (cleanup_result.exception)
+        {
+            if (throw_if_error)
+                std::rethrow_exception(cleanup_result.exception);
+            return false;
+        }
+    }
+    /// No stored result yet: remove the nodes. The mutex isn't held while the removal is in progress.
+    try
+    {
+        LOG_TRACE(log, "Removing nodes from ZooKeeper");
+        auto holder = with_retries.createRetriesControlHolder("removeAllNodes", retries_kind);
+        holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zookeeper);
+            zookeeper->removeRecursive(zookeeper_path);
+        });
+        /// Remember the success so that the following calls return immediately.
+        std::lock_guard lock{mutex};
+        cleanup_result.succeeded = true;
+        return true;
+    }
+    catch (...)
+    {
+        LOG_TRACE(log, "Caught exception while removing nodes from ZooKeeper for this backup or restore: {}",
+                  getCurrentExceptionMessage(/* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true));
+        /// Remember the failure: the following calls don't retry, they rethrow this exception or return false.
+        std::lock_guard lock{mutex};
+        cleanup_result.exception = std::current_exception();
+
+        if (throw_if_error)
+            throw;
+        return false;
+    }
+}
+
+}
--- a/src/Backups/BackupCoordinationCleaner.h
+++ b/src/Backups/BackupCoordinationCleaner.h
@ -0,0 +1,40 @@
+#pragma once
+
+#include <Backups/WithRetries.h>
+
+
+namespace DB
+{
+
+/// Removes all the nodes from ZooKeeper used to coordinate a BACKUP ON CLUSTER operation or
+/// a RESTORE ON CLUSTER operation (successful or not).
+/// This class is used by BackupCoordinationOnCluster and RestoreCoordinationOnCluster to clean up.
+class BackupCoordinationCleaner
+{
+public:
+    BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_);
+    /// cleanup() throws if the nodes can't be removed; tryCleanupAfterError() returns false instead.
+    void cleanup();
+    bool tryCleanupAfterError() noexcept;
+
+private:
+    bool tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind);
+
+    const String zookeeper_path;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const LoggerPtr log;
+    /// Result of a finished removal attempt; once set, later calls reuse it instead of removing the nodes again.
+    struct CleanupResult
+    {
+        bool succeeded = false;
+        std::exception_ptr exception;
+    };
+    CleanupResult cleanup_result TSA_GUARDED_BY(mutex);
+
+    std::mutex mutex;
+};
+
+}
--- a/src/Backups/BackupCoordinationLocal.cpp
+++ b/src/Backups/BackupCoordinationLocal.cpp
@ -1,5 +1,7 @@
 #include <Backups/BackupCoordinationLocal.h>
+
 #include <Common/Exception.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/logger_useful.h>
 #include <Common/quoteString.h>
 #include <fmt/format.h>
@ -8,27 +10,20 @@
 namespace DB
 {

-BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_)
-    : log(getLogger("BackupCoordinationLocal")), file_infos(plain_backup_)
+BackupCoordinationLocal::BackupCoordinationLocal(
+    const UUID & backup_uuid_,
+    bool is_plain_backup_,
+    bool allow_concurrent_backup_,
+    BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("BackupCoordinationLocal"))
+    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ false, allow_concurrent_backup_, concurrency_counters_)
+    , file_infos(is_plain_backup_)
 {
 }

 BackupCoordinationLocal::~BackupCoordinationLocal() = default;

-void BackupCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void BackupCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings BackupCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings BackupCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+ZooKeeperRetriesInfo BackupCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
    return {};
 }
@ -135,15 +130,4 @@ bool BackupCoordinationLocal::startWritingFile(size_t data_file_index)
    return writing_files.emplace(data_file_index).second;
 }

-
-bool BackupCoordinationLocal::hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const
-{
-    if (num_active_backups > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_backups={}", num_active_backups);
-        return true;
-    }
-    return false;
-}
-
 }
--- a/src/Backups/BackupCoordinationLocal.h
+++ b/src/Backups/BackupCoordinationLocal.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedSQLObjects.h>
@ -21,13 +22,21 @@ namespace DB
 class BackupCoordinationLocal : public IBackupCoordination
 {
 public:
-    explicit BackupCoordinationLocal(bool plain_backup_);
+    explicit BackupCoordinationLocal(
+        const UUID & backup_uuid_,
+        bool is_plain_backup_,
+        bool allow_concurrent_backup_,
+        BackupConcurrencyCounters & concurrency_counters_);
+
    ~BackupCoordinationLocal() override;

-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String &, const String &, bool) override { return {}; }
+    void setBackupQueryWasSentToOtherHosts() override {}
+    bool trySetError(std::exception_ptr) override { return true; }
+    void finish() override {}
+    bool tryFinishAfterError() noexcept override { return true; }
+    void waitForOtherHostsToFinish() override {}
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }

    void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name,
                                const std::vector<PartNameAndChecksum> & part_names_and_checksums) override;
@ -54,17 +63,18 @@ public:
    BackupFileInfos getFileInfosForAllHosts() const override;
    bool startWritingFile(size_t data_file_index) override;

-    bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    LoggerPtr const log;
+    BackupConcurrencyCheck concurrency_check;

-    BackupCoordinationReplicatedTables TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
-    BackupCoordinationReplicatedAccess TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
-    BackupCoordinationReplicatedSQLObjects TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
-    BackupCoordinationFileInfos TSA_GUARDED_BY(file_infos_mutex) file_infos;
+    BackupCoordinationReplicatedTables replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
+    BackupCoordinationReplicatedAccess replicated_access TSA_GUARDED_BY(replicated_access_mutex);
+    BackupCoordinationReplicatedSQLObjects replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex);
+    BackupCoordinationFileInfos file_infos TSA_GUARDED_BY(file_infos_mutex);
    BackupCoordinationKeeperMapTables keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex);
-    std::unordered_set<size_t> TSA_GUARDED_BY(writing_files_mutex) writing_files;
+    std::unordered_set<size_t> writing_files TSA_GUARDED_BY(writing_files_mutex);

    mutable std::mutex replicated_tables_mutex;
    mutable std::mutex replicated_access_mutex;
--- a/src/Backups/BackupCoordinationOnCluster.cpp
+++ b/src/Backups/BackupCoordinationOnCluster.cpp
@ -1,7 +1,4 @@
-#include <Backups/BackupCoordinationRemote.h>
-
-#include <base/hex.h>
-#include <boost/algorithm/string/split.hpp>
+#include <Backups/BackupCoordinationOnCluster.h>

 #include <Access/Common/AccessEntityType.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
@ -26,8 +23,6 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

-namespace Stage = BackupCoordinationStage;
-
 namespace
 {
    using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum;
@ -149,144 +144,152 @@ namespace
    };
 }

-size_t BackupCoordinationRemote::findCurrentHostIndex(const Strings & all_hosts, const String & current_host)
+Strings BackupCoordinationOnCluster::excludeInitiator(const Strings & all_hosts)
+{
+    Strings all_hosts_without_initiator = all_hosts; /// Work on a copy, the argument stays unchanged.
+    bool has_initiator = (std::erase(all_hosts_without_initiator, kInitiator) > 0); /// std::erase() returns the number of removed elements.
+    chassert(has_initiator); /// The initiator is expected to be listed in `all_hosts`.
+    return all_hosts_without_initiator;
+}
+
+size_t BackupCoordinationOnCluster::findCurrentHostIndex(const String & current_host, const Strings & all_hosts)
 {
    auto it = std::find(all_hosts.begin(), all_hosts.end(), current_host);
    if (it == all_hosts.end())
-        return 0;
+        return all_hosts.size();
    return it - all_hosts.begin();
 }

-BackupCoordinationRemote::BackupCoordinationRemote(
-    zkutil::GetZooKeeper get_zookeeper_,
+
+BackupCoordinationOnCluster::BackupCoordinationOnCluster(
+    const UUID & backup_uuid_,
+    bool is_plain_backup_,
    const String & root_zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_,
    const BackupKeeperSettings & keeper_settings_,
-    const String & backup_uuid_,
-    const Strings & all_hosts_,
    const String & current_host_,
-    bool plain_backup_,
-    bool is_internal_,
+    const Strings & all_hosts_,
+    bool allow_concurrent_backup_,
+    BackupConcurrencyCounters & concurrency_counters_,
+    ThreadPoolCallbackRunnerUnsafe<void> schedule_,
    QueryStatusPtr process_list_element_)
    : root_zookeeper_path(root_zookeeper_path_)
-    , zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_)
+    , zookeeper_path(root_zookeeper_path_ + "/backup-" + toString(backup_uuid_))
    , keeper_settings(keeper_settings_)
    , backup_uuid(backup_uuid_)
    , all_hosts(all_hosts_)
+    , all_hosts_without_initiator(excludeInitiator(all_hosts))
    , current_host(current_host_)
-    , current_host_index(findCurrentHostIndex(all_hosts, current_host))
-    , plain_backup(plain_backup_)
-    , is_internal(is_internal_)
-    , log(getLogger("BackupCoordinationRemote"))
-    , with_retries(
-        log,
-        get_zookeeper_,
-        keeper_settings,
-        process_list_element_,
-        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
-        (WithRetries::FaultyKeeper & zk)
-        {
-            /// Recreate this ephemeral node to signal that we are alive.
-            if (my_is_internal)
-            {
-                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
-
-                /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
-                zk->tryRemove(alive_node_path);
-
-                zk->createAncestors(alive_node_path);
-                zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            }
-        })
+    , current_host_index(findCurrentHostIndex(current_host, all_hosts))
+    , plain_backup(is_plain_backup_)
+    , log(getLogger("BackupCoordinationOnCluster"))
+    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
+    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ true, allow_concurrent_backup_, concurrency_counters_)
+    , stage_sync(/* is_restore = */ false, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_backup_, with_retries, schedule_, process_list_element_, log)
+    , cleaner(zookeeper_path, with_retries, log)
 {
    createRootNodes();
-
-    stage_sync.emplace(
-        zookeeper_path,
-        with_retries,
-        log);
 }

-BackupCoordinationRemote::~BackupCoordinationRemote()
+BackupCoordinationOnCluster::~BackupCoordinationOnCluster()
 {
-    try
-    {
-        if (!is_internal)
-            removeAllNodes();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
+    tryFinishImpl();
 }

-void BackupCoordinationRemote::createRootNodes()
+void BackupCoordinationOnCluster::createRootNodes()
 {
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
+    auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization);
    holder.retries_ctl.retryLoop(
    [&, &zk = holder.faulty_zookeeper]()
    {
        with_retries.renewZooKeeper(zk);

        zk->createAncestors(zookeeper_path);
-
-        Coordination::Requests ops;
-        Coordination::Responses responses;
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_part_names", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_mutations", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_data_paths", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/file_infos", "", zkutil::CreateMode::Persistent));
-        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/writing_files", "", zkutil::CreateMode::Persistent));
-        zk->tryMulti(ops, responses);
+        zk->createIfNotExists(zookeeper_path, "");
+        zk->createIfNotExists(zookeeper_path + "/repl_part_names", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_mutations", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_data_paths", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_access", "");
+        zk->createIfNotExists(zookeeper_path + "/repl_sql_objects", "");
+        zk->createIfNotExists(zookeeper_path + "/keeper_map_tables", "");
+        zk->createIfNotExists(zookeeper_path + "/file_infos", "");
+        zk->createIfNotExists(zookeeper_path + "/writing_files", "");
    });
 }

-void BackupCoordinationRemote::removeAllNodes()
+Strings BackupCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
 {
-    auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
-    holder.retries_ctl.retryLoop(
-    [&, &zk = holder.faulty_zookeeper]()
+    stage_sync.setStage(new_stage, message);
+
+    if (!sync)
+        return {};
+
+    return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
+}
+
+void BackupCoordinationOnCluster::setBackupQueryWasSentToOtherHosts()
+{
+    backup_query_was_sent_to_other_hosts = true; /// Checked later when finishing and when waiting for other hosts.
+}
+
+bool BackupCoordinationOnCluster::trySetError(std::exception_ptr exception)
+{
+    return stage_sync.trySetError(exception); /// Forwards the exception to the stage synchronization and returns its result.
+}
+
+void BackupCoordinationOnCluster::finish()
+{
+    bool other_hosts_also_finished = false;
+    stage_sync.finish(other_hosts_also_finished); /// NOTE(review): presumably an out-parameter set by stage_sync — confirm in its declaration.
+    /// Only the initiator removes the coordination nodes, and only if the other hosts have finished too or the query was never sent to them.
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
+        cleaner.cleanup();
+}
+
+bool BackupCoordinationOnCluster::tryFinishAfterError() noexcept
+{
+    return tryFinishImpl(); /// The destructor calls the same implementation.
+}
+
+bool BackupCoordinationOnCluster::tryFinishImpl() noexcept
+{
+    bool other_hosts_also_finished = false;
+    if (!stage_sync.tryFinishAfterError(other_hosts_also_finished))
+        return false;
+
+    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
    {
-        /// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore.
-        ///
-        /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
-        /// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination
-        /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part
-        /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that).
-        with_retries.renewZooKeeper(zk);
-        zk->removeRecursive(zookeeper_path);
-    });
+        if (!cleaner.tryCleanupAfterError())
+            return false;
+    }
+
+    return true;
 }

-
-void BackupCoordinationRemote::setStage(const String & new_stage, const String & message)
+void BackupCoordinationOnCluster::waitForOtherHostsToFinish()
 {
-    if (is_internal)
-        stage_sync->set(current_host, new_stage, message);
-    else
-        stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true);
+    if ((current_host != kInitiator) || !backup_query_was_sent_to_other_hosts)
+        return;
+    stage_sync.waitForOtherHostsToFinish();
 }

-void BackupCoordinationRemote::setError(const Exception & exception)
+bool BackupCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
 {
-    stage_sync->setError(current_host, exception);
+    if (current_host != kInitiator)
+        return false;
+    if (!backup_query_was_sent_to_other_hosts)
+        return true;
+    return stage_sync.tryWaitForOtherHostsToFinishAfterError();
 }

-Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait)
+ZooKeeperRetriesInfo BackupCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
 {
-    return stage_sync->wait(all_hosts, stage_to_wait);
+    return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing,
+                                static_cast<UInt64>(keeper_settings.retry_initial_backoff_ms.count()),
+                                static_cast<UInt64>(keeper_settings.retry_max_backoff_ms.count())};
 }

-Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout)
-{
-    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
-}
-
-
-void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
+void BackupCoordinationOnCluster::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name)
 {
    {
        auto holder = with_retries.createRetriesControlHolder(logging_name + "::create");
@ -301,7 +304,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
    if (value.empty())
        return;

-    size_t max_part_size = keeper_settings.keeper_value_max_size;
+    size_t max_part_size = keeper_settings.value_max_size;
    if (!max_part_size)
        max_part_size = value.size();

@ -324,7 +327,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String &
    }
 }

-String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const
+String BackupCoordinationOnCluster::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const
 {
    Strings part_names;

@ -357,7 +360,7 @@ String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str
 }


-void BackupCoordinationRemote::addReplicatedPartNames(
+void BackupCoordinationOnCluster::addReplicatedPartNames(
    const String & table_zk_path,
    const String & table_name_for_logs,
    const String & replica_name,
@ -381,14 +384,14 @@ void BackupCoordinationRemote::addReplicatedPartNames(
    });
 }

-Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const
+Strings BackupCoordinationOnCluster::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
    return replicated_tables->getPartNames(table_zk_path, replica_name);
 }

-void BackupCoordinationRemote::addReplicatedMutations(
+void BackupCoordinationOnCluster::addReplicatedMutations(
    const String & table_zk_path,
    const String & table_name_for_logs,
    const String & replica_name,
@ -412,7 +415,7 @@ void BackupCoordinationRemote::addReplicatedMutations(
        });
 }

-std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const
+std::vector<IBackupCoordination::MutationInfo> BackupCoordinationOnCluster::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
@ -420,7 +423,7 @@ std::vector<IBackupCoordination::MutationInfo> BackupCoordinationRemote::getRepl
 }


-void BackupCoordinationRemote::addReplicatedDataPath(
+void BackupCoordinationOnCluster::addReplicatedDataPath(
    const String & table_zk_path, const String & data_path)
 {
    {
@ -441,7 +444,7 @@ void BackupCoordinationRemote::addReplicatedDataPath(
    });
 }

-Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk_path) const
+Strings BackupCoordinationOnCluster::getReplicatedDataPaths(const String & table_zk_path) const
 {
    std::lock_guard lock{replicated_tables_mutex};
    prepareReplicatedTables();
@ -449,7 +452,7 @@ Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk
 }


-void BackupCoordinationRemote::prepareReplicatedTables() const
+void BackupCoordinationOnCluster::prepareReplicatedTables() const
 {
    if (replicated_tables)
        return;
@ -536,7 +539,7 @@ void BackupCoordinationRemote::prepareReplicatedTables() const
        replicated_tables->addDataPath(std::move(data_paths));
 }

-void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
+void BackupCoordinationOnCluster::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path)
 {
    {
        std::lock_guard lock{replicated_access_mutex};
@ -558,14 +561,14 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access
    });
 }

-Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
+Strings BackupCoordinationOnCluster::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const
 {
    std::lock_guard lock{replicated_access_mutex}; /// Held until the function returns, covering both the preparation and the read below.
    prepareReplicatedAccess(); /// Returns at once if `replicated_access` is already set.
    return replicated_access->getFilePaths(access_zk_path, access_entity_type, current_host); /// `current_host` is passed along with the lookup key.
 }

-void BackupCoordinationRemote::prepareReplicatedAccess() const
+void BackupCoordinationOnCluster::prepareReplicatedAccess() const
 {
    if (replicated_access)
        return;
@ -601,7 +604,7 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const
        replicated_access->addFilePath(std::move(file_path));
 }

-void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
+void BackupCoordinationOnCluster::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path)
 {
    {
        std::lock_guard lock{replicated_sql_objects_mutex};
@ -631,14 +634,14 @@ void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_
    });
 }

-Strings BackupCoordinationRemote::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
+Strings BackupCoordinationOnCluster::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const
 {
    std::lock_guard lock{replicated_sql_objects_mutex}; /// Held until the function returns, covering both the preparation and the read below.
    prepareReplicatedSQLObjects(); /// Returns at once if `replicated_sql_objects` is already set.
    return replicated_sql_objects->getDirectories(loader_zk_path, object_type, current_host); /// `current_host` is passed along with the lookup key.
 }

-void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
+void BackupCoordinationOnCluster::prepareReplicatedSQLObjects() const
 {
    if (replicated_sql_objects)
        return;
@ -674,7 +677,7 @@ void BackupCoordinationRemote::prepareReplicatedSQLObjects() const
        replicated_sql_objects->addDirectory(std::move(directory));
 }

-void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup)
+void BackupCoordinationOnCluster::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup)
 {
    {
        std::lock_guard lock{keeper_map_tables_mutex};
@ -695,7 +698,7 @@ void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_
    });
 }

-void BackupCoordinationRemote::prepareKeeperMapTables() const
+void BackupCoordinationOnCluster::prepareKeeperMapTables() const
 {
    if (keeper_map_tables)
        return;
@ -740,7 +743,7 @@ void BackupCoordinationRemote::prepareKeeperMapTables() const

 }

-String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zookeeper_root_path) const
+String BackupCoordinationOnCluster::getKeeperMapDataPath(const String & table_zookeeper_root_path) const
 {
    std::lock_guard lock(keeper_map_tables_mutex);
    prepareKeeperMapTables();
@ -748,7 +751,7 @@ String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zooke
 }


-void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
+void BackupCoordinationOnCluster::addFileInfos(BackupFileInfos && file_infos_)
 {
    {
        std::lock_guard lock{file_infos_mutex};
@ -761,21 +764,21 @@ void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_)
    serializeToMultipleZooKeeperNodes(zookeeper_path + "/file_infos/" + current_host, file_infos_str, "addFileInfos");
 }

-BackupFileInfos BackupCoordinationRemote::getFileInfos() const
+BackupFileInfos BackupCoordinationOnCluster::getFileInfos() const
 {
    std::lock_guard lock{file_infos_mutex}; /// Held until the function returns, covering both the preparation and the read below.
    prepareFileInfos(); /// Returns at once if `file_infos` is already set.
    return file_infos->getFileInfos(current_host); /// Queries with `current_host`, unlike getFileInfosForAllHosts().
 }

-BackupFileInfos BackupCoordinationRemote::getFileInfosForAllHosts() const
+BackupFileInfos BackupCoordinationOnCluster::getFileInfosForAllHosts() const
 {
    std::lock_guard lock{file_infos_mutex}; /// Held until the function returns, covering both the preparation and the read below.
    prepareFileInfos(); /// Returns at once if `file_infos` is already set.
    return file_infos->getFileInfosForAllHosts(); /// No host argument here, unlike getFileInfos().
 }

-void BackupCoordinationRemote::prepareFileInfos() const
+void BackupCoordinationOnCluster::prepareFileInfos() const
 {
    if (file_infos)
        return;
@ -801,7 +804,7 @@ void BackupCoordinationRemote::prepareFileInfos() const
    }
 }

-bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
+bool BackupCoordinationOnCluster::startWritingFile(size_t data_file_index)
 {
    {
        /// Check if this host is already writing this file.
@ -842,66 +845,4 @@ bool BackupCoordinationRemote::startWritingFile(size_t data_file_index)
    }
 }

-bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic<size_t> &) const
-{
-    /// If its internal concurrency will be checked for the base backup
-    if (is_internal)
-        return false;
-
-    std::string backup_stage_path = zookeeper_path + "/stage";
-
-    bool result = false;
-
-    auto holder = with_retries.createRetriesControlHolder("getAllArchiveSuffixes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-    {
-        with_retries.renewZooKeeper(zk);
-
-        if (!zk->exists(root_zookeeper_path))
-            zk->createAncestors(root_zookeeper_path);
-
-        for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-        {
-            Coordination::Stat stat;
-            zk->get(root_zookeeper_path, &stat);
-            Strings existing_backup_paths = zk->getChildren(root_zookeeper_path);
-
-            for (const auto & existing_backup_path : existing_backup_paths)
-            {
-                if (startsWith(existing_backup_path, "restore-"))
-                    continue;
-
-                String existing_backup_uuid = existing_backup_path;
-                existing_backup_uuid.erase(0, String("backup-").size());
-
-                if (existing_backup_uuid == toString(backup_uuid))
-                    continue;
-
-                String status;
-                if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status))
-                {
-                    /// Check if some other backup is in progress
-                    if (status == Stage::SCHEDULED_TO_START)
-                    {
-                        LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid));
-                        result = true;
-                        return;
-                    }
-                }
-            }
-
-            zk->createIfNotExists(backup_stage_path, "");
-            auto code = zk->trySet(backup_stage_path, Stage::SCHEDULED_TO_START, stat.version);
-            if (code == Coordination::Error::ZOK)
-                break;
-            bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-            if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                throw zkutil::KeeperException::fromPath(code, backup_stage_path);
-        }
-    });
-
-    return result;
-}
-
 }
--- a/src/Backups/BackupCoordinationOnCluster.h
+++ b/src/Backups/BackupCoordinationOnCluster.h
@ -1,6 +1,8 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
+#include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
 #include <Backups/BackupCoordinationReplicatedSQLObjects.h>
@ -13,32 +15,35 @@
 namespace DB
 {

-/// We try to store data to zookeeper several times due to possible version conflicts.
-constexpr size_t MAX_ZOOKEEPER_ATTEMPTS = 10;
-
 /// Implementation of the IBackupCoordination interface performing coordination via ZooKeeper. It's necessary for "BACKUP ON CLUSTER".
-class BackupCoordinationRemote : public IBackupCoordination
+class BackupCoordinationOnCluster : public IBackupCoordination
 {
 public:
-    using BackupKeeperSettings = WithRetries::KeeperSettings;
+    /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;

-    BackupCoordinationRemote(
-        zkutil::GetZooKeeper get_zookeeper_,
+    BackupCoordinationOnCluster(
+        const UUID & backup_uuid_,
+        bool is_plain_backup_,
        const String & root_zookeeper_path_,
+        zkutil::GetZooKeeper get_zookeeper_,
        const BackupKeeperSettings & keeper_settings_,
-        const String & backup_uuid_,
-        const Strings & all_hosts_,
        const String & current_host_,
-        bool plain_backup_,
-        bool is_internal_,
+        const Strings & all_hosts_,
+        bool allow_concurrent_backup_,
+        BackupConcurrencyCounters & concurrency_counters_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_);

-    ~BackupCoordinationRemote() override;
+    ~BackupCoordinationOnCluster() override;

-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String & new_stage, const String & message, bool sync) override;
+    void setBackupQueryWasSentToOtherHosts() override;
+    bool trySetError(std::exception_ptr exception) override;
+    void finish() override;
+    bool tryFinishAfterError() noexcept override;
+    void waitForOtherHostsToFinish() override;
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;

    void addReplicatedPartNames(
        const String & table_zk_path,
@ -73,13 +78,14 @@ public:
    BackupFileInfos getFileInfosForAllHosts() const override;
    bool startWritingFile(size_t data_file_index) override;

-    bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

-    static size_t findCurrentHostIndex(const Strings & all_hosts, const String & current_host);
+    static Strings excludeInitiator(const Strings & all_hosts);
+    static size_t findCurrentHostIndex(const String & current_host, const Strings & all_hosts);

 private:
    void createRootNodes();
-    void removeAllNodes();
+    bool tryFinishImpl() noexcept;

    void serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name);
    String deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const;
@ -96,26 +102,27 @@ private:
    const String root_zookeeper_path;
    const String zookeeper_path;
    const BackupKeeperSettings keeper_settings;
-    const String backup_uuid;
+    const UUID backup_uuid;
    const Strings all_hosts;
+    const Strings all_hosts_without_initiator;
    const String current_host;
    const size_t current_host_index;
    const bool plain_backup;
-    const bool is_internal;
    LoggerPtr const log;

-    /// The order of these two fields matters, because stage_sync holds a reference to with_retries object
-    mutable WithRetries with_retries;
-    std::optional<BackupCoordinationStageSync> stage_sync;
+    const WithRetries with_retries;
+    BackupConcurrencyCheck concurrency_check;
+    BackupCoordinationStageSync stage_sync;
+    BackupCoordinationCleaner cleaner;
+    std::atomic<bool> backup_query_was_sent_to_other_hosts = false;

-    mutable std::optional<BackupCoordinationReplicatedTables> TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables;
-    mutable std::optional<BackupCoordinationReplicatedAccess> TSA_GUARDED_BY(replicated_access_mutex) replicated_access;
-    mutable std::optional<BackupCoordinationReplicatedSQLObjects> TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects;
-    mutable std::optional<BackupCoordinationFileInfos> TSA_GUARDED_BY(file_infos_mutex) file_infos;
+    mutable std::optional<BackupCoordinationReplicatedTables> replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
+    mutable std::optional<BackupCoordinationReplicatedAccess> replicated_access TSA_GUARDED_BY(replicated_access_mutex);
+    mutable std::optional<BackupCoordinationReplicatedSQLObjects> replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex);
+    mutable std::optional<BackupCoordinationFileInfos> file_infos TSA_GUARDED_BY(file_infos_mutex);
    mutable std::optional<BackupCoordinationKeeperMapTables> keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex);
-    std::unordered_set<size_t> TSA_GUARDED_BY(writing_files_mutex) writing_files;
+    std::unordered_set<size_t> writing_files TSA_GUARDED_BY(writing_files_mutex);

-    mutable std::mutex zookeeper_mutex;
    mutable std::mutex replicated_tables_mutex;
    mutable std::mutex replicated_access_mutex;
    mutable std::mutex replicated_sql_objects_mutex;
--- a/src/Backups/BackupCoordinationStage.h
+++ b/src/Backups/BackupCoordinationStage.h
@ -8,10 +8,6 @@ namespace DB

 namespace BackupCoordinationStage
 {
-    /// This stage is set after concurrency check so ensure we dont start other backup/restores
-    /// when concurrent backup/restores are not allowed
-    constexpr const char * SCHEDULED_TO_START = "scheduled to start";
-
    /// Finding all tables and databases which we're going to put to the backup and collecting their metadata.
    constexpr const char * GATHERING_METADATA = "gathering metadata";

@ -46,10 +42,6 @@ namespace BackupCoordinationStage

    /// Coordination stage meaning that a host finished its work.
    constexpr const char * COMPLETED = "completed";
-
-    /// Coordination stage meaning that backup/restore has failed due to an error
-    /// Check '/error' for the error message
-    constexpr const char * ERROR = "error";
 }

 }
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@ -10,33 +10,193 @@ class BackupCoordinationStageSync
 {
 public:
    BackupCoordinationStageSync(
-        const String & root_zookeeper_path_,
-        WithRetries & with_retries_,
+        bool is_restore_,                    /// true if this is a RESTORE ON CLUSTER command, false if this is a BACKUP ON CLUSTER command
+        const String & zookeeper_path_,      /// path to the "stage" folder in ZooKeeper
+        const String & current_host_,        /// the current host, or an empty string if it's the initiator of the BACKUP/RESTORE ON CLUSTER command
+        const Strings & all_hosts_,          /// all the hosts (including the initiator and the current host) performing the BACKUP/RESTORE ON CLUSTER command
+        bool allow_concurrency_,             /// whether it's allowed to have concurrent backups or restores.
+        const WithRetries & with_retries_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
+        QueryStatusPtr process_list_element_,
        LoggerPtr log_);

+    ~BackupCoordinationStageSync();
+
    /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
-    void set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts = false);
-    void setError(const String & current_host, const Exception & exception);
+    void setStage(const String & stage, const String & stage_result = {});

-    /// Sets the stage of the current host and waits until all hosts come to the same stage.
-    /// The function returns the messages all hosts set when they come to the required stage.
-    Strings wait(const Strings & all_hosts, const String & stage_to_wait);
+    /// Waits until all the specified hosts come to the specified stage.
+    /// The function returns the results which specified hosts set when they came to the required stage.
+    /// If it doesn't happen before the timeout then the function will stop waiting and throw an exception.
+    Strings waitForHostsToReachStage(const String & stage_to_wait, const Strings & hosts, std::optional<std::chrono::milliseconds> timeout = {}) const;

-    /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time.
-    Strings waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout);
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    void waitForOtherHostsToFinish() const;
+
+    /// Lets other hosts know that the current host has finished its work.
+    void finish(bool & other_hosts_also_finished);
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(std::exception_ptr exception) noexcept;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    bool tryWaitForOtherHostsToFinishAfterError() const noexcept;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of the error-handling process).
+    bool tryFinishAfterError(bool & other_hosts_also_finished) noexcept;
+
+    /// Returns a printable name of a specific host. For empty host the function returns "initiator".
+    static String getHostDesc(const String & host);
+    static String getHostsDesc(const Strings & hosts);

 private:
+    /// Initializes the original state. It will then be updated by readCurrentState().
+    void initializeState();
+
+    /// Creates the root node in ZooKeeper.
    void createRootNodes();

-    struct State;
-    State readCurrentState(WithRetries::RetriesControlHolder & retries_control_holder, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
+    /// Atomically creates both 'start' and 'alive' nodes and also checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void createStartAndAliveNodes();
+    void createStartAndAliveNodes(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);

-    Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;
+    /// Deserialize the version of a node stored in the 'start' node.
+    int parseStartNode(const String & start_node_contents, const String & host) const;

-    String zookeeper_path;
-    /// A reference to the field of parent object - BackupCoordinationRemote or RestoreCoordinationRemote
-    WithRetries & with_retries;
-    LoggerPtr log;
+    /// Recreates the 'alive' node if it doesn't exist. It's an ephemeral node so it's removed automatically after disconnections.
+    void createAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void checkConcurrency(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Watching thread periodically reads the current state from ZooKeeper and recreates the 'alive' node.
+    void startWatchingThread();
+    void stopWatchingThread();
+    void watchingThread();
+
+    /// Reads the current state from ZooKeeper without throwing exceptions.
+    void readCurrentState(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+    String getStageNodePath(const String & stage) const;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(const Exception & exception);
+    void setError(const Exception & exception);
+
+    /// Deserializes an error stored in the error node.
+    static std::pair<std::exception_ptr, String> parseErrorNode(const String & error_node_contents);
+
+    /// Reset the `connected` flag for each host.
+    void resetConnectedFlag();
+
+    /// Checks if the current query is cancelled, and if so then the function sets the `cancelled` flag in the current state.
+    void checkIfQueryCancelled();
+
+    /// Checks if the current state contains an error, and if so then the function passes this error to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfError();
+
+    /// Checks if some host was disconnected for too long, and if so then the function generates an error and passes it to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfDisconnectedTooLong();
+
+    /// Used by waitForHostsToReachStage() to check if everything is ready to return.
+    bool checkIfHostsReachStage(const Strings & hosts, const String & stage_to_wait, bool time_is_out, std::optional<std::chrono::milliseconds> timeout, Strings & results) const TSA_REQUIRES(mutex);
+
+    /// Creates the 'finish' node.
+    bool tryFinishImpl();
+    bool tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind);
+    void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Returns the version used by the initiator.
+    int getInitiatorVersion() const;
+
+    /// Waits until all the other hosts finish their work.
+    bool tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::seconds> timeout) const;
+    bool checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::milliseconds> timeout) const TSA_REQUIRES(mutex);
+
+    const bool is_restore;
+    const String operation_name;
+    const String current_host;
+    const String current_host_desc;
+    const Strings all_hosts;
+    const bool allow_concurrency;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const ThreadPoolCallbackRunnerUnsafe<void> schedule;
+    const QueryStatusPtr process_list_element;
+    const LoggerPtr log;
+
+    const std::chrono::seconds failure_after_host_disconnected_for_seconds;
+    const std::chrono::seconds finish_timeout_after_error;
+    const std::chrono::milliseconds sync_period_ms;
+    const size_t max_attempts_after_bad_version;
+
+    /// Paths in ZooKeeper.
+    const std::filesystem::path zookeeper_path;
+    const String root_zookeeper_path;
+    const String operation_node_path;
+    const String operation_node_name;
+    const String stage_node_path;
+    const String start_node_path;
+    const String finish_node_path;
+    const String num_hosts_node_path;
+    const String alive_node_path;
+    const String alive_tracker_node_path;
+    const String error_node_path;
+
+    std::shared_ptr<Poco::Event> zk_nodes_changed;
+
+    /// We store list of previously found ZooKeeper nodes to show better logging messages.
+    Strings zk_nodes;
+
+    /// Information about one host read from ZooKeeper.
+    struct HostInfo
+    {
+        String host; /// Host name; presumably an empty string for the initiator, as with `current_host` — confirm.
+        bool started = false; /// Presumably true once the host's 'start' node has been seen — confirm in readCurrentState().
+        bool connected = false; /// Reset for each host by resetConnectedFlag().
+        bool finished = false; /// Presumably true once the host's 'finish' node has been seen — confirm in readCurrentState().
+        int version = 1; /// Presumably the value parsed from the host's 'start' node by parseStartNode() — confirm.
+        std::map<String /* stage */, String /* result */> stages = {}; /// std::map because we need to compare states
+        std::exception_ptr exception = nullptr; /// Null by default; presumably filled from the error node via parseErrorNode() — confirm.
+
+        std::chrono::time_point<std::chrono::system_clock> last_connection_time = {}; /// Wall-clock (system_clock) timestamp.
+        std::chrono::time_point<std::chrono::steady_clock> last_connection_time_monotonic = {}; /// Monotonic (steady_clock) counterpart of the timestamp above.
+
+        bool operator ==(const HostInfo & other) const;
+        bool operator !=(const HostInfo & other) const;
+    };
+
+    /// Information about all the hosts participating in the current BACKUP or RESTORE operation.
+    struct State
+    {
+        std::map<String /* host */, HostInfo> hosts; /// std::map because we need to compare states
+        std::optional<String> host_with_error; /// Presumably the host whose error was read from ZooKeeper, empty if none — confirm in readCurrentState().
+        bool cancelled = false; /// Set by checkIfQueryCancelled() when the current query is cancelled.
+
+        bool operator ==(const State & other) const;
+        bool operator !=(const State & other) const;
+    };
+
+    State state TSA_GUARDED_BY(mutex);
+    mutable std::condition_variable state_changed;
+
+    std::future<void> watching_thread_future;
+    std::atomic<bool> should_stop_watching_thread = false;
+
+    struct FinishResult /// Stored in `finish_result`; presumably the remembered outcome of tryFinishImpl() — confirm.
+    {
+        bool succeeded = false;
+        std::exception_ptr exception; /// Default-constructed, i.e. null, until something is stored.
+        bool other_hosts_also_finished = false; /// Same name as the out-parameter of finish() and tryFinishAfterError().
+    };
+    FinishResult finish_result TSA_GUARDED_BY(mutex);
+
+    mutable std::mutex mutex;
 };

 }
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@ -102,7 +102,6 @@ BackupEntriesCollector::BackupEntriesCollector(
    , read_settings(read_settings_)
    , context(context_)
    , process_list_element(context->getProcessListElement())
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , collect_metadata_timeout(context->getConfigRef().getUInt64(
          "backups.collect_metadata_timeout", context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000)))
    , attempts_to_collect_metadata_before_sleep(context->getConfigRef().getUInt("backups.attempts_to_collect_metadata_before_sleep", 2))
@ -176,21 +175,7 @@ Strings BackupEntriesCollector::setStage(const String & new_stage, const String
    checkIsQueryCancelled();

    current_stage = new_stage;
-    backup_coordination->setStage(new_stage, message);
-
-    if (new_stage == Stage::formatGatheringMetadata(0))
-    {
-        return backup_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-    }
-    if (new_stage.starts_with(Stage::GATHERING_METADATA))
-    {
-        auto current_time = std::chrono::steady_clock::now();
-        auto end_of_timeout = std::max(current_time, collect_metadata_end_time);
-        return backup_coordination->waitForStage(
-            new_stage, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - current_time));
-    }
-
-    return backup_coordination->waitForStage(new_stage);
+    return backup_coordination->setStage(new_stage, message, /* sync = */ true);
 }

 void BackupEntriesCollector::checkIsQueryCancelled() const
--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@ -111,10 +111,6 @@ private:
    ContextPtr context;
    QueryStatusPtr process_list_element;

-    /// The time a BACKUP ON CLUSTER or RESTORE ON CLUSTER command will wait until all the nodes receive the BACKUP (or RESTORE) query and start working.
-    /// This setting is similar to `distributed_ddl_task_timeout`.
-    const std::chrono::milliseconds on_cluster_first_sync_timeout;
-
    /// The time a BACKUP command will try to collect the metadata of tables & databases.
    const std::chrono::milliseconds collect_metadata_timeout;

--- a/src/Backups/BackupIO.h
+++ b/src/Backups/BackupIO.h
@ -5,6 +5,7 @@

 namespace DB
 {
+
 class IDisk;
 using DiskPtr = std::shared_ptr<IDisk>;
 class SeekableReadBuffer;
@ -63,9 +64,13 @@ public:

    virtual void copyFile(const String & destination, const String & source, size_t size) = 0;

+    /// Removes a file written to the backup, if it still exists.
    virtual void removeFile(const String & file_name) = 0;
    virtual void removeFiles(const Strings & file_names) = 0;

+    /// Removes the backup folder if it's empty or contains empty subfolders.
+    virtual void removeEmptyDirectories() = 0;
+
    virtual const ReadSettings & getReadSettings() const = 0;
    virtual const WriteSettings & getWriteSettings() const = 0;
    virtual size_t getWriteBufferSize() const = 0;
--- a/src/Backups/BackupIO_AzureBlobStorage.h
+++ b/src/Backups/BackupIO_AzureBlobStorage.h
@ -81,6 +81,7 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override {}

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
--- a/src/Backups/BackupIO_Disk.cpp
+++ b/src/Backups/BackupIO_Disk.cpp
@ -91,16 +91,36 @@ std::unique_ptr<WriteBuffer> BackupWriterDisk::writeFile(const String & file_nam
 void BackupWriterDisk::removeFile(const String & file_name)
 {
    disk->removeFileIfExists(root_path / file_name); /// `file_name` is resolved against `root_path`; directories are handled separately by removeEmptyDirectories().
-    if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path))
-        disk->removeDirectory(root_path);
 }

 void BackupWriterDisk::removeFiles(const Strings & file_names)
 {
    for (const auto & file_name : file_names) /// Each name is resolved against `root_path`; directories are handled separately by removeEmptyDirectories().
        disk->removeFileIfExists(root_path / file_name);
-    if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path))
-        disk->removeDirectory(root_path);
+}
+
+void BackupWriterDisk::removeEmptyDirectories()
+{
+    removeEmptyDirectoriesImpl(root_path); /// Starts the recursive clean-up at the backup's root directory.
+}
+
+void BackupWriterDisk::removeEmptyDirectoriesImpl(const fs::path & current_dir)
+{
+    if (!disk->existsDirectory(current_dir)) /// Nothing to do for a missing directory; presumably also false for regular files, which the loop below passes in too — confirm.
+        return;
+
+    if (disk->isDirectoryEmpty(current_dir)) /// Already empty: remove it right away, no need to iterate.
+    {
+        disk->removeDirectory(current_dir);
+        return;
+    }
+
+    /// Backups are not too deep, so recursion is good enough here. Children are processed first so that a directory holding only empty subdirectories becomes empty itself.
+    for (auto it = disk->iterateDirectory(current_dir); it->isValid(); it->next()) /// NOTE(review): entries can be removed while this iterator is live — confirm the disk's iterator tolerates that.
+        removeEmptyDirectoriesImpl(current_dir / it->name());
+
+    if (disk->isDirectoryEmpty(current_dir)) /// Re-check after the recursion: the directory is removed only if nothing is left in it.
+        disk->removeDirectory(current_dir);
 }

 void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path,
--- a/src/Backups/BackupIO_Disk.h
+++ b/src/Backups/BackupIO_Disk.h
@ -50,9 +50,11 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override;

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
+    void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir);

    const DiskPtr disk;
    const std::filesystem::path root_path;
--- a/src/Backups/BackupIO_File.cpp
+++ b/src/Backups/BackupIO_File.cpp
@ -106,16 +106,36 @@ std::unique_ptr<WriteBuffer> BackupWriterFile::writeFile(const String & file_nam
 void BackupWriterFile::removeFile(const String & file_name)
 {
    (void)fs::remove(root_path / file_name); /// Return value is discarded on purpose; directories are handled separately by removeEmptyDirectories().
-    if (fs::is_directory(root_path) && fs::is_empty(root_path))
-        (void)fs::remove(root_path);
 }

 void BackupWriterFile::removeFiles(const Strings & file_names)
 {
    for (const auto & file_name : file_names) /// Each name is resolved against `root_path`; directories are handled separately by removeEmptyDirectories().
        (void)fs::remove(root_path / file_name); /// Return value is discarded on purpose.
-    if (fs::is_directory(root_path) && fs::is_empty(root_path))
-        (void)fs::remove(root_path);
+}
+
+void BackupWriterFile::removeEmptyDirectories()
+{
+    removeEmptyDirectoriesImpl(root_path); /// Starts the recursive clean-up at the backup's root directory.
+}
+
+void BackupWriterFile::removeEmptyDirectoriesImpl(const fs::path & current_dir)
+{
+    if (!fs::is_directory(current_dir)) /// Anything that is not a directory is skipped, so this function never removes files; the loop below passes every entry in.
+        return;
+
+    if (fs::is_empty(current_dir)) /// Already empty: remove it right away, no need to iterate.
+    {
+        (void)fs::remove(current_dir);
+        return;
+    }
+
+    /// Backups are not too deep, so recursion is good enough here. Children are processed first so that a directory holding only empty subdirectories becomes empty itself.
+    for (const auto & it : std::filesystem::directory_iterator{current_dir}) /// NOTE(review): subdirectories are removed while this iterator is live; std::filesystem leaves it unspecified whether such changes are observed — confirm this is acceptable.
+        removeEmptyDirectoriesImpl(it.path());
+
+    if (fs::is_empty(current_dir)) /// Re-check after the recursion: the directory is removed only if nothing is left in it.
+        (void)fs::remove(current_dir);
 }

 void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path,
--- a/src/Backups/BackupIO_File.h
+++ b/src/Backups/BackupIO_File.h
@ -42,9 +42,11 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override;

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
+    void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir);

    const std::filesystem::path root_path;
    const DataSourceDescription data_source_description;
--- a/src/Backups/BackupIO_S3.h
+++ b/src/Backups/BackupIO_S3.h
@ -74,6 +74,7 @@ public:

    void removeFile(const String & file_name) override;
    void removeFiles(const Strings & file_names) override;
+    void removeEmptyDirectories() override {}

 private:
    std::unique_ptr<ReadBuffer> readFile(const String & file_name, size_t expected_file_size) override;
--- a/src/Backups/BackupImpl.cpp
+++ b/src/Backups/BackupImpl.cpp
@ -147,11 +147,11 @@ BackupImpl::BackupImpl(

 BackupImpl::~BackupImpl()
 {
-    if ((open_mode == OpenMode::WRITE) && !is_internal_backup && !writing_finalized && !std::uncaught_exceptions() && !std::current_exception())
+    if ((open_mode == OpenMode::WRITE) && !writing_finalized && !corrupted)
    {
        /// It is suspicious to destroy BackupImpl without finalization while writing a backup when there is no exception.
-        LOG_ERROR(log, "BackupImpl is not finalized when destructor is called. Stack trace: {}", StackTrace().toString());
-        chassert(false && "BackupImpl is not finalized when destructor is called.");
+        LOG_ERROR(log, "BackupImpl is not finalized or marked as corrupted when destructor is called. Stack trace: {}", StackTrace().toString());
+        chassert(false, "BackupImpl is not finalized or marked as corrupted when destructor is called.");
    }

    try
@ -196,9 +196,6 @@ void BackupImpl::open()

    if (open_mode == OpenMode::READ)
        readBackupMetadata();
-
-    if ((open_mode == OpenMode::WRITE) && base_backup_info)
-        base_backup_uuid = getBaseBackupUnlocked()->getUUID();
 }

 void BackupImpl::close()
@ -280,6 +277,8 @@ std::shared_ptr<const IBackup> BackupImpl::getBaseBackupUnlocked() const
                toString(base_backup->getUUID()),
                (base_backup_uuid ? toString(*base_backup_uuid) : ""));
        }
+
+        base_backup_uuid = base_backup->getUUID();
    }
    return base_backup;
 }
@ -369,7 +368,7 @@ void BackupImpl::writeBackupMetadata()
        if (base_backup_in_use)
        {
            *out << "<base_backup>" << xml << base_backup_info->toString() << "</base_backup>";
-            *out << "<base_backup_uuid>" << toString(*base_backup_uuid) << "</base_backup_uuid>";
+            *out << "<base_backup_uuid>" << getBaseBackupUnlocked()->getUUID() << "</base_backup_uuid>";
        }
    }

@ -594,9 +593,6 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const

 void BackupImpl::removeLockFile()
 {
-    if (is_internal_backup)
-        return; /// Internal backup must not remove the lock file (it's still used by the initiator).
-
    if (checkLockFile(false))
        writer->removeFile(lock_file_name);
 }
@ -989,8 +985,11 @@ void BackupImpl::finalizeWriting()
    if (open_mode != OpenMode::WRITE)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");

+    if (corrupted)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup can't be finalized after an error happened");
+
    if (writing_finalized)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized");
+        return;

    if (!is_internal_backup)
    {
@ -1015,20 +1014,58 @@ void BackupImpl::setCompressedSize()
 }


-void BackupImpl::tryRemoveAllFiles()
+/// Marks a backup which is being written as corrupted. The whole body runs under `mutex`.
+/// Returns true if the backup is marked as corrupted when the function returns (either it was marked
+/// by this call or it had been marked before).
+/// Returns false if the backup is not opened for writing, if writing has been finalized already,
+/// or if an exception was thrown while handling the request (the exception is logged, never propagated).
+bool BackupImpl::setIsCorrupted() noexcept
 {
-    if (open_mode != OpenMode::WRITE)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing");
-
-    if (is_internal_backup)
-        return;
-
    try
    {
-        LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
+        std::lock_guard lock{mutex};
+        if (open_mode != OpenMode::WRITE)
+        {
+            LOG_ERROR(log, "Backup is not opened for writing. Stack trace: {}", StackTrace().toString());
+            chassert(false, "Backup is not opened for writing when setIsCorrupted() is called");
+            return false;
+        }
+
+        /// A finalized backup is left as it is: it's not marked as corrupted.
+        if (writing_finalized)
+        {
+            LOG_WARNING(log, "An error happened after the backup was completed successfully, the backup must be correct!");
+            return false;
+        }
+
+        /// Already marked by a previous call, nothing more to do.
+        if (corrupted)
+            return true;
+
+        LOG_WARNING(log, "An error happened, the backup won't be completed");
+
        closeArchive(/* finalize= */ false);

+        /// The flag is set only after closeArchive() returned; if it throws, `corrupted` stays false
+        /// and the catch block below makes this function return false.
+        corrupted = true;
+        return true;
+    }
+    catch (...)
+    {
+        DB::tryLogCurrentException(log, "Caught exception while setting that the backup was corrupted");
+        return false;
+    }
+}
+
+
+bool BackupImpl::tryRemoveAllFiles() noexcept
+{
+    try
+    {
+        std::lock_guard lock{mutex};
+        if (!corrupted)
+        {
+            LOG_ERROR(log, "Backup is not set as corrupted. Stack trace: {}", StackTrace().toString());
+            chassert(false, "Backup is not set as corrupted when tryRemoveAllFiles() is called");
+            return false;
+        }
+
+        LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging);
+
        Strings files_to_remove;
+
        if (use_archive)
        {
            files_to_remove.push_back(archive_params.archive_name);
@ -1041,14 +1078,17 @@ void BackupImpl::tryRemoveAllFiles()
        }

        if (!checkLockFile(false))
-            return;
+            return false;

        writer->removeFiles(files_to_remove);
        removeLockFile();
+        writer->removeEmptyDirectories();
+        return true;
    }
    catch (...)
    {
-        DB::tryLogCurrentException(__PRETTY_FUNCTION__);
+        DB::tryLogCurrentException(log, "Caught exception while removing files of a corrupted backup");
+        return false;
    }
 }

--- a/src/Backups/BackupImpl.h
+++ b/src/Backups/BackupImpl.h
@ -86,7 +86,8 @@ public:
    void writeFile(const BackupFileInfo & info, BackupEntryPtr entry) override;
    bool supportsWritingInMultipleThreads() const override { return !use_archive; }
    void finalizeWriting() override;
-    void tryRemoveAllFiles() override;
+    bool setIsCorrupted() noexcept override;
+    bool tryRemoveAllFiles() noexcept override;

 private:
    void open();
@ -146,13 +147,14 @@ private:
    int version;
    mutable std::optional<BackupInfo> base_backup_info;
    mutable std::shared_ptr<const IBackup> base_backup;
-    std::optional<UUID> base_backup_uuid;
+    mutable std::optional<UUID> base_backup_uuid;
    std::shared_ptr<IArchiveReader> archive_reader;
    std::shared_ptr<IArchiveWriter> archive_writer;
    String lock_file_name;
    std::atomic<bool> lock_file_before_first_file_checked = false;

    bool writing_finalized = false;
+    bool corrupted = false;
    bool deduplicate_files = true;
    bool use_same_s3_credentials_for_base_backup = false;
    bool use_same_password_for_base_backup = false;
--- a/src/Backups/BackupKeeperSettings.cpp
+++ b/src/Backups/BackupKeeperSettings.cpp
@ -0,0 +1,58 @@
+#include <Backups/BackupKeeperSettings.h>
+
+#include <Core/Settings.h>
+#include <Interpreters/Context.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+
+namespace DB
+{
+
+namespace Setting
+{
+    extern const SettingsUInt64 backup_restore_keeper_max_retries;
+    extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms;
+    extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms;
+    extern const SettingsUInt64 backup_restore_failure_after_host_disconnected_for_seconds;
+    extern const SettingsUInt64 backup_restore_keeper_max_retries_while_initializing;
+    extern const SettingsUInt64 backup_restore_keeper_max_retries_while_handling_error;
+    extern const SettingsUInt64 backup_restore_finish_timeout_after_error_sec;
+    extern const SettingsUInt64 backup_restore_keeper_value_max_size;
+    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi;
+    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread;
+    extern const SettingsFloat backup_restore_keeper_fault_injection_probability;
+    extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed;
+}
+
+/// Builds BackupKeeperSettings from `context`.
+/// Most fields are copied from the `backup_restore_*` settings returned by context->getSettingsRef().
+/// `sync_period_ms` and `max_attempts_after_bad_version` are read from the configuration returned by
+/// context->getConfigRef(), and only if the corresponding key exists; otherwise they keep the values
+/// they got when `keeper_settings` was default-constructed.
+BackupKeeperSettings BackupKeeperSettings::fromContext(const ContextPtr & context)
+{
+    BackupKeeperSettings keeper_settings;
+
+    const auto & settings = context->getSettingsRef();
+    const auto & config = context->getConfigRef();
+
+    /// Retries and backoff; the backoff settings are interpreted as milliseconds.
+    keeper_settings.max_retries = settings[Setting::backup_restore_keeper_max_retries];
+    keeper_settings.retry_initial_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_initial_backoff_ms]};
+    keeper_settings.retry_max_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_max_backoff_ms]};
+
+    /// Failure detection and error handling; the two durations are interpreted as seconds.
+    keeper_settings.failure_after_host_disconnected_for_seconds = std::chrono::seconds{settings[Setting::backup_restore_failure_after_host_disconnected_for_seconds]};
+    keeper_settings.max_retries_while_initializing = settings[Setting::backup_restore_keeper_max_retries_while_initializing];
+    keeper_settings.max_retries_while_handling_error = settings[Setting::backup_restore_keeper_max_retries_while_handling_error];
+    keeper_settings.finish_timeout_after_error = std::chrono::seconds(settings[Setting::backup_restore_finish_timeout_after_error_sec]);
+
+    /// Optional overrides from the configuration, applied only if the key is present.
+    if (config.has("backups.sync_period_ms"))
+        keeper_settings.sync_period_ms = std::chrono::milliseconds{config.getUInt64("backups.sync_period_ms")};
+
+    if (config.has("backups.max_attempts_after_bad_version"))
+        keeper_settings.max_attempts_after_bad_version = config.getUInt64("backups.max_attempts_after_bad_version");
+
+    /// Size limits, batch sizes and fault injection.
+    keeper_settings.value_max_size = settings[Setting::backup_restore_keeper_value_max_size];
+    keeper_settings.batch_size_for_multi = settings[Setting::backup_restore_batch_size_for_keeper_multi];
+    keeper_settings.batch_size_for_multiread = settings[Setting::backup_restore_batch_size_for_keeper_multiread];
+    keeper_settings.fault_injection_probability = settings[Setting::backup_restore_keeper_fault_injection_probability];
+    keeper_settings.fault_injection_seed = settings[Setting::backup_restore_keeper_fault_injection_seed];
+
+    return keeper_settings;
+}
+
+}
--- a/src/Backups/BackupKeeperSettings.h
+++ b/src/Backups/BackupKeeperSettings.h
@ -0,0 +1,64 @@
+#pragma once
+
+#include <Interpreters/Context_fwd.h>
+
+
+namespace DB
+{
+
+/// Settings for [Zoo]Keeper-related work during BACKUP or RESTORE.
+/// The initializers below are the values of a default-constructed object;
+/// fromContext() replaces them with the values taken from a context.
+struct BackupKeeperSettings
+{
+    /// Maximum number of retries in the middle of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Should be big enough so the whole operation won't be cancelled in the middle of it because of a temporary ZooKeeper failure.
+    UInt64 max_retries{1000};
+
+    /// Initial backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_initial_backoff_ms{100};
+
+    /// Max backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_max_backoff_ms{5000};
+
+    /// If a host during BACKUP ON CLUSTER or RESTORE ON CLUSTER doesn't recreate its 'alive' node in ZooKeeper
+    /// for this amount of time then the whole backup or restore is considered as failed.
+    /// Should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure.
+    /// Set to zero to disable (if it's zero and some host crashed then BACKUP ON CLUSTER or RESTORE ON CLUSTER will be waiting
+    /// for the crashed host forever until the operation is explicitly cancelled with KILL QUERY).
+    std::chrono::seconds failure_after_host_disconnected_for_seconds{3600};
+
+    /// Maximum number of retries during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because if the operation is going to fail then it's better if it fails faster.
+    UInt64 max_retries_while_initializing{20};
+
+    /// Maximum number of retries while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because those retries are just for cleanup after the operation has failed already.
+    UInt64 max_retries_while_handling_error{20};
+
+    /// How long the initiator should wait for other hosts to handle the 'error' node and finish their work.
+    std::chrono::seconds finish_timeout_after_error{180};
+
+    /// How often the "stage" folder in ZooKeeper must be scanned in a background thread to track changes done by other hosts.
+    /// fromContext() takes this value from the configuration key `backups.sync_period_ms` if that key is present.
+    std::chrono::milliseconds sync_period_ms{5000};
+
+    /// Number of attempts after getting error ZBADVERSION from ZooKeeper.
+    /// fromContext() takes this value from the configuration key `backups.max_attempts_after_bad_version` if that key is present.
+    size_t max_attempts_after_bad_version{10};
+
+    /// Maximum size of data of a ZooKeeper's node during backup.
+    /// NOTE(review): the default is 2^20, so the unit is presumably bytes - confirm against the setting's description.
+    UInt64 value_max_size{1048576};
+
+    /// Maximum size of a batch for a multi request.
+    UInt64 batch_size_for_multi{1000};
+
+    /// Maximum size of a batch for a multiread request.
+    UInt64 batch_size_for_multiread{10000};
+
+    /// Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f].
+    Float64 fault_injection_probability{0};
+
+    /// Seed for `fault_injection_probability`: 0 - random seed, otherwise the setting value.
+    UInt64 fault_injection_seed{0};
+
+    /// Builds the settings from the settings and the configuration of `context`.
+    static BackupKeeperSettings fromContext(const ContextPtr & context);
+};
+
+}
--- a/src/Backups/BackupSettings.cpp
+++ b/src/Backups/BackupSettings.cpp
@ -74,6 +74,17 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query)
    return res;
 }

+/// Returns the value of `async` stored in `query.settings`, without parsing any of the other settings.
+/// Returns false if the query has no settings or if `async` is not among the changes.
+bool BackupSettings::isAsync(const ASTBackupQuery & query)
+{
+    if (query.settings)
+    {
+        /// tryGet() returns nullptr if there is no change named "async".
+        const auto * field = query.settings->as<const ASTSetQuery &>().changes.tryGet("async");
+        if (field)
+            return field->safeGet<bool>();
+    }
+    return false; /// `async` is false by default.
+}
+
 void BackupSettings::copySettingsToQuery(ASTBackupQuery & query) const
 {
    auto query_settings = std::make_shared<ASTSetQuery>();
--- a/src/Backups/BackupSettings.h
+++ b/src/Backups/BackupSettings.h
@ -101,6 +101,8 @@ struct BackupSettings
    static BackupSettings fromBackupQuery(const ASTBackupQuery & query);
    void copySettingsToQuery(ASTBackupQuery & query) const;

+    static bool isAsync(const ASTBackupQuery & query);
+
    struct Util
    {
        static std::vector<Strings> clusterHostIDsFromAST(const IAST & ast);
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -23,6 +23,7 @@ using BackupMutablePtr = std::shared_ptr<IBackup>;
 using BackupPtr = std::shared_ptr<const IBackup>;
 class IBackupEntry;
 using BackupEntries = std::vector<std::pair<String, std::shared_ptr<const IBackupEntry>>>;
+class BackupConcurrencyCounters;
 using DataRestoreTasks = std::vector<std::function<void()>>;
 struct ReadSettings;
 class BackupLog;
@ -31,6 +32,10 @@ using ThreadGroupPtr = std::shared_ptr<ThreadGroup>;
 class QueryStatus;
 using QueryStatusPtr = std::shared_ptr<QueryStatus>;
 class ProcessList;
+class Cluster;
+using ClusterPtr = std::shared_ptr<Cluster>;
+class AccessRightsElements;
+struct ZooKeeperRetriesInfo;


 /// Manager of backups and restores: executes backups and restores' threads in the background.
@ -47,18 +52,18 @@ public:
    /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation.
    /// For asynchronous operations the function throws no exceptions on failure usually,
    /// call getInfo() on a returned operation id to check for errors.
-    BackupOperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);
+    std::pair<BackupOperationID, BackupStatus> start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context);

    /// Waits until the specified backup or restore operation finishes or stops.
    /// The function returns immediately if the operation is already finished.
-    void wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);
+    BackupStatus wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true);

    /// Waits until all running backup and restore operations finish or stop.
    void waitAll();

    /// Cancels the specified backup or restore operation.
    /// The function does nothing if this operation has already finished.
-    void cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);
+    BackupStatus cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true);

    /// Cancels all running backup and restore operations.
    void cancelAll(bool wait_ = true);
@ -67,26 +72,32 @@ public:
    std::vector<BackupOperationInfo> getAllInfos() const;

 private:
-    BackupOperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context);
+    std::pair<BackupOperationID, BackupStatus> startMakingBackup(const ASTPtr & query, const ContextPtr & context);
+    struct BackupStarter;
+
+    BackupMutablePtr openBackupForWriting(const BackupInfo & backup_info, const BackupSettings & backup_settings, std::shared_ptr<IBackupCoordination> backup_coordination, const ContextPtr & context) const;

    void doBackup(
-        BackupMutablePtr & backup,
+        BackupMutablePtr backup,
        const std::shared_ptr<ASTBackupQuery> & backup_query,
        const BackupOperationID & backup_id,
        const String & backup_name_for_logging,
-        const BackupInfo & backup_info,
-        BackupSettings backup_settings,
+        const BackupSettings & backup_settings,
        std::shared_ptr<IBackupCoordination> backup_coordination,
-        const ContextPtr & context,
-        ContextMutablePtr mutable_context);
+        ContextMutablePtr context,
+        bool on_cluster,
+        const ClusterPtr & cluster);

    /// Builds file infos for specified backup entries.
    void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr<IBackupCoordination> backup_coordination, QueryStatusPtr process_list_element);

    /// Write backup entries to an opened backup.
-    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool internal, QueryStatusPtr process_list_element);
+    void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr<IBackupCoordination> backup_coordination, bool is_internal_backup, QueryStatusPtr process_list_element);

-    BackupOperationID startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    std::pair<BackupOperationID, BackupStatus> startRestoring(const ASTPtr & query, ContextMutablePtr context);
+    struct RestoreStarter;
+
+    BackupPtr openBackupForReading(const BackupInfo & backup_info, const RestoreSettings & restore_settings, const ContextPtr & context) const;

    void doRestore(
        const std::shared_ptr<ASTBackupQuery> & restore_query,
@ -95,7 +106,17 @@ private:
        const BackupInfo & backup_info,
        RestoreSettings restore_settings,
        std::shared_ptr<IRestoreCoordination> restore_coordination,
-        ContextMutablePtr context);
+        ContextMutablePtr context,
+        bool on_cluster,
+        const ClusterPtr & cluster);
+
+    std::shared_ptr<IBackupCoordination> makeBackupCoordination(bool on_cluster, const BackupSettings & backup_settings, const ContextPtr & context) const;
+    std::shared_ptr<IRestoreCoordination> makeRestoreCoordination(bool on_cluster, const RestoreSettings & restore_settings, const ContextPtr & context) const;
+
+    /// Sends a BACKUP or RESTORE query to other hosts.
+    void sendQueryToOtherHosts(const ASTBackupQuery & backup_or_restore_query, const ClusterPtr & cluster,
+        size_t only_shard_num, size_t only_replica_num, ContextMutablePtr context, const AccessRightsElements & access_to_check,
+        const ZooKeeperRetriesInfo & retries_info) const;

    /// Run data restoring tasks which insert data to tables.
    void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);
@ -139,6 +160,8 @@ private:

    std::shared_ptr<BackupLog> backup_log;
    ProcessList & process_list;
+
+    std::unique_ptr<BackupConcurrencyCounters> concurrency_counters;
 };

 }
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@ -121,8 +121,13 @@ public:
    /// Finalizes writing the backup, should be called after all entries have been successfully written.
    virtual void finalizeWriting() = 0;

-    /// Try to remove all files copied to the backup. Used after an exception or it the backup was cancelled.
-    virtual void tryRemoveAllFiles() = 0;
+    /// Sets that a non-retriable error happened while the backup was being written which means that
+    /// the backup is most likely corrupted and it can't be finalized.
+    /// This function is called while handling an exception or if the backup was cancelled.
+    virtual bool setIsCorrupted() noexcept = 0;
+
+    /// Try to remove all files copied to the backup. Could be used after setIsCorrupted().
+    virtual bool tryRemoveAllFiles() noexcept = 0;
 };

 using BackupPtr = std::shared_ptr<const IBackup>;
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@ -5,26 +5,44 @@

 namespace DB
 {
-class Exception;
 struct BackupFileInfo;
 using BackupFileInfos = std::vector<BackupFileInfo>;
 enum class AccessEntityType : uint8_t;
 enum class UserDefinedSQLObjectType : uint8_t;
+struct ZooKeeperRetriesInfo;

 /// Replicas use this class to coordinate what they're writing to a backup while executing BACKUP ON CLUSTER.
-/// There are two implementation of this interface: BackupCoordinationLocal and BackupCoordinationRemote.
+/// There are two implementations of this interface: BackupCoordinationLocal and BackupCoordinationOnCluster.
 /// BackupCoordinationLocal is used while executing BACKUP without ON CLUSTER and performs coordination in memory.
-/// BackupCoordinationRemote is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
+/// BackupCoordinationOnCluster is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
 class IBackupCoordination
 {
 public:
    virtual ~IBackupCoordination() = default;

    /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;
+
+    /// Sets that the backup query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setBackupQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;

    struct PartNameAndChecksum
    {
@ -87,9 +105,7 @@ public:
    /// Starts writing a specified file, the function returns false if that file is already being written concurrently.
    virtual bool startWritingFile(size_t data_file_index) = 0;

-    /// This function is used to check if concurrent backups are running
-    /// other than the backup passed to the function
-    virtual bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };

 }
--- a/src/Backups/IRestoreCoordination.h
+++ b/src/Backups/IRestoreCoordination.h
@ -5,26 +5,42 @@

 namespace DB
 {
-class Exception;
 enum class UserDefinedSQLObjectType : uint8_t;
 class ASTCreateQuery;
+struct ZooKeeperRetriesInfo;

 /// Replicas use this class to coordinate what they're reading from a backup while executing RESTORE ON CLUSTER.
-/// There are two implementation of this interface: RestoreCoordinationLocal and RestoreCoordinationRemote.
+/// There are two implementations of this interface: RestoreCoordinationLocal and RestoreCoordinationOnCluster.
 /// RestoreCoordinationLocal is used while executing RESTORE without ON CLUSTER and performs coordination in memory.
-/// RestoreCoordinationRemote is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
+/// RestoreCoordinationOnCluster is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
 class IRestoreCoordination
 {
 public:
    virtual ~IRestoreCoordination() = default;

    /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;

-    static constexpr const char * kErrorStatus = "error";
+    /// Sets that the restore query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setRestoreQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0;
@ -49,9 +65,7 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    virtual void generateUUIDForTable(ASTCreateQuery & create_query) = 0;

-    /// This function is used to check if concurrent restores are running
-    /// other than the restore passed to the function
-    virtual bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };

 }
--- a/src/Backups/RestoreCoordinationLocal.cpp
+++ b/src/Backups/RestoreCoordinationLocal.cpp
@ -1,32 +1,24 @@
 #include <Backups/RestoreCoordinationLocal.h>
+
 #include <Parsers/ASTCreateQuery.h>
 #include <Parsers/formatAST.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/logger_useful.h>


 namespace DB
 {

-RestoreCoordinationLocal::RestoreCoordinationLocal() : log(getLogger("RestoreCoordinationLocal"))
+/// Creates the in-memory coordination object for a restore.
+/// The arguments are only forwarded to `concurrency_check`, which is constructed with `is_restore = true`
+/// and `on_cluster = false`.
+/// NOTE(review): presumably BackupConcurrencyCheck is what rejects or allows concurrent restores - confirm in BackupConcurrencyCheck.h.
+RestoreCoordinationLocal::RestoreCoordinationLocal(
+    const UUID & restore_uuid, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("RestoreCoordinationLocal"))
+    , concurrency_check(restore_uuid, /* is_restore = */ true, /* on_cluster = */ false, allow_concurrent_restore_, concurrency_counters_)
 {
 }

 RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;

-void RestoreCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void RestoreCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+/// Returns a value-initialized ZooKeeperRetriesInfo: this implementation performs coordination in memory,
+/// so it has no ON CLUSTER specific retry settings to report.
+ZooKeeperRetriesInfo RestoreCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
    return {};
 }
@ -63,7 +55,7 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
 {
    String query_str = serializeAST(create_query);

-    auto find_in_map = [&]
+    auto find_in_map = [&]() TSA_REQUIRES(mutex)
    {
        auto it = create_query_uuids.find(query_str);
        if (it != create_query_uuids.end())
@ -91,14 +83,4 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
    }
 }

-bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
-{
-    if (num_active_restores > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_restores={}", num_active_restores);
-        return true;
-    }
-    return false;
-}
-
 }
--- a/src/Backups/RestoreCoordinationLocal.h
+++ b/src/Backups/RestoreCoordinationLocal.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
 #include <Parsers/CreateQueryUUIDs.h>
 #include <Common/Logger.h>
 #include <mutex>
@ -12,19 +13,20 @@ namespace DB
 {
 class ASTCreateQuery;

-
 /// Implementation of the IRestoreCoordination interface performing coordination in memory.
 class RestoreCoordinationLocal : public IRestoreCoordination
 {
 public:
-    RestoreCoordinationLocal();
+    RestoreCoordinationLocal(const UUID & restore_uuid_, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_);
    ~RestoreCoordinationLocal() override;

-    /// Sets the current stage and waits for other hosts to come to this stage too.
-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String &, const String &, bool) override { return {}; }
+    void setRestoreQueryWasSentToOtherHosts() override {}
+    bool trySetError(std::exception_ptr) override { return true; }
+    void finish() override {}
+    bool tryFinishAfterError() noexcept override { return true; }
+    void waitForOtherHostsToFinish() override {}
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -49,15 +51,16 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    void generateUUIDForTable(ASTCreateQuery & create_query) override;

-    bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    LoggerPtr const log;
+    BackupConcurrencyCheck concurrency_check;

-    std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases;
-    std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables;
-    std::unordered_map<String, CreateQueryUUIDs> create_query_uuids;
-    std::unordered_set<String /* root_zk_path */> acquired_data_in_keeper_map_tables;
+    std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases TSA_GUARDED_BY(mutex);
+    std::unordered_set<String /* table_zk_path */> acquired_data_in_replicated_tables TSA_GUARDED_BY(mutex);
+    std::unordered_map<String, CreateQueryUUIDs> create_query_uuids TSA_GUARDED_BY(mutex);
+    std::unordered_set<String /* root_zk_path */> acquired_data_in_keeper_map_tables TSA_GUARDED_BY(mutex);

    mutable std::mutex mutex;
 };
--- a/src/Backups/RestoreCoordinationOnCluster.cpp
+++ b/src/Backups/RestoreCoordinationOnCluster.cpp
@ -0,0 +1,318 @@
+#include <Backups/BackupCoordinationOnCluster.h>
+
+#include <Backups/BackupCoordinationStage.h>
+#include <Backups/BackupCoordinationStageSync.h>
+#include <Backups/RestoreCoordinationOnCluster.h>
+#include <Parsers/ASTCreateQuery.h>
+#include <Parsers/CreateQueryUUIDs.h>
+#include <Parsers/formatAST.h>
+#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/escapeForFileName.h>
+
+
+namespace DB
+{
+
+/// Creates the coordination object used by one host of a RESTORE ON CLUSTER query.
+/// All nodes of this restore are kept under `<root_zookeeper_path_>/restore-<restore_uuid_>`.
+/// The constructor body calls createRootNodes(), so constructing the object already talks to ZooKeeper.
+RestoreCoordinationOnCluster::RestoreCoordinationOnCluster(
+    const UUID & restore_uuid_,
+    const String & root_zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_,
+    const BackupKeeperSettings & keeper_settings_,
+    const String & current_host_,
+    const Strings & all_hosts_,
+    bool allow_concurrent_restore_,
+    BackupConcurrencyCounters & concurrency_counters_,
+    ThreadPoolCallbackRunnerUnsafe<void> schedule_,
+    QueryStatusPtr process_list_element_)
+    : root_zookeeper_path(root_zookeeper_path_)
+    , keeper_settings(keeper_settings_)
+    , restore_uuid(restore_uuid_)
+    , zookeeper_path(root_zookeeper_path_ + "/restore-" + toString(restore_uuid_))
+    , all_hosts(all_hosts_)
+    /// NOTE: several initializers below read members (all_hosts, current_host, log, keeper_settings, zookeeper_path,
+    /// with_retries) rather than the constructor arguments, so they rely on the member declaration order in the class.
+    , all_hosts_without_initiator(BackupCoordinationOnCluster::excludeInitiator(all_hosts))
+    , current_host(current_host_)
+    , current_host_index(BackupCoordinationOnCluster::findCurrentHostIndex(current_host, all_hosts))
+    , log(getLogger("RestoreCoordinationOnCluster"))
+    /// The callback syncs the root path; presumably WithRetries invokes it whenever a ZooKeeper session is (re)created - TODO confirm.
+    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
+    , concurrency_check(restore_uuid_, /* is_restore = */ true, /* on_cluster = */ true, allow_concurrent_restore_, concurrency_counters_)
+    /// Stage synchronization uses the "stage" subnode of this restore's node.
+    , stage_sync(/* is_restore = */ true, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_restore_, with_retries, schedule_, process_list_element_, log)
+    , cleaner(zookeeper_path, with_retries, log)
+{
+    createRootNodes();
+}
+
+/// Finishes the coordination on destruction through the noexcept tryFinishImpl(); its result is ignored here.
+RestoreCoordinationOnCluster::~RestoreCoordinationOnCluster()
+{
+    tryFinishImpl();
+}
+
+/// Creates the node of this restore and the subnodes the other methods of this class write under.
+/// Runs inside a retry loop created with the WithRetries::kInitialization kind.
+void RestoreCoordinationOnCluster::createRootNodes()
+{
+    /// Subnodes of `zookeeper_path`, listed in the order they are created.
+    static constexpr const char * subnodes[] = {
+        "/repl_databases_tables_acquired",
+        "/repl_tables_data_acquired",
+        "/repl_access_storages_acquired",
+        "/repl_sql_objects_acquired",
+        "/keeper_map_tables",
+        "/table_uuids",
+    };
+
+    auto retries = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization);
+    retries.retries_ctl.retryLoop(
+        [&, &keeper = retries.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(keeper);
+
+            keeper->createAncestors(zookeeper_path);
+            keeper->createIfNotExists(zookeeper_path, "");
+
+            for (const char * subnode : subnodes)
+                keeper->createIfNotExists(zookeeper_path + subnode, "");
+        });
+}
+
+/// Publishes `new_stage` (with `message`) for the current host.
+/// If `sync` is set, also waits until the hosts in `all_hosts_without_initiator` reach that stage
+/// and returns what the stage synchronizer returns; otherwise returns an empty list immediately.
+Strings RestoreCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
+{
+    stage_sync.setStage(new_stage, message);
+
+    if (sync)
+        return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
+
+    return {};
+}
+
+/// Records that the restore query was sent to other hosts; finish() and the wait*() methods consult this flag.
+void RestoreCoordinationOnCluster::setRestoreQueryWasSentToOtherHosts()
+{
+    restore_query_was_sent_to_other_hosts.store(true);
+}
+
+/// Hands `exception` over to the stage synchronizer and returns the result it reports.
+bool RestoreCoordinationOnCluster::trySetError(std::exception_ptr exception)
+{
+    return stage_sync.trySetError(exception);
+}
+
+/// Finishes the stage synchronization for this host and, on the initiator, runs the cleaner
+/// when either the other hosts have finished too or the restore query was never sent to them.
+void RestoreCoordinationOnCluster::finish()
+{
+    bool all_other_hosts_finished = false;
+    stage_sync.finish(all_other_hosts_finished);
+
+    /// Only the initiator may run the cleaner.
+    if (current_host != kInitiator)
+        return;
+
+    const bool may_cleanup = all_other_hosts_finished || !restore_query_was_sent_to_other_hosts;
+    if (may_cleanup)
+        cleaner.cleanup();
+}
+
+/// Non-throwing way to finish after an error; simply delegates to tryFinishImpl() and returns its result.
+bool RestoreCoordinationOnCluster::tryFinishAfterError() noexcept
+{
+    return tryFinishImpl();
+}
+
+/// Non-throwing finish: returns false if the stage synchronizer could not finish or if the cleaner
+/// (run only on the initiator, under the same condition as in finish()) failed; returns true otherwise.
+bool RestoreCoordinationOnCluster::tryFinishImpl() noexcept
+{
+    bool all_other_hosts_finished = false;
+    if (!stage_sync.tryFinishAfterError(all_other_hosts_finished))
+        return false;
+
+    const bool should_cleanup
+        = (current_host == kInitiator) && (all_other_hosts_finished || !restore_query_was_sent_to_other_hosts);
+
+    /// Nothing to clean up means success; otherwise the outcome is the outcome of the cleanup.
+    return !should_cleanup || cleaner.tryCleanupAfterError();
+}
+
+/// Waits for the other hosts to finish. Does nothing unless the current host is the initiator
+/// and the restore query was sent to other hosts.
+void RestoreCoordinationOnCluster::waitForOtherHostsToFinish()
+{
+    const bool has_hosts_to_wait_for = (current_host == kInitiator) && restore_query_was_sent_to_other_hosts;
+    if (has_hosts_to_wait_for)
+        stage_sync.waitForOtherHostsToFinish();
+}
+
+/// Non-throwing wait for the other hosts after an error.
+/// Returns false on a non-initiator host, true if the restore query was never sent to other hosts,
+/// and otherwise the result reported by the stage synchronizer.
+bool RestoreCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
+{
+    const bool is_initiator = (current_host == kInitiator);
+    if (!is_initiator)
+        return false;
+
+    return !restore_query_was_sent_to_other_hosts || stage_sync.tryWaitForOtherHostsToFinishAfterError();
+}
+
+/// Builds a ZooKeeperRetriesInfo from the keeper settings: the retry limit used while initializing
+/// plus the initial and maximum backoff, both converted to milliseconds counts.
+ZooKeeperRetriesInfo RestoreCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
+{
+    const auto initial_backoff_ms = static_cast<UInt64>(keeper_settings.retry_initial_backoff_ms.count());
+    const auto max_backoff_ms = static_cast<UInt64>(keeper_settings.retry_max_backoff_ms.count());
+
+    return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing, initial_backoff_ms, max_backoff_ms};
+}
+
+/// Tries to mark this host as the one creating `table_name` in the replicated database at `database_zk_path`.
+/// Returns true if the marker node was created by this call or already holds this host's index,
+/// false if it holds something else.
+bool RestoreCoordinationOnCluster::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
+{
+    const String this_host_index = toString(current_host_index);
+    bool acquired = false;
+
+    auto retries = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
+    retries.retries_ctl.retryLoop(
+        [&, &keeper = retries.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(keeper);
+
+            /// One parent node per database, one marker node per table inside it.
+            const String database_node = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
+            keeper->createIfNotExists(database_node, "");
+
+            const String table_node = database_node + "/" + escapeForFileName(table_name);
+            const auto code = keeper->tryCreate(table_node, this_host_index, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+                acquired = true;
+            else if (code == Coordination::Error::ZNODEEXISTS)
+                acquired = (keeper->get(table_node) == this_host_index); /// check who created that node
+            else
+                throw zkutil::KeeperException::fromPath(code, table_node);
+        });
+
+    return acquired;
+}
+
+/// Tries to mark this host as the one inserting data into the replicated table at `table_zk_path`.
+/// Returns true if the marker node was created by this call or already holds this host's index,
+/// false if it holds something else.
+bool RestoreCoordinationOnCluster::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
+{
+    const String this_host_index = toString(current_host_index);
+    bool acquired = false;
+
+    auto retries = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
+    retries.retries_ctl.retryLoop(
+        [&, &keeper = retries.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(keeper);
+
+            const String marker_path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
+            const auto code = keeper->tryCreate(marker_path, this_host_index, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+                acquired = true;
+            else if (code == Coordination::Error::ZNODEEXISTS)
+                acquired = (keeper->get(marker_path) == this_host_index); /// check who created that node
+            else
+                throw zkutil::KeeperException::fromPath(code, marker_path);
+        });
+
+    return acquired;
+}
+
+/// Tries to mark this host as the one restoring the replicated access storage at `access_storage_zk_path`.
+/// Returns true if the marker node was created by this call or already holds this host's index,
+/// false if it holds something else.
+bool RestoreCoordinationOnCluster::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
+{
+    const String this_host_index = toString(current_host_index);
+    bool acquired = false;
+
+    auto retries = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
+    retries.retries_ctl.retryLoop(
+        [&, &keeper = retries.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(keeper);
+
+            const String marker_path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
+            const auto code = keeper->tryCreate(marker_path, this_host_index, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+                acquired = true;
+            else if (code == Coordination::Error::ZNODEEXISTS)
+                acquired = (keeper->get(marker_path) == this_host_index); /// check who created that node
+            else
+                throw zkutil::KeeperException::fromPath(code, marker_path);
+        });
+
+    return acquired;
+}
+
+/// Tries to mark this host as the one restoring replicated user-defined SQL objects of `object_type`
+/// stored at `loader_zk_path`.
+/// Returns true if the marker node was created by this call or already holds this host's index
+/// (e.g. it was created by an earlier attempt of the retry loop), false if it holds something else.
+bool RestoreCoordinationOnCluster::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
+{
+    bool result = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
+            zk->createIfNotExists(path, "");
+
+            path += "/";
+            switch (object_type)
+            {
+                case UserDefinedSQLObjectType::Function:
+                    path += "functions";
+                    break;
+            }
+
+            /// Store our host index in the node, like the other acquire*() methods do. With an empty value
+            /// the ownership check below could never succeed, so a retry after a lost response to a
+            /// successful create would wrongly report that another host had acquired the objects.
+            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
+            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
+                throw zkutil::KeeperException::fromPath(code, path);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                result = true;
+                return;
+            }
+
+            /// We need to check who created that node
+            result = zk->get(path) == toString(current_host_index);
+        });
+    return result;
+}
+
+/// Tries to take the restore lock for the KeeperMap data stored under `root_zk_path`.
+/// The lock node stores `table_unique_id`; returns true if this call created the node or the node
+/// already holds the same `table_unique_id`, false if it holds a different one.
+/// Throws a KeeperException for any Keeper error other than "node already exists".
+bool RestoreCoordinationOnCluster::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id)
+{
+    bool lock_acquired = false;
+    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            /// we need to remove leading '/' from root_zk_path
+            auto normalized_root_zk_path = root_zk_path.substr(1);
+            std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path);
+            zk->createAncestors(restore_lock_path);
+            auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent);
+
+            if (code == Coordination::Error::ZOK)
+            {
+                lock_acquired = true;
+                return;
+            }
+
+            if (code == Coordination::Error::ZNODEEXISTS)
+                lock_acquired = table_unique_id == zk->get(restore_lock_path);
+            else
+                /// The exception must actually be thrown: merely constructing it would swallow the error.
+                throw zkutil::KeeperException::fromPath(code, restore_lock_path);
+        });
+    return lock_acquired;
+}
+
+/// Makes all hosts use the same UUIDs for the table described by `create_query`.
+/// Freshly generated UUIDs are offered under a node keyed by the serialized query: if this call creates
+/// the node, its own UUIDs are copied into `create_query`; if the node already exists, the UUIDs stored
+/// there are copied instead. Throws a KeeperException for any other Keeper error.
+void RestoreCoordinationOnCluster::generateUUIDForTable(ASTCreateQuery & create_query)
+{
+    String query_str = serializeAST(create_query);
+    CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true};
+    String new_uuids_str = new_uuids.toString();
+
+    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
+    holder.retries_ctl.retryLoop(
+        [&, &zk = holder.faulty_zookeeper]()
+        {
+            with_retries.renewZooKeeper(zk);
+
+            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
+            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);
+
+            if (res == Coordination::Error::ZOK)
+            {
+                new_uuids.copyToQuery(create_query);
+                return;
+            }
+
+            if (res == Coordination::Error::ZNODEEXISTS)
+            {
+                CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query);
+                return;
+            }
+
+            /// The exception must actually be thrown: merely constructing it would leave the query
+            /// without coordinated UUIDs and hide the error.
+            throw zkutil::KeeperException::fromPath(res, path);
+        });
+}
+
+}
--- a/src/Backups/RestoreCoordinationOnCluster.h
+++ b/src/Backups/RestoreCoordinationOnCluster.h
@ -1,6 +1,8 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
+#include <Backups/BackupConcurrencyCheck.h>
+#include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationStageSync.h>
 #include <Backups/WithRetries.h>

@ -9,28 +11,33 @@ namespace DB
 {

 /// Implementation of the IRestoreCoordination interface performing coordination via ZooKeeper. It's necessary for "RESTORE ON CLUSTER".
-class RestoreCoordinationRemote : public IRestoreCoordination
+class RestoreCoordinationOnCluster : public IRestoreCoordination
 {
 public:
-    using RestoreKeeperSettings = WithRetries::KeeperSettings;
+    /// Empty string as the current host is used to mark the initiator of a RESTORE ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;

-    RestoreCoordinationRemote(
-        zkutil::GetZooKeeper get_zookeeper_,
+    RestoreCoordinationOnCluster(
+        const UUID & restore_uuid_,
        const String & root_zookeeper_path_,
-        const RestoreKeeperSettings & keeper_settings_,
-        const String & restore_uuid_,
-        const Strings & all_hosts_,
+        zkutil::GetZooKeeper get_zookeeper_,
+        const BackupKeeperSettings & keeper_settings_,
        const String & current_host_,
-        bool is_internal_,
+        const Strings & all_hosts_,
+        bool allow_concurrent_restore_,
+        BackupConcurrencyCounters & concurrency_counters_,
+        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_);

-    ~RestoreCoordinationRemote() override;
+    ~RestoreCoordinationOnCluster() override;

-    /// Sets the current stage and waits for other hosts to come to this stage too.
-    void setStage(const String & new_stage, const String & message) override;
-    void setError(const Exception & exception) override;
-    Strings waitForStage(const String & stage_to_wait) override;
-    Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
+    Strings setStage(const String & new_stage, const String & message, bool sync) override;
+    void setRestoreQueryWasSentToOtherHosts() override;
+    bool trySetError(std::exception_ptr exception) override;
+    void finish() override;
+    bool tryFinishAfterError() noexcept override;
+    void waitForOtherHostsToFinish() override;
+    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -55,27 +62,27 @@ public:
    /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
    void generateUUIDForTable(ASTCreateQuery & create_query) override;

-    bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
+    ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override;

 private:
    void createRootNodes();
-    void removeAllNodes();
+    bool tryFinishImpl() noexcept;

-    /// get_zookeeper will provide a zookeeper client without any fault injection
-    const zkutil::GetZooKeeper get_zookeeper;
    const String root_zookeeper_path;
-    const RestoreKeeperSettings keeper_settings;
-    const String restore_uuid;
+    const BackupKeeperSettings keeper_settings;
+    const UUID restore_uuid;
    const String zookeeper_path;
    const Strings all_hosts;
+    const Strings all_hosts_without_initiator;
    const String current_host;
    const size_t current_host_index;
-    const bool is_internal;
    LoggerPtr const log;

-    mutable WithRetries with_retries;
-    std::optional<BackupCoordinationStageSync> stage_sync;
-    mutable std::mutex mutex;
+    const WithRetries with_retries;
+    BackupConcurrencyCheck concurrency_check;
+    BackupCoordinationStageSync stage_sync;
+    BackupCoordinationCleaner cleaner;
+    std::atomic<bool> restore_query_was_sent_to_other_hosts = false;
 };

 }
--- a/src/Backups/RestoreCoordinationRemote.cpp
+++ b/src/Backups/RestoreCoordinationRemote.cpp
@ -1,379 +0,0 @@
-#include <Backups/BackupCoordinationRemote.h>
-#include <Backups/BackupCoordinationStage.h>
-#include <Backups/RestoreCoordinationRemote.h>
-#include <Backups/BackupCoordinationStageSync.h>
-#include <Parsers/ASTCreateQuery.h>
-#include <Parsers/CreateQueryUUIDs.h>
-#include <Parsers/formatAST.h>
-#include <Functions/UserDefined/UserDefinedSQLObjectType.h>
-#include <Common/ZooKeeper/KeeperException.h>
-#include <Common/escapeForFileName.h>
-
-
-namespace DB
-{
-
-namespace Stage = BackupCoordinationStage;
-
-RestoreCoordinationRemote::RestoreCoordinationRemote(
-    zkutil::GetZooKeeper get_zookeeper_,
-    const String & root_zookeeper_path_,
-    const RestoreKeeperSettings & keeper_settings_,
-    const String & restore_uuid_,
-    const Strings & all_hosts_,
-    const String & current_host_,
-    bool is_internal_,
-    QueryStatusPtr process_list_element_)
-    : get_zookeeper(get_zookeeper_)
-    , root_zookeeper_path(root_zookeeper_path_)
-    , keeper_settings(keeper_settings_)
-    , restore_uuid(restore_uuid_)
-    , zookeeper_path(root_zookeeper_path_ + "/restore-" + restore_uuid_)
-    , all_hosts(all_hosts_)
-    , current_host(current_host_)
-    , current_host_index(BackupCoordinationRemote::findCurrentHostIndex(all_hosts, current_host))
-    , is_internal(is_internal_)
-    , log(getLogger("RestoreCoordinationRemote"))
-    , with_retries(
-        log,
-        get_zookeeper_,
-        keeper_settings,
-        process_list_element_,
-        [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal]
-        (WithRetries::FaultyKeeper & zk)
-        {
-            /// Recreate this ephemeral node to signal that we are alive.
-            if (my_is_internal)
-            {
-                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
-
-                /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically.
-                zk->tryRemove(alive_node_path);
-
-                zk->createAncestors(alive_node_path);
-                zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            }
-        })
-{
-    createRootNodes();
-
-    stage_sync.emplace(
-        zookeeper_path,
-        with_retries,
-        log);
-}
-
-RestoreCoordinationRemote::~RestoreCoordinationRemote()
-{
-    try
-    {
-        if (!is_internal)
-            removeAllNodes();
-    }
-    catch (...)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
-    }
-}
-
-void RestoreCoordinationRemote::createRootNodes()
-{
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-            zk->createAncestors(zookeeper_path);
-
-            Coordination::Requests ops;
-            Coordination::Responses responses;
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_databases_tables_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_tables_data_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access_storages_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects_acquired", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent));
-            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/table_uuids", "", zkutil::CreateMode::Persistent));
-            zk->tryMulti(ops, responses);
-        });
-}
-
-void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message)
-{
-    if (is_internal)
-        stage_sync->set(current_host, new_stage, message);
-    else
-        stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true);
-}
-
-void RestoreCoordinationRemote::setError(const Exception & exception)
-{
-    stage_sync->setError(current_host, exception);
-}
-
-Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait)
-{
-    return stage_sync->wait(all_hosts, stage_to_wait);
-}
-
-Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout)
-{
-    return stage_sync->waitFor(all_hosts, stage_to_wait, timeout);
-}
-
-bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path);
-            zk->createIfNotExists(path, "");
-
-            path += "/" + escapeForFileName(table_name);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path);
-            auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result = zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type)
-{
-    bool result = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path);
-            zk->createIfNotExists(path, "");
-
-            path += "/";
-            switch (object_type)
-            {
-                case UserDefinedSQLObjectType::Function:
-                    path += "functions";
-                    break;
-            }
-
-            auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent);
-            if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
-                throw zkutil::KeeperException::fromPath(code, path);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                result = true;
-                return;
-            }
-
-            /// We need to check who created that node
-            result =  zk->get(path) == toString(current_host_index);
-        });
-    return result;
-}
-
-bool RestoreCoordinationRemote::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id)
-{
-    bool lock_acquired = false;
-    auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            /// we need to remove leading '/' from root_zk_path
-            auto normalized_root_zk_path = root_zk_path.substr(1);
-            std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path);
-            zk->createAncestors(restore_lock_path);
-            auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent);
-
-            if (code == Coordination::Error::ZOK)
-            {
-                lock_acquired = true;
-                return;
-            }
-
-            if (code == Coordination::Error::ZNODEEXISTS)
-                lock_acquired = table_unique_id == zk->get(restore_lock_path);
-            else
-                zkutil::KeeperException::fromPath(code, restore_lock_path);
-        });
-    return lock_acquired;
-}
-
-void RestoreCoordinationRemote::generateUUIDForTable(ASTCreateQuery & create_query)
-{
-    String query_str = serializeAST(create_query);
-    CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true};
-    String new_uuids_str = new_uuids.toString();
-
-    auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str);
-            Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent);
-
-            if (res == Coordination::Error::ZOK)
-            {
-                new_uuids.copyToQuery(create_query);
-                return;
-            }
-
-            if (res == Coordination::Error::ZNODEEXISTS)
-            {
-                CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query);
-                return;
-            }
-
-            zkutil::KeeperException::fromPath(res, path);
-        });
-}
-
-void RestoreCoordinationRemote::removeAllNodes()
-{
-    /// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore.
-    ///
-    /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query
-    /// while some hosts are still restoring something. Removing all the nodes will remove the parent node of the restore coordination
-    /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part
-    /// of their restore work before that.
-
-    auto holder = with_retries.createRetriesControlHolder("removeAllNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-            zk->removeRecursive(zookeeper_path);
-        });
-}
-
-bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t> &) const
-{
-    /// If its internal concurrency will be checked for the base restore
-    if (is_internal)
-        return false;
-
-    bool result = false;
-    std::string path = zookeeper_path + "/stage";
-
-    auto holder = with_retries.createRetriesControlHolder("createRootNodes");
-    holder.retries_ctl.retryLoop(
-        [&, &zk = holder.faulty_zookeeper]()
-        {
-            with_retries.renewZooKeeper(zk);
-
-            if (! zk->exists(root_zookeeper_path))
-                zk->createAncestors(root_zookeeper_path);
-
-            for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-            {
-                Coordination::Stat stat;
-                zk->get(root_zookeeper_path, &stat);
-                Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
-                for (const auto & existing_restore_path : existing_restore_paths)
-                {
-                    if (startsWith(existing_restore_path, "backup-"))
-                        continue;
-
-                    String existing_restore_uuid = existing_restore_path;
-                    existing_restore_uuid.erase(0, String("restore-").size());
-
-                    if (existing_restore_uuid == toString(restore_uuid))
-                        continue;
-
-                    String status;
-                    if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
-                    {
-                        /// Check if some other restore is in progress
-                        if (status == Stage::SCHEDULED_TO_START)
-                        {
-                            LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
-                            result = true;
-                            return;
-                        }
-                    }
-                }
-
-                zk->createIfNotExists(path, "");
-                auto code = zk->trySet(path, Stage::SCHEDULED_TO_START, stat.version);
-                if (code == Coordination::Error::ZOK)
-                    break;
-                bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-                if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                    throw zkutil::KeeperException::fromPath(code, path);
-            }
-        });
-
-    return result;
-}
-
-}
--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@ -100,7 +100,6 @@ RestorerFromBackup::RestorerFromBackup(
    , context(context_)
    , process_list_element(context->getProcessListElement())
    , after_task_callback(after_task_callback_)
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000))
    , log(getLogger("RestorerFromBackup"))
    , tables_dependencies("RestorerFromBackup")
@ -119,12 +118,14 @@ RestorerFromBackup::~RestorerFromBackup()
    }
 }

-void RestorerFromBackup::run(Mode mode)
+void RestorerFromBackup::run(Mode mode_)
 {
    /// run() can be called only once.
    if (!current_stage.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");

+    mode = mode_;
+
    /// Find other hosts working along with us to execute this ON CLUSTER query.
    all_hosts = BackupSettings::Util::filterHostIDs(
        restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
@ -139,6 +140,7 @@ void RestorerFromBackup::run(Mode mode)
    setStage(Stage::FINDING_TABLES_IN_BACKUP);
    findDatabasesAndTablesInBackup();
    waitFutures();
+    logNumberOfDatabasesAndTablesToRestore();

    /// Check access rights.
    setStage(Stage::CHECKING_ACCESS_RIGHTS);
@ -228,20 +230,8 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa

    if (restore_coordination)
    {
-        restore_coordination->setStage(new_stage, message);
-
-        /// The initiator of a RESTORE ON CLUSTER query waits for other hosts to complete their work (see waitForStage(Stage::COMPLETED) in BackupsWorker::doRestore),
-        /// but other hosts shouldn't wait for each others' completion. (That's simply unnecessary and also
-        /// the initiator may start cleaning up (e.g. removing restore-coordination ZooKeeper nodes) once all other hosts are in Stage::COMPLETED.)
-        bool need_wait = (new_stage != Stage::COMPLETED);
-
-        if (need_wait)
-        {
-            if (new_stage == Stage::FINDING_TABLES_IN_BACKUP)
-                restore_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-            else
-                restore_coordination->waitForStage(new_stage);
-        }
+        /// There is no need to sync Stage::COMPLETED with other hosts because it's the last stage.
+        restore_coordination->setStage(new_stage, message, /* sync = */ (new_stage != Stage::COMPLETED));
    }
 }

@ -384,8 +374,12 @@ void RestorerFromBackup::findDatabasesAndTablesInBackup()
            }
        }
    }
+}

-    LOG_INFO(log, "Will restore {} databases and {} tables", getNumDatabases(), getNumTables());
+void RestorerFromBackup::logNumberOfDatabasesAndTablesToRestore() const
+{
+    std::string_view action = (mode == CHECK_ACCESS_ONLY) ? "check access rights for restoring" : "restore";
+    LOG_INFO(log, "Will {} {} databases and {} tables", action, getNumDatabases(), getNumTables());
 }

 void RestorerFromBackup::findTableInBackup(const QualifiedTableName & table_name_in_backup, bool skip_if_inner_table, const std::optional<ASTs> & partitions)
--- a/src/Backups/RestorerFromBackup.h
+++ b/src/Backups/RestorerFromBackup.h
@ -53,7 +53,7 @@ public:
    using DataRestoreTasks = std::vector<DataRestoreTask>;

    /// Restores the metadata of databases and tables and returns tasks to restore the data of tables.
-    void run(Mode mode);
+    void run(Mode mode_);

    BackupPtr getBackup() const { return backup; }
    const RestoreSettings & getRestoreSettings() const { return restore_settings; }
@ -80,10 +80,10 @@ private:
    ContextMutablePtr context;
    QueryStatusPtr process_list_element;
    std::function<void()> after_task_callback;
-    std::chrono::milliseconds on_cluster_first_sync_timeout;
    std::chrono::milliseconds create_table_timeout;
    LoggerPtr log;

+    Mode mode = Mode::RESTORE;
    Strings all_hosts;
    DDLRenamingMap renaming_map;
    std::vector<std::filesystem::path> root_paths_in_backup;
@ -97,6 +97,7 @@ private:
    void findDatabaseInBackupImpl(const String & database_name_in_backup, const std::set<DatabaseAndTableName> & except_table_names);
    void findEverythingInBackup(const std::set<String> & except_database_names, const std::set<DatabaseAndTableName> & except_table_names);

+    void logNumberOfDatabasesAndTablesToRestore() const;
    size_t getNumDatabases() const;
    size_t getNumTables() const;

--- a/src/Backups/WithRetries.cpp
+++ b/src/Backups/WithRetries.cpp
@ -1,57 +1,34 @@
 #include <Backups/WithRetries.h>
-#include <Core/Settings.h>

 #include <mutex>

+
 namespace DB
 {
-namespace Setting
-{
-    extern const SettingsUInt64 backup_restore_keeper_max_retries;
-    extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms;
-    extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms;
-    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread;
-    extern const SettingsFloat backup_restore_keeper_fault_injection_probability;
-    extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed;
-    extern const SettingsUInt64 backup_restore_keeper_value_max_size;
-    extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi;
-}
-
-WithRetries::KeeperSettings WithRetries::KeeperSettings::fromContext(ContextPtr context)
-{
-    return
-    {
-        .keeper_max_retries = context->getSettingsRef()[Setting::backup_restore_keeper_max_retries],
-        .keeper_retry_initial_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_initial_backoff_ms],
-        .keeper_retry_max_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_max_backoff_ms],
-        .batch_size_for_keeper_multiread = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multiread],
-        .keeper_fault_injection_probability = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_probability],
-        .keeper_fault_injection_seed = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_seed],
-        .keeper_value_max_size = context->getSettingsRef()[Setting::backup_restore_keeper_value_max_size],
-        .batch_size_for_keeper_multi = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multi],
-    };
-}

 WithRetries::WithRetries(
-    LoggerPtr log_, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_)
+    LoggerPtr log_, zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_)
    : log(log_)
    , get_zookeeper(get_zookeeper_)
    , settings(settings_)
    , process_list_element(process_list_element_)
    , callback(callback_)
-    , global_zookeeper_retries_info(
-          settings.keeper_max_retries, settings.keeper_retry_initial_backoff_ms, settings.keeper_retry_max_backoff_ms)
 {}

-WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name)
-    : info(parent->global_zookeeper_retries_info)
-    , retries_ctl(name, parent->log, info, parent->process_list_element)
+WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name, Kind kind)
+    : info(  (kind == kInitialization) ? parent->settings.max_retries_while_initializing
+           : (kind == kErrorHandling)  ? parent->settings.max_retries_while_handling_error
+                                       : parent->settings.max_retries,
+           parent->settings.retry_initial_backoff_ms.count(),
+           parent->settings.retry_max_backoff_ms.count())
+    /// We don't use process_list_element while handling an error because error handling must not be cancellable.
+    , retries_ctl(name, parent->log, info, (kind == kErrorHandling) ? nullptr : parent->process_list_element)
    , faulty_zookeeper(parent->getFaultyZooKeeper())
 {}

-WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name)
+WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name, Kind kind) const
 {
-    return RetriesControlHolder(this, name);
+    return RetriesControlHolder(this, name, kind);
 }

 void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
@ -62,8 +39,8 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
    {
        zookeeper = get_zookeeper();
        my_faulty_zookeeper->setKeeper(zookeeper);
-
-        callback(my_faulty_zookeeper);
+        if (callback)
+            callback(my_faulty_zookeeper);
    }
    else
    {
@ -71,7 +48,7 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const
    }
 }

-const WithRetries::KeeperSettings & WithRetries::getKeeperSettings() const
+const BackupKeeperSettings & WithRetries::getKeeperSettings() const
 {
    return settings;
 }
@ -88,8 +65,8 @@ WithRetries::FaultyKeeper WithRetries::getFaultyZooKeeper() const
    /// The reason is that ZooKeeperWithFaultInjection may reset the underlying pointer and there could be a race condition
    /// when the same object is used from multiple threads.
    auto faulty_zookeeper = ZooKeeperWithFaultInjection::createInstance(
-        settings.keeper_fault_injection_probability,
-        settings.keeper_fault_injection_seed,
+        settings.fault_injection_probability,
+        settings.fault_injection_seed,
        current_zookeeper,
        log->name(),
        log);
--- a/src/Backups/WithRetries.h
+++ b/src/Backups/WithRetries.h
@ -1,9 +1,11 @@
 #pragma once

-#include <Common/ZooKeeper/ZooKeeperRetries.h>
+#include <Backups/BackupKeeperSettings.h>
 #include <Common/ZooKeeper/Common.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/ZooKeeper/ZooKeeperWithFaultInjection.h>

+
 namespace DB
 {

@ -15,20 +17,13 @@ class WithRetries
 {
 public:
    using FaultyKeeper = Coordination::ZooKeeperWithFaultInjection::Ptr;
-    using RenewerCallback = std::function<void(FaultyKeeper &)>;
+    using RenewerCallback = std::function<void(FaultyKeeper)>;

-    struct KeeperSettings
+    enum Kind
    {
-        UInt64 keeper_max_retries{0};
-        UInt64 keeper_retry_initial_backoff_ms{0};
-        UInt64 keeper_retry_max_backoff_ms{0};
-        UInt64 batch_size_for_keeper_multiread{10000};
-        Float64 keeper_fault_injection_probability{0};
-        UInt64 keeper_fault_injection_seed{42};
-        UInt64 keeper_value_max_size{1048576};
-        UInt64 batch_size_for_keeper_multi{1000};
-
-        static KeeperSettings fromContext(ContextPtr context);
+        kNormal,
+        kInitialization,
+        kErrorHandling,
    };

    /// For simplicity a separate ZooKeeperRetriesInfo and a faulty [Zoo]Keeper client
@ -48,23 +43,23 @@ public:

    private:
        friend class WithRetries;
-        RetriesControlHolder(const WithRetries * parent, const String & name);
+        RetriesControlHolder(const WithRetries * parent, const String & name, Kind kind);
    };

-    RetriesControlHolder createRetriesControlHolder(const String & name);
-    WithRetries(LoggerPtr log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback);
+    RetriesControlHolder createRetriesControlHolder(const String & name, Kind kind = Kind::kNormal) const;
+    WithRetries(LoggerPtr log, zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback = {});

    /// Used to re-establish new connection inside a retry loop.
    void renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const;

-    const KeeperSettings & getKeeperSettings() const;
+    const BackupKeeperSettings & getKeeperSettings() const;
 private:
    /// This will provide a special wrapper which is useful for testing
    FaultyKeeper getFaultyZooKeeper() const;

    LoggerPtr log;
    zkutil::GetZooKeeper get_zookeeper;
-    KeeperSettings settings;
+    BackupKeeperSettings settings;
    QueryStatusPtr process_list_element;

    /// This callback is called each time when a new [Zoo]Keeper session is created.
@ -76,7 +71,6 @@ private:
    /// it could lead just to a failed backup which could possibly be successful
    /// if there were a little bit more retries.
    RenewerCallback callback;
-    ZooKeeperRetriesInfo global_zookeeper_retries_info;

    /// This is needed only to protect zookeeper object
    mutable std::mutex zookeeper_mutex;
--- a/src/Common/Exception.cpp
+++ b/src/Common/Exception.cpp
@ -627,7 +627,7 @@ PreformattedMessage getExceptionMessageAndPattern(const Exception & e, bool with
    return PreformattedMessage{stream.str(), e.tryGetMessageFormatString(), e.getMessageFormatStringArgs()};
 }

-std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace)
+std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace, bool check_embedded_stacktrace)
 {
    try
    {
@ -635,7 +635,7 @@ std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace)
    }
    catch (...)
    {
-        return getCurrentExceptionMessage(with_stacktrace);
+        return getCurrentExceptionMessage(with_stacktrace, check_embedded_stacktrace);
    }
 }

--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@ -329,7 +329,7 @@ void tryLogException(std::exception_ptr e, const AtomicLogger & logger, const st

 std::string getExceptionMessage(const Exception & e, bool with_stacktrace, bool check_embedded_stacktrace = false);
 PreformattedMessage getExceptionMessageAndPattern(const Exception & e, bool with_stacktrace, bool check_embedded_stacktrace = false);
-std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace);
+std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace, bool check_embedded_stacktrace = false);


 template <typename T>
--- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp
@ -176,6 +176,10 @@ void ZooKeeperArgs::initFromKeeperSection(const Poco::Util::AbstractConfiguratio
        {
            connection_timeout_ms = config.getInt(config_name + "." + key);
        }
+        else if (key == "num_connection_retries")
+        {
+            num_connection_retries = config.getInt(config_name + "." + key);
+        }
        else if (key == "enable_fault_injections_during_startup")
        {
            enable_fault_injections_during_startup = config.getBool(config_name + "." + key);
--- a/src/Common/ZooKeeper/ZooKeeperArgs.h
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.h
@ -39,6 +39,7 @@ struct ZooKeeperArgs
    String sessions_path = "/clickhouse/sessions";
    String client_availability_zone;
    int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS;
+    UInt64 num_connection_retries = 2;
    int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
    int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;
    bool enable_fault_injections_during_startup = false;
--- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp
@ -440,7 +440,9 @@ void ZooKeeper::connect(
    if (nodes.empty())
        throw Exception::fromMessage(Error::ZBADARGUMENTS, "No nodes passed to ZooKeeper constructor");

-    static constexpr size_t num_tries = 3;
+    /// We always have at least one attempt to connect.
+    size_t num_tries = args.num_connection_retries + 1;
+
    bool connected = false;
    bool dns_error = false;

--- a/src/Common/ZooKeeper/ZooKeeperRetries.h
+++ b/src/Common/ZooKeeper/ZooKeeperRetries.h
@ -15,14 +15,15 @@ namespace ErrorCodes

 struct ZooKeeperRetriesInfo
 {
+    ZooKeeperRetriesInfo() = default;
    ZooKeeperRetriesInfo(UInt64 max_retries_, UInt64 initial_backoff_ms_, UInt64 max_backoff_ms_)
        : max_retries(max_retries_), initial_backoff_ms(std::min(initial_backoff_ms_, max_backoff_ms_)), max_backoff_ms(max_backoff_ms_)
    {
    }

-    UInt64 max_retries;
-    UInt64 initial_backoff_ms;
-    UInt64 max_backoff_ms;
+    UInt64 max_retries = 0; /// "max_retries = 0" means only one attempt.
+    UInt64 initial_backoff_ms = 100;
+    UInt64 max_backoff_ms = 5000;
 };

 class ZooKeeperRetriesControl
@ -220,6 +221,7 @@ private:
            return false;
        }

+        /// Check if the query was cancelled.
        if (process_list_element)
            process_list_element->checkTimeLimit();

@ -228,6 +230,10 @@ private:
        sleepForMilliseconds(current_backoff_ms);
        current_backoff_ms = std::min(current_backoff_ms * 2, retries_info.max_backoff_ms);

+        /// Check once more, after sleeping, whether the query was cancelled.
+        if (process_list_element)
+            process_list_element->checkTimeLimit();
+
        return true;
    }

--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@ -2665,29 +2665,44 @@ The maximum amount of data consumed by temporary files on disk in bytes for all
 The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running queries. Zero means unlimited.
 )", 0)\
    \
-    DECLARE(UInt64, backup_restore_keeper_max_retries, 20, R"(
-Max retries for keeper operations during backup or restore
+    DECLARE(UInt64, backup_restore_keeper_max_retries, 1000, R"(
+Max retries for [Zoo]Keeper operations in the middle of a BACKUP or RESTORE operation.
+Should be big enough so the whole operation won't fail because of a temporary [Zoo]Keeper failure.
 )", 0) \
    DECLARE(UInt64, backup_restore_keeper_retry_initial_backoff_ms, 100, R"(
 Initial backoff timeout for [Zoo]Keeper operations during backup or restore
 )", 0) \
    DECLARE(UInt64, backup_restore_keeper_retry_max_backoff_ms, 5000, R"(
 Max backoff timeout for [Zoo]Keeper operations during backup or restore
+)", 0) \
+    DECLARE(UInt64, backup_restore_failure_after_host_disconnected_for_seconds, 3600, R"(
+If a host during a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation doesn't recreate its ephemeral 'alive' node in ZooKeeper for this amount of time then the whole backup or restore is considered as failed.
+This value should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure.
+Zero means unlimited.
+)", 0) \
+    DECLARE(UInt64, backup_restore_keeper_max_retries_while_initializing, 20, R"(
+Max retries for [Zoo]Keeper operations during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+)", 0) \
+    DECLARE(UInt64, backup_restore_keeper_max_retries_while_handling_error, 20, R"(
+Max retries for [Zoo]Keeper operations while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+)", 0) \
+    DECLARE(UInt64, backup_restore_finish_timeout_after_error_sec, 180, R"(
+How long the initiator should wait for other host to react to the 'error' node and stop their work on the current BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+)", 0) \
+    DECLARE(UInt64, backup_restore_keeper_value_max_size, 1048576, R"(
+Maximum size of data of a [Zoo]Keeper's node during backup
+)", 0) \
+    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"(
+Maximum size of batch for multi request to [Zoo]Keeper during backup or restore
+)", 0) \
+    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"(
+Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore
 )", 0) \
    DECLARE(Float, backup_restore_keeper_fault_injection_probability, 0.0f, R"(
 Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f]
 )", 0) \
    DECLARE(UInt64, backup_restore_keeper_fault_injection_seed, 0, R"(
 0 - random seed, otherwise the setting value
-)", 0) \
-    DECLARE(UInt64, backup_restore_keeper_value_max_size, 1048576, R"(
-Maximum size of data of a [Zoo]Keeper's node during backup
-)", 0) \
-    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"(
-Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore
-)", 0) \
-    DECLARE(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"(
-Maximum size of batch for multi request to [Zoo]Keeper during backup or restore
 )", 0) \
    DECLARE(UInt64, backup_restore_s3_retry_attempts, 1000, R"(
 Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore.
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@ -114,6 +114,11 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
            {"date_time_64_output_format_cut_trailing_zeros_align_to_groups_of_thousands", false, false, "Dynamically trim the trailing zeros of datetime64 values to adjust the output scale to (0, 3, 6), corresponding to 'seconds', 'milliseconds', and 'microseconds'."},
            {"azure_check_objects_after_upload", false, false, "Check each uploaded object in azure blob storage to be sure that upload was successful"},
+            {"backup_restore_keeper_max_retries", 20, 1000, "Should be big enough so the whole operation BACKUP or RESTORE operation won't fail because of a temporary [Zoo]Keeper failure in the middle of it."},
+            {"backup_restore_failure_after_host_disconnected_for_seconds", 0, 3600, "New setting."},
+            {"backup_restore_keeper_max_retries_while_initializing", 0, 20, "New setting."},
+            {"backup_restore_keeper_max_retries_while_handling_error", 0, 20, "New setting."},
+            {"backup_restore_finish_timeout_after_error_sec", 0, 180, "New setting."},
        }
    },
    {"24.9",
--- a/src/Databases/DatabaseReplicatedWorker.cpp
+++ b/src/Databases/DatabaseReplicatedWorker.cpp
@ -199,13 +199,12 @@ void DatabaseReplicatedDDLWorker::initializeReplication()
    active_node_holder = zkutil::EphemeralNodeHolder::existing(active_path, *active_node_holder_zookeeper);
 }

-String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry)
+String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo &, QueryStatusPtr)
 {
    auto zookeeper = getAndSetZooKeeper();
    return enqueueQueryImpl(zookeeper, entry, database);
 }

-
 bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeout_ms)
 {
    auto zookeeper = getAndSetZooKeeper();
--- a/src/Databases/DatabaseReplicatedWorker.h
+++ b/src/Databases/DatabaseReplicatedWorker.h
@ -24,7 +24,7 @@ class DatabaseReplicatedDDLWorker : public DDLWorker
 public:
    DatabaseReplicatedDDLWorker(DatabaseReplicated * db, ContextPtr context_);

-    String enqueueQuery(DDLLogEntry & entry) override;
+    String enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo &, QueryStatusPtr) override;

    String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, ContextPtr query_context);

--- a/src/Interpreters/DDLWorker.cpp
+++ b/src/Interpreters/DDLWorker.cpp
@ -26,6 +26,7 @@
 #include <Common/ZooKeeper/KeeperException.h>
 #include <Common/ZooKeeper/ZooKeeper.h>
 #include <Common/ZooKeeper/ZooKeeperLock.h>
+#include <Common/ZooKeeper/ZooKeeperRetries.h>
 #include <Common/isLocalAddress.h>
 #include <Common/logger_useful.h>
 #include <Common/randomSeed.h>
@ -1053,7 +1054,25 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP
 }


-String DDLWorker::enqueueQuery(DDLLogEntry & entry)
+String DDLWorker::enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo & retries_info, QueryStatusPtr process_list_element)
+{
+    String node_path;
+    if (retries_info.max_retries > 0)
+    {
+        ZooKeeperRetriesControl retries_ctl{"DDLWorker::enqueueQuery", log, retries_info, process_list_element};
+        retries_ctl.retryLoop([&]{
+            node_path = enqueueQueryAttempt(entry);
+        });
+    }
+    else
+    {
+        node_path = enqueueQueryAttempt(entry);
+    }
+    return node_path;
+}
+
+
+String DDLWorker::enqueueQueryAttempt(DDLLogEntry & entry)
 {
    if (entry.hosts.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty host list in a distributed DDL task");
--- a/src/Interpreters/DDLWorker.h
+++ b/src/Interpreters/DDLWorker.h
@ -48,6 +48,9 @@ struct DDLTaskBase;
 using DDLTaskPtr = std::unique_ptr<DDLTaskBase>;
 using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
 class AccessRightsElements;
+struct ZooKeeperRetriesInfo;
+class QueryStatus;
+using QueryStatusPtr = std::shared_ptr<QueryStatus>;

 class DDLWorker
 {
@ -65,7 +68,7 @@ public:
    virtual ~DDLWorker();

    /// Pushes query into DDL queue, returns path to created node
-    virtual String enqueueQuery(DDLLogEntry & entry);
+    virtual String enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo & retries_info, QueryStatusPtr process_list_element);

    /// Host ID (name:port) for logging purposes
    /// Note that in each task hosts are identified individually by name:port from initiator server cluster config
@ -120,6 +123,9 @@ protected:
        mutable std::shared_mutex mtx;
    };

+    /// Makes one attempt to push a query into the DDL queue (enqueueQuery() may call it repeatedly), returns path to created node
+    String enqueueQueryAttempt(DDLLogEntry & entry);
+
    /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks
    void scheduleTasks(bool reinitialized);

--- a/src/Interpreters/InterpreterBackupQuery.cpp
+++ b/src/Interpreters/InterpreterBackupQuery.cpp
@ -2,6 +2,8 @@
 #include <Interpreters/InterpreterBackupQuery.h>

 #include <Backups/BackupsWorker.h>
+#include <Backups/BackupSettings.h>
+#include <Parsers/ASTBackupQuery.h>
 #include <Columns/ColumnString.h>
 #include <Columns/ColumnsNumber.h>
 #include <DataTypes/DataTypeEnum.h>
@ -18,13 +20,13 @@ namespace DB

 namespace
 {
-    Block getResultRow(const BackupOperationInfo & info)
+    Block getResultRow(const String & id, BackupStatus status)
    {
        auto column_id = ColumnString::create();
        auto column_status = ColumnInt8::create();

-        column_id->insert(info.id);
-        column_status->insert(static_cast<Int8>(info.status));
+        column_id->insert(id);
+        column_status->insert(static_cast<Int8>(status));

        Block res_columns;
        res_columns.insert(0, {std::move(column_id), std::make_shared<DataTypeString>(), "id"});
@ -36,15 +38,18 @@ namespace

 BlockIO InterpreterBackupQuery::execute()
 {
+    const ASTBackupQuery & backup_query = query_ptr->as<const ASTBackupQuery &>();
    auto & backups_worker = context->getBackupsWorker();
-    auto id = backups_worker.start(query_ptr, context);

-    auto info = backups_worker.getInfo(id);
-    if (info.exception)
-        std::rethrow_exception(info.exception);
+    auto [id, status] = backups_worker.start(query_ptr, context);
+
+    /// Wait if it's a synchronous operation.
+    bool async = BackupSettings::isAsync(backup_query);
+    if (!async)
+        status = backups_worker.wait(id);

    BlockIO res_io;
-    res_io.pipeline = QueryPipeline(std::make_shared<SourceFromSingleChunk>(getResultRow(info)));
+    res_io.pipeline = QueryPipeline(std::make_shared<SourceFromSingleChunk>(getResultRow(id, status)));
    return res_io;
 }

--- a/src/Interpreters/ProcessList.cpp
+++ b/src/Interpreters/ProcessList.cpp
@ -447,12 +447,16 @@ void QueryStatus::ExecutorHolder::remove()
    executor = nullptr;
 }

-CancellationCode QueryStatus::cancelQuery(bool)
+CancellationCode QueryStatus::cancelQuery(bool /* kill */, std::exception_ptr exception)
 {
-    if (is_killed.load())
+    if (is_killed.exchange(true))
        return CancellationCode::CancelSent;

-    is_killed.store(true);
+    {
+        std::lock_guard lock{cancellation_exception_mutex};
+        if (!cancellation_exception)
+            cancellation_exception = exception;
+    }

    std::vector<ExecutorHolderPtr> executors_snapshot;

@ -486,7 +490,7 @@ void QueryStatus::addPipelineExecutor(PipelineExecutor * e)
    /// addPipelineExecutor() from the cancelQuery() context, and this will
    /// lead to deadlock.
    if (is_killed.load())
-        throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+        throwQueryWasCancelled();

    std::lock_guard lock(executors_mutex);
    assert(!executors.contains(e));
@ -512,11 +516,20 @@ void QueryStatus::removePipelineExecutor(PipelineExecutor * e)
 bool QueryStatus::checkTimeLimit()
 {
    if (is_killed.load())
-        throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+        throwQueryWasCancelled();

    return limits.checkTimeLimit(watch, overflow_mode);
 }

+void QueryStatus::throwQueryWasCancelled() const
+{
+    std::lock_guard lock{cancellation_exception_mutex};
+    if (cancellation_exception)
+        std::rethrow_exception(cancellation_exception);
+    else
+        throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+}
+
 bool QueryStatus::checkTimeLimitSoft()
 {
    if (is_killed.load())
--- a/src/Interpreters/ProcessList.h
+++ b/src/Interpreters/ProcessList.h
@ -109,6 +109,9 @@ protected:
    /// KILL was sent to the query
    std::atomic<bool> is_killed { false };

+    std::exception_ptr cancellation_exception TSA_GUARDED_BY(cancellation_exception_mutex);
+    mutable std::mutex cancellation_exception_mutex;
+
    /// All data to the client already had been sent.
    /// Including EndOfStream or Exception.
    std::atomic<bool> is_all_data_sent { false };
@ -127,6 +130,8 @@ protected:
    /// A weak pointer is used here because it's a ProcessListEntry which owns this QueryStatus, and not vice versa.
    void setProcessListEntry(std::weak_ptr<ProcessListEntry> process_list_entry_);

+    [[noreturn]] void throwQueryWasCancelled() const;
+
    mutable std::mutex executors_mutex;

    struct ExecutorHolder
@ -225,7 +230,9 @@ public:

    QueryStatusInfo getInfo(bool get_thread_list = false, bool get_profile_events = false, bool get_settings = false) const;

-    CancellationCode cancelQuery(bool kill);
+    /// Cancels the current query.
+    /// Optional argument `exception` allows setting an exception which checkTimeLimit() will throw instead of "QUERY_WAS_CANCELLED".
+    CancellationCode cancelQuery(bool kill, std::exception_ptr exception = nullptr);

    bool isKilled() const { return is_killed; }

--- a/src/Interpreters/executeDDLQueryOnCluster.cpp
+++ b/src/Interpreters/executeDDLQueryOnCluster.cpp
@ -189,7 +189,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context,
    entry.setSettingsIfRequired(context);
    entry.tracing_context = OpenTelemetry::CurrentContext();
    entry.initial_query_id = context->getClientInfo().initial_query_id;
-    String node_path = ddl_worker.enqueueQuery(entry);
+    String node_path = ddl_worker.enqueueQuery(entry, params.retries_info, context->getProcessListElement());

    return getDDLOnClusterStatus(node_path, ddl_worker.getReplicasDir(), entry, context);
 }
--- a/src/Interpreters/executeDDLQueryOnCluster.h
+++ b/src/Interpreters/executeDDLQueryOnCluster.h
@ -37,6 +37,9 @@ struct DDLQueryOnClusterParams

    /// Privileges which the current user should have to execute a query.
    AccessRightsElements access_to_check;
+
+    /// Use retries when creating nodes "query-0000000000", "query-0000000001", "query-0000000002" in ZooKeeper.
+    ZooKeeperRetriesInfo retries_info;
 };

 /// Pushes distributed DDL query to the queue.
--- a/src/Storages/StorageKeeperMap.cpp
+++ b/src/Storages/StorageKeeperMap.cpp
@ -889,7 +889,7 @@ private:
            }
        };

-        auto max_multiread_size = with_retries->getKeeperSettings().batch_size_for_keeper_multiread;
+        auto max_multiread_size = with_retries->getKeeperSettings().batch_size_for_multiread;

        auto keys_it = data_children.begin();
        while (keys_it != data_children.end())
@ -941,9 +941,8 @@ void StorageKeeperMap::backupData(BackupEntriesCollector & backup_entries_collec
        (
            getLogger(fmt::format("StorageKeeperMapBackup ({})", getStorageID().getNameForLogs())),
            [&] { return getClient(); },
-            WithRetries::KeeperSettings::fromContext(backup_entries_collector.getContext()),
-            backup_entries_collector.getContext()->getProcessListElement(),
-            [](WithRetries::FaultyKeeper &) {}
+            BackupKeeperSettings::fromContext(backup_entries_collector.getContext()),
+            backup_entries_collector.getContext()->getProcessListElement()
        );

        backup_entries_collector.addBackupEntries(
@ -972,9 +971,8 @@ void StorageKeeperMap::restoreDataFromBackup(RestorerFromBackup & restorer, cons
    (
        getLogger(fmt::format("StorageKeeperMapRestore ({})", getStorageID().getNameForLogs())),
        [&] { return getClient(); },
-        WithRetries::KeeperSettings::fromContext(restorer.getContext()),
-        restorer.getContext()->getProcessListElement(),
-        [](WithRetries::FaultyKeeper &) {}
+        BackupKeeperSettings::fromContext(restorer.getContext()),
+        restorer.getContext()->getProcessListElement()
    );

    bool allow_non_empty_tables = restorer.isNonEmptyTableAllowed();
@ -1037,7 +1035,7 @@ void StorageKeeperMap::restoreDataImpl(
    CompressedReadBufferFromFile compressed_in{std::move(in_from_file)};
    fs::path data_path_fs(zk_data_path);

-    auto max_multi_size = with_retries->getKeeperSettings().batch_size_for_keeper_multi;
+    auto max_multi_size = with_retries->getKeeperSettings().batch_size_for_multi;

    Coordination::Requests create_requests;
    const auto flush_create_requests = [&]
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@ -2132,6 +2132,16 @@ class ClickHouseCluster:
                ],
            )

+    def remove_file_from_container(self, container_id, path):
+        """Removes the file `path` inside the container `container_id` by running `rm` through bash."""
+        remove_command = ["bash", "-c", f"rm {path}"]
+        self.exec_in_container(container_id, remove_command)
+
    def wait_for_url(
        self, url="http://localhost:8123/ping", conn_timeout=2, interval=2, timeout=60
    ):
@ -4139,6 +4149,9 @@ class ClickHouseInstance:
            self.docker_id, local_path, dest_path
        )

+    def remove_file_from_container(self, path):
+        """Removes the file `path` inside the container of this instance."""
+        owner_cluster = self.cluster
+        return owner_cluster.remove_file_from_container(self.docker_id, path)
+
    def get_process_pid(self, process_name):
        output = self.exec_in_container(
            [
--- a/tests/integration/helpers/config_manager.py
+++ b/tests/integration/helpers/config_manager.py
@ -0,0 +1,65 @@
+import os
+
+
+class ConfigManager:
+    """Allows to temporarily add configuration files to the "config.d" or "users.d" directories.
+
+    Can act as a context manager:
+
+    with ConfigManager() as cm:
+        cm.add_main_config(node, "configs/test_specific_config.xml") # copy "configs/test_specific_config.xml" to "/etc/clickhouse-server/config.d"
+        ...
+        # "/etc/clickhouse-server/config.d/test_specific_config.xml" is removed automatically
+
+    """
+
+    def __init__(self):
+        # Pairs (node, path inside the container) for every file added and not yet removed.
+        self.__added_configs = []
+
+    def add_main_config(self, node_or_nodes, local_path, reload_config=True):
+        """Temporarily adds a configuration file to the "config.d" directory.
+
+        `node_or_nodes` is a single node or a list of nodes; `local_path` is relative to the cluster's base directory.
+        If `reload_config` is true, "SYSTEM RELOAD CONFIG" is executed on each node after copying.
+        """
+        self.__add_config(
+            node_or_nodes, local_path, dest_dir="config.d", reload_config=reload_config
+        )
+
+    def add_user_config(self, node_or_nodes, local_path, reload_config=True):
+        """Temporarily adds a configuration file to the "users.d" directory.
+
+        `node_or_nodes` is a single node or a list of nodes; `local_path` is relative to the cluster's base directory.
+        If `reload_config` is true, "SYSTEM RELOAD CONFIG" is executed on each node after copying.
+        """
+        self.__add_config(
+            node_or_nodes, local_path, dest_dir="users.d", reload_config=reload_config
+        )
+
+    def reset(self, reload_config=True):
+        """Removes all configuration files added by this ConfigManager."""
+        if not self.__added_configs:
+            return
+        for node, dest_path in self.__added_configs:
+            node.remove_file_from_container(dest_path)
+        if reload_config:
+            for node, _ in self.__added_configs:
+                node.query("SYSTEM RELOAD CONFIG")
+        self.__added_configs = []
+
+    def __add_config(self, node_or_nodes, local_path, dest_dir, reload_config):
+        """Copies `local_path` to "/etc/clickhouse-server/<dest_dir>" on each node and remembers it for reset()."""
+        nodes_to_add_config = (
+            list(node_or_nodes)
+            if isinstance(node_or_nodes, (list, tuple))
+            else [node_or_nodes]
+        )
+        # The destination path is the same on every node.
+        dest_path = os.path.join(
+            "/etc/clickhouse-server", dest_dir, os.path.basename(local_path)
+        )
+        for node in nodes_to_add_config:
+            src_path = os.path.join(node.cluster.base_dir, local_path)
+            node.copy_file_to_container(src_path, dest_path)
+            # Register the file right after copying, so reset() removes it
+            # even if a later copy or config reload fails.
+            self.__added_configs.append((node, dest_path))
+        if reload_config:
+            for node in nodes_to_add_config:
+                node.query("SYSTEM RELOAD CONFIG")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        # Always clean up; returns None, so an exception from the `with` body is not suppressed.
+        self.reset()
--- a/tests/integration/test_backup_restore_on_cluster/configs/cluster_different_versions.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/cluster_different_versions.xml
@ -0,0 +1,16 @@
+<clickhouse>
+    <remote_servers>
+        <cluster_ver>
+            <shard>
+                <replica>
+                    <host>new_node</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>old_node</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </cluster_ver>
+    </remote_servers>
+</clickhouse>
--- a/tests/integration/test_backup_restore_on_cluster/configs/faster_zk_disconnect_detect.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/faster_zk_disconnect_detect.xml
@ -0,0 +1,12 @@
+<clickhouse>
+    <zookeeper replace="replace">
+        <node index="1">
+            <host>zoo1</host>
+            <port>2181</port>
+        </node>
+        <connection_timeout_ms>500</connection_timeout_ms>
+        <num_connection_retries>0</num_connection_retries>
+        <operation_timeout_ms>1000</operation_timeout_ms>
+        <session_timeout_ms>5000</session_timeout_ms>
+    </zookeeper>
+</clickhouse>
--- a/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml
@ -1,6 +1,6 @@
 <clickhouse>
    <backups>
-        <on_cluster_first_sync_timeout>1000</on_cluster_first_sync_timeout>
+        <sync_period_ms>1000</sync_period_ms>
        <consistent_metadata_snapshot_timeout>10000</consistent_metadata_snapshot_timeout>
        <create_table_timeout>3000</create_table_timeout>
    </backups>
--- a/tests/integration/test_backup_restore_on_cluster/configs/shutdown_cancel_backups.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/shutdown_cancel_backups.xml
@ -0,0 +1,3 @@
+<clickhouse>
+    <shutdown_wait_backups_and_restores>false</shutdown_wait_backups_and_restores>
+</clickhouse>
--- a/tests/integration/test_backup_restore_on_cluster/configs/slow_backups.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/slow_backups.xml
@ -0,0 +1,7 @@
+<clickhouse>
+    <backups>
+        <test_inject_sleep>true</test_inject_sleep>
+    </backups>
+    <backup_threads>12</backup_threads>
+    <restore_threads>2</restore_threads>
+</clickhouse>
--- a/tests/integration/test_backup_restore_on_cluster/configs/zookeeper_retries.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/zookeeper_retries.xml
@ -1,9 +1,12 @@
 <clickhouse>
    <profiles>
        <default>
-            <backup_restore_keeper_max_retries>1000</backup_restore_keeper_max_retries>
-            <backup_restore_keeper_retry_initial_backoff_ms>1</backup_restore_keeper_retry_initial_backoff_ms>
-            <backup_restore_keeper_retry_max_backoff_ms>1</backup_restore_keeper_retry_max_backoff_ms>
+            <backup_restore_keeper_max_retries>50</backup_restore_keeper_max_retries>
+            <backup_restore_keeper_retry_initial_backoff_ms>100</backup_restore_keeper_retry_initial_backoff_ms>
+            <backup_restore_keeper_retry_max_backoff_ms>1000</backup_restore_keeper_retry_max_backoff_ms>
+            <backup_restore_keeper_max_retries_while_initializing>10</backup_restore_keeper_max_retries_while_initializing>
+            <backup_restore_keeper_max_retries_while_handling_error>2</backup_restore_keeper_max_retries_while_handling_error>
+            <backup_restore_finish_timeout_after_error_sec>3</backup_restore_finish_timeout_after_error_sec>
            <backup_restore_keeper_fault_injection_seed>42</backup_restore_keeper_fault_injection_seed>
            <backup_restore_keeper_fault_injection_probability>0.002</backup_restore_keeper_fault_injection_probability>
        </default>
--- a/tests/integration/test_backup_restore_on_cluster/test.py
+++ b/tests/integration/test_backup_restore_on_cluster/test.py
@ -1153,7 +1153,7 @@ def test_get_error_from_other_host():
    node1.query("INSERT INTO tbl VALUES (3)")

    backup_name = new_backup_name()
-    expected_error = "Got error from node2.*Table default.tbl was not found"
+    expected_error = "Got error from host node2.*Table default.tbl was not found"
    assert re.search(
        expected_error,
        node1.query_and_get_error(
@ -1162,8 +1162,7 @@ def test_get_error_from_other_host():
    )


-@pytest.mark.parametrize("kill", [False, True])
-def test_stop_other_host_during_backup(kill):
+def test_shutdown_waits_for_backup():
    node1.query(
        "CREATE TABLE tbl ON CLUSTER 'cluster' ("
        "x UInt8"
@ -1182,7 +1181,7 @@ def test_stop_other_host_during_backup(kill):

    # node2 is stopped gracefully (kill=False), so the pending backup must be completed.
-    node2.stop_clickhouse(kill=kill)
+    node2.stop_clickhouse(kill=False)

    assert_eq_with_retry(
        node1,
@ -1192,22 +1191,11 @@ def test_stop_other_host_during_backup(kill):
    )

    status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip()
-
-    if kill:
-        expected_statuses = ["BACKUP_CREATED", "BACKUP_FAILED"]
-    else:
-        expected_statuses = ["BACKUP_CREATED", "BACKUP_CANCELLED"]
-
-    assert status in expected_statuses
+    assert status == "BACKUP_CREATED"

    node2.start_clickhouse()

-    if status == "BACKUP_CREATED":
-        node1.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC")
-        node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}")
-        node1.query("SYSTEM SYNC REPLICA tbl")
-        assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5])
-    elif status == "BACKUP_FAILED":
-        assert not os.path.exists(
-            os.path.join(get_path_to_backup(backup_name), ".backup")
-        )
+    node1.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC")
+    node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}")
+    node1.query("SYSTEM SYNC REPLICA tbl")
+    assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5])
--- a/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py
+++ b/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py
@ -0,0 +1,780 @@
+import os
+import random
+import time
+import uuid
+
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+from helpers.config_manager import ConfigManager
+from helpers.network import PartitionManager
+from helpers.test_tools import TSV
+
+cluster = ClickHouseCluster(__file__)
+
+# Server configuration files installed on every node.
+main_configs = [
+    "configs/backups_disk.xml",
+    "configs/cluster.xml",
+    "configs/lesser_timeouts.xml",  # Default timeouts are quite big (a few minutes), the tests don't need them to be that big.
+    "configs/slow_backups.xml",  # Enables "test_inject_sleep" for backups.
+    "configs/shutdown_cancel_backups.xml",  # Sets "shutdown_wait_backups_and_restores" to false.
+]
+
+# User-level configuration files (settings profiles) installed on every node.
+user_configs = [
+    "configs/zookeeper_retries.xml",
+]
+
+node1 = cluster.add_instance(
+    "node1",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "node1", "shard": "shard1"},
+    with_zookeeper=True,
+    stay_alive=True,  # Necessary for "test_shutdown_cancels_backup", which restarts a node.
+)
+
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "node2", "shard": "shard1"},
+    with_zookeeper=True,
+    stay_alive=True,  # Necessary for "test_shutdown_cancels_backup", which restarts a node.
+)
+
+# All the nodes of the cluster; used to choose a random node and to check every node.
+nodes = [node1, node2]
+
+
+# Module-scoped fixture applied automatically: starts the cluster before the tests of this module
+# and shuts it down afterwards, even if starting or a test failed.
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+# Fixture applied automatically to each test: drops table "tbl" on the whole cluster after the test,
+# no matter whether the test passed or failed.
+@pytest.fixture(autouse=True)
+def cleanup_after_test():
+    try:
+        yield
+    finally:
+        node1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' SYNC")
+
+
+# Utilities
+
+
+# Gets a printable version of the name of a node.
+# Anything which is not `node1` is reported as "node2".
+def get_node_name(node):
+    if node == node1:
+        return "node1"
+    return "node2"
+
+
+# Chooses a random instance among `nodes`.
+def random_node():
+    chosen_node = random.choice(nodes)
+    return chosen_node
+
+
+# Creates table "tbl" and fills it with data.
+# With `num_parts` > 1 the table is partitioned so that each inserted row gets into its own partition.
+def create_and_fill_table(node, num_parts=10, on_cluster=True):
+    # We use partitioning to make sure there will be more files in a backup.
+    if num_parts > 1:
+        partition_by_clause = f" PARTITION BY x%{num_parts}"
+    else:
+        partition_by_clause = ""
+    on_cluster_clause = "ON CLUSTER 'cluster' " if on_cluster else ""
+    create_query = "".join(
+        [
+            "CREATE TABLE tbl ",
+            on_cluster_clause,
+            "(x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}') ",
+            "ORDER BY tuple()",
+            partition_by_clause,
+        ]
+    )
+    node.query(create_query)
+    if num_parts > 0:
+        node.query(f"INSERT INTO tbl SELECT number FROM numbers({num_parts})")
+
+
+# Generates an ID suitable both as a backup id and as a restore id (hex form of a random UUID).
+def random_id():
+    new_uuid = uuid.uuid4()
+    return new_uuid.hex
+
+
+# Builds the backup name to be used in BACKUP and RESTORE queries for a specified backup id.
+def get_backup_name(backup_id):
+    return "Disk('backups', '{}')".format(backup_id)
+
+
+# Reads the status of a backup or a restore from system.backups on the initiator.
+# `backup_id` takes precedence if both ids are specified.
+def get_status(initiator, backup_id=None, restore_id=None):
+    operation_id = restore_id if backup_id is None else backup_id
+    query_result = initiator.query(
+        f"SELECT status FROM system.backups WHERE id='{operation_id}'"
+    )
+    return query_result.rstrip("\n")
+
+
+# Reads the error message of a failed backup or a failed restore from system.backups on the initiator.
+# `backup_id` takes precedence if both ids are specified.
+def get_error(initiator, backup_id=None, restore_id=None):
+    operation_id = restore_id if backup_id is None else backup_id
+    query_result = initiator.query(
+        f"SELECT error FROM system.backups WHERE id='{operation_id}'"
+    )
+    return query_result.rstrip("\n")
+
+
+# Waits until the status of a backup or a restore becomes a desired one.
+# Waiting stops early if the status is no longer "CREATING_BACKUP" or "RESTORING",
+# or if `timeout` (in seconds) has expired; the function then fails with an assertion
+# unless the desired status was reached.
+# Returns how many seconds the function was waiting.
+def wait_status(
+    initiator,
+    status="BACKUP_CREATED",
+    backup_id=None,
+    restore_id=None,
+    timeout=None,
+):
+    print(f"Waiting for status {status}")
+    id = backup_id if backup_id is not None else restore_id
+    operation_name = "backup" if backup_id is not None else "restore"
+    current_status = get_status(initiator, backup_id=backup_id, restore_id=restore_id)
+    waited = 0
+    while (
+        (current_status != status)
+        and (current_status in ["CREATING_BACKUP", "RESTORING"])
+        and ((timeout is None) or (waited < timeout))
+    ):
+        # Poll once per second, but never sleep past the timeout.
+        sleep_time = 1 if (timeout is None) else min(1, timeout - waited)
+        time.sleep(sleep_time)
+        waited += sleep_time
+        current_status = get_status(
+            initiator, backup_id=backup_id, restore_id=restore_id
+        )
+    start_time, end_time = (
+        initiator.query(
+            f"SELECT start_time, end_time FROM system.backups WHERE id='{id}'"
+        )
+        .splitlines()[0]
+        .split("\t")
+    )
+    print(
+        f"{get_node_name(initiator)} : Got status {current_status} for {operation_name} {id} after waiting {waited} seconds "
+        f"(start_time = {start_time}, end_time = {end_time})"
+    )
+    assert current_status == status
+    # The comment above promises the waiting time, so actually return it.
+    return waited
+
+
+# Counts the entries in system.processes corresponding to a specified backup or restore.
+# `node_or_nodes` is either a single node or a list of nodes; the counts from all the nodes are summed up.
+# `is_initial_query` (if specified) additionally filters the entries by the column with the same name.
+def get_num_system_processes(
+    node_or_nodes, backup_id=None, restore_id=None, is_initial_query=None
+):
+    operation_id = restore_id if backup_id is None else backup_id
+    query_kind = "Restore" if backup_id is None else "Backup"
+    if is_initial_query is None:
+        filter_for_is_initial_query = ""
+    else:
+        filter_for_is_initial_query = f" AND (is_initial_query = {is_initial_query})"
+    count_query = (
+        f"SELECT count() FROM system.processes WHERE (query_kind='{query_kind}') "
+        f"AND (query LIKE '%{operation_id}%'){filter_for_is_initial_query}"
+    )
+    nodes_to_consider = (
+        node_or_nodes if (type(node_or_nodes) is list) else [node_or_nodes]
+    )
+    return sum(int(node.query(count_query)) for node in nodes_to_consider)
+
+
+# Waits until the number of entries in system.processes corresponding to a specified backup or restore becomes a desired one.
+# `num_system_processes` is either an exact number or the string "1+" meaning "at least one".
+# Fails with an assertion if the desired number was not reached within `timeout` seconds.
+# Returns how many seconds the function was waiting.
+def wait_num_system_processes(
+    node_or_nodes,
+    num_system_processes=0,
+    backup_id=None,
+    restore_id=None,
+    is_initial_query=None,
+    timeout=None,
+):
+    print(f"Waiting for number of system processes = {num_system_processes}")
+    operation_id = restore_id if backup_id is None else backup_id
+    operation_name = "restore" if backup_id is None else "backup"
+
+    def fetch_count():
+        return get_num_system_processes(
+            node_or_nodes,
+            backup_id=backup_id,
+            restore_id=restore_id,
+            is_initial_query=is_initial_query,
+        )
+
+    def is_count_ok(count):
+        if count == num_system_processes:
+            return True
+        return num_system_processes == "1+" and count >= 1
+
+    current_count = fetch_count()
+    waited = 0
+    while not is_count_ok(current_count):
+        if (timeout is not None) and (waited >= timeout):
+            break
+        # Poll once per second, but never sleep past the timeout.
+        sleep_time = 1 if (timeout is None) else min(1, timeout - waited)
+        time.sleep(sleep_time)
+        waited += sleep_time
+        current_count = fetch_count()
+
+    if not is_count_ok(current_count):
+        # Print per-node counts (without the filter by `is_initial_query`) to help investigating the failure.
+        nodes_to_consider = (
+            node_or_nodes if (type(node_or_nodes) is list) else [node_or_nodes]
+        )
+        for node in nodes_to_consider:
+            count = get_num_system_processes(
+                node, backup_id=backup_id, restore_id=restore_id
+            )
+            print(
+                f"{get_node_name(node)}: Got {count} system processes for {operation_name} {operation_id} after waiting {waited} seconds"
+            )
+        assert False
+    else:
+        print(
+            f"Got {current_count} system processes for {operation_name} {operation_id} after waiting {waited} seconds"
+        )
+    return waited
+
+
+# Kills a BACKUP or RESTORE query with "KILL QUERY ... SYNC".
+# `is_initial_query` (if specified) restricts the killed queries by the column with the same name.
+# If `timeout` is specified the function asserts that KILL QUERY took less than `timeout` seconds.
+# Returns how many seconds the KILL QUERY was executing (according to system.query_log).
+def kill_query(
+    node, backup_id=None, restore_id=None, is_initial_query=None, timeout=None
+):
+    id = backup_id if backup_id is not None else restore_id
+    query_kind = "Backup" if backup_id is not None else "Restore"
+    operation_name = "backup" if backup_id is not None else "restore"
+    print(f"{get_node_name(node)}: Cancelling {operation_name} {id}")
+    filter_for_is_initial_query = (
+        f" AND (is_initial_query = {is_initial_query})"
+        if is_initial_query is not None
+        else ""
+    )
+    node.query(
+        f"KILL QUERY WHERE (query_kind='{query_kind}') AND (query LIKE '%{id}%'){filter_for_is_initial_query} SYNC"
+    )
+    # The duration of the KILL QUERY is read from the query log, so the log must be flushed first.
+    node.query("SYSTEM FLUSH LOGS")
+    duration = (
+        int(
+            node.query(
+                f"SELECT query_duration_ms FROM system.query_log WHERE query_kind='KillQuery' AND query LIKE '%{id}%' AND type='QueryFinish'"
+            )
+        )
+        / 1000
+    )
+    print(
+        f"{get_node_name(node)}: Cancelled {operation_name} {id} after {duration} seconds"
+    )
+    if timeout is not None:
+        assert duration < timeout
+    # The comment above promises the duration, so actually return it.
+    return duration
+
+
+# Stops the specified ZooKeeper servers and prints how long it took.
+def stop_zookeeper_servers(zoo_nodes):
+    print(f"Stopping ZooKeeper servers {zoo_nodes}")
+    started_at = time.monotonic()
+    cluster.stop_zookeeper_nodes(zoo_nodes)
+    elapsed = time.monotonic() - started_at
+    print(f"Stopped ZooKeeper servers {zoo_nodes} in {elapsed} seconds")
+
+
+# Starts the specified ZooKeeper servers back and prints how long it took.
+def start_zookeeper_servers(zoo_nodes):
+    print(f"Starting ZooKeeper servers {zoo_nodes}")
+    started_at = time.monotonic()
+    cluster.start_zookeeper_nodes(zoo_nodes)
+    elapsed = time.monotonic() - started_at
+    print(f"Started ZooKeeper servers {zoo_nodes} in {elapsed} seconds")
+
+
+# Sleeps for a random amount of time up to `max_seconds`.
+# In one case out of six it doesn't sleep at all.
+def random_sleep(max_seconds):
+    if random.randint(0, 5) == 0:
+        return
+    duration = random.uniform(0, max_seconds)
+    sleep(duration)
+
+
+# Sleeps for a specified number of seconds and prints a message about that.
+def sleep(seconds):
+    message = "Sleeping {} seconds".format(seconds)
+    print(message)
+    time.sleep(seconds)
+
+
+# Checks that BACKUP and RESTORE cleaned up properly with no trash left in ZooKeeper, backups folder, and logs.
+# It's used as a context manager: the state is remembered when the checker is constructed,
+# the test sets `expect_backups`, `expect_unfinished_backups`, `expect_errors`, `allow_errors` and `check_zookeeper`
+# inside the `with` block, and everything is verified when the block exits.
+class NoTrashChecker:
+    def __init__(self):
+        self.expect_backups = []  # Backups which must appear in the backups folder.
+        self.expect_unfinished_backups = []  # New backup folders without the ".backup" file.
+        self.expect_errors = []  # Errors which must appear in system.errors.
+        self.allow_errors = []  # Errors which may appear in system.errors.
+        self.check_zookeeper = True  # Whether new nodes left in ZooKeeper are a failure.
+
+        # Sleep 1 second to ensure this NoTrashChecker won't collect errors from a possible previous NoTrashChecker.
+        time.sleep(1)
+
+        self.__start_time_for_collecting_errors = time.gmtime()
+        self.__previous_list_of_backups = self._list_backup_folders()
+        self.__previous_list_of_znodes = self._list_backup_znodes()
+
+    # Lists the folders currently present in the backups folder.
+    @staticmethod
+    def _list_backup_folders():
+        return set(os.listdir(os.path.join(node1.cluster.instances_dir, "backups")))
+
+    # Lists the nodes in "/clickhouse/backups" in ZooKeeper except 'alive_tracker'.
+    @staticmethod
+    def _list_backup_znodes():
+        return set(
+            node1.query(
+                "SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups' "
+                + "AND NOT (name == 'alive_tracker')"
+            ).splitlines()
+        )
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        # 1. No new nodes must be left in ZooKeeper.
+        new_znodes = self._list_backup_znodes().difference(
+            self.__previous_list_of_znodes
+        )
+        if new_znodes:
+            print(f"Found nodes in ZooKeeper: {new_znodes}")
+            for node in new_znodes:
+                print(
+                    f"Nodes in '/clickhouse/backups/{node}':\n"
+                    + node1.query(
+                        f"SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups/{node}'"
+                    )
+                )
+                print(
+                    f"Nodes in '/clickhouse/backups/{node}/stage':\n"
+                    + node1.query(
+                        f"SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups/{node}/stage'"
+                    )
+                )
+        if self.check_zookeeper:
+            assert new_znodes == set(), f"Found nodes in ZooKeeper: {new_znodes}"
+
+        # 2. Only the expected backups must appear in the backups folder.
+        # A backup folder without the ".backup" file is considered unfinished.
+        new_backups = self._list_backup_folders().difference(
+            self.__previous_list_of_backups
+        )
+        unfinished_backups = set(
+            backup
+            for backup in new_backups
+            if not os.path.exists(
+                os.path.join(node1.cluster.instances_dir, "backups", backup, ".backup")
+            )
+        )
+        new_backups = new_backups.difference(unfinished_backups)
+        if new_backups:
+            print(f"Found new backups: {new_backups}")
+        if unfinished_backups:
+            print(f"Found unfinished backups: {unfinished_backups}")
+        assert new_backups == set(self.expect_backups)
+        assert unfinished_backups == set(self.expect_unfinished_backups)
+
+        # 3. Only expected or allowed errors must be registered in system.errors since this checker was created,
+        # and every expected error must be found on at least one node.
+        # Errors caused by fault injection and NO_ELEMENTS_IN_CONFIG are ignored.
+        all_errors = set()
+        start_time = time.strftime(
+            "%Y-%m-%d %H:%M:%S", self.__start_time_for_collecting_errors
+        )
+        for node in nodes:
+            errors_query_result = node.query(
+                "SELECT name FROM system.errors WHERE last_error_time >= toDateTime('"
+                + start_time
+                + "') "
+                + "AND NOT ((name == 'KEEPER_EXCEPTION') AND (last_error_message LIKE '%Fault injection%')) "
+                + "AND NOT (name == 'NO_ELEMENTS_IN_CONFIG')"
+            )
+            errors = errors_query_result.splitlines()
+            if errors:
+                print(f"{get_node_name(node)}: Found errors: {errors}")
+                print(
+                    node.query(
+                        "SELECT name, last_error_message FROM system.errors WHERE last_error_time >= toDateTime('"
+                        + start_time
+                        + "')"
+                    )
+                )
+            for error in errors:
+                assert (error in self.expect_errors) or (
+                    error in self.allow_errors
+                ), f"{get_node_name(node)}: Unexpected error {error}"
+            # Collect the errors of this node once, not once per error.
+            all_errors.update(errors)
+
+        not_found_expected_errors = set(self.expect_errors).difference(all_errors)
+        if not_found_expected_errors:
+            print(f"Not found expected errors: {not_found_expected_errors}")
+            assert False
+
+
+__backup_id_of_successful_backup = None
+
+
+# Generates a backup which will be used to test RESTORE.
+# The backup is made only once: its id is cached in `__backup_id_of_successful_backup` and returned by subsequent calls.
+# Table "tbl" is dropped after making the backup, so it can be restored by the caller.
+def get_backup_id_of_successful_backup():
+    global __backup_id_of_successful_backup
+    if __backup_id_of_successful_backup is None:
+        backup_id = random_id()
+        with NoTrashChecker() as no_trash_checker:
+            print("Will make backup successfully")
+            create_and_fill_table(random_node())
+            initiator = random_node()
+            print(f"Using {get_node_name(initiator)} as initiator")
+            initiator.query(
+                f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
+            )
+            wait_status(initiator, "BACKUP_CREATED", backup_id=backup_id)
+            assert get_num_system_processes(nodes, backup_id=backup_id) == 0
+            no_trash_checker.expect_backups = [backup_id]
+
+            # Dropping the table before restoring.
+            node1.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC")
+
+        # Cache the id only after the backup has been made and checked: otherwise a failed attempt
+        # would make every subsequent call return the id of a backup which doesn't exist.
+        __backup_id_of_successful_backup = backup_id
+
+    return __backup_id_of_successful_backup
+
+
+# Actual tests
+
+
+# Test that a BACKUP operation can be cancelled with KILL QUERY.
+# The query is killed either on a random node or on the initiator (filtering by `is_initial_query`),
+# then the test checks that the backup gets status "BACKUP_CANCELLED" on the initiator,
+# and (via NoTrashChecker) that no unexpected errors or trash are left.
+def test_cancel_backup():
+    with NoTrashChecker() as no_trash_checker:
+        create_and_fill_table(random_node())
+
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        backup_id = random_id()
+        initiator.query(
+            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
+        )
+
+        assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP"
+        assert get_num_system_processes(initiator, backup_id=backup_id) >= 1
+
+        # We shouldn't wait too long here, because otherwise the backup might be completed before we cancel it.
+        random_sleep(3)
+
+        # Choose where to cancel: the non-initial query on one of the nodes, or the initial query on the initiator.
+        node_to_cancel, cancel_as_initiator = random.choice(
+            [(node1, False), (node2, False), (initiator, True)]
+        )
+
+        # Make sure the query to kill is already in system.processes on the chosen node.
+        wait_num_system_processes(
+            node_to_cancel,
+            "1+",
+            backup_id=backup_id,
+            is_initial_query=cancel_as_initiator,
+        )
+
+        print(
+            f"Cancelling on {'initiator' if cancel_as_initiator else 'node'} {get_node_name(node_to_cancel)}"
+        )
+
+        # The timeout is 3 seconds here because a backup must be cancelled quickly.
+        kill_query(
+            node_to_cancel,
+            backup_id=backup_id,
+            is_initial_query=cancel_as_initiator,
+            timeout=3,
+        )
+
+        # If the initial query was killed the status must be updated immediately;
+        # otherwise the initiator is given up to 3 seconds to get the status.
+        if cancel_as_initiator:
+            assert get_status(initiator, backup_id=backup_id) == "BACKUP_CANCELLED"
+        wait_status(initiator, "BACKUP_CANCELLED", backup_id=backup_id, timeout=3)
+
+        assert "QUERY_WAS_CANCELLED" in get_error(initiator, backup_id=backup_id)
+        assert get_num_system_processes(nodes, backup_id=backup_id) == 0
+        no_trash_checker.expect_errors = ["QUERY_WAS_CANCELLED"]
+
+
+# Test that a RESTORE operation can be cancelled with KILL QUERY.
+# The test first cancels a restore (on a random node or on the initiator),
+# and then checks that restoring from the same backup still succeeds afterwards.
+def test_cancel_restore():
+    # Make backup.
+    backup_id = get_backup_id_of_successful_backup()
+
+    # Cancel restoring.
+    with NoTrashChecker() as no_trash_checker:
+        print("Will cancel restoring")
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        restore_id = random_id()
+        initiator.query(
+            f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {get_backup_name(backup_id)} SETTINGS id='{restore_id}' ASYNC"
+        )
+
+        assert get_status(initiator, restore_id=restore_id) == "RESTORING"
+        assert get_num_system_processes(initiator, restore_id=restore_id) >= 1
+
+        # We shouldn't wait too long here, because otherwise the restore might be completed before we cancel it.
+        random_sleep(3)
+
+        # Choose where to cancel: the non-initial query on one of the nodes, or the initial query on the initiator.
+        node_to_cancel, cancel_as_initiator = random.choice(
+            [(node1, False), (node2, False), (initiator, True)]
+        )
+
+        # Make sure the query to kill is already in system.processes on the chosen node.
+        wait_num_system_processes(
+            node_to_cancel,
+            "1+",
+            restore_id=restore_id,
+            is_initial_query=cancel_as_initiator,
+        )
+
+        print(
+            f"Cancelling on {'initiator' if cancel_as_initiator else 'node'} {get_node_name(node_to_cancel)}"
+        )
+
+        # The timeout is 3 seconds here because a restore must be cancelled quickly.
+        kill_query(
+            node_to_cancel,
+            restore_id=restore_id,
+            is_initial_query=cancel_as_initiator,
+            timeout=3,
+        )
+
+        # If the initial query was killed the status must be updated immediately;
+        # otherwise the initiator is given up to 3 seconds to get the status.
+        if cancel_as_initiator:
+            assert get_status(initiator, restore_id=restore_id) == "RESTORE_CANCELLED"
+        wait_status(initiator, "RESTORE_CANCELLED", restore_id=restore_id, timeout=3)
+
+        assert "QUERY_WAS_CANCELLED" in get_error(initiator, restore_id=restore_id)
+        assert get_num_system_processes(nodes, restore_id=restore_id) == 0
+        no_trash_checker.expect_errors = ["QUERY_WAS_CANCELLED"]
+
+    # Restore successfully.
+    with NoTrashChecker() as no_trash_checker:
+        print("Will restore from backup successfully")
+        restore_id = random_id()
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        initiator.query(
+            f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {get_backup_name(backup_id)} SETTINGS id='{restore_id}' ASYNC"
+        )
+
+        wait_status(initiator, "RESTORED", restore_id=restore_id)
+        assert get_num_system_processes(nodes, restore_id=restore_id) == 0
+
+
+# Test that shutdown cancels a running backup and doesn't wait until it finishes.
+# The nodes are configured with "configs/shutdown_cancel_backups.xml", which sets
+# "shutdown_wait_backups_and_restores" to false.
+def test_shutdown_cancels_backup():
+    with NoTrashChecker() as no_trash_checker:
+        create_and_fill_table(random_node())
+
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        backup_id = random_id()
+        initiator.query(
+            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
+        )
+
+        assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP"
+        assert get_num_system_processes(initiator, backup_id=backup_id) >= 1
+
+        # We shouldn't wait too long here, because otherwise the backup might be completed before we cancel it.
+        random_sleep(3)
+
+        # Restart either node, but only after the backup query has appeared in its system.processes.
+        node_to_restart = random.choice([node1, node2])
+        wait_num_system_processes(node_to_restart, "1+", backup_id=backup_id)
+
+        print(f"{get_node_name(node_to_restart)}: Restarting...")
+        node_to_restart.restart_clickhouse()  # Must cancel the backup.
+        print(f"{get_node_name(node_to_restart)}: Restarted")
+
+        wait_num_system_processes(nodes, 0, backup_id=backup_id)
+
+        # system.backups is checked only if the initiator itself was not restarted
+        # (presumably a restarted server doesn't keep the entry in system.backups - confirm).
+        if initiator != node_to_restart:
+            assert get_status(initiator, backup_id=backup_id) == "BACKUP_CANCELLED"
+            assert "QUERY_WAS_CANCELLED" in get_error(initiator, backup_id=backup_id)
+
+        # The information about this cancelled backup must be stored in system.backup_log
+        initiator.query("SYSTEM FLUSH LOGS")
+        assert initiator.query(
+            f"SELECT status FROM system.backup_log WHERE id='{backup_id}' ORDER BY status"
+        ) == TSV(["CREATING_BACKUP", "BACKUP_CANCELLED"])
+
+        no_trash_checker.expect_errors = ["QUERY_WAS_CANCELLED"]
+
+
+# After an error, a backup should clean up the destination folder and the nodes it used in ZooKeeper.
+# No unexpected errors must be generated.
+#
+# Scenario: the table exists on one node only, so "BACKUP TABLE tbl ON CLUSTER" is expected to end
+# with status BACKUP_FAILED and an UNKNOWN_TABLE error.
+def test_error_leaves_no_trash():
+    with NoTrashChecker() as no_trash_checker:
+        # We create table "tbl" on one node only in order to make "BACKUP TABLE tbl ON CLUSTER" fail
+        # (because of the non-existing table on another node).
+        create_and_fill_table(random_node(), on_cluster=False)
+
+        # The initiator is chosen independently of the node which has the table.
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        backup_id = random_id()
+        initiator.query(
+            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
+        )
+
+        # The backup was started with ASYNC, so the failure is observed by polling its status.
+        wait_status(initiator, "BACKUP_FAILED", backup_id=backup_id)
+        assert "UNKNOWN_TABLE" in get_error(initiator, backup_id=backup_id)
+
+        # No processes of the failed backup may remain for `nodes`.
+        assert get_num_system_processes(nodes, backup_id=backup_id) == 0
+        # NOTE(review): presumably NoTrashChecker verifies this list when the `with` block exits - confirm.
+        no_trash_checker.expect_errors = ["UNKNOWN_TABLE"]
+
+
+# A backup must be stopped if ZooKeeper is disconnected longer than
+# `backup_restore_failure_after_host_disconnected_for_seconds` (set to 3 seconds in this test).
+def test_long_disconnection_stops_backup():
+    with NoTrashChecker() as no_trash_checker, ConfigManager() as config_manager:
+        # Config "faster_zk_disconnect_detect.xml" is used in this test to decrease the number of retries when reconnecting to ZooKeeper.
+        # Without this config this test can take several minutes (instead of seconds) to run.
+        config_manager.add_main_config(nodes, "configs/faster_zk_disconnect_detect.xml")
+
+        # NOTE(review): num_parts=100 presumably makes the backup long enough to still be running
+        # when the connection is dropped - confirm in create_and_fill_table().
+        create_and_fill_table(random_node(), num_parts=100)
+
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        # Start the backup asynchronously with a 3-second limit on how long a host may stay disconnected.
+        backup_id = random_id()
+        initiator.query(
+            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC",
+            settings={"backup_restore_failure_after_host_disconnected_for_seconds": 3},
+        )
+
+        assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP"
+        assert get_num_system_processes(initiator, backup_id=backup_id) >= 1
+
+        # The expectations of the checker are configured before the connection is dropped:
+        # the backup is expected to stay unfinished, and the listed errors are tolerated.
+        # NOTE(review): the exact meaning of these attributes is defined by NoTrashChecker - confirm there.
+        no_trash_checker.expect_unfinished_backups = [backup_id]
+        no_trash_checker.allow_errors = [
+            "FAILED_TO_SYNC_BACKUP_OR_RESTORE",
+            "KEEPER_EXCEPTION",
+            "SOCKET_TIMEOUT",
+            "CANNOT_READ_ALL_DATA",
+            "NETWORK_ERROR",
+            "TABLE_IS_READ_ONLY",
+        ]
+        no_trash_checker.check_zookeeper = False
+
+        with PartitionManager() as pm:
+            random_sleep(3)
+
+            # Measure the time from the moment of disconnection until the backup reports a failure.
+            time_before_disconnection = time.monotonic()
+
+            # Any node (the initiator or another one) may lose its connection to ZooKeeper.
+            node_to_drop_zk_connection = random_node()
+            print(
+                f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper"
+            )
+            pm.drop_instance_zk_connections(node_to_drop_zk_connection)
+
+            # Being disconnected from ZooKeeper, a backup is expected to fail.
+            wait_status(initiator, "BACKUP_FAILED", backup_id=backup_id)
+
+            time_to_fail = time.monotonic() - time_before_disconnection
+            error = get_error(initiator, backup_id=backup_id)
+            print(f"error={error}")
+            assert "Lost connection" in error
+
+            # A backup is expected to fail, but it isn't expected to fail too soon:
+            # the lower bound of 3 seconds matches the setting passed to the BACKUP query above,
+            # the upper bound of 30 seconds guards against the failure being detected too late.
+            print(f"Backup failed after {time_to_fail} seconds disconnection")
+            assert time_to_fail > 3
+            assert time_to_fail < 30
+
+
+# A backup must NOT be stopped if ZooKeeper is disconnected shorter than
+# `backup_restore_failure_after_host_disconnected_for_seconds` (set to 6 seconds in this test).
+def test_short_disconnection_doesnt_stop_backup():
+    with NoTrashChecker() as no_trash_checker, ConfigManager() as config_manager:
+        # The test randomly runs either with or without config "faster_zk_disconnect_detect.xml".
+        use_faster_zk_disconnect_detect = random.choice([True, False])
+        if use_faster_zk_disconnect_detect:
+            print("Using faster_zk_disconnect_detect.xml")
+            config_manager.add_main_config(
+                nodes, "configs/faster_zk_disconnect_detect.xml"
+            )
+
+        create_and_fill_table(random_node())
+
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        # Start the backup asynchronously with a 6-second limit on how long a host may stay disconnected.
+        backup_id = random_id()
+        initiator.query(
+            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC",
+            settings={"backup_restore_failure_after_host_disconnected_for_seconds": 6},
+        )
+
+        assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP"
+        assert get_num_system_processes(initiator, backup_id=backup_id) >= 1
+
+        # Dropping connection for less than `backup_restore_failure_after_host_disconnected_for_seconds`
+        # NOTE(review): random_sleep(3) presumably sleeps for at most 3 seconds, and leaving the
+        # PartitionManager block presumably restores the connection - confirm in the helpers.
+        with PartitionManager() as pm:
+            random_sleep(3)
+            node_to_drop_zk_connection = random_node()
+            print(
+                f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper"
+            )
+            pm.drop_instance_zk_connections(node_to_drop_zk_connection)
+            random_sleep(3)
+            print(
+                f"Restoring connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper"
+            )
+
+        # Backup must be successful.
+        wait_status(initiator, "BACKUP_CREATED", backup_id=backup_id)
+        assert get_num_system_processes(nodes, backup_id=backup_id) == 0
+
+        # The finished backup is expected to exist; the listed errors are tolerated.
+        # NOTE(review): the exact meaning of these attributes is defined by NoTrashChecker - confirm there.
+        no_trash_checker.expect_backups = [backup_id]
+        no_trash_checker.allow_errors = [
+            "KEEPER_EXCEPTION",
+            "SOCKET_TIMEOUT",
+            "CANNOT_READ_ALL_DATA",
+            "NETWORK_ERROR",
+            "TABLE_IS_READ_ONLY",
+        ]
+
+
+# A restore must NOT be stopped if ZooKeeper is disconnected shorter than
+# `backup_restore_failure_after_host_disconnected_for_seconds` (set to 6 seconds in this test).
+def test_short_disconnection_doesnt_stop_restore():
+    # Make a backup.
+    backup_id = get_backup_id_of_successful_backup()
+
+    # Restore from the backup.
+    with NoTrashChecker() as no_trash_checker, ConfigManager() as config_manager:
+        # The test randomly runs either with or without config "faster_zk_disconnect_detect.xml".
+        use_faster_zk_disconnect_detect = random.choice([True, False])
+        if use_faster_zk_disconnect_detect:
+            print("Using faster_zk_disconnect_detect.xml")
+            config_manager.add_main_config(
+                nodes, "configs/faster_zk_disconnect_detect.xml"
+            )
+
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        # Start the restore asynchronously with a 6-second limit on how long a host may stay disconnected.
+        restore_id = random_id()
+        initiator.query(
+            f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {get_backup_name(backup_id)} SETTINGS id='{restore_id}' ASYNC",
+            settings={"backup_restore_failure_after_host_disconnected_for_seconds": 6},
+        )
+
+        assert get_status(initiator, restore_id=restore_id) == "RESTORING"
+        assert get_num_system_processes(initiator, restore_id=restore_id) >= 1
+
+        # Dropping connection for less than `backup_restore_failure_after_host_disconnected_for_seconds`
+        # NOTE(review): random_sleep(3) presumably sleeps for at most 3 seconds, and leaving the
+        # PartitionManager block presumably restores the connection - confirm in the helpers.
+        with PartitionManager() as pm:
+            random_sleep(3)
+            node_to_drop_zk_connection = random_node()
+            print(
+                f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper"
+            )
+            pm.drop_instance_zk_connections(node_to_drop_zk_connection)
+            random_sleep(3)
+            print(
+                f"Restoring connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper"
+            )
+
+        # Restore must be successful.
+        wait_status(initiator, "RESTORED", restore_id=restore_id)
+        assert get_num_system_processes(nodes, restore_id=restore_id) == 0
+
+        # Errors caused by the temporary disconnection are tolerated.
+        # NOTE(review): the exact meaning of `allow_errors` is defined by NoTrashChecker - confirm there.
+        no_trash_checker.allow_errors = [
+            "KEEPER_EXCEPTION",
+            "SOCKET_TIMEOUT",
+            "CANNOT_READ_ALL_DATA",
+            "NETWORK_ERROR",
+            "TABLE_IS_READ_ONLY",
+        ]
--- a/tests/integration/test_backup_restore_on_cluster/test_different_versions.py
+++ b/tests/integration/test_backup_restore_on_cluster/test_different_versions.py
@ -0,0 +1,125 @@
+import random
+
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+from helpers.test_tools import TSV
+
+# Test cluster with two replicas of the same shard running different ClickHouse versions.
+cluster = ClickHouseCluster(__file__)
+
+# Both instances share the same main configs.
+main_configs = [
+    "configs/backups_disk.xml",
+    "configs/cluster_different_versions.xml",
+]
+
+user_configs = []
+
+# No image or tag is specified for this instance.
+# NOTE(review): presumably it runs the ClickHouse build under test - confirm in helpers.cluster.
+new_node = cluster.add_instance(
+    "new_node",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "new_node", "shard": "shard1"},
+    with_zookeeper=True,
+)
+
+# This instance is pinned to the released version 24.9.2.42 of clickhouse/clickhouse-server.
+# It uses the same "/backups/" external directory and the same shard macro as `new_node`,
+# but a different replica macro.
+old_node = cluster.add_instance(
+    "old_node",
+    image="clickhouse/clickhouse-server",
+    tag="24.9.2.42",
+    with_installed_binary=True,
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "old_node", "shard": "shard1"},
+    with_zookeeper=True,
+)
+
+# All instances of the cluster; random_node() picks from this list.
+nodes = [new_node, old_node]
+
+
+# Module-scoped autouse fixture: starts the cluster once before the tests of this module
+# and always shuts it down afterwards, even if starting the cluster or a test failed.
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+# Autouse fixture wrapping every test: after the test finishes (successfully or not)
+# table `tbl` is dropped on cluster 'cluster_ver' if it still exists.
+@pytest.fixture(autouse=True)
+def cleanup_after_test():
+    try:
+        yield
+    finally:
+        new_node.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster_ver' SYNC")
+
+
+# Module-level counter used to give every backup in this module a unique name.
+backup_id_counter = 0
+
+
+# Returns a new unique backup name on disk 'backups'.
+# Each call increments the counter, so the first call returns "Disk('backups', '1')".
+def new_backup_name():
+    global backup_id_counter
+    backup_id_counter += 1
+    return f"Disk('backups', '{backup_id_counter}')"
+
+
+# Returns a printable name for a node: "new_node" for `new_node`, "old_node" for anything else.
+def get_node_name(node):
+    if node == new_node:
+        return "new_node"
+    return "old_node"
+
+
+# Chooses a random instance from `nodes` (either `new_node` or `old_node`).
+def random_node():
+    return random.choice(nodes)
+
+
+# Checks that BACKUP ON CLUSTER and RESTORE ON CLUSTER work when the replicas run different
+# ClickHouse versions, whichever of them is the initiator, and that no unexpected errors
+# are registered in system.errors on either node.
+def test_different_versions():
+    # The table is replicated: both nodes use the same ZooKeeper path and their own replica name.
+    new_node.query(
+        "CREATE TABLE tbl"
+        " ON CLUSTER 'cluster_ver'"
+        " (x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')"
+        " ORDER BY tuple()"
+    )
+
+    # One row is inserted through each node.
+    # (Plain string literals: these queries contain no placeholders, so no f-string is needed.)
+    new_node.query("INSERT INTO tbl VALUES (1)")
+    old_node.query("INSERT INTO tbl VALUES (2)")
+
+    backup_name = new_backup_name()
+
+    # The initiators of BACKUP and RESTORE are chosen independently, so any combination
+    # of versions may act as initiator.
+    initiator = random_node()
+    print(f"Using {get_node_name(initiator)} as initiator for BACKUP")
+    initiator.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster_ver' TO {backup_name}")
+
+    # Drop the table everywhere so that RESTORE has to recreate it together with its data.
+    new_node.query("DROP TABLE tbl ON CLUSTER 'cluster_ver' SYNC")
+
+    initiator = random_node()
+    print(f"Using {get_node_name(initiator)} as initiator for RESTORE")
+    initiator.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster_ver' FROM {backup_name}")
+
+    # Synchronize the replicas before comparing: both rows must be visible on both nodes.
+    new_node.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster_ver' tbl")
+    assert new_node.query("SELECT * FROM tbl ORDER BY x") == TSV([1, 2])
+    assert old_node.query("SELECT * FROM tbl ORDER BY x") == TSV([1, 2])
+
+    # The new version must not have registered any errors.
+    # Error NO_ELEMENTS_IN_CONFIG is unrelated.
+    assert (
+        new_node.query(
+            "SELECT name, last_error_message FROM system.errors WHERE NOT ("
+            "(name == 'NO_ELEMENTS_IN_CONFIG')"
+            ")"
+        )
+        == ""
+    )
+
+    # Error FAILED_TO_SYNC_BACKUP_OR_RESTORE: "No connection to host new_node:9000 yet, will retry" is generated by the old version
+    # when it fails to connect to other host because that other host hasn't started yet.
+    # This is not an error actually, just an exception thrown and caught. The new version doesn't throw this exception.
+    assert (
+        old_node.query(
+            "SELECT name, last_error_message FROM system.errors WHERE NOT ("
+            "(name == 'NO_ELEMENTS_IN_CONFIG') OR"
+            "((name == 'FAILED_TO_SYNC_BACKUP_OR_RESTORE') AND (last_error_message == 'No connection to host new_node:9000 yet, will retry'))"
+            ")"
+        )
+        == ""
+    )
--- a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py
+++ b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py
@ -145,7 +145,7 @@ def wait_for_restore(node, restore_id):

 def check_backup_error(error):
    expected_errors = [
-        "Concurrent backups not supported",
+        "Concurrent backups are not allowed",
        "BACKUP_ALREADY_EXISTS",
    ]
    assert any([expected_error in error for expected_error in expected_errors])
@ -153,7 +153,7 @@ def check_backup_error(error):

 def check_restore_error(error):
    expected_errors = [
-        "Concurrent restores not supported",
+        "Concurrent restores are not allowed",
        "Cannot restore the table default.tbl because it already contains some data",
    ]
    assert any([expected_error in error for expected_error in expected_errors])