Fix retries for disconnected nodes for BACKUP/RESTORE ON CLUSTER.

2024-11-27 01:51:59 +00:00 · 2023-12-12 00:19:16 +01:00 · 2023-12-12 00:19:16 +01:00 · 16afd81322
commit 16afd81322
parent 30148972ed
3 changed files with 43 additions and 72 deletions
--- a/src/Backups/BackupCoordinationRemote.cpp
+++ b/src/Backups/BackupCoordinationRemote.cpp
@@ -184,11 +184,9 @@ BackupCoordinationRemote::BackupCoordinationRemote(
            if (my_is_internal)
            {
                String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host;
+                zk->createAncestors(alive_node_path);
                auto code = zk->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-
-                if (code == Coordination::Error::ZNODEEXISTS)
-                    zk->handleEphemeralNodeExistenceNoFailureInjection(alive_node_path, "");
-                else if (code != Coordination::Error::ZOK)
+                if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS))
                    throw zkutil::KeeperException::fromPath(code, alive_node_path);
            }
        })
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
@@ -60,12 +60,6 @@ void BackupCoordinationStageSync::set(const String & current_host, const String
        }
        else
        {
-            /// Make an ephemeral node so the initiator can track if the current host is still working.
-            String alive_node_path = zookeeper_path + "/alive|" + current_host;
-            auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral);
-            if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS)
-                throw zkutil::KeeperException::fromPath(code, alive_node_path);
-
            zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, "");
            zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message);
        }
@@ -118,27 +112,24 @@ struct BackupCoordinationStageSync::State
    Strings results;
    std::map<String, UnreadyHostState> unready_hosts;
    std::optional<std::pair<String, Exception>> error;
-    std::optional<String> host_terminated;
+    std::optional<String> disconnected_host;
 };

 BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState(
-    const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const
+    WithRetries::RetriesControlHolder & retries_control_holder,
+    const Strings & zk_nodes,
+    const Strings & all_hosts,
+    const String & stage_to_wait) const
 {
+    auto zookeeper = retries_control_holder.faulty_zookeeper;
+    auto & retries_ctl = retries_control_holder.retries_ctl;
+
    std::unordered_set<std::string_view> zk_nodes_set{zk_nodes.begin(), zk_nodes.end()};

    State state;
    if (zk_nodes_set.contains("error"))
    {
-        String errors;
-        {
-            auto holder = with_retries.createRetriesControlHolder("readCurrentState");
-            holder.retries_ctl.retryLoop(
-                [&, &zookeeper = holder.faulty_zookeeper]()
-                {
-                    with_retries.renewZooKeeper(zookeeper);
-                    errors = zookeeper->get(zookeeper_path + "/error");
-                });
-        }
+        String errors = zookeeper->get(zookeeper_path + "/error");
        ReadBufferFromOwnString buf{errors};
        String host;
        readStringBinary(host, buf);
@@ -150,60 +141,40 @@ BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState
    {
        if (!zk_nodes_set.contains("current|" + host + "|" + stage_to_wait))
        {
-            UnreadyHostState unready_host_state;
            const String started_node_name = "started|" + host;
            const String alive_node_name = "alive|" + host;
            const String alive_node_path = zookeeper_path + "/" + alive_node_name;
+
+            UnreadyHostState unready_host_state;
            unready_host_state.started = zk_nodes_set.contains(started_node_name);
-
-            /// Because we do retries everywhere we can't fully rely on ephemeral nodes anymore.
-            /// Though we recreate "alive" node when reconnecting it might be not enough and race condition is possible.
-            /// And everything we can do here - just retry.
-            /// In worst case when we won't manage to see the alive node for a long time we will just abort the backup.
            unready_host_state.alive = zk_nodes_set.contains(alive_node_name);
-            if (!unready_host_state.alive)
-            {
-                LOG_TRACE(log, "Seems like host ({}) is dead. Will retry the check to confirm", host);
-                auto holder = with_retries.createRetriesControlHolder("readCurrentState::checkAliveNode");
-                holder.retries_ctl.retryLoop(
-                    [&, &zookeeper = holder.faulty_zookeeper]()
-                {
-                    with_retries.renewZooKeeper(zookeeper);
-
-                    if (zookeeper->existsNoFailureInjection(alive_node_path))
-                    {
-                        unready_host_state.alive = true;
-                        return;
-                    }
-
-                    // Retry with backoff. We also check whether it is last retry or no, because we won't to rethrow an exception.
-                    if (!holder.retries_ctl.isLastRetry())
-                        holder.retries_ctl.setKeeperError(Coordination::Error::ZNONODE, "There is no alive node for host {}. Will retry", host);
-                });
-            }
-            LOG_TRACE(log, "Host ({}) appeared to be {}", host, unready_host_state.alive ? "alive" : "dead");
-
            state.unready_hosts.emplace(host, unready_host_state);
-            if (!unready_host_state.alive && unready_host_state.started && !state.host_terminated)
-                state.host_terminated = host;
+
+            if (!unready_host_state.alive && !state.disconnected_host)
+            {
+                /// If the "alive" node doesn't exist then we don't have a connection to the corresponding host.
+                /// This node is ephemeral, so it will probably be recreated soon. We use zookeeper retries to wait.
+                /// In the worst case, if we don't manage to see the alive node for a long time, we will just abort the backup.
+                state.disconnected_host = host;
+                String message;
+                if (unready_host_state.started)
+                    message = fmt::format("Lost connection to host {}", host);
+                else
+                    message = fmt::format("No connection to host {} yet", host);
+                if (!retries_ctl.isLastRetry())
+                    message += ", will retry";
+                retries_ctl.setUserError(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, message);
+            }
        }
    }

-    if (state.host_terminated || !state.unready_hosts.empty())
+    if (state.disconnected_host || !state.unready_hosts.empty())
        return state;

-    auto holder = with_retries.createRetriesControlHolder("waitImpl::collectStagesToWait");
-    holder.retries_ctl.retryLoop(
-        [&, &zookeeper = holder.faulty_zookeeper]()
-    {
-        with_retries.renewZooKeeper(zookeeper);
-        Strings results;
-
-        for (const auto & host : all_hosts)
-            results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
-
-        state.results = std::move(results);
-    });
+    Strings results;
+    for (const auto & host : all_hosts)
+        results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait));
+    state.results = std::move(results);

    return state;
 }
@@ -229,7 +200,7 @@ Strings BackupCoordinationStageSync::waitImpl(
        auto watch = std::make_shared<Poco::Event>();
        Strings zk_nodes;
        {
-            auto holder = with_retries.createRetriesControlHolder("waitImpl::getChildren");
+            auto holder = with_retries.createRetriesControlHolder("waitImpl");
            holder.retries_ctl.retryLoop(
                [&, &zookeeper = holder.faulty_zookeeper]()
            {
@@ -237,12 +208,14 @@ Strings BackupCoordinationStageSync::waitImpl(
                watch->reset();
                /// Get zk nodes and subscribe on their changes.
                zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch);
+
+                /// Read the current state of zk nodes.
+                state = readCurrentState(holder, zk_nodes, all_hosts, stage_to_wait);
            });
        }

-        /// Read and analyze the current state of zk nodes.
-        state = readCurrentState(zk_nodes, all_hosts, stage_to_wait);
-        if (state.error || state.host_terminated || state.unready_hosts.empty())
+        /// Analyze the current state of zk nodes.
+        if (state.error || state.disconnected_host || state.unready_hosts.empty())
            break; /// Error happened or everything is ready.

        /// Log that we will wait
@@ -270,8 +243,8 @@ Strings BackupCoordinationStageSync::waitImpl(
        state.error->second.rethrow();

    /// Another host terminated without errors.
-    if (state.host_terminated)
-        throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Host {} suddenly stopped working", *state.host_terminated);
+    if (state.disconnected_host)
+        throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "No connection to host {}", *state.disconnected_host);

    /// Something's unready, timeout is probably not enough.
    if (!state.unready_hosts.empty())
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@@ -29,7 +29,7 @@ private:
    void createRootNodes();

    struct State;
-    State readCurrentState(const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
+    State readCurrentState(WithRetries::RetriesControlHolder & retries_control_holder, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;

    Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;