From a7831c991fbe7c7abd01d43eaa4d6d9071128239 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Sun, 17 Nov 2024 17:33:38 +0100
Subject: [PATCH] Better conditions when cancelling a backup.

---
 src/Backups/BackupCoordinationStageSync.cpp | 21 ++++++++++++++++-----
 src/Backups/BackupCoordinationStageSync.h   |  3 +++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp
index df5f08091ba..1d7f93398cc 100644
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
@@ -685,13 +685,13 @@ void BackupCoordinationStageSync::cancelQueryIfError()
 
     {
         std::lock_guard lock{mutex};
-        if (!state.host_with_error)
-            return;
-
-        exception = state.hosts.at(*state.host_with_error).exception;
+        if (state.host_with_error)
+            exception = state.hosts.at(*state.host_with_error).exception;
     }
 
-    chassert(exception);
+    if (!exception)
+        return;
+
     process_list_element->cancelQuery(false, exception);
     state_changed.notify_all();
 }
@@ -741,6 +741,11 @@ void BackupCoordinationStageSync::cancelQueryIfDisconnectedTooLong()
     if (!exception)
         return;
 
+    /// In this function we only pass the new `exception` (about the lost connection) to `process_list_element`.
+    /// We don't try to create the 'error' node here (because this function is called from watchingThread() and
+    /// we don't want the watching thread to wait here for retries or a reconnection).
+    /// Also we don't set the `state.host_with_error` field here because `state.host_with_error` can only be set
+    /// AFTER creating the 'error' node (see the comment for `State`).
     process_list_element->cancelQuery(false, exception);
     state_changed.notify_all();
 }
@@ -870,6 +875,9 @@ bool BackupCoordinationStageSync::checkIfHostsReachStage(const Strings & hosts,
             continue;
         }
 
+        if (state.host_with_error)
+            std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
+
         if (host_info.finished)
             throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
                             "{} finished without coming to stage {}", getHostDesc(host), stage_to_wait);
@@ -1150,6 +1158,9 @@ bool BackupCoordinationStageSync::checkIfOtherHostsFinish(
         if ((host == current_host) || host_info.finished)
             continue;
 
+        if (throw_if_error && state.host_with_error)
+            std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
+
         String reason_text = reason.empty() ? "" : (" " + reason);
 
         String host_status;
diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h
index 879b2422b84..acfd31c05af 100644
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@@ -197,6 +197,9 @@ private:
     };
 
     /// Information about all the host participating in the current BACKUP or RESTORE operation.
+    /// This information is read from ZooKeeper.
+    /// To simplify the programming logic, `state` can only be updated AFTER changing the corresponding nodes in ZooKeeper
+    /// (for example, first we create the 'error' node, and only after that we set or read from ZK the `state.host_with_error` field).
     struct State
     {
         std::map<String, HostInfo> hosts; /// std::map because we need to compare states
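
A note on the core of the change: cancelQueryIfError() now copies the exception (if any host has reported one) while holding the mutex and decides outside the lock whether to cancel the query, so the function simply returns when no error has been reported instead of asserting with chassert(exception). Below is a minimal standalone sketch of that locking pattern; the StageSyncSketch/HostInfo/cancelQuery names and types are simplified stand-ins for illustration, not the real ClickHouse classes.

    #include <exception>
    #include <map>
    #include <mutex>
    #include <optional>
    #include <string>

    /// Hypothetical, simplified stand-ins for the real backup-coordination classes.
    struct HostInfo { std::exception_ptr exception; };

    struct StageSyncSketch
    {
        std::mutex mutex;
        std::map<std::string, HostInfo> hosts;
        std::optional<std::string> host_with_error; /// set only AFTER the 'error' node exists in ZooKeeper

        /// Mirrors the shape of the reworked cancelQueryIfError(): copy the exception under the lock,
        /// act on it outside the lock, and quietly return if no host has reported an error yet.
        void cancelQueryIfError()
        {
            std::exception_ptr exception;
            {
                std::lock_guard lock{mutex};
                if (host_with_error)
                    exception = hosts.at(*host_with_error).exception;
            }

            if (!exception)
                return; /// previously this path would have hit chassert(exception)

            cancelQuery(exception); /// stand-in for process_list_element->cancelQuery(false, exception)
        }

        void cancelQuery(std::exception_ptr) {} /// placeholder for the real cancellation call
    };

Copying the exception first also keeps the critical section small: the actual cancellation call never runs while the mutex is held.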