From a7831c991fbe7c7abd01d43eaa4d6d9071128239 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Sun, 17 Nov 2024 17:33:38 +0100
Subject: [PATCH] Better conditions when cancelling a backup.

---
 src/Backups/BackupCoordinationStageSync.cpp | 21 ++++++++++++++++-----
 src/Backups/BackupCoordinationStageSync.h   |  3 +++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp
index df5f08091ba..1d7f93398cc 100644
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
@@ -685,13 +685,13 @@ void BackupCoordinationStageSync::cancelQueryIfError()
 
     {
         std::lock_guard lock{mutex};
-        if (!state.host_with_error)
-            return;
-
-        exception = state.hosts.at(*state.host_with_error).exception;
+        if (state.host_with_error)
+            exception = state.hosts.at(*state.host_with_error).exception;
     }
 
-    chassert(exception);
+    if (!exception)
+        return;
+
     process_list_element->cancelQuery(false, exception);
     state_changed.notify_all();
 }
@@ -741,6 +741,11 @@ void BackupCoordinationStageSync::cancelQueryIfDisconnectedTooLong()
     if (!exception)
         return;
 
+    /// In this function we only pass the new `exception` (about the lost connection) to `process_list_element`.
+    /// We don't try to create the 'error' node here (because this function is called from watchingThread() and
+    /// we don't want the watching thread to wait here for retries or a reconnection).
+    /// Also we don't set the `state.host_with_error` field here because `state.host_with_error` can only be set
+    /// AFTER creating the 'error' node (see the comment for `State`).
     process_list_element->cancelQuery(false, exception);
     state_changed.notify_all();
 }
@@ -870,6 +875,9 @@ bool BackupCoordinationStageSync::checkIfHostsReachStage(const Strings & hosts,
             continue;
         }
 
+        if (state.host_with_error)
+            std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
+
         if (host_info.finished)
             throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
                             "{} finished without coming to stage {}", getHostDesc(host), stage_to_wait);
@@ -1150,6 +1158,9 @@ bool BackupCoordinationStageSync::checkIfOtherHostsFinish(
         if ((host == current_host) || host_info.finished)
             continue;
 
+        if (throw_if_error && state.host_with_error)
+            std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
+
         String reason_text = reason.empty() ? "" : (" " + reason);
 
         String host_status;
diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h
index 879b2422b84..acfd31c05af 100644
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@@ -197,6 +197,9 @@ private:
     };
 
     /// Information about all the host participating in the current BACKUP or RESTORE operation.
+    /// This information is read from ZooKeeper.
+    /// To simplify the programming logic, `state` can only be updated AFTER changing the corresponding nodes in ZooKeeper
+    /// (for example, first we create the 'error' node, and only after that we set or read from ZK the `state.host_with_error` field).
     struct State
     {
         std::map<String, HostInfo> hosts; /// std::map because we need to compare states
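
A note on the core of the change: cancelQueryIfError() now copies the exception (if any host has reported one) while holding the mutex and decides outside the lock whether to cancel the query, so the function simply returns when no error has been reported instead of asserting with chassert(exception). Below is a minimal standalone sketch of that locking pattern; the StageSyncSketch/HostInfo/cancelQuery names and types are simplified stand-ins for illustration, not the real ClickHouse classes.

    #include <exception>
    #include <map>
    #include <mutex>
    #include <optional>
    #include <string>

    /// Hypothetical, simplified stand-ins for the real backup-coordination classes.
    struct HostInfo { std::exception_ptr exception; };

    struct StageSyncSketch
    {
        std::mutex mutex;
        std::map<std::string, HostInfo> hosts;
        std::optional<std::string> host_with_error; /// set only AFTER the 'error' node exists in ZooKeeper

        /// Mirrors the shape of the reworked cancelQueryIfError(): copy the exception under the lock,
        /// act on it outside the lock, and quietly return if no host has reported an error yet.
        void cancelQueryIfError()
        {
            std::exception_ptr exception;
            {
                std::lock_guard lock{mutex};
                if (host_with_error)
                    exception = hosts.at(*host_with_error).exception;
            }

            if (!exception)
                return; /// previously this path would have hit chassert(exception)

            cancelQuery(exception); /// stand-in for process_list_element->cancelQuery(false, exception)
        }

        void cancelQuery(std::exception_ptr) {} /// placeholder for the real cancellation call
    };

Copying the exception first also keeps the critical section small: the actual cancellation call never runs while the mutex is held.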