diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index df5f08091ba..1d7f93398cc 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -685,13 +685,13 @@ void BackupCoordinationStageSync::cancelQueryIfError() { std::lock_guard lock{mutex}; - if (!state.host_with_error) - return; - - exception = state.hosts.at(*state.host_with_error).exception; + if (state.host_with_error) + exception = state.hosts.at(*state.host_with_error).exception; } - chassert(exception); + if (!exception) + return; + process_list_element->cancelQuery(false, exception); state_changed.notify_all(); } @@ -741,6 +741,11 @@ void BackupCoordinationStageSync::cancelQueryIfDisconnectedTooLong() if (!exception) return; + /// In this function we only pass the new `exception` (about that the connection was lost) to `process_list_element`. + /// We don't try to create the 'error' node here (because this function is called from watchingThread() and + /// we don't want the watching thread to try waiting here for retries or a reconnection). + /// Also we don't set the `state.host_with_error` field here because `state.host_with_error` can only be set + /// AFTER creating the 'error' node (see the comment for `State`). process_list_element->cancelQuery(false, exception); state_changed.notify_all(); } @@ -870,6 +875,9 @@ bool BackupCoordinationStageSync::checkIfHostsReachStage(const Strings & hosts, continue; } + if (state.host_with_error) + std::rethrow_exception(state.hosts.at(*state.host_with_error).exception); + if (host_info.finished) throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "{} finished without coming to stage {}", getHostDesc(host), stage_to_wait); @@ -1150,6 +1158,9 @@ bool BackupCoordinationStageSync::checkIfOtherHostsFinish( if ((host == current_host) || host_info.finished) continue; + if (throw_if_error && state.host_with_error) + std::rethrow_exception(state.hosts.at(*state.host_with_error).exception); + String reason_text = reason.empty() ? "" : (" " + reason); String host_status; diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h index 879b2422b84..acfd31c05af 100644 --- a/src/Backups/BackupCoordinationStageSync.h +++ b/src/Backups/BackupCoordinationStageSync.h @@ -197,6 +197,9 @@ private: }; /// Information about all the host participating in the current BACKUP or RESTORE operation. + /// This information is read from ZooKeeper. + /// To simplify the programming logic `state` can only be updated AFTER changing corresponding nodes in ZooKeeper + /// (for example, first we create the 'error' node, and only after that we set or read from ZK the `state.host_with_error` field). struct State { std::map hosts; /// std::map because we need to compare states diff --git a/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py b/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py index 4ad53acc735..e9e20602d5c 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py +++ b/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py @@ -667,7 +667,7 @@ def test_long_disconnection_stops_backup(): # A backup is expected to fail, but it isn't expected to fail too soon. print(f"Backup failed after {time_to_fail} seconds disconnection") assert time_to_fail > 3 - assert time_to_fail < 35 + assert time_to_fail < 45 # A backup must NOT be stopped if Zookeeper is disconnected shorter than `failure_after_host_disconnected_for_seconds`.