Merge pull request #72018 from vitlibar/correction-after-reworking-backup-sync-3

Corrections after reworking backup/restore synchronization #3
This commit is contained in:
Nikita Mikhaylov 2024-11-17 22:36:20 +00:00 committed by GitHub
commit 57cefae6f2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 20 additions and 6 deletions

View File

@ -685,13 +685,13 @@ void BackupCoordinationStageSync::cancelQueryIfError()
{
std::lock_guard lock{mutex};
if (!state.host_with_error)
return;
exception = state.hosts.at(*state.host_with_error).exception;
if (state.host_with_error)
exception = state.hosts.at(*state.host_with_error).exception;
}
chassert(exception);
if (!exception)
return;
process_list_element->cancelQuery(false, exception);
state_changed.notify_all();
}
@ -741,6 +741,11 @@ void BackupCoordinationStageSync::cancelQueryIfDisconnectedTooLong()
if (!exception)
return;
/// In this function we only pass the new `exception` (about that the connection was lost) to `process_list_element`.
/// We don't try to create the 'error' node here (because this function is called from watchingThread() and
/// we don't want the watching thread to try waiting here for retries or a reconnection).
/// Also we don't set the `state.host_with_error` field here because `state.host_with_error` can only be set
/// AFTER creating the 'error' node (see the comment for `State`).
process_list_element->cancelQuery(false, exception);
state_changed.notify_all();
}
@ -870,6 +875,9 @@ bool BackupCoordinationStageSync::checkIfHostsReachStage(const Strings & hosts,
continue;
}
if (state.host_with_error)
std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
if (host_info.finished)
throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
"{} finished without coming to stage {}", getHostDesc(host), stage_to_wait);
@ -1150,6 +1158,9 @@ bool BackupCoordinationStageSync::checkIfOtherHostsFinish(
if ((host == current_host) || host_info.finished)
continue;
if (throw_if_error && state.host_with_error)
std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
String reason_text = reason.empty() ? "" : (" " + reason);
String host_status;

View File

@ -197,6 +197,9 @@ private:
};
/// Information about all the host participating in the current BACKUP or RESTORE operation.
/// This information is read from ZooKeeper.
/// To simplify the programming logic `state` can only be updated AFTER changing corresponding nodes in ZooKeeper
/// (for example, first we create the 'error' node, and only after that we set or read from ZK the `state.host_with_error` field).
struct State
{
std::map<String /* host */, HostInfo> hosts; /// std::map because we need to compare states

View File

@ -667,7 +667,7 @@ def test_long_disconnection_stops_backup():
# A backup is expected to fail, but it isn't expected to fail too soon.
print(f"Backup failed after {time_to_fail} seconds disconnection")
assert time_to_fail > 3
assert time_to_fail < 35
assert time_to_fail < 45
# A backup must NOT be stopped if Zookeeper is disconnected shorter than `failure_after_host_disconnected_for_seconds`.