mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Merge pull request #72018 from vitlibar/correction-after-reworking-backup-sync-3
Corrections after reworking backup/restore synchronization #3
This commit is contained in:
commit
57cefae6f2
@ -685,13 +685,13 @@ void BackupCoordinationStageSync::cancelQueryIfError()
|
||||
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
if (!state.host_with_error)
|
||||
return;
|
||||
|
||||
exception = state.hosts.at(*state.host_with_error).exception;
|
||||
if (state.host_with_error)
|
||||
exception = state.hosts.at(*state.host_with_error).exception;
|
||||
}
|
||||
|
||||
chassert(exception);
|
||||
if (!exception)
|
||||
return;
|
||||
|
||||
process_list_element->cancelQuery(false, exception);
|
||||
state_changed.notify_all();
|
||||
}
|
||||
@ -741,6 +741,11 @@ void BackupCoordinationStageSync::cancelQueryIfDisconnectedTooLong()
|
||||
if (!exception)
|
||||
return;
|
||||
|
||||
/// In this function we only pass the new `exception` (about that the connection was lost) to `process_list_element`.
|
||||
/// We don't try to create the 'error' node here (because this function is called from watchingThread() and
|
||||
/// we don't want the watching thread to try waiting here for retries or a reconnection).
|
||||
/// Also we don't set the `state.host_with_error` field here because `state.host_with_error` can only be set
|
||||
/// AFTER creating the 'error' node (see the comment for `State`).
|
||||
process_list_element->cancelQuery(false, exception);
|
||||
state_changed.notify_all();
|
||||
}
|
||||
@ -870,6 +875,9 @@ bool BackupCoordinationStageSync::checkIfHostsReachStage(const Strings & hosts,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (state.host_with_error)
|
||||
std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
|
||||
|
||||
if (host_info.finished)
|
||||
throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
|
||||
"{} finished without coming to stage {}", getHostDesc(host), stage_to_wait);
|
||||
@ -1150,6 +1158,9 @@ bool BackupCoordinationStageSync::checkIfOtherHostsFinish(
|
||||
if ((host == current_host) || host_info.finished)
|
||||
continue;
|
||||
|
||||
if (throw_if_error && state.host_with_error)
|
||||
std::rethrow_exception(state.hosts.at(*state.host_with_error).exception);
|
||||
|
||||
String reason_text = reason.empty() ? "" : (" " + reason);
|
||||
|
||||
String host_status;
|
||||
|
@ -197,6 +197,9 @@ private:
|
||||
};
|
||||
|
||||
/// Information about all the host participating in the current BACKUP or RESTORE operation.
|
||||
/// This information is read from ZooKeeper.
|
||||
/// To simplify the programming logic `state` can only be updated AFTER changing corresponding nodes in ZooKeeper
|
||||
/// (for example, first we create the 'error' node, and only after that we set or read from ZK the `state.host_with_error` field).
|
||||
struct State
|
||||
{
|
||||
std::map<String /* host */, HostInfo> hosts; /// std::map because we need to compare states
|
||||
|
@ -667,7 +667,7 @@ def test_long_disconnection_stops_backup():
|
||||
# A backup is expected to fail, but it isn't expected to fail too soon.
|
||||
print(f"Backup failed after {time_to_fail} seconds disconnection")
|
||||
assert time_to_fail > 3
|
||||
assert time_to_fail < 35
|
||||
assert time_to_fail < 45
|
||||
|
||||
|
||||
# A backup must NOT be stopped if Zookeeper is disconnected shorter than `failure_after_host_disconnected_for_seconds`.
|
||||
|
Loading…
Reference in New Issue
Block a user