mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Updated to add error or completed status in zookeeper for a cluster for backup/restore, to avoid interpreting previously failed backup/restore when zookeeper is unable to remove nodes
This commit is contained in:
parent
17aecb797c
commit
49c95a535a
@ -15,7 +15,7 @@ BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_)
|
|||||||
|
|
||||||
BackupCoordinationLocal::~BackupCoordinationLocal() = default;
|
BackupCoordinationLocal::~BackupCoordinationLocal() = default;
|
||||||
|
|
||||||
void BackupCoordinationLocal::setStage(const String &, const String &)
|
void BackupCoordinationLocal::setStage(const String &, const String &, const bool &)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ public:
|
|||||||
BackupCoordinationLocal(bool plain_backup_);
|
BackupCoordinationLocal(bool plain_backup_);
|
||||||
~BackupCoordinationLocal() override;
|
~BackupCoordinationLocal() override;
|
||||||
|
|
||||||
void setStage(const String & new_stage, const String & message) override;
|
void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override;
|
||||||
void setError(const Exception & exception) override;
|
void setError(const Exception & exception) override;
|
||||||
Strings waitForStage(const String & stage_to_wait) override;
|
Strings waitForStage(const String & stage_to_wait) override;
|
||||||
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
||||||
|
@ -252,13 +252,17 @@ void BackupCoordinationRemote::removeAllNodes()
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void BackupCoordinationRemote::setStage(const String & new_stage, const String & message)
|
void BackupCoordinationRemote::setStage(const String & new_stage, const String & message, const bool & for_cluster)
|
||||||
{
|
{
|
||||||
stage_sync->set(current_host, new_stage, message);
|
if (for_cluster)
|
||||||
|
stage_sync->setStageForCluster(new_stage);
|
||||||
|
else
|
||||||
|
stage_sync->set(current_host, new_stage, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
void BackupCoordinationRemote::setError(const Exception & exception)
|
void BackupCoordinationRemote::setError(const Exception & exception)
|
||||||
{
|
{
|
||||||
|
stage_sync->setStageForCluster(Stage::ERROR);
|
||||||
stage_sync->setError(current_host, exception);
|
stage_sync->setError(current_host, exception);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -777,8 +781,8 @@ bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic<size_t> &)
|
|||||||
String status;
|
String status;
|
||||||
if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status))
|
if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status))
|
||||||
{
|
{
|
||||||
/// If status is not COMPLETED it could be because the backup failed, check if 'error' exists
|
/// Check if some other restore is in progress
|
||||||
if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_backup_path + "/error"))
|
if (status == Stage::SCHEDULED_TO_START)
|
||||||
{
|
{
|
||||||
LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid));
|
LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid));
|
||||||
result = true;
|
result = true;
|
||||||
|
@ -33,7 +33,7 @@ public:
|
|||||||
|
|
||||||
~BackupCoordinationRemote() override;
|
~BackupCoordinationRemote() override;
|
||||||
|
|
||||||
void setStage(const String & new_stage, const String & message) override;
|
void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override;
|
||||||
void setError(const Exception & exception) override;
|
void setError(const Exception & exception) override;
|
||||||
Strings waitForStage(const String & stage_to_wait) override;
|
Strings waitForStage(const String & stage_to_wait) override;
|
||||||
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
||||||
|
@ -43,6 +43,10 @@ namespace BackupCoordinationStage
|
|||||||
|
|
||||||
/// Coordination stage meaning that a host finished its work.
|
/// Coordination stage meaning that a host finished its work.
|
||||||
constexpr const char * COMPLETED = "completed";
|
constexpr const char * COMPLETED = "completed";
|
||||||
|
|
||||||
|
/// Coordination stage meaning that backup/restore has failed due to an error
|
||||||
|
/// Check '/error' for the error message
|
||||||
|
constexpr const char * ERROR = "error";
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -61,6 +61,20 @@ void BackupCoordinationStageSync::set(const String & current_host, const String
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void BackupCoordinationStageSync::setStageForCluster(const String & new_stage)
|
||||||
|
{
|
||||||
|
auto holder = with_retries.createRetriesControlHolder("setStageForCluster");
|
||||||
|
holder.retries_ctl.retryLoop(
|
||||||
|
[&, &zookeeper = holder.faulty_zookeeper]()
|
||||||
|
{
|
||||||
|
with_retries.renewZooKeeper(zookeeper);
|
||||||
|
zookeeper->trySet(zookeeper_path, new_stage);
|
||||||
|
auto code = zookeeper->trySet(zookeeper_path, new_stage);
|
||||||
|
if (code != Coordination::Error::ZOK)
|
||||||
|
throw zkutil::KeeperException(code, zookeeper_path);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception)
|
void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception)
|
||||||
{
|
{
|
||||||
auto holder = with_retries.createRetriesControlHolder("setError");
|
auto holder = with_retries.createRetriesControlHolder("setError");
|
||||||
|
@ -16,6 +16,7 @@ public:
|
|||||||
|
|
||||||
/// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
|
/// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
|
||||||
void set(const String & current_host, const String & new_stage, const String & message);
|
void set(const String & current_host, const String & new_stage, const String & message);
|
||||||
|
void setStageForCluster(const String & new_stage);
|
||||||
void setError(const String & current_host, const Exception & exception);
|
void setError(const String & current_host, const Exception & exception);
|
||||||
|
|
||||||
/// Sets the stage of the current host and waits until all hosts come to the same stage.
|
/// Sets the stage of the current host and waits until all hosts come to the same stage.
|
||||||
|
@ -368,6 +368,7 @@ void BackupsWorker::doBackup(
|
|||||||
|
|
||||||
/// Wait until all the hosts have written their backup entries.
|
/// Wait until all the hosts have written their backup entries.
|
||||||
backup_coordination->waitForStage(Stage::COMPLETED);
|
backup_coordination->waitForStage(Stage::COMPLETED);
|
||||||
|
backup_coordination->setStage(Stage::COMPLETED, /* message */ "", /* for_cluster */ true);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -654,12 +655,26 @@ void BackupsWorker::doRestore(
|
|||||||
/// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.)
|
/// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.)
|
||||||
ClusterPtr cluster;
|
ClusterPtr cluster;
|
||||||
bool on_cluster = !restore_query->cluster.empty();
|
bool on_cluster = !restore_query->cluster.empty();
|
||||||
|
|
||||||
if (on_cluster)
|
if (on_cluster)
|
||||||
{
|
{
|
||||||
restore_query->cluster = context->getMacros()->expand(restore_query->cluster);
|
restore_query->cluster = context->getMacros()->expand(restore_query->cluster);
|
||||||
cluster = context->getCluster(restore_query->cluster);
|
cluster = context->getCluster(restore_query->cluster);
|
||||||
restore_settings.cluster_host_ids = cluster->getHostIDs();
|
restore_settings.cluster_host_ids = cluster->getHostIDs();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Make a restore coordination.
|
||||||
|
if (!restore_coordination)
|
||||||
|
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster);
|
||||||
|
|
||||||
|
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores)))
|
||||||
|
throw Exception(
|
||||||
|
ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
|
||||||
|
"Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
|
||||||
|
|
||||||
|
|
||||||
|
if (on_cluster)
|
||||||
|
{
|
||||||
/// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect
|
/// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect
|
||||||
/// because different replicas can contain different set of tables and so the required access rights can differ too.
|
/// because different replicas can contain different set of tables and so the required access rights can differ too.
|
||||||
/// So the right way is pass through the entire cluster and check access for each host.
|
/// So the right way is pass through the entire cluster and check access for each host.
|
||||||
@ -676,15 +691,6 @@ void BackupsWorker::doRestore(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Make a restore coordination.
|
|
||||||
if (!restore_coordination)
|
|
||||||
restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster);
|
|
||||||
|
|
||||||
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores)))
|
|
||||||
throw Exception(
|
|
||||||
ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED,
|
|
||||||
"Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
|
|
||||||
|
|
||||||
/// Do RESTORE.
|
/// Do RESTORE.
|
||||||
if (on_cluster)
|
if (on_cluster)
|
||||||
{
|
{
|
||||||
@ -703,6 +709,7 @@ void BackupsWorker::doRestore(
|
|||||||
|
|
||||||
/// Wait until all the hosts have written their backup entries.
|
/// Wait until all the hosts have written their backup entries.
|
||||||
restore_coordination->waitForStage(Stage::COMPLETED);
|
restore_coordination->waitForStage(Stage::COMPLETED);
|
||||||
|
restore_coordination->setStage(Stage::COMPLETED, /* message */ "", /* for_cluster */ true);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -21,7 +21,7 @@ public:
|
|||||||
virtual ~IBackupCoordination() = default;
|
virtual ~IBackupCoordination() = default;
|
||||||
|
|
||||||
/// Sets the current stage and waits for other hosts to come to this stage too.
|
/// Sets the current stage and waits for other hosts to come to this stage too.
|
||||||
virtual void setStage(const String & new_stage, const String & message) = 0;
|
virtual void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) = 0;
|
||||||
virtual void setError(const Exception & exception) = 0;
|
virtual void setError(const Exception & exception) = 0;
|
||||||
virtual Strings waitForStage(const String & stage_to_wait) = 0;
|
virtual Strings waitForStage(const String & stage_to_wait) = 0;
|
||||||
virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
|
virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
|
||||||
|
@ -18,7 +18,7 @@ public:
|
|||||||
virtual ~IRestoreCoordination() = default;
|
virtual ~IRestoreCoordination() = default;
|
||||||
|
|
||||||
/// Sets the current stage and waits for other hosts to come to this stage too.
|
/// Sets the current stage and waits for other hosts to come to this stage too.
|
||||||
virtual void setStage(const String & new_stage, const String & message) = 0;
|
virtual void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) = 0;
|
||||||
virtual void setError(const Exception & exception) = 0;
|
virtual void setError(const Exception & exception) = 0;
|
||||||
virtual Strings waitForStage(const String & stage_to_wait) = 0;
|
virtual Strings waitForStage(const String & stage_to_wait) = 0;
|
||||||
virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
|
virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
|
||||||
|
@ -11,7 +11,7 @@ RestoreCoordinationLocal::RestoreCoordinationLocal() : log(&Poco::Logger::get("R
|
|||||||
|
|
||||||
RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;
|
RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;
|
||||||
|
|
||||||
void RestoreCoordinationLocal::setStage(const String &, const String &)
|
void RestoreCoordinationLocal::setStage(const String &, const String &, const bool &)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ public:
|
|||||||
~RestoreCoordinationLocal() override;
|
~RestoreCoordinationLocal() override;
|
||||||
|
|
||||||
/// Sets the current stage and waits for other hosts to come to this stage too.
|
/// Sets the current stage and waits for other hosts to come to this stage too.
|
||||||
void setStage(const String & new_stage, const String & message) override;
|
void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override;
|
||||||
void setError(const Exception & exception) override;
|
void setError(const Exception & exception) override;
|
||||||
Strings waitForStage(const String & stage_to_wait) override;
|
Strings waitForStage(const String & stage_to_wait) override;
|
||||||
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
||||||
|
@ -90,13 +90,17 @@ void RestoreCoordinationRemote::createRootNodes()
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message)
|
void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message, const bool & for_cluster)
|
||||||
{
|
{
|
||||||
stage_sync->set(current_host, new_stage, message);
|
if (for_cluster)
|
||||||
|
stage_sync->setStageForCluster(new_stage);
|
||||||
|
else
|
||||||
|
stage_sync->set(current_host, new_stage, message);
|
||||||
}
|
}
|
||||||
|
|
||||||
void RestoreCoordinationRemote::setError(const Exception & exception)
|
void RestoreCoordinationRemote::setError(const Exception & exception)
|
||||||
{
|
{
|
||||||
|
stage_sync->setStageForCluster(Stage::ERROR);
|
||||||
stage_sync->setError(current_host, exception);
|
stage_sync->setError(current_host, exception);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -282,8 +286,8 @@ bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t>
|
|||||||
String status;
|
String status;
|
||||||
if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
|
if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
|
||||||
{
|
{
|
||||||
/// If status is not COMPLETED it could be because the restore failed, check if 'error' exists
|
/// Check if some other restore is in progress
|
||||||
if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_restore_path + "/error"))
|
if (status == Stage::SCHEDULED_TO_START)
|
||||||
{
|
{
|
||||||
LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
|
LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
|
||||||
result = true;
|
result = true;
|
||||||
|
@ -26,7 +26,7 @@ public:
|
|||||||
~RestoreCoordinationRemote() override;
|
~RestoreCoordinationRemote() override;
|
||||||
|
|
||||||
/// Sets the current stage and waits for other hosts to come to this stage too.
|
/// Sets the current stage and waits for other hosts to come to this stage too.
|
||||||
void setStage(const String & new_stage, const String & message) override;
|
void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override;
|
||||||
void setError(const Exception & exception) override;
|
void setError(const Exception & exception) override;
|
||||||
Strings waitForStage(const String & stage_to_wait) override;
|
Strings waitForStage(const String & stage_to_wait) override;
|
||||||
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override;
|
||||||
|
Loading…
Reference in New Issue
Block a user