Updated Backup/Restore Coordination construction and removed coordination_path and added uuid in settings - Use cluster state data to check concurrent backup/restore

This commit is contained in:
Smita Kulkarni 2023-02-16 09:30:27 +01:00
parent 3ee380618b
commit 0506d9289c
15 changed files with 69 additions and 66 deletions

View File

@ -202,7 +202,7 @@ Strings BackupCoordinationLocal::getAllArchiveSuffixes() const
return archive_suffixes;
}
bool BackupCoordinationLocal::hasConcurrentBackups(const String &, const String &, const std::atomic<size_t> & num_active_backups) const
bool BackupCoordinationLocal::hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const
{
return (num_active_backups > 1);
}

View File

@ -52,7 +52,7 @@ public:
String getNextArchiveSuffix() override;
Strings getAllArchiveSuffixes() const override;
bool hasConcurrentBackups(const String & backup_id, const String & common_backups_path, const std::atomic<size_t> & num_active_backups) const override;
bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
private:
mutable std::mutex mutex;

View File

@ -166,14 +166,16 @@ namespace
}
BackupCoordinationRemote::BackupCoordinationRemote(
const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_)
: zookeeper_path(zookeeper_path_)
const String & root_zookeeper_path_, const String & backup_uuid_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_)
: root_zookeeper_path(root_zookeeper_path_)
, zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_)
, backup_uuid(backup_uuid_)
, get_zookeeper(get_zookeeper_)
, is_internal(is_internal_)
{
createRootNodes();
stage_sync.emplace(
zookeeper_path_ + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("BackupCoordination"));
zookeeper_path + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("BackupCoordination"));
}
BackupCoordinationRemote::~BackupCoordinationRemote()
@ -595,36 +597,36 @@ Strings BackupCoordinationRemote::getAllArchiveSuffixes() const
return node_names;
}
bool BackupCoordinationRemote::hasConcurrentBackups(const String & backup_id, const String & common_backups_path, const std::atomic<size_t> &) const
bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic<size_t> &) const
{
/// If its internal concurrency will be checked for the base backup
if (is_internal)
return false;
auto zk = getZooKeeper();
std::string backup_stage_path = common_backups_path + "/backup-" + toString(backup_id) +"/stage";
std::string backup_stage_path = zookeeper_path +"/stage";
if (!zk->exists(common_backups_path))
zk->createAncestors(common_backups_path);
if (!zk->exists(root_zookeeper_path))
zk->createAncestors(root_zookeeper_path);
for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
{
Coordination::Stat stat;
zk->get(common_backups_path, &stat);
Strings existing_backup_paths = zk->getChildren(common_backups_path);
zk->get(root_zookeeper_path, &stat);
Strings existing_backup_paths = zk->getChildren(root_zookeeper_path);
for (const auto & existing_backup_path : existing_backup_paths)
{
if (startsWith(existing_backup_path, "restore-"))
continue;
String existing_backup_id = existing_backup_path;
existing_backup_id.erase(0, String("backup-").size());
String existing_backup_uuid = existing_backup_path;
existing_backup_uuid.erase(0, String("backup-").size());
if (existing_backup_id == toString(backup_id))
if (existing_backup_uuid == toString(backup_uuid))
continue;
const auto status = zk->get(common_backups_path + "/" + existing_backup_path + "/stage");
const auto status = zk->get(root_zookeeper_path + "/" + existing_backup_path + "/stage");
if (status != Stage::COMPLETED)
return true;
}
@ -637,6 +639,7 @@ bool BackupCoordinationRemote::hasConcurrentBackups(const String & backup_id, co
if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
throw zkutil::KeeperException(code, backup_stage_path);
}
return false;
}

View File

@ -16,7 +16,7 @@ constexpr size_t MAX_ZOOKEEPER_ATTEMPTS = 10;
class BackupCoordinationRemote : public IBackupCoordination
{
public:
BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_);
BackupCoordinationRemote(const String & root_zookeeper_path_, const String & backup_uuid_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_);
~BackupCoordinationRemote() override;
void setStage(const String & current_host, const String & new_stage, const String & message) override;
@ -58,7 +58,7 @@ public:
String getNextArchiveSuffix() override;
Strings getAllArchiveSuffixes() const override;
bool hasConcurrentBackups(const String & backup_id, const String & common_backups_path, const std::atomic<size_t> & num_active_backups) const override;
bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const override;
private:
zkutil::ZooKeeperPtr getZooKeeper() const;
@ -68,7 +68,9 @@ private:
void prepareReplicatedTables() const;
void prepareReplicatedAccess() const;
const String root_zookeeper_path;
const String zookeeper_path;
const String backup_uuid;
const zkutil::GetZooKeeper get_zookeeper;
const bool is_internal;

View File

@ -28,7 +28,6 @@ namespace ErrorCodes
M(UInt64, replica_num) \
M(Bool, internal) \
M(String, host_id) \
M(String, coordination_zk_path) \
M(OptionalUUID, backup_uuid)
/// M(Int64, compression_level)

View File

@ -55,10 +55,6 @@ struct BackupSettings
/// Cluster's hosts' IDs in the format 'escaped_host_name:port' for all shards and replicas in a cluster specified in BACKUP ON CLUSTER.
std::vector<Strings> cluster_host_ids;
/// Internal, should not be specified by user.
/// Path in Zookeeper used to coordinate a distributed backup created by BACKUP ON CLUSTER.
String coordination_zk_path;
/// Internal, should not be specified by user.
/// UUID of the backup. If it's not set it will be generated randomly.
std::optional<UUID> backup_uuid;

View File

@ -38,12 +38,12 @@ namespace Stage = BackupCoordinationStage;
namespace
{
std::shared_ptr<IBackupCoordination> makeBackupCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup)
std::shared_ptr<IBackupCoordination> makeBackupCoordination(const String & root_zk_path, const String & backup_uuid, const ContextPtr & context, bool is_internal_backup)
{
if (!coordination_zk_path.empty())
if (!root_zk_path.empty())
{
auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); };
return std::make_shared<BackupCoordinationRemote>(coordination_zk_path, get_zookeeper, is_internal_backup);
return std::make_shared<BackupCoordinationRemote>(root_zk_path, backup_uuid, get_zookeeper, is_internal_backup);
}
else
{
@ -51,12 +51,12 @@ namespace
}
}
std::shared_ptr<IRestoreCoordination> makeRestoreCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup)
std::shared_ptr<IRestoreCoordination> makeRestoreCoordination(const String & root_zk_path, const String & restore_uuid, const ContextPtr & context, bool is_internal_backup)
{
if (!coordination_zk_path.empty())
if (!root_zk_path.empty())
{
auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); };
return std::make_shared<RestoreCoordinationRemote>(coordination_zk_path, get_zookeeper, is_internal_backup);
return std::make_shared<RestoreCoordinationRemote>(root_zk_path, restore_uuid, get_zookeeper, is_internal_backup);
}
else
{
@ -160,13 +160,16 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
else
backup_id = toString(*backup_settings.backup_uuid);
String root_zk_path;
std::shared_ptr<IBackupCoordination> backup_coordination;
if (backup_settings.internal)
{
/// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination
/// if it's not created here. However to handle errors better it's better to make a coordination here because this way
/// if an exception will be thrown in startMakingBackup() other hosts will know about that.
backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal);
root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
backup_coordination = makeBackupCoordination(root_zk_path, toString(*backup_settings.backup_uuid), context, backup_settings.internal);
}
auto backup_info = BackupInfo::fromAST(*backup_query->backup_name);
@ -270,17 +273,13 @@ void BackupsWorker::doBackup(
backup_query->cluster = context->getMacros()->expand(backup_query->cluster);
cluster = context->getCluster(backup_query->cluster);
backup_settings.cluster_host_ids = cluster->getHostIDs();
if (backup_settings.coordination_zk_path.empty())
{
backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(*backup_settings.backup_uuid);
}
}
/// Make a backup coordination.
if (!backup_coordination)
backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal);
backup_coordination = makeBackupCoordination(root_zk_path, toString(*backup_settings.backup_uuid), context, backup_settings.internal);
if (!allow_concurrent_backups && backup_coordination->hasConcurrentBackups(backup_id, root_zk_path, std::ref(num_active_backups)))
if (!allow_concurrent_backups && backup_coordination->hasConcurrentBackups(std::ref(num_active_backups)))
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
/// Opens a backup for writing.
@ -383,6 +382,9 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
if (!restore_settings.restore_uuid)
restore_settings.restore_uuid = UUIDHelpers::generateV4();
/// `restore_id` will be used as a key to the `infos` map, so it should be unique.
OperationID restore_id;
if (restore_settings.internal)
@ -390,7 +392,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
else if (!restore_settings.id.empty())
restore_id = restore_settings.id;
else
restore_id = toString(UUIDHelpers::generateV4());
restore_id = toString(*restore_settings.restore_uuid);
std::shared_ptr<IRestoreCoordination> restore_coordination;
if (restore_settings.internal)
@ -398,7 +400,8 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
/// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination
/// if it's not created here. However to handle errors better it's better to make a coordination here because this way
/// if an exception will be thrown in startRestoring() other hosts will know about that.
restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal);
auto root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
restore_coordination = makeRestoreCoordination(root_zk_path, toString(*restore_settings.restore_uuid), context, restore_settings.internal);
}
try
@ -518,15 +521,10 @@ void BackupsWorker::doRestore(
}
/// Make a restore coordination.
if (on_cluster && restore_settings.coordination_zk_path.empty())
{
restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_id);
}
if (!restore_coordination)
restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal);
restore_coordination = makeRestoreCoordination(root_zk_path, toString(*restore_settings.restore_uuid), context, restore_settings.internal);
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(restore_id, root_zk_path, std::ref(num_active_restores)))
if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores)))
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
/// Do RESTORE.

View File

@ -117,7 +117,7 @@ public:
/// This function is used to check if concurrent backups are running
/// other than the backup passed to the function
virtual bool hasConcurrentBackups(const String & backup_id, const String & common_backups_path, const std::atomic<size_t> & num_active_backups) const = 0;
virtual bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const = 0;
};
}

View File

@ -37,7 +37,7 @@ public:
/// This function is used to check if concurrent restores are running
/// other than the restore passed to the function
virtual bool hasConcurrentRestores(const String & restore_id, const String & common_restores_path, const std::atomic<size_t> & num_active_restores) const = 0;
virtual bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const = 0;
};
}

View File

@ -42,7 +42,7 @@ bool RestoreCoordinationLocal::acquireReplicatedAccessStorage(const String &)
return true;
}
bool RestoreCoordinationLocal::hasConcurrentRestores(const String &, const String &, const std::atomic<size_t> & num_active_restores) const
bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
{
return (num_active_restores > 1);
}

View File

@ -35,7 +35,7 @@ public:
/// The function returns false if this access storage is being already restored by another replica.
bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override;
bool hasConcurrentRestores(const String & restore_id, const String & common_restores_path, const std::atomic<size_t> & num_active_restores) const override;
bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
private:
std::set<std::pair<String /* database_zk_path */, String /* table_name */>> acquired_tables_in_replicated_databases;

View File

@ -10,15 +10,17 @@ namespace DB
namespace Stage = BackupCoordinationStage;
RestoreCoordinationRemote::RestoreCoordinationRemote(
const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_)
: zookeeper_path(zookeeper_path_)
const String & root_zookeeper_path_, const String & restore_uuid_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_)
: root_zookeeper_path(root_zookeeper_path_)
, zookeeper_path(root_zookeeper_path_ + "/restore-" + restore_uuid_)
, restore_uuid(restore_uuid_)
, get_zookeeper(get_zookeeper_)
, is_internal(is_internal_)
{
createRootNodes();
stage_sync.emplace(
zookeeper_path_ + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("RestoreCoordination"));
zookeeper_path + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("RestoreCoordination"));
}
RestoreCoordinationRemote::~RestoreCoordinationRemote()
@ -132,36 +134,36 @@ void RestoreCoordinationRemote::removeAllNodes()
zk->removeRecursive(zookeeper_path);
}
bool RestoreCoordinationRemote::hasConcurrentRestores(const String & restore_id, const String & common_restores_path, const std::atomic<size_t> &) const
bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic<size_t> &) const
{
/// If its internal concurrency will be checked for the base restore
if (is_internal)
return false;
auto zk = getZooKeeper();
std::string path = common_restores_path + "/restore-" + toString(restore_id) +"/stage";
std::string path = zookeeper_path +"/stage";
if (! zk->exists(common_restores_path))
zk->createAncestors(common_restores_path);
if (! zk->exists(root_zookeeper_path))
zk->createAncestors(root_zookeeper_path);
for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
{
Coordination::Stat stat;
zk->get(common_restores_path, &stat);
Strings existing_restore_paths = zk->getChildren(common_restores_path);
zk->get(root_zookeeper_path, &stat);
Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
for (const auto & existing_restore_path : existing_restore_paths)
{
if (startsWith(existing_restore_path, "backup-"))
continue;
String existing_restore_id = existing_restore_path;
existing_restore_id.erase(0, String("restore-").size());
String existing_restore_uuid = existing_restore_path;
existing_restore_uuid.erase(0, String("restore-").size());
if (existing_restore_id == toString(restore_id))
if (existing_restore_uuid == toString(restore_uuid))
continue;
const auto status = zk->get(common_restores_path + "/" + existing_restore_path + "/stage");
const auto status = zk->get(root_zookeeper_path + "/" + existing_restore_path + "/stage");
if (status != Stage::COMPLETED)
return true;
}

View File

@ -11,7 +11,7 @@ namespace DB
class RestoreCoordinationRemote : public IRestoreCoordination
{
public:
RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_);
RestoreCoordinationRemote(const String & root_zookeeper_path_, const String & restore_uuid_, zkutil::GetZooKeeper get_zookeeper_, bool is_internal_);
~RestoreCoordinationRemote() override;
/// Sets the current stage and waits for other hosts to come to this stage too.
@ -31,7 +31,7 @@ public:
/// The function returns false if this access storage is being already restored by another replica.
bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override;
bool hasConcurrentRestores(const String & restore_id, const String & common_restores_path, const std::atomic<size_t> & num_active_restores) const override;
bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const override;
private:
zkutil::ZooKeeperPtr getZooKeeper() const;
@ -40,7 +40,9 @@ private:
class ReplicatedDatabasesMetadataSync;
const String root_zookeeper_path;
const String zookeeper_path;
const String restore_uuid;
const zkutil::GetZooKeeper get_zookeeper;
const bool is_internal;

View File

@ -163,7 +163,7 @@ namespace
M(RestoreUDFCreationMode, create_function) \
M(Bool, internal) \
M(String, host_id) \
M(String, coordination_zk_path)
M(OptionalUUID, restore_uuid)
RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query)

View File

@ -119,8 +119,9 @@ struct RestoreSettings
std::vector<Strings> cluster_host_ids;
/// Internal, should not be specified by user.
/// Path in Zookeeper used to coordinate restoring process while executing by RESTORE ON CLUSTER.
String coordination_zk_path;
/// UUID of the restore. If it's not set it will be generated randomly.
/// This is used to generate coordination path and for concurrency check
std::optional<UUID> restore_uuid;
static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query);
void copySettingsToQuery(ASTBackupQuery & query) const;