Merge pull request #45072 from ClickHouse/43891_Disallow_concurrent_backups_and_restores

Added settings to disallow concurrent backups and restores
This commit is contained in:
SmitaRKulkarni 2023-01-20 09:17:20 +01:00 committed by GitHub
commit 6aa63414db
10 changed files with 375 additions and 48 deletions
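The new behavior is controlled by two server-level configuration keys, allow_concurrent_backups and allow_concurrent_restores, both defaulting to true (see the Context.cpp change below). As a minimal sketch, a config override that disables concurrent operations could look like this (the file name is illustrative, not part of this PR):

<!-- e.g. config.d/disallow_concurrency.xml (path chosen for illustration) -->
<clickhouse>
    <!-- Reject a new BACKUP while another backup is still running. -->
    <allow_concurrent_backups>false</allow_concurrent_backups>
    <!-- Reject a new RESTORE while another restore is still running. -->
    <allow_concurrent_restores>false</allow_concurrent_restores>
</clickhouse>

With concurrency disabled, a second BACKUP or RESTORE issued while one is already active fails with CONCURRENT_ACCESS_NOT_SUPPORTED ("Concurrent backups not supported, turn on setting 'allow_concurrent_backups'", and the analogous message for restores).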

View File

@@ -6,7 +6,7 @@
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTLiteral.h>
#include <IO/ReadHelpers.h>
#include <Backups/SettingsFieldOptionalUUID.h>
namespace DB
{
@@ -16,48 +16,6 @@ namespace ErrorCodes
extern const int WRONG_BACKUP_SETTINGS;
}
namespace
{
struct SettingFieldOptionalUUID
{
std::optional<UUID> value;
explicit SettingFieldOptionalUUID(const std::optional<UUID> & value_) : value(value_) {}
explicit SettingFieldOptionalUUID(const Field & field)
{
if (field.getType() == Field::Types::Null)
{
value = std::nullopt;
return;
}
if (field.getType() == Field::Types::String)
{
const String & str = field.get<const String &>();
if (str.empty())
{
value = std::nullopt;
return;
}
UUID id;
if (tryParse(id, str))
{
value = id;
return;
}
}
throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot parse uuid from {}", field);
}
explicit operator Field() const { return Field(value ? toString(*value) : ""); }
};
}
/// List of backup settings except base_backup_name and cluster_host_ids.
#define LIST_OF_BACKUP_SETTINGS(M) \
M(String, id) \

View File

@@ -30,6 +30,7 @@ namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
extern const int CONCURRENT_ACCESS_NOT_SUPPORTED;
}
using OperationID = BackupsWorker::OperationID;
@@ -121,10 +122,12 @@ namespace
}
-BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threads)
+BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_)
: backups_thread_pool(num_backup_threads, /* max_free_threads = */ 0, num_backup_threads)
, restores_thread_pool(num_restore_threads, /* max_free_threads = */ 0, num_restore_threads)
, log(&Poco::Logger::get("BackupsWorker"))
, allow_concurrent_backups(allow_concurrent_backups_)
, allow_concurrent_restores(allow_concurrent_restores_)
{
/// We set max_free_threads = 0 because we don't want to keep any threads if there is no BACKUP or RESTORE query running right now.
}
@@ -157,6 +160,16 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
else
backup_id = toString(*backup_settings.backup_uuid);
/// Check if there are no concurrent backups
if (num_active_backups && !allow_concurrent_backups)
{
/// If it's an internal backup and we currently have 1 active backup, it could be the original query; validate using backup_uuid
if (!(num_active_backups == 1 && backup_settings.internal && getAllActiveBackupInfos().at(0).id == toString(*backup_settings.backup_uuid)))
{
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
}
}
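Restated as a minimal, self-contained sketch (the function name and plain std types are illustrative, not the PR's API): the guard above rejects a new backup unless concurrency is allowed, nothing is active, or the single active backup is the original query that spawned this internal one, matched by backup_uuid. The restore path below applies the same rule symmetrically.

#include <cassert>
#include <optional>
#include <string>
#include <vector>

/// Illustrative restatement of the concurrency guard added above.
bool shouldRejectBackup(
    bool allow_concurrent_backups,
    bool is_internal,
    const std::optional<std::string> & backup_uuid,
    const std::vector<std::string> & active_backup_ids)
{
    if (allow_concurrent_backups || active_backup_ids.empty())
        return false;

    /// An internal backup may coexist with exactly one active entry: the original query.
    const bool is_original_query = active_backup_ids.size() == 1 && is_internal
        && backup_uuid && active_backup_ids.front() == *backup_uuid;

    return !is_original_query;
}

int main()
{
    /// Internal backup whose UUID matches the single active backup: allowed.
    assert(!shouldRejectBackup(false, true, std::string("uuid-1"), {"uuid-1"}));
    /// Unrelated second backup while concurrency is disabled: rejected.
    assert(shouldRejectBackup(false, false, std::nullopt, {"uuid-1"}));
}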
std::shared_ptr<IBackupCoordination> backup_coordination;
if (backup_settings.internal)
{
@@ -370,6 +383,9 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
if (!restore_settings.backup_uuid)
restore_settings.backup_uuid = UUIDHelpers::generateV4();
/// `restore_id` will be used as a key to the `infos` map, so it should be unique.
OperationID restore_id;
if (restore_settings.internal)
@@ -377,7 +393,17 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
else if (!restore_settings.id.empty())
restore_id = restore_settings.id;
else
-restore_id = toString(UUIDHelpers::generateV4());
+restore_id = toString(*restore_settings.backup_uuid);
/// Check if there are no concurrent restores
if (num_active_restores && !allow_concurrent_restores)
{
/// If it's an internal restore and we currently have 1 active restore, it could be the original query; validate using backup_uuid
if (!(num_active_restores == 1 && restore_settings.internal && getAllActiveRestoreInfos().at(0).id == toString(*restore_settings.backup_uuid)))
{
throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
}
}
std::shared_ptr<IRestoreCoordination> restore_coordination;
if (restore_settings.internal)
@@ -471,6 +497,7 @@ void BackupsWorker::doRestore(
backup_open_params.context = context;
backup_open_params.backup_info = backup_info;
backup_open_params.base_backup_info = restore_settings.base_backup_info;
backup_open_params.backup_uuid = restore_settings.backup_uuid;
backup_open_params.password = restore_settings.password;
BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);
@@ -687,6 +714,30 @@ std::vector<BackupsWorker::Info> BackupsWorker::getAllInfos() const
return res_infos;
}
std::vector<BackupsWorker::Info> BackupsWorker::getAllActiveBackupInfos() const
{
std::vector<Info> res_infos;
std::lock_guard lock{infos_mutex};
for (const auto & info : infos | boost::adaptors::map_values)
{
if (info.status == BackupStatus::CREATING_BACKUP)
res_infos.push_back(info);
}
return res_infos;
}
std::vector<BackupsWorker::Info> BackupsWorker::getAllActiveRestoreInfos() const
{
std::vector<Info> res_infos;
std::lock_guard lock{infos_mutex};
for (const auto & info : infos | boost::adaptors::map_values)
{
if (info.status == BackupStatus::RESTORING)
res_infos.push_back(info);
}
return res_infos;
}
void BackupsWorker::shutdown()
{
bool has_active_backups_and_restores = (num_active_backups || num_active_restores);

View File

@@ -23,7 +23,7 @@ class IRestoreCoordination;
class BackupsWorker
{
public:
-BackupsWorker(size_t num_backup_threads, size_t num_restore_threads);
+BackupsWorker(size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_);
/// Waits until all tasks have been completed.
void shutdown();
@@ -103,6 +103,8 @@ private:
void setStatus(const OperationID & id, BackupStatus status, bool throw_if_error = true);
void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size);
std::vector<Info> getAllActiveBackupInfos() const;
std::vector<Info> getAllActiveRestoreInfos() const;
ThreadPool backups_thread_pool;
ThreadPool restores_thread_pool;
@@ -113,6 +115,8 @@ private:
std::atomic<size_t> num_active_restores = 0;
mutable std::mutex infos_mutex;
Poco::Logger * log;
const bool allow_concurrent_backups;
const bool allow_concurrent_restores;
};
}

View File

@@ -7,6 +7,7 @@
#include <Parsers/ASTSetQuery.h>
#include <boost/algorithm/string/predicate.hpp>
#include <Common/FieldVisitorConvertToNumber.h>
#include <Backups/SettingsFieldOptionalUUID.h>
namespace DB
@@ -162,7 +163,9 @@ namespace
M(RestoreUDFCreationMode, create_function) \
M(Bool, internal) \
M(String, host_id) \
-M(String, coordination_zk_path)
+M(String, coordination_zk_path) \
M(OptionalUUID, backup_uuid)
RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query)
{

View File

@@ -122,6 +122,11 @@ struct RestoreSettings
/// Path in Zookeeper used to coordinate restoring process while executing by RESTORE ON CLUSTER.
String coordination_zk_path;
/// Internal, should not be specified by user.
/// UUID of the backup. If it's not set it will be generated randomly.
/// This is used to validate internal restores when allow_concurrent_restores is turned off
std::optional<UUID> backup_uuid;
static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query);
void copySettingsToQuery(ASTBackupQuery & query) const;
};

View File

@@ -0,0 +1,43 @@
#include <Backups/SettingsFieldOptionalUUID.h>
#include <Common/ErrorCodes.h>
#include <Core/SettingsFields.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_PARSE_BACKUP_SETTINGS;
}
SettingFieldOptionalUUID::SettingFieldOptionalUUID(const Field & field)
{
if (field.getType() == Field::Types::Null)
{
value = std::nullopt;
return;
}
if (field.getType() == Field::Types::String)
{
const String & str = field.get<const String &>();
if (str.empty())
{
value = std::nullopt;
return;
}
UUID id;
if (tryParse(id, str))
{
value = id;
return;
}
}
throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot parse uuid from {}", field);
}
}

View File

@@ -0,0 +1,18 @@
#pragma once
#include <optional>
#include <Core/SettingsFields.h>
namespace DB
{
struct SettingFieldOptionalUUID
{
std::optional<UUID> value;
explicit SettingFieldOptionalUUID(const std::optional<UUID> & value_) : value(value_) {}
explicit SettingFieldOptionalUUID(const Field & field);
explicit operator Field() const { return Field(value ? toString(*value) : ""); }
};
}
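A small usage sketch for the new field type (a hypothetical standalone program that assumes the ClickHouse headers shown above; it is not part of this PR): a Null Field or an empty string yields std::nullopt, a valid UUID string round-trips through Field, and anything else throws CANNOT_PARSE_BACKUP_SETTINGS.

#include <Backups/SettingsFieldOptionalUUID.h>
#include <Common/Exception.h>
#include <Core/Field.h>
#include <iostream>

using namespace DB;

int main()
{
    /// Empty string (or a Null Field) means "no UUID was supplied".
    SettingFieldOptionalUUID empty{Field("")};
    std::cout << empty.value.has_value() << '\n';   // prints 0

    /// A valid UUID string is parsed and round-trips back through Field.
    SettingFieldOptionalUUID parsed{Field("61f0c404-5cb3-11e7-907b-a6006ad3dba0")};
    std::cout << Field(parsed).get<const String &>() << '\n';

    /// Any other value is rejected.
    try
    {
        SettingFieldOptionalUUID bad{Field("not-a-uuid")};
    }
    catch (const Exception & e)
    {
        std::cout << e.message() << '\n';   // Cannot parse uuid from ...
    }
}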

View File

@@ -1888,8 +1888,11 @@ BackupsWorker & Context::getBackupsWorker() const
{
auto lock = getLock();
const bool allow_concurrent_backups = this->getConfigRef().getBool("allow_concurrent_backups", true);
const bool allow_concurrent_restores = this->getConfigRef().getBool("allow_concurrent_restores", true);
if (!shared->backups_worker)
-shared->backups_worker.emplace(getSettingsRef().backup_threads, getSettingsRef().restore_threads);
+shared->backups_worker.emplace(getSettingsRef().backup_threads, getSettingsRef().restore_threads, allow_concurrent_backups, allow_concurrent_restores);
return *shared->backups_worker;
}

View File

@@ -0,0 +1,15 @@
<clickhouse>
<storage_configuration>
<disks>
<backups>
<type>local</type>
<path>/backups/</path>
</backups>
</disks>
</storage_configuration>
<backups>
<allowed_disk>backups</allowed_disk>
</backups>
<allow_concurrent_backups>false</allow_concurrent_backups>
<allow_concurrent_restores>false</allow_concurrent_restores>
</clickhouse>

View File

@@ -0,0 +1,227 @@
from random import randint
import pytest
import os.path
import time
import concurrent
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV, assert_eq_with_retry
cluster = ClickHouseCluster(__file__)
num_nodes = 10
def generate_cluster_def():
path = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"./_gen/cluster_for_concurrency_test.xml",
)
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "w") as f:
f.write(
"""
<clickhouse>
<remote_servers>
<cluster>
<shard>
"""
)
for i in range(num_nodes):
f.write(
"""
<replica>
<host>node"""
+ str(i)
+ """</host>
<port>9000</port>
</replica>
"""
)
f.write(
"""
</shard>
</cluster>
</remote_servers>
</clickhouse>
"""
)
return path
main_configs = ["configs/disallow_concurrency.xml", generate_cluster_def()]
user_configs = ["configs/allow_database_types.xml"]
nodes = []
for i in range(num_nodes):
nodes.append(
cluster.add_instance(
f"node{i}",
main_configs=main_configs,
user_configs=user_configs,
external_dirs=["/backups/"],
macros={"replica": f"node{i}", "shard": "shard1"},
with_zookeeper=True,
)
)
node0 = nodes[0]
@pytest.fixture(scope="module", autouse=True)
def start_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
@pytest.fixture(autouse=True)
def drop_after_test():
try:
yield
finally:
node0.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' NO DELAY")
node0.query("DROP DATABASE IF EXISTS mydb ON CLUSTER 'cluster' NO DELAY")
backup_id_counter = 0
def new_backup_name():
global backup_id_counter
backup_id_counter += 1
return f"Disk('backups', '{backup_id_counter}')"
def create_and_fill_table():
node0.query(
"CREATE TABLE tbl ON CLUSTER 'cluster' ("
"x UInt64"
") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')"
"ORDER BY x"
)
for i in range(num_nodes):
nodes[i].query(f"INSERT INTO tbl SELECT number FROM numbers(40000000)")
# All the tests run concurrent backups/restores with the same backup name.
# The same behavior holds with different backup names too; since the concurrency
# check comes before the backup name check, separate tests are not added for different names.
def test_concurrent_backups_on_same_node():
create_and_fill_table()
backup_name = new_backup_name()
id = (
nodes[0]
.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} ASYNC")
.split("\t")[0]
)
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'CREATING_BACKUP' AND id = '{id}'",
"CREATING_BACKUP",
)
assert "Concurrent backups not supported" in nodes[0].query_and_get_error(
f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}"
)
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'BACKUP_CREATED' AND id = '{id}'",
"BACKUP_CREATED",
)
# This restore part is added to confirm that internal backup & restore still work
# even after a concurrent backup has been rejected
nodes[0].query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY")
nodes[0].query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}")
nodes[0].query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl")
def test_concurrent_backups_on_different_nodes():
create_and_fill_table()
backup_name = new_backup_name()
nodes[1].query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} ASYNC")
assert_eq_with_retry(
nodes[1],
f"SELECT status FROM system.backups WHERE status == 'CREATING_BACKUP'",
"CREATING_BACKUP",
)
assert "Concurrent backups not supported" in nodes[2].query_and_get_error(
f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}"
)
def test_concurrent_restores_on_same_node():
create_and_fill_table()
backup_name = new_backup_name()
id = (
nodes[0]
.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} ASYNC")
.split("\t")[0]
)
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'CREATING_BACKUP' AND id = '{id}'",
"CREATING_BACKUP",
)
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'BACKUP_CREATED' AND id = '{id}'",
"BACKUP_CREATED",
)
nodes[0].query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY")
nodes[0].query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} ASYNC")
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'RESTORING'",
"RESTORING",
)
assert "Concurrent restores not supported" in nodes[0].query_and_get_error(
f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}"
)
def test_concurrent_restores_on_different_node():
create_and_fill_table()
backup_name = new_backup_name()
id = (
nodes[0]
.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} ASYNC")
.split("\t")[0]
)
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'CREATING_BACKUP' AND id = '{id}'",
"CREATING_BACKUP",
)
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'BACKUP_CREATED' AND id = '{id}'",
"BACKUP_CREATED",
)
nodes[0].query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY")
nodes[0].query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} ASYNC")
assert_eq_with_retry(
nodes[0],
f"SELECT status FROM system.backups WHERE status == 'RESTORING'",
"RESTORING",
)
assert "Concurrent restores not supported" in nodes[1].query_and_get_error(
f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}"
)