From 49c95a535ab5982a03b3dea731692893ed559806 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 12 Apr 2023 20:26:57 +0200 Subject: [PATCH 001/127] Updated to add error or completed status in zookeeper for a cluster for backup/restore, to avoid interpreting previously failed backup/restore when zookeeper is unable to remove nodes --- src/Backups/BackupCoordinationLocal.cpp | 2 +- src/Backups/BackupCoordinationLocal.h | 2 +- src/Backups/BackupCoordinationRemote.cpp | 12 ++++++---- src/Backups/BackupCoordinationRemote.h | 2 +- src/Backups/BackupCoordinationStage.h | 4 ++++ src/Backups/BackupCoordinationStageSync.cpp | 14 ++++++++++++ src/Backups/BackupCoordinationStageSync.h | 1 + src/Backups/BackupsWorker.cpp | 25 +++++++++++++-------- src/Backups/IBackupCoordination.h | 2 +- src/Backups/IRestoreCoordination.h | 2 +- src/Backups/RestoreCoordinationLocal.cpp | 2 +- src/Backups/RestoreCoordinationLocal.h | 2 +- src/Backups/RestoreCoordinationRemote.cpp | 12 ++++++---- src/Backups/RestoreCoordinationRemote.h | 2 +- 14 files changed, 59 insertions(+), 25 deletions(-) diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index 27e0f173cf3..47b67693114 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -15,7 +15,7 @@ BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_) BackupCoordinationLocal::~BackupCoordinationLocal() = default; -void BackupCoordinationLocal::setStage(const String &, const String &) +void BackupCoordinationLocal::setStage(const String &, const String &, const bool &) { } diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index 60fcc014720..1f6bb84972e 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -22,7 +22,7 @@ public: BackupCoordinationLocal(bool plain_backup_); ~BackupCoordinationLocal() override; - void setStage(const String & new_stage, const String & message) override; + void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index 8e6b5db91b1..48f1ce3eef7 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -252,13 +252,17 @@ void BackupCoordinationRemote::removeAllNodes() } -void BackupCoordinationRemote::setStage(const String & new_stage, const String & message) +void BackupCoordinationRemote::setStage(const String & new_stage, const String & message, const bool & for_cluster) { - stage_sync->set(current_host, new_stage, message); + if (for_cluster) + stage_sync->setStageForCluster(new_stage); + else + stage_sync->set(current_host, new_stage, message); } void BackupCoordinationRemote::setError(const Exception & exception) { + stage_sync->setStageForCluster(Stage::ERROR); stage_sync->setError(current_host, exception); } @@ -777,8 +781,8 @@ bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic &) String status; if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status)) { - /// If status is not COMPLETED it could be because the backup failed, check if 'error' exists - if (status != Stage::COMPLETED && 
!zk->exists(root_zookeeper_path + "/" + existing_backup_path + "/error")) + /// Check if some other restore is in progress + if (status == Stage::SCHEDULED_TO_START) { LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid)); result = true; diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index 949dd9c9bf0..40ce2ae6ccc 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -33,7 +33,7 @@ public: ~BackupCoordinationRemote() override; - void setStage(const String & new_stage, const String & message) override; + void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupCoordinationStage.h b/src/Backups/BackupCoordinationStage.h index 40a4b262caa..41cd66346a2 100644 --- a/src/Backups/BackupCoordinationStage.h +++ b/src/Backups/BackupCoordinationStage.h @@ -43,6 +43,10 @@ namespace BackupCoordinationStage /// Coordination stage meaning that a host finished its work. constexpr const char * COMPLETED = "completed"; + + /// Coordination stage meaning that backup/restore has failed due to an error + /// Check '/error' for the error message + constexpr const char * ERROR = "error"; } } diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index effb00085c3..5cbeec0ec76 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -61,6 +61,20 @@ void BackupCoordinationStageSync::set(const String & current_host, const String }); } +void BackupCoordinationStageSync::setStageForCluster(const String & new_stage) +{ + auto holder = with_retries.createRetriesControlHolder("setStageForCluster"); + holder.retries_ctl.retryLoop( + [&, &zookeeper = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zookeeper); + zookeeper->trySet(zookeeper_path, new_stage); + auto code = zookeeper->trySet(zookeeper_path, new_stage); + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException(code, zookeeper_path); + }); +} + void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception) { auto holder = with_retries.createRetriesControlHolder("setError"); diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h index 56081f8779c..9dde4e3095f 100644 --- a/src/Backups/BackupCoordinationStageSync.h +++ b/src/Backups/BackupCoordinationStageSync.h @@ -16,6 +16,7 @@ public: /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that. void set(const String & current_host, const String & new_stage, const String & message); + void setStageForCluster(const String & new_stage); void setError(const String & current_host, const Exception & exception); /// Sets the stage of the current host and waits until all hosts come to the same stage. diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 4b17174a8de..aae9cfd620f 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -368,6 +368,7 @@ void BackupsWorker::doBackup( /// Wait until all the hosts have written their backup entries. 
backup_coordination->waitForStage(Stage::COMPLETED); + backup_coordination->setStage(Stage::COMPLETED, /* message */ "", /* for_cluster */ true); } else { @@ -654,12 +655,26 @@ void BackupsWorker::doRestore( /// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.) ClusterPtr cluster; bool on_cluster = !restore_query->cluster.empty(); + if (on_cluster) { restore_query->cluster = context->getMacros()->expand(restore_query->cluster); cluster = context->getCluster(restore_query->cluster); restore_settings.cluster_host_ids = cluster->getHostIDs(); + } + /// Make a restore coordination. + if (!restore_coordination) + restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster); + + if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores))) + throw Exception( + ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, + "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'"); + + + if (on_cluster) + { /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect /// because different replicas can contain different set of tables and so the required access rights can differ too. /// So the right way is pass through the entire cluster and check access for each host. @@ -676,15 +691,6 @@ void BackupsWorker::doRestore( } } - /// Make a restore coordination. - if (!restore_coordination) - restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster); - - if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores))) - throw Exception( - ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, - "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'"); - /// Do RESTORE. if (on_cluster) { @@ -703,6 +709,7 @@ void BackupsWorker::doRestore( /// Wait until all the hosts have written their backup entries. restore_coordination->waitForStage(Stage::COMPLETED); + restore_coordination->setStage(Stage::COMPLETED, /* message */ "", /* for_cluster */ true); } else { diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 75d9202374b..614e6a16db8 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -21,7 +21,7 @@ public: virtual ~IBackupCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. - virtual void setStage(const String & new_stage, const String & message) = 0; + virtual void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h index 2f9e8d171f6..599a698a1f9 100644 --- a/src/Backups/IRestoreCoordination.h +++ b/src/Backups/IRestoreCoordination.h @@ -18,7 +18,7 @@ public: virtual ~IRestoreCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. 
- virtual void setStage(const String & new_stage, const String & message) = 0; + virtual void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp index 068c4fe7e52..f689277f5b6 100644 --- a/src/Backups/RestoreCoordinationLocal.cpp +++ b/src/Backups/RestoreCoordinationLocal.cpp @@ -11,7 +11,7 @@ RestoreCoordinationLocal::RestoreCoordinationLocal() : log(&Poco::Logger::get("R RestoreCoordinationLocal::~RestoreCoordinationLocal() = default; -void RestoreCoordinationLocal::setStage(const String &, const String &) +void RestoreCoordinationLocal::setStage(const String &, const String &, const bool &) { } diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index e27f0d1ef88..4456ad966d4 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -19,7 +19,7 @@ public: ~RestoreCoordinationLocal() override; /// Sets the current stage and waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message) override; + void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index cc03f0c4a2a..0a89b1cd4e7 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -90,13 +90,17 @@ void RestoreCoordinationRemote::createRootNodes() }); } -void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message) +void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message, const bool & for_cluster) { - stage_sync->set(current_host, new_stage, message); + if (for_cluster) + stage_sync->setStageForCluster(new_stage); + else + stage_sync->set(current_host, new_stage, message); } void RestoreCoordinationRemote::setError(const Exception & exception) { + stage_sync->setStageForCluster(Stage::ERROR); stage_sync->setError(current_host, exception); } @@ -282,8 +286,8 @@ bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic String status; if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status)) { - /// If status is not COMPLETED it could be because the restore failed, check if 'error' exists - if (status != Stage::COMPLETED && !zk->exists(root_zookeeper_path + "/" + existing_restore_path + "/error")) + /// Check if some other restore is in progress + if (status == Stage::SCHEDULED_TO_START) { LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid)); result = true; diff --git a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationRemote.h index eb0fcff9c2d..21a38f01fa6 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationRemote.h @@ -26,7 +26,7 @@ public: ~RestoreCoordinationRemote() override; /// Sets the current stage and 
waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message) override; + void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; From d4b2297e9fa53336cd4d05919a1048ad742018cd Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 13 Apr 2023 09:53:39 +0200 Subject: [PATCH 002/127] Fixed comment --- src/Backups/BackupCoordinationRemote.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index 48f1ce3eef7..cd4901eb5ae 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -781,7 +781,7 @@ bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic &) String status; if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status)) { - /// Check if some other restore is in progress + /// Check if some other backup is in progress if (status == Stage::SCHEDULED_TO_START) { LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid)); From 74c6ca558b3301427368941f3e0df031b04cc10d Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 14 Apr 2023 18:03:46 +0200 Subject: [PATCH 003/127] Removed line from test_disallow_concurrrency for CI checks --- .../test_backup_restore_on_cluster/test_disallow_concurrency.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py index 0d8fad96438..a76af00d339 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py @@ -6,7 +6,6 @@ import concurrent from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV, assert_eq_with_retry - cluster = ClickHouseCluster(__file__) num_nodes = 10 From 93572ab42768195fccc77a809e149355a3f8065d Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 15 Apr 2023 13:43:04 +0200 Subject: [PATCH 004/127] Removed parameter from setStage function and added function setStageForCluster --- src/Backups/BackupCoordinationLocal.cpp | 6 +++++- src/Backups/BackupCoordinationLocal.h | 3 ++- src/Backups/BackupCoordinationRemote.cpp | 12 +++++++----- src/Backups/BackupCoordinationRemote.h | 3 ++- src/Backups/BackupsWorker.cpp | 4 ++-- src/Backups/IBackupCoordination.h | 3 ++- src/Backups/IRestoreCoordination.h | 3 ++- src/Backups/RestoreCoordinationLocal.cpp | 6 +++++- src/Backups/RestoreCoordinationLocal.h | 3 ++- src/Backups/RestoreCoordinationRemote.cpp | 12 +++++++----- src/Backups/RestoreCoordinationRemote.h | 3 ++- 11 files changed, 38 insertions(+), 20 deletions(-) diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index 47b67693114..5b7ee37618b 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -15,7 +15,11 @@ BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_) BackupCoordinationLocal::~BackupCoordinationLocal() = default; -void BackupCoordinationLocal::setStage(const String &, const String &, 
const bool &) +void BackupCoordinationLocal::setStage(const String &, const String &) +{ +} + +void BackupCoordinationLocal::setStageForCluster(const String &) { } diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index 1f6bb84972e..f1ffa8e8517 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -22,7 +22,8 @@ public: BackupCoordinationLocal(bool plain_backup_); ~BackupCoordinationLocal() override; - void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; + void setStage(const String & new_stage, const String & message) override; + void setStageForCluster(const String & new_stage) override; /// Sets stage for cluster void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index cd4901eb5ae..c5c4efa3530 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -252,12 +252,14 @@ void BackupCoordinationRemote::removeAllNodes() } -void BackupCoordinationRemote::setStage(const String & new_stage, const String & message, const bool & for_cluster) +void BackupCoordinationRemote::setStage(const String & new_stage, const String & message) { - if (for_cluster) - stage_sync->setStageForCluster(new_stage); - else - stage_sync->set(current_host, new_stage, message); + stage_sync->set(current_host, new_stage, message); +} + +void BackupCoordinationRemote::setStageForCluster(const String & new_stage) +{ + stage_sync->setStageForCluster(new_stage); } void BackupCoordinationRemote::setError(const Exception & exception) diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index 40ce2ae6ccc..c659cb0d459 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -33,7 +33,8 @@ public: ~BackupCoordinationRemote() override; - void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; + void setStage(const String & new_stage, const String & message) override; + void setStageForCluster(const String & new_stage) override; /// Sets stage for cluster void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index aae9cfd620f..de05cc2b092 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -368,7 +368,7 @@ void BackupsWorker::doBackup( /// Wait until all the hosts have written their backup entries. backup_coordination->waitForStage(Stage::COMPLETED); - backup_coordination->setStage(Stage::COMPLETED, /* message */ "", /* for_cluster */ true); + backup_coordination->setStageForCluster(Stage::COMPLETED); } else { @@ -709,7 +709,7 @@ void BackupsWorker::doRestore( /// Wait until all the hosts have written their backup entries. 
restore_coordination->waitForStage(Stage::COMPLETED); - restore_coordination->setStage(Stage::COMPLETED, /* message */ "", /* for_cluster */ true); + restore_coordination->setStageForCluster(Stage::COMPLETED); } else { diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 614e6a16db8..6caae1dd741 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -21,7 +21,8 @@ public: virtual ~IBackupCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. - virtual void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) = 0; + virtual void setStage(const String & new_stage, const String & message) = 0; + virtual void setStageForCluster(const String & new_stage) = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h index 599a698a1f9..a5c8db84c86 100644 --- a/src/Backups/IRestoreCoordination.h +++ b/src/Backups/IRestoreCoordination.h @@ -18,7 +18,8 @@ public: virtual ~IRestoreCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. - virtual void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) = 0; + virtual void setStage(const String & new_stage, const String & message) = 0; + virtual void setStageForCluster(const String & new_stage) = 0; /// Sets stage for cluster virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp index f689277f5b6..513204c931c 100644 --- a/src/Backups/RestoreCoordinationLocal.cpp +++ b/src/Backups/RestoreCoordinationLocal.cpp @@ -11,7 +11,11 @@ RestoreCoordinationLocal::RestoreCoordinationLocal() : log(&Poco::Logger::get("R RestoreCoordinationLocal::~RestoreCoordinationLocal() = default; -void RestoreCoordinationLocal::setStage(const String &, const String &, const bool &) +void RestoreCoordinationLocal::setStage(const String &, const String &) +{ +} + +void RestoreCoordinationLocal::setStageForCluster(const String &) { } diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index 4456ad966d4..0e4f4f01846 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -19,7 +19,8 @@ public: ~RestoreCoordinationLocal() override; /// Sets the current stage and waits for other hosts to come to this stage too. 
- void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; + void setStage(const String & new_stage, const String & message) override; + void setStageForCluster(const String & new_stage) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index 0a89b1cd4e7..2c2187a1eb5 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -90,12 +90,14 @@ void RestoreCoordinationRemote::createRootNodes() }); } -void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message, const bool & for_cluster) +void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message) { - if (for_cluster) - stage_sync->setStageForCluster(new_stage); - else - stage_sync->set(current_host, new_stage, message); + stage_sync->set(current_host, new_stage, message); +} + +void RestoreCoordinationRemote::setStageForCluster(const String & new_stage) +{ + stage_sync->setStageForCluster(new_stage); } void RestoreCoordinationRemote::setError(const Exception & exception) diff --git a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationRemote.h index 21a38f01fa6..947d08a66e5 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationRemote.h @@ -26,7 +26,8 @@ public: ~RestoreCoordinationRemote() override; /// Sets the current stage and waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message, const bool & for_cluster = false) override; + void setStage(const String & new_stage, const String & message) override; + void setStageForCluster(const String & new_stage) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; From 913b63edc93c80f8bfaedfe2332859fc5dab83d3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 3 May 2023 21:28:33 +0200 Subject: [PATCH 005/127] Fix another zero copy bug --- src/Storages/MergeTree/MergeTreeData.cpp | 4 +-- src/Storages/MergeTree/ZeroCopyLock.cpp | 2 +- src/Storages/MergeTree/ZeroCopyLock.h | 2 ++ src/Storages/StorageReplicatedMergeTree.cpp | 39 ++++++++++++++++++--- 4 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index fd98db7962e..e4181a5f9de 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7202,8 +7202,8 @@ std::pair MergeTreeData::cloneAn copy_instead_of_hardlink, files_to_copy_instead_of_hardlinks); - LOG_DEBUG(log, "Clone {} part {} to {}{}", - src_flushed_tmp_part ? "flushed" : "", + LOG_DEBUG(log, "Clone{} part {} to {}{}", + src_flushed_tmp_part ? 
" flushed" : "", src_part_storage->getFullPath(), std::string(fs::path(dst_part_storage->getFullRootPath()) / tmp_dst_part_name), with_copy); diff --git a/src/Storages/MergeTree/ZeroCopyLock.cpp b/src/Storages/MergeTree/ZeroCopyLock.cpp index 53dfe0c769f..cca005dd7c0 100644 --- a/src/Storages/MergeTree/ZeroCopyLock.cpp +++ b/src/Storages/MergeTree/ZeroCopyLock.cpp @@ -3,7 +3,7 @@ namespace DB { ZeroCopyLock::ZeroCopyLock(const zkutil::ZooKeeperPtr & zookeeper, const std::string & lock_path, const std::string & lock_message) - : lock(zkutil::createSimpleZooKeeperLock(zookeeper, lock_path, "part_exclusive_lock", lock_message)) + : lock(zkutil::createSimpleZooKeeperLock(zookeeper, lock_path, ZERO_COPY_LOCK_NAME, lock_message)) { } } diff --git a/src/Storages/MergeTree/ZeroCopyLock.h b/src/Storages/MergeTree/ZeroCopyLock.h index 4400ea55b8f..2803952af18 100644 --- a/src/Storages/MergeTree/ZeroCopyLock.h +++ b/src/Storages/MergeTree/ZeroCopyLock.h @@ -12,6 +12,8 @@ namespace DB /// because due to bad abstraction we use it in MergeTreeData. struct ZeroCopyLock { + static inline const std::string_view ZERO_COPY_LOCK_NAME = "part_exclusive_lock"; + ZeroCopyLock(const zkutil::ZooKeeperPtr & zookeeper, const std::string & lock_path, const std::string & lock_message); bool isLocked() const { return lock->isLocked(); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 20839a61c92..94abc1422fd 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8249,7 +8249,7 @@ void StorageReplicatedMergeTree::lockSharedData( { String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name; - LOG_TRACE(log, "Trying to create zookeeper persistent lock {}", zookeeper_node); + LOG_TRACE(log, "Trying to create zookeeper persistent lock {} with hardlinks [{}]", zookeeper_node, fmt::join(hardlinks, ", ")); createZeroCopyLockNode( zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, @@ -8362,7 +8362,7 @@ namespace /// But sometimes we need an opposite. When we deleting all_0_0_0_1 it can be non replicated to other replicas, so we are the only owner of this part. /// In this case when we will drop all_0_0_0_1 we will drop blobs for all_0_0_0. But it will lead to dataloss. For such case we need to check that other replicas /// still need parent part. 
-std::pair getParentLockedBlobs(const ZooKeeperWithFaultInjectionPtr & zookeeper_ptr, const std::string & zero_copy_part_path_prefix, const MergeTreePartInfo & part_info, MergeTreeDataFormatVersion format_version, Poco::Logger * log) +std::pair> getParentLockedBlobs(const ZooKeeperWithFaultInjectionPtr & zookeeper_ptr, const std::string & zero_copy_part_path_prefix, const MergeTreePartInfo & part_info, MergeTreeDataFormatVersion format_version, Poco::Logger * log) { NameSet files_not_to_remove; @@ -8404,15 +8404,40 @@ std::pair getParentLockedBlobs(const ZooKeeperWithFaultInjectionP /// Get hardlinked files String files_not_to_remove_str; Coordination::Error code; - zookeeper_ptr->tryGet(fs::path(zero_copy_part_path_prefix) / part_candidate_info_str, files_not_to_remove_str, nullptr, nullptr, &code); + zookeeper_ptr->tryGet(fs::path(zero_copy_part_path_prefix) / part_candidate_info_str, files_not_to_remove_str, nullptr, &code); if (code != Coordination::Error::ZOK) + { LOG_TRACE(log, "Cannot get parent files from ZooKeeper on path ({}), error {}", (fs::path(zero_copy_part_path_prefix) / part_candidate_info_str).string(), errorMessage(code)); + return {true, std::nullopt}; + } if (!files_not_to_remove_str.empty()) { boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); LOG_TRACE(log, "Found files not to remove from parent part {}: [{}]", part_candidate_info_str, fmt::join(files_not_to_remove, ", ")); } + else + { + std::vector children; + code = zookeeper_ptr->tryGetChildren(fs::path(zero_copy_part_path_prefix) / part_candidate_info_str, children); + if (code != Coordination::Error::ZOK) + { + LOG_TRACE(log, "Cannot get parent locks in ZooKeeper on path ({}), error {}", (fs::path(zero_copy_part_path_prefix) / part_candidate_info_str).string(), errorMessage(code)); + return {true, std::nullopt}; + } + + if (children.size() > 1 || children.size() == 1 && children[0] != ZeroCopyLock::ZERO_COPY_LOCK_NAME) + { + LOG_TRACE(log, "No files not to remove found for part {} from parent {}", part_info_str, part_candidate_info_str); + } + else + { + /// The case when part is actually removed, but some stale replica trying to execute merge/mutation. + /// We shouldn't use the part to check hardlinked blobs, it just doesn't exist. 
+ LOG_TRACE(log, "Part {} is not parent (only merge/mutation locks exist), refusing to use as parent", part_candidate_info_str); + continue; + } + } return {true, files_not_to_remove}; } @@ -8448,6 +8473,12 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( auto [has_parent, parent_not_to_remove] = getParentLockedBlobs( zookeeper_ptr, fs::path(zc_zookeeper_path).parent_path(), part_info, data_format_version, logger); + if (has_parent && parent_not_to_remove == std::nullopt) + { + LOG_TRACE(logger, "Failed to get mutation parent on {} for part {}, refusing to remove blobs", zookeeper_part_replica_node, part_name); + return {false, {}}; + } + files_not_to_remove.insert(parent_not_to_remove.begin(), parent_not_to_remove.end()); String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; @@ -8527,7 +8558,7 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( if (error_code == Coordination::Error::ZOK) { - LOG_TRACE(logger, "Removed last parent zookeeper lock {} for part {} (part is finally unlocked)", zookeeper_part_uniq_node, part_name); + LOG_TRACE(logger, "Removed last parent zookeeper lock {} for part {} (part is finally unlocked)", zookeeper_part_node, part_name); } else if (error_code == Coordination::Error::ZNOTEMPTY) { From 610e63bfd2b21e06c312c625e54279c31d06853c Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 3 May 2023 23:54:24 +0200 Subject: [PATCH 006/127] Fix build --- src/Storages/MergeTree/ZeroCopyLock.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/Storages/MergeTree/ZeroCopyLock.h b/src/Storages/MergeTree/ZeroCopyLock.h index 2803952af18..d4c829a3652 100644 --- a/src/Storages/MergeTree/ZeroCopyLock.h +++ b/src/Storages/MergeTree/ZeroCopyLock.h @@ -12,7 +12,7 @@ namespace DB /// because due to bad abstraction we use it in MergeTreeData. 
struct ZeroCopyLock { - static inline const std::string_view ZERO_COPY_LOCK_NAME = "part_exclusive_lock"; + static inline const auto ZERO_COPY_LOCK_NAME = "part_exclusive_lock"; ZeroCopyLock(const zkutil::ZooKeeperPtr & zookeeper, const std::string & lock_path, const std::string & lock_message); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 94abc1422fd..280150f27ad 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8404,7 +8404,7 @@ std::pair> getParentLockedBlobs(const ZooKeeperWith /// Get hardlinked files String files_not_to_remove_str; Coordination::Error code; - zookeeper_ptr->tryGet(fs::path(zero_copy_part_path_prefix) / part_candidate_info_str, files_not_to_remove_str, nullptr, &code); + zookeeper_ptr->tryGet(fs::path(zero_copy_part_path_prefix) / part_candidate_info_str, files_not_to_remove_str, nullptr, nullptr, &code); if (code != Coordination::Error::ZOK) { LOG_TRACE(log, "Cannot get parent files from ZooKeeper on path ({}), error {}", (fs::path(zero_copy_part_path_prefix) / part_candidate_info_str).string(), errorMessage(code)); @@ -8426,7 +8426,7 @@ std::pair> getParentLockedBlobs(const ZooKeeperWith return {true, std::nullopt}; } - if (children.size() > 1 || children.size() == 1 && children[0] != ZeroCopyLock::ZERO_COPY_LOCK_NAME) + if (children.size() > 1 || (children.size() == 1 && children[0] != ZeroCopyLock::ZERO_COPY_LOCK_NAME)) { LOG_TRACE(log, "No files not to remove found for part {} from parent {}", part_info_str, part_candidate_info_str); } @@ -8471,20 +8471,23 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( if (!files_not_to_remove_str.empty()) boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); + String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; + + /// Delete our replica node for part from zookeeper (we are not interested in it anymore) + String zookeeper_part_replica_node = fs::path(zookeeper_part_uniq_node) / replica_name_; + auto [has_parent, parent_not_to_remove] = getParentLockedBlobs( zookeeper_ptr, fs::path(zc_zookeeper_path).parent_path(), part_info, data_format_version, logger); - if (has_parent && parent_not_to_remove == std::nullopt) + + // parent_not_to_remove == std::nullopt means that we were unable to retrieve parts set + if (has_parent || parent_not_to_remove == std::nullopt) { LOG_TRACE(logger, "Failed to get mutation parent on {} for part {}, refusing to remove blobs", zookeeper_part_replica_node, part_name); return {false, {}}; } - files_not_to_remove.insert(parent_not_to_remove.begin(), parent_not_to_remove.end()); + files_not_to_remove.insert(parent_not_to_remove->begin(), parent_not_to_remove->end()); - String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; - - /// Delete our replica node for part from zookeeper (we are not interested in it anymore) - String zookeeper_part_replica_node = fs::path(zookeeper_part_uniq_node) / replica_name_; LOG_TRACE(logger, "Remove zookeeper lock {} for part {}", zookeeper_part_replica_node, part_name); From 49ecba63af61313d4419a80e827a1ba22f163838 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 8 May 2023 14:51:04 +0200 Subject: [PATCH 007/127] Removed setStageForCluster and added option all_hosts to set stage for cluster --- src/Backups/BackupCoordinationLocal.cpp | 4 -- src/Backups/BackupCoordinationLocal.h | 3 +- src/Backups/BackupCoordinationRemote.cpp | 11 ++---- 
src/Backups/BackupCoordinationRemote.h | 3 +- src/Backups/BackupCoordinationStageSync.cpp | 43 +++++++++++---------- src/Backups/BackupCoordinationStageSync.h | 3 +- src/Backups/BackupsWorker.cpp | 6 +-- src/Backups/IBackupCoordination.h | 3 +- src/Backups/IRestoreCoordination.h | 3 +- src/Backups/RestoreCoordinationLocal.cpp | 4 -- src/Backups/RestoreCoordinationLocal.h | 3 +- src/Backups/RestoreCoordinationRemote.cpp | 11 ++---- src/Backups/RestoreCoordinationRemote.h | 3 +- 13 files changed, 40 insertions(+), 60 deletions(-) diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index 5b7ee37618b..27e0f173cf3 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -19,10 +19,6 @@ void BackupCoordinationLocal::setStage(const String &, const String &) { } -void BackupCoordinationLocal::setStageForCluster(const String &) -{ -} - void BackupCoordinationLocal::setError(const Exception &) { } diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index f1ffa8e8517..a7b05fbb83c 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -22,8 +22,7 @@ public: BackupCoordinationLocal(bool plain_backup_); ~BackupCoordinationLocal() override; - void setStage(const String & new_stage, const String & message) override; - void setStageForCluster(const String & new_stage) override; /// Sets stage for cluster + void setStage(const String & new_stage, const String & message = "") override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index af88b15d622..27e7d23ce5f 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -254,17 +254,14 @@ void BackupCoordinationRemote::removeAllNodes() void BackupCoordinationRemote::setStage(const String & new_stage, const String & message) { - stage_sync->set(current_host, new_stage, message); -} - -void BackupCoordinationRemote::setStageForCluster(const String & new_stage) -{ - stage_sync->setStageForCluster(new_stage); + if (is_internal) + stage_sync->set(current_host, new_stage, message); + else + stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true); } void BackupCoordinationRemote::setError(const Exception & exception) { - stage_sync->setStageForCluster(Stage::ERROR); stage_sync->setError(current_host, exception); } diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index c659cb0d459..5671079fa27 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -33,8 +33,7 @@ public: ~BackupCoordinationRemote() override; - void setStage(const String & new_stage, const String & message) override; - void setStageForCluster(const String & new_stage) override; /// Sets stage for cluster + void setStage(const String & new_stage, const String & message = "") override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index 5cbeec0ec76..3d8c283f084 
100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -8,11 +8,13 @@ #include #include #include - +#include namespace DB { +namespace Stage = BackupCoordinationStage; + namespace ErrorCodes { extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; @@ -42,7 +44,7 @@ void BackupCoordinationStageSync::createRootNodes() }); } -void BackupCoordinationStageSync::set(const String & current_host, const String & new_stage, const String & message) +void BackupCoordinationStageSync::set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts) { auto holder = with_retries.createRetriesControlHolder("set"); holder.retries_ctl.retryLoop( @@ -50,29 +52,24 @@ void BackupCoordinationStageSync::set(const String & current_host, const String { with_retries.renewZooKeeper(zookeeper); - /// Make an ephemeral node so the initiator can track if the current host is still working. - String alive_node_path = zookeeper_path + "/alive|" + current_host; - auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral); - if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS) - throw zkutil::KeeperException(code, alive_node_path); - - zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, ""); - zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message); - }); -} - -void BackupCoordinationStageSync::setStageForCluster(const String & new_stage) -{ - auto holder = with_retries.createRetriesControlHolder("setStageForCluster"); - holder.retries_ctl.retryLoop( - [&, &zookeeper = holder.faulty_zookeeper]() + if (all_hosts) { - with_retries.renewZooKeeper(zookeeper); - zookeeper->trySet(zookeeper_path, new_stage); auto code = zookeeper->trySet(zookeeper_path, new_stage); if (code != Coordination::Error::ZOK) throw zkutil::KeeperException(code, zookeeper_path); - }); + } + else + { + /// Make an ephemeral node so the initiator can track if the current host is still working. + String alive_node_path = zookeeper_path + "/alive|" + current_host; + auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral); + if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS) + throw zkutil::KeeperException(code, alive_node_path); + + zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, ""); + zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message); + } + }); } void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception) @@ -87,6 +84,10 @@ void BackupCoordinationStageSync::setError(const String & current_host, const Ex writeStringBinary(current_host, buf); writeException(exception, buf, true); zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str()); + + auto code = zookeeper->trySet(zookeeper_path, Stage::ERROR); + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException(code, zookeeper_path); }); } diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h index 9dde4e3095f..2efaec46b3a 100644 --- a/src/Backups/BackupCoordinationStageSync.h +++ b/src/Backups/BackupCoordinationStageSync.h @@ -15,8 +15,7 @@ public: Poco::Logger * log_); /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that. 
- void set(const String & current_host, const String & new_stage, const String & message); - void setStageForCluster(const String & new_stage); + void set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts = false); void setError(const String & current_host, const Exception & exception); /// Sets the stage of the current host and waits until all hosts come to the same stage. diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index de05cc2b092..720ca994a40 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -368,7 +368,7 @@ void BackupsWorker::doBackup( /// Wait until all the hosts have written their backup entries. backup_coordination->waitForStage(Stage::COMPLETED); - backup_coordination->setStageForCluster(Stage::COMPLETED); + backup_coordination->setStage(Stage::COMPLETED); } else { @@ -386,7 +386,7 @@ void BackupsWorker::doBackup( writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal); /// We have written our backup entries, we need to tell other hosts (they could be waiting for it). - backup_coordination->setStage(Stage::COMPLETED, ""); + backup_coordination->setStage(Stage::COMPLETED); } size_t num_files = 0; @@ -709,7 +709,7 @@ void BackupsWorker::doRestore( /// Wait until all the hosts have written their backup entries. restore_coordination->waitForStage(Stage::COMPLETED); - restore_coordination->setStageForCluster(Stage::COMPLETED); + restore_coordination->setStage(Stage::COMPLETED); } else { diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 6caae1dd741..68a13ab7846 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -21,8 +21,7 @@ public: virtual ~IBackupCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. - virtual void setStage(const String & new_stage, const String & message) = 0; - virtual void setStageForCluster(const String & new_stage) = 0; + virtual void setStage(const String & new_stage, const String & message = "") = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h index a5c8db84c86..b4df9491c4c 100644 --- a/src/Backups/IRestoreCoordination.h +++ b/src/Backups/IRestoreCoordination.h @@ -18,8 +18,7 @@ public: virtual ~IRestoreCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. 
- virtual void setStage(const String & new_stage, const String & message) = 0; - virtual void setStageForCluster(const String & new_stage) = 0; /// Sets stage for cluster + virtual void setStage(const String & new_stage, const String & message = "") = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp index 513204c931c..068c4fe7e52 100644 --- a/src/Backups/RestoreCoordinationLocal.cpp +++ b/src/Backups/RestoreCoordinationLocal.cpp @@ -15,10 +15,6 @@ void RestoreCoordinationLocal::setStage(const String &, const String &) { } -void RestoreCoordinationLocal::setStageForCluster(const String &) -{ -} - void RestoreCoordinationLocal::setError(const Exception &) { } diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index 0e4f4f01846..2240a25ef3d 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -19,8 +19,7 @@ public: ~RestoreCoordinationLocal() override; /// Sets the current stage and waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message) override; - void setStageForCluster(const String & new_stage) override; + void setStage(const String & new_stage, const String & message = "") override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index a3541614f36..c4ecee4aaa6 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -93,17 +93,14 @@ void RestoreCoordinationRemote::createRootNodes() void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message) { - stage_sync->set(current_host, new_stage, message); -} - -void RestoreCoordinationRemote::setStageForCluster(const String & new_stage) -{ - stage_sync->setStageForCluster(new_stage); + if (is_internal) + stage_sync->set(current_host, new_stage, message); + else + stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true); } void RestoreCoordinationRemote::setError(const Exception & exception) { - stage_sync->setStageForCluster(Stage::ERROR); stage_sync->setError(current_host, exception); } diff --git a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationRemote.h index 947d08a66e5..989b1c1b727 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationRemote.h @@ -26,8 +26,7 @@ public: ~RestoreCoordinationRemote() override; /// Sets the current stage and waits for other hosts to come to this stage too. 
- void setStage(const String & new_stage, const String & message) override; - void setStageForCluster(const String & new_stage) override; + void setStage(const String & new_stage, const String & message = "") override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; From b320527158e3318dd28d9eaca4b31178d8a05a34 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 May 2023 16:33:39 +0000 Subject: [PATCH 008/127] Fix assert in SpanHolder::finish() with fibers --- contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 23 ++++++++++ src/CMakeLists.txt | 3 ++ src/Common/OpenTelemetryTraceContext.cpp | 55 +++++++++++++++--------- 4 files changed, 62 insertions(+), 21 deletions(-) diff --git a/contrib/boost b/contrib/boost index 8fe7b3326ef..d6c95434acb 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 8fe7b3326ef482ee6ecdf5a4f698f2b8c2780f98 +Subproject commit d6c95434acbb1a02d0b9de52bf4f37cac6c00328 diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index c9a759eab9c..ae20568f386 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -151,6 +151,7 @@ add_library (_boost_context ${SRCS_CONTEXT}) add_library (boost::context ALIAS _boost_context) target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR}) + if (SANITIZE OR BOOST_USE_UCONTEXT) target_compile_definitions(_boost_context PUBLIC BOOST_USE_UCONTEXT) endif() @@ -161,6 +162,28 @@ elseif (SANITIZE STREQUAL "thread") target_compile_definitions(_boost_context PUBLIC BOOST_USE_TSAN) endif() +# fiber + +set (SRCS_FIBER + "${LIBRARY_DIR}/libs/fiber/src/context.cpp" + "${LIBRARY_DIR}/libs/fiber/src/fiber.cpp" + "${LIBRARY_DIR}/libs/fiber/src/barrier.cpp" + "${LIBRARY_DIR}/libs/fiber/src/condition_variable.cpp" + "${LIBRARY_DIR}/libs/fiber/src/future.cpp" + "${LIBRARY_DIR}/libs/fiber/src/mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/properties.cpp" + "${LIBRARY_DIR}/libs/fiber/src/recursive_mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/recursive_timed_mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/scheduler.cpp" + "${LIBRARY_DIR}/libs/fiber/src/timed_mutex.cpp" + "${LIBRARY_DIR}/libs/fiber/src/waker.cpp" + "${LIBRARY_DIR}/libs/fiber/src/algo/round_robin.cpp" +) + +add_library (_boost_fiber ${SRCS_FIBER}) +add_library (boost::fiber ALIAS _boost_fiber) +target_include_directories (_boost_fiber PRIVATE ${LIBRARY_DIR}) + # coroutine set (SRCS_COROUTINE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 76a67ade99c..c69ac885154 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -547,6 +547,9 @@ endif () target_link_libraries(clickhouse_common_io PUBLIC boost::context) dbms_target_link_libraries(PUBLIC boost::context) +target_link_libraries(clickhouse_common_io PUBLIC boost::fiber) +dbms_target_link_libraries(PUBLIC boost::fiber) + if (ENABLE_NLP) dbms_target_link_libraries (PUBLIC ch_contrib::stemmer) dbms_target_link_libraries (PUBLIC ch_contrib::wnb) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 0d89c581318..1c75bd3efaf 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -6,13 +6,26 @@ #include #include #include +#include namespace DB { namespace OpenTelemetry { -thread_local TracingContextOnThread current_thread_trace_context; +static TracingContextOnThread & 
getCurrentThreadTraceContext() +{ + static boost::fibers::fiber_specific_ptr current_thread_trace_context; + + auto * ptr = current_thread_trace_context.get(); + if (unlikely(!ptr)) + { + ptr = new TracingContextOnThread(); + current_thread_trace_context.reset(ptr); + } + return *ptr; +} + bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { @@ -104,7 +117,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) { - if (!current_thread_trace_context.isTraceEnabled()) + if (!getCurrentThreadTraceContext().isTraceEnabled()) { return; } @@ -112,8 +125,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) /// Use try-catch to make sure the ctor is exception safe. try { - this->trace_id = current_thread_trace_context.trace_id; - this->parent_span_id = current_thread_trace_context.span_id; + this->trace_id = getCurrentThreadTraceContext().trace_id; + this->parent_span_id = getCurrentThreadTraceContext().span_id; this->span_id = thread_local_rng(); // create a new id for this span this->operation_name = _operation_name; this->kind = _kind; @@ -132,7 +145,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) } /// Set current span as parent of other spans created later on this thread. - current_thread_trace_context.span_id = this->span_id; + getCurrentThreadTraceContext().span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -141,12 +154,12 @@ void SpanHolder::finish() noexcept return; // First of all, restore old value of current span. - assert(current_thread_trace_context.span_id == span_id); - current_thread_trace_context.span_id = parent_span_id; + assert(getCurrentThreadTraceContext().span_id == span_id); + getCurrentThreadTraceContext().span_id = parent_span_id; try { - auto log = current_thread_trace_context.span_log.lock(); + auto log = getCurrentThreadTraceContext().span_log.lock(); /// The log might be disabled, check it before use if (log) @@ -269,7 +282,7 @@ void TracingContext::serialize(WriteBuffer & buf) const const TracingContextOnThread & CurrentContext() { - return current_thread_trace_context; + return getCurrentThreadTraceContext(); } void TracingContextOnThread::reset() noexcept @@ -291,7 +304,7 @@ TracingContextHolder::TracingContextHolder( /// If any exception is raised during the construction, the tracing is not enabled on current thread. try { - if (current_thread_trace_context.isTraceEnabled()) + if (getCurrentThreadTraceContext().isTraceEnabled()) { /// /// This is not the normal case, @@ -304,15 +317,15 @@ TracingContextHolder::TracingContextHolder( /// So this branch ensures this class can be instantiated multiple times on one same thread safely. 
/// this->is_context_owner = false; - this->root_span.trace_id = current_thread_trace_context.trace_id; - this->root_span.parent_span_id = current_thread_trace_context.span_id; + this->root_span.trace_id = getCurrentThreadTraceContext().trace_id; + this->root_span.parent_span_id = getCurrentThreadTraceContext().span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); /// Set the root span as parent of other spans created on current thread - current_thread_trace_context.span_id = this->root_span.span_id; + getCurrentThreadTraceContext().span_id = this->root_span.span_id; return; } @@ -356,10 +369,10 @@ TracingContextHolder::TracingContextHolder( } /// Set up trace context on current thread only when the root span is successfully initialized. - current_thread_trace_context = _parent_trace_context; - current_thread_trace_context.span_id = this->root_span.span_id; - current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED; - current_thread_trace_context.span_log = _span_log; + getCurrentThreadTraceContext() = _parent_trace_context; + getCurrentThreadTraceContext().span_id = this->root_span.span_id; + getCurrentThreadTraceContext().trace_flags = TRACE_FLAG_SAMPLED; + getCurrentThreadTraceContext().span_log = _span_log; } TracingContextHolder::~TracingContextHolder() @@ -371,7 +384,7 @@ TracingContextHolder::~TracingContextHolder() try { - auto shared_span_log = current_thread_trace_context.span_log.lock(); + auto shared_span_log = getCurrentThreadTraceContext().span_log.lock(); if (shared_span_log) { try @@ -402,12 +415,14 @@ TracingContextHolder::~TracingContextHolder() if (this->is_context_owner) { /// Clear the context on current thread - current_thread_trace_context.reset(); + getCurrentThreadTraceContext().reset(); } else { - current_thread_trace_context.span_id = this->root_span.parent_span_id; + getCurrentThreadTraceContext().span_id = this->root_span.parent_span_id; } + + } } From 5527d43a5d09f9a9e75d3f9b94bd8ef1bec9980a Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 May 2023 16:51:17 +0000 Subject: [PATCH 009/127] Use only needed src files --- contrib/boost-cmake/CMakeLists.txt | 8 -------- 1 file changed, 8 deletions(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index ae20568f386..6c722c42e7d 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -167,15 +167,7 @@ endif() set (SRCS_FIBER "${LIBRARY_DIR}/libs/fiber/src/context.cpp" "${LIBRARY_DIR}/libs/fiber/src/fiber.cpp" - "${LIBRARY_DIR}/libs/fiber/src/barrier.cpp" - "${LIBRARY_DIR}/libs/fiber/src/condition_variable.cpp" - "${LIBRARY_DIR}/libs/fiber/src/future.cpp" - "${LIBRARY_DIR}/libs/fiber/src/mutex.cpp" - "${LIBRARY_DIR}/libs/fiber/src/properties.cpp" - "${LIBRARY_DIR}/libs/fiber/src/recursive_mutex.cpp" - "${LIBRARY_DIR}/libs/fiber/src/recursive_timed_mutex.cpp" "${LIBRARY_DIR}/libs/fiber/src/scheduler.cpp" - "${LIBRARY_DIR}/libs/fiber/src/timed_mutex.cpp" "${LIBRARY_DIR}/libs/fiber/src/waker.cpp" "${LIBRARY_DIR}/libs/fiber/src/algo/round_robin.cpp" ) From c961e3706ed1b4028f7420bd2fa8ac96126f378d Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 May 2023 16:52:57 +0000 Subject: [PATCH 010/127] Clean up --- contrib/boost-cmake/CMakeLists.txt | 1 - src/Common/OpenTelemetryTraceContext.cpp | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git 
a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 6c722c42e7d..c8be40be1d4 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -151,7 +151,6 @@ add_library (_boost_context ${SRCS_CONTEXT}) add_library (boost::context ALIAS _boost_context) target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR}) - if (SANITIZE OR BOOST_USE_UCONTEXT) target_compile_definitions(_boost_context PUBLIC BOOST_USE_UCONTEXT) endif() diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 1c75bd3efaf..86ce30941a3 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -16,7 +16,7 @@ namespace OpenTelemetry static TracingContextOnThread & getCurrentThreadTraceContext() { static boost::fibers::fiber_specific_ptr current_thread_trace_context; - + auto * ptr = current_thread_trace_context.get(); if (unlikely(!ptr)) { @@ -421,8 +421,6 @@ TracingContextHolder::~TracingContextHolder() { getCurrentThreadTraceContext().span_id = this->root_span.parent_span_id; } - - } } From 930c8c3043fbb2ff462f6ee3d14b2568271c774d Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 May 2023 18:56:09 +0000 Subject: [PATCH 011/127] Use own fiber local implementation --- src/Common/AsyncTaskExecutor.cpp | 10 +++++ src/Common/AsyncTaskExecutor.h | 25 +++++++++++ src/Common/OpenTelemetryTraceContext.cpp | 56 ++++++++++-------------- 3 files changed, 58 insertions(+), 33 deletions(-) diff --git a/src/Common/AsyncTaskExecutor.cpp b/src/Common/AsyncTaskExecutor.cpp index 4e2fb18fb78..d0c6454a849 100644 --- a/src/Common/AsyncTaskExecutor.cpp +++ b/src/Common/AsyncTaskExecutor.cpp @@ -3,11 +3,18 @@ namespace DB { +thread_local const Fiber * current_fiber = nullptr; + AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr task_) : task(std::move(task_)) { createFiber(); } +const Fiber * AsyncTaskExecutor::getCurrentFiber() +{ + return current_fiber; +} + void AsyncTaskExecutor::resume() { if (routine_is_finished) @@ -31,7 +38,10 @@ void AsyncTaskExecutor::resume() void AsyncTaskExecutor::resumeUnlocked() { + const auto * parent_fiber = current_fiber; + current_fiber = &fiber; fiber = std::move(fiber).resume(); + current_fiber = parent_fiber; } void AsyncTaskExecutor::cancel() diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index f749c3066fc..cf7cdc5ad82 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -74,6 +74,7 @@ public: ERROR = 4, }; #endif + static const Fiber * getCurrentFiber(); protected: /// Method that is called in resume() before actual fiber resuming. @@ -118,6 +119,30 @@ private: std::unique_ptr task; }; +/// Simple class for storing fiber local variables. 
+template +class FiberLocalVariable +{ +public: + T & operator*() + { + return get(); + } + + T * operator->() + { + return &get(); + } + +private: + T & get() + { + return data[AsyncTaskExecutor::getCurrentFiber()]; + } + + std::unordered_map data; +}; + String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); } diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 86ce30941a3..f25acc571d8 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -6,26 +6,16 @@ #include #include #include -#include + +#include namespace DB { namespace OpenTelemetry { -static TracingContextOnThread & getCurrentThreadTraceContext() -{ - static boost::fibers::fiber_specific_ptr current_thread_trace_context; - - auto * ptr = current_thread_trace_context.get(); - if (unlikely(!ptr)) - { - ptr = new TracingContextOnThread(); - current_thread_trace_context.reset(ptr); - } - return *ptr; -} - +/// This code can be executed inside fiber, we should use fiber local context. +thread_local FiberLocalVariable current_fiber_trace_context; bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { @@ -117,7 +107,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) { - if (!getCurrentThreadTraceContext().isTraceEnabled()) + if (!current_fiber_trace_context->isTraceEnabled()) { return; } @@ -125,8 +115,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) /// Use try-catch to make sure the ctor is exception safe. try { - this->trace_id = getCurrentThreadTraceContext().trace_id; - this->parent_span_id = getCurrentThreadTraceContext().span_id; + this->trace_id =current_fiber_trace_context->trace_id; + this->parent_span_id =current_fiber_trace_context->span_id; this->span_id = thread_local_rng(); // create a new id for this span this->operation_name = _operation_name; this->kind = _kind; @@ -145,7 +135,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) } /// Set current span as parent of other spans created later on this thread. - getCurrentThreadTraceContext().span_id = this->span_id; + current_fiber_trace_context->span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -154,12 +144,12 @@ void SpanHolder::finish() noexcept return; // First of all, restore old value of current span. - assert(getCurrentThreadTraceContext().span_id == span_id); - getCurrentThreadTraceContext().span_id = parent_span_id; + assert(current_fiber_trace_context->span_id == span_id); + current_fiber_trace_context->span_id = parent_span_id; try { - auto log = getCurrentThreadTraceContext().span_log.lock(); + auto log =current_fiber_trace_context->span_log.lock(); /// The log might be disabled, check it before use if (log) @@ -282,7 +272,7 @@ void TracingContext::serialize(WriteBuffer & buf) const const TracingContextOnThread & CurrentContext() { - return getCurrentThreadTraceContext(); + return*current_fiber_trace_context; } void TracingContextOnThread::reset() noexcept @@ -304,7 +294,7 @@ TracingContextHolder::TracingContextHolder( /// If any exception is raised during the construction, the tracing is not enabled on current thread. 
try { - if (getCurrentThreadTraceContext().isTraceEnabled()) + if (current_fiber_trace_context->isTraceEnabled()) { /// /// This is not the normal case, @@ -317,15 +307,15 @@ TracingContextHolder::TracingContextHolder( /// So this branch ensures this class can be instantiated multiple times on one same thread safely. /// this->is_context_owner = false; - this->root_span.trace_id = getCurrentThreadTraceContext().trace_id; - this->root_span.parent_span_id = getCurrentThreadTraceContext().span_id; + this->root_span.trace_id =current_fiber_trace_context->trace_id; + this->root_span.parent_span_id =current_fiber_trace_context->span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); /// Set the root span as parent of other spans created on current thread - getCurrentThreadTraceContext().span_id = this->root_span.span_id; + current_fiber_trace_context->span_id = this->root_span.span_id; return; } @@ -369,10 +359,10 @@ TracingContextHolder::TracingContextHolder( } /// Set up trace context on current thread only when the root span is successfully initialized. - getCurrentThreadTraceContext() = _parent_trace_context; - getCurrentThreadTraceContext().span_id = this->root_span.span_id; - getCurrentThreadTraceContext().trace_flags = TRACE_FLAG_SAMPLED; - getCurrentThreadTraceContext().span_log = _span_log; + *current_fiber_trace_context = _parent_trace_context; + current_fiber_trace_context->span_id = this->root_span.span_id; + current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; + current_fiber_trace_context->span_log = _span_log; } TracingContextHolder::~TracingContextHolder() @@ -384,7 +374,7 @@ TracingContextHolder::~TracingContextHolder() try { - auto shared_span_log = getCurrentThreadTraceContext().span_log.lock(); + auto shared_span_log =current_fiber_trace_context->span_log.lock(); if (shared_span_log) { try @@ -415,11 +405,11 @@ TracingContextHolder::~TracingContextHolder() if (this->is_context_owner) { /// Clear the context on current thread - getCurrentThreadTraceContext().reset(); + current_fiber_trace_context->reset(); } else { - getCurrentThreadTraceContext().span_id = this->root_span.parent_span_id; + current_fiber_trace_context->span_id = this->root_span.parent_span_id; } } From 66971662de3a5cb7c5a3edc29b4a103c7c862329 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 May 2023 18:56:59 +0000 Subject: [PATCH 012/127] Update cmake --- contrib/boost-cmake/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index c8be40be1d4..cb0db5622a8 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -174,6 +174,7 @@ set (SRCS_FIBER add_library (_boost_fiber ${SRCS_FIBER}) add_library (boost::fiber ALIAS _boost_fiber) target_include_directories (_boost_fiber PRIVATE ${LIBRARY_DIR}) +target_link_libraries(_boost_fiber PRIVATE _boost_context) # coroutine From 0cf6b9f1459175388f7e2a58338a839004e0d6b8 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 12:27:13 +0000 Subject: [PATCH 013/127] Inherit context from parent fiber --- contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 15 -------- src/CMakeLists.txt | 3 -- src/Common/AsyncTaskExecutor.cpp | 12 +++---- src/Common/AsyncTaskExecutor.h | 31 ++++------------- src/Common/OpenTelemetryTraceContext.cpp | 44 ++++++++++++++++++++++-- 6 
files changed, 55 insertions(+), 52 deletions(-) diff --git a/contrib/boost b/contrib/boost index d6c95434acb..1035c8bfcc9 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit d6c95434acbb1a02d0b9de52bf4f37cac6c00328 +Subproject commit 1035c8bfcc9a3c1cfa7f6e827db94dae1ce1a43a diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index cb0db5622a8..c9a759eab9c 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -161,21 +161,6 @@ elseif (SANITIZE STREQUAL "thread") target_compile_definitions(_boost_context PUBLIC BOOST_USE_TSAN) endif() -# fiber - -set (SRCS_FIBER - "${LIBRARY_DIR}/libs/fiber/src/context.cpp" - "${LIBRARY_DIR}/libs/fiber/src/fiber.cpp" - "${LIBRARY_DIR}/libs/fiber/src/scheduler.cpp" - "${LIBRARY_DIR}/libs/fiber/src/waker.cpp" - "${LIBRARY_DIR}/libs/fiber/src/algo/round_robin.cpp" -) - -add_library (_boost_fiber ${SRCS_FIBER}) -add_library (boost::fiber ALIAS _boost_fiber) -target_include_directories (_boost_fiber PRIVATE ${LIBRARY_DIR}) -target_link_libraries(_boost_fiber PRIVATE _boost_context) - # coroutine set (SRCS_COROUTINE diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c69ac885154..76a67ade99c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -547,9 +547,6 @@ endif () target_link_libraries(clickhouse_common_io PUBLIC boost::context) dbms_target_link_libraries(PUBLIC boost::context) -target_link_libraries(clickhouse_common_io PUBLIC boost::fiber) -dbms_target_link_libraries(PUBLIC boost::fiber) - if (ENABLE_NLP) dbms_target_link_libraries (PUBLIC ch_contrib::stemmer) dbms_target_link_libraries (PUBLIC ch_contrib::wnb) diff --git a/src/Common/AsyncTaskExecutor.cpp b/src/Common/AsyncTaskExecutor.cpp index d0c6454a849..68af535b22a 100644 --- a/src/Common/AsyncTaskExecutor.cpp +++ b/src/Common/AsyncTaskExecutor.cpp @@ -3,16 +3,16 @@ namespace DB { -thread_local const Fiber * current_fiber = nullptr; +thread_local FiberInfo current_fiber_info; AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr task_) : task(std::move(task_)) { createFiber(); } -const Fiber * AsyncTaskExecutor::getCurrentFiber() +FiberInfo AsyncTaskExecutor::getCurrentFiberInfo() { - return current_fiber; + return current_fiber_info; } void AsyncTaskExecutor::resume() @@ -38,10 +38,10 @@ void AsyncTaskExecutor::resume() void AsyncTaskExecutor::resumeUnlocked() { - const auto * parent_fiber = current_fiber; - current_fiber = &fiber; + auto parent_fiber_info = current_fiber_info; + current_fiber_info = FiberInfo{&fiber, &parent_fiber_info}; fiber = std::move(fiber).resume(); - current_fiber = parent_fiber; + current_fiber_info = parent_fiber_info; } void AsyncTaskExecutor::cancel() diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index cf7cdc5ad82..1c2f758504a 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -24,6 +24,11 @@ enum class AsyncEventTimeoutType using AsyncCallback = std::function; using ResumeCallback = std::function; +struct FiberInfo +{ + const Fiber * fiber = nullptr; + const FiberInfo * parent_fiber_info = nullptr; +}; /// Base class for a task that will be executed in a fiber. /// It has only one method - run, that takes 2 callbacks: @@ -74,7 +79,7 @@ public: ERROR = 4, }; #endif - static const Fiber * getCurrentFiber(); + static FiberInfo getCurrentFiberInfo(); protected: /// Method that is called in resume() before actual fiber resuming. 
@@ -119,30 +124,6 @@ private: std::unique_ptr task; }; -/// Simple class for storing fiber local variables. -template -class FiberLocalVariable -{ -public: - T & operator*() - { - return get(); - } - - T * operator->() - { - return &get(); - } - -private: - T & get() - { - return data[AsyncTaskExecutor::getCurrentFiber()]; - } - - std::unordered_map data; -}; - String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); } diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index f25acc571d8..178efa33817 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -14,8 +14,48 @@ namespace DB namespace OpenTelemetry { -/// This code can be executed inside fiber, we should use fiber local context. -thread_local FiberLocalVariable current_fiber_trace_context; +/// This code can be executed inside several fibers in one thread, +/// we should use fiber local tracing context. +struct FiberLocalTracingContextOnThread +{ +public: + FiberLocalTracingContextOnThread() + { + /// Initialize main context for this thread. + /// Contexts for fibers will inherit this main context. + data[nullptr] = TracingContextOnThread(); + } + + TracingContextOnThread & operator*() + { + return get(); + } + + TracingContextOnThread * operator->() + { + return &get(); + } + +private: + TracingContextOnThread & get() + { + /// Get context for current fiber. + return getContextForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); + } + + TracingContextOnThread & getContextForFiber(FiberInfo info) + { + auto it = data.find(info.fiber); + /// If it's the first request, we need to initialize context for the fiber using context from parent fiber. + if (it == data.end()) + it = data.insert({info.fiber, getContextForFiber(*info.parent_fiber_info)}).first; + return it->second; + } + + std::unordered_map data; +}; + +thread_local FiberLocalTracingContextOnThread current_fiber_trace_context; bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { From 10e6f5b59a8f19bfab4b24f76c29b2b9a5324555 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 12:31:34 +0000 Subject: [PATCH 014/127] Fix indents --- src/Common/OpenTelemetryTraceContext.cpp | 26 ++++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 178efa33817..037ada88f80 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -175,7 +175,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) } /// Set current span as parent of other spans created later on this thread. 
- current_fiber_trace_context->span_id = this->span_id; + current_fiber_trace_context->span_id = this->span_id; } void SpanHolder::finish() noexcept @@ -189,7 +189,7 @@ void SpanHolder::finish() noexcept try { - auto log =current_fiber_trace_context->span_log.lock(); + auto log = current_fiber_trace_context->span_log.lock(); /// The log might be disabled, check it before use if (log) @@ -312,7 +312,7 @@ void TracingContext::serialize(WriteBuffer & buf) const const TracingContextOnThread & CurrentContext() { - return*current_fiber_trace_context; + return *current_fiber_trace_context; } void TracingContextOnThread::reset() noexcept @@ -347,15 +347,15 @@ TracingContextHolder::TracingContextHolder( /// So this branch ensures this class can be instantiated multiple times on one same thread safely. /// this->is_context_owner = false; - this->root_span.trace_id =current_fiber_trace_context->trace_id; - this->root_span.parent_span_id =current_fiber_trace_context->span_id; + this->root_span.trace_id = current_fiber_trace_context->trace_id; + this->root_span.parent_span_id = current_fiber_trace_context->span_id; this->root_span.span_id = thread_local_rng(); this->root_span.operation_name = _operation_name; this->root_span.start_time_us = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); /// Set the root span as parent of other spans created on current thread - current_fiber_trace_context->span_id = this->root_span.span_id; + current_fiber_trace_context->span_id = this->root_span.span_id; return; } @@ -399,10 +399,10 @@ TracingContextHolder::TracingContextHolder( } /// Set up trace context on current thread only when the root span is successfully initialized. - *current_fiber_trace_context = _parent_trace_context; - current_fiber_trace_context->span_id = this->root_span.span_id; - current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; - current_fiber_trace_context->span_log = _span_log; + *current_fiber_trace_context = _parent_trace_context; + current_fiber_trace_context->span_id = this->root_span.span_id; + current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED; + current_fiber_trace_context->span_log = _span_log; } TracingContextHolder::~TracingContextHolder() @@ -414,7 +414,7 @@ TracingContextHolder::~TracingContextHolder() try { - auto shared_span_log =current_fiber_trace_context->span_log.lock(); + auto shared_span_log = current_fiber_trace_context->span_log.lock(); if (shared_span_log) { try @@ -445,11 +445,11 @@ TracingContextHolder::~TracingContextHolder() if (this->is_context_owner) { /// Clear the context on current thread - current_fiber_trace_context->reset(); + current_fiber_trace_context->reset(); } else { - current_fiber_trace_context->span_id = this->root_span.parent_span_id; + current_fiber_trace_context->span_id = this->root_span.parent_span_id; } } From a7aec49fbbcab364bc443787ec2c11d2bf52762d Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 10 May 2023 12:43:29 +0000 Subject: [PATCH 015/127] Fix indents --- src/Common/OpenTelemetryTraceContext.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 037ada88f80..8cf4879c1e2 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -155,8 +155,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind) /// Use try-catch to make sure the ctor is exception safe. 
try { - this->trace_id =current_fiber_trace_context->trace_id; - this->parent_span_id =current_fiber_trace_context->span_id; + this->trace_id = current_fiber_trace_context->trace_id; + this->parent_span_id = current_fiber_trace_context->span_id; this->span_id = thread_local_rng(); // create a new id for this span this->operation_name = _operation_name; this->kind = _kind; From 7fbf87be176081411918ae35f040f338892d1416 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Tue, 18 Apr 2023 13:11:42 +0200 Subject: [PATCH 016/127] rework WriteBufferFromS3, squashed --- contrib/googletest-cmake/CMakeLists.txt | 25 +- src/Backups/BackupIO_S3.cpp | 1 - src/CMakeLists.txt | 1 + src/Disks/DiskLocal.cpp | 1 + .../IO/CachedOnDiskWriteBufferFromFile.cpp | 13 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 3 +- src/IO/S3/Client.cpp | 33 +- src/IO/S3/Client.h | 7 + src/IO/SwapHelper.cpp | 17 + src/IO/SwapHelper.h | 21 +- src/IO/WriteBuffer.h | 5 +- src/IO/WriteBufferFromFileDecorator.cpp | 38 +- src/IO/WriteBufferFromFileDecorator.h | 7 +- src/IO/WriteBufferFromS3.cpp | 946 +++++++------- src/IO/WriteBufferFromS3.h | 95 +- src/IO/WriteBufferFromS3MemoryStream.cpp | 68 + src/IO/WriteBufferFromS3MemoryStream.h | 39 + src/IO/WriteBufferFromS3TaskTracker.cpp | 137 ++ src/IO/WriteBufferFromS3TaskTracker.h | 37 + src/IO/tests/gtest_writebuffer_s3.cpp | 1114 +++++++++++++++++ src/Storages/MergeTree/MergeTreeData.cpp | 1 + .../MergeTree/MergeTreeDeduplicationLog.cpp | 3 + .../MergeTree/MergeTreeMutationEntry.cpp | 1 + src/Storages/StorageS3.cpp | 1 - .../02240_filesystem_query_cache.reference | 1 + .../02240_filesystem_query_cache.sql | 2 +- ...system_cache_on_write_operations.reference | 36 +- ...41_filesystem_cache_on_write_operations.sh | 19 +- ...ilesystem_cache_persistent_files.reference | 17 +- ...events_from_query_log_and_client.reference | 4 +- 30 files changed, 2102 insertions(+), 591 deletions(-) create mode 100644 src/IO/SwapHelper.cpp create mode 100644 src/IO/WriteBufferFromS3MemoryStream.cpp create mode 100644 src/IO/WriteBufferFromS3MemoryStream.h create mode 100644 src/IO/WriteBufferFromS3TaskTracker.cpp create mode 100644 src/IO/WriteBufferFromS3TaskTracker.h create mode 100644 src/IO/tests/gtest_writebuffer_s3.cpp diff --git a/contrib/googletest-cmake/CMakeLists.txt b/contrib/googletest-cmake/CMakeLists.txt index 90fdde0c185..3905df03155 100644 --- a/contrib/googletest-cmake/CMakeLists.txt +++ b/contrib/googletest-cmake/CMakeLists.txt @@ -1,15 +1,30 @@ -set (SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest") +set (SRC_DIR "${ClickHouse_SOURCE_DIR}/contrib/googletest") -add_library(_gtest "${SRC_DIR}/src/gtest-all.cc") +add_library(_gtest "${SRC_DIR}/googletest/src/gtest-all.cc") set_target_properties(_gtest PROPERTIES VERSION "1.0.0") target_compile_definitions (_gtest PUBLIC GTEST_HAS_POSIX_RE=0) -target_include_directories(_gtest SYSTEM PUBLIC "${SRC_DIR}/include") -target_include_directories(_gtest PRIVATE "${SRC_DIR}") +target_include_directories(_gtest SYSTEM PUBLIC "${SRC_DIR}/googletest/include") +target_include_directories(_gtest PRIVATE "${SRC_DIR}/googletest") -add_library(_gtest_main "${SRC_DIR}/src/gtest_main.cc") +add_library(_gtest_main "${SRC_DIR}/googletest/src/gtest_main.cc") set_target_properties(_gtest_main PROPERTIES VERSION "1.0.0") target_link_libraries(_gtest_main PUBLIC _gtest) add_library(_gtest_all INTERFACE) target_link_libraries(_gtest_all INTERFACE _gtest _gtest_main) add_library(ch_contrib::gtest_all ALIAS _gtest_all) + + +add_library(_gmock 
"${SRC_DIR}/googlemock/src/gmock-all.cc") +set_target_properties(_gmock PROPERTIES VERSION "1.0.0") +target_compile_definitions (_gmock PUBLIC GTEST_HAS_POSIX_RE=0) +target_include_directories(_gmock SYSTEM PUBLIC "${SRC_DIR}/googlemock/include" "${SRC_DIR}/googletest/include") +target_include_directories(_gmock PRIVATE "${SRC_DIR}/googlemock") + +add_library(_gmock_main "${SRC_DIR}/googlemock/src/gmock_main.cc") +set_target_properties(_gmock_main PROPERTIES VERSION "1.0.0") +target_link_libraries(_gmock_main PUBLIC _gmock) + +add_library(_gmock_all INTERFACE) +target_link_libraries(_gmock_all INTERFACE _gmock _gmock_main) +add_library(ch_contrib::gmock_all ALIAS _gmock_all) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 90333900d4a..84dba63ae4e 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -253,7 +253,6 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) fs::path(s3_uri.key) / file_name, request_settings, std::nullopt, - DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3f4fbb7420..ac99a7c3669 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -612,6 +612,7 @@ if (ENABLE_TESTS) target_link_libraries(unit_tests_dbms PRIVATE ch_contrib::gtest_all + ch_contrib::gmock_all clickhouse_functions clickhouse_aggregate_functions clickhouse_parsers diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 69b70da272a..1abecb7af4e 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -544,6 +544,7 @@ try auto tmp_file = std::make_unique(disk_ptr); auto buf = std::make_unique(std::move(tmp_file)); buf->write(data.data, data.PAGE_SIZE_IN_BYTES); + buf->finalize(); buf->sync(); } return true; diff --git a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp index af2226ea6ca..9153af90312 100644 --- a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp @@ -211,10 +211,16 @@ void CachedOnDiskWriteBufferFromFile::nextImpl() { size_t size = offset(); + /// Write data to cache. + cacheData(working_buffer.begin(), size, throw_on_error_from_cache); + current_download_offset += size; + try { SwapHelper swap(*this, *impl); /// Write data to the underlying buffer. + /// Actually here WriteBufferFromFileDecorator::nextImpl has to be called, but it is pivate method. + /// In particular WriteBufferFromFileDecorator introduces logic with swaps in order to achieve delegation. impl->next(); } catch (...) @@ -225,10 +231,6 @@ void CachedOnDiskWriteBufferFromFile::nextImpl() throw; } - - /// Write data to cache. - cacheData(working_buffer.begin(), size, throw_on_error_from_cache); - current_download_offset += size; } void CachedOnDiskWriteBufferFromFile::cacheData(char * data, size_t size, bool throw_on_error) @@ -292,8 +294,7 @@ void CachedOnDiskWriteBufferFromFile::finalizeImpl() { try { - SwapHelper swap(*this, *impl); - impl->finalize(); + WriteBufferFromFileDecorator::finalizeImpl(); } catch (...) 
{
diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
index 2eee8bf5693..79b3d3a2b8b 100644
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@@ -161,7 +161,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN
 WriteMode mode, // S3 doesn't support append, only rewrite
 std::optional attributes,
 FinalizeCallback && finalize_callback,
- size_t buf_size,
+ size_t buf_size [[maybe_unused]],
 const WriteSettings & write_settings)
 {
 WriteSettings disk_write_settings = IObjectStorage::patchSettings(write_settings);
@@ -180,7 +180,6 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN
 object.remote_path,
 settings_ptr->request_settings,
 attributes,
- buf_size,
 std::move(scheduler),
 disk_write_settings);
diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp
index 3c0a8122a91..3c39893b44e 100644
--- a/src/IO/S3/Client.cpp
+++ b/src/IO/S3/Client.cpp
@@ -255,7 +255,7 @@ Model::HeadObjectOutcome Client::HeadObject(const HeadObjectRequest & request) c
 if (auto uri = getURIForBucket(bucket); uri.has_value())
 request.overrideURI(std::move(*uri));
- auto result = Aws::S3::S3Client::HeadObject(request);
+ auto result = HeadObject(static_cast(request));
 if (result.IsSuccess())
 return result;
@@ -312,70 +312,75 @@ Model::HeadObjectOutcome Client::HeadObject(const HeadObjectRequest & request) c
 request.overrideURI(std::move(*bucket_uri));
- return Aws::S3::S3Client::HeadObject(request);
+ /// The next call is NOT a recursive call
+ /// This is a virtual call Aws::S3::S3Client::HeadObject(const Model::HeadObjectRequest&)
+ return HeadObject(static_cast(request));
 }
+/// For each request, we wrap the request functions from Aws::S3::Client with doRequest
+/// doRequest calls the virtual function from Aws::S3::Client, while DB::S3::Client has no virtual calls for each request type
+
 Model::ListObjectsV2Outcome Client::ListObjectsV2(const ListObjectsV2Request & request) const
 {
- return doRequest(request, [this](const Model::ListObjectsV2Request & req) { return Aws::S3::S3Client::ListObjectsV2(req); });
+ return doRequest(request, [this](const Model::ListObjectsV2Request & req) { return ListObjectsV2(req); });
 }
 Model::ListObjectsOutcome Client::ListObjects(const ListObjectsRequest & request) const
 {
- return doRequest(request, [this](const Model::ListObjectsRequest & req) { return Aws::S3::S3Client::ListObjects(req); });
+ return doRequest(request, [this](const Model::ListObjectsRequest & req) { return ListObjects(req); });
 }
 Model::GetObjectOutcome Client::GetObject(const GetObjectRequest & request) const
 {
- return doRequest(request, [this](const Model::GetObjectRequest & req) { return Aws::S3::S3Client::GetObject(req); });
+ return doRequest(request, [this](const Model::GetObjectRequest & req) { return GetObject(req); });
 }
 Model::AbortMultipartUploadOutcome Client::AbortMultipartUpload(const AbortMultipartUploadRequest & request) const
 {
 return doRequest(
- request, [this](const Model::AbortMultipartUploadRequest & req) { return Aws::S3::S3Client::AbortMultipartUpload(req); });
+ request, [this](const Model::AbortMultipartUploadRequest & req) { return AbortMultipartUpload(req); });
 }
 Model::CreateMultipartUploadOutcome Client::CreateMultipartUpload(const CreateMultipartUploadRequest & request) const
 {
 return doRequest(
- request, [this](const
Model::CreateMultipartUploadRequest & req) { return CreateMultipartUpload(req); }); } Model::CompleteMultipartUploadOutcome Client::CompleteMultipartUpload(const CompleteMultipartUploadRequest & request) const { return doRequest( - request, [this](const Model::CompleteMultipartUploadRequest & req) { return Aws::S3::S3Client::CompleteMultipartUpload(req); }); + request, [this](const Model::CompleteMultipartUploadRequest & req) { return CompleteMultipartUpload(req); }); } Model::CopyObjectOutcome Client::CopyObject(const CopyObjectRequest & request) const { - return doRequest(request, [this](const Model::CopyObjectRequest & req) { return Aws::S3::S3Client::CopyObject(req); }); + return doRequest(request, [this](const Model::CopyObjectRequest & req) { return CopyObject(req); }); } Model::PutObjectOutcome Client::PutObject(const PutObjectRequest & request) const { - return doRequest(request, [this](const Model::PutObjectRequest & req) { return Aws::S3::S3Client::PutObject(req); }); + return doRequest(request, [this](const Model::PutObjectRequest & req) { return PutObject(req); }); } Model::UploadPartOutcome Client::UploadPart(const UploadPartRequest & request) const { - return doRequest(request, [this](const Model::UploadPartRequest & req) { return Aws::S3::S3Client::UploadPart(req); }); + return doRequest(request, [this](const Model::UploadPartRequest & req) { return UploadPart(req); }); } Model::UploadPartCopyOutcome Client::UploadPartCopy(const UploadPartCopyRequest & request) const { - return doRequest(request, [this](const Model::UploadPartCopyRequest & req) { return Aws::S3::S3Client::UploadPartCopy(req); }); + return doRequest(request, [this](const Model::UploadPartCopyRequest & req) { return UploadPartCopy(req); }); } Model::DeleteObjectOutcome Client::DeleteObject(const DeleteObjectRequest & request) const { - return doRequest(request, [this](const Model::DeleteObjectRequest & req) { return Aws::S3::S3Client::DeleteObject(req); }); + return doRequest(request, [this](const Model::DeleteObjectRequest & req) { return DeleteObject(req); }); } Model::DeleteObjectsOutcome Client::DeleteObjects(const DeleteObjectsRequest & request) const { - return doRequest(request, [this](const Model::DeleteObjectsRequest & req) { return Aws::S3::S3Client::DeleteObjects(req); }); + return doRequest(request, [this](const Model::DeleteObjectsRequest & req) { return DeleteObjects(req); }); } template diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 63feb94e593..330c85c418a 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -40,6 +40,11 @@ struct ServerSideEncryptionKMSConfig #include #include +namespace MockS3 +{ + struct Client; +} + namespace DB::S3 { @@ -195,6 +200,8 @@ public: bool supportsMultiPartCopy() const; private: + friend struct ::MockS3::Client; + Client(size_t max_redirects_, ServerSideEncryptionKMSConfig sse_kms_config_, const std::shared_ptr& credentials_provider, diff --git a/src/IO/SwapHelper.cpp b/src/IO/SwapHelper.cpp new file mode 100644 index 00000000000..4a1cc8acf4c --- /dev/null +++ b/src/IO/SwapHelper.cpp @@ -0,0 +1,17 @@ +#include + +namespace DB +{ + +SwapHelper::SwapHelper(BufferBase & b1_, BufferBase & b2_) + : b1(b1_), b2(b2_) +{ + b1.swap(b2); +} + +SwapHelper::~SwapHelper() +{ + b1.swap(b2); +} + +} diff --git a/src/IO/SwapHelper.h b/src/IO/SwapHelper.h index cedbf5f78fe..fcf32927f23 100644 --- a/src/IO/SwapHelper.h +++ b/src/IO/SwapHelper.h @@ -1,16 +1,19 @@ #pragma once + #include namespace DB { - class SwapHelper - { - public: - SwapHelper(BufferBase & 
b1_, BufferBase & b2_) : b1(b1_), b2(b2_) { b1.swap(b2); }
- ~SwapHelper() { b1.swap(b2); }
- private:
- BufferBase & b1;
- BufferBase & b2;
- };
+class SwapHelper
+{
+public:
+ SwapHelper(BufferBase & b1_, BufferBase & b2_);
+ ~SwapHelper();
+
+private:
+ BufferBase & b1;
+ BufferBase & b2;
+};
+
 }
diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h
index 436d07515a3..2c891e17d9a 100644
--- a/src/IO/WriteBuffer.h
+++ b/src/IO/WriteBuffer.h
@@ -42,7 +42,8 @@ public:
 {
 if (!offset())
 return;
- bytes += offset();
+
+ auto bytes_in_buffer = offset();
 try
 {
@@ -54,9 +55,11 @@ public:
 * so that later (for example, when the stack was expanded) there was no second attempt to write data.
 */
 pos = working_buffer.begin();
+ bytes += bytes_in_buffer;
 throw;
 }
+ bytes += bytes_in_buffer;
 pos = working_buffer.begin();
 }
diff --git a/src/IO/WriteBufferFromFileDecorator.cpp b/src/IO/WriteBufferFromFileDecorator.cpp
index ac801534b4f..4cc881f177f 100644
--- a/src/IO/WriteBufferFromFileDecorator.cpp
+++ b/src/IO/WriteBufferFromFileDecorator.cpp
@@ -1,6 +1,7 @@
 #include "WriteBufferFromFileDecorator.h"
 #include
+#include
 namespace DB
 {
@@ -13,12 +14,18 @@ WriteBufferFromFileDecorator::WriteBufferFromFileDecorator(std::unique_ptrfinalized is remain false
+ /// That leads to a situation when the destructor of impl is called with impl->finalized equal false.
 if (!is_prefinalized)
 WriteBufferFromFileDecorator::preFinalize();
- impl->finalize();
+ {
+ SwapHelper swap(*this, *impl);
+ impl->finalize();
+ }
 }
 WriteBufferFromFileDecorator::~WriteBufferFromFileDecorator()
@@ -31,11 +38,21 @@ WriteBufferFromFileDecorator::~WriteBufferFromFileDecorator()
 {
 tryLogCurrentException(__PRETTY_FUNCTION__);
 }
+
+ /// It is not a mistake that swap is called here
+ /// Swap has been called in the constructor, so it should be called in the destructor
+ /// In order to provide a valid buffer for impl's d-tor call
+ swap(*impl);
 }
 void WriteBufferFromFileDecorator::sync()
 {
- impl->sync();
+ next();
+
+ {
+ SwapHelper swap(*this, *impl);
+ impl->sync();
+ }
 }
 std::string WriteBufferFromFileDecorator::getFileName() const
@@ -45,11 +62,22 @@ std::string WriteBufferFromFileDecorator::getFileName() const
 return std::string();
 }
+void WriteBufferFromFileDecorator::preFinalize()
+{
+ next();
+
+ {
+ SwapHelper swap(*this, *impl);
+ impl->preFinalize();
+ }
+
+ is_prefinalized = true;
+}
+
 void WriteBufferFromFileDecorator::nextImpl()
 {
- swap(*impl);
+ SwapHelper swap(*this, *impl);
 impl->next();
- swap(*impl);
 }
 }
diff --git a/src/IO/WriteBufferFromFileDecorator.h b/src/IO/WriteBufferFromFileDecorator.h
index dde05276c28..5344bb1425c 100644
--- a/src/IO/WriteBufferFromFileDecorator.h
+++ b/src/IO/WriteBufferFromFileDecorator.h
@@ -17,12 +17,7 @@ public:
 std::string getFileName() const override;
- void preFinalize() override
- {
- next();
- impl->preFinalize();
- is_prefinalized = true;
- }
+ void preFinalize() override;
 const WriteBuffer & getImpl() const { return *impl; }
diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp
index 5a25cb89107..5630ed2cb68 100644
--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@@ -2,13 +2,16 @@
 #if USE_AWS_S3
+#include "WriteBufferFromS3.h"
+#include "WriteBufferFromS3MemoryStream.h"
+#include "WriteBufferFromS3TaskTracker.h"
+
 #include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -29,11 +32,13 @@ namespace ProfileEvents
 extern const Event S3CreateMultipartUpload;
 extern const Event S3CompleteMultipartUpload;
+ extern const Event
S3AbortMultipartUpload; extern const Event S3UploadPart; extern const Event S3PutObject; extern const Event DiskS3CreateMultipartUpload; extern const Event DiskS3CompleteMultipartUpload; + extern const Event DiskS3AbortMultipartUpload; extern const Event DiskS3UploadPart; extern const Event DiskS3PutObject; @@ -43,30 +48,105 @@ namespace ProfileEvents namespace DB { -// S3 protocol does not allow to have multipart upload with more than 10000 parts. -// In case server does not return an error on exceeding that number, we print a warning -// because custom S3 implementation may allow relaxed requirements on that. -const int S3_WARN_MAX_PARTS = 10000; namespace ErrorCodes { extern const int S3_ERROR; extern const int INVALID_CONFIG_PARAMETER; + extern const int LOGICAL_ERROR; } -struct WriteBufferFromS3::UploadPartTask +struct WriteBufferFromS3::PartData { - S3::UploadPartRequest req; - bool is_finished = false; - std::string tag; - std::exception_ptr exception; + Memory<> memory; + size_t data_size = 0; + + std::shared_ptr createAwsBuffer() + { + auto buffer = std::make_shared(memory.data(), data_size); + buffer->exceptions(std::ios::badbit); + return buffer; + } + + bool isEmpty() const + { + return data_size == 0; + } }; -struct WriteBufferFromS3::PutObjectTask +struct WriteBufferFromS3::BufferAllocationPolicy { - S3::PutObjectRequest req; - bool is_finished = false; - std::exception_ptr exception; + size_t first_size = 0; + size_t second_size = 0; + + size_t multiply_factor = 0; + size_t multiply_threshold = 0; + size_t max_size = 0; + + size_t current_size = 0; + size_t buffer_number = 0; + + explicit BufferAllocationPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) + : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) + , second_size(settings_.min_upload_part_size) + , multiply_factor(settings_.upload_part_size_multiply_factor) + , multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold) + , max_size(settings_.max_upload_part_size) + { + if (settings_.strict_upload_part_size > 0) + { + first_size = settings_.strict_upload_part_size; + second_size = settings_.strict_upload_part_size; + multiply_factor = 1; + multiply_threshold = 10000; + max_size = settings_.max_upload_part_size; + } + else + { + first_size = std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size); + second_size = settings_.min_upload_part_size; + multiply_factor = settings_.upload_part_size_multiply_factor; + multiply_threshold = settings_.upload_part_size_multiply_parts_count_threshold; + max_size = settings_.max_upload_part_size; + } + + chassert(first_size > 0); + chassert(second_size > 0); + chassert(multiply_factor >= 1); + chassert(multiply_threshold > 0); + chassert(max_size > 0); + } + + size_t getNumber() const + { + return buffer_number; + } + + size_t getSize() const + { + chassert(buffer_number > 0); + return current_size; + } + + void next() + { + ++buffer_number; + + if (1 == buffer_number) + { + current_size = first_size; + return; + } + + if (2 == buffer_number) + current_size = second_size; + + if (0 == ((buffer_number-1) % multiply_threshold)) + { + current_size *= multiply_factor; + current_size = std::min(current_size, max_size); + } + } }; WriteBufferFromS3::WriteBufferFromS3( @@ -75,146 +155,88 @@ WriteBufferFromS3::WriteBufferFromS3( const String & key_, const S3Settings::RequestSettings & request_settings_, std::optional> object_metadata_, - size_t buffer_size_, ThreadPoolCallbackRunner 
schedule_, const WriteSettings & write_settings_) - : BufferWithOwnMemory(buffer_size_, nullptr, 0) - , bucket(bucket_) + : bucket(bucket_) , key(key_) , request_settings(request_settings_) , upload_settings(request_settings.getUploadSettings()) + , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , strict_upload_part_size(upload_settings.strict_upload_part_size) - , current_upload_part_size(upload_settings.min_upload_part_size) - , schedule(std::move(schedule_)) - , write_settings(write_settings_) + , buffer_allocation_policy(std::make_unique(request_settings_.getUploadSettings())) + , task_tracker(std::make_unique(std::move(schedule_))) { + LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails()); + allocateBuffer(); } void WriteBufferFromS3::nextImpl() { - if (!offset()) + LOG_TRACE(log, "nextImpl with incoming data size {}, memory buffer size {}. {}", offset(), memory.size(), getLogDetails()); + + if (is_prefinalized) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest"); + + /// Make sense to call to before adding new async task to check if there is an exception + task_tracker->getReady(); + + hidePartialData(); + + reallocateFirstBuffer(); + + if (available() > 0) return; - /// Buffer in a bad state after exception - if (temporary_buffer->tellp() == -1) - allocateBuffer(); - else - chassert(temporary_buffer->tellp() == static_cast(last_part_size)); + detachBuffer(); - if (strict_upload_part_size) - processWithStrictParts(); - else - processWithDynamicParts(); + if (!multipart_upload_id.empty() || detached_part_data.size() > 1) + writeMultipartUpload(); - waitForReadyBackgroundTasks(); -} - -void WriteBufferFromS3::processWithStrictParts() -{ - chassert(strict_upload_part_size > 0); - - size_t buffer_size = offset(); - size_t left_in_buffer = buffer_size; - size_t new_size = last_part_size + buffer_size; - size_t buffer_offset = 0; - - if (new_size > strict_upload_part_size) - { - /// Data size will exceed fixed part size threshold for multipart upload, need to use multipart upload. 
- if (multipart_upload_id.empty()) - createMultipartUpload(); - - while (new_size > strict_upload_part_size) - { - size_t to_write = strict_upload_part_size - last_part_size; - temporary_buffer->write(working_buffer.begin() + buffer_offset, to_write); - buffer_offset += to_write; - - writePart(); - allocateBuffer(); - - new_size -= strict_upload_part_size; - left_in_buffer -= to_write; - } - } - - if (left_in_buffer) - { - temporary_buffer->write(working_buffer.begin() + buffer_offset, left_in_buffer); - last_part_size += left_in_buffer; - } - - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, buffer_size); - - if (write_settings.remote_throttler) - write_settings.remote_throttler->add(buffer_size, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); -} - -void WriteBufferFromS3::processWithDynamicParts() -{ - chassert(current_upload_part_size > 0); - - size_t size = offset(); - temporary_buffer->write(working_buffer.begin(), size); - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, size); - last_part_size += size; - - if (write_settings.remote_throttler) - write_settings.remote_throttler->add(size, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); - - /// Data size exceeds singlepart upload threshold, need to use multipart upload. - if (multipart_upload_id.empty() && last_part_size > upload_settings.max_single_part_upload_size) - createMultipartUpload(); - - if (!multipart_upload_id.empty() && last_part_size > current_upload_part_size) - { - writePart(); - allocateBuffer(); - } -} - -void WriteBufferFromS3::allocateBuffer() -{ - temporary_buffer = Aws::MakeShared("temporary buffer"); - temporary_buffer->exceptions(std::ios::badbit); - last_part_size = 0; -} - -WriteBufferFromS3::~WriteBufferFromS3() -{ -#ifndef NDEBUG - if (!finalized) - { - LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It's a bug"); - std::terminate(); - } -#else - try - { - finalize(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } -#endif + allocateBuffer(); } void WriteBufferFromS3::preFinalize() { - next(); + if (is_prefinalized) + return; - if (multipart_upload_id.empty()) + LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails()); + + task_tracker->getReady(); + + hidePartialData(); + + if (hidden_size > 0) + detachBuffer(); + setFakeBufferWhenPreFinalized(); + + bool do_single_part_upload = false; + + if (multipart_upload_id.empty() && detached_part_data.size() <= 1) { - makeSinglepartUpload(); + if (detached_part_data.empty() || detached_part_data.front().data_size <= upload_settings.max_single_part_upload_size) + do_single_part_upload = true; + } + + if (do_single_part_upload) + { + if (detached_part_data.empty()) + { + makeSinglepartUpload({}); + } + else + { + makeSinglepartUpload(std::move(detached_part_data.front())); + detached_part_data.pop_front(); + } } else { - /// Write rest of the data as last part. - writePart(); + writeMultipartUpload(); } is_prefinalized = true; @@ -222,24 +244,182 @@ void WriteBufferFromS3::preFinalize() void WriteBufferFromS3::finalizeImpl() { + LOG_TRACE(log, "finalizeImpl WriteBufferFromS3. 
{}.", getLogDetails()); + if (!is_prefinalized) preFinalize(); - waitForAllBackgroundTasks(); + chassert(offset() == 0); + chassert(hidden_size == 0); + + task_tracker->getAll(); if (!multipart_upload_id.empty()) + { completeMultipartUpload(); + multipart_upload_finished = true; + } if (request_settings.check_objects_after_upload) { LOG_TRACE(log, "Checking object {} exists after upload", key); S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload"); + + LOG_TRACE(log, "Checking object {} has size as expected {}", key, total_size); + size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings, /* for_disk_s3= */ write_settings.for_object_storage); + if (actual_size != total_size) + throw Exception( + ErrorCodes::S3_ERROR, + "Object {} from bucket {} has unexpected size {} after upload, expected size {}, it's a bug in S3 or S3 API.", + key, bucket, actual_size, total_size); + LOG_TRACE(log, "Object {} exists after upload", key); } } -void WriteBufferFromS3::fillCreateMultipartRequest(DB::S3::CreateMultipartUploadRequest & req) +String WriteBufferFromS3::getLogDetails() const { + String multipart_upload_details; + if (!multipart_upload_id.empty()) + multipart_upload_details = fmt::format(", upload id {}, upload has finished {}" + , multipart_upload_id, multipart_upload_finished); + + return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, finalized {}{}", + bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), finalized, multipart_upload_details); +} + +void WriteBufferFromS3::tryToAbortMultipartUpload() +{ + try + { + task_tracker->safeWaitAll(); + abortMultipartUpload(); + } + catch (...) + { + LOG_ERROR(log, "Multipart upload hasn't aborted. {}", getLogDetails()); + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + +WriteBufferFromS3::~WriteBufferFromS3() +{ + LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails()); + + // That descructor could be call with finalized=false in case of exceptions + if (!finalized) + { + LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails()); + } + + task_tracker->safeWaitAll(); + + if (!multipart_upload_id.empty() && !multipart_upload_finished) + { + LOG_WARNING(log, "WriteBufferFromS3 was neither finished nor aborted, try to abort upload in destructor. 
{}.", getLogDetails()); + tryToAbortMultipartUpload(); + } +} + +void WriteBufferFromS3::hidePartialData() +{ + if (write_settings.remote_throttler) + write_settings.remote_throttler->add(offset(), ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); + + chassert(memory.size() >= hidden_size + offset()); + + hidden_size += offset(); + chassert(memory.data() + hidden_size == working_buffer.begin() + offset()); + chassert(memory.data() + hidden_size == position()); + + WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); + chassert(offset() == 0); +} + +void WriteBufferFromS3::reallocateFirstBuffer() +{ + chassert(offset() == 0); + + if (buffer_allocation_policy->getNumber() > 1 || available() > 0) + return; + + const size_t max_first_buffer = buffer_allocation_policy->getSize(); + if (memory.size() == max_first_buffer) + return; + + size_t size = std::min(memory.size() * 2, max_first_buffer); + memory.resize(size); + + WriteBuffer::set(memory.data() + hidden_size, memory.size() - hidden_size); + + chassert(offset() == 0); + + LOG_TRACE(log, "Reallocated first buffer with size {}. {}", memory.size(), getLogDetails()); +} + +void WriteBufferFromS3::detachBuffer() +{ + size_t data_size = size_t(position() - memory.data()); + chassert(data_size == hidden_size); + + auto buf = std::move(memory); + + WriteBuffer::set(nullptr, 0); + total_size += hidden_size; + hidden_size = 0; + + detached_part_data.push_back({std::move(buf), data_size}); +} + +void WriteBufferFromS3::allocateFirstBuffer() +{ + const auto max_first_buffer = buffer_allocation_policy->getSize(); + const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer); + memory = Memory(size); + WriteBuffer::set(memory.data(), memory.size()); + + LOG_TRACE(log, "Allocated first buffer with size {}. {}", memory.size(), getLogDetails()); +} + +void WriteBufferFromS3::allocateBuffer() +{ + buffer_allocation_policy->next(); + chassert(0 == hidden_size); + + if (buffer_allocation_policy->getNumber() == 1) + return allocateFirstBuffer(); + + memory = Memory(buffer_allocation_policy->getSize()); + WriteBuffer::set(memory.data(), memory.size()); + + LOG_TRACE(log, "Allocated buffer with size {}. {}", buffer_allocation_policy->getSize(), getLogDetails()); +} + +void WriteBufferFromS3::setFakeBufferWhenPreFinalized() +{ + WriteBuffer::set(fake_buffer_when_prefinalized, sizeof(fake_buffer_when_prefinalized)); +} + +void WriteBufferFromS3::writeMultipartUpload() +{ + if (multipart_upload_id.empty()) + { + createMultipartUpload(); + } + + while (!detached_part_data.empty()) + { + writePart(std::move(detached_part_data.front())); + detached_part_data.pop_front(); + } +} + +void WriteBufferFromS3::createMultipartUpload() +{ + LOG_TRACE(log, "Create multipart upload. 
Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id); + + S3::CreateMultipartUploadRequest req; + req.SetBucket(bucket); req.SetKey(key); @@ -250,12 +430,6 @@ void WriteBufferFromS3::fillCreateMultipartRequest(DB::S3::CreateMultipartUpload req.SetMetadata(object_metadata.value()); client_ptr->setKMSHeaders(req); -} - -void WriteBufferFromS3::createMultipartUpload() -{ - DB::S3::CreateMultipartUploadRequest req; - fillCreateMultipartRequest(req); ProfileEvents::increment(ProfileEvents::S3CreateMultipartUpload); if (write_settings.for_object_storage) @@ -267,184 +441,164 @@ void WriteBufferFromS3::createMultipartUpload() ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); - if (outcome.IsSuccess()) - { - multipart_upload_id = outcome.GetResult().GetUploadId(); - LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id); - } - else + if (!outcome.IsSuccess()) { ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); } + + multipart_upload_id = outcome.GetResult().GetUploadId(); + LOG_TRACE(log, "Multipart upload has created. {}", getLogDetails()); } -void WriteBufferFromS3::writePart() +void WriteBufferFromS3::abortMultipartUpload() { - auto size = temporary_buffer->tellp(); - - LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Upload_id: {}, Size: {}", bucket, key, multipart_upload_id, size); - - if (size < 0) + if (multipart_upload_id.empty()) { - LOG_WARNING(log, "Skipping part upload. Buffer is in bad state, it means that we have tried to upload something, but got an exception."); + LOG_WARNING(log, "Nothing to abort. {}", getLogDetails()); return; } - if (size == 0) + LOG_WARNING(log, "Abort multipart upload. {}", getLogDetails()); + + S3::AbortMultipartUploadRequest req; + req.SetBucket(bucket); + req.SetKey(key); + req.SetUploadId(multipart_upload_id); + + ProfileEvents::increment(ProfileEvents::S3AbortMultipartUpload); + if (write_settings.for_object_storage) + ProfileEvents::increment(ProfileEvents::DiskS3AbortMultipartUpload); + + Stopwatch watch; + auto outcome = client_ptr->AbortMultipartUpload(req); + watch.stop(); + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); + + if (!outcome.IsSuccess()) { - LOG_TRACE(log, "Skipping writing part. Buffer is empty."); - return; + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); } - if (TSA_SUPPRESS_WARNING_FOR_READ(part_tags).size() == S3_WARN_MAX_PARTS) - { - // Don't throw exception here by ourselves but leave the decision to take by S3 server. - LOG_WARNING(log, "Maximum part number in S3 protocol has reached (too many parts). Server may not accept this whole upload."); - } - - if (schedule) - { - UploadPartTask * task = nullptr; - - { - std::lock_guard lock(bg_tasks_mutex); - task = &upload_object_tasks.emplace_back(); - ++num_added_bg_tasks; - } - - /// Notify waiting thread when task finished - auto task_finish_notify = [&, task]() - { - std::lock_guard lock(bg_tasks_mutex); - task->is_finished = true; - ++num_finished_bg_tasks; - - /// Notification under mutex is important here. - /// Otherwise, WriteBuffer could be destroyed in between - /// Releasing lock and condvar notification. 
- bg_tasks_condvar.notify_one(); - }; - - try - { - fillUploadRequest(task->req); - - schedule([this, task, task_finish_notify]() - { - try - { - processUploadRequest(*task); - } - catch (...) - { - task->exception = std::current_exception(); - } - - task_finish_notify(); - }, 0); - } - catch (...) - { - task_finish_notify(); - throw; - } - } - else - { - UploadPartTask task; - auto & tags = TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags); /// Suppress warning because schedule == false. - - fillUploadRequest(task.req); - processUploadRequest(task); - tags.push_back(task.tag); - } + LOG_WARNING(log, "Multipart upload has aborted successfully. {}", getLogDetails()); } -void WriteBufferFromS3::fillUploadRequest(S3::UploadPartRequest & req) +S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, PartData & data) { - /// Increase part number. - ++part_number; + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size); + LOG_TRACE(log, "fillUploadRequest, size {}, key: {}", data.data_size, key); - auto max_part_number = upload_settings.max_part_number; - - if (!multipart_upload_id.empty() && (part_number > max_part_number)) - { - throw Exception( - ErrorCodes::INVALID_CONFIG_PARAMETER, - "Part number exceeded {} while writing {} bytes to S3. " - "Check min_upload_part_size = {}, max_upload_part_size = {}, " - "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, " - "max_single_part_upload_size = {}", - max_part_number, count(), - upload_settings.min_upload_part_size, upload_settings.max_upload_part_size, - upload_settings.upload_part_size_multiply_factor, - upload_settings.upload_part_size_multiply_parts_count_threshold, - upload_settings.max_single_part_upload_size); - } + S3::UploadPartRequest req; /// Setup request. req.SetBucket(bucket); req.SetKey(key); req.SetPartNumber(static_cast(part_number)); req.SetUploadId(multipart_upload_id); - req.SetContentLength(temporary_buffer->tellp()); - req.SetBody(temporary_buffer); - + req.SetContentLength(data.data_size); + req.SetBody(data.createAwsBuffer()); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 req.SetContentType("binary/octet-stream"); - if (!strict_upload_part_size) - { - /// Maybe increase `current_upload_part_size` (we need to increase it sometimes to keep `part_number` less or equal than `max_part_number`). 
- auto threshold = upload_settings.upload_part_size_multiply_parts_count_threshold; - if (!multipart_upload_id.empty() && (part_number % threshold == 0)) - { - auto max_upload_part_size = upload_settings.max_upload_part_size; - auto upload_part_size_multiply_factor = upload_settings.upload_part_size_multiply_factor; - current_upload_part_size *= upload_part_size_multiply_factor; - current_upload_part_size = std::min(current_upload_part_size, max_upload_part_size); - } - } + return req; } -void WriteBufferFromS3::processUploadRequest(UploadPartTask & task) +void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) { - ProfileEvents::increment(ProfileEvents::S3UploadPart); - if (write_settings.for_object_storage) - ProfileEvents::increment(ProfileEvents::DiskS3UploadPart); - - ResourceCost cost = task.req.GetContentLength(); - ResourceGuard rlock(write_settings.resource_link, cost); - Stopwatch watch; - auto outcome = client_ptr->UploadPart(task.req); - watch.stop(); - rlock.unlock(); // Avoid acquiring other locks under resource lock - - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); - - if (outcome.IsSuccess()) + if (data.data_size == 0) { - task.tag = outcome.GetResult().GetETag(); - std::lock_guard lock(bg_tasks_mutex); /// Protect part_tags from race - LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size()); + LOG_TRACE(log, "Skipping writing part as empty."); + return; } - else + + multipart_tags.push_back({}); + size_t part_number = multipart_tags.size(); + LOG_TRACE(log, "WritePart. {}, part size: {}, part number: {}", getLogDetails(), data.data_size, part_number); + + if (multipart_upload_id.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Unable to write a part without multipart_upload_id, details: WriteBufferFromS3 created for bucket {}, key {}", + bucket, key); + + if (part_number > upload_settings.max_part_number) { - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); - write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure - throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "Part number exceeded {} while writing {} bytes to S3. Check min_upload_part_size = {}, max_upload_part_size = {}, " + "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, max_single_part_upload_size = {}", + upload_settings.max_part_number, count(), upload_settings.min_upload_part_size, upload_settings.max_upload_part_size, + upload_settings.upload_part_size_multiply_factor, upload_settings.upload_part_size_multiply_parts_count_threshold, + upload_settings.max_single_part_upload_size); } + + if (data.data_size > upload_settings.max_upload_part_size) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Part size exceeded max_upload_part_size, part number: {}, part size {}, max_upload_part_size {}, {}", + part_number, + data.data_size, + upload_settings.max_upload_part_size, + getLogDetails()); + } + + auto req = getUploadRequest(part_number, data); + auto worker_data = std::make_shared>(std::move(req), std::move(data)); + + auto upload_worker = [&, worker_data, part_number] () + { + LOG_TEST(log, "Writing part started. 
bucket {}, key {}, part id {}", bucket, key, part_number); + + ProfileEvents::increment(ProfileEvents::S3UploadPart); + if (write_settings.for_object_storage) + ProfileEvents::increment(ProfileEvents::DiskS3UploadPart); + + auto & request = std::get<0>(*worker_data); + + ResourceCost cost = request.GetContentLength(); + ResourceGuard rlock(write_settings.resource_link, cost); + Stopwatch watch; + auto outcome = client_ptr->UploadPart(request); + watch.stop(); + rlock.unlock(); // Avoid acquiring other locks under resource lock + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); + + if (!outcome.IsSuccess()) + { + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); + } + + multipart_tags[part_number-1] = outcome.GetResult().GetETag(); + + LOG_TEST(log, "Writing part finished. bucket {}, key{}, part id {}, etag {}", bucket, key, part_number, multipart_tags[part_number-1]); + }; + + task_tracker->add(std::move(upload_worker)); } void WriteBufferFromS3::completeMultipartUpload() { - const auto & tags = TSA_SUPPRESS_WARNING_FOR_READ(part_tags); + LOG_TRACE(log, "Completing multipart upload. {}, Parts: {}", getLogDetails(), multipart_tags.size()); - LOG_TRACE(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size()); + if (multipart_tags.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Failed to complete multipart upload. No parts have uploaded"); - if (tags.empty()) - throw Exception(ErrorCodes::S3_ERROR, "Failed to complete multipart upload. No parts have uploaded"); + for (size_t i = 0; i < multipart_tags.size(); ++i) + { + const auto tag = multipart_tags.at(i); + if (tag.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Failed to complete multipart upload. Part {} haven't been uploaded.", i); + } S3::CompleteMultipartUploadRequest req; req.SetBucket(bucket); @@ -452,10 +606,10 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetUploadId(multipart_upload_id); Aws::S3::Model::CompletedMultipartUpload multipart_upload; - for (size_t i = 0; i < tags.size(); ++i) + for (size_t i = 0; i < multipart_tags.size(); ++i) { Aws::S3::Model::CompletedPart part; - multipart_upload.AddParts(part.WithETag(tags[i]).WithPartNumber(static_cast(i + 1))); + multipart_upload.AddParts(part.WithETag(multipart_tags[i]).WithPartNumber(static_cast(i + 1))); } req.SetMultipartUpload(multipart_upload); @@ -475,26 +629,24 @@ void WriteBufferFromS3::completeMultipartUpload() if (outcome.IsSuccess()) { - LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size()); + LOG_TRACE(log, "Multipart upload has completed. {}, Parts: {}", getLogDetails(), multipart_tags.size()); return; } + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + + if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) + { + /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests + /// BTW, NO_SUCH_UPLOAD is expected error and we shouldn't retry it + LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error, will retry. 
{}, Parts: {}", getLogDetails(), multipart_tags.size()); + } else { - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); - - if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) - { - /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests - /// BTW, NO_SUCH_UPLOAD is expected error and we shouldn't retry it - LOG_INFO(log, "Multipart upload failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Upload_id: {}, Parts: {}, will retry", bucket, key, multipart_upload_id, tags.size()); - } - else - { - throw S3Exception( - outcome.GetError().GetErrorType(), - "Message: {}, Key: {}, Bucket: {}, Tags: {}", - outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " ")); - } + throw S3Exception( + outcome.GetError().GetErrorType(), + "Message: {}, Key: {}, Bucket: {}, Tags: {}", + outcome.GetError().GetMessage(), key, bucket, fmt::join(multipart_tags.begin(), multipart_tags.end(), " ")); } } @@ -504,73 +656,17 @@ void WriteBufferFromS3::completeMultipartUpload() max_retry, key, bucket); } -void WriteBufferFromS3::makeSinglepartUpload() +S3::PutObjectRequest WriteBufferFromS3::getPutRequest(PartData & data) { - auto size = temporary_buffer->tellp(); - bool with_pool = static_cast(schedule); + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size); + LOG_TRACE(log, "getPutRequest, size {}, key {}", data.data_size, key); - LOG_TRACE(log, "Making single part upload. Bucket: {}, Key: {}, Size: {}, WithPool: {}", bucket, key, size, with_pool); + S3::PutObjectRequest req; - if (size < 0) - { - LOG_WARNING(log, "Skipping single part upload. Buffer is in bad state, it mean that we have tried to upload something, but got an exception."); - return; - } - - if (schedule) - { - put_object_task = std::make_unique(); - - /// Notify waiting thread when put object task finished - auto task_notify_finish = [&]() - { - std::lock_guard lock(bg_tasks_mutex); - put_object_task->is_finished = true; - - /// Notification under mutex is important here. - /// Othervies, WriteBuffer could be destroyed in between - /// Releasing lock and condvar notification. - bg_tasks_condvar.notify_one(); - }; - - try - { - fillPutRequest(put_object_task->req); - - schedule([this, task_notify_finish]() - { - try - { - processPutRequest(*put_object_task); - } - catch (...) - { - put_object_task->exception = std::current_exception(); - } - - task_notify_finish(); - }, 0); - } - catch (...) 
- { - task_notify_finish(); - throw; - } - } - else - { - PutObjectTask task; - fillPutRequest(task.req); - processPutRequest(task); - } -} - -void WriteBufferFromS3::fillPutRequest(S3::PutObjectRequest & req) -{ req.SetBucket(bucket); req.SetKey(key); - req.SetContentLength(temporary_buffer->tellp()); - req.SetBody(temporary_buffer); + req.SetContentLength(data.data_size); + req.SetBody(data.createAwsBuffer()); if (object_metadata.has_value()) req.SetMetadata(object_metadata.value()); if (!upload_settings.storage_class_name.empty()) @@ -580,121 +676,73 @@ void WriteBufferFromS3::fillPutRequest(S3::PutObjectRequest & req) req.SetContentType("binary/octet-stream"); client_ptr->setKMSHeaders(req); + + return req; } -void WriteBufferFromS3::processPutRequest(const PutObjectTask & task) +void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data) { - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); - for (size_t i = 0; i < max_retry; ++i) + LOG_TRACE(log, "Making single part upload. {}.", getLogDetails()); + + auto req = getPutRequest(data); + auto worker_data = std::make_shared>(std::move(req), std::move(data)); + + auto upload_worker = [&, worker_data] () { - ProfileEvents::increment(ProfileEvents::S3PutObject); - if (write_settings.for_object_storage) - ProfileEvents::increment(ProfileEvents::DiskS3PutObject); + LOG_TEST(log, "writing single part upload started. bucket {}, key {}", bucket, key); - ResourceCost cost = task.req.GetContentLength(); - ResourceGuard rlock(write_settings.resource_link, cost); - Stopwatch watch; - auto outcome = client_ptr->PutObject(task.req); - watch.stop(); - rlock.unlock(); + auto & request = std::get<0>(*worker_data); + size_t content_length = request.GetContentLength(); - ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); - - bool with_pool = static_cast(schedule); - if (outcome.IsSuccess()) - { - LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool); - return; - } - else + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + for (size_t i = 0; i < max_retry; ++i) { + ProfileEvents::increment(ProfileEvents::S3PutObject); + if (write_settings.for_object_storage) + ProfileEvents::increment(ProfileEvents::DiskS3PutObject); + + ResourceCost cost = request.GetContentLength(); + ResourceGuard rlock(write_settings.resource_link, cost); + Stopwatch watch; + auto outcome = client_ptr->PutObject(request); + watch.stop(); + rlock.unlock(); + + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Microseconds, watch.elapsedMicroseconds()); + + if (outcome.IsSuccess()) + { + LOG_TRACE(log, "Single part upload has completed. 
bucket {}, key {}, object size {}", bucket, key, content_length); + return; + } + ProfileEvents::increment(ProfileEvents::WriteBufferFromS3RequestsErrors, 1); + write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY) { - write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + /// For unknown reason, at least MinIO can respond with NO_SUCH_KEY for put requests - LOG_INFO(log, "Single part upload failed with NO_SUCH_KEY error for Bucket: {}, Key: {}, Object size: {}, WithPool: {}, will retry", bucket, key, task.req.GetContentLength(), with_pool); + LOG_INFO(log, "Single part upload failed with NO_SUCH_KEY error for bucket {}, key {}, object size {}, will retry", bucket, key, content_length); } else { - write_settings.resource_link.accumulate(cost); // We assume no resource was used in case of failure + LOG_ERROR(log, "S3Exception name {}, Message: {}, bucket {}, key {}, object size {}", + outcome.GetError().GetExceptionName(), outcome.GetError().GetMessage(), bucket, key, content_length); throw S3Exception( outcome.GetError().GetErrorType(), - "Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}", - outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool); + "Message: {}, bucket {}, key {}, object size {}", + outcome.GetError().GetMessage(), bucket, key, content_length); } } - } - throw S3Exception( - Aws::S3::S3Errors::NO_SUCH_KEY, - "Message: Single part upload failed with NO_SUCH_KEY error, retries {}, Key: {}, Bucket: {}", - max_retry, key, bucket); -} + throw S3Exception( + Aws::S3::S3Errors::NO_SUCH_KEY, + "Message: Single part upload failed with NO_SUCH_KEY error, retries {}, Key: {}, Bucket: {}", + max_retry, key, bucket); + }; -void WriteBufferFromS3::waitForReadyBackgroundTasks() -{ - if (schedule) - { - std::unique_lock lock(bg_tasks_mutex); - - /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock - auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(upload_object_tasks); - - while (!tasks.empty() && tasks.front().is_finished) - { - auto & task = tasks.front(); - auto exception = task.exception; - auto tag = std::move(task.tag); - tasks.pop_front(); - - if (exception) - { - waitForAllBackgroundTasksUnlocked(lock); - std::rethrow_exception(exception); - } - - TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags).push_back(tag); - } - } -} - -void WriteBufferFromS3::waitForAllBackgroundTasks() -{ - if (schedule) - { - std::unique_lock lock(bg_tasks_mutex); - waitForAllBackgroundTasksUnlocked(lock); - } -} - -void WriteBufferFromS3::waitForAllBackgroundTasksUnlocked(std::unique_lock & bg_tasks_lock) -{ - if (schedule) - { - bg_tasks_condvar.wait(bg_tasks_lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); }); - - /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock - auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(upload_object_tasks); - while (!tasks.empty()) - { - auto & task = tasks.front(); - - if (task.exception) - std::rethrow_exception(task.exception); - - TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags).push_back(task.tag); - - tasks.pop_front(); - } - - if (put_object_task) - { - bg_tasks_condvar.wait(bg_tasks_lock, [this]() { return put_object_task->is_finished; }); - if 
(put_object_task->exception) - std::rethrow_exception(put_object_task->exception); - } - } + task_tracker->add(std::move(upload_worker)); } } diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 2374f1502f5..13ed151ad57 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -4,20 +4,16 @@ #if USE_AWS_S3 -#include -#include -#include - #include #include #include #include -#include #include #include -#include - +#include +#include +#include namespace Aws::S3 { @@ -27,8 +23,6 @@ class Client; namespace DB { -class WriteBufferFromFile; - /** * Buffer to write a data to a S3 object with specified bucket and key. * If data size written to the buffer is less than 'max_single_part_upload_size' write is performed using singlepart upload. @@ -45,81 +39,74 @@ public: const String & key_, const S3Settings::RequestSettings & request_settings_, std::optional> object_metadata_ = std::nullopt, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE, ThreadPoolCallbackRunner schedule_ = {}, const WriteSettings & write_settings_ = {}); ~WriteBufferFromS3() override; - void nextImpl() override; - void preFinalize() override; private: - void allocateBuffer(); - - void processWithStrictParts(); - void processWithDynamicParts(); - - void fillCreateMultipartRequest(S3::CreateMultipartUploadRequest & req); - void createMultipartUpload(); - void writePart(); - void completeMultipartUpload(); - - void makeSinglepartUpload(); - /// Receives response from the server after sending all data. void finalizeImpl() override; - struct UploadPartTask; - void fillUploadRequest(S3::UploadPartRequest & req); - void processUploadRequest(UploadPartTask & task); + String getLogDetails() const; - struct PutObjectTask; - void fillPutRequest(S3::PutObjectRequest & req); - void processPutRequest(const PutObjectTask & task); + struct PartData; + void hidePartialData(); + void allocateFirstBuffer(); + void reallocateFirstBuffer(); + void detachBuffer(); + void allocateBuffer(); + void setFakeBufferWhenPreFinalized(); - void waitForReadyBackgroundTasks(); - void waitForAllBackgroundTasks(); - void waitForAllBackgroundTasksUnlocked(std::unique_lock & bg_tasks_lock); + S3::UploadPartRequest getUploadRequest(size_t part_number, PartData & data); + void writePart(PartData && data); + void writeMultipartUpload(); + void createMultipartUpload(); + void completeMultipartUpload(); + void abortMultipartUpload(); + void tryToAbortMultipartUpload(); + + S3::PutObjectRequest getPutRequest(PartData & data); + void makeSinglepartUpload(PartData && data); const String bucket; const String key; const S3Settings::RequestSettings request_settings; const S3Settings::RequestSettings::PartUploadSettings & upload_settings; + const WriteSettings write_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; + Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3"); - /// Strict/static Part size, no adjustments will be done on fly. - size_t strict_upload_part_size = 0; - /// Part size will be adjusted on fly (for bigger uploads) - size_t current_upload_part_size = 0; - std::shared_ptr temporary_buffer; /// Buffer to accumulate data. - size_t last_part_size = 0; - size_t part_number = 0; + struct BufferAllocationPolicy; + std::unique_ptr buffer_allocation_policy; /// Upload in S3 is made in parts. /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. 
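+    /// Part ETags are collected in multipart_tags in part-number order; completeMultipartUpload()
+    /// replays them as CompletedPart entries with 1-based part numbers (index + 1).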
String multipart_upload_id; - std::vector TSA_GUARDED_BY(bg_tasks_mutex) part_tags; + std::deque multipart_tags; + bool multipart_upload_finished = false; + /// Track that prefinalize() is called only once bool is_prefinalized = false; - /// Following fields are for background uploads in thread pool (if specified). - /// We use std::function to avoid dependency of Interpreters - const ThreadPoolCallbackRunner schedule; + /// First fully filled buffer has to be delayed + /// There are two ways after: + /// First is to call prefinalize/finalize, which leads to single part upload + /// Second is to write more data, which leads to multi part upload + std::deque detached_part_data; + char fake_buffer_when_prefinalized[1] = {}; - std::unique_ptr put_object_task; /// Does not need protection by mutex because of the logic around is_finished field. - std::list TSA_GUARDED_BY(bg_tasks_mutex) upload_object_tasks; - int num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; - int num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + /// offset() and count() are unstable inside nextImpl + /// For example nextImpl changes position hence offset() and count() is changed + /// This vars are dedicated to store information about sizes when offset() and count() are unstable + size_t total_size = 0; + size_t hidden_size = 0; - std::mutex bg_tasks_mutex; - std::condition_variable bg_tasks_condvar; - - Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3"); - - WriteSettings write_settings; + class TaskTracker; + std::unique_ptr task_tracker; }; } diff --git a/src/IO/WriteBufferFromS3MemoryStream.cpp b/src/IO/WriteBufferFromS3MemoryStream.cpp new file mode 100644 index 00000000000..6271f15f055 --- /dev/null +++ b/src/IO/WriteBufferFromS3MemoryStream.cpp @@ -0,0 +1,68 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace DB +{ + +MemoryStream::MemoryBuf::MemoryBuf(char * begin_, size_t size_) + : begin(begin_) + , size(size_) +{ + this->setg(begin, begin, begin + size); +} + +MemoryStream::MemoryBuf::int_type MemoryStream::MemoryBuf::underflow() +{ + if (gptr() < egptr()) + return traits_type::to_int_type(*gptr()); + return traits_type::eof(); +} + +MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode mode) +{ + bool out_mode = (std::ios_base::out & mode) != 0; + if (out_mode) + return off_type(-1); + + off_type ret(-1); + + if (way == std::ios_base::beg) + ret = 0; + else if (way == std::ios_base::cur) + ret = gptr() - begin; + else if (way == std::ios_base::end) + ret = size; + + if (ret == off_type(-1)) + return ret; + + ret += off; + if (!(ret >= 0 && size_t(ret) <= size)) + return off_type(-1); + + this->setg(begin, begin + ret, begin + size); + + return pos_type(ret); +} + +MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekpos(pos_type sp, + std::ios_base::openmode mode) +{ + return seekoff(off_type(sp), std::ios_base::beg, mode); +} + +MemoryStream::MemoryStream(char * begin_, size_t size_) + : std::iostream(nullptr) + , mem_buf(begin_, size_) +{ + init(&mem_buf); +} + +} + +#endif + diff --git a/src/IO/WriteBufferFromS3MemoryStream.h b/src/IO/WriteBufferFromS3MemoryStream.h new file mode 100644 index 00000000000..5a7cc17705d --- /dev/null +++ b/src/IO/WriteBufferFromS3MemoryStream.h @@ -0,0 +1,39 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include "WriteBufferFromS3.h" + +#include + +namespace DB +{ + +struct MemoryStream: std::iostream +{ + struct MemoryBuf: 
std::streambuf + { + MemoryBuf(char * begin_, size_t size_); + + int_type underflow() override; + + pos_type seekoff(off_type off, std::ios_base::seekdir way, + std::ios_base::openmode mode) override; + + pos_type seekpos(pos_type sp, + std::ios_base::openmode mode) override; + + char * begin = nullptr; + size_t size = 0; + }; + + MemoryStream(char * begin_, size_t size_); + + MemoryBuf mem_buf; +}; + +} + +#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp new file mode 100644 index 00000000000..0769f7731c2 --- /dev/null +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -0,0 +1,137 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace DB +{ + +WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner scheduler_) + : is_async(bool(scheduler_)) + , scheduler(scheduler_ ? std::move(scheduler_) : syncRunner()) +{} + +WriteBufferFromS3::TaskTracker::~TaskTracker() +{ + safeWaitAll(); +} + +ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() +{ + return [](Callback && callback, int64_t) mutable -> std::future + { + auto package = std::packaged_task(std::move(callback)); + /// No exceptions are propagated, exceptions are packed to future + package(); + return package.get_future(); + }; +} + +void WriteBufferFromS3::TaskTracker::getReady() +{ + LOG_TEST(log, "getReady, in queue {}", futures.size()); + + /// Exceptions are propagated + auto it = futures.begin(); + while (it != futures.end()) + { + chassert(it->valid()); + if (it->wait_for(std::chrono::seconds(0)) != std::future_status::ready) + { + ++it; + continue; + } + + try + { + it->get(); + } catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + + it = futures.erase(it); + } + + LOG_TEST(log, "getReady ended, in queue {}", futures.size()); +} + +void WriteBufferFromS3::TaskTracker::getAll() +{ + LOG_TEST(log, "getAll, in queue {}", futures.size()); + + /// Exceptions are propagated + for (auto & future : futures) + { + try + { + future.get(); + } catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + } + futures.clear(); +} + +void WriteBufferFromS3::TaskTracker::safeWaitAll() +{ + LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size()); + + /// Exceptions are not propagated + for (auto & future : futures) + { + LOG_TEST(log, "safeWaitAll, wait future"); + + if (future.valid()) + future.wait(); + } + + LOG_TEST(log, "safeWaitAll, get in queue {}", futures.size()); + + for (auto & future : futures) + { + if (future.valid()) + { + try + { + future.get(); + } catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + } + futures.clear(); + LOG_TEST(log, "safeWaitAll ended, get in queue {}", futures.size()); +} + +void WriteBufferFromS3::TaskTracker::add(Callback && func) +{ + LOG_TEST(log, "add, in queue {}", futures.size()); + + auto future = scheduler(std::move(func), 0); + auto exit_scope = scope_guard( + [&future]() + { + future.wait(); + } + ); + + futures.push_back(std::move(future)); + + exit_scope.release(); + LOG_TEST(log, "add ended, in queue {}", futures.size()); +} + +bool WriteBufferFromS3::TaskTracker::isAsync() const +{ + return is_async; +} + +} + +#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/IO/WriteBufferFromS3TaskTracker.h new file mode 100644 index 00000000000..fa214a4f8c5 --- /dev/null +++ b/src/IO/WriteBufferFromS3TaskTracker.h @@ -0,0 +1,37 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include "WriteBufferFromS3.h" + +namespace DB +{ + +class WriteBufferFromS3::TaskTracker +{ +public: + using Callback = std::function; + + explicit TaskTracker(ThreadPoolCallbackRunner scheduler_); + ~TaskTracker(); + + static ThreadPoolCallbackRunner syncRunner(); + + bool isAsync() const; + void getReady(); + void getAll(); + void safeWaitAll(); + void add(Callback && func); + +private: + bool is_async; + ThreadPoolCallbackRunner scheduler; + std::list> futures; + Poco::Logger * log = &Poco::Logger::get("TaskTracker"); +}; + +} + +#endif diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp new file mode 100644 index 00000000000..d7661d3e3d0 --- /dev/null +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -0,0 +1,1114 @@ +#include + +#include "config.h" + +#if USE_AWS_S3 + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int S3_ERROR; +} + +} + +namespace MockS3 +{ + +class Sequencer +{ +public: + size_t next() { return counter++; } + std::string next_id() + { + std::stringstream ss; + ss << "id-" << next(); + return ss.str(); + } + +private: + size_t counter = 0; +}; + +class BucketMemStore +{ +public: + typedef std::string Key; + typedef std::string Data; + typedef std::string ETag; + typedef std::string MPU_ID; + typedef std::map MPUPartsInProgress; + typedef std::vector MPUParts; + + + std::map objects; + std::map multiPartUploads; + std::vector> CompletedPartUploads; + + Sequencer sequencer; + + std::string CreateMPU() + { + auto id = sequencer.next_id(); + multiPartUploads.emplace(id, MPUPartsInProgress{}); + return id; + } + + std::string UploadPart(const std::string & upload_id, const std::string & part) + { + auto etag = sequencer.next_id(); + auto & parts = multiPartUploads.at(upload_id); + parts.emplace(etag, part); + return etag; + } + + void PutObject(const std::string & key, const std::string & data) + { + objects[key] = data; + } + + void CompleteMPU(const std::string & key, const std::string & upload_id, const std::vector & etags) + { + MPUParts completedParts; + completedParts.reserve(etags.size()); + + auto & parts = multiPartUploads.at(upload_id); + for (const auto & tag: etags) { + completedParts.push_back(parts.at(tag)); + } + + std::stringstream file_data; + for (const auto & part_data: completedParts) { + file_data << part_data; + } + + CompletedPartUploads.emplace_back(upload_id, std::move(completedParts)); 
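+        // Assemble the final object by concatenating the uploaded parts in the order given
+        // by the completion request, mirroring how S3 builds the object from a multipart upload.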
+ objects[key] = file_data.str(); + multiPartUploads.erase(upload_id); + } + + void AbortMPU(const std::string & upload_id) + { + multiPartUploads.erase(upload_id); + } + + + const std::vector> & GetCompletedPartUploads() const + { + return CompletedPartUploads; + } + + static std::vector GetPartSizes(const MPUParts & parts) + { + std::vector result; + result.reserve(parts.size()); + for (auto & part_data : parts) + result.push_back(part_data.size()); + + return result; + } + +}; + +class S3MemStrore +{ +public: + void CreateBucket(const std::string & bucket) + { + assert(buckets.count(bucket) == 0); + buckets.emplace(bucket, BucketMemStore{}); + } + + BucketMemStore& GetBucketStore(const std::string & bucket) { + return buckets.at(bucket); + } + +private: + std::map buckets; +}; + +struct EventCounts +{ + size_t headObject = 0; + size_t getObject = 0; + size_t putObject = 0; + size_t multiUploadCreate = 0; + size_t multiUploadComplete = 0; + size_t multiUploadAbort = 0; + size_t uploadParts = 0; + size_t writtenSize = 0; + + size_t totalRequestsCount() const + { + return headObject + getObject + putObject + multiUploadCreate + multiUploadComplete + uploadParts; + } +}; + +struct Client; + +struct InjectionModel +{ + virtual ~InjectionModel() = default; + +#define DeclareInjectCall(ObjectTypePart) \ + virtual std::optional call(const Aws::S3::Model::ObjectTypePart##Request & /*request*/) \ + { \ + return std::nullopt; \ + } + DeclareInjectCall(PutObject) + DeclareInjectCall(HeadObject) + DeclareInjectCall(CreateMultipartUpload) + DeclareInjectCall(CompleteMultipartUpload) + DeclareInjectCall(AbortMultipartUpload) + DeclareInjectCall(UploadPart) +#undef DeclareInjectCall +}; + +struct Client : DB::S3::Client +{ + Client(std::shared_ptr mock_s3_store) + : DB::S3::Client( + 100, + DB::S3::ServerSideEncryptionKMSConfig(), + std::make_shared("", ""), + GetClientConfiguration(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + /* useVirtualAddressing = */ true) + , store(mock_s3_store) + { } + + static std::shared_ptr CreateClient(String bucket = "mock-s3-bucket") + { + auto s3store = std::make_shared(); + s3store->CreateBucket(bucket); + return std::make_shared(s3store); + } + + static DB::S3::PocoHTTPClientConfiguration GetClientConfiguration() + { + DB::RemoteHostFilter remote_host_filter; + return DB::S3::ClientFactory::instance().createClientConfiguration( + "some-region", + remote_host_filter, + /* s3_max_redirects = */ 100, + /* enable_s3_requests_logging = */ true, + /* for_disk_s3 = */ false, + /* get_request_throttler = */ {}, + /* put_request_throttler = */ {} + ); + } + + void setInjectionModel(std::shared_ptr injections_) + { + injections = injections_; + } + + Aws::S3::Model::PutObjectOutcome PutObject(const Aws::S3::Model::PutObjectRequest & request) const override + { + ++counters.putObject; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return *opt_val; + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + std::stringstream data; + data << request.GetBody()->rdbuf(); + bStore.PutObject(request.GetKey(), data.str()); + counters.writtenSize += data.str().length(); + + Aws::S3::Model::PutObjectOutcome outcome; + Aws::S3::Model::PutObjectResult result(outcome.GetResultWithOwnership()); + return result; + } + + Aws::S3::Model::GetObjectOutcome GetObject(const Aws::S3::Model::GetObjectRequest & request) const override + { + ++counters.getObject; + + auto & bStore = store->GetBucketStore(request.GetBucket()); + + auto 
factory = request.GetResponseStreamFactory(); + Aws::Utils::Stream::ResponseStream responseStream(factory); + responseStream.GetUnderlyingStream() << std::stringstream(bStore.objects[request.GetKey()]).rdbuf(); + + Aws::AmazonWebServiceResult awsStream(std::move(responseStream), Aws::Http::HeaderValueCollection()); + Aws::S3::Model::GetObjectResult getObjectResult(std::move(awsStream)); + return Aws::S3::Model::GetObjectOutcome(std::move(getObjectResult)); + } + + Aws::S3::Model::HeadObjectOutcome HeadObject(const Aws::S3::Model::HeadObjectRequest & request) const override + { + ++counters.headObject; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + auto obj = bStore.objects[request.GetKey()]; + Aws::S3::Model::HeadObjectOutcome outcome; + Aws::S3::Model::HeadObjectResult result(outcome.GetResultWithOwnership()); + result.SetContentLength(obj.length()); + return result; + } + + Aws::S3::Model::CreateMultipartUploadOutcome CreateMultipartUpload(const Aws::S3::Model::CreateMultipartUploadRequest & request) const override + { + ++counters.multiUploadCreate; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + auto mpu_id = bStore.CreateMPU(); + + Aws::S3::Model::CreateMultipartUploadResult result; + result.SetUploadId(mpu_id.c_str()); + return Aws::S3::Model::CreateMultipartUploadOutcome(result); + } + + Aws::S3::Model::UploadPartOutcome UploadPart(const Aws::S3::Model::UploadPartRequest & request) const override + { + ++counters.uploadParts; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + std::stringstream data; + data << request.GetBody()->rdbuf(); + counters.writtenSize += data.str().length(); + + auto & bStore = store->GetBucketStore(request.GetBucket()); + auto etag = bStore.UploadPart(request.GetUploadId(), data.str()); + + Aws::S3::Model::UploadPartResult result; + result.SetETag(etag); + return Aws::S3::Model::UploadPartOutcome(result); + } + + Aws::S3::Model::CompleteMultipartUploadOutcome CompleteMultipartUpload(const Aws::S3::Model::CompleteMultipartUploadRequest & request) const override + { + ++counters.multiUploadComplete; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + + std::vector etags; + for (const auto & x: request.GetMultipartUpload().GetParts()) { + etags.push_back(x.GetETag()); + } + bStore.CompleteMPU(request.GetKey(), request.GetUploadId(), etags); + + Aws::S3::Model::CompleteMultipartUploadResult result; + return Aws::S3::Model::CompleteMultipartUploadOutcome(result); + } + + Aws::S3::Model::AbortMultipartUploadOutcome AbortMultipartUpload(const Aws::S3::Model::AbortMultipartUploadRequest & request) const override + { + ++counters.multiUploadAbort; + + if (injections) + { + if (auto opt_val = injections->call(request)) + { + return std::move(*opt_val); + } + } + + auto & bStore = store->GetBucketStore(request.GetBucket()); + bStore.AbortMPU(request.GetUploadId()); + + Aws::S3::Model::AbortMultipartUploadResult result; + return Aws::S3::Model::AbortMultipartUploadOutcome(result); + } + + std::shared_ptr store; + mutable EventCounts counters; + mutable std::shared_ptr injections; + void resetCounters() const { 
counters = {}; } +}; + +struct PutObjectFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::PutObjectRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "PutObjectFailIngection", false); + } +}; + +struct HeadObjectFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::HeadObjectRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "HeadObjectFailIngection", false); + } +}; + +struct CreateMPUFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::CreateMultipartUploadRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "CreateMPUFailIngection", false); + } +}; + +struct CompleteMPUFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::CompleteMultipartUploadRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "CompleteMPUFailIngection", false); + } +}; + +struct UploadPartFailIngection: InjectionModel +{ + std::optional call(const Aws::S3::Model::UploadPartRequest & /*request*/) override + { + return Aws::Client::AWSError(Aws::Client::CoreErrors::VALIDATION, "FailInjection", "UploadPartFailIngection", false); + } +}; + +struct BaseSyncPolicy +{ + virtual ~BaseSyncPolicy() = default; + virtual DB::ThreadPoolCallbackRunner getScheduler() { return {}; } + virtual void execute(size_t = 0) {} + virtual void setAutoExecute(bool = true) {} + + virtual size_t size() const { return 0; } + virtual bool empty() const { return size() == 0; } +}; + +struct SimpleAsyncTasks : BaseSyncPolicy +{ + bool auto_execute = false; + std::deque> queue; + + virtual DB::ThreadPoolCallbackRunner getScheduler() override + { + return [this] (std::function && operation, size_t /*priority*/) + { + if (auto_execute) + { + auto task = std::packaged_task(std::move(operation)); + task(); + return task.get_future(); + } + + queue.emplace_back(std::move(operation)); + return queue.back().get_future(); + }; + } + + virtual void execute(size_t limit = 0) override + { + if (limit == 0) + limit = queue.size(); + + while (!queue.empty() && limit) + { + auto & request = queue.front(); + request(); + + queue.pop_front(); + --limit; + } + } + + virtual void setAutoExecute(bool value = true) override + { + auto_execute = value; + if (auto_execute) + execute(); + } + + virtual size_t size() const override { return queue.size(); } +}; + +} + +using namespace DB; + +void writeAsOneBlock(WriteBuffer& buf, size_t size) +{ + std::vector data(size, 'a'); + buf.write(data.data(), data.size()); +} + +void writeAsPieces(WriteBuffer& buf, size_t size) +{ + size_t ceil = 15ull*1024*1024*1024; + size_t piece = 1; + size_t written = 0; + while (written < size) { + size_t len = std::min({piece, size-written, ceil}); + writeAsOneBlock(buf, len); + written += len; + piece *= 2; + } +} + +class WBS3Test : public ::testing::Test +{ +public: + const String bucket = "WBS3Test-bucket"; + + Settings & getSettings() + { + return settings; + } + + MockS3::BaseSyncPolicy & getAsyncPolicy() + { + return *async_policy; + } + + std::unique_ptr getWriteBuffer(String file_name = "file") + { + S3Settings::RequestSettings request_settings; + request_settings.updateFromSettings(settings); + + client->resetCounters(); + + getAsyncPolicy().setAutoExecute(false); + + return std::make_unique( + client, + bucket, + 
file_name, + request_settings, + std::nullopt, + getAsyncPolicy().getScheduler()); + } + + void setInjectionModel(std::shared_ptr injections_) + { + client->setInjectionModel(injections_); + } + + void runSimpleScenario(MockS3::EventCounts expected_counters, size_t size) + { + auto scenario = [&] (std::function writeMethod) { + auto buffer = getWriteBuffer("file"); + writeMethod(*buffer, size); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + expected_counters.writtenSize = size; + assertCountersEQ(expected_counters); + + auto & bStore = client->store->GetBucketStore(bucket); + auto & data = bStore.objects["file"]; + ASSERT_EQ(size, data.size()); + for (char c : data) + ASSERT_EQ('a', c); + }; + + scenario(writeAsOneBlock); + scenario(writeAsPieces); + } + + void assertCountersEQ(const MockS3::EventCounts & canonical) { + const auto & actual = client->counters; + ASSERT_EQ(canonical.headObject, actual.headObject); + ASSERT_EQ(canonical.getObject, actual.getObject); + ASSERT_EQ(canonical.putObject, actual.putObject); + ASSERT_EQ(canonical.multiUploadCreate, actual.multiUploadCreate); + ASSERT_EQ(canonical.multiUploadComplete, actual.multiUploadComplete); + ASSERT_EQ(canonical.multiUploadAbort, actual.multiUploadAbort); + ASSERT_EQ(canonical.uploadParts, actual.uploadParts); + ASSERT_EQ(canonical.writtenSize, actual.writtenSize); + } + + auto getCompletedPartUploads () + { + return client->store->GetBucketStore(bucket).GetCompletedPartUploads(); + } + +protected: + Settings settings; + + std::shared_ptr client; + std::unique_ptr async_policy; + + virtual void SetUp() override + { + client = MockS3::Client::CreateClient(bucket); + async_policy = std::make_unique(); + } + + virtual void TearDown() override + { + client.reset(); + async_policy.reset(); + } +}; + +class SyncAsync : public WBS3Test, public ::testing::WithParamInterface +{ +protected: + bool test_with_pool = false; + + virtual void SetUp() override + { + test_with_pool = GetParam(); + client = MockS3::Client::CreateClient(bucket); + if (test_with_pool) + async_policy = std::make_unique(); + else + async_policy = std::make_unique(); + } +}; + +INSTANTIATE_TEST_SUITE_P(WBS3 + , SyncAsync + , ::testing::Values(true, false) + , [] (const ::testing::TestParamInfo& info_param) { + std::string name = info_param.param ? 
"async" : "sync"; + return name; + }); + +TEST_P(SyncAsync, exception_on_head) { + setInjectionModel(std::make_shared()); + + getSettings().s3_check_objects_after_upload = true; + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_head_1"); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("Immediately after upload:")); + throw; + } + }, DB::S3Exception); +} + +TEST_P(SyncAsync, exception_on_put) { + setInjectionModel(std::make_shared()); + + EXPECT_THROW({ + try + { + auto buffer = getWriteBuffer("exception_on_put_1"); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("PutObjectFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_put_2"); + buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("PutObjectFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_put_3"); + buffer->write('A'); + getAsyncPolicy().setAutoExecute(); + buffer->preFinalize(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("PutObjectFailIngection")); + throw; + } + }, DB::S3Exception); + +} + +TEST_P(SyncAsync, exception_on_create_mpu) { + setInjectionModel(std::make_shared()); + + getSettings().s3_max_single_part_upload_size = 0; // no single part + getSettings().s3_min_upload_part_size = 1; // small parts ara ok + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_create_mpu_1"); + buffer->write('A'); + buffer->next(); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CreateMPUFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_create_mpu_2"); + buffer->write('A'); + buffer->preFinalize(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CreateMPUFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_create_mpu_2"); + buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch( const DB::Exception& e ) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CreateMPUFailIngection")); + throw; + } + }, DB::S3Exception); +} + + +TEST_P(SyncAsync, exception_on_complete_mpu) { + setInjectionModel(std::make_shared()); + + getSettings().s3_max_single_part_upload_size = 0; // no single part + getSettings().s3_min_upload_part_size = 1; // small parts ara ok + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_complete_mpu_1"); + 
buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("CompleteMPUFailIngection")); + throw; + } + }, DB::S3Exception); +} + +TEST_P(SyncAsync, exception_on_upload_part) { + setInjectionModel(std::make_shared()); + + getSettings().s3_max_single_part_upload_size = 0; // no single part + getSettings().s3_min_upload_part_size = 1; // small parts ara ok + + MockS3::EventCounts counters = {.multiUploadCreate = 1, .multiUploadAbort = 1}; + + counters.uploadParts = 2; + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_1"); + + buffer->write('A'); + buffer->next(); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_2"); + getAsyncPolicy().setAutoExecute(); + + buffer->write('A'); + buffer->next(); + + buffer->write('A'); + buffer->next(); + + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); + + counters.uploadParts = 1; + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_3"); + buffer->write('A'); + + buffer->preFinalize(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); + + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("exception_on_upload_part_4"); + buffer->write('A'); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + assertCountersEQ(counters); + ASSERT_EQ(ErrorCodes::S3_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("UploadPartFailIngection")); + throw; + } + }, DB::S3Exception); +} + + +TEST_F(WBS3Test, prefinalize_called_multiple_times) { +#ifdef ABORT_ON_LOGICAL_ERROR + GTEST_SKIP() << "this test trigger LOGICAL_ERROR, runs only if ABORT_ON_LOGICAL_ERROR is not defined"; +#else + EXPECT_THROW({ + try { + auto buffer = getWriteBuffer("prefinalize_called_multiple_times"); + buffer->write('A'); + buffer->next(); + buffer->preFinalize(); + buffer->write('A'); + buffer->next(); + buffer->preFinalize(); + buffer->finalize(); + } + catch(const DB::Exception & e) + { + ASSERT_EQ(ErrorCodes::LOGICAL_ERROR, e.code()); + EXPECT_THAT(e.what(), testing::HasSubstr("write to prefinalized buffer for S3")); + throw; + } + }, DB::Exception); +#endif +} + +TEST_P(SyncAsync, empty_file) { + getSettings().s3_check_objects_after_upload = true; + + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + runSimpleScenario(counters, 0); +} + +TEST_P(SyncAsync, manual_next_calls) { + getSettings().s3_check_objects_after_upload = true; + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + auto buffer = getWriteBuffer("manual_next_calls_1"); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + 
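+        // next() on an empty buffer uploads nothing by itself; finalize() still performs a single
+        // PutObject (plus the two HeadObject checks from s3_check_objects_after_upload).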
buffer->finalize(); + + assertCountersEQ(counters); + } + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + auto buffer = getWriteBuffer("manual_next_calls_2"); + buffer->next(); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + assertCountersEQ(counters); + } + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1, .writtenSize = 1}; + + auto buffer = getWriteBuffer("manual_next_calls_3"); + buffer->next(); + buffer->write('A'); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + assertCountersEQ(counters); + } + + { + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1, .writtenSize = 2}; + + auto buffer = getWriteBuffer("manual_next_calls_4"); + buffer->write('A'); + buffer->next(); + buffer->write('A'); + buffer->next(); + buffer->next(); + + getAsyncPolicy().setAutoExecute(); + buffer->finalize(); + + assertCountersEQ(counters); + } +} + +TEST_P(SyncAsync, small_file_is_one_put_request) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 1000; + getSettings().s3_min_upload_part_size = 10; + + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + runSimpleScenario(counters, 1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size-1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size/2); + } + + { + + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 1000; + + MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; + + runSimpleScenario(counters, 1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size-1); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size); + runSimpleScenario(counters, getSettings().s3_max_single_part_upload_size/2); + } +} + +TEST_P(SyncAsync, little_bigger_file_is_multi_part_upload) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 1000; + getSettings().s3_min_upload_part_size = 10; + + MockS3::EventCounts counters = {.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 2}; + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 1); + + counters.uploadParts = 101; + runSimpleScenario(counters, 2*settings.s3_max_single_part_upload_size); + } + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 1000; + + MockS3::EventCounts counters = {.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 1}; + + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 1); + runSimpleScenario(counters, 2*settings.s3_max_single_part_upload_size); + runSimpleScenario(counters, settings.s3_min_upload_part_size-1); + runSimpleScenario(counters, settings.s3_min_upload_part_size); + } +} + +TEST_P(SyncAsync, bigger_file_is_multi_part_upload) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 1000; + getSettings().s3_min_upload_part_size = 10; + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 2}; + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size); + + counters.uploadParts = 3; + runSimpleScenario(counters, 
settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size + 1); + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 2*settings.s3_min_upload_part_size - 1); + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + 2*settings.s3_min_upload_part_size); + } + + + { + // but not in that case, when s3_min_upload_part_size > s3_max_single_part_upload_size + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 1000; + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 2}; + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size); + runSimpleScenario(counters, settings.s3_max_single_part_upload_size + settings.s3_min_upload_part_size + 1); + runSimpleScenario(counters, 2*settings.s3_min_upload_part_size-1); + runSimpleScenario(counters, 2*settings.s3_min_upload_part_size); + + counters.uploadParts = 3; + runSimpleScenario(counters, 2*settings.s3_min_upload_part_size+1); + } +} + +TEST_P(SyncAsync, increase_upload_buffer) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 10; + getSettings().s3_upload_part_size_multiply_parts_count_threshold = 1; + // parts: 10 20 40 80 160 + // size: 10 30 70 150 310 + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 6}; + runSimpleScenario(counters, 350); + + auto actual_parts_sizes = MockS3::BucketMemStore::GetPartSizes(getCompletedPartUploads().back().second); + ASSERT_THAT(actual_parts_sizes, testing::ElementsAre(10, 20, 40, 80, 160, 40)); + } + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 10; + getSettings().s3_upload_part_size_multiply_parts_count_threshold = 2; + getSettings().s3_upload_part_size_multiply_factor = 3; + // parts: 10 10 30 30 90 + // size: 10 20 50 80 170 + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 6}; + runSimpleScenario(counters, 190); + + auto actual_parts_sizes = MockS3::BucketMemStore::GetPartSizes(getCompletedPartUploads().back().second); + ASSERT_THAT(actual_parts_sizes, testing::ElementsAre(10, 10, 30, 30, 90, 20)); + } +} + +TEST_P(SyncAsync, increase_limited) { + getSettings().s3_check_objects_after_upload = true; + + { + getSettings().s3_max_single_part_upload_size = 10; + getSettings().s3_min_upload_part_size = 10; + getSettings().s3_upload_part_size_multiply_parts_count_threshold = 1; + getSettings().s3_max_upload_part_size = 45; + // parts: 10 20 40 45 45 45 + // size: 10 30 70 115 160 205 + + auto counters = MockS3::EventCounts{.headObject = 2, .multiUploadCreate = 1, .multiUploadComplete = 1, .uploadParts = 7}; + runSimpleScenario(counters, 220); + + auto actual_parts_sizes = MockS3::BucketMemStore::GetPartSizes(getCompletedPartUploads().back().second); + ASSERT_THAT(actual_parts_sizes, testing::ElementsAre(10, 20, 40, 45, 45, 45, 15)); + } +} + +#endif diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9adfcf7fef7..f61c1dad59f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -290,6 +290,7 @@ void MergeTreeData::initializeDirectoriesAndFormatVersion(const std::string & re { auto buf = 
disk->writeFile(format_version_path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, getContext()->getWriteSettings()); writeIntText(format_version.toUnderType(), *buf); + buf->finalize(); if (getContext()->getSettingsRef().fsync_metadata) buf->sync(); } diff --git a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp index b843ce6a078..6c6a6ded5dd 100644 --- a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp +++ b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp @@ -160,7 +160,10 @@ void MergeTreeDeduplicationLog::rotate() existing_logs.emplace(current_log_number, log_description); if (current_writer) + { + current_writer->finalize(); current_writer->sync(); + } current_writer = disk->writeFile(log_description.path, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append); } diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index 2e30a3f3986..feffffb57ea 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -75,6 +75,7 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskP TransactionID::write(tid, *out); *out << "\n"; } + out->finalize(); out->sync(); } catch (...) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 00e72482a17..a4d9dc9f2e3 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -777,7 +777,6 @@ public: key, configuration_.request_settings, std::nullopt, - DBMS_DEFAULT_BUFFER_SIZE, threadPoolCallbackRunner(IOThreadPool::get(), "S3ParallelWrite"), context->getWriteSettings()), compression_method, diff --git a/tests/queries/0_stateless/02240_filesystem_query_cache.reference b/tests/queries/0_stateless/02240_filesystem_query_cache.reference index f4b9f7bb127..16c4cd1c049 100644 --- a/tests/queries/0_stateless/02240_filesystem_query_cache.reference +++ b/tests/queries/0_stateless/02240_filesystem_query_cache.reference @@ -6,6 +6,7 @@ SET skip_download_if_exceeds_query_cache=1; SET filesystem_cache_max_download_size=128; DROP TABLE IF EXISTS test; CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_4', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; +SYSTEM DROP FILESYSTEM CACHE; INSERT INTO test SELECT number, toString(number) FROM numbers(100); SELECT * FROM test FORMAT Null; SELECT file_segment_range_begin, file_segment_range_end, size FROM system.filesystem_cache ORDER BY file_segment_range_end, size; diff --git a/tests/queries/0_stateless/02240_filesystem_query_cache.sql b/tests/queries/0_stateless/02240_filesystem_query_cache.sql index 94eb4bc5ccd..44856a2188c 100644 --- a/tests/queries/0_stateless/02240_filesystem_query_cache.sql +++ b/tests/queries/0_stateless/02240_filesystem_query_cache.sql @@ -9,8 +9,8 @@ SET filesystem_cache_max_download_size=128; DROP TABLE IF EXISTS test; CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_4', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; +SYSTEM DROP FILESYSTEM CACHE; INSERT INTO test SELECT number, toString(number) FROM numbers(100); - SELECT * FROM test FORMAT Null; SELECT file_segment_range_begin, file_segment_range_end, size FROM system.filesystem_cache ORDER BY file_segment_range_end, size; SYSTEM DROP FILESYSTEM CACHE; diff --git 
a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference index f3fac9b32d3..b3b7d12d219 100644 --- a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference +++ b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.reference @@ -1,6 +1,6 @@ Using storage policy: s3_cache 0 -0 +0 0 Row 1: ────── file_segment_range_begin: 0 @@ -8,11 +8,11 @@ file_segment_range_end: 745 size: 746 state: DOWNLOADED 8 -8 +8 1100 0 2 2 -8 +8 1100 Row 1: ────── file_segment_range_begin: 0 @@ -20,17 +20,17 @@ file_segment_range_end: 1659 size: 1660 state: DOWNLOADED 8 -8 -8 -8 -24 -35 -43 +8 2014 +8 2014 +8 2014 +24 84045 +35 168815 +44 252113 5010500 18816 Using storage policy: local_cache 0 -0 +0 0 Row 1: ────── file_segment_range_begin: 0 @@ -38,11 +38,11 @@ file_segment_range_end: 745 size: 746 state: DOWNLOADED 8 -8 +8 1100 0 2 2 -8 +8 1100 Row 1: ────── file_segment_range_begin: 0 @@ -50,11 +50,11 @@ file_segment_range_end: 1659 size: 1660 state: DOWNLOADED 8 -8 -8 -8 -24 -35 -43 +8 2014 +8 2014 +8 2014 +24 84045 +35 168815 +44 252113 5010500 18816 diff --git a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh index 048fb792e6e..e65bf9cb35f 100755 --- a/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh +++ b/tests/queries/0_stateless/02241_filesystem_cache_on_write_operations.sh @@ -33,7 +33,7 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do FORMAT Vertical" $CLICKHOUSE_CLIENT --query "SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(100)" @@ -54,7 +54,7 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do FORMAT Vertical" $CLICKHOUSE_CLIENT --query "SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache WHERE cache_hits > 0" @@ -64,7 +64,7 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do $CLICKHOUSE_CLIENT --query "SELECT * FROM test_02241 FORMAT Null" $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache WHERE cache_hits > 0" - $CLICKHOUSE_CLIENT --query "SELECT count() size FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) size FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --query "SYSTEM DROP FILESYSTEM CACHE" @@ -87,24 +87,23 @@ for STORAGE_POLICY in 's3_cache' 'local_cache'; do FORMAT Vertical;" $CLICKHOUSE_CLIENT --query "SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON 
data_paths.cache_path = caches.cache_path" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(100) SETTINGS enable_filesystem_cache_on_write_operations=0" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(100)" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(300, 10000)" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --query "SYSTEM START MERGES test_02241" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "OPTIMIZE TABLE test_02241 FINAL" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --mutations_sync=2 --query "ALTER TABLE test_02241 UPDATE value = 'kek' WHERE key = 100" - $CLICKHOUSE_CLIENT --query "SELECT count() FROM system.filesystem_cache" - + $CLICKHOUSE_CLIENT --query "SELECT count(), sum(size) FROM system.filesystem_cache" $CLICKHOUSE_CLIENT --enable_filesystem_cache_on_write_operations=1 --query "INSERT INTO test_02241 SELECT number, toString(number) FROM numbers(5000000)" $CLICKHOUSE_CLIENT --query "SYSTEM FLUSH LOGS" diff --git a/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference b/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference index 083f0f69dc8..e77afc98007 100644 --- a/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference +++ b/tests/queries/0_stateless/02382_filesystem_cache_persistent_files.reference @@ -8,7 +8,7 @@ SYSTEM STOP MERGES nopers; INSERT INTO nopers SELECT number, toString(number) FROM numbers(10); SELECT * FROM nopers FORMAT Null; SELECT sum(size) FROM system.filesystem_cache; -194 +195 SELECT extract(local_path, '.*/([\w.]+)') as file, extract(cache_path, '.*/([\w.]+)') as cache, size FROM ( @@ -21,17 +21,18 @@ ON data_paths.cache_path = caches.cache_path ORDER BY file, cache, size; data.bin 0 114 data.mrk3 0 80 +format_version.txt 0 1 DROP TABLE IF EXISTS test; CREATE TABLE test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_small', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; SYSTEM STOP MERGES test; INSERT INTO test SELECT number, toString(number) FROM numbers(100); SELECT * FROM test FORMAT Null; SELECT sum(size) FROM system.filesystem_cache; -1020 +1021 SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path; -4 +5 SELECT count() FROM 
system.filesystem_cache; -4 +5 SELECT extract(local_path, '.*/([\w.]+)') as file, extract(cache_path, '.*/([\w.]+)') as cache, size FROM ( @@ -46,17 +47,18 @@ data.bin 0 114 data.bin 0 746 data.mrk3 0 80 data.mrk3 0_persistent 80 +format_version.txt 0 1 DROP TABLE IF EXISTS test2; CREATE TABLE test2 (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_small', min_bytes_for_wide_part = 10485760, compress_marks=false, compress_primary_key=false; SYSTEM STOP MERGES test2; INSERT INTO test2 SELECT number, toString(number) FROM numbers(100000); SELECT * FROM test2 FORMAT Null; SELECT sum(size) FROM system.filesystem_cache; -794 +795 SELECT count() FROM (SELECT arrayJoin(cache_paths) AS cache_path, local_path, remote_path FROM system.remote_data_paths ) AS data_paths INNER JOIN system.filesystem_cache AS caches ON data_paths.cache_path = caches.cache_path; -4 +5 SELECT count() FROM system.filesystem_cache; -4 +5 SELECT extract(local_path, '.*/([\w.]+)') as file, extract(cache_path, '.*/([\w.]+)') as cache, size FROM ( @@ -71,6 +73,7 @@ data.bin 0 114 data.mrk3 0 80 data.mrk3 0_persistent 80 data.mrk3 0_persistent 520 +format_version.txt 0 1 DROP TABLE test; DROP TABLE test2; DROP TABLE nopers; diff --git a/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference b/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference index 00e93b1db3d..3f34d5e2c79 100644 --- a/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference +++ b/tests/queries/0_stateless/02675_profile_events_from_query_log_and_client.reference @@ -1,8 +1,8 @@ INSERT TO S3 [ 0 ] S3CompleteMultipartUpload: 1 [ 0 ] S3CreateMultipartUpload: 1 - [ 0 ] S3HeadObject: 1 - [ 0 ] S3ReadRequestsCount: 1 + [ 0 ] S3HeadObject: 2 + [ 0 ] S3ReadRequestsCount: 2 [ 0 ] S3UploadPart: 1 [ 0 ] S3WriteRequestsCount: 3 CHECK WITH query_log From cd449cca38ce8bb831a75822921dd9f35c649562 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 5 May 2023 13:18:42 +0200 Subject: [PATCH 017/127] WriteBufferFromS3BufferAllocationPolicy for FixedSize and Exp policy --- src/IO/WriteBufferFromS3.cpp | 80 +------------ src/IO/WriteBufferFromS3.h | 5 +- ...riteBufferFromS3BufferAllocationPolicy.cpp | 108 ++++++++++++++++++ .../WriteBufferFromS3BufferAllocationPolicy.h | 26 +++++ src/IO/WriteBufferFromS3MemoryStream.h | 2 - ...02720_s3_strict_upload_part_size.reference | 8 +- .../02720_s3_strict_upload_part_size.sh | 2 +- 7 files changed, 144 insertions(+), 87 deletions(-) create mode 100644 src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp create mode 100644 src/IO/WriteBufferFromS3BufferAllocationPolicy.h diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 5630ed2cb68..73d78cb13be 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -74,80 +74,6 @@ struct WriteBufferFromS3::PartData } }; -struct WriteBufferFromS3::BufferAllocationPolicy -{ - size_t first_size = 0; - size_t second_size = 0; - - size_t multiply_factor = 0; - size_t multiply_threshold = 0; - size_t max_size = 0; - - size_t current_size = 0; - size_t buffer_number = 0; - - explicit BufferAllocationPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) - : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) - , second_size(settings_.min_upload_part_size) - , multiply_factor(settings_.upload_part_size_multiply_factor) - , 
multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold) - , max_size(settings_.max_upload_part_size) - { - if (settings_.strict_upload_part_size > 0) - { - first_size = settings_.strict_upload_part_size; - second_size = settings_.strict_upload_part_size; - multiply_factor = 1; - multiply_threshold = 10000; - max_size = settings_.max_upload_part_size; - } - else - { - first_size = std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size); - second_size = settings_.min_upload_part_size; - multiply_factor = settings_.upload_part_size_multiply_factor; - multiply_threshold = settings_.upload_part_size_multiply_parts_count_threshold; - max_size = settings_.max_upload_part_size; - } - - chassert(first_size > 0); - chassert(second_size > 0); - chassert(multiply_factor >= 1); - chassert(multiply_threshold > 0); - chassert(max_size > 0); - } - - size_t getNumber() const - { - return buffer_number; - } - - size_t getSize() const - { - chassert(buffer_number > 0); - return current_size; - } - - void next() - { - ++buffer_number; - - if (1 == buffer_number) - { - current_size = first_size; - return; - } - - if (2 == buffer_number) - current_size = second_size; - - if (0 == ((buffer_number-1) % multiply_threshold)) - { - current_size *= multiply_factor; - current_size = std::min(current_size, max_size); - } - } -}; WriteBufferFromS3::WriteBufferFromS3( std::shared_ptr client_ptr_, @@ -164,7 +90,7 @@ WriteBufferFromS3::WriteBufferFromS3( , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , buffer_allocation_policy(std::make_unique(request_settings_.getUploadSettings())) + , buffer_allocation_policy(ChooseBufferPolicy(request_settings_.getUploadSettings())) , task_tracker(std::make_unique(std::move(schedule_))) { LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails()); @@ -488,7 +414,7 @@ void WriteBufferFromS3::abortMultipartUpload() S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, PartData & data) { ProfileEvents::increment(ProfileEvents::WriteBufferFromS3Bytes, data.data_size); - LOG_TRACE(log, "fillUploadRequest, size {}, key: {}", data.data_size, key); + LOG_TRACE(log, "getUploadRequest, size {}, key: {}", data.data_size, key); S3::UploadPartRequest req; @@ -515,7 +441,7 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) multipart_tags.push_back({}); size_t part_number = multipart_tags.size(); - LOG_TRACE(log, "WritePart. {}, part size: {}, part number: {}", getLogDetails(), data.data_size, part_number); + LOG_TRACE(log, "writePart {}, part size: {}, part number: {}", getLogDetails(), data.data_size, part_number); if (multipart_upload_id.empty()) throw Exception( diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 13ed151ad57..b0d8d329589 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -22,7 +23,6 @@ class Client; namespace DB { - /** * Buffer to write a data to a S3 object with specified bucket and key. * If data size written to the buffer is less than 'max_single_part_upload_size' write is performed using singlepart upload. 
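A side note on the exponential allocation policy this commit factors out: below is a rough standalone model of how the part sizes grow (illustrative only, not the ClickHouse classes; the setting values are the ones from the increase_upload_buffer test above). The first buffer is max(max_single_part_upload_size, min_upload_part_size), the second drops to min_upload_part_size, and from then on the size is multiplied by upload_part_size_multiply_factor once every upload_part_size_multiply_parts_count_threshold parts, capped at max_upload_part_size.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Standalone sketch of the exponential part-size growth; it mirrors the
    // policy introduced in this patch but is not the real implementation.
    std::vector<size_t> partSizes(size_t max_single_part, size_t min_part, size_t factor,
                                  size_t threshold, size_t max_part, size_t parts)
    {
        std::vector<size_t> sizes;
        size_t current = 0;
        for (size_t n = 1; n <= parts; ++n)
        {
            if (n == 1)
                current = std::max(max_single_part, min_part);
            else
            {
                if (n == 2)
                    current = min_part;
                if ((n - 1) % threshold == 0)
                    current = std::min(current * factor, max_part);
            }
            sizes.push_back(current);
        }
        return sizes;
    }

    int main()
    {
        // Values from the test: 10/10, factor 2, threshold 1 -> 10 20 40 80 160
        for (size_t s : partSizes(10, 10, 2, 1, 1000, 5))
            std::cout << s << ' ';
        std::cout << '\n';
    }
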
@@ -80,8 +80,7 @@ private: const std::optional> object_metadata; Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3"); - struct BufferAllocationPolicy; - std::unique_ptr buffer_allocation_policy; + IBufferAllocationPolicyPtr buffer_allocation_policy; /// Upload in S3 is made in parts. /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp new file mode 100644 index 00000000000..1e9b209087c --- /dev/null +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp @@ -0,0 +1,108 @@ +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace +{ + +struct FixedSizeBufferAllocationPolicy : DB::IBufferAllocationPolicy +{ + const size_t size = 0; + size_t buffer_number = 0; + + explicit FixedSizeBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) + : size(settings_.strict_upload_part_size) + { + chassert(size > 0); + } + + size_t getNumber() const override { return buffer_number; } + + size_t getSize() const override + { + chassert(buffer_number > 0); + return size; + } + + void next() override + { + ++buffer_number; + } +}; + + +struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy +{ + const size_t first_size = 0; + const size_t second_size = 0; + + const size_t multiply_factor = 0; + const size_t multiply_threshold = 0; + const size_t max_size = 0; + + size_t current_size = 0; + size_t buffer_number = 0; + + explicit ExpBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) + : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) + , second_size(settings_.min_upload_part_size) + , multiply_factor(settings_.upload_part_size_multiply_factor) + , multiply_threshold(settings_.upload_part_size_multiply_parts_count_threshold) + , max_size(settings_.max_upload_part_size) + { + chassert(first_size > 0); + chassert(second_size > 0); + chassert(multiply_factor >= 1); + chassert(multiply_threshold > 0); + chassert(max_size > 0); + } + + size_t getNumber() const override { return buffer_number; } + + size_t getSize() const override + { + chassert(buffer_number > 0); + return current_size; + } + + void next() override + { + ++buffer_number; + + if (1 == buffer_number) + { + current_size = first_size; + return; + } + + if (2 == buffer_number) + current_size = second_size; + + if (0 == ((buffer_number - 1) % multiply_threshold)) + { + current_size *= multiply_factor; + current_size = std::min(current_size, max_size); + } + } +}; + +} + +namespace DB +{ + +IBufferAllocationPolicy::~IBufferAllocationPolicy() { } + +IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) +{ + if (settings_.strict_upload_part_size > 0) + return std::make_unique(settings_); + else + return std::make_unique(settings_); +} + +} + +#endif diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.h b/src/IO/WriteBufferFromS3BufferAllocationPolicy.h new file mode 100644 index 00000000000..1ee7c982ed2 --- /dev/null +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.h @@ -0,0 +1,26 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include + +namespace DB +{ + +struct IBufferAllocationPolicy +{ + virtual size_t getNumber() const = 0; + virtual size_t getSize() const = 0; + virtual void next() = 0; + virtual ~IBufferAllocationPolicy() = 0; 
+}; + +using IBufferAllocationPolicyPtr = std::unique_ptr; + +IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_); + +} + +#endif diff --git a/src/IO/WriteBufferFromS3MemoryStream.h b/src/IO/WriteBufferFromS3MemoryStream.h index 5a7cc17705d..e9606798910 100644 --- a/src/IO/WriteBufferFromS3MemoryStream.h +++ b/src/IO/WriteBufferFromS3MemoryStream.h @@ -4,8 +4,6 @@ #if USE_AWS_S3 -#include "WriteBufferFromS3.h" - #include namespace DB diff --git a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference index 360b484bf28..f7c4ece5f1f 100644 --- a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference +++ b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.reference @@ -1,4 +1,4 @@ -Size: 6000001 -Size: 6000001 -Size: 6000001 -Size: 2971517 +part size: 6000001, part number: 1 +part size: 6000001, part number: 2 +part size: 6000001, part number: 3 +part size: 2971517, part number: 4 diff --git a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh index 69e2f734914..9799ef0478a 100755 --- a/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh +++ b/tests/queries/0_stateless/02720_s3_strict_upload_part_size.sh @@ -19,7 +19,7 @@ $CLICKHOUSE_LOCAL -q "SELECT randomPrintableASCII(1023) FROM numbers(20*1024) FO $CLICKHOUSE_CLIENT --send_logs_level=trace --server_logs_file="$log" -q "INSERT INTO FUNCTION s3(s3_conn, filename='$CLICKHOUSE_TEST_UNIQUE_NAME', format='LineAsString', structure='line String') FORMAT LineAsString" --s3_strict_upload_part_size=6000001 < "$in" grep -F '' "$log" || : -grep -o 'WriteBufferFromS3: Writing part.*Size: .*' "$log" | grep -o 'Size: .*' +grep -o 'WriteBufferFromS3: writePart.*, part size: .*' "$log" | grep -o 'part size: .*' $CLICKHOUSE_CLIENT -q "SELECT * FROM s3(s3_conn, filename='$CLICKHOUSE_TEST_UNIQUE_NAME', format='LineAsString', structure='line String') FORMAT LineAsString" > "$out" diff -q "$in" "$out" From c8028bfd7fc83ec64aeb65f75fa7f85b05225fb7 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 5 May 2023 15:46:52 +0000 Subject: [PATCH 018/127] ajust 02240_system_filesystem_cache_table --- ...40_system_filesystem_cache_table.reference | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference b/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference index cf2bf5fb521..f960b4eb21c 100644 --- a/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference +++ b/tests/queries/0_stateless/02240_system_filesystem_cache_table.reference @@ -1,13 +1,15 @@ Using storage policy: s3_cache 0 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect cache DOWNLOADED 0 79 80 @@ -15,13 +17,15 @@ DOWNLOADED 0 745 746 2 Expect no cache Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect cache DOWNLOADED 0 79 80 @@ -31,13 +35,15 @@ Expect no cache Using storage policy: local_cache 0 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect 
cache DOWNLOADED 0 79 80 @@ -45,13 +51,15 @@ DOWNLOADED 0 745 746 2 Expect no cache Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect cache +DOWNLOADED 0 0 1 DOWNLOADED 0 79 80 DOWNLOADED 0 745 746 -2 +3 Expect no cache Expect cache DOWNLOADED 0 79 80 From 60bf45f863a1e4184cd2159d5435f32e228a0008 Mon Sep 17 00:00:00 2001 From: Sema Checherinda <104093494+CheSema@users.noreply.github.com> Date: Sat, 6 May 2023 12:11:16 +0200 Subject: [PATCH 019/127] Update WriteBufferFromS3BufferAllocationPolicy.cpp --- src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp index 1e9b209087c..0eec6b0d034 100644 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp @@ -93,7 +93,7 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy namespace DB { -IBufferAllocationPolicy::~IBufferAllocationPolicy() { } +IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) { From 8bc9a32d19b5ecd02bc08787800cd475564069fa Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 10 May 2023 18:45:59 +0000 Subject: [PATCH 020/127] fix special build --- src/IO/S3Common.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 370c5911482..71d52c727c7 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -20,8 +20,6 @@ #include #include -namespace Aws::S3 { class Client; } - namespace DB { From 26743b54394de20cf0ff6e3e56b08bdc642c1330 Mon Sep 17 00:00:00 2001 From: "Diego Nieto (lesandie)" Date: Thu, 11 May 2023 15:36:47 +0200 Subject: [PATCH 021/127] Fix Local Cache documentation explanations --- docs/en/operations/storing-data.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index ac6ea22ab75..495716858ec 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -172,17 +172,19 @@ Example of configuration for versions earlier than 22.8: ``` -Cache **configuration settings**: +File Cache **configuration settings**: + +These settings should be defined in the disk configuration section. - `path` - path to the directory with cache. Default: None, this setting is obligatory. - `max_size` - maximum size of the cache in bytes or in readable format, e.g. `ki, Mi, Gi, etc`, example `10Gi` (such format works starting from `22.10` version). When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory. -- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled). +- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. 
- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`. -- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. +- `enable_cache_hits_threshold` - number which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. - `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `false`. This setting was added in version 22.8. If you used filesystem cache before this version, then it will not work on versions starting from 22.8 if this setting is set to `true`. If you want to use this setting, clear old cache created before version 22.8 before upgrading. @@ -190,19 +192,21 @@ Cache **configuration settings**: - `max_elements` - a limit for a number of cache files. Default: `1048576`. -Cache **query settings**: +File Cache **query settings**: -- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. +Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1 cache per query is disabled. -- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. +- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. This setting should be defined in the disk configuration section. -- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. +- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. This is a profile level configuration setting. -- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. +- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. This setting should be defined in the disk configuration section. -- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. +- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. This is a profile level configuration setting. -- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. 
If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recentltly used` behaviour while keeping query cache limit. +- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. This setting should be defined in the disk configuration section. + +- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recently used` behaviour while keeping query cache limit. This is a profile level configuration setting. ** Warning ** Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported. From 10e0c1d832769f3244e4ad7563a02b6cd532883b Mon Sep 17 00:00:00 2001 From: "Diego Nieto (lesandie)" Date: Thu, 11 May 2023 16:06:14 +0200 Subject: [PATCH 022/127] Reworked documentation using local cache section --- docs/en/operations/storing-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 495716858ec..78c0fb8a049 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -194,7 +194,7 @@ These settings should be defined in the disk configuration section. File Cache **query settings**: -Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1 cache per query is disabled. +Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1, cache per query is disabled. - `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. This setting should be defined in the disk configuration section. 
From 0c0ea7bfc0ca60c3c04d516bb5ae1b7f14e67ca3 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 11 May 2023 23:07:48 +0000 Subject: [PATCH 023/127] Analyzer: apply _CAST to constants only once --- src/Analyzer/FunctionNode.cpp | 10 +++++++--- tests/broken_tests.txt | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Analyzer/FunctionNode.cpp b/src/Analyzer/FunctionNode.cpp index e635750569d..2385531db4f 100644 --- a/src/Analyzer/FunctionNode.cpp +++ b/src/Analyzer/FunctionNode.cpp @@ -209,15 +209,19 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const function_ast->kind = ASTFunction::Kind::WINDOW_FUNCTION; } + auto new_options = options; + if (function_name == "_CAST") + new_options.add_cast_for_constants = false; + const auto & parameters = getParameters(); if (!parameters.getNodes().empty()) { - function_ast->children.push_back(parameters.toAST(options)); + function_ast->children.push_back(parameters.toAST(new_options)); function_ast->parameters = function_ast->children.back(); } const auto & arguments = getArguments(); - function_ast->children.push_back(arguments.toAST(options)); + function_ast->children.push_back(arguments.toAST(new_options)); function_ast->arguments = function_ast->children.back(); auto window_node = getWindowNode(); @@ -226,7 +230,7 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const if (auto * identifier_node = window_node->as()) function_ast->window_name = identifier_node->getIdentifier().getFullName(); else - function_ast->window_definition = window_node->toAST(options); + function_ast->window_definition = window_node->toAST(new_options); } return function_ast; diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index f3fd0f395d4..02b539d583b 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -115,7 +115,6 @@ 02481_merge_array_join_sample_by 02493_inconsistent_hex_and_binary_number 02494_optimize_group_by_function_keys_and_alias_columns -02511_complex_literals_as_aggregate_function_parameters 02521_aggregation_by_partitions 02554_fix_grouping_sets_predicate_push_down 02575_merge_prewhere_different_default_kind From 2c40dd6a4c6aaa956762b8709fe5e4686e16876c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 2 May 2023 14:36:02 +0200 Subject: [PATCH 024/127] Switch Block::NameMap to google::dense_hash_map over HashMap Since HashMap creates 2^8 elements by default, while dense_hash_map should be good here. 
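A minimal usage sketch of what this switch means for callers (illustrative only; it assumes the sparsehash headers are installed as <sparsehash/dense_hash_map>): dense_hash_map requires a dedicated, never-inserted empty key before the first insertion — which is why Block::getNamesToIndexesMap() below gains a set_empty_key(StringRef{}) call — and lookups go through iterators (it->first / it->second) instead of HashMap's getKey() / getMapped().

    #include <iostream>
    #include <string>
    #include <sparsehash/dense_hash_map>

    int main()
    {
        google::dense_hash_map<std::string, size_t> name_to_index;
        // Mandatory before any insert; the key chosen here must never appear as a real key.
        name_to_index.set_empty_key(std::string{});

        name_to_index["key"] = 0;
        name_to_index["value"] = 1;

        // Lookups return iterators, so callers compare against end() and read
        // it->first / it->second, which is what the format parsers below switch to.
        const auto it = name_to_index.find("value");
        if (it != name_to_index.end())
            std::cout << it->first << " -> " << it->second << '\n';
    }
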
Signed-off-by: Azat Khuzhin --- src/Access/AuthenticationData.cpp | 1 + src/AggregateFunctions/AggregateFunctionForEach.h | 1 + src/AggregateFunctions/AggregateFunctionMinMaxAny.h | 1 + src/CMakeLists.txt | 3 ++- src/Core/Block.cpp | 6 ++---- src/Core/Block.h | 6 ++---- .../SerializationAggregateFunction.cpp | 1 + src/Formats/ColumnMapping.cpp | 6 +++--- src/Functions/FunctionBinaryArithmetic.h | 1 + src/Interpreters/QueryLog.h | 1 + src/Interpreters/QueryViewsLog.h | 1 + src/Interpreters/tests/gtest_context_race.cpp | 1 + .../Formats/Impl/BSONEachRowRowInputFormat.cpp | 12 +++++++----- .../Formats/Impl/BSONEachRowRowInputFormat.h | 2 +- .../Impl/JSONColumnsBlockInputFormatBase.cpp | 2 +- .../Formats/Impl/JSONEachRowRowInputFormat.cpp | 13 ++++++------- .../Formats/Impl/JSONEachRowRowInputFormat.h | 5 ++--- .../QueryPlan/DistributedCreateLocalPlan.h | 1 + src/Storages/DataLakes/DeltaLakeMetadataParser.cpp | 5 +++-- src/Storages/DataLakes/HudiMetadataParser.cpp | 1 + src/Storages/DataLakes/IcebergMetadataParser.cpp | 1 + src/Storages/StorageS3.h | 1 + 22 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/Access/AuthenticationData.cpp b/src/Access/AuthenticationData.cpp index 409338209cc..3bb0be160f4 100644 --- a/src/Access/AuthenticationData.cpp +++ b/src/Access/AuthenticationData.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionForEach.h b/src/AggregateFunctions/AggregateFunctionForEach.h index 81ba298bb8a..480b4cc690e 100644 --- a/src/AggregateFunctions/AggregateFunctionForEach.h +++ b/src/AggregateFunctions/AggregateFunctionForEach.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index c2cf2ac418f..94c0d60be81 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3f4fbb7420..c04b7acad3d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -211,6 +211,7 @@ endif() if (TARGET ch_contrib::jemalloc) target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::jemalloc) endif() +target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::sparsehash) add_subdirectory(Access/Common) add_subdirectory(Common/ZooKeeper) @@ -463,7 +464,7 @@ endif () if (TARGET ch_contrib::ldap) dbms_target_link_libraries (PRIVATE ch_contrib::ldap ch_contrib::lber) endif () -dbms_target_link_libraries (PRIVATE ch_contrib::sparsehash) +dbms_target_link_libraries (PUBLIC ch_contrib::sparsehash) if (TARGET ch_contrib::protobuf) dbms_target_link_libraries (PRIVATE ch_contrib::protobuf) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 456f1d5d95e..a3bd29faab1 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -663,12 +663,10 @@ Names Block::getDataTypeNames() const Block::NameMap Block::getNamesToIndexesMap() const { - NameMap res; - res.reserve(index_by_name.size()); - + NameMap res(index_by_name.size()); + res.set_empty_key(StringRef{}); for (const auto & [name, index] : index_by_name) res[name] = index; - return res; } diff --git a/src/Core/Block.h b/src/Core/Block.h index eb9d57ea6f8..7eed48d3d9f 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -5,13 +5,11 @@ #include #include -#include - #include #include -#include #include #include 
+#include namespace DB @@ -97,7 +95,7 @@ public: Names getDataTypeNames() const; /// Hash table match `column name -> position in the block`. - using NameMap = HashMap; + using NameMap = ::google::dense_hash_map; NameMap getNamesToIndexesMap() const; Serializations getSerializations() const; diff --git a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp index 7e192595114..c482c9623e9 100644 --- a/src/DataTypes/Serializations/SerializationAggregateFunction.cpp +++ b/src/DataTypes/Serializations/SerializationAggregateFunction.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include diff --git a/src/Formats/ColumnMapping.cpp b/src/Formats/ColumnMapping.cpp index 2fab5766591..e33dfc878f4 100644 --- a/src/Formats/ColumnMapping.cpp +++ b/src/Formats/ColumnMapping.cpp @@ -26,8 +26,8 @@ void ColumnMapping::addColumns( { names_of_columns.push_back(name); - const auto * column_it = column_indexes_by_names.find(name); - if (!column_it) + const auto column_it = column_indexes_by_names.find(name); + if (column_it == column_indexes_by_names.end()) { if (settings.skip_unknown_fields) { @@ -43,7 +43,7 @@ void ColumnMapping::addColumns( name, column_indexes_for_input_fields.size()); } - const auto column_index = column_it->getMapped(); + const auto column_index = column_it->second; if (read_columns[column_index]) throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing format header: {}", name); diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index a83bc6382b6..0174899d767 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include diff --git a/src/Interpreters/QueryLog.h b/src/Interpreters/QueryLog.h index a2d434e8843..44780f530e0 100644 --- a/src/Interpreters/QueryLog.h +++ b/src/Interpreters/QueryLog.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/src/Interpreters/QueryViewsLog.h b/src/Interpreters/QueryViewsLog.h index 986311fc822..e28bce0b91c 100644 --- a/src/Interpreters/QueryViewsLog.h +++ b/src/Interpreters/QueryViewsLog.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace ProfileEvents { diff --git a/src/Interpreters/tests/gtest_context_race.cpp b/src/Interpreters/tests/gtest_context_race.cpp index 60531494592..ec61fc9467c 100644 --- a/src/Interpreters/tests/gtest_context_race.cpp +++ b/src/Interpreters/tests/gtest_context_race.cpp @@ -1,6 +1,7 @@ #include #include #include +#include using namespace DB; diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp index 370cddd2dcb..57598fb507f 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp @@ -64,20 +64,22 @@ inline size_t BSONEachRowRowInputFormat::columnIndex(const StringRef & name, siz /// Optimization by caching the order of fields (which is almost always the same) /// and a quick check to match the next expected field, instead of searching the hash table. 
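The comment above describes the per-row hint these parsers keep; here is a rough standalone sketch of the idea (using std::unordered_map instead of Block::NameMap, purely for illustration): remember where each field position resolved in the previous row and try that first, falling back to a hash lookup only when the hint misses.

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    using NameMap = std::unordered_map<std::string, size_t>;
    constexpr size_t UNKNOWN_FIELD = static_cast<size_t>(-1);

    size_t columnIndex(const NameMap & name_map,
                       std::vector<const NameMap::value_type *> & prev_positions,
                       const std::string & name, size_t key_index)
    {
        // Rows usually repeat the same field order, so check last row's hit first.
        if (key_index < prev_positions.size()
            && prev_positions[key_index]
            && prev_positions[key_index]->first == name)
            return prev_positions[key_index]->second;

        const auto it = name_map.find(name);
        if (it == name_map.end())
            return UNKNOWN_FIELD;

        if (key_index < prev_positions.size())
            prev_positions[key_index] = &*it;   // remember the hit for the next row
        return it->second;
    }

    int main()
    {
        NameMap name_map{{"id", 0}, {"value", 1}};
        std::vector<const NameMap::value_type *> prev_positions(2, nullptr);

        std::cout << columnIndex(name_map, prev_positions, "id", 0) << '\n';  // 0, via hash lookup
        std::cout << columnIndex(name_map, prev_positions, "id", 0) << '\n';  // 0, via the hint
    }
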
- if (prev_positions.size() > key_index && prev_positions[key_index] && name == prev_positions[key_index]->getKey()) + if (prev_positions.size() > key_index + && prev_positions[key_index] != Block::NameMap::const_iterator{} + && name == prev_positions[key_index]->first) { - return prev_positions[key_index]->getMapped(); + return prev_positions[key_index]->second; } else { - auto * it = name_map.find(name); + const auto it = name_map.find(name); - if (it) + if (it != name_map.end()) { if (key_index < prev_positions.size()) prev_positions[key_index] = it; - return it->getMapped(); + return it->second; } else return UNKNOWN_FIELD; diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h index ad6d712b6dd..538a59e05c3 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.h @@ -91,7 +91,7 @@ private: Block::NameMap name_map; /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. - std::vector prev_positions; + std::vector prev_positions; DataTypes types; diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 3d052e23c21..2e264c59f56 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -128,7 +128,7 @@ Chunk JSONColumnsBlockInputFormatBase::generate() { /// Check if this name appears in header. If no, skip this column or throw /// an exception according to setting input_format_skip_unknown_fields - if (!name_to_index.has(*column_name)) + if (name_to_index.find(*column_name) == name_to_index.end()) { if (!format_settings.skip_unknown_fields) throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown column found in input data: {}", *column_name); diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 22ac31c7824..e5f52936021 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -71,21 +71,20 @@ inline size_t JSONEachRowRowInputFormat::columnIndex(StringRef name, size_t key_ /// and a quick check to match the next expected field, instead of searching the hash table. if (prev_positions.size() > key_index - && prev_positions[key_index] - && name == prev_positions[key_index]->getKey()) + && prev_positions[key_index] != Block::NameMap::const_iterator{} + && name == prev_positions[key_index]->first) { - return prev_positions[key_index]->getMapped(); + return prev_positions[key_index]->second; } else { - auto * it = name_map.find(name); - - if (it) + const auto it = name_map.find(name); + if (it != name_map.end()) { if (key_index < prev_positions.size()) prev_positions[key_index] = it; - return it->getMapped(); + return it->second; } else return UNKNOWN_FIELD; diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index beee9e95821..ce42071585e 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -71,11 +71,10 @@ private: /// for row like {..., "non-nullable column name" : null, ...} /// Hash table match `field name -> position in the block`. NOTE You can use perfect hash map. 
- using NameMap = HashMap; - NameMap name_map; + Block::NameMap name_map; /// Cached search results for previous row (keyed as index in JSON object) - used as a hint. - std::vector prev_positions; + std::vector prev_positions; bool allow_new_rows = true; diff --git a/src/Processors/QueryPlan/DistributedCreateLocalPlan.h b/src/Processors/QueryPlan/DistributedCreateLocalPlan.h index 16bf1c565ff..1afdc07fa4d 100644 --- a/src/Processors/QueryPlan/DistributedCreateLocalPlan.h +++ b/src/Processors/QueryPlan/DistributedCreateLocalPlan.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp index 45189ca325a..309aa54909a 100644 --- a/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp +++ b/src/Storages/DataLakes/DeltaLakeMetadataParser.cpp @@ -11,10 +11,11 @@ #include #include #include -#include -#include #include #include +#include +#include +#include #include namespace fs = std::filesystem; diff --git a/src/Storages/DataLakes/HudiMetadataParser.cpp b/src/Storages/DataLakes/HudiMetadataParser.cpp index 74e556f2bf2..a1f35a5ae42 100644 --- a/src/Storages/DataLakes/HudiMetadataParser.cpp +++ b/src/Storages/DataLakes/HudiMetadataParser.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "config.h" #include diff --git a/src/Storages/DataLakes/IcebergMetadataParser.cpp b/src/Storages/DataLakes/IcebergMetadataParser.cpp index 4e90da41721..3ab90e271cf 100644 --- a/src/Storages/DataLakes/IcebergMetadataParser.cpp +++ b/src/Storages/DataLakes/IcebergMetadataParser.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index c3b862f6bbd..12573ab513f 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include From eb62030fa425af8ef176e7f7f333b8e34fa4dc4f Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 10 May 2023 05:02:52 +0200 Subject: [PATCH 025/127] Fix assigning a setting to NULL in settings profile's definition. 
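A toy model may make the change below easier to see (this is not the real DB::Field, just a sketch): with a plain Field, a profile element that never got a value and one explicitly assigned NULL both read as a Null Field, so the two cases are indistinguishable; wrapping the value in std::optional keeps "no value specified" (nullopt) apart from "value present and equal to NULL".

    #include <iostream>
    #include <optional>
    #include <variant>

    // Toy stand-in for DB::Field: either Null (monostate) or a number.
    struct Field
    {
        std::variant<std::monostate, unsigned long long> data;
        bool isNull() const { return std::holds_alternative<std::monostate>(data); }
    };

    int main()
    {
        // Old representation: both cases collapse into a Null Field.
        Field never_assigned;   // element written without a value
        Field assigned_null;    // e.g. a setting explicitly set to NULL in the profile
        std::cout << never_assigned.isNull() << ' ' << assigned_null.isNull() << '\n';  // 1 1 -> ambiguous

        // New representation: std::optional<Field> keeps the cases apart.
        std::optional<Field> no_value;                 // nothing specified
        std::optional<Field> explicit_null = Field{};  // value present, and it is NULL
        std::cout << no_value.has_value() << ' ' << explicit_null.has_value() << '\n';  // 0 1
    }
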
--- src/Access/SettingsConstraints.cpp | 12 ++++---- src/Access/SettingsProfileElement.cpp | 30 +++++++++---------- src/Access/SettingsProfileElement.h | 6 ++-- .../Access/ASTSettingsProfileElement.cpp | 12 ++++---- .../Access/ASTSettingsProfileElement.h | 6 ++-- .../Access/ParserSettingsProfileElement.cpp | 24 +++++++-------- .../StorageSystemSettingsProfileElements.cpp | 12 ++++---- 7 files changed, 51 insertions(+), 51 deletions(-) diff --git a/src/Access/SettingsConstraints.cpp b/src/Access/SettingsConstraints.cpp index e83ab264f4f..12f584cab83 100644 --- a/src/Access/SettingsConstraints.cpp +++ b/src/Access/SettingsConstraints.cpp @@ -105,21 +105,21 @@ void SettingsConstraints::check(const Settings & current_settings, const Setting if (SettingsProfileElements::isAllowBackupSetting(element.setting_name)) continue; - if (!element.value.isNull()) + if (element.value) { - SettingChange value(element.setting_name, element.value); + SettingChange value(element.setting_name, *element.value); check(current_settings, value); } - if (!element.min_value.isNull()) + if (element.min_value) { - SettingChange value(element.setting_name, element.min_value); + SettingChange value(element.setting_name, *element.min_value); check(current_settings, value); } - if (!element.max_value.isNull()) + if (element.max_value) { - SettingChange value(element.setting_name, element.max_value); + SettingChange value(element.setting_name, *element.max_value); check(current_settings, value); } diff --git a/src/Access/SettingsProfileElement.cpp b/src/Access/SettingsProfileElement.cpp index ce56782d887..9358391cb93 100644 --- a/src/Access/SettingsProfileElement.cpp +++ b/src/Access/SettingsProfileElement.cpp @@ -63,18 +63,18 @@ void SettingsProfileElement::init(const ASTSettingsProfileElement & ast, const A max_value = ast.max_value; writability = ast.writability; - if (!value.isNull()) - value = Settings::castValueUtil(setting_name, value); - if (!min_value.isNull()) - min_value = Settings::castValueUtil(setting_name, min_value); - if (!max_value.isNull()) - max_value = Settings::castValueUtil(setting_name, max_value); + if (value) + value = Settings::castValueUtil(setting_name, *value); + if (min_value) + min_value = Settings::castValueUtil(setting_name, *min_value); + if (max_value) + max_value = Settings::castValueUtil(setting_name, *max_value); } } bool SettingsProfileElement::isConstraint() const { - return this->writability || !this->min_value.isNull() || !this->max_value.isNull(); + return this->writability || this->min_value || this->max_value; } std::shared_ptr SettingsProfileElement::toAST() const @@ -187,8 +187,8 @@ Settings SettingsProfileElements::toSettings() const Settings res; for (const auto & elem : *this) { - if (!elem.setting_name.empty() && !isAllowBackupSetting(elem.setting_name) && !elem.value.isNull()) - res.set(elem.setting_name, elem.value); + if (!elem.setting_name.empty() && !isAllowBackupSetting(elem.setting_name) && elem.value) + res.set(elem.setting_name, *elem.value); } return res; } @@ -200,8 +200,8 @@ SettingsChanges SettingsProfileElements::toSettingsChanges() const { if (!elem.setting_name.empty() && !isAllowBackupSetting(elem.setting_name)) { - if (!elem.value.isNull()) - res.push_back({elem.setting_name, elem.value}); + if (elem.value) + res.push_back({elem.setting_name, *elem.value}); } } return res; @@ -214,8 +214,8 @@ SettingsConstraints SettingsProfileElements::toSettingsConstraints(const AccessC if (!elem.setting_name.empty() && elem.isConstraint() && 
!isAllowBackupSetting(elem.setting_name)) res.set( elem.setting_name, - elem.min_value, - elem.max_value, + elem.min_value ? *elem.min_value : Field{}, + elem.max_value ? *elem.max_value : Field{}, elem.writability ? *elem.writability : SettingConstraintWritability::WRITABLE); return res; } @@ -240,8 +240,8 @@ bool SettingsProfileElements::isBackupAllowed() const { for (const auto & setting : *this) { - if (isAllowBackupSetting(setting.setting_name)) - return static_cast(SettingFieldBool{setting.value}); + if (isAllowBackupSetting(setting.setting_name) && setting.value) + return static_cast(SettingFieldBool{*setting.value}); } return true; } diff --git a/src/Access/SettingsProfileElement.h b/src/Access/SettingsProfileElement.h index 7f9379c1e47..7078f565295 100644 --- a/src/Access/SettingsProfileElement.h +++ b/src/Access/SettingsProfileElement.h @@ -23,9 +23,9 @@ struct SettingsProfileElement std::optional parent_profile; String setting_name; - Field value; - Field min_value; - Field max_value; + std::optional value; + std::optional min_value; + std::optional max_value; std::optional writability; auto toTuple() const { return std::tie(parent_profile, setting_name, value, min_value, max_value, writability); } diff --git a/src/Parsers/Access/ASTSettingsProfileElement.cpp b/src/Parsers/Access/ASTSettingsProfileElement.cpp index 76973c428b2..7b29b15cb29 100644 --- a/src/Parsers/Access/ASTSettingsProfileElement.cpp +++ b/src/Parsers/Access/ASTSettingsProfileElement.cpp @@ -35,21 +35,21 @@ void ASTSettingsProfileElement::formatImpl(const FormatSettings & settings, Form formatSettingName(setting_name, settings.ostr); - if (!value.isNull()) + if (value) { - settings.ostr << " = " << applyVisitor(FieldVisitorToString{}, value); + settings.ostr << " = " << applyVisitor(FieldVisitorToString{}, *value); } - if (!min_value.isNull()) + if (min_value) { settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " MIN " << (settings.hilite ? IAST::hilite_none : "") - << applyVisitor(FieldVisitorToString{}, min_value); + << applyVisitor(FieldVisitorToString{}, *min_value); } - if (!max_value.isNull()) + if (max_value) { settings.ostr << (settings.hilite ? IAST::hilite_keyword : "") << " MAX " << (settings.hilite ? IAST::hilite_none : "") - << applyVisitor(FieldVisitorToString{}, max_value); + << applyVisitor(FieldVisitorToString{}, *max_value); } if (writability) diff --git a/src/Parsers/Access/ASTSettingsProfileElement.h b/src/Parsers/Access/ASTSettingsProfileElement.h index 275257e4f8e..13c1926d9b0 100644 --- a/src/Parsers/Access/ASTSettingsProfileElement.h +++ b/src/Parsers/Access/ASTSettingsProfileElement.h @@ -14,9 +14,9 @@ class ASTSettingsProfileElement : public IAST public: String parent_profile; String setting_name; - Field value; - Field min_value; - Field max_value; + std::optional value; + std::optional min_value; + std::optional max_value; std::optional writability; bool id_mode = false; /// If true then `parent_profile` keeps UUID, not a name. bool use_inherit_keyword = false; /// If true then this element is a part of ASTCreateSettingsProfileQuery. 
diff --git a/src/Parsers/Access/ParserSettingsProfileElement.cpp b/src/Parsers/Access/ParserSettingsProfileElement.cpp index db23a806a12..36330b96622 100644 --- a/src/Parsers/Access/ParserSettingsProfileElement.cpp +++ b/src/Parsers/Access/ParserSettingsProfileElement.cpp @@ -52,7 +52,7 @@ namespace } - bool parseValue(IParserBase::Pos & pos, Expected & expected, Field & res) + bool parseValue(IParserBase::Pos & pos, Expected & expected, std::optional & res) { return IParserBase::wrapParseImpl(pos, [&] { @@ -69,7 +69,7 @@ namespace } - bool parseMinMaxValue(IParserBase::Pos & pos, Expected & expected, Field & min_value, Field & max_value) + bool parseMinMaxValue(IParserBase::Pos & pos, Expected & expected, std::optional & min_value, std::optional & max_value) { return IParserBase::wrapParseImpl(pos, [&] { @@ -124,9 +124,9 @@ namespace IParserBase::Pos & pos, Expected & expected, String & setting_name, - Field & value, - Field & min_value, - Field & max_value, + std::optional & value, + std::optional & min_value, + std::optional & max_value, std::optional & writability) { return IParserBase::wrapParseImpl(pos, [&] @@ -136,9 +136,9 @@ namespace return false; String res_setting_name = getIdentifierName(name_ast); - Field res_value; - Field res_min_value; - Field res_max_value; + std::optional res_value; + std::optional res_min_value; + std::optional res_max_value; std::optional res_writability; bool has_value_or_constraint = false; @@ -151,7 +151,7 @@ namespace if (!has_value_or_constraint) return false; - if (boost::iequals(res_setting_name, "PROFILE") && res_value.isNull() && res_min_value.isNull() && res_max_value.isNull() + if (boost::iequals(res_setting_name, "PROFILE") && !res_value && !res_min_value && !res_max_value && res_writability == SettingConstraintWritability::CONST) { /// Ambiguity: "profile readonly" can be treated either as a profile named "readonly" or @@ -181,9 +181,9 @@ namespace { String parent_profile; String setting_name; - Field value; - Field min_value; - Field max_value; + std::optional value; + std::optional min_value; + std::optional max_value; std::optional writability; bool ok = parseSettingNameWithValueOrConstraints(pos, expected, setting_name, value, min_value, max_value, writability); diff --git a/src/Storages/System/StorageSystemSettingsProfileElements.cpp b/src/Storages/System/StorageSystemSettingsProfileElements.cpp index 6785a4392e1..e01d3cb0ace 100644 --- a/src/Storages/System/StorageSystemSettingsProfileElements.cpp +++ b/src/Storages/System/StorageSystemSettingsProfileElements.cpp @@ -87,27 +87,27 @@ void StorageSystemSettingsProfileElements::fillData(MutableColumns & res_columns size_t current_index = index++; bool inserted_value = false; - if (!element.value.isNull() && !element.setting_name.empty()) + if (element.value && !element.setting_name.empty()) { - String str = Settings::valueToStringUtil(element.setting_name, element.value); + String str = Settings::valueToStringUtil(element.setting_name, *element.value); column_value.insertData(str.data(), str.length()); column_value_null_map.push_back(false); inserted_value = true; } bool inserted_min = false; - if (!element.min_value.isNull() && !element.setting_name.empty()) + if (element.min_value && !element.setting_name.empty()) { - String str = Settings::valueToStringUtil(element.setting_name, element.min_value); + String str = Settings::valueToStringUtil(element.setting_name, *element.min_value); column_min.insertData(str.data(), str.length()); column_min_null_map.push_back(false); inserted_min = 
true; } bool inserted_max = false; - if (!element.max_value.isNull() && !element.setting_name.empty()) + if (element.max_value && !element.setting_name.empty()) { - String str = Settings::valueToStringUtil(element.setting_name, element.max_value); + String str = Settings::valueToStringUtil(element.setting_name, *element.max_value); column_max.insertData(str.data(), str.length()); column_max_null_map.push_back(false); inserted_max = true; From 8fc0083a264cb4f1afb821bd9884ea1f6f414710 Mon Sep 17 00:00:00 2001 From: "Diego Nieto (lesandie)" Date: Fri, 12 May 2023 11:45:38 +0200 Subject: [PATCH 026/127] Rewrite following conversation/comments --- docs/en/operations/storing-data.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index d0102ce38d4..e8b043e7a27 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -172,7 +172,7 @@ Example of configuration for versions earlier than 22.8: ``` -File Cache **configuration settings**: +File Cache **disk configuration settings**: These settings should be defined in the disk configuration section. @@ -180,7 +180,7 @@ These settings should be defined in the disk configuration section. - `max_size` - maximum size of the cache in bytes or in readable format, e.g. `ki, Mi, Gi, etc`, example `10Gi` (such format works starting from `22.10` version). When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory. -- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. +- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled). - `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`. @@ -192,23 +192,23 @@ These settings should be defined in the disk configuration section. - `max_elements` - a limit for a number of cache files. Default: `1048576`. -File Cache **query settings**: +File Cache **query/profile settings**: -Some of these settings will disable cache per query features that are enabled by default. For example, setting `cache_on_write_operations` to 1 means that general file and per query cache are enabled but also setting `enable_filesystem_cache_on_write_operations` to 1 means that file cache is enabled but disabled per query cache. The same approach must be used for `enable_filesystem_cache`, if set to 1, cache per query is disabled. +Some of these settings will disable cache features per query/profile that are enabled by default. For example, you can enable cache in disk configuration and disable it per query/profile setting `enable_filesystem_cache` to `false`. Also setting `cache_on_write_operations` to `true` in disk configuration means that general file and per query cache are enabled. 
If you need to override this general setting for specific queries, set `enable_filesystem_cache_on_write_operations` to `false` to disable the write-through cache for that query/profile. -- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. This setting should be defined in the disk configuration section. +- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. -- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. This is a profile level configuration setting. +- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. -- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. This setting should be defined in the disk configuration section. +- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. Default: `false`. -- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. This is a profile level configuration setting. +- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. It can be turned on for specific queries or enabled in a profile. Default: `false`. -- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. This setting should be defined in the disk configuration section. +- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. -- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recently used` behaviour while keeping query cache limit. This is a profile level configuration setting. +- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written at the cost of evicting previously downloaded (within current query) data, i.e. the second behaviour allows to preserve `last recently used` behaviour while keeping query cache limit.
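For example, a minimal sketch of how the per-query cache settings above might be used; the table names `my_table` and `my_source_table` are only placeholders:

``` sql
-- read without using or populating the filesystem cache for this query only
SELECT count() FROM my_table SETTINGS enable_filesystem_cache = 0;

-- opt in to the write-through cache for a single INSERT
-- (takes effect only if cache_on_write_operations is enabled in the disk configuration)
INSERT INTO my_table SELECT * FROM my_source_table SETTINGS enable_filesystem_cache_on_write_operations = 1;
```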
-** Warning ** +**Warning** Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported. Cache **system tables**: From 22f7aa8d89107910f10c8ff5fb92296856c81283 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Fri, 12 May 2023 12:00:15 +0200 Subject: [PATCH 027/127] make special build pass --- src/IO/WriteBufferFromS3.h | 5 -- src/IO/tests/gtest_writebuffer_s3.cpp | 104 +++++++++++++------------- 2 files changed, 52 insertions(+), 57 deletions(-) diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index b0d8d329589..e65127872fa 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -16,11 +16,6 @@ #include #include -namespace Aws::S3 -{ -class Client; -} - namespace DB { /** diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index d7661d3e3d0..c0bd6742ea3 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -60,12 +60,12 @@ private: class BucketMemStore { public: - typedef std::string Key; - typedef std::string Data; - typedef std::string ETag; - typedef std::string MPU_ID; - typedef std::map MPUPartsInProgress; - typedef std::vector MPUParts; + using Key = std::string; + using Data = std::string; + using ETag = std::string; + using MPU_ID = std::string; + using MPUPartsInProgress = std::map; + using MPUParts = std::vector; std::map objects; @@ -129,7 +129,7 @@ public: { std::vector result; result.reserve(parts.size()); - for (auto & part_data : parts) + for (const auto & part_data : parts) result.push_back(part_data.size()); return result; @@ -142,7 +142,7 @@ class S3MemStrore public: void CreateBucket(const std::string & bucket) { - assert(buckets.count(bucket) == 0); + assert(!buckets.contains(bucket)); buckets.emplace(bucket, BucketMemStore{}); } @@ -193,14 +193,14 @@ struct InjectionModel struct Client : DB::S3::Client { - Client(std::shared_ptr mock_s3_store) + explicit Client(std::shared_ptr mock_s3_store) : DB::S3::Client( 100, DB::S3::ServerSideEncryptionKMSConfig(), std::make_shared("", ""), GetClientConfiguration(), Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, - /* useVirtualAddressing = */ true) + /* use_virtual_addressing = */ true) , store(mock_s3_store) { } @@ -425,8 +425,8 @@ struct BaseSyncPolicy { virtual ~BaseSyncPolicy() = default; virtual DB::ThreadPoolCallbackRunner getScheduler() { return {}; } - virtual void execute(size_t = 0) {} - virtual void setAutoExecute(bool = true) {} + virtual void execute(size_t) {} + virtual void setAutoExecute(bool) {} virtual size_t size() const { return 0; } virtual bool empty() const { return size() == 0; } @@ -437,7 +437,7 @@ struct SimpleAsyncTasks : BaseSyncPolicy bool auto_execute = false; std::deque> queue; - virtual DB::ThreadPoolCallbackRunner getScheduler() override + DB::ThreadPoolCallbackRunner getScheduler() override { return [this] (std::function && operation, size_t /*priority*/) { @@ -453,7 +453,7 @@ struct SimpleAsyncTasks : BaseSyncPolicy }; } - virtual void execute(size_t limit = 0) override + void execute(size_t limit) override { if (limit == 0) limit = queue.size(); @@ -468,14 +468,14 @@ struct SimpleAsyncTasks : BaseSyncPolicy } } - virtual void setAutoExecute(bool value = true) override + void setAutoExecute(bool value) override { auto_execute = value; if (auto_execute) - execute(); + execute(0); } - virtual size_t size() const override { return queue.size(); } + size_t size() const 
override { return queue.size(); } }; } @@ -545,7 +545,7 @@ public: auto buffer = getWriteBuffer("file"); writeMethod(*buffer, size); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); expected_counters.writtenSize = size; @@ -585,13 +585,13 @@ protected: std::shared_ptr client; std::unique_ptr async_policy; - virtual void SetUp() override + void SetUp() override { client = MockS3::Client::CreateClient(bucket); async_policy = std::make_unique(); } - virtual void TearDown() override + void TearDown() override { client.reset(); async_policy.reset(); @@ -603,13 +603,13 @@ class SyncAsync : public WBS3Test, public ::testing::WithParamInterface protected: bool test_with_pool = false; - virtual void SetUp() override + void SetUp() override { test_with_pool = GetParam(); client = MockS3::Client::CreateClient(bucket); if (test_with_pool) async_policy = std::make_unique(); - else + else async_policy = std::make_unique(); } }; @@ -622,7 +622,7 @@ INSTANTIATE_TEST_SUITE_P(WBS3 return name; }); -TEST_P(SyncAsync, exception_on_head) { +TEST_P(SyncAsync, ExceptionOnHead) { setInjectionModel(std::make_shared()); getSettings().s3_check_objects_after_upload = true; @@ -633,7 +633,7 @@ TEST_P(SyncAsync, exception_on_head) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -645,7 +645,7 @@ TEST_P(SyncAsync, exception_on_head) { }, DB::S3Exception); } -TEST_P(SyncAsync, exception_on_put) { +TEST_P(SyncAsync, ExceptionOnPut) { setInjectionModel(std::make_shared()); EXPECT_THROW({ @@ -655,7 +655,7 @@ TEST_P(SyncAsync, exception_on_put) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -671,7 +671,7 @@ TEST_P(SyncAsync, exception_on_put) { auto buffer = getWriteBuffer("exception_on_put_2"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -686,10 +686,10 @@ TEST_P(SyncAsync, exception_on_put) { try { auto buffer = getWriteBuffer("exception_on_put_3"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->preFinalize(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -702,7 +702,7 @@ TEST_P(SyncAsync, exception_on_put) { } -TEST_P(SyncAsync, exception_on_create_mpu) { +TEST_P(SyncAsync, ExceptionOnCreateMPU) { setInjectionModel(std::make_shared()); getSettings().s3_max_single_part_upload_size = 0; // no single part @@ -716,7 +716,7 @@ TEST_P(SyncAsync, exception_on_create_mpu) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -733,7 +733,7 @@ TEST_P(SyncAsync, exception_on_create_mpu) { buffer->write('A'); buffer->preFinalize(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -749,7 +749,7 @@ TEST_P(SyncAsync, exception_on_create_mpu) { auto buffer = getWriteBuffer("exception_on_create_mpu_2"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch( const DB::Exception& e ) @@ -762,7 +762,7 @@ TEST_P(SyncAsync, 
exception_on_create_mpu) { } -TEST_P(SyncAsync, exception_on_complete_mpu) { +TEST_P(SyncAsync, ExceptionOnCompleteMPU) { setInjectionModel(std::make_shared()); getSettings().s3_max_single_part_upload_size = 0; // no single part @@ -773,7 +773,7 @@ TEST_P(SyncAsync, exception_on_complete_mpu) { auto buffer = getWriteBuffer("exception_on_complete_mpu_1"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch(const DB::Exception & e) @@ -785,7 +785,7 @@ TEST_P(SyncAsync, exception_on_complete_mpu) { }, DB::S3Exception); } -TEST_P(SyncAsync, exception_on_upload_part) { +TEST_P(SyncAsync, ExceptionOnUploadPart) { setInjectionModel(std::make_shared()); getSettings().s3_max_single_part_upload_size = 0; // no single part @@ -804,7 +804,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } @@ -820,7 +820,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { EXPECT_THROW({ try { auto buffer = getWriteBuffer("exception_on_upload_part_2"); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->write('A'); buffer->next(); @@ -848,7 +848,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { buffer->preFinalize(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch(const DB::Exception & e) @@ -865,7 +865,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { auto buffer = getWriteBuffer("exception_on_upload_part_4"); buffer->write('A'); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); } catch(const DB::Exception & e) @@ -879,7 +879,7 @@ TEST_P(SyncAsync, exception_on_upload_part) { } -TEST_F(WBS3Test, prefinalize_called_multiple_times) { +TEST_F(WBS3Test, PrefinalizeCalledMultipleTimes) { #ifdef ABORT_ON_LOGICAL_ERROR GTEST_SKIP() << "this test trigger LOGICAL_ERROR, runs only if ABORT_ON_LOGICAL_ERROR is not defined"; #else @@ -904,14 +904,14 @@ TEST_F(WBS3Test, prefinalize_called_multiple_times) { #endif } -TEST_P(SyncAsync, empty_file) { +TEST_P(SyncAsync, EmptyFile) { getSettings().s3_check_objects_after_upload = true; MockS3::EventCounts counters = {.headObject = 2, .putObject = 1}; runSimpleScenario(counters, 0); } -TEST_P(SyncAsync, manual_next_calls) { +TEST_P(SyncAsync, ManualNextCalls) { getSettings().s3_check_objects_after_upload = true; { @@ -920,7 +920,7 @@ TEST_P(SyncAsync, manual_next_calls) { auto buffer = getWriteBuffer("manual_next_calls_1"); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); @@ -933,7 +933,7 @@ TEST_P(SyncAsync, manual_next_calls) { buffer->next(); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); @@ -947,7 +947,7 @@ TEST_P(SyncAsync, manual_next_calls) { buffer->write('A'); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); @@ -963,14 +963,14 @@ TEST_P(SyncAsync, manual_next_calls) { buffer->next(); buffer->next(); - getAsyncPolicy().setAutoExecute(); + getAsyncPolicy().setAutoExecute(true); buffer->finalize(); assertCountersEQ(counters); } } -TEST_P(SyncAsync, small_file_is_one_put_request) { +TEST_P(SyncAsync, SmallFileIsOnePutRequest) { 
getSettings().s3_check_objects_after_upload = true; { @@ -999,7 +999,7 @@ TEST_P(SyncAsync, small_file_is_one_put_request) { } } -TEST_P(SyncAsync, little_bigger_file_is_multi_part_upload) { +TEST_P(SyncAsync, LittleBiggerFileIsMultiPartUpload) { getSettings().s3_check_objects_after_upload = true; { @@ -1026,7 +1026,7 @@ TEST_P(SyncAsync, little_bigger_file_is_multi_part_upload) { } } -TEST_P(SyncAsync, bigger_file_is_multi_part_upload) { +TEST_P(SyncAsync, BiggerFileIsMultiPartUpload) { getSettings().s3_check_objects_after_upload = true; { @@ -1059,7 +1059,7 @@ TEST_P(SyncAsync, bigger_file_is_multi_part_upload) { } } -TEST_P(SyncAsync, increase_upload_buffer) { +TEST_P(SyncAsync, IncreaseUploadBuffer) { getSettings().s3_check_objects_after_upload = true; { @@ -1092,7 +1092,7 @@ TEST_P(SyncAsync, increase_upload_buffer) { } } -TEST_P(SyncAsync, increase_limited) { +TEST_P(SyncAsync, IncreaseLimited) { getSettings().s3_check_objects_after_upload = true; { From a4694ac1858890bcd3d35547cf5c4417252933c9 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 12 May 2023 12:29:29 +0200 Subject: [PATCH 028/127] Add test. --- .../01418_custom_settings.reference | 16 +++++++++---- .../0_stateless/01418_custom_settings.sql | 24 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01418_custom_settings.reference b/tests/queries/0_stateless/01418_custom_settings.reference index cf0cb35c72a..8484a5d0e6f 100644 --- a/tests/queries/0_stateless/01418_custom_settings.reference +++ b/tests/queries/0_stateless/01418_custom_settings.reference @@ -1,3 +1,4 @@ +--- assigning --- 5 UInt8 -177 Int16 98.11 Float64 @@ -6,7 +7,7 @@ custom_a UInt64_5 custom_b Int64_-177 custom_c Float64_98.11 custom_d \'abc def\' - +--- modifying --- changed String \N Nullable(Nothing) 50000 UInt16 @@ -15,9 +16,10 @@ custom_a \'changed\' custom_b NULL custom_c UInt64_50000 custom_d Float64_1.11 - +--- undefined setting --- 404 UInt16 - +--- wrong prefix --- +--- using query context --- -0.333 Float64 custom_e Float64_-0.333 404 UInt16 @@ -25,7 +27,13 @@ custom_e UInt64_404 word String custom_f \'word\' 0 - +--- compound identifier --- test String custom_compound.identifier.v1 \'test\' CREATE SETTINGS PROFILE s1_01418 SETTINGS custom_compound.identifier.v2 = 100 +--- null type --- +\N Nullable(Nothing) +custom_null NULL +\N Nullable(Nothing) +custom_null NULL +CREATE SETTINGS PROFILE s2_01418 SETTINGS custom_null = NULL diff --git a/tests/queries/0_stateless/01418_custom_settings.sql b/tests/queries/0_stateless/01418_custom_settings.sql index 95051db3a34..be18f553589 100644 --- a/tests/queries/0_stateless/01418_custom_settings.sql +++ b/tests/queries/0_stateless/01418_custom_settings.sql @@ -1,3 +1,6 @@ +DROP SETTINGS PROFILE IF EXISTS s1_01418, s2_01418; + +SELECT '--- assigning ---'; SET custom_a = 5; SET custom_b = -177; SET custom_c = 98.11; @@ -8,7 +11,7 @@ SELECT getSetting('custom_c') as v, toTypeName(v); SELECT getSetting('custom_d') as v, toTypeName(v); SELECT name, value FROM system.settings WHERE name LIKE 'custom_%' ORDER BY name; -SELECT ''; +SELECT '--- modifying ---'; SET custom_a = 'changed'; SET custom_b = NULL; SET custom_c = 50000; @@ -19,14 +22,15 @@ SELECT getSetting('custom_c') as v, toTypeName(v); SELECT getSetting('custom_d') as v, toTypeName(v); SELECT name, value FROM system.settings WHERE name LIKE 'custom_%' ORDER BY name; -SELECT ''; +SELECT '--- undefined setting ---'; SELECT getSetting('custom_e') as v, toTypeName(v); -- { serverError 115 } -- 
Setting not found. SET custom_e = 404; SELECT getSetting('custom_e') as v, toTypeName(v); +SELECT '--- wrong prefix ---'; SET invalid_custom = 8; -- { serverError 115 } -- Setting is neither a builtin nor started with one of the registered prefixes for user-defined settings. -SELECT ''; +SELECT '--- using query context ---'; SELECT getSetting('custom_e') as v, toTypeName(v) SETTINGS custom_e = -0.333; SELECT name, value FROM system.settings WHERE name = 'custom_e' SETTINGS custom_e = -0.333; SELECT getSetting('custom_e') as v, toTypeName(v); @@ -37,7 +41,7 @@ SELECT name, value FROM system.settings WHERE name = 'custom_f' SETTINGS custom_ SELECT getSetting('custom_f') as v, toTypeName(v); -- { serverError 115 } -- Setting not found. SELECT COUNT() FROM system.settings WHERE name = 'custom_f'; -SELECT ''; +SELECT '--- compound identifier ---'; SET custom_compound.identifier.v1 = 'test'; SELECT getSetting('custom_compound.identifier.v1') as v, toTypeName(v); SELECT name, value FROM system.settings WHERE name = 'custom_compound.identifier.v1'; @@ -45,3 +49,15 @@ SELECT name, value FROM system.settings WHERE name = 'custom_compound.identifier CREATE SETTINGS PROFILE s1_01418 SETTINGS custom_compound.identifier.v2 = 100; SHOW CREATE SETTINGS PROFILE s1_01418; DROP SETTINGS PROFILE s1_01418; + +SELECT '--- null type ---'; +SELECT getSetting('custom_null') as v, toTypeName(v) SETTINGS custom_null = NULL; +SELECT name, value FROM system.settings WHERE name = 'custom_null' SETTINGS custom_null = NULL; + +SET custom_null = NULL; +SELECT getSetting('custom_null') as v, toTypeName(v); +SELECT name, value FROM system.settings WHERE name = 'custom_null'; + +CREATE SETTINGS PROFILE s2_01418 SETTINGS custom_null = NULL; +SHOW CREATE SETTINGS PROFILE s2_01418; +DROP SETTINGS PROFILE s2_01418; From fcce60ad25243f128d97a264cdec4c025e67a1b3 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Fri, 12 May 2023 15:09:34 +0200 Subject: [PATCH 029/127] Randomize enable_multiple_prewhere_read_steps --- tests/clickhouse-test | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index e279b899a93..09bc15fbe3d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -530,6 +530,7 @@ class SettingsRandomizer: "max_threads": lambda: random.randint(1, 64), "optimize_or_like_chain": lambda: random.randint(0, 1), "optimize_read_in_order": lambda: random.randint(0, 1), + "enable_multiple_prewhere_read_steps": lambda: random.randint(0, 1), "read_in_order_two_level_merge_threshold": lambda: random.randint(0, 100), "optimize_aggregation_in_order": lambda: random.randint(0, 1), "aggregation_in_order_max_block_bytes": lambda: random.randint(0, 50000000), From 1e3b7af97a5845c0dd02299a40e65bad5e6468fb Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 12 May 2023 10:26:05 -0300 Subject: [PATCH 030/127] Add setting to limit the max number of pairs produced by extractKeyValuePairs --- src/Common/ErrorCodes.cpp | 1 + src/Core/Settings.h | 1 + .../keyvaluepair/extractKeyValuePairs.cpp | 13 ++++++-- .../impl/CHKeyValuePairExtractor.h | 15 +++++++--- .../impl/KeyValuePairExtractorBuilder.cpp | 14 ++++++--- .../impl/KeyValuePairExtractorBuilder.h | 3 ++ ...t_key_value_pairs_multiple_input.reference | 16 ++++++++++ ...extract_key_value_pairs_multiple_input.sql | 30 +++++++++++++++++++ 8 files changed, 82 insertions(+), 11 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index e9dc5649245..382b8ed8019 
100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -576,6 +576,7 @@ M(691, UNKNOWN_ELEMENT_OF_ENUM) \ M(692, TOO_MANY_MUTATIONS) \ M(693, AWS_ERROR) \ + M(694, TOO_LARGE_MAP_SIZE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f60632fae91..3d1640e21b1 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -748,6 +748,7 @@ class IColumn; M(Bool, optimize_distinct_in_order, false, "This optimization has a bug and it is disabled. Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ M(Bool, allow_experimental_undrop_table_query, false, "Allow to use undrop query to restore dropped table in a limited time", 0) \ M(Bool, keeper_map_strict_mode, false, "Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key", 0) \ + M(UInt64, extract_kvp_max_pairs_per_row, 1000, "Max number pairs that can be produced by extractKeyValuePairs function. Used to safeguard against consuming too much memory.", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp index a1b140001ac..00588052870 100644 --- a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -7,6 +7,8 @@ #include #include +#include + #include #include #include @@ -41,6 +43,8 @@ class ExtractKeyValuePairs : public IFunction builder.withQuotingCharacter(parsed_arguments.quoting_character.value()); } + builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + return builder.build(); } @@ -73,7 +77,7 @@ class ExtractKeyValuePairs : public IFunction } public: - ExtractKeyValuePairs() = default; + explicit ExtractKeyValuePairs(ContextPtr context_) : context(context_) {} static constexpr auto name = Name::name; @@ -82,9 +86,9 @@ public: return name; } - static FunctionPtr create(ContextPtr) + static FunctionPtr create(ContextPtr context) { - return std::make_shared(); + return std::make_shared(context); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override @@ -120,6 +124,9 @@ public: { return {1, 2, 3, 4}; } + +private: + ContextPtr context; }; struct NameExtractKeyValuePairs diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 139417579de..24c8d3cd89a 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -13,6 +13,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int TOO_LARGE_MAP_SIZE; } /* @@ -25,8 +26,8 @@ class CHKeyValuePairExtractor : public KeyValuePairExtractor using NextState = DB::extractKV::StateHandler::NextState; public: - explicit CHKeyValuePairExtractor(StateHandler state_handler_) - : state_handler(std::move(state_handler_)) + explicit CHKeyValuePairExtractor(StateHandler state_handler_, uint64_t max_number_of_pairs_) + : state_handler(std::move(state_handler_)), max_number_of_pairs(max_number_of_pairs_) {} uint64_t extract(const std::string & data, ColumnString::MutablePtr & keys, ColumnString::MutablePtr & 
values) override @@ -113,11 +114,16 @@ private: NextState flushPair(const std::string_view & file, auto & key, auto & value, uint64_t & row_offset) { + row_offset++; + + if (row_offset > max_number_of_pairs) + { + throw Exception(ErrorCodes::TOO_LARGE_MAP_SIZE, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); + } + key.commit(); value.commit(); - row_offset++; - return {0, file.empty() ? State::END : State::WAITING_KEY}; } @@ -128,6 +134,7 @@ private: } StateHandler state_handler; + uint64_t max_number_of_pairs; }; } diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp index 188285dc4bd..7f2a6449ab0 100644 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.cpp @@ -31,6 +31,12 @@ KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withEscaping() return *this; } +KeyValuePairExtractorBuilder & KeyValuePairExtractorBuilder::withMaxNumberOfPairs(uint64_t max_number_of_pairs_) +{ + max_number_of_pairs = max_number_of_pairs_; + return *this; +} + std::shared_ptr KeyValuePairExtractorBuilder::build() const { if (with_escaping) @@ -46,9 +52,9 @@ namespace using namespace extractKV; template -auto makeStateHandler(const T && handler) +auto makeStateHandler(const T && handler, uint64_t max_number_of_pairs) { - return std::make_shared>(handler); + return std::make_shared>(handler, max_number_of_pairs); } } @@ -57,14 +63,14 @@ std::shared_ptr KeyValuePairExtractorBuilder::buildWithou { auto configuration = ConfigurationFactory::createWithoutEscaping(key_value_delimiter, quoting_character, item_delimiters); - return makeStateHandler(NoEscapingStateHandler(configuration)); + return makeStateHandler(NoEscapingStateHandler(configuration), max_number_of_pairs); } std::shared_ptr KeyValuePairExtractorBuilder::buildWithEscaping() const { auto configuration = ConfigurationFactory::createWithEscaping(key_value_delimiter, quoting_character, item_delimiters); - return makeStateHandler(InlineEscapingStateHandler(configuration)); + return makeStateHandler(InlineEscapingStateHandler(configuration), max_number_of_pairs); } } diff --git a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h index 5746c58dccc..0c673f12ccf 100644 --- a/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h +++ b/src/Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h @@ -20,6 +20,8 @@ public: KeyValuePairExtractorBuilder & withEscaping(); + KeyValuePairExtractorBuilder & withMaxNumberOfPairs(uint64_t max_number_of_pairs_); + std::shared_ptr build() const; private: @@ -27,6 +29,7 @@ private: char key_value_delimiter = ':'; char quoting_character = '"'; std::vector item_delimiters = {' ', ',', ';'}; + uint64_t max_number_of_pairs = std::numeric_limits::max(); std::shared_ptr buildWithEscaping() const; diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index 125afa19427..ec51b61a382 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -292,6 +292,22 @@ SELECT x; {'age':'31','last_key':'last_value','name':'neymar','nationality':'brazil','team':'psg'} -- { echoOn } + +SET 
extract_kvp_max_pairs_per_row = 2; +-- Should be allowed because it no longer exceeds the max number of pairs +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; +{'key1':'value1','key2':'value2'} +SET extract_kvp_max_pairs_per_row = 99999; -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map, diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 0a1500b1796..7dfcae879b0 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -414,7 +414,37 @@ WITH SELECT x; -- {serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH} +-- Should fail allowed because it exceeds the max number of pairs +SET extract_kvp_max_pairs_per_row = 1; +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; -- {serverError TOO_LARGE_MAP_SIZE} + -- { echoOn } + +SET extract_kvp_max_pairs_per_row = 2; +-- Should be allowed because it no longer exceeds the max number of pairs +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; + +SET extract_kvp_max_pairs_per_row = 99999; + -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map, From 5fdaef851c8787d7d2bfb4a25c9729d986b8005b Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 12 May 2023 14:14:40 +0000 Subject: [PATCH 031/127] Handle constants carefully --- src/Analyzer/FunctionNode.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Analyzer/FunctionNode.cpp b/src/Analyzer/FunctionNode.cpp index 2385531db4f..f5bcdc103d2 100644 --- a/src/Analyzer/FunctionNode.cpp +++ b/src/Analyzer/FunctionNode.cpp @@ -210,7 +210,8 @@ ASTPtr FunctionNode::toASTImpl(const ConvertToASTOptions & options) const } auto new_options = options; - if (function_name == "_CAST") + /// To avoid surrounding constants with several internal casts. 
+ if (function_name == "_CAST" && (*getArguments().begin())->getNodeType() == QueryTreeNodeType::CONSTANT) new_options.add_cast_for_constants = false; const auto & parameters = getParameters(); From b1549a19a50048a93f5cc0b48c482160fcb66613 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 12 May 2023 11:19:35 -0300 Subject: [PATCH 032/127] Use 0 as unlimited --- .../keyvaluepair/extractKeyValuePairs.cpp | 7 ++++++- ...tract_key_value_pairs_multiple_input.reference | 15 ++++++++++++++- ...499_extract_key_value_pairs_multiple_input.sql | 14 +++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp index 00588052870..9956eaf01f1 100644 --- a/src/Functions/keyvaluepair/extractKeyValuePairs.cpp +++ b/src/Functions/keyvaluepair/extractKeyValuePairs.cpp @@ -43,7 +43,12 @@ class ExtractKeyValuePairs : public IFunction builder.withQuotingCharacter(parsed_arguments.quoting_character.value()); } - builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + bool is_number_of_pairs_unlimited = context->getSettingsRef().extract_kvp_max_pairs_per_row == 0; + + if (!is_number_of_pairs_unlimited) + { + builder.withMaxNumberOfPairs(context->getSettingsRef().extract_kvp_max_pairs_per_row); + } return builder.build(); } diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference index ec51b61a382..a4ff1464fb8 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.reference @@ -307,7 +307,20 @@ WITH SELECT x; {'key1':'value1','key2':'value2'} -SET extract_kvp_max_pairs_per_row = 99999; +SET extract_kvp_max_pairs_per_row = 0; +-- Should be allowed because max pairs per row is set to 0 (unlimited) +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; +{'key1':'value1','key2':'value2'} -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH extractKeyValuePairs('not_important', ':', '12345678', '\'') AS s_map, diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 7dfcae879b0..8c5f0365cc7 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -443,7 +443,19 @@ WITH SELECT x; -SET extract_kvp_max_pairs_per_row = 99999; +SET extract_kvp_max_pairs_per_row = 0; +-- Should be allowed because max pairs per row is set to 0 (unlimited) +-- expected output: {'key1':'value1','key2':'value2'} +WITH + extractKeyValuePairs('key1:value1,key2:value2') AS s_map, + CAST( + arrayMap( + (x) -> (x, s_map[x]), arraySort(mapKeys(s_map)) + ), + 'Map(String,String)' + ) AS x +SELECT + x; -- should not fail because pair delimiters contains 8 characters, which is within the limit WITH From d2e185d86df4f7e6a7b19134d7de2f960e61a644 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 12 May 2023 14:36:54 +0000 Subject: [PATCH 033/127] Analyzer: do not optimize GROUP BY keys with ROLLUP and CUBE --- 
src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp | 3 +++ .../queries/0_stateless/02734_optimize_group_by.reference | 8 ++++++++ tests/queries/0_stateless/02734_optimize_group_by.sql | 7 +++++++ 3 files changed, 18 insertions(+) create mode 100644 tests/queries/0_stateless/02734_optimize_group_by.reference create mode 100644 tests/queries/0_stateless/02734_optimize_group_by.sql diff --git a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp index c97645219da..5ed52f1210b 100644 --- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp +++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp @@ -38,6 +38,9 @@ public: if (!query->hasGroupBy()) return; + if (query->isGroupByWithCube() || query->isGroupByWithRollup()) + return; + auto & group_by = query->getGroupBy().getNodes(); if (query->isGroupByWithGroupingSets()) { diff --git a/tests/queries/0_stateless/02734_optimize_group_by.reference b/tests/queries/0_stateless/02734_optimize_group_by.reference new file mode 100644 index 00000000000..3f5ef03cb61 --- /dev/null +++ b/tests/queries/0_stateless/02734_optimize_group_by.reference @@ -0,0 +1,8 @@ +a b +a b +a b + b +a b +a b +a b +a b diff --git a/tests/queries/0_stateless/02734_optimize_group_by.sql b/tests/queries/0_stateless/02734_optimize_group_by.sql new file mode 100644 index 00000000000..28e86c04b0f --- /dev/null +++ b/tests/queries/0_stateless/02734_optimize_group_by.sql @@ -0,0 +1,7 @@ +SELECT 'a' AS key, 'b' as value GROUP BY key WITH CUBE SETTINGS allow_experimental_analyzer = 0; +SELECT 'a' AS key, 'b' as value GROUP BY key WITH CUBE SETTINGS allow_experimental_analyzer = 1; + +SELECT 'a' AS key, 'b' as value GROUP BY ignore(1) WITH CUBE; + +SELECT 'a' AS key, 'b' as value GROUP BY ignore(1); +SELECT 'a' AS key, 'b' as value GROUP BY key; From 37e4c531ba5ff9b639830f32c6a4b8cd9a42d9d6 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 12 May 2023 15:02:37 +0000 Subject: [PATCH 034/127] Don't update contrib --- contrib/boost | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/boost b/contrib/boost index 1035c8bfcc9..8fe7b3326ef 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 1035c8bfcc9a3c1cfa7f6e827db94dae1ce1a43a +Subproject commit 8fe7b3326ef482ee6ecdf5a4f698f2b8c2780f98 From f0d21a910078c12fe3a4a038ef283a5ceb6dc219 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 3 Mar 2023 11:00:12 +0100 Subject: [PATCH 035/127] row policy template: initial, works, restrictive rules fix --- src/Access/AccessEntityIO.cpp | 4 + src/Access/EnabledRowPolicies.cpp | 36 ++++- src/Access/EnabledRowPolicies.h | 8 ++ src/Access/RowPolicy.h | 1 + src/Access/RowPolicyCache.cpp | 128 ++++++++++++++---- src/Access/RowPolicyCache.h | 7 +- .../Access/ParserCreateRowPolicyQuery.cpp | 1 + src/Parsers/Access/ParserRowPolicyName.cpp | 22 ++- src/Parsers/parseQuery.cpp | 2 + 9 files changed, 183 insertions(+), 26 deletions(-) diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index 80bb63b04bf..008ba5e5dfe 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -28,6 +28,9 @@ #include #include +#include + + namespace DB { namespace ErrorCodes @@ -62,6 +65,7 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition) const char * end = begin + definition.size(); while (pos < end) { + LOG_TRACE((&Poco::Logger::get("deserializeAccessEntityImpl")), "{}", std::string(pos, end)); 
queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH)); while (isWhitespaceASCII(*pos) || *pos == ';') ++pos; diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index c00dcf9e3a7..225f211bdd4 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace DB { @@ -18,6 +20,12 @@ size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const return std::hash{}(key.database) - std::hash{}(key.table_name) + static_cast(key.filter_type); } + +// size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const +// { +// return std::hash{}(key.database) + static_cast(key.filter_type); +// } + EnabledRowPolicies::EnabledRowPolicies() : params() { } @@ -32,11 +40,37 @@ EnabledRowPolicies::~EnabledRowPolicies() = default; RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const String & table_name, RowPolicyFilterType filter_type) const { /// We don't lock `mutex` here. + auto loaded = mixed_filters.load(); + { + + + for (auto it = loaded->begin(); it != loaded->end(); ++it) + { + LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), " db: {}, table {}", it->first.database, it->first.table_name); + + } + + } + + + + auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) - return {}; + { + it = loaded->find({database, "*", filter_type}); + if (it == loaded->end()) + { + LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - not found ({} records)", + database, table_name, loaded->size()); + return {}; + } + } + + LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - found ({} records)", + database, table_name, loaded->size()); return it->second; } diff --git a/src/Access/EnabledRowPolicies.h b/src/Access/EnabledRowPolicies.h index b8e6b2e0549..e09d32264f0 100644 --- a/src/Access/EnabledRowPolicies.h +++ b/src/Access/EnabledRowPolicies.h @@ -72,6 +72,14 @@ private: auto toTuple() const { return std::tie(database, table_name, filter_type); } friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() == right.toTuple(); } friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() != right.toTuple(); } + // friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) + // { + // return left.database == right.database && left.filter_type == right.filter_type; + // } + // friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) + // { + // return left.database != right.database || left.filter_type != right.filter_type; + // } }; struct Hash diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 99e6f1992f5..31ee876b47b 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -34,6 +34,7 @@ struct RowPolicy : public IAccessEntity /// in addition to all the restrictive policies. void setPermissive(bool permissive_ = true) { setRestrictive(!permissive_); } bool isPermissive() const { return !isRestrictive(); } + bool isDatabase() const { return full_name.table_name == "*"; } /// Sets that the policy is restrictive. 
/// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 1036df92609..81e5acdf3ce 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -11,6 +11,8 @@ #include #include +#include + namespace DB { @@ -148,9 +150,19 @@ void RowPolicyCache::ensureAllRowPoliciesRead() for (const UUID & id : access_control.findAll()) { - auto quota = access_control.tryRead(id); - if (quota) - all_policies.emplace(id, PolicyInfo(quota)); + auto policy = access_control.tryRead(id); + if (policy) + { + PolicyInfo policy_info(policy); + if (policy_info.database_and_table_name->second == "*") + { + database_policies.emplace(id, std::move(policy_info)); + } + else + { + table_policies.emplace(id, std::move(policy_info)); + } + } } } @@ -158,15 +170,23 @@ void RowPolicyCache::ensureAllRowPoliciesRead() void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy) { std::lock_guard lock{mutex}; - auto it = all_policies.find(policy_id); - if (it == all_policies.end()) + bool found = true; + + auto it = table_policies.find(policy_id); + if (it == table_policies.end()) { - it = all_policies.emplace(policy_id, PolicyInfo(new_policy)).first; + it = database_policies.find(policy_id); + if (it == database_policies.end()) + { + PolicyMap & policy_map = new_policy->isDatabase() ? database_policies : table_policies; + it = policy_map.emplace(policy_id, PolicyInfo(new_policy)).first; + found = false; + } } - else + + if (found && it->second.policy == new_policy) { - if (it->second.policy == new_policy) - return; + return; } auto & info = it->second; @@ -178,7 +198,15 @@ void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPo void RowPolicyCache::rowPolicyRemoved(const UUID & policy_id) { std::lock_guard lock{mutex}; - all_policies.erase(policy_id); + auto it = database_policies.find(policy_id); + if (it != database_policies.end()) + { + database_policies.erase(it); + } + else + { + table_policies.erase(policy_id); + } mixFilters(); } @@ -215,22 +243,71 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) std::vector policies; }; - std::unordered_map mixers; + std::unordered_map table_mixers; + std::unordered_map database_mixers; - for (const auto & [policy_id, info] : all_policies) + + for (const auto & [policy_id, info] : database_policies) { const auto & policy = *info.policy; bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); - MixedFiltersKey key; - key.database = info.database_and_table_name->first; - key.table_name = info.database_and_table_name->second; for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) { auto filter_type_i = static_cast(filter_type); if (info.parsed_filters[filter_type_i]) { - key.filter_type = filter_type; - auto & mixer = mixers[key]; + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "db: {} : {}", key.database, key.table_name); + + auto & mixer = database_mixers[key]; // getting database level mixer + mixer.database_and_table_name = info.database_and_table_name; + if (match) + { + mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); + mixer.policies.push_back(info.policy); + } + } + } + } + + + for (const auto & [policy_id, info] : table_policies) + { + const auto & policy = *info.policy; + bool match 
= info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); + for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) + { + auto filter_type_i = static_cast(filter_type); + if (info.parsed_filters[filter_type_i]) + { + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: {} : {}", key.database, key.table_name); + auto table_it = table_mixers.find(key); + if (table_it == table_mixers.end()) + { + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, looking for db"); + MixedFiltersKey database_key = key; + database_key.table_name = "*"; + + auto database_it = database_mixers.find(database_key); + + if (database_it == database_mixers.end()) + { + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database not found"); + table_it = table_mixers.try_emplace(key).first; + } + else + { + LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database found"); + table_it = table_mixers.insert({key, database_it->second}).first; + } + } + + auto & mixer = table_it->second; // table_mixers[key]; getting table level mixer mixer.database_and_table_name = info.database_and_table_name; if (match) { @@ -242,15 +319,20 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) } auto mixed_filters = boost::make_shared(); - for (auto & [key, mixer] : mixers) + + for (auto mixer_map_ptr : { &table_mixers, &database_mixers}) { - auto mixed_filter = std::make_shared(); - mixed_filter->database_and_table_name = std::move(mixer.database_and_table_name); - mixed_filter->expression = std::move(mixer.mixer).getResult(access_control.isEnabledUsersWithoutRowPoliciesCanReadRows()); - mixed_filter->policies = std::move(mixer.policies); - mixed_filters->emplace(key, std::move(mixed_filter)); + for (auto & [key, mixer] : *mixer_map_ptr) + { + auto mixed_filter = std::make_shared(); + mixed_filter->database_and_table_name = std::move(mixer.database_and_table_name); + mixed_filter->expression = std::move(mixer.mixer).getResult(access_control.isEnabledUsersWithoutRowPoliciesCanReadRows()); + mixed_filter->policies = std::move(mixer.policies); + mixed_filters->emplace(key, std::move(mixed_filter)); + } } + enabled.mixed_filters.store(mixed_filters); } diff --git a/src/Access/RowPolicyCache.h b/src/Access/RowPolicyCache.h index 4fbf90d1a2d..49e6a2e5df3 100644 --- a/src/Access/RowPolicyCache.h +++ b/src/Access/RowPolicyCache.h @@ -12,6 +12,7 @@ namespace DB class AccessControl; struct RolesOrUsersSet; struct RowPolicy; + using RowPolicyPtr = std::shared_ptr; /// Stores read and parsed row policies. 
@@ -35,14 +36,18 @@ private: ASTPtr parsed_filters[static_cast(RowPolicyFilterType::MAX)]; }; + using PolicyMap = std::unordered_map; + void ensureAllRowPoliciesRead(); void rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy); void rowPolicyRemoved(const UUID & policy_id); void mixFilters(); void mixFiltersFor(EnabledRowPolicies & enabled); + const AccessControl & access_control; - std::unordered_map all_policies; + PolicyMap database_policies; + PolicyMap table_policies; bool all_policies_read = false; scope_guard subscription; std::map> enabled_row_policies; diff --git a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp index 2c25fc14e7d..03d0754ca1a 100644 --- a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp +++ b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp @@ -203,6 +203,7 @@ namespace bool ParserCreateRowPolicyQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { + // poco_assert("ParserCreateRowPolicyQuery::parseImpl" == nullptr); bool alter = false; if (attach_mode) { diff --git a/src/Parsers/Access/ParserRowPolicyName.cpp b/src/Parsers/Access/ParserRowPolicyName.cpp index cf5d2ab21b6..e087c45fad9 100644 --- a/src/Parsers/Access/ParserRowPolicyName.cpp +++ b/src/Parsers/Access/ParserRowPolicyName.cpp @@ -7,6 +7,8 @@ #include #include +#include + namespace DB { @@ -26,8 +28,19 @@ namespace return IParserBase::wrapParseImpl(pos, [&] { String res_database, res_table_name; - if (!parseDatabaseAndTableName(pos, expected, res_database, res_table_name)) + // if (!parseDatabaseAndTableName(pos, expected, res_database, res_table_name)) + bool any_database = false; + bool any_table = true; + + if (!parseDatabaseAndTableNameOrAsterisks(pos, expected, res_database, any_database, res_table_name, any_table)) + { + // poco_assert("parseDatabaseAndTableName failed" == nullptr); + LOG_TRACE((&Poco::Logger::get("ParserRowPolicyName")), "parseDatabaseAndTableName failed"); return false; + } + if (any_table) + res_table_name = "*"; + /// If table is specified without DB it cannot be followed by "ON" /// (but can be followed by "ON CLUSTER"). 
@@ -51,8 +64,10 @@ namespace } + bool parseOnDBAndTableName(IParser::Pos & pos, Expected & expected, String & database, String & table_name) { + // poco_assert("parseOnDBAndTableNames" == nullptr); return IParserBase::wrapParseImpl(pos, [&] { return ParserKeyword{"ON"}.ignore(pos, expected) && parseDBAndTableName(pos, expected, database, table_name); @@ -62,6 +77,9 @@ namespace bool parseOnDBAndTableNames(IParser::Pos & pos, Expected & expected, std::vector> & database_and_table_names) { + // poco_assert("parseOnDBAndTableNames" == nullptr); + + return IParserBase::wrapParseImpl(pos, [&] { if (!ParserKeyword{"ON"}.ignore(pos, expected)) @@ -146,6 +164,7 @@ namespace bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { + // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; String cluster; if (!parseRowPolicyNamesAroundON(pos, expected, false, false, allow_on_cluster, full_names, cluster)) @@ -162,6 +181,7 @@ bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte bool ParserRowPolicyNames::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { + // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; size_t num_added_names_last_time = 0; String cluster; diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 8d794409f78..1b1ff439e13 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -139,6 +139,8 @@ void writeCommonErrorMessage( if (!query_description.empty()) out << " (" << query_description << ")"; + // poco_assert("writeCommonErrorMessage" == nullptr); + out << ": failed at position " << (last_token.begin - begin + 1); if (last_token.type == TokenType::EndOfStream || last_token.type == TokenType::Semicolon) From 7fa6ea4ccc7c03ea879396572fad24f7ffe93551 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 16 Mar 2023 17:02:05 +0100 Subject: [PATCH 036/127] row policy template: tests and code cleanup, code review changes --- .../statements/create/row-policy.md | 20 ++- src/Access/AccessEntityIO.cpp | 4 - src/Access/EnabledRowPolicies.cpp | 30 +--- src/Access/EnabledRowPolicies.h | 8 - src/Access/RolesOrUsersSet.cpp | 10 +- src/Access/RowPolicy.h | 4 +- src/Access/RowPolicyCache.cpp | 152 ++++++++---------- src/Access/RowPolicyCache.h | 8 +- .../Access/ParserCreateRowPolicyQuery.cpp | 1 - src/Parsers/Access/ParserRowPolicyName.cpp | 26 +-- src/Parsers/parseQuery.cpp | 2 - .../02131_row_policies_combination.sql | 8 + ...ow_policies_database_combination.reference | 42 +++++ ...5341_row_policies_database_combination.sql | 88 ++++++++++ .../25341_row_policy_database.reference | 22 +++ .../0_stateless/25341_row_policy_database.sql | 53 ++++++ 16 files changed, 315 insertions(+), 163 deletions(-) create mode 100644 tests/queries/0_stateless/25341_row_policies_database_combination.reference create mode 100644 tests/queries/0_stateless/25341_row_policies_database_combination.sql create mode 100644 tests/queries/0_stateless/25341_row_policy_database.reference create mode 100644 tests/queries/0_stateless/25341_row_policy_database.sql diff --git a/docs/en/sql-reference/statements/create/row-policy.md b/docs/en/sql-reference/statements/create/row-policy.md index aa0a07747f2..83bb2e6bb9a 100644 --- a/docs/en/sql-reference/statements/create/row-policy.md +++ b/docs/en/sql-reference/statements/create/row-policy.md @@ -14,8 +14,8 @@ Row policies makes sense only for users with readonly access. 
If user can modify Syntax: ``` sql -CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name1 [ON CLUSTER cluster_name1] ON [db1.]table1 - [, policy_name2 [ON CLUSTER cluster_name2] ON [db2.]table2 ...] +CREATE [ROW] POLICY [IF NOT EXISTS | OR REPLACE] policy_name1 [ON CLUSTER cluster_name1] ON [db1.]table1|db1.* + [, policy_name2 [ON CLUSTER cluster_name2] ON [db2.]table2|db2.* ...] [FOR SELECT] USING condition [AS {PERMISSIVE | RESTRICTIVE}] [TO {role1 [, role2 ...] | ALL | ALL EXCEPT role1 [, role2 ...]}] @@ -76,6 +76,20 @@ CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 AS RESTRICTIVE TO peter, antonio enables the user `peter` to see rows only if both `b=1` AND `c=2`. +Database policies are combined with table policies. + +For example, the following policies + +``` sql +CREATE ROW POLICY pol1 ON mydb.* USING b=1 TO mira, peter +CREATE ROW POLICY pol2 ON mydb.table1 USING c=2 AS RESTRICTIVE TO peter, antonio +``` + +enables the user `peter` to see table1 rows only if both `b=1` AND `c=2`, although +any other table in mydb would have only `b=1` policy applied for the user. + + + ## ON CLUSTER Clause Allows creating row policies on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). @@ -88,3 +102,5 @@ Allows creating row policies on a cluster, see [Distributed DDL](../../../sql-re `CREATE ROW POLICY filter2 ON mydb.mytable USING a<1000 AND b=5 TO ALL EXCEPT mira` `CREATE ROW POLICY filter3 ON mydb.mytable USING 1 TO admin` + +`CREATE ROW POLICY filter4 ON mydb.* USING 1 TO admin` diff --git a/src/Access/AccessEntityIO.cpp b/src/Access/AccessEntityIO.cpp index 008ba5e5dfe..80bb63b04bf 100644 --- a/src/Access/AccessEntityIO.cpp +++ b/src/Access/AccessEntityIO.cpp @@ -28,9 +28,6 @@ #include #include -#include - - namespace DB { namespace ErrorCodes @@ -65,7 +62,6 @@ AccessEntityPtr deserializeAccessEntityImpl(const String & definition) const char * end = begin + definition.size(); while (pos < end) { - LOG_TRACE((&Poco::Logger::get("deserializeAccessEntityImpl")), "{}", std::string(pos, end)); queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH)); while (isWhitespaceASCII(*pos) || *pos == ';') ++pos; diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index 225f211bdd4..9efac6e992e 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -3,8 +3,6 @@ #include #include -#include - namespace DB { @@ -20,12 +18,6 @@ size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const return std::hash{}(key.database) - std::hash{}(key.table_name) + static_cast(key.filter_type); } - -// size_t EnabledRowPolicies::Hash::operator()(const MixedFiltersKey & key) const -// { -// return std::hash{}(key.database) + static_cast(key.filter_type); -// } - EnabledRowPolicies::EnabledRowPolicies() : params() { } @@ -40,37 +32,17 @@ EnabledRowPolicies::~EnabledRowPolicies() = default; RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const String & table_name, RowPolicyFilterType filter_type) const { /// We don't lock `mutex` here. 
- auto loaded = mixed_filters.load(); - { - - - for (auto it = loaded->begin(); it != loaded->end(); ++it) - { - LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), " db: {}, table {}", it->first.database, it->first.table_name); - - } - - } - - - - auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) { - it = loaded->find({database, "*", filter_type}); + it = loaded->find({database, RowPolicy::ANY_TABLE_MARK, filter_type}); if (it == loaded->end()) { - LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - not found ({} records)", - database, table_name, loaded->size()); return {}; } } - - LOG_TRACE((&Poco::Logger::get("EnabledRowPolicies::getFilter")), "db: {}, table {} - found ({} records)", - database, table_name, loaded->size()); return it->second; } diff --git a/src/Access/EnabledRowPolicies.h b/src/Access/EnabledRowPolicies.h index e09d32264f0..b8e6b2e0549 100644 --- a/src/Access/EnabledRowPolicies.h +++ b/src/Access/EnabledRowPolicies.h @@ -72,14 +72,6 @@ private: auto toTuple() const { return std::tie(database, table_name, filter_type); } friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() == right.toTuple(); } friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) { return left.toTuple() != right.toTuple(); } - // friend bool operator==(const MixedFiltersKey & left, const MixedFiltersKey & right) - // { - // return left.database == right.database && left.filter_type == right.filter_type; - // } - // friend bool operator!=(const MixedFiltersKey & left, const MixedFiltersKey & right) - // { - // return left.database != right.database || left.filter_type != right.filter_type; - // } }; struct Hash diff --git a/src/Access/RolesOrUsersSet.cpp b/src/Access/RolesOrUsersSet.cpp index 52374c3739d..c026ae42f76 100644 --- a/src/Access/RolesOrUsersSet.cpp +++ b/src/Access/RolesOrUsersSet.cpp @@ -228,25 +228,25 @@ void RolesOrUsersSet::add(const std::vector & ids_) bool RolesOrUsersSet::match(const UUID & id) const { - return (all || ids.count(id)) && !except_ids.count(id); + return (all || ids.contains(id)) && !except_ids.contains(id); } bool RolesOrUsersSet::match(const UUID & user_id, const boost::container::flat_set & enabled_roles) const { - if (!all && !ids.count(user_id)) + if (!all && !ids.contains(user_id)) { bool found_enabled_role = std::any_of( - enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return ids.count(enabled_role); }); + enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return ids.contains(enabled_role); }); if (!found_enabled_role) return false; } - if (except_ids.count(user_id)) + if (except_ids.contains(user_id)) return false; bool in_except_list = std::any_of( - enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return except_ids.count(enabled_role); }); + enabled_roles.begin(), enabled_roles.end(), [this](const UUID & enabled_role) { return except_ids.contains(enabled_role); }); return !in_except_list; } diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 31ee876b47b..b9ba528e9bb 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -14,6 +14,8 @@ namespace DB */ struct RowPolicy : public IAccessEntity { + static constexpr char ANY_TABLE_MARK[] = "*"; + void setShortName(const String & short_name); void setDatabase(const String & database); void setTableName(const String & table_name); @@ 
-34,7 +36,7 @@ struct RowPolicy : public IAccessEntity /// in addition to all the restrictive policies. void setPermissive(bool permissive_ = true) { setRestrictive(!permissive_); } bool isPermissive() const { return !isRestrictive(); } - bool isDatabase() const { return full_name.table_name == "*"; } + bool isDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } /// Sets that the policy is restrictive. /// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 81e5acdf3ce..07bec185131 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -11,14 +11,13 @@ #include #include -#include - namespace DB { namespace { - /// Accumulates filters from multiple row policies and joins them using the AND logical operation. + /// Helper to accumulate filters from multiple row policies and join them together + /// by AND or OR logical operations. class FiltersMixer { public: @@ -153,15 +152,7 @@ void RowPolicyCache::ensureAllRowPoliciesRead() auto policy = access_control.tryRead(id); if (policy) { - PolicyInfo policy_info(policy); - if (policy_info.database_and_table_name->second == "*") - { - database_policies.emplace(id, std::move(policy_info)); - } - else - { - table_policies.emplace(id, std::move(policy_info)); - } + all_policies.emplace(id, PolicyInfo(policy)); } } } @@ -170,23 +161,15 @@ void RowPolicyCache::ensureAllRowPoliciesRead() void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy) { std::lock_guard lock{mutex}; - bool found = true; - - auto it = table_policies.find(policy_id); - if (it == table_policies.end()) + auto it = all_policies.find(policy_id); + if (it == all_policies.end()) { - it = database_policies.find(policy_id); - if (it == database_policies.end()) - { - PolicyMap & policy_map = new_policy->isDatabase() ? 
database_policies : table_policies; - it = policy_map.emplace(policy_id, PolicyInfo(new_policy)).first; - found = false; - } + it = all_policies.emplace(policy_id, PolicyInfo(new_policy)).first; } - - if (found && it->second.policy == new_policy) + else { - return; + if (it->second.policy == new_policy) + return; } auto & info = it->second; @@ -198,15 +181,7 @@ void RowPolicyCache::rowPolicyAddedOrChanged(const UUID & policy_id, const RowPo void RowPolicyCache::rowPolicyRemoved(const UUID & policy_id) { std::lock_guard lock{mutex}; - auto it = database_policies.find(policy_id); - if (it != database_policies.end()) - { - database_policies.erase(it); - } - else - { - table_policies.erase(policy_id); - } + all_policies.erase(policy_id); mixFilters(); } @@ -246,73 +221,76 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) std::unordered_map table_mixers; std::unordered_map database_mixers; - - for (const auto & [policy_id, info] : database_policies) + /// populate database_mixers using database-level policies + /// to aggregate (mix) rules per database + for (const auto & [policy_id, info] : all_policies) { - const auto & policy = *info.policy; - bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); - for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) + if (info.isDatabase()) { - auto filter_type_i = static_cast(filter_type); - if (info.parsed_filters[filter_type_i]) + const auto & policy = *info.policy; + bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); + for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) { - MixedFiltersKey key{info.database_and_table_name->first, - info.database_and_table_name->second, - filter_type}; - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "db: {} : {}", key.database, key.table_name); - - auto & mixer = database_mixers[key]; // getting database level mixer - mixer.database_and_table_name = info.database_and_table_name; - if (match) + auto filter_type_i = static_cast(filter_type); + if (info.parsed_filters[filter_type_i]) { - mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); - mixer.policies.push_back(info.policy); + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + + auto & mixer = database_mixers[key]; + mixer.database_and_table_name = info.database_and_table_name; + if (match) + { + mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); + mixer.policies.push_back(info.policy); + } } } } } - - for (const auto & [policy_id, info] : table_policies) + /// populate table_mixers using database_mixers and table-level policies + for (const auto & [policy_id, info] : all_policies) { - const auto & policy = *info.policy; - bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); - for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) + if (!info.isDatabase()) { - auto filter_type_i = static_cast(filter_type); - if (info.parsed_filters[filter_type_i]) + const auto & policy = *info.policy; + bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); + for (auto filter_type : collections::range(0, RowPolicyFilterType::MAX)) { - MixedFiltersKey key{info.database_and_table_name->first, - info.database_and_table_name->second, - filter_type}; - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: {} : {}", key.database, key.table_name); - auto table_it = 
table_mixers.find(key); - if (table_it == table_mixers.end()) + auto filter_type_i = static_cast(filter_type); + if (info.parsed_filters[filter_type_i]) { - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, looking for db"); - MixedFiltersKey database_key = key; - database_key.table_name = "*"; + MixedFiltersKey key{info.database_and_table_name->first, + info.database_and_table_name->second, + filter_type}; + auto table_it = table_mixers.find(key); + if (table_it == table_mixers.end()) + { /// no exact match - create new mixer + MixedFiltersKey database_key = key; + database_key.table_name = RowPolicy::ANY_TABLE_MARK; - auto database_it = database_mixers.find(database_key); + auto database_it = database_mixers.find(database_key); - if (database_it == database_mixers.end()) - { - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database not found"); - table_it = table_mixers.try_emplace(key).first; + if (database_it == database_mixers.end()) + { + table_it = table_mixers.try_emplace(key).first; + } + else + { + /// table policies are based on database ones + table_it = table_mixers.insert({key, database_it->second}).first; + } } - else - { - LOG_TRACE((&Poco::Logger::get("mixFiltersFor")), "table: not found, database found"); - table_it = table_mixers.insert({key, database_it->second}).first; - } - } - auto & mixer = table_it->second; // table_mixers[key]; getting table level mixer - mixer.database_and_table_name = info.database_and_table_name; - if (match) - { - mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); - mixer.policies.push_back(info.policy); + auto & mixer = table_it->second; /// getting table level mixer + mixer.database_and_table_name = info.database_and_table_name; + if (match) + { + mixer.mixer.add(info.parsed_filters[filter_type_i], policy.isRestrictive()); + mixer.policies.push_back(info.policy); + } } } } @@ -320,7 +298,8 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) auto mixed_filters = boost::make_shared(); - for (auto mixer_map_ptr : { &table_mixers, &database_mixers}) + /// retrieve aggregated policies from mixers + for (auto * mixer_map_ptr : {&table_mixers, &database_mixers}) { for (auto & [key, mixer] : *mixer_map_ptr) { @@ -332,7 +311,6 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) } } - enabled.mixed_filters.store(mixed_filters); } diff --git a/src/Access/RowPolicyCache.h b/src/Access/RowPolicyCache.h index 49e6a2e5df3..7260de22164 100644 --- a/src/Access/RowPolicyCache.h +++ b/src/Access/RowPolicyCache.h @@ -12,7 +12,6 @@ namespace DB class AccessControl; struct RolesOrUsersSet; struct RowPolicy; - using RowPolicyPtr = std::shared_ptr; /// Stores read and parsed row policies. 
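For illustration, the restructured mixFiltersFor() above builds the database-level (`ON db.*`) mixers first and then seeds each table-level mixer from the matching database mixer, so a table's filter always carries the database-wide conditions as well. A minimal sketch of the resulting filters, using hypothetical policy, table and column names (row policies may be created before the tables exist, so the statements run as-is):

``` sql
-- three RESTRICTIVE policies: one per-table, two database-wide
CREATE ROW POLICY pol_a ON db.t USING a = 1 AS restrictive TO ALL;
CREATE ROW POLICY pol_b ON db.* USING b = 2 AS restrictive TO ALL;
CREATE ROW POLICY pol_c ON db.* USING c = 3 AS restrictive TO ALL;

-- expected mixed filters:
--   db.t                  -> (a = 1) AND (b = 2) AND (c = 3)   (table mixer seeded from the database mixer)
--   any other table in db -> (b = 2) AND (c = 3)               (database mixer only)
```

A later commit in this series adds essentially the same worked example as a code comment next to this logic.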
@@ -30,24 +29,21 @@ private: explicit PolicyInfo(const RowPolicyPtr & policy_) { setPolicy(policy_); } void setPolicy(const RowPolicyPtr & policy_); + bool isDatabase() const { return policy->isDatabase(); } RowPolicyPtr policy; const RolesOrUsersSet * roles = nullptr; std::shared_ptr> database_and_table_name; ASTPtr parsed_filters[static_cast(RowPolicyFilterType::MAX)]; }; - using PolicyMap = std::unordered_map; - void ensureAllRowPoliciesRead(); void rowPolicyAddedOrChanged(const UUID & policy_id, const RowPolicyPtr & new_policy); void rowPolicyRemoved(const UUID & policy_id); void mixFilters(); void mixFiltersFor(EnabledRowPolicies & enabled); - const AccessControl & access_control; - PolicyMap database_policies; - PolicyMap table_policies; + std::unordered_map all_policies; bool all_policies_read = false; scope_guard subscription; std::map> enabled_row_policies; diff --git a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp index 03d0754ca1a..2c25fc14e7d 100644 --- a/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp +++ b/src/Parsers/Access/ParserCreateRowPolicyQuery.cpp @@ -203,7 +203,6 @@ namespace bool ParserCreateRowPolicyQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - // poco_assert("ParserCreateRowPolicyQuery::parseImpl" == nullptr); bool alter = false; if (attach_mode) { diff --git a/src/Parsers/Access/ParserRowPolicyName.cpp b/src/Parsers/Access/ParserRowPolicyName.cpp index e087c45fad9..e5b4e01d5ac 100644 --- a/src/Parsers/Access/ParserRowPolicyName.cpp +++ b/src/Parsers/Access/ParserRowPolicyName.cpp @@ -7,8 +7,6 @@ #include #include -#include - namespace DB { @@ -28,19 +26,18 @@ namespace return IParserBase::wrapParseImpl(pos, [&] { String res_database, res_table_name; - // if (!parseDatabaseAndTableName(pos, expected, res_database, res_table_name)) - bool any_database = false; - bool any_table = true; + bool is_any_database = false; + bool is_any_table = false; - if (!parseDatabaseAndTableNameOrAsterisks(pos, expected, res_database, any_database, res_table_name, any_table)) + if (!parseDatabaseAndTableNameOrAsterisks(pos, expected, res_database, is_any_database, res_table_name, is_any_table) + || is_any_database) { - // poco_assert("parseDatabaseAndTableName failed" == nullptr); - LOG_TRACE((&Poco::Logger::get("ParserRowPolicyName")), "parseDatabaseAndTableName failed"); return false; } - if (any_table) - res_table_name = "*"; - + else if (is_any_table) + { + res_table_name = "*"; // RowPolicy::ANY_TABLE_MARK + } /// If table is specified without DB it cannot be followed by "ON" /// (but can be followed by "ON CLUSTER"). 
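The practical effect of the parser change above: a row policy may now be scoped to a whole database by using an asterisk in the table position, while an asterisk in the database position is still rejected. A short sketch with hypothetical policy names (the error annotation mirrors the tests added later in this series):

``` sql
CREATE ROW POLICY pol_db  ON mydb.*       USING 1 AS PERMISSIVE TO ALL;  -- every table in mydb
CREATE ROW POLICY pol_cur ON *            USING 1 AS PERMISSIVE TO ALL;  -- every table in the current database
CREATE ROW POLICY pol_bad ON *.some_table USING 1 AS PERMISSIVE TO ALL;  -- { clientError 62 } -- wildcard database not supported
```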
@@ -64,10 +61,8 @@ namespace } - bool parseOnDBAndTableName(IParser::Pos & pos, Expected & expected, String & database, String & table_name) { - // poco_assert("parseOnDBAndTableNames" == nullptr); return IParserBase::wrapParseImpl(pos, [&] { return ParserKeyword{"ON"}.ignore(pos, expected) && parseDBAndTableName(pos, expected, database, table_name); @@ -77,9 +72,6 @@ namespace bool parseOnDBAndTableNames(IParser::Pos & pos, Expected & expected, std::vector> & database_and_table_names) { - // poco_assert("parseOnDBAndTableNames" == nullptr); - - return IParserBase::wrapParseImpl(pos, [&] { if (!ParserKeyword{"ON"}.ignore(pos, expected)) @@ -164,7 +156,6 @@ namespace bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; String cluster; if (!parseRowPolicyNamesAroundON(pos, expected, false, false, allow_on_cluster, full_names, cluster)) @@ -181,7 +172,6 @@ bool ParserRowPolicyName::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte bool ParserRowPolicyNames::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { - // poco_assert("ParserRowPolicyName::parseImpl" == nullptr); std::vector full_names; size_t num_added_names_last_time = 0; String cluster; diff --git a/src/Parsers/parseQuery.cpp b/src/Parsers/parseQuery.cpp index 1b1ff439e13..8d794409f78 100644 --- a/src/Parsers/parseQuery.cpp +++ b/src/Parsers/parseQuery.cpp @@ -139,8 +139,6 @@ void writeCommonErrorMessage( if (!query_description.empty()) out << " (" << query_description << ")"; - // poco_assert("writeCommonErrorMessage" == nullptr); - out << ": failed at position " << (last_token.begin - begin + 1); if (last_token.type == TokenType::EndOfStream || last_token.type == TokenType::Semicolon) diff --git a/tests/queries/0_stateless/02131_row_policies_combination.sql b/tests/queries/0_stateless/02131_row_policies_combination.sql index b5be672bb1b..1cbbca754b6 100644 --- a/tests/queries/0_stateless/02131_row_policies_combination.sql +++ b/tests/queries/0_stateless/02131_row_policies_combination.sql @@ -8,6 +8,8 @@ DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; +-- the test assumes users_without_row_policies_can_read_rows is true + SELECT 'None'; SELECT * FROM 02131_rptable; @@ -52,3 +54,9 @@ SELECT 'None'; SELECT * FROM 02131_rptable; DROP TABLE 02131_rptable; + +DROP ROW POLICY IF EXISTS 02131_filter_1 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_2 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; +DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; diff --git a/tests/queries/0_stateless/25341_row_policies_database_combination.reference b/tests/queries/0_stateless/25341_row_policies_database_combination.reference new file mode 100644 index 00000000000..68ed02d1dc0 --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policies_database_combination.reference @@ -0,0 +1,42 @@ +None +1 10 +2 20 +3 30 +4 40 +R1: x == 1 +1 10 +R1, R2: (x == 1) OR (x == 2) +1 10 +2 20 +R1, R2: (x == 2) FROM ANOTHER +2 20 +R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3) +1 10 +2 20 +3 30 +R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) +1 10 +2 20 +R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20) +2 20 +2 20 +R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM 
AFTER_RP +2 20 +R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER +2 20 +R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20) +2 20 +R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20) +R4, R5: (x <= 2) AND (y >= 20) +2 20 +R5: (x >= 2) +2 20 +3 30 +4 40 +Policy not applicable +None +1 10 +2 20 +3 30 +4 40 +No problematic policy, select works diff --git a/tests/queries/0_stateless/25341_row_policies_database_combination.sql b/tests/queries/0_stateless/25341_row_policies_database_combination.sql new file mode 100644 index 00000000000..aa9454b8c9b --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policies_database_combination.sql @@ -0,0 +1,88 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS 25341_db; +CREATE DATABASE 25341_db; +DROP TABLE IF EXISTS 25341_db.25341_rptable; +DROP TABLE IF EXISTS 25341_db.25341_rptable_another; +CREATE TABLE 25341_db.25341_rptable (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x; + +INSERT INTO 25341_db.25341_rptable VALUES (1, 10), (2, 20), (3, 30), (4, 40); + +CREATE TABLE 25341_db.25341_rptable_another ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; + + +DROP ROW POLICY IF EXISTS 25341_filter_1 ON 25341_db.25341_rptable; +DROP ROW POLICY IF EXISTS 25341_filter_2 ON 25341_db.*; +DROP ROW POLICY IF EXISTS 25341_filter_3 ON 25341_db.25341_rptable; +DROP ROW POLICY IF EXISTS 25341_filter_4 ON 25341_db.25341_rptable; +DROP ROW POLICY IF EXISTS 25341_filter_5 ON 25341_db.*; + +-- the test assumes users_without_row_policies_can_read_rows is true + +SELECT 'None'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable USING x=1 AS permissive TO ALL; +SELECT 'R1: x == 1'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_2 ON 25341_db.* USING x=2 AS permissive TO ALL; +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 25341_db.25341_rptable; + +SELECT 'R1, R2: (x == 2) FROM ANOTHER'; +SELECT * FROM 25341_db.25341_rptable_another; + +CREATE ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable USING x=3 AS permissive TO ALL; +SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_4 ON 25341_db.25341_rptable USING x<=2 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE ROW POLICY 25341_filter_5 ON 25341_db.* USING y>=20 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE TABLE 25341_db.25341_after_rp ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; +SELECT * FROM 25341_db.25341_after_rp; + +-- does not matter if policies or table are created first +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM AFTER_RP'; +SELECT * FROM 25341_db.25341_after_rp; + +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER'; +SELECT * FROM 25341_db.25341_rptable_another; + +DROP ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable; +SELECT 'R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +DROP ROW POLICY 25341_filter_2 ON 25341_db.*; +SELECT 'R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +DROP ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable; +SELECT 'R4, R5: (x <= 2) AND (y >= 20)'; +SELECT * FROM 25341_db.25341_rptable; + +DROP ROW POLICY 
25341_filter_4 ON 25341_db.25341_rptable; +SELECT 'R5: (x >= 2)'; +SELECT * FROM 25341_db.25341_rptable; + +CREATE TABLE 25341_db.25341_unexpected_columns (xx UInt8, yy UInt8) ENGINE = MergeTree ORDER BY xx; +SELECT 'Policy not applicable'; +SELECT * FROM 25341_db.25341_unexpected_columns; -- { serverError 47 } -- Missing columns: 'x' while processing query + +DROP ROW POLICY 25341_filter_5 ON 25341_db.*; +SELECT 'None'; +SELECT * FROM 25341_db.25341_rptable; + +SELECT 'No problematic policy, select works'; +SELECT 'Ok' FROM 25341_db.25341_unexpected_columns; + +DROP TABLE 25341_db.25341_rptable; +DROP TABLE 25341_db.25341_rptable_another; +DROP TABLE 25341_db.25341_unexpected_columns; +DROP DATABASE 25341_db; diff --git a/tests/queries/0_stateless/25341_row_policy_database.reference b/tests/queries/0_stateless/25341_row_policy_database.reference new file mode 100644 index 00000000000..57125b64056 --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policy_database.reference @@ -0,0 +1,22 @@ +-- database level policies + -- SHOW CREATE POLICY db1_25341 ON db1_25341.* +CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_25341.* +CREATE ROW POLICY `25341_filter_11` ON db1_25341.`25341_rqtable` FOR SELECT USING x = 2 TO ALL +CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY tbl1_25341 ON db1_25341.table FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_25341.`*` +CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL +R1, R2: (x == 1) OR (x == 2) +1 +2 +Check system.query_log +SELECT \'-- database level policies\'; [] +SELECT \' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_25341.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_25341.`*`\'; [] +SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] +SELECT * FROM 25341_rqtable_default; ['`25341_filter_11_db` ON default.`*`','`25341_filter_11` ON default.`25341_rqtable_default`'] +SELECT \'Check system.query_log\'; [] + -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE +CREATE ROW POLICY db2_25341 ON db1_25341.`*` TO u1_25341 diff --git a/tests/queries/0_stateless/25341_row_policy_database.sql b/tests/queries/0_stateless/25341_row_policy_database.sql new file mode 100644 index 00000000000..9d865487f0b --- /dev/null +++ b/tests/queries/0_stateless/25341_row_policy_database.sql @@ -0,0 +1,53 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS db1_25341; +DROP USER IF EXISTS u1_25341; +CREATE USER u1_25341; + +CREATE DATABASE db1_25341; + +CREATE TABLE db1_25341.25341_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; +INSERT INTO db1_25341.25341_rqtable VALUES (1), (2), (3), (4); + + +SELECT '-- database level policies'; +CREATE ROW POLICY db1_25341 ON db1_25341.* USING 1 AS PERMISSIVE TO ALL; +CREATE ROW POLICY tbl1_25341 ON db1_25341.table USING 1 AS PERMISSIVE TO ALL; +SELECT ' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*'; +SHOW CREATE POLICY db1_25341 ON db1_25341.*; +SELECT ' -- SHOW CREATE POLICY ON db1_25341.*'; +SHOW CREATE POLICY ON db1_25341.*; +SELECT ' -- SHOW CREATE POLICY ON db1_25341.`*`'; +SHOW CREATE POLICY ON db1_25341.`*`; +DROP POLICY db1_25341 ON db1_25341.*; +DROP POLICY tbl1_25341 ON db1_25341.table; + +CREATE ROW POLICY any_25341 ON *.some_table USING 1 AS PERMISSIVE TO ALL; -- { clientError 62 } + +CREATE TABLE 25341_rqtable_default (x UInt8) ENGINE = MergeTree ORDER BY x; + +CREATE ROW POLICY 25341_filter_11_db ON * USING x=1 AS permissive TO ALL; +CREATE ROW POLICY 
25341_filter_11 ON 25341_rqtable_default USING x=2 AS permissive TO ALL; + +INSERT INTO 25341_rqtable_default VALUES (1), (2), (3), (4); + +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 25341_rqtable_default; + +DROP TABLE 25341_rqtable_default; + +SELECT 'Check system.query_log'; +SYSTEM FLUSH LOGS; +SELECT query, used_row_policies FROM system.query_log WHERE current_database == currentDatabase() AND type == 'QueryStart' AND query_kind == 'Select' ORDER BY event_time_microseconds; + +DROP ROW POLICY 25341_filter_11_db ON *; +DROP ROW POLICY 25341_filter_11 ON 25341_rqtable_default; + +USE db1_25341; +SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; +CREATE ROW POLICY db2_25341 ON * TO u1_25341; +SHOW CREATE POLICY db2_25341 ON *; + +DROP ROW POLICY db2_25341 ON *; + +DROP USER u1_25341; From 28c6d4fb630c47f821d2225677ec305a8fdfd883 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 30 Mar 2023 01:29:02 +0200 Subject: [PATCH 037/127] row policy template - rollback old test to make stylecheck happy --- .../0_stateless/02131_row_policies_combination.sql | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/queries/0_stateless/02131_row_policies_combination.sql b/tests/queries/0_stateless/02131_row_policies_combination.sql index 1cbbca754b6..b5be672bb1b 100644 --- a/tests/queries/0_stateless/02131_row_policies_combination.sql +++ b/tests/queries/0_stateless/02131_row_policies_combination.sql @@ -8,8 +8,6 @@ DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; --- the test assumes users_without_row_policies_can_read_rows is true - SELECT 'None'; SELECT * FROM 02131_rptable; @@ -54,9 +52,3 @@ SELECT 'None'; SELECT * FROM 02131_rptable; DROP TABLE 02131_rptable; - -DROP ROW POLICY IF EXISTS 02131_filter_1 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_2 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_3 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_4 ON 02131_rptable; -DROP ROW POLICY IF EXISTS 02131_filter_5 ON 02131_rptable; From 2ed8e318cfd9d857869de601ed08ff4d9904a7b1 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 30 Mar 2023 10:13:50 +0200 Subject: [PATCH 038/127] row policy template - tests renumbered to keep monotony --- ...w_policies_database_combination.reference} | 0 ...2703_row_policies_database_combination.sql | 88 +++++++++++++++++++ .../02703_row_policy_database.reference | 21 +++++ .../0_stateless/02703_row_policy_database.sql | 53 +++++++++++ ...5341_row_policies_database_combination.sql | 88 ------------------- .../25341_row_policy_database.reference | 22 ----- .../0_stateless/25341_row_policy_database.sql | 53 ----------- 7 files changed, 162 insertions(+), 163 deletions(-) rename tests/queries/0_stateless/{25341_row_policies_database_combination.reference => 02703_row_policies_database_combination.reference} (100%) create mode 100644 tests/queries/0_stateless/02703_row_policies_database_combination.sql create mode 100644 tests/queries/0_stateless/02703_row_policy_database.reference create mode 100644 tests/queries/0_stateless/02703_row_policy_database.sql delete mode 100644 tests/queries/0_stateless/25341_row_policies_database_combination.sql delete mode 100644 tests/queries/0_stateless/25341_row_policy_database.reference delete mode 100644 tests/queries/0_stateless/25341_row_policy_database.sql diff --git 
a/tests/queries/0_stateless/25341_row_policies_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_database_combination.reference similarity index 100% rename from tests/queries/0_stateless/25341_row_policies_database_combination.reference rename to tests/queries/0_stateless/02703_row_policies_database_combination.reference diff --git a/tests/queries/0_stateless/02703_row_policies_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_database_combination.sql new file mode 100644 index 00000000000..f9b466f1ade --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policies_database_combination.sql @@ -0,0 +1,88 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS 02703_db; +CREATE DATABASE 02703_db; +DROP TABLE IF EXISTS 02703_db.02703_rptable; +DROP TABLE IF EXISTS 02703_db.02703_rptable_another; +CREATE TABLE 02703_db.02703_rptable (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x; + +INSERT INTO 02703_db.02703_rptable VALUES (1, 10), (2, 20), (3, 30), (4, 40); + +CREATE TABLE 02703_db.02703_rptable_another ENGINE = MergeTree ORDER BY x AS SELECT * FROM 02703_db.02703_rptable; + + +DROP ROW POLICY IF EXISTS 02703_filter_1 ON 02703_db.02703_rptable; +DROP ROW POLICY IF EXISTS 02703_filter_2 ON 02703_db.*; +DROP ROW POLICY IF EXISTS 02703_filter_3 ON 02703_db.02703_rptable; +DROP ROW POLICY IF EXISTS 02703_filter_4 ON 02703_db.02703_rptable; +DROP ROW POLICY IF EXISTS 02703_filter_5 ON 02703_db.*; + +-- the test assumes users_without_row_policies_can_read_rows is true + +SELECT 'None'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_1 ON 02703_db.02703_rptable USING x=1 AS permissive TO ALL; +SELECT 'R1: x == 1'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_2 ON 02703_db.* USING x=2 AS permissive TO ALL; +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 02703_db.02703_rptable; + +SELECT 'R1, R2: (x == 2) FROM ANOTHER'; +SELECT * FROM 02703_db.02703_rptable_another; + +CREATE ROW POLICY 02703_filter_3 ON 02703_db.02703_rptable USING x=3 AS permissive TO ALL; +SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_4 ON 02703_db.02703_rptable USING x<=2 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE ROW POLICY 02703_filter_5 ON 02703_db.* USING y>=20 AS restrictive TO ALL; +SELECT 'R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE TABLE 02703_db.02703_after_rp ENGINE = MergeTree ORDER BY x AS SELECT * FROM 02703_db.02703_rptable; +SELECT * FROM 02703_db.02703_after_rp; + +-- does not matter if policies or table are created first +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM AFTER_RP'; +SELECT * FROM 02703_db.02703_after_rp; + +SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER'; +SELECT * FROM 02703_db.02703_rptable_another; + +DROP ROW POLICY 02703_filter_1 ON 02703_db.02703_rptable; +SELECT 'R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +DROP ROW POLICY 02703_filter_2 ON 02703_db.*; +SELECT 'R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +DROP ROW POLICY 02703_filter_3 ON 02703_db.02703_rptable; +SELECT 'R4, R5: (x <= 2) AND (y >= 20)'; +SELECT * FROM 02703_db.02703_rptable; + +DROP ROW POLICY 02703_filter_4 ON 
02703_db.02703_rptable; +SELECT 'R5: (x >= 2)'; +SELECT * FROM 02703_db.02703_rptable; + +CREATE TABLE 02703_db.02703_unexpected_columns (xx UInt8, yy UInt8) ENGINE = MergeTree ORDER BY xx; +SELECT 'Policy not applicable'; +SELECT * FROM 02703_db.02703_unexpected_columns; -- { serverError 47 } -- Missing columns: 'x' while processing query + +DROP ROW POLICY 02703_filter_5 ON 02703_db.*; +SELECT 'None'; +SELECT * FROM 02703_db.02703_rptable; + +SELECT 'No problematic policy, select works'; +SELECT 'Ok' FROM 02703_db.02703_unexpected_columns; + +DROP TABLE 02703_db.02703_rptable; +DROP TABLE 02703_db.02703_rptable_another; +DROP TABLE 02703_db.02703_unexpected_columns; +DROP DATABASE 02703_db; diff --git a/tests/queries/0_stateless/02703_row_policy_database.reference b/tests/queries/0_stateless/02703_row_policy_database.reference new file mode 100644 index 00000000000..e318dfac429 --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policy_database.reference @@ -0,0 +1,21 @@ +-- database level policies + -- SHOW CREATE POLICY db1_02703 ON db1_02703.* +CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_02703.* +CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY tbl1_02703 ON db1_02703.table FOR SELECT USING 1 TO ALL + -- SHOW CREATE POLICY ON db1_02703.`*` +CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +R1, R2: (x == 1) OR (x == 2) +1 +2 +Check system.query_log +SELECT \'-- database level policies\'; [] +SELECT \' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_02703.*\'; [] +SELECT \' -- SHOW CREATE POLICY ON db1_02703.`*`\'; [] +SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] +SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.`*`','`02703_filter_11` ON default.`02703_rqtable_default`'] +SELECT \'Check system.query_log\'; [] + -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE +CREATE ROW POLICY db2_02703 ON db1_02703.`*` TO u1_02703 diff --git a/tests/queries/0_stateless/02703_row_policy_database.sql b/tests/queries/0_stateless/02703_row_policy_database.sql new file mode 100644 index 00000000000..85f5a44dfbf --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policy_database.sql @@ -0,0 +1,53 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS db1_02703; +DROP USER IF EXISTS u1_02703; +CREATE USER u1_02703; + +CREATE DATABASE db1_02703; + +CREATE TABLE db1_02703.02703_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; +INSERT INTO db1_02703.02703_rqtable VALUES (1), (2), (3), (4); + + +SELECT '-- database level policies'; +CREATE ROW POLICY db1_02703 ON db1_02703.* USING 1 AS PERMISSIVE TO ALL; +CREATE ROW POLICY tbl1_02703 ON db1_02703.table USING 1 AS PERMISSIVE TO ALL; +SELECT ' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*'; +SHOW CREATE POLICY db1_02703 ON db1_02703.*; +SELECT ' -- SHOW CREATE POLICY ON db1_02703.*'; +SHOW CREATE POLICY ON db1_02703.*; +SELECT ' -- SHOW CREATE POLICY ON db1_02703.`*`'; +SHOW CREATE POLICY ON db1_02703.`*`; +DROP POLICY db1_02703 ON db1_02703.*; +DROP POLICY tbl1_02703 ON db1_02703.table; + +CREATE ROW POLICY any_02703 ON *.some_table USING 1 AS PERMISSIVE TO ALL; -- { clientError 62 } + +CREATE TABLE 02703_rqtable_default (x UInt8) ENGINE = MergeTree ORDER BY x; + +CREATE ROW POLICY 02703_filter_11_db ON * USING x=1 AS permissive TO ALL; +CREATE ROW POLICY 02703_filter_11 ON 02703_rqtable_default USING x=2 AS permissive TO ALL; + +INSERT INTO 02703_rqtable_default 
VALUES (1), (2), (3), (4); + +SELECT 'R1, R2: (x == 1) OR (x == 2)'; +SELECT * FROM 02703_rqtable_default; + +DROP TABLE 02703_rqtable_default; + +SELECT 'Check system.query_log'; +SYSTEM FLUSH LOGS; +SELECT query, used_row_policies FROM system.query_log WHERE current_database == currentDatabase() AND type == 'QueryStart' AND query_kind == 'Select' ORDER BY event_time_microseconds; + +DROP ROW POLICY 02703_filter_11_db ON *; +DROP ROW POLICY 02703_filter_11 ON 02703_rqtable_default; + +USE db1_02703; +SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; +CREATE ROW POLICY db2_02703 ON * TO u1_02703; +SHOW CREATE POLICY db2_02703 ON *; + +DROP ROW POLICY db2_02703 ON *; + +DROP USER u1_02703; diff --git a/tests/queries/0_stateless/25341_row_policies_database_combination.sql b/tests/queries/0_stateless/25341_row_policies_database_combination.sql deleted file mode 100644 index aa9454b8c9b..00000000000 --- a/tests/queries/0_stateless/25341_row_policies_database_combination.sql +++ /dev/null @@ -1,88 +0,0 @@ --- Tags: no-parallel - -DROP DATABASE IF EXISTS 25341_db; -CREATE DATABASE 25341_db; -DROP TABLE IF EXISTS 25341_db.25341_rptable; -DROP TABLE IF EXISTS 25341_db.25341_rptable_another; -CREATE TABLE 25341_db.25341_rptable (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x; - -INSERT INTO 25341_db.25341_rptable VALUES (1, 10), (2, 20), (3, 30), (4, 40); - -CREATE TABLE 25341_db.25341_rptable_another ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; - - -DROP ROW POLICY IF EXISTS 25341_filter_1 ON 25341_db.25341_rptable; -DROP ROW POLICY IF EXISTS 25341_filter_2 ON 25341_db.*; -DROP ROW POLICY IF EXISTS 25341_filter_3 ON 25341_db.25341_rptable; -DROP ROW POLICY IF EXISTS 25341_filter_4 ON 25341_db.25341_rptable; -DROP ROW POLICY IF EXISTS 25341_filter_5 ON 25341_db.*; - --- the test assumes users_without_row_policies_can_read_rows is true - -SELECT 'None'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable USING x=1 AS permissive TO ALL; -SELECT 'R1: x == 1'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_2 ON 25341_db.* USING x=2 AS permissive TO ALL; -SELECT 'R1, R2: (x == 1) OR (x == 2)'; -SELECT * FROM 25341_db.25341_rptable; - -SELECT 'R1, R2: (x == 2) FROM ANOTHER'; -SELECT * FROM 25341_db.25341_rptable_another; - -CREATE ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable USING x=3 AS permissive TO ALL; -SELECT 'R1, R2, R3: (x == 1) OR (x == 2) OR (x == 3)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_4 ON 25341_db.25341_rptable USING x<=2 AS restrictive TO ALL; -SELECT 'R1, R2, R3, R4: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE ROW POLICY 25341_filter_5 ON 25341_db.* USING y>=20 AS restrictive TO ALL; -SELECT 'R1, R2, R3, R4, R5: ((x == 1) OR (x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE TABLE 25341_db.25341_after_rp ENGINE = MergeTree ORDER BY x AS SELECT * FROM 25341_db.25341_rptable; -SELECT * FROM 25341_db.25341_after_rp; - --- does not matter if policies or table are created first -SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM AFTER_RP'; -SELECT * FROM 25341_db.25341_after_rp; - -SELECT 'R1, R2, R3, R4, R5: (x == 2) AND (y >= 20) FROM ANOTHER'; -SELECT * FROM 25341_db.25341_rptable_another; - -DROP ROW POLICY 25341_filter_1 ON 25341_db.25341_rptable; -SELECT 'R2, R3, R4, R5: ((x == 2) OR (x == 3)) AND (x <= 2) AND (y >= 20)'; 
-SELECT * FROM 25341_db.25341_rptable; - -DROP ROW POLICY 25341_filter_2 ON 25341_db.*; -SELECT 'R3, R4, R5: (x == 3) AND (x <= 2) AND (y >= 20)'; -SELECT * FROM 25341_db.25341_rptable; - -DROP ROW POLICY 25341_filter_3 ON 25341_db.25341_rptable; -SELECT 'R4, R5: (x <= 2) AND (y >= 20)'; -SELECT * FROM 25341_db.25341_rptable; - -DROP ROW POLICY 25341_filter_4 ON 25341_db.25341_rptable; -SELECT 'R5: (x >= 2)'; -SELECT * FROM 25341_db.25341_rptable; - -CREATE TABLE 25341_db.25341_unexpected_columns (xx UInt8, yy UInt8) ENGINE = MergeTree ORDER BY xx; -SELECT 'Policy not applicable'; -SELECT * FROM 25341_db.25341_unexpected_columns; -- { serverError 47 } -- Missing columns: 'x' while processing query - -DROP ROW POLICY 25341_filter_5 ON 25341_db.*; -SELECT 'None'; -SELECT * FROM 25341_db.25341_rptable; - -SELECT 'No problematic policy, select works'; -SELECT 'Ok' FROM 25341_db.25341_unexpected_columns; - -DROP TABLE 25341_db.25341_rptable; -DROP TABLE 25341_db.25341_rptable_another; -DROP TABLE 25341_db.25341_unexpected_columns; -DROP DATABASE 25341_db; diff --git a/tests/queries/0_stateless/25341_row_policy_database.reference b/tests/queries/0_stateless/25341_row_policy_database.reference deleted file mode 100644 index 57125b64056..00000000000 --- a/tests/queries/0_stateless/25341_row_policy_database.reference +++ /dev/null @@ -1,22 +0,0 @@ --- database level policies - -- SHOW CREATE POLICY db1_25341 ON db1_25341.* -CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL - -- SHOW CREATE POLICY ON db1_25341.* -CREATE ROW POLICY `25341_filter_11` ON db1_25341.`25341_rqtable` FOR SELECT USING x = 2 TO ALL -CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL -CREATE ROW POLICY tbl1_25341 ON db1_25341.table FOR SELECT USING 1 TO ALL - -- SHOW CREATE POLICY ON db1_25341.`*` -CREATE ROW POLICY db1_25341 ON db1_25341.`*` FOR SELECT USING 1 TO ALL -R1, R2: (x == 1) OR (x == 2) -1 -2 -Check system.query_log -SELECT \'-- database level policies\'; [] -SELECT \' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*\'; [] -SELECT \' -- SHOW CREATE POLICY ON db1_25341.*\'; [] -SELECT \' -- SHOW CREATE POLICY ON db1_25341.`*`\'; [] -SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] -SELECT * FROM 25341_rqtable_default; ['`25341_filter_11_db` ON default.`*`','`25341_filter_11` ON default.`25341_rqtable_default`'] -SELECT \'Check system.query_log\'; [] - -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE -CREATE ROW POLICY db2_25341 ON db1_25341.`*` TO u1_25341 diff --git a/tests/queries/0_stateless/25341_row_policy_database.sql b/tests/queries/0_stateless/25341_row_policy_database.sql deleted file mode 100644 index 9d865487f0b..00000000000 --- a/tests/queries/0_stateless/25341_row_policy_database.sql +++ /dev/null @@ -1,53 +0,0 @@ --- Tags: no-parallel - -DROP DATABASE IF EXISTS db1_25341; -DROP USER IF EXISTS u1_25341; -CREATE USER u1_25341; - -CREATE DATABASE db1_25341; - -CREATE TABLE db1_25341.25341_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; -INSERT INTO db1_25341.25341_rqtable VALUES (1), (2), (3), (4); - - -SELECT '-- database level policies'; -CREATE ROW POLICY db1_25341 ON db1_25341.* USING 1 AS PERMISSIVE TO ALL; -CREATE ROW POLICY tbl1_25341 ON db1_25341.table USING 1 AS PERMISSIVE TO ALL; -SELECT ' -- SHOW CREATE POLICY db1_25341 ON db1_25341.*'; -SHOW CREATE POLICY db1_25341 ON db1_25341.*; -SELECT ' -- SHOW CREATE POLICY ON db1_25341.*'; -SHOW CREATE POLICY ON db1_25341.*; -SELECT ' -- SHOW CREATE POLICY ON db1_25341.`*`'; -SHOW CREATE POLICY ON db1_25341.`*`; 
-DROP POLICY db1_25341 ON db1_25341.*; -DROP POLICY tbl1_25341 ON db1_25341.table; - -CREATE ROW POLICY any_25341 ON *.some_table USING 1 AS PERMISSIVE TO ALL; -- { clientError 62 } - -CREATE TABLE 25341_rqtable_default (x UInt8) ENGINE = MergeTree ORDER BY x; - -CREATE ROW POLICY 25341_filter_11_db ON * USING x=1 AS permissive TO ALL; -CREATE ROW POLICY 25341_filter_11 ON 25341_rqtable_default USING x=2 AS permissive TO ALL; - -INSERT INTO 25341_rqtable_default VALUES (1), (2), (3), (4); - -SELECT 'R1, R2: (x == 1) OR (x == 2)'; -SELECT * FROM 25341_rqtable_default; - -DROP TABLE 25341_rqtable_default; - -SELECT 'Check system.query_log'; -SYSTEM FLUSH LOGS; -SELECT query, used_row_policies FROM system.query_log WHERE current_database == currentDatabase() AND type == 'QueryStart' AND query_kind == 'Select' ORDER BY event_time_microseconds; - -DROP ROW POLICY 25341_filter_11_db ON *; -DROP ROW POLICY 25341_filter_11 ON 25341_rqtable_default; - -USE db1_25341; -SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; -CREATE ROW POLICY db2_25341 ON * TO u1_25341; -SHOW CREATE POLICY db2_25341 ON *; - -DROP ROW POLICY db2_25341 ON *; - -DROP USER u1_25341; From 6852ae0d938131863138ed532ed291acbf750444 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Tue, 2 May 2023 21:11:18 +0000 Subject: [PATCH 039/127] row_policy_template - external code review changes --- src/Access/EnabledRowPolicies.cpp | 2 +- src/Access/RowPolicy.h | 4 +++- src/Access/RowPolicyCache.cpp | 16 ++++++++++++---- src/Access/RowPolicyCache.h | 2 +- ...ase.sql => 02703_row_policy_for_database.sql} | 0 5 files changed, 17 insertions(+), 7 deletions(-) rename tests/queries/0_stateless/{02703_row_policy_database.sql => 02703_row_policy_for_database.sql} (100%) diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index 9efac6e992e..be78dd62146 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -35,7 +35,7 @@ RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const auto loaded = mixed_filters.load(); auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) - { + { /// Look for a policy for database if a table policy not found it = loaded->find({database, RowPolicy::ANY_TABLE_MARK, filter_type}); if (it == loaded->end()) { diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index b9ba528e9bb..348ebfa1637 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -36,7 +36,9 @@ struct RowPolicy : public IAccessEntity /// in addition to all the restrictive policies. void setPermissive(bool permissive_ = true) { setRestrictive(!permissive_); } bool isPermissive() const { return !isRestrictive(); } - bool isDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } + + /// Applied for entire database + bool isForDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } /// Sets that the policy is restrictive. 
/// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 07bec185131..07670a8fe84 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -218,14 +218,13 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) std::vector policies; }; - std::unordered_map table_mixers; std::unordered_map database_mixers; /// populate database_mixers using database-level policies /// to aggregate (mix) rules per database for (const auto & [policy_id, info] : all_policies) { - if (info.isDatabase()) + if (info.isForDatabase()) { const auto & policy = *info.policy; bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); @@ -250,10 +249,12 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) } } + std::unordered_map table_mixers; + /// populate table_mixers using database_mixers and table-level policies for (const auto & [policy_id, info] : all_policies) { - if (!info.isDatabase()) + if (!info.isForDatabase()) { const auto & policy = *info.policy; bool match = info.roles->match(enabled.params.user_id, enabled.params.enabled_roles); @@ -298,7 +299,14 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) auto mixed_filters = boost::make_shared(); - /// retrieve aggregated policies from mixers + /// Retrieve aggregated policies from mixers + /// if a table has a policy for this particular table, we have all needed information in table_mixers + /// (policies for the database are already applied) + /// otherwise we would look for a policy for database using RowPolicy::ANY_TABLE_MARK + /// Consider restrictive policies a=1 for db.t, b=2 for db.* and c=3 for db.* + /// We are going to have two items in mixed_filters: + /// 1. a=1 AND b=2 AND c=3 for db.t (comes from table_mixers, where it had been created with the help of database_mixers) + /// 2. 
b=2 AND c=3 for db.* (comes directly from database_mixers) for (auto * mixer_map_ptr : {&table_mixers, &database_mixers}) { for (auto & [key, mixer] : *mixer_map_ptr) diff --git a/src/Access/RowPolicyCache.h b/src/Access/RowPolicyCache.h index 7260de22164..df263416509 100644 --- a/src/Access/RowPolicyCache.h +++ b/src/Access/RowPolicyCache.h @@ -29,7 +29,7 @@ private: explicit PolicyInfo(const RowPolicyPtr & policy_) { setPolicy(policy_); } void setPolicy(const RowPolicyPtr & policy_); - bool isDatabase() const { return policy->isDatabase(); } + bool isForDatabase() const { return policy->isForDatabase(); } RowPolicyPtr policy; const RolesOrUsersSet * roles = nullptr; std::shared_ptr> database_and_table_name; diff --git a/tests/queries/0_stateless/02703_row_policy_database.sql b/tests/queries/0_stateless/02703_row_policy_for_database.sql similarity index 100% rename from tests/queries/0_stateless/02703_row_policy_database.sql rename to tests/queries/0_stateless/02703_row_policy_for_database.sql From d54a62e8e9ac1db9b0e37b75902e6762e520e3aa Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Wed, 3 May 2023 08:56:45 +0000 Subject: [PATCH 040/127] row_policy_template - forgotten reference test file --- ...database.reference => 02703_row_policy_for_database.reference} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02703_row_policy_database.reference => 02703_row_policy_for_database.reference} (100%) diff --git a/tests/queries/0_stateless/02703_row_policy_database.reference b/tests/queries/0_stateless/02703_row_policy_for_database.reference similarity index 100% rename from tests/queries/0_stateless/02703_row_policy_database.reference rename to tests/queries/0_stateless/02703_row_policy_for_database.reference From 9685aa0c9174d11905871aad2bd598e03d017bcd Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Thu, 4 May 2023 14:35:31 +0000 Subject: [PATCH 041/127] row_policy_template - minor changes, test renamed --- ... 
=> 02703_row_policies_for_database_combination.reference} | 0 ...on.sql => 02703_row_policies_for_database_combination.sql} | 0 tests/queries/0_stateless/02703_row_policy_for_database.sql | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename tests/queries/0_stateless/{02703_row_policies_database_combination.reference => 02703_row_policies_for_database_combination.reference} (100%) rename tests/queries/0_stateless/{02703_row_policies_database_combination.sql => 02703_row_policies_for_database_combination.sql} (100%) diff --git a/tests/queries/0_stateless/02703_row_policies_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference similarity index 100% rename from tests/queries/0_stateless/02703_row_policies_database_combination.reference rename to tests/queries/0_stateless/02703_row_policies_for_database_combination.reference diff --git a/tests/queries/0_stateless/02703_row_policies_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql similarity index 100% rename from tests/queries/0_stateless/02703_row_policies_database_combination.sql rename to tests/queries/0_stateless/02703_row_policies_for_database_combination.sql diff --git a/tests/queries/0_stateless/02703_row_policy_for_database.sql b/tests/queries/0_stateless/02703_row_policy_for_database.sql index 85f5a44dfbf..03183a96b98 100644 --- a/tests/queries/0_stateless/02703_row_policy_for_database.sql +++ b/tests/queries/0_stateless/02703_row_policy_for_database.sql @@ -10,7 +10,7 @@ CREATE TABLE db1_02703.02703_rqtable (x UInt8) ENGINE = MergeTree ORDER BY x; INSERT INTO db1_02703.02703_rqtable VALUES (1), (2), (3), (4); -SELECT '-- database level policies'; +SELECT '-- row policies for database'; CREATE ROW POLICY db1_02703 ON db1_02703.* USING 1 AS PERMISSIVE TO ALL; CREATE ROW POLICY tbl1_02703 ON db1_02703.table USING 1 AS PERMISSIVE TO ALL; SELECT ' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*'; @@ -44,7 +44,7 @@ DROP ROW POLICY 02703_filter_11_db ON *; DROP ROW POLICY 02703_filter_11 ON 02703_rqtable_default; USE db1_02703; -SELECT ' -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE'; +SELECT ' -- CREATE DATABASE-LEVEL POLICY IN CURRENT DATABASE'; CREATE ROW POLICY db2_02703 ON * TO u1_02703; SHOW CREATE POLICY db2_02703 ON *; From 1027db6acaadbb20fcfedaac3416d98bd75b8e7d Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 5 May 2023 11:06:15 +0000 Subject: [PATCH 042/127] row_policy_template - minor change, reference test file fix --- .../0_stateless/02703_row_policy_for_database.reference | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02703_row_policy_for_database.reference b/tests/queries/0_stateless/02703_row_policy_for_database.reference index e318dfac429..5f0b486c1bd 100644 --- a/tests/queries/0_stateless/02703_row_policy_for_database.reference +++ b/tests/queries/0_stateless/02703_row_policy_for_database.reference @@ -1,4 +1,4 @@ --- database level policies +-- row policies for database -- SHOW CREATE POLICY db1_02703 ON db1_02703.* CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL -- SHOW CREATE POLICY ON db1_02703.* @@ -10,12 +10,12 @@ R1, R2: (x == 1) OR (x == 2) 1 2 Check system.query_log -SELECT \'-- database level policies\'; [] +SELECT \'-- row policies for database\'; [] SELECT \' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON 
db1_02703.`*`\'; [] SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.`*`','`02703_filter_11` ON default.`02703_rqtable_default`'] SELECT \'Check system.query_log\'; [] - -- CREATE DATABSE-LEVEL POLICY ON IN CURRENT DATABASE + -- CREATE DATABASE-LEVEL POLICY IN CURRENT DATABASE CREATE ROW POLICY db2_02703 ON db1_02703.`*` TO u1_02703 From 9ef610040f691bc39ca711566de9feb2fbde44f1 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 12 May 2023 22:30:00 +0000 Subject: [PATCH 043/127] row_policy_template - table `*` is respected --- docs/en/operations/system-tables/row_policies.md | 2 +- src/Access/Common/RowPolicyDefs.cpp | 2 +- src/Access/Common/RowPolicyDefs.h | 2 ++ src/Access/EnabledRowPolicies.cpp | 10 +++++++++- src/Access/RowPolicy.h | 4 +--- src/Access/RowPolicyCache.cpp | 2 +- src/Parsers/Access/ASTRowPolicyName.cpp | 11 ++++++++--- src/Parsers/Access/ASTRowPolicyName.h | 3 +++ src/Parsers/Access/ParserRowPolicyName.cpp | 2 +- ...03_row_policies_for_database_combination.reference | 2 ++ .../02703_row_policies_for_database_combination.sql | 10 ++++++++++ .../02703_row_policy_for_database.reference | 9 ++++----- 12 files changed, 43 insertions(+), 16 deletions(-) diff --git a/docs/en/operations/system-tables/row_policies.md b/docs/en/operations/system-tables/row_policies.md index 2c4d060ce66..e92ba1ece74 100644 --- a/docs/en/operations/system-tables/row_policies.md +++ b/docs/en/operations/system-tables/row_policies.md @@ -12,7 +12,7 @@ Columns: - `database` ([String](../../sql-reference/data-types/string.md)) — Database name. -- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `table` ([String](../../sql-reference/data-types/string.md)) — Table name. Empty if policy for database. - `id` ([UUID](../../sql-reference/data-types/uuid.md)) — Row policy ID. diff --git a/src/Access/Common/RowPolicyDefs.cpp b/src/Access/Common/RowPolicyDefs.cpp index ba7856116f6..b1f882fe971 100644 --- a/src/Access/Common/RowPolicyDefs.cpp +++ b/src/Access/Common/RowPolicyDefs.cpp @@ -22,7 +22,7 @@ String RowPolicyName::toString() const name += backQuoteIfNeed(database); name += '.'; } - name += backQuoteIfNeed(table_name); + name += (table_name == RowPolicyName::ANY_TABLE_MARK ? "*" : backQuoteIfNeed(table_name)); return name; } diff --git a/src/Access/Common/RowPolicyDefs.h b/src/Access/Common/RowPolicyDefs.h index 792884c56df..7ffc99e1272 100644 --- a/src/Access/Common/RowPolicyDefs.h +++ b/src/Access/Common/RowPolicyDefs.h @@ -9,6 +9,8 @@ namespace DB /// Represents the full name of a row policy, e.g. "myfilter ON mydb.mytable". 
struct RowPolicyName { + static constexpr char ANY_TABLE_MARK[] = ""; + String short_name; String database; String table_name; diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index be78dd62146..601f004e3ea 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -36,11 +36,19 @@ RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const auto it = loaded->find({database, table_name, filter_type}); if (it == loaded->end()) { /// Look for a policy for database if a table policy not found - it = loaded->find({database, RowPolicy::ANY_TABLE_MARK, filter_type}); + it = loaded->find({database, RowPolicyName::ANY_TABLE_MARK, filter_type}); if (it == loaded->end()) { return {}; } + else + { + // deep copy found policy for database and change its table name to the actual one + auto policy_for_database = std::make_shared(*it->second); + auto database_and_table_name = std::make_shared>(database, table_name); + policy_for_database->database_and_table_name = database_and_table_name; + return policy_for_database; + } } return it->second; diff --git a/src/Access/RowPolicy.h b/src/Access/RowPolicy.h index 348ebfa1637..9c190458620 100644 --- a/src/Access/RowPolicy.h +++ b/src/Access/RowPolicy.h @@ -14,8 +14,6 @@ namespace DB */ struct RowPolicy : public IAccessEntity { - static constexpr char ANY_TABLE_MARK[] = "*"; - void setShortName(const String & short_name); void setDatabase(const String & database); void setTableName(const String & table_name); @@ -38,7 +36,7 @@ struct RowPolicy : public IAccessEntity bool isPermissive() const { return !isRestrictive(); } /// Applied for entire database - bool isForDatabase() const { return full_name.table_name == ANY_TABLE_MARK; } + bool isForDatabase() const { return full_name.table_name == RowPolicyName::ANY_TABLE_MARK; } /// Sets that the policy is restrictive. /// A row is only accessible if at least one of the permissive policies passes, diff --git a/src/Access/RowPolicyCache.cpp b/src/Access/RowPolicyCache.cpp index 07670a8fe84..bb9da674477 100644 --- a/src/Access/RowPolicyCache.cpp +++ b/src/Access/RowPolicyCache.cpp @@ -270,7 +270,7 @@ void RowPolicyCache::mixFiltersFor(EnabledRowPolicies & enabled) if (table_it == table_mixers.end()) { /// no exact match - create new mixer MixedFiltersKey database_key = key; - database_key.table_name = RowPolicy::ANY_TABLE_MARK; + database_key.table_name = RowPolicyName::ANY_TABLE_MARK; auto database_it = database_mixers.find(database_key); diff --git a/src/Parsers/Access/ASTRowPolicyName.cpp b/src/Parsers/Access/ASTRowPolicyName.cpp index 4edfa61f10e..81a90de9d53 100644 --- a/src/Parsers/Access/ASTRowPolicyName.cpp +++ b/src/Parsers/Access/ASTRowPolicyName.cpp @@ -30,6 +30,11 @@ void ASTRowPolicyName::replaceEmptyDatabase(const String & current_database) full_name.database = current_database; } +String ASTRowPolicyNames::tableOrAsterisk(const String & table_name) const +{ + return table_name == RowPolicyName::ANY_TABLE_MARK ? 
"*" : backQuoteIfNeed(table_name); +} + void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const { @@ -73,7 +78,7 @@ void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState const String & table_name = full_name.table_name; if (!database.empty()) settings.ostr << backQuoteIfNeed(database) + "."; - settings.ostr << backQuoteIfNeed(table_name); + settings.ostr << tableOrAsterisk(table_name); } } else if (same_db_and_table_name) @@ -92,7 +97,7 @@ void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState settings.ostr << (settings.hilite ? hilite_keyword : "") << " ON " << (settings.hilite ? hilite_none : ""); if (!database.empty()) settings.ostr << backQuoteIfNeed(database) + "."; - settings.ostr << backQuoteIfNeed(table_name); + settings.ostr << tableOrAsterisk(table_name); } else { @@ -108,7 +113,7 @@ void ASTRowPolicyNames::formatImpl(const FormatSettings & settings, FormatState << (settings.hilite ? hilite_none : ""); if (!database.empty()) settings.ostr << backQuoteIfNeed(database) + "."; - settings.ostr << backQuoteIfNeed(table_name); + settings.ostr << tableOrAsterisk(table_name); } } diff --git a/src/Parsers/Access/ASTRowPolicyName.h b/src/Parsers/Access/ASTRowPolicyName.h index 9f4848bd612..86171475a0a 100644 --- a/src/Parsers/Access/ASTRowPolicyName.h +++ b/src/Parsers/Access/ASTRowPolicyName.h @@ -45,5 +45,8 @@ public: ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } void replaceEmptyDatabase(const String & current_database); + +private: + String tableOrAsterisk(const String & table_name) const; }; } diff --git a/src/Parsers/Access/ParserRowPolicyName.cpp b/src/Parsers/Access/ParserRowPolicyName.cpp index e5b4e01d5ac..efdff3c24bf 100644 --- a/src/Parsers/Access/ParserRowPolicyName.cpp +++ b/src/Parsers/Access/ParserRowPolicyName.cpp @@ -36,7 +36,7 @@ namespace } else if (is_any_table) { - res_table_name = "*"; // RowPolicy::ANY_TABLE_MARK + res_table_name = RowPolicyName::ANY_TABLE_MARK; } /// If table is specified without DB it cannot be followed by "ON" diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference index 68ed02d1dc0..fa01904b846 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference @@ -40,3 +40,5 @@ None 3 30 4 40 No problematic policy, select works +Policy for table `*` does not affect other tables in the database +other 100 20 diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql index f9b466f1ade..9941d69979d 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql @@ -86,3 +86,13 @@ DROP TABLE 02703_db.02703_rptable; DROP TABLE 02703_db.02703_rptable_another; DROP TABLE 02703_db.02703_unexpected_columns; DROP DATABASE 02703_db; + +SELECT 'Policy for table `*` does not affect other tables in the database'; +CREATE DATABASE 02703_db_asterisk; +CREATE ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*` USING x=1 AS permissive TO ALL; +CREATE TABLE 02703_db_asterisk.`*` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; 
+CREATE TABLE 02703_db_asterisk.`other` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; +SELECT 'star', * FROM 02703_db_asterisk.`*`; +SELECT 'other', * FROM 02703_db_asterisk.other; +DROP ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*`; +DROP DATABASE 02703_db_asterisk; diff --git a/tests/queries/0_stateless/02703_row_policy_for_database.reference b/tests/queries/0_stateless/02703_row_policy_for_database.reference index 5f0b486c1bd..ec03e538d95 100644 --- a/tests/queries/0_stateless/02703_row_policy_for_database.reference +++ b/tests/queries/0_stateless/02703_row_policy_for_database.reference @@ -1,11 +1,10 @@ -- row policies for database -- SHOW CREATE POLICY db1_02703 ON db1_02703.* -CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY db1_02703 ON db1_02703.* FOR SELECT USING 1 TO ALL -- SHOW CREATE POLICY ON db1_02703.* -CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL +CREATE ROW POLICY db1_02703 ON db1_02703.* FOR SELECT USING 1 TO ALL CREATE ROW POLICY tbl1_02703 ON db1_02703.table FOR SELECT USING 1 TO ALL -- SHOW CREATE POLICY ON db1_02703.`*` -CREATE ROW POLICY db1_02703 ON db1_02703.`*` FOR SELECT USING 1 TO ALL R1, R2: (x == 1) OR (x == 2) 1 2 @@ -15,7 +14,7 @@ SELECT \' -- SHOW CREATE POLICY db1_02703 ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON db1_02703.*\'; [] SELECT \' -- SHOW CREATE POLICY ON db1_02703.`*`\'; [] SELECT \'R1, R2: (x == 1) OR (x == 2)\'; [] -SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.`*`','`02703_filter_11` ON default.`02703_rqtable_default`'] +SELECT * FROM 02703_rqtable_default; ['`02703_filter_11_db` ON default.*','`02703_filter_11` ON default.`02703_rqtable_default`'] SELECT \'Check system.query_log\'; [] -- CREATE DATABASE-LEVEL POLICY IN CURRENT DATABASE -CREATE ROW POLICY db2_02703 ON db1_02703.`*` TO u1_02703 +CREATE ROW POLICY db2_02703 ON db1_02703.* TO u1_02703 From ccd82842a0e197f3aa1dbfda2f2977382ad16345 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Fri, 12 May 2023 22:32:53 +0000 Subject: [PATCH 044/127] row_policy_template - rollback tweaking found policy for database --- src/Access/EnabledRowPolicies.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/Access/EnabledRowPolicies.cpp b/src/Access/EnabledRowPolicies.cpp index 601f004e3ea..8ab1bf5928b 100644 --- a/src/Access/EnabledRowPolicies.cpp +++ b/src/Access/EnabledRowPolicies.cpp @@ -41,14 +41,6 @@ RowPolicyFilterPtr EnabledRowPolicies::getFilter(const String & database, const { return {}; } - else - { - // deep copy found policy for database and change its table name to the actual one - auto policy_for_database = std::make_shared(*it->second); - auto database_and_table_name = std::make_shared>(database, table_name); - policy_for_database->database_and_table_name = database_and_table_name; - return policy_for_database; - } } return it->second; From ad8e114bcf411104e08ffe4c71752e08fa6d308a Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Sat, 13 May 2023 07:20:38 +0000 Subject: [PATCH 045/127] row_policy_template - row policies not in query log if analyzer used --- tests/broken_tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 3278f720daf..0b4efacba0b 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -138,3 +138,4 @@ 01600_parts_states_metrics_long 01600_parts_types_metrics_long 01287_max_execution_speed +02703_row_policy_for_database From 
47e7e7e60a5ac177e51d32dd9cf2080dbab85f75 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Sat, 13 May 2023 22:26:31 +0000 Subject: [PATCH 046/127] row_policy_template - test for asterisk table in a dedicated file --- .../02703_row_policies_for_asterisk.reference | 2 ++ .../0_stateless/02703_row_policies_for_asterisk.sql | 11 +++++++++++ ...03_row_policies_for_database_combination.reference | 2 -- .../02703_row_policies_for_database_combination.sql | 10 ---------- 4 files changed, 13 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/02703_row_policies_for_asterisk.reference create mode 100644 tests/queries/0_stateless/02703_row_policies_for_asterisk.sql diff --git a/tests/queries/0_stateless/02703_row_policies_for_asterisk.reference b/tests/queries/0_stateless/02703_row_policies_for_asterisk.reference new file mode 100644 index 00000000000..528bd7ef91e --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policies_for_asterisk.reference @@ -0,0 +1,2 @@ +Policy for table `*` does not affect other tables in the database +other 100 20 diff --git a/tests/queries/0_stateless/02703_row_policies_for_asterisk.sql b/tests/queries/0_stateless/02703_row_policies_for_asterisk.sql new file mode 100644 index 00000000000..96b1c01a6d6 --- /dev/null +++ b/tests/queries/0_stateless/02703_row_policies_for_asterisk.sql @@ -0,0 +1,11 @@ +-- Tags: no-parallel + +SELECT 'Policy for table `*` does not affect other tables in the database'; +CREATE DATABASE 02703_db_asterisk; +CREATE ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*` USING x=1 AS permissive TO ALL; +CREATE TABLE 02703_db_asterisk.`*` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; +CREATE TABLE 02703_db_asterisk.`other` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; +SELECT 'star', * FROM 02703_db_asterisk.`*`; +SELECT 'other', * FROM 02703_db_asterisk.other; +DROP ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*`; +DROP DATABASE 02703_db_asterisk; diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference index fa01904b846..68ed02d1dc0 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.reference @@ -40,5 +40,3 @@ None 3 30 4 40 No problematic policy, select works -Policy for table `*` does not affect other tables in the database -other 100 20 diff --git a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql index 9941d69979d..f9b466f1ade 100644 --- a/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql +++ b/tests/queries/0_stateless/02703_row_policies_for_database_combination.sql @@ -86,13 +86,3 @@ DROP TABLE 02703_db.02703_rptable; DROP TABLE 02703_db.02703_rptable_another; DROP TABLE 02703_db.02703_unexpected_columns; DROP DATABASE 02703_db; - -SELECT 'Policy for table `*` does not affect other tables in the database'; -CREATE DATABASE 02703_db_asterisk; -CREATE ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*` USING x=1 AS permissive TO ALL; -CREATE TABLE 02703_db_asterisk.`*` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; -CREATE TABLE 02703_db_asterisk.`other` (x UInt8, y UInt8) ENGINE = MergeTree ORDER BY x AS SELECT 100, 20; -SELECT 'star', * FROM 02703_db_asterisk.`*`; -SELECT 'other', * FROM 
02703_db_asterisk.other; -DROP ROW POLICY 02703_asterisk ON 02703_db_asterisk.`*`; -DROP DATABASE 02703_db_asterisk; From 1db35384d969f9239de84563927494148100c6d5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 May 2023 03:30:03 +0200 Subject: [PATCH 047/127] Support `bitCount` for big integers --- src/Functions/bitCount.cpp | 9 ++++++++- .../02736_bit_count_big_int.reference | 13 +++++++++++++ .../0_stateless/02736_bit_count_big_int.sql | 19 +++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02736_bit_count_big_int.reference create mode 100644 tests/queries/0_stateless/02736_bit_count_big_int.sql diff --git a/src/Functions/bitCount.cpp b/src/Functions/bitCount.cpp index 984f33b7001..566a11481be 100644 --- a/src/Functions/bitCount.cpp +++ b/src/Functions/bitCount.cpp @@ -9,7 +9,7 @@ namespace DB template struct BitCountImpl { - using ResultType = UInt8; + using ResultType = std::conditional_t<(sizeof(A) * 8 >= 256), UInt16, UInt8>; static constexpr bool allow_string_or_fixed_string = true; static inline ResultType apply(A a) @@ -17,6 +17,13 @@ struct BitCountImpl /// We count bits in the value representation in memory. For example, we support floats. /// We need to avoid sign-extension when converting signed numbers to larger type. So, uint8_t(-1) has 8 bits. + if constexpr (is_big_int_v) + { + ResultType res = 0; + for (auto item : a.items) + res += __builtin_popcountll(item); + return res; + } if constexpr (std::is_same_v || std::is_same_v) return __builtin_popcountll(a); if constexpr (std::is_same_v || std::is_same_v || std::is_unsigned_v) diff --git a/tests/queries/0_stateless/02736_bit_count_big_int.reference b/tests/queries/0_stateless/02736_bit_count_big_int.reference new file mode 100644 index 00000000000..a3a725ace69 --- /dev/null +++ b/tests/queries/0_stateless/02736_bit_count_big_int.reference @@ -0,0 +1,13 @@ +128 +256 +128 +256 +127 +255 +126 +255 +64 +UInt8 +UInt16 +UInt8 +UInt16 diff --git a/tests/queries/0_stateless/02736_bit_count_big_int.sql b/tests/queries/0_stateless/02736_bit_count_big_int.sql new file mode 100644 index 00000000000..35a4a641606 --- /dev/null +++ b/tests/queries/0_stateless/02736_bit_count_big_int.sql @@ -0,0 +1,19 @@ +SELECT bitCount(CAST(-1 AS UInt128)); +SELECT bitCount(CAST(-1 AS UInt256)); + +SELECT bitCount(CAST(-1 AS Int128)); +SELECT bitCount(CAST(-1 AS Int256)); + +SELECT bitCount(CAST(-1 AS UInt128) - 1); +SELECT bitCount(CAST(-1 AS UInt256) - 2); + +SELECT bitCount(CAST(-1 AS Int128) - 3); +SELECT bitCount(CAST(-1 AS Int256) - 4); + +SELECT bitCount(CAST(0xFFFFFFFFFFFFFFFF AS Int256)); + +SELECT toTypeName(bitCount(1::UInt128)); +SELECT toTypeName(bitCount(1::UInt256)); + +SELECT toTypeName(bitCount(1::Int128)); +SELECT toTypeName(bitCount(1::Int256)); From 4764259f609a024f04ec8c082b63a495efc66d70 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 15 May 2023 14:17:16 +0800 Subject: [PATCH 048/127] Fix a bug with projections and the aggregate_functions_null_for_empty setting (for query_plan_optimize_projection) Fix a bug with projections and the aggregate_functions_null_for_empty setting. This was already fixed in PR #42198 but got forgotten after using query_plan_optimize_projection. 
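For illustration, the regression test added in this patch boils down to the SQL below. The setting effectively rewrites aggregate functions to their -OrNull variants (e.g. MIN -> minOrNull), which projection reading currently does not account for, so projection optimization has to be skipped for such queries (table, column and value are taken from the test itself):

```
CREATE TABLE t1 (c0 Int32, PRIMARY KEY (c0)) ENGINE = MergeTree;
INSERT INTO t1 VALUES (1554690688);

-- The aggregate is implicitly rewritten by the setting; with the fix the projection
-- is not used for reading and the query returns 1554690688 as expected.
SELECT MIN(t1.c0) FROM t1 SETTINGS aggregate_functions_null_for_empty = 1;

DROP TABLE IF EXISTS t1;
```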
--- .../QueryPlan/Optimizations/projectionsCommon.cpp | 4 ++++ ...rojection_aggregate_functions_null_for_empty.reference | 1 + ...1710_projection_aggregate_functions_null_for_empty.sql | 8 ++++++++ 3 files changed, 13 insertions(+) create mode 100644 tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference create mode 100644 tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index a334450fb41..9252137f649 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -42,6 +42,10 @@ bool canUseProjectionForReadingStep(ReadFromMergeTree * reading) if (reading->getContext()->getSettingsRef().allow_experimental_query_deduplication) return false; + // Currently projection don't support settings which implicitly modify aggregate functions. + if (reading->getContext()->getSettingsRef().aggregate_functions_null_for_empty) + return false; + return true; } diff --git a/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference new file mode 100644 index 00000000000..f2a527c4d8d --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.reference @@ -0,0 +1 @@ +1554690688 diff --git a/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql new file mode 100644 index 00000000000..a77720b6580 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_aggregate_functions_null_for_empty.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS t1; + +CREATE TABLE t1 (c0 Int32, PRIMARY KEY (c0)) ENGINE=MergeTree; +INSERT INTO t1 VALUES (1554690688); + +SELECT MIN(t1.c0) FROM t1 SETTINGS aggregate_functions_null_for_empty = 1; + +DROP TABLE IF EXISTS t1; From 8d1bcb5c2f23ea0e29c61c8cb3f584586f5f8834 Mon Sep 17 00:00:00 2001 From: JackyWoo Date: Mon, 15 May 2023 16:28:28 +0800 Subject: [PATCH 049/127] fix typo --- src/Storages/StorageMongoDB.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index 682a027440d..2b77f076e7e 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -7,7 +7,7 @@ namespace DB { /* Implements storage in the MongoDB database. - * Use ENGINE = mysql(host_port, database_name, table_name, user_name, password) + * Use ENGINE = MongoDB(host:port, database, collection, user, password [, options]); * Read only. 
*/ From 665545ec45d4fc0e15e602dcbcc990621d904623 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 May 2023 09:35:04 +0200 Subject: [PATCH 050/127] Fix "reference to local binding" after fixes for clang-17 Follow-up for: #49851 (cc @alexey-milovidov) Signed-off-by: Azat Khuzhin --- src/Coordination/KeeperStorage.cpp | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 58fccc83e6a..7a1a5e42632 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -339,37 +339,37 @@ void KeeperStorage::UncommittedState::applyDelta(const Delta & delta) nodes.emplace(delta.path, UncommittedNode{.node = nullptr}); } - auto & [node, acls, last_applied_zxid] = nodes.at(delta.path); - std::visit( - [&, &my_node = node, &my_acls = acls, &my_last_applied_zxid = last_applied_zxid](const DeltaType & operation) + [&](const DeltaType & operation) { + auto & [node, acls, last_applied_zxid] = nodes.at(delta.path); + if constexpr (std::same_as) { - assert(!my_node); - my_node = std::make_shared(); - my_node->stat = operation.stat; - my_node->setData(operation.data); - my_acls = operation.acls; - my_last_applied_zxid = delta.zxid; + assert(!node); + node = std::make_shared(); + node->stat = operation.stat; + node->setData(operation.data); + acls = operation.acls; + last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - assert(my_node); - my_node = nullptr; - my_last_applied_zxid = delta.zxid; + assert(node); + node = nullptr; + last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - assert(my_node); - my_node->invalidateDigestCache(); + assert(node); + node->invalidateDigestCache(); operation.update_fn(*node); - my_last_applied_zxid = delta.zxid; + last_applied_zxid = delta.zxid; } else if constexpr (std::same_as) { - my_acls = operation.acls; - my_last_applied_zxid = delta.zxid; + acls = operation.acls; + last_applied_zxid = delta.zxid; } }, delta.operation); From 424a20ee1cb53a0c64d13c6ed08d1ce54872ddb5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 May 2023 14:17:32 +0200 Subject: [PATCH 051/127] Log with warning if the server was terminated forcefully In case of it is terminated forcefully it will not be terminated gracefully (i.e. run dtors and stuff), and by using warning log level those messages will go to clickhouse-server.err.log, in which messages are kept for a longer period then in clickhouse-server.log (at least because it contains only warnings, errors and fatals only). This will help with investigating some obscure issues. Signed-off-by: Azat Khuzhin --- programs/server/Server.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b5e44d90129..632f3f3a02d 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1872,7 +1872,7 @@ try } if (current_connections) - LOG_INFO(log, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); + LOG_WARNING(log, "Closed all listening sockets. Waiting for {} outstanding connections.", current_connections); else LOG_INFO(log, "Closed all listening sockets."); @@ -1884,7 +1884,7 @@ try current_connections = waitServersToFinish(servers, config().getInt("shutdown_wait_unfinished", 5)); if (current_connections) - LOG_INFO(log, "Closed connections. But {} remain." + LOG_WARNING(log, "Closed connections. But {} remain." 
" Tip: To increase wait time add to config: 60", current_connections); else LOG_INFO(log, "Closed connections."); @@ -1900,7 +1900,7 @@ try /// Dump coverage here, because std::atexit callback would not be called. dumpCoverageReportIfPossible(); - LOG_INFO(log, "Will shutdown forcefully."); + LOG_WARNING(log, "Will shutdown forcefully."); safeExit(0); } }); From e8f971aa2bdfcdfb12e0091a8aa6a7eb4e96ff8c Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 15 May 2023 09:25:10 -0300 Subject: [PATCH 052/127] use LIMIT_EXCEEDED instead of TOO_LARGE_MAP_SIZE --- src/Common/ErrorCodes.cpp | 1 - src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h | 4 ++-- .../02499_extract_key_value_pairs_multiple_input.sql | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 382b8ed8019..e9dc5649245 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -576,7 +576,6 @@ M(691, UNKNOWN_ELEMENT_OF_ENUM) \ M(692, TOO_MANY_MUTATIONS) \ M(693, AWS_ERROR) \ - M(694, TOO_LARGE_MAP_SIZE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h index 24c8d3cd89a..3895cf3e77d 100644 --- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h +++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h @@ -13,7 +13,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int TOO_LARGE_MAP_SIZE; + extern const int LIMIT_EXCEEDED; } /* @@ -118,7 +118,7 @@ private: if (row_offset > max_number_of_pairs) { - throw Exception(ErrorCodes::TOO_LARGE_MAP_SIZE, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); + throw Exception(ErrorCodes::LIMIT_EXCEEDED, "Number of pairs produced exceeded the limit of {}", max_number_of_pairs); } key.commit(); diff --git a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql index 8c5f0365cc7..38d09338774 100644 --- a/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql +++ b/tests/queries/0_stateless/02499_extract_key_value_pairs_multiple_input.sql @@ -425,7 +425,7 @@ WITH 'Map(String,String)' ) AS x SELECT - x; -- {serverError TOO_LARGE_MAP_SIZE} + x; -- {serverError LIMIT_EXCEEDED} -- { echoOn } From dccdb3e6786d10803968504d1179804789edaf0a Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Mon, 15 May 2023 14:41:51 +0200 Subject: [PATCH 053/127] work with comments on PR --- ...oryStream.cpp => StdIStreamFromMemory.cpp} | 20 ++++------- ...3MemoryStream.h => StdIStreamFromMemory.h} | 17 +++++----- src/IO/WriteBufferFromS3.cpp | 24 ++++++------- src/IO/WriteBufferFromS3.h | 14 +++++++- ...riteBufferFromS3BufferAllocationPolicy.cpp | 34 +++++++++++-------- .../WriteBufferFromS3BufferAllocationPolicy.h | 26 -------------- src/IO/WriteBufferFromS3TaskTracker.cpp | 10 +++--- src/IO/WriteBufferFromS3TaskTracker.h | 10 ++++-- 8 files changed, 72 insertions(+), 83 deletions(-) rename src/IO/{WriteBufferFromS3MemoryStream.cpp => StdIStreamFromMemory.cpp} (67%) rename src/IO/{WriteBufferFromS3MemoryStream.h => StdIStreamFromMemory.h} (60%) delete mode 100644 src/IO/WriteBufferFromS3BufferAllocationPolicy.h diff --git a/src/IO/WriteBufferFromS3MemoryStream.cpp b/src/IO/StdIStreamFromMemory.cpp similarity index 67% rename from src/IO/WriteBufferFromS3MemoryStream.cpp rename to 
src/IO/StdIStreamFromMemory.cpp index 6271f15f055..3242a7e6383 100644 --- a/src/IO/WriteBufferFromS3MemoryStream.cpp +++ b/src/IO/StdIStreamFromMemory.cpp @@ -1,27 +1,24 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include +#include namespace DB { -MemoryStream::MemoryBuf::MemoryBuf(char * begin_, size_t size_) +StdIStreamFromMemory::MemoryBuf::MemoryBuf(char * begin_, size_t size_) : begin(begin_) , size(size_) { this->setg(begin, begin, begin + size); } -MemoryStream::MemoryBuf::int_type MemoryStream::MemoryBuf::underflow() +StdIStreamFromMemory::MemoryBuf::int_type StdIStreamFromMemory::MemoryBuf::underflow() { if (gptr() < egptr()) return traits_type::to_int_type(*gptr()); return traits_type::eof(); } -MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way, +StdIStreamFromMemory::MemoryBuf::pos_type +StdIStreamFromMemory::MemoryBuf::seekoff(off_type off, std::ios_base::seekdir way, std::ios_base::openmode mode) { bool out_mode = (std::ios_base::out & mode) != 0; @@ -49,13 +46,13 @@ MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekoff(off_type off, return pos_type(ret); } -MemoryStream::MemoryBuf::pos_type MemoryStream::MemoryBuf::seekpos(pos_type sp, +StdIStreamFromMemory::MemoryBuf::pos_type StdIStreamFromMemory::MemoryBuf::seekpos(pos_type sp, std::ios_base::openmode mode) { return seekoff(off_type(sp), std::ios_base::beg, mode); } -MemoryStream::MemoryStream(char * begin_, size_t size_) +StdIStreamFromMemory::StdIStreamFromMemory(char * begin_, size_t size_) : std::iostream(nullptr) , mem_buf(begin_, size_) { @@ -63,6 +60,3 @@ MemoryStream::MemoryStream(char * begin_, size_t size_) } } - -#endif - diff --git a/src/IO/WriteBufferFromS3MemoryStream.h b/src/IO/StdIStreamFromMemory.h similarity index 60% rename from src/IO/WriteBufferFromS3MemoryStream.h rename to src/IO/StdIStreamFromMemory.h index e9606798910..64b147fd296 100644 --- a/src/IO/WriteBufferFromS3MemoryStream.h +++ b/src/IO/StdIStreamFromMemory.h @@ -1,15 +1,15 @@ #pragma once -#include "config.h" - -#if USE_AWS_S3 - #include namespace DB { -struct MemoryStream: std::iostream +/// StdIStreamFromMemory is used in WriteBufferFromS3 as a stream which is passed to the S3::Client +/// It provides istream interface (only reading) over the memory. 
+/// However S3::Client requires iostream interface it only reads from the stream + +class StdIStreamFromMemory : public std::iostream { struct MemoryBuf: std::streambuf { @@ -27,11 +27,10 @@ struct MemoryStream: std::iostream size_t size = 0; }; - MemoryStream(char * begin_, size_t size_); - MemoryBuf mem_buf; + +public: + StdIStreamFromMemory(char * begin_, size_t size_); }; } - -#endif diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 73d78cb13be..01ab8ff7cbb 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -2,8 +2,8 @@ #if USE_AWS_S3 +#include "StdIStreamFromMemory.h" #include "WriteBufferFromS3.h" -#include "WriteBufferFromS3MemoryStream.h" #include "WriteBufferFromS3TaskTracker.h" #include @@ -63,7 +63,7 @@ struct WriteBufferFromS3::PartData std::shared_ptr createAwsBuffer() { - auto buffer = std::make_shared(memory.data(), data_size); + auto buffer = std::make_shared(memory.data(), data_size); buffer->exceptions(std::ios::badbit); return buffer; } @@ -108,7 +108,7 @@ void WriteBufferFromS3::nextImpl() "Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest"); /// Make sense to call to before adding new async task to check if there is an exception - task_tracker->getReady(); + task_tracker->waitReady(); hidePartialData(); @@ -132,7 +132,7 @@ void WriteBufferFromS3::preFinalize() LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails()); - task_tracker->getReady(); + task_tracker->waitReady(); hidePartialData(); @@ -178,7 +178,7 @@ void WriteBufferFromS3::finalizeImpl() chassert(offset() == 0); chassert(hidden_size == 0); - task_tracker->getAll(); + task_tracker->waitAll(); if (!multipart_upload_id.empty()) { @@ -266,10 +266,10 @@ void WriteBufferFromS3::reallocateFirstBuffer() { chassert(offset() == 0); - if (buffer_allocation_policy->getNumber() > 1 || available() > 0) + if (buffer_allocation_policy->getBufferNumber() > 1 || available() > 0) return; - const size_t max_first_buffer = buffer_allocation_policy->getSize(); + const size_t max_first_buffer = buffer_allocation_policy->getBufferSize(); if (memory.size() == max_first_buffer) return; @@ -299,7 +299,7 @@ void WriteBufferFromS3::detachBuffer() void WriteBufferFromS3::allocateFirstBuffer() { - const auto max_first_buffer = buffer_allocation_policy->getSize(); + const auto max_first_buffer = buffer_allocation_policy->getBufferSize(); const auto size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), max_first_buffer); memory = Memory(size); WriteBuffer::set(memory.data(), memory.size()); @@ -309,16 +309,16 @@ void WriteBufferFromS3::allocateFirstBuffer() void WriteBufferFromS3::allocateBuffer() { - buffer_allocation_policy->next(); + buffer_allocation_policy->nextBuffer(); chassert(0 == hidden_size); - if (buffer_allocation_policy->getNumber() == 1) + if (buffer_allocation_policy->getBufferNumber() == 1) return allocateFirstBuffer(); - memory = Memory(buffer_allocation_policy->getSize()); + memory = Memory(buffer_allocation_policy->getBufferSize()); WriteBuffer::set(memory.data(), memory.size()); - LOG_TRACE(log, "Allocated buffer with size {}. {}", buffer_allocation_policy->getSize(), getLogDetails()); + LOG_TRACE(log, "Allocated buffer with size {}. 
{}", buffer_allocation_policy->getBufferSize(), getLogDetails()); } void WriteBufferFromS3::setFakeBufferWhenPreFinalized() diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index e65127872fa..ac6c430606f 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -41,6 +40,19 @@ public: void nextImpl() override; void preFinalize() override; +public: + class IBufferAllocationPolicy + { + public: + virtual size_t getBufferNumber() const = 0; + virtual size_t getBufferSize() const = 0; + virtual void nextBuffer() = 0; + virtual ~IBufferAllocationPolicy() = 0; + }; + using IBufferAllocationPolicyPtr = std::unique_ptr; + + static IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_); + private: /// Receives response from the server after sending all data. void finalizeImpl() override; diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp index 0eec6b0d034..6347c1acfd7 100644 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp +++ b/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp @@ -2,38 +2,41 @@ #if USE_AWS_S3 -#include +#include + +#include namespace { -struct FixedSizeBufferAllocationPolicy : DB::IBufferAllocationPolicy +class FixedSizeBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy { - const size_t size = 0; + const size_t buffer_size = 0; size_t buffer_number = 0; +public: explicit FixedSizeBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) - : size(settings_.strict_upload_part_size) + : buffer_size(settings_.strict_upload_part_size) { - chassert(size > 0); + chassert(buffer_size > 0); } - size_t getNumber() const override { return buffer_number; } + size_t getBufferNumber() const override { return buffer_number; } - size_t getSize() const override + size_t getBufferSize() const override { chassert(buffer_number > 0); - return size; + return buffer_size; } - void next() override + void nextBuffer() override { ++buffer_number; } }; -struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy +class ExpBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy { const size_t first_size = 0; const size_t second_size = 0; @@ -45,6 +48,7 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy size_t current_size = 0; size_t buffer_number = 0; +public: explicit ExpBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) , second_size(settings_.min_upload_part_size) @@ -59,15 +63,15 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy chassert(max_size > 0); } - size_t getNumber() const override { return buffer_number; } + size_t getBufferNumber() const override { return buffer_number; } - size_t getSize() const override + size_t getBufferSize() const override { chassert(buffer_number > 0); return current_size; } - void next() override + void nextBuffer() override { ++buffer_number; @@ -93,9 +97,9 @@ struct ExpBufferAllocationPolicy : DB::IBufferAllocationPolicy namespace DB { -IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; +WriteBufferFromS3::IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; -IBufferAllocationPolicyPtr ChooseBufferPolicy(const 
S3Settings::RequestSettings::PartUploadSettings & settings_) +WriteBufferFromS3::IBufferAllocationPolicyPtr WriteBufferFromS3::ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) { if (settings_.strict_upload_part_size > 0) return std::make_unique(settings_); diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.h b/src/IO/WriteBufferFromS3BufferAllocationPolicy.h deleted file mode 100644 index 1ee7c982ed2..00000000000 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include - -namespace DB -{ - -struct IBufferAllocationPolicy -{ - virtual size_t getNumber() const = 0; - virtual size_t getSize() const = 0; - virtual void next() = 0; - virtual ~IBufferAllocationPolicy() = 0; -}; - -using IBufferAllocationPolicyPtr = std::unique_ptr; - -IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_); - -} - -#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp index 0769f7731c2..7826747c0a4 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.cpp +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -28,9 +28,9 @@ ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() }; } -void WriteBufferFromS3::TaskTracker::getReady() +void WriteBufferFromS3::TaskTracker::waitReady() { - LOG_TEST(log, "getReady, in queue {}", futures.size()); + LOG_TEST(log, "waitReady, in queue {}", futures.size()); /// Exceptions are propagated auto it = futures.begin(); @@ -55,12 +55,12 @@ void WriteBufferFromS3::TaskTracker::getReady() it = futures.erase(it); } - LOG_TEST(log, "getReady ended, in queue {}", futures.size()); + LOG_TEST(log, "waitReady ended, in queue {}", futures.size()); } -void WriteBufferFromS3::TaskTracker::getAll() +void WriteBufferFromS3::TaskTracker::waitAll() { - LOG_TEST(log, "getAll, in queue {}", futures.size()); + LOG_TEST(log, "waitAll, in queue {}", futures.size()); /// Exceptions are propagated for (auto & future : futures) diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/IO/WriteBufferFromS3TaskTracker.h index fa214a4f8c5..c978b9a78f0 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.h +++ b/src/IO/WriteBufferFromS3TaskTracker.h @@ -9,6 +9,12 @@ namespace DB { +/// That class is used only in WriteBufferFromS3 for now. +/// Therefore it declared as a part of WriteBufferFromS3. +/// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool. +/// TaskTracker brings the methods waitReady, waitAll/safeWaitAll +/// to help with coordination of the running tasks. + class WriteBufferFromS3::TaskTracker { public: @@ -20,8 +26,8 @@ public: static ThreadPoolCallbackRunner syncRunner(); bool isAsync() const; - void getReady(); - void getAll(); + void waitReady(); + void waitAll(); void safeWaitAll(); void add(Callback && func); From 418a61a68c2e8089b5a7372b08e398757eb9522e Mon Sep 17 00:00:00 2001 From: AVMusorin Date: Fri, 12 May 2023 12:47:14 +0200 Subject: [PATCH 054/127] Allow using Alias column type for KafkaEngine ``` create table kafka ( a UInt32, a_str String Alias toString(a) ) engine = Kafka; create table data ( a UInt32; a_str String ) engine = MergeTree order by tuple(); create materialized view data_mv to data ( a UInt32, a_str String ) as select a, a_str from kafka; ``` Alias type works as expected in comparison with MATERIALIZED/EPHEMERAL or column with default expression. 
Ref: https://github.com/ClickHouse/ClickHouse/pull/47138 Co-authored-by: Azat Khuzhin --- .../table-engines/integrations/kafka.md | 4 +- src/Storages/ColumnsDescription.cpp | 9 ----- src/Storages/ColumnsDescription.h | 1 - src/Storages/Kafka/StorageKafka.cpp | 14 ++++++- tests/integration/test_storage_kafka/test.py | 38 +++++++++++++++---- 5 files changed, 44 insertions(+), 22 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index ab69e4e90ce..ccfca4c1f1f 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -19,8 +19,8 @@ Kafka lets you: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1], - name2 [type2], + name1 [type1] [ALIAS expr1], + name2 [type2] [ALIAS expr2], ... ) ENGINE = Kafka() SETTINGS diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 21b140bd73a..8eabae7929c 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -383,15 +383,6 @@ NamesAndTypesList ColumnsDescription::getEphemeral() const return ret; } -NamesAndTypesList ColumnsDescription::getWithDefaultExpression() const -{ - NamesAndTypesList ret; - for (const auto & col : columns) - if (col.default_desc.expression) - ret.emplace_back(col.name, col.type); - return ret; -} - NamesAndTypesList ColumnsDescription::getAll() const { NamesAndTypesList ret; diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index e5ec867cd64..365a999673e 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -132,7 +132,6 @@ public: NamesAndTypesList getInsertable() const; /// ordinary + ephemeral NamesAndTypesList getAliases() const; NamesAndTypesList getEphemeral() const; - NamesAndTypesList getWithDefaultExpression() const; // columns with default expression, for example set by `CREATE TABLE` statement NamesAndTypesList getAllPhysical() const; /// ordinary + materialized. NamesAndTypesList getAll() const; /// ordinary + materialized + aliases + ephemeral /// Returns .size0/.null/... diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 3381561eb1b..7d504833a0a 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -41,6 +41,7 @@ #include #include +#include "Storages/ColumnDefault.h" #include "config_version.h" #include @@ -966,9 +967,18 @@ void registerStorageKafka(StorageFactory & factory) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "kafka_poll_max_batch_size can not be lower than 1"); } - if (args.columns.getOrdinary() != args.columns.getAll() || !args.columns.getWithDefaultExpression().empty()) + NamesAndTypesList supported_columns; + for (const auto & column : args.columns) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL/ALIAS expressions for columns. " + if (column.default_desc.kind == ColumnDefaultKind::Alias) + supported_columns.emplace_back(column.name, column.type); + if (column.default_desc.kind == ColumnDefaultKind::Default && !column.default_desc.expression) + supported_columns.emplace_back(column.name, column.type); + } + // Kafka engine allows only ordinary columns without default expression or alias columns. 
+ if (args.columns.getAll() != supported_columns) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL expressions for columns. " "See https://clickhouse.com/docs/en/engines/table-engines/integrations/kafka/#configuration"); } diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 3a4fa6c6bfe..9a6d3e0513c 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -285,11 +285,11 @@ def avro_confluent_message(schema_registry_client, value): # Tests -def test_kafka_prohibited_column_types(kafka_cluster): +def test_kafka_column_types(kafka_cluster): def assert_returned_exception(e): assert e.value.returncode == 36 assert ( - "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL/ALIAS expressions for columns." + "KafkaEngine doesn't support DEFAULT/MATERIALIZED/EPHEMERAL expressions for columns." in str(e.value) ) @@ -314,17 +314,39 @@ def test_kafka_prohibited_column_types(kafka_cluster): assert_returned_exception(exception) # check ALIAS - with pytest.raises(QueryRuntimeException) as exception: - instance.query( - """ + instance.query( + """ CREATE TABLE test.kafka (a Int, b String Alias toString(a)) ENGINE = Kafka('{kafka_broker}:19092', '{kafka_topic_new}', '{kafka_group_name_new}', '{kafka_format_json_each_row}', '\\n') + SETTINGS kafka_commit_on_select = 1; """ - ) - assert_returned_exception(exception) + ) + messages = [] + for i in range(5): + messages.append(json.dumps({"a": i})) + kafka_produce(kafka_cluster, "new", messages) + result = "" + expected = TSV( + """ +0\t0 +1\t1 +2\t2 +3\t3 +4\t4 + """ + ) + retries = 50 + while retries > 0: + result += instance.query("SELECT a, b FROM test.kafka", ignore_error=True) + if TSV(result) == expected: + break + retries -= 1 + + assert TSV(result) == expected + + instance.query("DROP TABLE test.kafka SYNC") # check MATERIALIZED - # check ALIAS with pytest.raises(QueryRuntimeException) as exception: instance.query( """ From bf201a09b7e68fde4d84f87388adeb7047416fb0 Mon Sep 17 00:00:00 2001 From: Diego Nieto Date: Mon, 15 May 2023 15:43:39 +0200 Subject: [PATCH 055/127] Update docs/en/operations/storing-data.md @kssenii comments Co-authored-by: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> --- docs/en/operations/storing-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index e8b043e7a27..5804ad8545b 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -194,7 +194,7 @@ These settings should be defined in the disk configuration section. File Cache **query/profile settings**: -Some of these settings will disable cache features per query/profile that are enabled by default. For example, you can enable cache in disk configuration and disable it per query/profile setting `enable_filesystem_cache` to `false`. Also setting `cache_on_write_operations` to `true` in disk configuration means that general file and per query cache are enabled. But if you need to disable this general setting per specific queries then setting `enable_filesystem_cache_on_write_operations` to `false` means that write operations cache will be disabled for a specific query/profile. +Some of these settings will disable cache features per query/profile that are enabled by default or in disk configuration settings. 
For example, you can enable cache in disk configuration and disable it per query/profile setting `enable_filesystem_cache` to `false`. Also setting `cache_on_write_operations` to `true` in disk configuration means that "write-though" cache is enabled. But if you need to disable this general setting per specific queries then setting `enable_filesystem_cache_on_write_operations` to `false` means that write operations cache will be disabled for a specific query/profile. - `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. From f2a023140e6134aca52fa1bd45e3a616f790be70 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 May 2023 15:38:50 +0200 Subject: [PATCH 056/127] Fix processing pending batch for Distributed async INSERT after restart After abnormal server restart current_batch.txt (that contains list of files to send to the remote shard), may not have all files, if it was terminated between unlink .bin files and truncation of current_batch.txt But it should be fixed in a more reliable way, though to backport the patch I kept it simple. Signed-off-by: Azat Khuzhin --- src/Storages/Distributed/DistributedAsyncInsertBatch.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp b/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp index ba5938e057d..b82cf1d7ffc 100644 --- a/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp +++ b/src/Storages/Distributed/DistributedAsyncInsertBatch.cpp @@ -201,6 +201,14 @@ void DistributedAsyncInsertBatch::sendBatch() { for (const auto & file : files) { + /// In case of recovery it is possible that some of files will be + /// missing, if server had been restarted abnormally + if (recovered && !fs::exists(file)) + { + LOG_WARNING(parent.log, "File {} does not exists, likely due abnormal shutdown", file); + continue; + } + ReadBufferFromFile in(file); const auto & distributed_header = DistributedAsyncInsertHeader::read(in, parent.log); From b23afdc53390149236d02140dfc96775317a2e6c Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 15 May 2023 15:48:00 +0000 Subject: [PATCH 057/127] Fix build for aarch64-darwin --- contrib/boost-cmake/CMakeLists.txt | 18 +- src/Common/AsyncTaskExecutor.h | 44 +- src/Common/OpenTelemetryTraceContext.cpp | 45 +- .../FunctionGenerateRandomStructure.cpp | 457 ++++++++++++++++++ .../FunctionGenerateRandomStructure.h | 45 ++ 5 files changed, 561 insertions(+), 48 deletions(-) create mode 100644 src/Functions/FunctionGenerateRandomStructure.cpp create mode 100644 src/Functions/FunctionGenerateRandomStructure.h diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index c9a759eab9c..6f9dce0b042 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -103,11 +103,19 @@ set (SRCS_CONTEXT ) if (ARCH_AARCH64) - set (SRCS_CONTEXT ${SRCS_CONTEXT} - "${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S" - "${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S" - "${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S" - ) + if (OS_DARWIN) + set (SRCS_CONTEXT ${SRCS_CONTEXT} + "${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_macho_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_macho_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_macho_gas.S" + ) + else() + set (SRCS_CONTEXT ${SRCS_CONTEXT} + 
"${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S" + "${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S" + ) + endif() elseif (ARCH_PPC64LE) set (SRCS_CONTEXT ${SRCS_CONTEXT} "${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S" diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index 1c2f758504a..f87abd7eb9b 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -79,8 +79,8 @@ public: ERROR = 4, }; #endif - static FiberInfo getCurrentFiberInfo(); + static FiberInfo getCurrentFiberInfo(); protected: /// Method that is called in resume() before actual fiber resuming. /// If it returns false, resume() will return immediately without actual fiber resuming. @@ -124,6 +124,48 @@ private: std::unique_ptr task; }; +/// Simple implementation for fiber local variable. +template +struct FiberLocal +{ +public: + FiberLocal() + { + /// Initialize main instance for this thread. + /// Contexts for fibers will inherit this instance + /// (it could be changed before creating fibers). + data[nullptr] = T(); + } + + T & operator*() + { + return get(); + } + + T * operator->() + { + return &get(); + } + +private: + T & get() + { + /// Get instance for current fiber. + return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); + } + + T & getInstanceForFiber(FiberInfo info) + { + auto it = data.find(info.fiber); + /// If it's the first request, we need to initialize instance for the fiber using instance from parent fiber. + if (it == data.end()) + it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first; + return it->second; + } + + std::unordered_map data; +}; + String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description); } diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 8cf4879c1e2..40d06e71456 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -14,48 +14,9 @@ namespace DB namespace OpenTelemetry { -/// This code can be executed inside several fibers in one thread, -/// we should use fiber local tracing context. -struct FiberLocalTracingContextOnThread -{ -public: - FiberLocalTracingContextOnThread() - { - /// Initialize main context for this thread. - /// Contexts for fibers will inherit this main context. - data[nullptr] = TracingContextOnThread(); - } - - TracingContextOnThread & operator*() - { - return get(); - } - - TracingContextOnThread * operator->() - { - return &get(); - } - -private: - TracingContextOnThread & get() - { - /// Get context for current fiber. - return getContextForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); - } - - TracingContextOnThread & getContextForFiber(FiberInfo info) - { - auto it = data.find(info.fiber); - /// If it's the first request, we need to initialize context for the fiber using context from parent fiber. - if (it == data.end()) - it = data.insert({info.fiber, getContextForFiber(*info.parent_fiber_info)}).first; - return it->second; - } - - std::unordered_map data; -}; - -thread_local FiberLocalTracingContextOnThread current_fiber_trace_context; +///// This code can be executed inside several fibers in one thread, +///// we should use fiber local tracing context. 
+thread_local FiberLocal current_fiber_trace_context; bool Span::addAttribute(std::string_view name, UInt64 value) noexcept { diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp new file mode 100644 index 00000000000..023a73fe147 --- /dev/null +++ b/src/Functions/FunctionGenerateRandomStructure.cpp @@ -0,0 +1,457 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; +} + +class FunctionGenerateRandomStructure : public IFunction +{ +private: + static constexpr std::array simple_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, + }; + + static constexpr std::array complex_types + { + TypeIndex::Nullable, + TypeIndex::LowCardinality, + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + }; + + static constexpr std::array map_key_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::Enum8, + TypeIndex::Enum16, + TypeIndex::UUID, + TypeIndex::LowCardinality, + }; + + static constexpr std::array suspicious_lc_types + { + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Float32, + TypeIndex::Float64, + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::String, + TypeIndex::FixedString, + TypeIndex::IPv4, + TypeIndex::IPv6, + TypeIndex::UUID, + }; + + static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; + static constexpr size_t MAX_TUPLE_ELEMENTS = 16; + static constexpr size_t MAX_DATETIME64_PRECISION = 9; + static constexpr size_t MAX_DECIMAL32_PRECISION = 9; + static constexpr size_t MAX_DECIMAL64_PRECISION = 18; + static constexpr size_t MAX_DECIMAL128_PRECISION = 38; + static constexpr size_t MAX_DECIMAL256_PRECISION = 76; + static constexpr size_t MAX_DEPTH = 32; + +public: + static constexpr auto name = "generateRandomStructure"; + + explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) + { + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const 
override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", + getName(), arguments.size()); + + + for (size_t i = 0; i != arguments.size(); ++i) + { + if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", + arguments[i]->getName(), + i + 1, + getName()); + } + } + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + size_t seed = randomSeed(); + size_t number_of_columns = 0; + + if (!arguments.empty() && !arguments[0].column->onlyNull()) + { + number_of_columns = arguments[0].column->getUInt(0); + if (number_of_columns > MAX_NUMBER_OF_COLUMNS) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Maximum allowed number of columns is {}, got {}", + MAX_NUMBER_OF_COLUMNS, + number_of_columns); + } + + if (arguments.size() > 1 && !arguments[1].column->onlyNull()) + seed = arguments[1].column->getUInt(0); + + pcg64 rng(seed); + if (number_of_columns == 0) + number_of_columns = generateNumberOfColumns(rng); + + auto col_res = ColumnString::create(); + auto & string_column = assert_cast(*col_res); + auto & chars = string_column.getChars(); + WriteBufferFromVector buf(chars); + writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types); + buf.finalize(); + chars.push_back(0); + string_column.getOffsets().push_back(chars.size()); + return ColumnConst::create(std::move(col_res), input_rows_count); + } + + static String getRandomStructure(size_t seed, const ContextPtr & context) + { + pcg64 rng(seed); + size_t number_of_columns = generateNumberOfColumns(rng); + WriteBufferFromOwnString buf; + writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types); + return buf.str(); + } + +private: + + static size_t generateNumberOfColumns(pcg64 & rng) + { + return rng() % MAX_NUMBER_OF_COLUMNS + 1; + } + + static void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + for (size_t i = 0; i != number_of_columns; ++i) + { + if (i != 0) + writeCString(", ", buf); + String column_name = "c" + std::to_string(i + 1); + writeString(column_name, buf); + writeChar(' ', buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types); + } + } + + template + static void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0) + { + if (allow_complex_types && depth > MAX_DEPTH) + writeRandomType(column_name, rng, buf, depth); + + constexpr auto all_types = getAllTypes(); + auto type = all_types[rng() % 
all_types.size()]; + + switch (type) + { + case TypeIndex::UInt8: + if (rng() % 2) + writeCString("UInt8", buf); + else + writeCString("Bool", buf); + return; + case TypeIndex::FixedString: + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + return; + case TypeIndex::DateTime64: + writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal32: + writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal64: + writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal128: + writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Decimal256: + writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf); + return; + case TypeIndex::Enum8: + writeCString("Enum8(", buf); + writeEnumValues(column_name, rng, buf, INT8_MAX); + writeChar(')', buf); + return; + case TypeIndex::Enum16: + writeCString("Enum16(", buf); + writeEnumValues(column_name, rng, buf, INT16_MAX); + writeChar(')', buf); + return; + case TypeIndex::LowCardinality: + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); + writeChar(')', buf); + return; + case TypeIndex::Nullable: + { + writeCString("Nullable(", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Array: + { + writeCString("Array(", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Map: + { + writeCString("Map(", buf); + writeMapKeyType(rng, buf, allow_suspicious_lc_types); + writeCString(", ", buf); + writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); + writeChar(')', buf); + return; + } + case TypeIndex::Tuple: + { + size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; + bool generate_nested = rng() % 2; + bool generate_named_tuple = rng() % 2; + if (generate_nested) + writeCString("Nested(", buf); + else + writeCString("Tuple(", buf); + + for (size_t i = 0; i != elements; ++i) + { + if (i != 0) + writeCString(", ", buf); + + String element_name = "e" + std::to_string(i + 1); + if (generate_named_tuple || generate_nested) + { + writeString(element_name, buf); + writeChar(' ', buf); + } + writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1); + } + writeChar(')', buf); + return; + } + default: + writeString(magic_enum::enum_name(type), buf); + return; + } + } + + static void writeMapKeyType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + TypeIndex type = map_key_types[rng() % map_key_types.size()]; + if (type == TypeIndex::FixedString) + { + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + } + else if (type == TypeIndex::LowCardinality) + { + writeCString("LowCardinality(", buf); + writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); + writeChar(')', buf); + } + else + { + writeString(magic_enum::enum_name(type), buf); + } + } + + static void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) + { + bool make_nullable = rng() % 2; + if (make_nullable) + writeCString("Nullable(", buf); + + 
if (allow_suspicious_lc_types) + { + TypeIndex type = suspicious_lc_types[rng() % map_key_types.size()]; + if (type == TypeIndex::FixedString) + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + else + writeString(magic_enum::enum_name(type), buf); + } + else + { + /// Support only String and FixedString. + if (rng() % 2) + writeCString("String", buf); + else + writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); + } + + if (make_nullable) + writeChar(')', buf); + } + + static void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) + { + /// Don't generate big enums, because it will lead to really big result + /// and slowness of this function, and it can lead to `Max query size exceeded` + /// while using this function with generateRandom. + size_t num_values = rng() % 16 + 1; + std::vector values(num_values); + + /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1] + for (Int16 & x : values) + x = rng() % (2 * (max_value + 1) - num_values) - max_value - 1; + /// Make all numbers unique. + std::sort(values.begin(), values.end()); + for (size_t i = 0; i < num_values; ++i) + values[i] += i; + std::shuffle(values.begin(), values.end(), rng); + for (size_t i = 0; i != num_values; ++i) + { + if (i != 0) + writeCString(", ", buf); + writeString("'" + column_name + "V" + std::to_string(values[i]) + "' = " + std::to_string(i), buf); + } + } + + template + static constexpr auto getAllTypes() + { + constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; + constexpr size_t result_size = simple_types.size() + complex_types_size; + std::array result; + size_t index = 0; + + for (size_t i = 0; i != simple_types.size(); ++i, ++index) + result[index] = simple_types[i]; + + for (size_t i = 0; i != complex_types_size; ++i, ++index) + result[index] = complex_types[i]; + + return result; + } + + bool allow_suspicious_lc_types; +}; + + +REGISTER_FUNCTION(GenerateRandomStructure) +{ + factory.registerFunction( + { + R"( +Generates a random table structure. +This function takes 2 optional constant arguments: +the number of columns in the result structure (random by default) and random seed (random by default) +The maximum number of columns is 128. +The function returns a value of type String. 
+)", + Documentation::Examples{ + {"random", "SELECT generateRandomStructure()"}, + {"with specified number of columns", "SELECT generateRandomStructure(10)"}, + {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, + }, + Documentation::Categories{"Random"} + }, + FunctionFactory::CaseSensitive); +} + +} diff --git a/src/Functions/FunctionGenerateRandomStructure.h b/src/Functions/FunctionGenerateRandomStructure.h new file mode 100644 index 00000000000..1d1bcb1a0a8 --- /dev/null +++ b/src/Functions/FunctionGenerateRandomStructure.h @@ -0,0 +1,45 @@ +#include +#include + +#include + +namespace DB +{ + +class FunctionGenerateRandomStructure : public IFunction +{ +public: + static constexpr auto name = "generateRandomStructure"; + + explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) + { + } + + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } + bool useDefaultImplementationForConstants() const override { return false; } + bool useDefaultImplementationForNulls() const override { return false; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override; + + static String generateRandomStructure(size_t seed, const ContextPtr & context); + +private: + bool allow_suspicious_lc_types; +}; + +} From 78064d062266559b9032d328b84a22e5f60b75d7 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 15 May 2023 15:52:14 +0000 Subject: [PATCH 058/127] Better comments --- src/Common/AsyncTaskExecutor.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Common/AsyncTaskExecutor.h b/src/Common/AsyncTaskExecutor.h index f87abd7eb9b..55dc2913c13 100644 --- a/src/Common/AsyncTaskExecutor.h +++ b/src/Common/AsyncTaskExecutor.h @@ -131,9 +131,9 @@ struct FiberLocal public: FiberLocal() { - /// Initialize main instance for this thread. - /// Contexts for fibers will inherit this instance - /// (it could be changed before creating fibers). + /// Initialize main instance for this thread. Instances for fibers will inherit it, + /// (it's needed because main instance could be changed before creating fibers + /// and changes should be visible in fibers). data[nullptr] = T(); } @@ -150,14 +150,14 @@ public: private: T & get() { - /// Get instance for current fiber. return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo()); } T & getInstanceForFiber(FiberInfo info) { auto it = data.find(info.fiber); - /// If it's the first request, we need to initialize instance for the fiber using instance from parent fiber. + /// If it's the first request, we need to initialize instance for the fiber + /// using instance from parent fiber or main thread that created fiber. 
if (it == data.end()) it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first; return it->second; From 05ae7b2c2d7337c2ade92634f859718061a31064 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 May 2023 18:28:12 +0200 Subject: [PATCH 059/127] fix some tests --- src/Interpreters/MutationsInterpreter.cpp | 1 + src/Interpreters/MutationsInterpreter.h | 4 +-- .../MergeTree/MutateFromLogEntryTask.cpp | 5 ++- src/Storages/MergeTree/MutateTask.cpp | 30 +++++++++++++++-- .../ReplicatedMergeTreeMutationEntry.cpp | 9 +++++ .../ReplicatedMergeTreeMutationEntry.h | 2 ++ .../MergeTree/ReplicatedMergeTreeQueue.cpp | 11 ++++--- .../MergeTree/ReplicatedMergeTreeQueue.h | 3 +- src/Storages/StorageReplicatedMergeTree.cpp | 9 +++-- tests/clickhouse-test | 8 ++--- .../test_default_compression_codec/test.py | 4 +++ .../02440_mutations_finalization.reference | 5 +++ .../02440_mutations_finalization.sql | 33 +++++++++++++++++++ ...441_alter_delete_and_drop_column.reference | 2 ++ .../02441_alter_delete_and_drop_column.sql | 20 +++++++++++ 15 files changed, 130 insertions(+), 16 deletions(-) create mode 100644 tests/queries/0_stateless/02440_mutations_finalization.reference create mode 100644 tests/queries/0_stateless/02440_mutations_finalization.sql create mode 100644 tests/queries/0_stateless/02441_alter_delete_and_drop_column.reference create mode 100644 tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 2f05ba5d7f8..2f9e647b99e 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -542,6 +542,7 @@ void MutationsInterpreter::prepare(bool dry_run) if (commands.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty mutation commands list"); + /// TODO Should we get columns, indices and projections from the part itself? Table metadata may be different const ColumnsDescription & columns_desc = metadata_snapshot->getColumns(); const IndicesDescription & indices_desc = metadata_snapshot->getSecondaryIndices(); const ProjectionsDescription & projections_desc = metadata_snapshot->getProjections(); diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 4b965546408..8bb90365050 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -50,7 +50,7 @@ public: bool return_all_columns_ = false, bool return_mutated_rows_ = false); - /// Special case for MergeTree + /// Special case for *MergeTree MutationsInterpreter( MergeTreeData & storage_, MergeTreeData::DataPartPtr source_part_, @@ -123,7 +123,7 @@ public: private: StoragePtr storage; - /// Special case for MergeTree. + /// Special case for *MergeTree. 
MergeTreeData * data = nullptr; MergeTreeData::DataPartPtr part; }; diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 35a503c658d..d0b85ee65b8 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -90,7 +90,10 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() } new_part_info = MergeTreePartInfo::fromPartName(entry.new_part_name, storage.format_version); - commands = std::make_shared(storage.queue.getMutationCommands(source_part, new_part_info.mutation)); + Strings mutation_ids; + commands = std::make_shared(storage.queue.getMutationCommands(source_part, new_part_info.mutation, mutation_ids)); + LOG_TRACE(log, "Mutating part {} with mutation commands from {} mutations ({}): {}", + entry.new_part_name, commands->size(), fmt::join(mutation_ids, ", "), commands->toString()); /// Once we mutate part, we must reserve space on the same disk, because mutations can possibly create hardlinks. /// Can throw an exception. diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index a3fa210ac42..27e5319ed4f 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -58,7 +58,9 @@ static void splitAndModifyMutationCommands( MergeTreeData::DataPartPtr part, const MutationCommands & commands, MutationCommands & for_interpreter, - MutationCommands & for_file_renames) + MutationCommands & for_file_renames, + const StorageMetadataPtr & table_metadata_snapshot, + Poco::Logger * log) { auto part_columns = part->getColumnsDescription(); @@ -142,6 +144,29 @@ static void splitAndModifyMutationCommands( { if (!mutated_columns.contains(column.name)) { + if (!table_metadata_snapshot->getColumns().has(column.name)) + { + /// We cannot add the column because there's no such column in table. + /// It's okay if the column was dropped. It may also absent in dropped_columns + /// if the corresponding MUTATE_PART entry was not created yet or was created separately from current MUTATE_PART. + /// But we don't know for sure what happened. + auto part_metadata_version = part->getMetadataVersion(); + auto table_metadata_version = table_metadata_snapshot->getMetadataVersion(); + if (table_metadata_version <= part_metadata_version) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} with metadata version {} contains column {} that is absent " + "in table {} with metadata version {}", + part->name, part_metadata_version, column.name, + part->storage.getStorageID().getNameForLogs(), table_metadata_version); + + if (part_metadata_version < table_metadata_version) + { + LOG_WARNING(log, "Ignoring column {} from part {} with metadata version {} because there is no such column " + "in table {} with metadata version {}. 
Assuming the column was dropped", column.name, part->name, + part_metadata_version, part->storage.getStorageID().getNameForLogs(), table_metadata_version); + continue; + } + } + for_interpreter.emplace_back( MutationCommand{.type = MutationCommand::Type::READ_COLUMN, .column_name = column.name, .data_type = column.type}); } @@ -1776,7 +1801,8 @@ bool MutateTask::prepare() context_for_reading->setSetting("allow_asynchronous_read_from_io_pool_for_merge_tree", false); context_for_reading->setSetting("max_streams_for_merge_tree_reading", Field(0)); - MutationHelpers::splitAndModifyMutationCommands(ctx->source_part, ctx->commands_for_part, ctx->for_interpreter, ctx->for_file_renames); + MutationHelpers::splitAndModifyMutationCommands(ctx->source_part, ctx->commands_for_part, ctx->for_interpreter, + ctx->for_file_renames, ctx->metadata_snapshot, ctx->log); ctx->stage_progress = std::make_unique(1.0); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.cpp index 1efb3f6826b..5e01cd96f6b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.cpp @@ -99,4 +99,13 @@ std::shared_ptr ReplicatedMergeTreeMutationEntry::backup() c return std::make_shared(out.str()); } + +String ReplicatedMergeTreeMutationEntry::getBlockNumbersForLogs() const +{ + WriteBufferFromOwnString out; + for (const auto & kv : block_numbers) + out << kv.first << " = " << kv.second << "; "; + return out.str(); +} + } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h b/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h index 8c359a57279..09787bd1731 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h @@ -51,6 +51,8 @@ struct ReplicatedMergeTreeMutationEntry bool isAlterMutation() const { return alter_version != -1; } std::shared_ptr backup() const; + + String getBlockNumbersForLogs() const; }; using ReplicatedMergeTreeMutationEntryPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 1762c7aabe9..9f7ae3222a4 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -955,13 +955,14 @@ void ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, C const String & partition_id = pair.first; Int64 block_num = pair.second; mutations_by_partition[partition_id].emplace(block_num, &mutation); - LOG_TRACE(log, "Adding mutation {} for partition {} for all block numbers less than {}", entry->znode_name, partition_id, block_num); } + LOG_TRACE(log, "Adding mutation {} for {} partitions (data versions: {})", + entry->znode_name, entry->block_numbers.size(), entry->getBlockNumbersForLogs()); /// Initialize `mutation.parts_to_do`. We cannot use only current_parts + virtual_parts here so we /// traverse all the queue and build correct state of parts_to_do. 
auto queue_representation = getQueueRepresentation(queue, format_version); - mutation.parts_to_do = getPartNamesToMutate(*entry, virtual_parts, queue_representation, format_version); + mutation.parts_to_do = getPartNamesToMutate(*entry, current_parts, queue_representation, format_version); if (mutation.parts_to_do.size() == 0) some_mutations_are_probably_done = true; @@ -1801,7 +1802,7 @@ std::map ReplicatedMergeTreeQueue::getAlterMutationCo } MutationCommands ReplicatedMergeTreeQueue::getMutationCommands( - const MergeTreeData::DataPartPtr & part, Int64 desired_mutation_version) const + const MergeTreeData::DataPartPtr & part, Int64 desired_mutation_version, Strings & mutation_ids) const { /// NOTE: If the corresponding mutation is not found, the error is logged (and not thrown as an exception) /// to allow recovering from a mutation that cannot be executed. This way you can delete the mutation entry @@ -1840,6 +1841,8 @@ MutationCommands ReplicatedMergeTreeQueue::getMutationCommands( MutationCommands commands; for (auto it = begin; it != end; ++it) { + chassert(mutation_pointer < it->second->entry->znode_name); + mutation_ids.push_back(it->second->entry->znode_name); const auto & commands_from_entry = it->second->entry->commands; commands.insert(commands.end(), commands_from_entry.begin(), commands_from_entry.end()); } @@ -2600,7 +2603,7 @@ void ReplicatedMergeTreeQueue::removeCurrentPartsFromMutations() { std::lock_guard state_lock(state_mutex); for (const auto & part_name : current_parts.getParts()) - removeCoveredPartsFromMutations(part_name, /*remove_part = */ true, /*remove_covered_parts = */ true); + removeCoveredPartsFromMutations(part_name, /*remove_part = */ false, /*remove_covered_parts = */ true); } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index 368f2d4bc1f..3fefc341bbc 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -392,7 +392,8 @@ public: /// Returns functor which used by MergeTreeMergerMutator to select parts for merge ReplicatedMergeTreeMergePredicate getMergePredicate(zkutil::ZooKeeperPtr & zookeeper, PartitionIdsHint && partition_ids_hint); - MutationCommands getMutationCommands(const MergeTreeData::DataPartPtr & part, Int64 desired_mutation_version) const; + MutationCommands getMutationCommands(const MergeTreeData::DataPartPtr & part, Int64 desired_mutation_version, + Strings & mutation_ids) const; /// Return mutation commands for part which could be not applied to /// it according to part mutation version. 
Used when we apply alter commands on fly, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b5e53950a02..5ca97f06cde 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5265,12 +5265,12 @@ void StorageReplicatedMergeTree::alter( fs::path(zookeeper_path) / "log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential)); PartitionBlockNumbersHolder partition_block_numbers_holder; + ReplicatedMergeTreeMutationEntry mutation_entry; if (have_mutation) { delayMutationOrThrowIfNeeded(&partial_shutdown_event, query_context); const String mutations_path(fs::path(zookeeper_path) / "mutations"); - ReplicatedMergeTreeMutationEntry mutation_entry; mutation_entry.alter_version = new_metadata_version; mutation_entry.source_replica = replica_name; mutation_entry.commands = std::move(maybe_mutation_commands); @@ -5322,12 +5322,16 @@ void StorageReplicatedMergeTree::alter( /// ReplicatedMergeTreeMutationEntry record in /mutations String mutation_path = dynamic_cast(*results[mutation_path_idx]).path_created; mutation_znode = mutation_path.substr(mutation_path.find_last_of('/') + 1); + LOG_DEBUG(log, "Created log entry {} to update table metadata to version {}, created a mutation {} (data versions: {})", + alter_entry->znode_name, alter_entry->alter_version, *mutation_znode, mutation_entry.getBlockNumbersForLogs()); } else { /// ALTER_METADATA record in replication /log String alter_path = dynamic_cast(*results[alter_path_idx]).path_created; alter_entry->znode_name = alter_path.substr(alter_path.find_last_of('/') + 1); + LOG_DEBUG(log, "Created log entry {} to update table metadata to version {}", + alter_entry->znode_name, alter_entry->alter_version); } break; } @@ -6493,7 +6497,8 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, Conte const String & path_created = dynamic_cast(responses[1].get())->path_created; mutation_entry.znode_name = path_created.substr(path_created.find_last_of('/') + 1); - LOG_TRACE(log, "Created mutation with ID {}", mutation_entry.znode_name); + LOG_TRACE(log, "Created mutation with ID {} (data versions: {})", + mutation_entry.znode_name, mutation_entry.getBlockNumbersForLogs()); break; } else if (rc == Coordination::Error::ZBADVERSION) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index e279b899a93..cd9dde28fea 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -286,11 +286,11 @@ def get_processlist_with_stacktraces(args): -- NOTE: view() here to do JOIN on shards, instead of initiator FROM clusterAllReplicas('test_cluster_database_replicated', view( SELECT + p.*, arrayStringConcat(groupArray('Thread ID ' || toString(s.thread_id) || '\n' || arrayStringConcat(arrayMap( x -> concat(addressToLine(x), '::', demangle(addressToSymbol(x))), s.trace), '\n') AS stacktrace - )) AS stacktraces, - p.* + )) AS stacktraces FROM system.processes p JOIN system.stack_trace s USING (query_id) WHERE query NOT LIKE '%system.processes%' @@ -307,11 +307,11 @@ def get_processlist_with_stacktraces(args): args, """ SELECT + p.*, arrayStringConcat(groupArray('Thread ID ' || toString(s.thread_id) || '\n' || arrayStringConcat(arrayMap( x -> concat(addressToLine(x), '::', demangle(addressToSymbol(x))), s.trace), '\n') AS stacktrace - )) AS stacktraces, - p.* + )) AS stacktraces FROM system.processes p JOIN system.stack_trace s USING (query_id) WHERE query NOT LIKE '%system.processes%' diff --git 
a/tests/integration/test_default_compression_codec/test.py b/tests/integration/test_default_compression_codec/test.py index 5d033ac8f7e..c7c30f5eea4 100644 --- a/tests/integration/test_default_compression_codec/test.py +++ b/tests/integration/test_default_compression_codec/test.py @@ -262,6 +262,8 @@ def test_default_codec_multiple(start_cluster): ) ) + node2.query("SYSTEM SYNC REPLICA compression_table_multiple", timeout=15) + # Same codec for all assert ( get_compression_codec_byte(node1, "compression_table_multiple", "1_0_0_0") @@ -330,6 +332,8 @@ def test_default_codec_multiple(start_cluster): node1.query("OPTIMIZE TABLE compression_table_multiple FINAL") + node2.query("SYSTEM SYNC REPLICA compression_table_multiple", timeout=15) + assert ( get_compression_codec_byte(node1, "compression_table_multiple", "1_0_0_1") == CODECS_MAPPING["Multiple"] diff --git a/tests/queries/0_stateless/02440_mutations_finalization.reference b/tests/queries/0_stateless/02440_mutations_finalization.reference new file mode 100644 index 00000000000..a8b9c2acdce --- /dev/null +++ b/tests/queries/0_stateless/02440_mutations_finalization.reference @@ -0,0 +1,5 @@ +0000000000 UPDATE n = 2 WHERE n = 1 ['all_0_0_0'] 0 +1 +0000000000 UPDATE n = 2 WHERE n = 1 ['all_0_0_0'] 0 +2 +0000000000 UPDATE n = 2 WHERE n = 1 [] 1 diff --git a/tests/queries/0_stateless/02440_mutations_finalization.sql b/tests/queries/0_stateless/02440_mutations_finalization.sql new file mode 100644 index 00000000000..796dcde8e4e --- /dev/null +++ b/tests/queries/0_stateless/02440_mutations_finalization.sql @@ -0,0 +1,33 @@ + +create table mut (n int) engine=ReplicatedMergeTree('/test/02440/{database}/mut', '1') order by tuple(); +set insert_keeper_fault_injection_probability=0; +insert into mut values (1); +system stop merges mut; +alter table mut update n = 2 where n = 1; +-- it will create MUTATE_PART entry, but will not execute it + +select mutation_id, command, parts_to_do_names, is_done from system.mutations where database=currentDatabase() and table='mut'; + +-- merges (and mutations) will start again after detach/attach, we need to avoid this somehow... 
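+-- the workaround below keeps a slow mutation running in an auxiliary table 'tmp' and caps max_number_of_mutations_for_replica, so the MUTATE_PART for 'mut' is not executed right after attach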
+create table tmp (n int) engine=MergeTree order by tuple() settings index_granularity=1; +insert into tmp select * from numbers(1000); +alter table tmp update n = sleepEachRow(1) where 1; +select sleepEachRow(2) as higher_probablility_of_reproducing_the_issue format Null; + +-- it will not execute MUTATE_PART, because another mutation is currently executing (in tmp) +alter table mut modify setting max_number_of_mutations_for_replica=1; +detach table mut; +attach table mut; + +-- mutation should not be finished yet +select * from mut; +select mutation_id, command, parts_to_do_names, is_done from system.mutations where database=currentDatabase() and table='mut'; + +alter table mut modify setting max_number_of_mutations_for_replica=100; +system sync replica mut; + +-- and now it should +select * from mut; +select mutation_id, command, parts_to_do_names, is_done from system.mutations where database=currentDatabase() and table='mut'; + +drop table tmp; -- btw, it will check that mutation can be cancelled between blocks on shutdown diff --git a/tests/queries/0_stateless/02441_alter_delete_and_drop_column.reference b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.reference new file mode 100644 index 00000000000..e9858167301 --- /dev/null +++ b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.reference @@ -0,0 +1,2 @@ +MUTATE_PART all_0_0_0_1 ['all_0_0_0'] +1 2 diff --git a/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql new file mode 100644 index 00000000000..d274fae1a4f --- /dev/null +++ b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql @@ -0,0 +1,20 @@ + +create table mut (n int, m int, k int) engine=ReplicatedMergeTree('/test/02441/{database}/mut', '1') order by n; +set insert_keeper_fault_injection_probability=0; +insert into mut values (1, 2, 3), (10, 20, 30); + +system stop merges mut; +alter table mut delete where n = 10; +alter table mut drop column k settings alter_sync=0; +system sync replica mut pull; + +-- a funny way to wait for ALTER_METADATA to disappear from the replication queue +select sleepEachRow(1) from url('http://localhost:8123/?param_tries={1..30}&query=' || encodeURLComponent( + 'select * from system.replication_queue where database=''' || currentDatabase() || ''' and table=''mut'' and type=''ALTER_METADATA''' + ), 'LineAsString', 's String') settings max_threads=1 format Null; + +select type, new_part_name, parts_to_merge from system.replication_queue where database=currentDatabase() and table='mut'; +system start merges mut; +set receive_timeout=30; +system sync replica mut; +select * from mut; From bfcaf95aed8b6f88a02c19c384db9a11a875961d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 18:32:54 +0200 Subject: [PATCH 060/127] Delete unneded files --- .../FunctionGenerateRandomStructure.h | 45 ------------------- 1 file changed, 45 deletions(-) delete mode 100644 src/Functions/FunctionGenerateRandomStructure.h diff --git a/src/Functions/FunctionGenerateRandomStructure.h b/src/Functions/FunctionGenerateRandomStructure.h deleted file mode 100644 index 1d1bcb1a0a8..00000000000 --- a/src/Functions/FunctionGenerateRandomStructure.h +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include - -#include - -namespace DB -{ - -class FunctionGenerateRandomStructure : public IFunction -{ -public: - static constexpr auto name = "generateRandomStructure"; - - explicit FunctionGenerateRandomStructure(bool 
allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) - { - } - - static FunctionPtr create(ContextPtr context) - { - return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); - } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } - bool useDefaultImplementationForConstants() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override; - - static String generateRandomStructure(size_t seed, const ContextPtr & context); - -private: - bool allow_suspicious_lc_types; -}; - -} From 900aca5f0a9c6c498b18cb1778e545b5f4d951f2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 May 2023 18:33:09 +0200 Subject: [PATCH 061/127] Delete unneded files --- .../FunctionGenerateRandomStructure.cpp | 457 ------------------ 1 file changed, 457 deletions(-) delete mode 100644 src/Functions/FunctionGenerateRandomStructure.cpp diff --git a/src/Functions/FunctionGenerateRandomStructure.cpp b/src/Functions/FunctionGenerateRandomStructure.cpp deleted file mode 100644 index 023a73fe147..00000000000 --- a/src/Functions/FunctionGenerateRandomStructure.cpp +++ /dev/null @@ -1,457 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int BAD_ARGUMENTS; -} - -class FunctionGenerateRandomStructure : public IFunction -{ -private: - static constexpr std::array simple_types - { - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Float32, - TypeIndex::Float64, - TypeIndex::Decimal32, - TypeIndex::Decimal64, - TypeIndex::Decimal128, - TypeIndex::Decimal256, - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::DateTime64, - TypeIndex::String, - TypeIndex::FixedString, - TypeIndex::Enum8, - TypeIndex::Enum16, - TypeIndex::IPv4, - TypeIndex::IPv6, - TypeIndex::UUID, - }; - - static constexpr std::array complex_types - { - TypeIndex::Nullable, - TypeIndex::LowCardinality, - TypeIndex::Array, - TypeIndex::Tuple, - TypeIndex::Map, - }; - - static constexpr std::array map_key_types - { - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::String, - TypeIndex::FixedString, - TypeIndex::IPv4, - TypeIndex::Enum8, - 
TypeIndex::Enum16, - TypeIndex::UUID, - TypeIndex::LowCardinality, - }; - - static constexpr std::array suspicious_lc_types - { - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Float32, - TypeIndex::Float64, - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::String, - TypeIndex::FixedString, - TypeIndex::IPv4, - TypeIndex::IPv6, - TypeIndex::UUID, - }; - - static constexpr size_t MAX_NUMBER_OF_COLUMNS = 128; - static constexpr size_t MAX_TUPLE_ELEMENTS = 16; - static constexpr size_t MAX_DATETIME64_PRECISION = 9; - static constexpr size_t MAX_DECIMAL32_PRECISION = 9; - static constexpr size_t MAX_DECIMAL64_PRECISION = 18; - static constexpr size_t MAX_DECIMAL128_PRECISION = 38; - static constexpr size_t MAX_DECIMAL256_PRECISION = 76; - static constexpr size_t MAX_DEPTH = 32; - -public: - static constexpr auto name = "generateRandomStructure"; - - explicit FunctionGenerateRandomStructure(bool allow_suspicious_lc_types_) : allow_suspicious_lc_types(allow_suspicious_lc_types_) - { - } - - static FunctionPtr create(ContextPtr context) - { - return std::make_shared(context->getSettingsRef().allow_suspicious_low_cardinality_types.value); - } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 0; } - - bool isVariadic() const override { return true; } - bool isDeterministic() const override { return false; } - bool isDeterministicInScopeOfQuery() const override { return false; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 1}; } - bool useDefaultImplementationForConstants() const override { return false; } - bool useDefaultImplementationForNulls() const override { return false; } - - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (arguments.size() > 2) - throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, expected from 0 to 2", - getName(), arguments.size()); - - - for (size_t i = 0; i != arguments.size(); ++i) - { - if (!isUnsignedInteger(arguments[i]) && !arguments[i]->onlyNull()) - { - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of the {} argument of function {}, expected unsigned integer or Null", - arguments[i]->getName(), - i + 1, - getName()); - } - } - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - size_t seed = randomSeed(); - size_t number_of_columns = 0; - - if (!arguments.empty() && !arguments[0].column->onlyNull()) - { - number_of_columns = arguments[0].column->getUInt(0); - if (number_of_columns > MAX_NUMBER_OF_COLUMNS) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Maximum allowed number of columns is {}, got {}", - MAX_NUMBER_OF_COLUMNS, - number_of_columns); - } - - if (arguments.size() > 1 && !arguments[1].column->onlyNull()) - seed = arguments[1].column->getUInt(0); - - pcg64 rng(seed); - if (number_of_columns == 0) - number_of_columns = generateNumberOfColumns(rng); - - auto col_res = ColumnString::create(); - auto & string_column = assert_cast(*col_res); - 
auto & chars = string_column.getChars(); - WriteBufferFromVector buf(chars); - writeRandomStructure(rng, number_of_columns, buf, allow_suspicious_lc_types); - buf.finalize(); - chars.push_back(0); - string_column.getOffsets().push_back(chars.size()); - return ColumnConst::create(std::move(col_res), input_rows_count); - } - - static String getRandomStructure(size_t seed, const ContextPtr & context) - { - pcg64 rng(seed); - size_t number_of_columns = generateNumberOfColumns(rng); - WriteBufferFromOwnString buf; - writeRandomStructure(rng, number_of_columns, buf, context->getSettingsRef().allow_suspicious_low_cardinality_types); - return buf.str(); - } - -private: - - static size_t generateNumberOfColumns(pcg64 & rng) - { - return rng() % MAX_NUMBER_OF_COLUMNS + 1; - } - - static void writeRandomStructure(pcg64 & rng, size_t number_of_columns, WriteBuffer & buf, bool allow_suspicious_lc_types) - { - for (size_t i = 0; i != number_of_columns; ++i) - { - if (i != 0) - writeCString(", ", buf); - String column_name = "c" + std::to_string(i + 1); - writeString(column_name, buf); - writeChar(' ', buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types); - } - } - - template - static void writeRandomType(const String & column_name, pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types, size_t depth = 0) - { - if (allow_complex_types && depth > MAX_DEPTH) - writeRandomType(column_name, rng, buf, depth); - - constexpr auto all_types = getAllTypes(); - auto type = all_types[rng() % all_types.size()]; - - switch (type) - { - case TypeIndex::UInt8: - if (rng() % 2) - writeCString("UInt8", buf); - else - writeCString("Bool", buf); - return; - case TypeIndex::FixedString: - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - return; - case TypeIndex::DateTime64: - writeString("DateTime64(" + std::to_string(rng() % MAX_DATETIME64_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal32: - writeString("Decimal32(" + std::to_string(rng() % MAX_DECIMAL32_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal64: - writeString("Decimal64(" + std::to_string(rng() % MAX_DECIMAL64_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal128: - writeString("Decimal128(" + std::to_string(rng() % MAX_DECIMAL128_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Decimal256: - writeString("Decimal256(" + std::to_string(rng() % MAX_DECIMAL256_PRECISION + 1) + ")", buf); - return; - case TypeIndex::Enum8: - writeCString("Enum8(", buf); - writeEnumValues(column_name, rng, buf, INT8_MAX); - writeChar(')', buf); - return; - case TypeIndex::Enum16: - writeCString("Enum16(", buf); - writeEnumValues(column_name, rng, buf, INT16_MAX); - writeChar(')', buf); - return; - case TypeIndex::LowCardinality: - writeCString("LowCardinality(", buf); - writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); - writeChar(')', buf); - return; - case TypeIndex::Nullable: - { - writeCString("Nullable(", buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); - writeChar(')', buf); - return; - } - case TypeIndex::Array: - { - writeCString("Array(", buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); - writeChar(')', buf); - return; - } - case TypeIndex::Map: - { - writeCString("Map(", buf); - writeMapKeyType(rng, buf, allow_suspicious_lc_types); - writeCString(", ", buf); - writeRandomType(column_name, rng, buf, allow_suspicious_lc_types, depth + 1); - 
writeChar(')', buf); - return; - } - case TypeIndex::Tuple: - { - size_t elements = rng() % MAX_TUPLE_ELEMENTS + 1; - bool generate_nested = rng() % 2; - bool generate_named_tuple = rng() % 2; - if (generate_nested) - writeCString("Nested(", buf); - else - writeCString("Tuple(", buf); - - for (size_t i = 0; i != elements; ++i) - { - if (i != 0) - writeCString(", ", buf); - - String element_name = "e" + std::to_string(i + 1); - if (generate_named_tuple || generate_nested) - { - writeString(element_name, buf); - writeChar(' ', buf); - } - writeRandomType(element_name, rng, buf, allow_suspicious_lc_types, depth + 1); - } - writeChar(')', buf); - return; - } - default: - writeString(magic_enum::enum_name(type), buf); - return; - } - } - - static void writeMapKeyType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) - { - TypeIndex type = map_key_types[rng() % map_key_types.size()]; - if (type == TypeIndex::FixedString) - { - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - } - else if (type == TypeIndex::LowCardinality) - { - writeCString("LowCardinality(", buf); - writeLowCardinalityNestedType(rng, buf, allow_suspicious_lc_types); - writeChar(')', buf); - } - else - { - writeString(magic_enum::enum_name(type), buf); - } - } - - static void writeLowCardinalityNestedType(pcg64 & rng, WriteBuffer & buf, bool allow_suspicious_lc_types) - { - bool make_nullable = rng() % 2; - if (make_nullable) - writeCString("Nullable(", buf); - - if (allow_suspicious_lc_types) - { - TypeIndex type = suspicious_lc_types[rng() % map_key_types.size()]; - if (type == TypeIndex::FixedString) - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - else - writeString(magic_enum::enum_name(type), buf); - } - else - { - /// Support only String and FixedString. - if (rng() % 2) - writeCString("String", buf); - else - writeString("FixedString(" + std::to_string(rng() % MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS + 1) + ")", buf); - } - - if (make_nullable) - writeChar(')', buf); - } - - static void writeEnumValues(const String & column_name, pcg64 & rng, WriteBuffer & buf, ssize_t max_value) - { - /// Don't generate big enums, because it will lead to really big result - /// and slowness of this function, and it can lead to `Max query size exceeded` - /// while using this function with generateRandom. - size_t num_values = rng() % 16 + 1; - std::vector values(num_values); - - /// Generate random numbers from range [-(max_value + 1), max_value - num_values + 1] - for (Int16 & x : values) - x = rng() % (2 * (max_value + 1) - num_values) - max_value - 1; - /// Make all numbers unique. 
- std::sort(values.begin(), values.end()); - for (size_t i = 0; i < num_values; ++i) - values[i] += i; - std::shuffle(values.begin(), values.end(), rng); - for (size_t i = 0; i != num_values; ++i) - { - if (i != 0) - writeCString(", ", buf); - writeString("'" + column_name + "V" + std::to_string(values[i]) + "' = " + std::to_string(i), buf); - } - } - - template - static constexpr auto getAllTypes() - { - constexpr size_t complex_types_size = complex_types.size() * allow_complex_types; - constexpr size_t result_size = simple_types.size() + complex_types_size; - std::array result; - size_t index = 0; - - for (size_t i = 0; i != simple_types.size(); ++i, ++index) - result[index] = simple_types[i]; - - for (size_t i = 0; i != complex_types_size; ++i, ++index) - result[index] = complex_types[i]; - - return result; - } - - bool allow_suspicious_lc_types; -}; - - -REGISTER_FUNCTION(GenerateRandomStructure) -{ - factory.registerFunction( - { - R"( -Generates a random table structure. -This function takes 2 optional constant arguments: -the number of columns in the result structure (random by default) and random seed (random by default) -The maximum number of columns is 128. -The function returns a value of type String. -)", - Documentation::Examples{ - {"random", "SELECT generateRandomStructure()"}, - {"with specified number of columns", "SELECT generateRandomStructure(10)"}, - {"with specified seed", "SELECT generateRandomStructure(10, 42)"}, - }, - Documentation::Categories{"Random"} - }, - FunctionFactory::CaseSensitive); -} - -} From 4137a5e0582041b0f7fcb388156153c4eb5e360a Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 15 May 2023 18:51:16 +0200 Subject: [PATCH 062/127] use chassert in MergeTreeDeduplicationLog to have better log info --- src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp index b843ce6a078..ac03b0be779 100644 --- a/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp +++ b/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp @@ -227,7 +227,7 @@ std::pair MergeTreeDeduplicationLog::addPart(const std: return std::make_pair(info, false); } - assert(current_writer != nullptr); + chassert(current_writer != nullptr); /// Create new record MergeTreeDeduplicationLogRecord record; @@ -257,7 +257,7 @@ void MergeTreeDeduplicationLog::dropPart(const MergeTreePartInfo & drop_part_inf if (deduplication_window == 0) return; - assert(current_writer != nullptr); + chassert(current_writer != nullptr); for (auto itr = deduplication_map.begin(); itr != deduplication_map.end(); /* no increment here, we erasing from map */) { From 91db14851397cccb166bc96f4d335dca5928640e Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 4 May 2023 21:28:33 +0000 Subject: [PATCH 063/127] Fix AsynchronousReadIndirectBufferFromRemoteFS breaking on short seeks --- ...chronousReadIndirectBufferFromRemoteFS.cpp | 28 +++++++++++-------- ...ynchronousReadIndirectBufferFromRemoteFS.h | 4 ++- .../0_stateless/02731_parquet_s3.reference | 1 + .../queries/0_stateless/02731_parquet_s3.sql | 7 +++++ 4 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 tests/queries/0_stateless/02731_parquet_s3.reference create mode 100644 tests/queries/0_stateless/02731_parquet_s3.sql diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp index 
bf9a476b785..24b7042e459 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp @@ -118,12 +118,7 @@ std::future AsynchronousReadIndirectBufferFromRemot request.size = size; request.offset = file_offset_of_buffer_end; request.priority = base_priority + priority; - - if (bytes_to_ignore) - { - request.ignore = bytes_to_ignore; - bytes_to_ignore = 0; - } + request.ignore = bytes_to_ignore; return reader.submit(request); } @@ -165,8 +160,7 @@ void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t pos void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilEnd() { - read_until_position = impl->getFileSize(); - impl->setReadUntilPosition(*read_until_position); + setReadUntilPosition(impl->getFileSize()); } @@ -228,12 +222,13 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl() chassert(memory.size() == read_settings.prefetch_buffer_size || memory.size() == read_settings.remote_fs_buffer_size); std::tie(size, offset) = impl->readInto(memory.data(), memory.size(), file_offset_of_buffer_end, bytes_to_ignore); - bytes_to_ignore = 0; ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedReads); ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedBytes, size); } + bytes_to_ignore = 0; + chassert(size >= offset); size_t bytes_read = size - offset; @@ -269,7 +264,7 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) } else if (whence == SEEK_CUR) { - new_pos = file_offset_of_buffer_end - (working_buffer.end() - pos) + offset; + new_pos = static_cast(getPosition()) + offset; } else { @@ -277,13 +272,15 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) } /// Position is unchanged. - if (new_pos + (working_buffer.end() - pos) == file_offset_of_buffer_end) + if (new_pos == static_cast(getPosition())) return new_pos; bool read_from_prefetch = false; while (true) { - if (file_offset_of_buffer_end - working_buffer.size() <= new_pos && new_pos <= file_offset_of_buffer_end) + /// The first condition implies bytes_to_ignore = 0. + if (!working_buffer.empty() && file_offset_of_buffer_end - working_buffer.size() <= new_pos && + new_pos <= file_offset_of_buffer_end) { /// Position is still inside the buffer. /// Probably it is at the end of the buffer - then we will load data on the following 'next' call. @@ -320,6 +317,7 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) /// First reset the buffer so the next read will fetch new data to the buffer. 
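+    /// Also forget bytes_to_ignore (cleared just below): once the buffer and any in-flight prefetch are discarded, a previously requested skip no longer applies and would make the next read skip valid data.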
resetWorkingBuffer(); + bytes_to_ignore = 0; if (read_until_position && new_pos > *read_until_position) { @@ -356,6 +354,12 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence) } +off_t AsynchronousReadIndirectBufferFromRemoteFS::getPosition() +{ + return file_offset_of_buffer_end - available() + bytes_to_ignore; +} + + void AsynchronousReadIndirectBufferFromRemoteFS::finalize() { resetPrefetch(FilesystemPrefetchState::UNNEEDED); diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index 49b44916a46..e8fb3fe248b 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -42,7 +42,7 @@ public: off_t seek(off_t offset_, int whence) override; - off_t getPosition() override { return file_offset_of_buffer_end - available(); } + off_t getPosition() override; String getFileName() const override; @@ -89,6 +89,8 @@ private: std::string current_reader_id; + /// If nonzero then working_buffer is empty. + /// If a prefetch is in flight, the prefetch task has been instructed to ignore this many bytes. size_t bytes_to_ignore = 0; std::optional read_until_position; diff --git a/tests/queries/0_stateless/02731_parquet_s3.reference b/tests/queries/0_stateless/02731_parquet_s3.reference new file mode 100644 index 00000000000..5a5aaeb0068 --- /dev/null +++ b/tests/queries/0_stateless/02731_parquet_s3.reference @@ -0,0 +1 @@ +12639441726720293784 diff --git a/tests/queries/0_stateless/02731_parquet_s3.sql b/tests/queries/0_stateless/02731_parquet_s3.sql new file mode 100644 index 00000000000..3c3f11f535b --- /dev/null +++ b/tests/queries/0_stateless/02731_parquet_s3.sql @@ -0,0 +1,7 @@ +-- Tags: no-fasttest +-- Tag no-fasttest: Depends on AWS + +-- Reading from s3 a parquet file of size between ~1 MB and ~2 MB was broken at some point. +insert into function s3(s3_conn, filename='test_02731_parquet_s3.parquet') select cityHash64(number) from numbers(170000) settings s3_truncate_on_insert=1; + +select sum(*) from s3(s3_conn, filename='test_02731_parquet_s3.parquet') settings remote_filesystem_read_method='threadpool', remote_filesystem_read_prefetch=1; From 65bc702b0bfa2dc01c76fc1ba10007eff980fdd7 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 May 2023 20:02:30 +0200 Subject: [PATCH 064/127] fix --- src/Storages/MergeTree/MutateTask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 27e5319ed4f..f7de6ed3d22 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -144,7 +144,7 @@ static void splitAndModifyMutationCommands( { if (!mutated_columns.contains(column.name)) { - if (!table_metadata_snapshot->getColumns().has(column.name)) + if (!table_metadata_snapshot->getColumns().has(column.name) && !part->storage.getVirtuals().contains(column.name)) { /// We cannot add the column because there's no such column in table. /// It's okay if the column was dropped. 
It may also absent in dropped_columns From cbc15bf35a61d86e2c38e467d8b0f2210aec8230 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Mon, 15 May 2023 23:13:17 +0200 Subject: [PATCH 065/127] Add `DynamicResourceManager` and `FairPolicy` into scheduling subsystem (#49671) * Add `DynamicResourceManager` and `FairPolicy` into scheduling subsystem * fix test * fix tidy build --- src/IO/Resource/DynamicResourceManager.cpp | 234 ++++++++++++++++++ src/IO/Resource/DynamicResourceManager.h | 93 +++++++ src/IO/Resource/FairPolicy.cpp | 13 + src/IO/Resource/FairPolicy.h | 232 +++++++++++++++++ src/IO/Resource/registerResourceManagers.cpp | 2 + src/IO/Resource/registerSchedulerNodes.cpp | 2 + .../tests/gtest_resource_class_fair.cpp | 187 ++++++++++++++ .../gtest_resource_manager_hierarchical.cpp | 116 +++++++++ src/Interpreters/Context.cpp | 2 +- 9 files changed, 880 insertions(+), 1 deletion(-) create mode 100644 src/IO/Resource/DynamicResourceManager.cpp create mode 100644 src/IO/Resource/DynamicResourceManager.h create mode 100644 src/IO/Resource/FairPolicy.cpp create mode 100644 src/IO/Resource/FairPolicy.h create mode 100644 src/IO/Resource/tests/gtest_resource_class_fair.cpp create mode 100644 src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp diff --git a/src/IO/Resource/DynamicResourceManager.cpp b/src/IO/Resource/DynamicResourceManager.cpp new file mode 100644 index 00000000000..49e12984e33 --- /dev/null +++ b/src/IO/Resource/DynamicResourceManager.cpp @@ -0,0 +1,234 @@ +#include + +#include +#include +#include + +#include +#include + +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int RESOURCE_ACCESS_DENIED; + extern const int RESOURCE_NOT_FOUND; + extern const int INVALID_SCHEDULER_NODE; +} + +DynamicResourceManager::State::State(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config) + : classifiers(config) +{ + Poco::Util::AbstractConfiguration::Keys keys; + const String config_prefix = "resources"; + config.keys(config_prefix, keys); + + // Create resource for every element under tag + for (const auto & key : keys) + { + resources.emplace(key, std::make_shared(key, event_queue, config, config_prefix + "." + key)); + } +} + +DynamicResourceManager::State::Resource::Resource( + const String & name, + EventQueue * event_queue, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix) +{ + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_prefix, keys); + + // Sort nodes by path to create parents before children + std::map path2key; + for (const auto & key : keys) + { + if (!startsWith(key, "node")) + continue; + String path = config.getString(config_prefix + "." 
+ key + "[@path]", ""); + if (path.empty()) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Attribute 'path' must be specified in all nodes for resource '{}'", name); + if (path[0] != '/') + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Path must start with '/' for resource '{}'", name); + if (auto [_, inserted] = path2key.emplace(path, key); !inserted) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Duplicate path '{}' for resource '{}'", path, name); + } + + // Create nodes + bool has_root = false; + for (auto [path, key] : path2key) + { + // Validate path + size_t slash = path.rfind('/'); + if (slash == String::npos) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Invalid scheduler node path '{}' for resource '{}'", path, name); + + // Create node + String basename = path.substr(slash + 1); // root name is empty string + auto [iter, _] = nodes.emplace(path, Node(basename, event_queue, config, config_prefix + "." + key)); + if (path == "/") + { + has_root = true; + continue; + } + + // Attach created node to parent (if not root) + // NOTE: resource root is attached to the scheduler using event queue for thread-safety + String parent_path = path.substr(0, slash); + if (parent_path.empty()) + parent_path = "/"; + if (auto parent = nodes.find(parent_path); parent != nodes.end()) + parent->second.ptr->attachChild(iter->second.ptr); + else + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "Parent node doesn't exist for path '{}' for resource '{}'", path, name); + } + + if (!has_root) + throw Exception(ErrorCodes::INVALID_SCHEDULER_NODE, "undefined root node path '/' for resource '{}'", name); +} + +DynamicResourceManager::State::Resource::~Resource() +{ + // NOTE: we should rely on `attached_to` and cannot use `parent`, + // NOTE: because `parent` can be `nullptr` in case attachment is still in event queue + if (attached_to != nullptr) + { + ISchedulerNode * root = nodes.find("/")->second.ptr.get(); + attached_to->event_queue->enqueue([scheduler = attached_to, root] + { + scheduler->removeChild(root); + }); + } +} + +DynamicResourceManager::State::Node::Node(const String & name, EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix) + : type(config.getString(config_prefix + ".type", "fifo")) + , ptr(SchedulerNodeFactory::instance().get(type, event_queue, config, config_prefix)) +{ + ptr->basename = name; +} + +bool DynamicResourceManager::State::Resource::equals(const DynamicResourceManager::State::Resource & o) const +{ + if (nodes.size() != o.nodes.size()) + return false; + + for (const auto & [path, o_node] : o.nodes) + { + auto iter = nodes.find(path); + if (iter == nodes.end()) + return false; + if (!iter->second.equals(o_node)) + return false; + } + + return true; +} + +bool DynamicResourceManager::State::Node::equals(const DynamicResourceManager::State::Node & o) const +{ + if (type != o.type) + return false; + return ptr->equals(o.ptr.get()); +} + +DynamicResourceManager::Classifier::Classifier(const DynamicResourceManager::StatePtr & state_, const String & classifier_name) + : state(state_) +{ + // State is immutable, but nodes are mutable and thread-safe + // So it's safe to obtain node pointers w/o lock + for (auto [resource_name, path] : state->classifiers.get(classifier_name)) + { + if (auto resource_iter = state->resources.find(resource_name); resource_iter != state->resources.end()) + { + const auto & resource = resource_iter->second; + if (auto node_iter = resource->nodes.find(path); 
node_iter != resource->nodes.end()) + { + if (auto * queue = dynamic_cast(node_iter->second.ptr.get())) + resources.emplace(resource_name, ResourceLink{.queue = queue}); + else + throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Unable to access non-queue node at path '{}' for resource '{}'", path, resource_name); + } + else + throw Exception(ErrorCodes::RESOURCE_NOT_FOUND, "Path '{}' for resource '{}' does not exist", path, resource_name); + } + else + resources.emplace(resource_name, ResourceLink{}); // resource not configured yet - use unlimited resource + } +} + +ResourceLink DynamicResourceManager::Classifier::get(const String & resource_name) +{ + if (auto iter = resources.find(resource_name); iter != resources.end()) + return iter->second; + else + throw Exception(ErrorCodes::RESOURCE_ACCESS_DENIED, "Access denied to resource '{}'", resource_name); +} + +DynamicResourceManager::DynamicResourceManager() + : state(new State()) +{ + scheduler.start(); +} + +void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfiguration & config) +{ + StatePtr new_state = std::make_shared(scheduler.event_queue, config); + + std::lock_guard lock{mutex}; + + // Resource update leads to loss of runtime data of nodes and may lead to temporary violation of constraints (e.g. limits) + // Try to minimise this by reusing "equal" resources (initialized with the same configuration). + for (auto & [name, new_resource] : new_state->resources) + { + if (auto iter = state->resources.find(name); iter != state->resources.end()) // Resource update + { + State::ResourcePtr old_resource = iter->second; + if (old_resource->equals(*new_resource)) + new_resource = old_resource; // Rewrite with older version to avoid loss of runtime data + } + } + + // Commit new state + // NOTE: dtor will detach from scheduler old resources that are not in use currently + state = new_state; + + // Attach new and updated resources to the scheduler + for (auto & [name, resource] : new_state->resources) + { + const SchedulerNodePtr & root = resource->nodes.find("/")->second.ptr; + if (root->parent == nullptr) + { + resource->attached_to = &scheduler; + scheduler.event_queue->enqueue([this, root] + { + scheduler.attachChild(root); + }); + } + } + + // NOTE: after mutex unlock `state` became available for Classifier(s) and must be immutable +} + +ClassifierPtr DynamicResourceManager::acquire(const String & classifier_name) +{ + // Acquire a reference to the current state + StatePtr state_; + { + std::lock_guard lock{mutex}; + state_ = state; + } + + return std::make_shared(state_, classifier_name); +} + +void registerDynamicResourceManager(ResourceManagerFactory & factory) +{ + factory.registerMethod("dynamic"); +} + +} diff --git a/src/IO/Resource/DynamicResourceManager.h b/src/IO/Resource/DynamicResourceManager.h new file mode 100644 index 00000000000..aa1147f1fb2 --- /dev/null +++ b/src/IO/Resource/DynamicResourceManager.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include + +#include + +namespace DB +{ + +/* + * Implementation of `IResourceManager` supporting arbitrary dynamic hierarchy of scheduler nodes. + * All resources are controlled by single root `SchedulerRoot`. + * + * State of manager is set of resources attached to the scheduler. States are referenced by classifiers. + * Classifiers are used (1) to access resources and (2) to keep shared ownership of resources with pending + * resource requests. 
This allows `ResourceRequest` and `ResourceLink` to hold raw pointers as long as + * `ClassifierPtr` is acquired and held. + * + * Manager can update configuration after initialization. During update, new version of resources are also + * attached to scheduler, so multiple version can coexist for a short perid. This will violate constraints + * (e.g. in-fly-limit), because different version have independent nodes to impose constraints, the same + * violation will apply to fairness. Old version exists as long as there is at least one classifier + * instance referencing it. Classifiers are typically attached to queries and will be destructed with them. + */ +class DynamicResourceManager : public IResourceManager +{ +public: + DynamicResourceManager(); + void updateConfiguration(const Poco::Util::AbstractConfiguration & config) override; + ClassifierPtr acquire(const String & classifier_name) override; + +private: + /// Holds everything required to work with one specific configuration + struct State + { + struct Node + { + String type; + SchedulerNodePtr ptr; + + Node( + const String & name, + EventQueue * event_queue, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix); + bool equals(const Node & o) const; + }; + + struct Resource + { + std::unordered_map nodes; // by path + SchedulerRoot * attached_to = nullptr; + + Resource( + const String & name, + EventQueue * event_queue, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix); + ~Resource(); // unregisters resource from scheduler + bool equals(const Resource & o) const; + }; + + using ResourcePtr = std::shared_ptr; + + std::unordered_map resources; // by name + ClassifiersConfig classifiers; + + State() = default; + explicit State(EventQueue * event_queue, const Poco::Util::AbstractConfiguration & config); + }; + + using StatePtr = std::shared_ptr; + + /// Created per query, holds State used by that query + class Classifier : public IClassifier + { + public: + Classifier(const StatePtr & state_, const String & classifier_name); + ResourceLink get(const String & resource_name) override; + private: + std::unordered_map resources; // accessible resources by names + StatePtr state; // hold state to avoid ResourceLink invalidation due to resource deregistration from SchedulerRoot + }; + +private: + SchedulerRoot scheduler; + std::mutex mutex; + StatePtr state; +}; + +} diff --git a/src/IO/Resource/FairPolicy.cpp b/src/IO/Resource/FairPolicy.cpp new file mode 100644 index 00000000000..248ff04cbd7 --- /dev/null +++ b/src/IO/Resource/FairPolicy.cpp @@ -0,0 +1,13 @@ +#include + +#include + +namespace DB +{ + +void registerFairPolicy(SchedulerNodeFactory & factory) +{ + factory.registerMethod("fair"); +} + +} diff --git a/src/IO/Resource/FairPolicy.h b/src/IO/Resource/FairPolicy.h new file mode 100644 index 00000000000..9c0c78f057c --- /dev/null +++ b/src/IO/Resource/FairPolicy.h @@ -0,0 +1,232 @@ +#pragma once + +#include +#include + +#include + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_SCHEDULER_NODE; +} + +/* + * Scheduler node that implements weight-based fair scheduling policy. + * Based on Start-time Fair Queueing (SFQ) algorithm. + * + * Algorithm description. + * Virtual runtime (total consumed cost divided by child weight) is tracked for every child. + * Active child with minimum vruntime is selected to be dequeued next. 
On activation, initial vruntime + * of a child is set to vruntime of "start" of the last request. This guarantees immediate processing + * of at least single request of newly activated children and thus best isolation and scheduling latency. + */ +class FairPolicy : public ISchedulerNode +{ + /// Scheduling state of a child + struct Item + { + ISchedulerNode * child = nullptr; + double vruntime = 0; /// total consumed cost divided by child weight + + /// For min-heap by vruntime + bool operator<(const Item & rhs) const noexcept + { + return vruntime > rhs.vruntime; + } + }; + +public: + explicit FairPolicy(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {}) + : ISchedulerNode(event_queue_, config, config_prefix) + {} + + bool equals(ISchedulerNode * other) override + { + if (auto * o = dynamic_cast(other)) + return true; + return false; + } + + void attachChild(const SchedulerNodePtr & child) override + { + // Take ownership + if (auto [it, inserted] = children.emplace(child->basename, child); !inserted) + throw Exception( + ErrorCodes::INVALID_SCHEDULER_NODE, + "Can't add another child with the same path: {}", + it->second->getPath()); + + // Attach + child->setParent(this); + + // At first attach as inactive child. + // Inactive attached child must have `info.parent.idx` equal it's index inside `items` array. + // This is needed to avoid later scanning through inactive `items` in O(N). Important optimization. + // NOTE: vruntime must be equal to `system_vruntime` for fairness. + child->info.parent.idx = items.size(); + items.emplace_back(Item{child.get(), system_vruntime}); + + // Activate child if it is not empty + if (child->isActive()) + activateChildImpl(items.size() - 1); + } + + void removeChild(ISchedulerNode * child) override + { + if (auto iter = children.find(child->basename); iter != children.end()) + { + SchedulerNodePtr removed = iter->second; + + // Deactivate: detach is not very common operation, so we can afford O(N) here + size_t child_idx = 0; + [[ maybe_unused ]] bool found = false; + for (; child_idx != items.size(); child_idx++) + { + if (items[child_idx].child == removed.get()) + { + found = true; + break; + } + } + assert(found); + if (child_idx < heap_size) // Detach of active child requires deactivation at first + { + heap_size--; + std::swap(items[child_idx], items[heap_size]); + // Element was removed from inside of heap -- heap must be rebuilt + std::make_heap(items.begin(), items.begin() + heap_size); + child_idx = heap_size; + } + + // Now detach inactive child + if (child_idx != items.size() - 1) + { + std::swap(items[child_idx], items.back()); + items[child_idx].child->info.parent.idx = child_idx; + } + items.pop_back(); + + // Detach + removed->setParent(nullptr); + + // Get rid of ownership + children.erase(iter); + } + } + + ISchedulerNode * getChild(const String & child_name) override + { + if (auto iter = children.find(child_name); iter != children.end()) + return iter->second.get(); + else + return nullptr; + } + + std::pair dequeueRequest() override + { + if (heap_size == 0) + return {nullptr, false}; + + // Recursively pull request from child + auto [request, child_active] = items.front().child->dequeueRequest(); + assert(request != nullptr); + std::pop_heap(items.begin(), items.begin() + heap_size); + Item & current = items[heap_size - 1]; + + // SFQ fairness invariant: system vruntime equals last served request start-time + assert(current.vruntime >= system_vruntime); + 
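        // A worked example (added note, not from the original patch): with two children of
        // weights 1.0 and 3.0, as in the FairnessWeights test below, and requests of cost 10,
        // the update on the next line adds 10/1.0 = 10 to the first child's vruntime but only
        // 10/3.0 ≈ 3.33 to the second's, so the min-heap picks the heavier child roughly three
        // times as often, which matches the 10 vs 30 consumption ratio the test expects.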
system_vruntime = current.vruntime; + + // By definition vruntime is amount of consumed resource (cost) divided by weight + current.vruntime += double(request->cost) / current.child->info.weight; + max_vruntime = std::max(max_vruntime, current.vruntime); + + if (child_active) // Put active child back in heap after vruntime update + { + std::push_heap(items.begin(), items.begin() + heap_size); + } + else // Deactivate child if it is empty, but remember it's vruntime for latter activations + { + heap_size--; + + // Store index of this inactive child in `parent.idx` + // This enables O(1) search of inactive children instead of O(n) + current.child->info.parent.idx = heap_size; + } + + // Reset any difference between children on busy period end + if (heap_size == 0) + { + // Reset vtime to zero to avoid floating-point error accumulation, + // but do not reset too often, because it's O(N) + UInt64 ns = clock_gettime_ns(); + if (last_reset_ns + 1000000000 < ns) + { + last_reset_ns = ns; + for (Item & item : items) + item.vruntime = 0; + max_vruntime = 0; + } + system_vruntime = max_vruntime; + } + + return {request, heap_size > 0}; + } + + bool isActive() override + { + return heap_size > 0; + } + + void activateChild(ISchedulerNode * child) override + { + // Find this child; this is O(1), thanks to inactive index we hold in `parent.idx` + activateChildImpl(child->info.parent.idx); + } + +private: + void activateChildImpl(size_t inactive_idx) + { + bool activate_parent = heap_size == 0; + + if (heap_size != inactive_idx) + { + std::swap(items[heap_size], items[inactive_idx]); + items[inactive_idx].child->info.parent.idx = inactive_idx; + } + + // Newly activated child should have at least `system_vruntime` to keep fairness + items[heap_size].vruntime = std::max(system_vruntime, items[heap_size].vruntime); + heap_size++; + std::push_heap(items.begin(), items.begin() + heap_size); + + // Recursive activation + if (activate_parent && parent) + parent->activateChild(this); + } + +private: + /// Beginning of `items` vector is heap of active children: [0; `heap_size`). + /// Next go inactive children in unsorted order. + /// NOTE: we have to track vruntime of inactive children for max-min fairness. 
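    /// An illustrative layout (added sketch, not part of the original comment): with five children
    /// and heap_size == 3, items[0..2] form the min-heap of active children ordered by vruntime,
    /// while items[3] and items[4] are inactive and each stores its own position in
    /// items[i].child->info.parent.idx, which is what lets activateChild() find it in O(1).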
+ std::vector items; + size_t heap_size = 0; + + /// Last request vruntime + double system_vruntime = 0; + double max_vruntime = 0; + UInt64 last_reset_ns = 0; + + /// All children with ownership + std::unordered_map children; // basename -> child +}; + +} diff --git a/src/IO/Resource/registerResourceManagers.cpp b/src/IO/Resource/registerResourceManagers.cpp index 0a394e3f0cd..5217bcdfbec 100644 --- a/src/IO/Resource/registerResourceManagers.cpp +++ b/src/IO/Resource/registerResourceManagers.cpp @@ -4,11 +4,13 @@ namespace DB { +void registerDynamicResourceManager(ResourceManagerFactory &); void registerStaticResourceManager(ResourceManagerFactory &); void registerResourceManagers() { auto & factory = ResourceManagerFactory::instance(); + registerDynamicResourceManager(factory); registerStaticResourceManager(factory); } diff --git a/src/IO/Resource/registerSchedulerNodes.cpp b/src/IO/Resource/registerSchedulerNodes.cpp index 1b58b3981c2..896f96d7f50 100644 --- a/src/IO/Resource/registerSchedulerNodes.cpp +++ b/src/IO/Resource/registerSchedulerNodes.cpp @@ -8,6 +8,7 @@ namespace DB { void registerPriorityPolicy(SchedulerNodeFactory &); +void registerFairPolicy(SchedulerNodeFactory &); void registerSemaphoreConstraint(SchedulerNodeFactory &); void registerFifoQueue(SchedulerNodeFactory &); @@ -17,6 +18,7 @@ void registerSchedulerNodes() // ISchedulerNode registerPriorityPolicy(factory); + registerFairPolicy(factory); // ISchedulerConstraint registerSemaphoreConstraint(factory); diff --git a/src/IO/Resource/tests/gtest_resource_class_fair.cpp b/src/IO/Resource/tests/gtest_resource_class_fair.cpp new file mode 100644 index 00000000000..89ec2ac7c32 --- /dev/null +++ b/src/IO/Resource/tests/gtest_resource_class_fair.cpp @@ -0,0 +1,187 @@ +#include + +#include + +#include + +using namespace DB; + +using ResourceTest = ResourceTestClass; + +TEST(IOResourceFairPolicy, Factory) +{ + ResourceTest t; + + Poco::AutoPtr cfg = new Poco::Util::XMLConfiguration(); + SchedulerNodePtr fair = SchedulerNodeFactory::instance().get("fair", /* event_queue = */ nullptr, *cfg, ""); + EXPECT_TRUE(dynamic_cast(fair.get()) != nullptr); +} + +TEST(IOResourceFairPolicy, FairnessWeights) +{ + ResourceTest t; + + t.add("/"); + t.add("/A", "1.0"); + t.add("/B", "3.0"); + + t.enqueue("/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/B", {10, 10, 10, 10, 10, 10, 10, 10}); + + t.dequeue(4); + t.consumed("A", 10); + t.consumed("B", 30); + + t.dequeue(4); + t.consumed("A", 10); + t.consumed("B", 30); + + t.dequeue(); + t.consumed("A", 60); + t.consumed("B", 20); +} + +TEST(IOResourceFairPolicy, Activation) +{ + ResourceTest t; + + t.add("/"); + t.add("/A"); + t.add("/B"); + t.add("/C"); + + t.enqueue("/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/B", {10}); + t.enqueue("/C", {10, 10}); + + t.dequeue(3); + t.consumed("A", 10); + t.consumed("B", 10); + t.consumed("C", 10); + + t.dequeue(4); + t.consumed("A", 30); + t.consumed("B", 0); + t.consumed("C", 10); + + t.enqueue("/B", {10, 10}); + t.dequeue(1); + t.consumed("B", 10); + + t.enqueue("/C", {10, 10}); + t.dequeue(1); + t.consumed("C", 10); + + t.dequeue(2); // A B or B A + t.consumed("A", 10); + t.consumed("B", 10); +} + +TEST(IOResourceFairPolicy, FairnessMaxMin) +{ + ResourceTest t; + + t.add("/"); + t.add("/A"); + t.add("/B"); + + t.enqueue("/A", {10, 10}); // make sure A is never empty + + for (int i = 0; i < 10; i++) + { + t.enqueue("/A", {10, 10, 10, 10}); + t.enqueue("/B", {10, 10}); + + t.dequeue(6); + t.consumed("A", 40); + t.consumed("B", 20); + } + + 
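    // B is drained and only the two requests enqueued for A at the start of the test remain,
    // so the leftover capacity goes entirely to A.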
t.dequeue(2); + t.consumed("A", 20); +} + +TEST(IOResourceFairPolicy, HierarchicalFairness) +{ + ResourceTest t; + + t.add("/"); + t.add("/X"); + t.add("/Y"); + t.add("/X/A"); + t.add("/X/B"); + t.add("/Y/C"); + t.add("/Y/D"); + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 20); + t.consumed("B", 20); + t.consumed("C", 20); + t.consumed("D", 20); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 40); + t.consumed("C", 20); + t.consumed("D", 20); + } + + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("B", 40); + t.consumed("C", 20); + t.consumed("D", 20); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/C", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 20); + t.consumed("B", 20); + t.consumed("C", 40); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/B", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 20); + t.consumed("B", 20); + t.consumed("D", 40); + } + + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/X/A", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + t.enqueue("/Y/D", {10, 10, 10, 10, 10, 10, 10, 10}); + for (int i = 0; i < 4; i++) + { + t.dequeue(8); + t.consumed("A", 40); + t.consumed("D", 40); + } +} diff --git a/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp b/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp new file mode 100644 index 00000000000..b113da31d59 --- /dev/null +++ b/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp @@ -0,0 +1,116 @@ +#include + +#include + +#include +#include + +using namespace DB; + +using ResourceTest = ResourceTestManager; +using TestGuard = ResourceTest::Guard; + +TEST(IOResourceDynamicResourceManager, Smoke) +{ + ResourceTest t; + + t.update(R"CONFIG( + + + + inflight_limit10 + fair + fifo + fifo3 + + + + /fair/A + /fair/B + + + )CONFIG"); + + ClassifierPtr cA = t.manager->acquire("A"); + ClassifierPtr cB = t.manager->acquire("B"); + + for (int i = 0; i < 10; i++) + { + ResourceGuard gA(cA->get("res1"), ResourceGuard::PostponeLocking); + gA.lock(); + gA.setFailure(); + gA.unlock(); + + ResourceGuard gB(cB->get("res1")); + } +} + +TEST(IOResourceDynamicResourceManager, Fairness) +{ + constexpr size_t T = 3; // threads per queue + int N = 100; // requests per thread + ResourceTest t(2 * T + 1); + + t.update(R"CONFIG( + + + + inflight_limit1 + fair + fifo + fifo + fifo + + + + /fair/A + /fair/B + /fair/leader + + + )CONFIG"); + + + // Total cost for A and B 
cannot differ for more than 1 (every request has cost equal to 1). + // Requests from A use `value = 1` and from B `value = -1` is used. + std::atomic unfairness = 0; + auto fairness_diff = [&] (Int64 value) + { + Int64 cur_unfairness = unfairness.fetch_add(value, std::memory_order_relaxed) + value; + EXPECT_NEAR(cur_unfairness, 0, 1); + }; + + for (int thr = 0; thr < T; thr++) + { + t.threads.emplace_back([&] + { + ClassifierPtr c = t.manager->acquire("A"); + ResourceLink link = c->get("res1"); + t.startBusyPeriod(link, 1, N); + for (int req = 0; req < N; req++) + { + TestGuard g(t, link, 1); + fairness_diff(1); + } + }); + } + + for (int thr = 0; thr < T; thr++) + { + t.threads.emplace_back([&] + { + ClassifierPtr c = t.manager->acquire("B"); + ResourceLink link = c->get("res1"); + t.startBusyPeriod(link, 1, N); + for (int req = 0; req < N; req++) + { + TestGuard g(t, link, 1); + fairness_diff(-1); + } + }); + } + + ClassifierPtr c = t.manager->acquire("leader"); + ResourceLink link = c->get("res1"); + t.blockResource(link); +} diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e222b8655aa..b4bdb7cf233 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1220,7 +1220,7 @@ ResourceManagerPtr Context::getResourceManager() const { auto lock = getLock(); if (!shared->resource_manager) - shared->resource_manager = ResourceManagerFactory::instance().get(getConfigRef().getString("resource_manager", "static")); + shared->resource_manager = ResourceManagerFactory::instance().get(getConfigRef().getString("resource_manager", "dynamic")); return shared->resource_manager; } From 11b94a626aca1cf346db8c797bbce672a72f0703 Mon Sep 17 00:00:00 2001 From: FFFFFFFHHHHHHH <75292180+FFFFFFFHHHHHHH@users.noreply.github.com> Date: Tue, 16 May 2023 05:20:29 +0800 Subject: [PATCH 066/127] Fix aggregate function kolmogorovSmirnovTest (#49768) --- .../AggregateFunctionKolmogorovSmirnovTest.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h b/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h index 33a9966ee2c..5629de31c88 100644 --- a/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h +++ b/src/AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h @@ -43,6 +43,7 @@ struct KolmogorovSmirnov : public StatisticalSample Float64 now_s = 0; UInt64 pos_x = 0; UInt64 pos_y = 0; + UInt64 pos_tmp; UInt64 n1 = x.size(); UInt64 n2 = y.size(); @@ -65,14 +66,22 @@ struct KolmogorovSmirnov : public StatisticalSample now_s -= n2_d; ++pos_y; } - max_s = std::max(max_s, now_s); - min_s = std::min(min_s, now_s); } else { - now_s += n1_d; - ++pos_x; + pos_tmp = pos_x + 1; + while (pos_tmp < x.size() && unlikely(fabs(x[pos_tmp] - x[pos_x]) <= tol)) + pos_tmp++; + now_s += n1_d * (pos_tmp - pos_x); + pos_x = pos_tmp; + pos_tmp = pos_y + 1; + while (pos_tmp < y.size() && unlikely(fabs(y[pos_tmp] - y[pos_y]) <= tol)) + pos_tmp++; + now_s -= n2_d * (pos_tmp - pos_y); + pos_y = pos_tmp; } + max_s = std::max(max_s, now_s); + min_s = std::min(min_s, now_s); } now_s += n1_d * (x.size() - pos_x) - n2_d * (y.size() - pos_y); min_s = std::min(min_s, now_s); From a4f84cecfde581b37a37589239040612e7022935 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 May 2023 13:05:36 +0200 Subject: [PATCH 067/127] Retry connection expired in test_rename_column/test.py --- tests/integration/test_rename_column/test.py | 32 ++++++++++++++++++-- 1 file changed, 29 
insertions(+), 3 deletions(-) diff --git a/tests/integration/test_rename_column/test.py b/tests/integration/test_rename_column/test.py index 6bab0a28259..a664f1050e1 100644 --- a/tests/integration/test_rename_column/test.py +++ b/tests/integration/test_rename_column/test.py @@ -225,7 +225,9 @@ def select( def rename_column( node, table_name, name, new_name, iterations=1, ignore_exception=False ): - for i in range(iterations): + i = 0 + while True: + i += 1 try: node.query( "ALTER TABLE {table_name} RENAME COLUMN {name} to {new_name}".format( @@ -233,14 +235,22 @@ def rename_column( ) ) except QueryRuntimeException as ex: + if 'Coordination::Exception' in str(ex): + continue + if not ignore_exception: raise + if i >= iterations: + break + def rename_column_on_cluster( node, table_name, name, new_name, iterations=1, ignore_exception=False ): - for i in range(iterations): + i = 0 + while True: + i += 1 try: node.query( "ALTER TABLE {table_name} ON CLUSTER test_cluster RENAME COLUMN {name} to {new_name}".format( @@ -248,12 +258,21 @@ def rename_column_on_cluster( ) ) except QueryRuntimeException as ex: + if 'Coordination::Exception' in str(ex): + continue + if not ignore_exception: raise + if i >= iterations: + break + + def alter_move(node, table_name, iterations=1, ignore_exception=False): - for i in range(iterations): + i = 0 + while True: + i += 1 move_part = random.randint(0, 99) move_volume = "external" try: @@ -263,9 +282,16 @@ def alter_move(node, table_name, iterations=1, ignore_exception=False): ) ) except QueryRuntimeException as ex: + if 'Coordination::Exception' in str(ex): + continue + if not ignore_exception: raise + if i >= iterations: + break + + def test_rename_parallel_same_node(started_cluster): table_name = "test_rename_parallel_same_node" From b3fc7d1a86264bb7c62b6b10026d5701e91753e1 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 16 May 2023 11:15:15 +0000 Subject: [PATCH 068/127] Automatic style fix --- tests/integration/test_rename_column/test.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_rename_column/test.py b/tests/integration/test_rename_column/test.py index a664f1050e1..8dc57cf08ff 100644 --- a/tests/integration/test_rename_column/test.py +++ b/tests/integration/test_rename_column/test.py @@ -235,7 +235,7 @@ def rename_column( ) ) except QueryRuntimeException as ex: - if 'Coordination::Exception' in str(ex): + if "Coordination::Exception" in str(ex): continue if not ignore_exception: @@ -258,7 +258,7 @@ def rename_column_on_cluster( ) ) except QueryRuntimeException as ex: - if 'Coordination::Exception' in str(ex): + if "Coordination::Exception" in str(ex): continue if not ignore_exception: @@ -268,7 +268,6 @@ def rename_column_on_cluster( break - def alter_move(node, table_name, iterations=1, ignore_exception=False): i = 0 while True: @@ -282,7 +281,7 @@ def alter_move(node, table_name, iterations=1, ignore_exception=False): ) ) except QueryRuntimeException as ex: - if 'Coordination::Exception' in str(ex): + if "Coordination::Exception" in str(ex): continue if not ignore_exception: @@ -292,7 +291,6 @@ def alter_move(node, table_name, iterations=1, ignore_exception=False): break - def test_rename_parallel_same_node(started_cluster): table_name = "test_rename_parallel_same_node" drop_table(nodes, table_name) From d4ea3ea045a250c86859ebfcff953b13822f0a23 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 16 May 2023 13:49:32 +0200 Subject: [PATCH 069/127] Fix --- src/Interpreters/Cache/Metadata.cpp | 13 
++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index a97b10ffbfa..01cdc7f1d1b 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -1,6 +1,7 @@ #include #include #include +#include "Common/Exception.h" #include #include @@ -257,8 +258,6 @@ void CacheMetadata::doCleanup() } locked_metadata->markAsRemoved(); - erase(it); - LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key); try { @@ -272,9 +271,17 @@ void CacheMetadata::doCleanup() } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__); + LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(false)); chassert(false); } + + /// Remove key from metadata AFTER deleting key directory, because otherwise key lock is + /// released before we delete directory from fs and there might be a race: + /// a key, which we just removed, can be added back to cache before we start removing key directory, + /// which makes key directory either non-empty (and we get exception in try catch above) + /// or we removed directory while another thread thinks it exists. + erase(it); + LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key); } } From 9a2645a72904e1146e98ca58c0274bbe034a79ef Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 16 May 2023 14:09:38 +0200 Subject: [PATCH 070/127] Fixed clang build --- src/Backups/BackupCoordinationLocal.h | 2 +- src/Backups/BackupCoordinationRemote.h | 2 +- src/Backups/BackupsWorker.cpp | 6 +++--- src/Backups/IBackupCoordination.h | 2 +- src/Backups/IRestoreCoordination.h | 2 +- src/Backups/RestoreCoordinationLocal.h | 2 +- src/Backups/RestoreCoordinationRemote.h | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index a7b05fbb83c..60fcc014720 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -22,7 +22,7 @@ public: BackupCoordinationLocal(bool plain_backup_); ~BackupCoordinationLocal() override; - void setStage(const String & new_stage, const String & message = "") override; + void setStage(const String & new_stage, const String & message) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index 5671079fa27..949dd9c9bf0 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -33,7 +33,7 @@ public: ~BackupCoordinationRemote() override; - void setStage(const String & new_stage, const String & message = "") override; + void setStage(const String & new_stage, const String & message) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 720ca994a40..0a6482fb7de 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -368,7 +368,7 @@ void BackupsWorker::doBackup( /// Wait until all the hosts have written their backup entries. 
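        /// (waitForStage(Stage::COMPLETED) returns only once every host has reported COMPLETED;
        /// the setStage() calls below merely gain an explicit empty message because the default
        /// argument was removed from the coordination interfaces in this commit.)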
backup_coordination->waitForStage(Stage::COMPLETED); - backup_coordination->setStage(Stage::COMPLETED); + backup_coordination->setStage(Stage::COMPLETED,""); } else { @@ -386,7 +386,7 @@ void BackupsWorker::doBackup( writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal); /// We have written our backup entries, we need to tell other hosts (they could be waiting for it). - backup_coordination->setStage(Stage::COMPLETED); + backup_coordination->setStage(Stage::COMPLETED,""); } size_t num_files = 0; @@ -709,7 +709,7 @@ void BackupsWorker::doRestore( /// Wait until all the hosts have written their backup entries. restore_coordination->waitForStage(Stage::COMPLETED); - restore_coordination->setStage(Stage::COMPLETED); + restore_coordination->setStage(Stage::COMPLETED,""); } else { diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 68a13ab7846..75d9202374b 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -21,7 +21,7 @@ public: virtual ~IBackupCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. - virtual void setStage(const String & new_stage, const String & message = "") = 0; + virtual void setStage(const String & new_stage, const String & message) = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h index b4df9491c4c..2f9e8d171f6 100644 --- a/src/Backups/IRestoreCoordination.h +++ b/src/Backups/IRestoreCoordination.h @@ -18,7 +18,7 @@ public: virtual ~IRestoreCoordination() = default; /// Sets the current stage and waits for other hosts to come to this stage too. - virtual void setStage(const String & new_stage, const String & message = "") = 0; + virtual void setStage(const String & new_stage, const String & message) = 0; virtual void setError(const Exception & exception) = 0; virtual Strings waitForStage(const String & stage_to_wait) = 0; virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index 2240a25ef3d..e27f0d1ef88 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -19,7 +19,7 @@ public: ~RestoreCoordinationLocal() override; /// Sets the current stage and waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message = "") override; + void setStage(const String & new_stage, const String & message) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; diff --git a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationRemote.h index 989b1c1b727..eb0fcff9c2d 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationRemote.h @@ -26,7 +26,7 @@ public: ~RestoreCoordinationRemote() override; /// Sets the current stage and waits for other hosts to come to this stage too. 
- void setStage(const String & new_stage, const String & message = "") override; + void setStage(const String & new_stage, const String & message) override; void setError(const Exception & exception) override; Strings waitForStage(const String & stage_to_wait) override; Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; From fd2731845c12d0ee82ebf2fb8b8aada523303581 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sat, 15 Apr 2023 20:43:06 +0200 Subject: [PATCH 071/127] Simplify interface of IBackupWriter: Remove supportNativeCopy() function. --- src/Backups/BackupIO.cpp | 22 +++--- src/Backups/BackupIO.h | 29 +++++--- src/Backups/BackupIO_Disk.cpp | 54 +++++++------- src/Backups/BackupIO_Disk.h | 6 +- src/Backups/BackupIO_File.cpp | 47 ++++++------ src/Backups/BackupIO_File.h | 7 +- src/Backups/BackupIO_S3.cpp | 52 +++++++------- src/Backups/BackupIO_S3.h | 24 +------ src/Backups/BackupImpl.cpp | 72 ++++++++----------- .../test_backup_restore_new/test.py | 12 ---- .../test_backup_restore_s3/test.py | 6 +- 11 files changed, 146 insertions(+), 185 deletions(-) diff --git a/src/Backups/BackupIO.cpp b/src/Backups/BackupIO.cpp index f78e6df23a8..7b269bd965f 100644 --- a/src/Backups/BackupIO.cpp +++ b/src/Backups/BackupIO.cpp @@ -4,29 +4,30 @@ #include #include #include +#include namespace DB { -namespace ErrorCodes +IBackupReader::IBackupReader(Poco::Logger * log_) : log(log_) { - extern const int NOT_IMPLEMENTED; } void IBackupReader::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) { + LOG_TRACE(log, "Copying file {} through buffers", file_name); auto read_buffer = readFile(file_name); auto write_buffer = destination_disk->writeFile(destination_path, std::min(size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings); copyData(*read_buffer, *write_buffer, size); write_buffer->finalize(); } -IBackupWriter::IBackupWriter(const ContextPtr & context_) - : read_settings(context_->getBackupReadSettings()) - , has_throttling(static_cast(context_->getBackupsThrottler())) -{} +IBackupWriter::IBackupWriter(const ContextPtr & context_, Poco::Logger * log_) + : log(log_), read_settings(context_->getBackupReadSettings()) +{ +} void IBackupWriter::copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name) { @@ -38,9 +39,12 @@ void IBackupWriter::copyDataToFile(const CreateReadBufferFunction & create_read_ write_buffer->finalize(); } -void IBackupWriter::copyFileNative( - DiskPtr /* src_disk */, const String & /* src_file_name */, UInt64 /* src_offset */, UInt64 /* src_size */, const String & /* dest_file_name */) +void IBackupWriter::copyFileFromDisk( + DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Native copy not implemented for backup writer"); + LOG_TRACE(log, "Copying file {} through buffers", src_file_name); + auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); }; + copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); } + } diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index aef9c14e83e..dae13422bf2 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -15,6 +15,8 @@ class WriteBuffer; class IBackupReader /// BackupReaderFile, 
BackupReaderDisk { public: + explicit IBackupReader(Poco::Logger * log_); + virtual ~IBackupReader() = default; virtual bool fileExists(const String & file_name) = 0; virtual UInt64 getFileSize(const String & file_name) = 0; @@ -22,6 +24,9 @@ public: virtual void copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings); virtual DataSourceDescription getDataSourceDescription() const = 0; + +protected: + Poco::Logger * const log; }; /// Represents operations of storing to disk or uploading for writing a backup. @@ -30,29 +35,31 @@ class IBackupWriter /// BackupWriterFile, BackupWriterDisk public: using CreateReadBufferFunction = std::function()>; - explicit IBackupWriter(const ContextPtr & context_); + IBackupWriter(const ContextPtr & context_, Poco::Logger * log_); virtual ~IBackupWriter() = default; virtual bool fileExists(const String & file_name) = 0; virtual UInt64 getFileSize(const String & file_name) = 0; virtual bool fileContentsEqual(const String & file_name, const String & expected_file_contents) = 0; + virtual std::unique_ptr writeFile(const String & file_name) = 0; + + virtual void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name); + + /// copyFileFromDisk() can be much faster than copyDataToFile() + /// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). + virtual void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name); + virtual void removeFile(const String & file_name) = 0; virtual void removeFiles(const Strings & file_names) = 0; - virtual DataSourceDescription getDataSourceDescription() const = 0; - virtual void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name); - virtual bool supportNativeCopy(DataSourceDescription /* data_source_description */) const { return false; } - /// Copy file using native copy (optimized for S3 to use CopyObject) - /// - /// NOTE: It still may fall back to copyDataToFile() if native copy is not possible: - /// - different buckets - /// - throttling had been requested - virtual void copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name); + virtual DataSourceDescription getDataSourceDescription() const = 0; protected: + Poco::Logger * const log; + + /// These read settings are used to read from the source disk in copyFileFromDisk(). 
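    /// The default copyFileFromDisk() in BackupIO.cpp wraps src_disk->readFile(src_file_name, read_settings)
    /// in a lambda and streams it through copyDataToFile(); overrides fall back to it whenever they
    /// have no faster native path.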
const ReadSettings read_settings; - const bool has_throttling; }; } diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index 10d7a572f6b..643922cf3d0 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -14,7 +15,7 @@ namespace ErrorCodes } BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & path_) - : disk(disk_), path(path_), log(&Poco::Logger::get("BackupReaderDisk")) + : IBackupReader(&Poco::Logger::get("BackupReaderDisk")), disk(disk_), path(path_) { } @@ -38,22 +39,29 @@ std::unique_ptr BackupReaderDisk::readFile(const String & fi void BackupReaderDisk::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) { - if (write_mode == WriteMode::Rewrite) + if ((write_mode == WriteMode::Rewrite) && (destination_disk->getDataSourceDescription() == getDataSourceDescription())) { - LOG_TRACE(log, "Copying {}/{} from disk {} to {} by the disk", path, file_name, disk->getName(), destination_disk->getName()); + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} using {} disk", file_name, toString(destination_disk->getDataSourceDescription().type)); disk->copyFile(path / file_name, *destination_disk, destination_path, write_settings); return; } - LOG_TRACE(log, "Copying {}/{} from disk {} to {} through buffers", path, file_name, disk->getName(), destination_disk->getName()); + /// Fallback to copy through buffers. IBackupReader::copyFileToDisk(file_name, size, destination_disk, destination_path, write_mode, write_settings); } +DataSourceDescription BackupReaderDisk::getDataSourceDescription() const +{ + return disk->getDataSourceDescription(); +} + BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_) - : IBackupWriter(context_) + : IBackupWriter(context_, &Poco::Logger::get("BackupWriterDisk")) , disk(disk_) , path(path_) + , has_throttling(static_cast(context_->getBackupsThrottler())) { } @@ -115,31 +123,25 @@ DataSourceDescription BackupWriterDisk::getDataSourceDescription() const return disk->getDataSourceDescription(); } -DataSourceDescription BackupReaderDisk::getDataSourceDescription() const +void BackupWriterDisk::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) { - return disk->getDataSourceDescription(); -} - -bool BackupWriterDisk::supportNativeCopy(DataSourceDescription data_source_description) const -{ - return data_source_description == disk->getDataSourceDescription(); -} - -void BackupWriterDisk::copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) -{ - if (!src_disk) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot natively copy data to disk without source disk"); - - if (has_throttling || (src_offset != 0) || (src_size != src_disk->getFileSize(src_file_name))) + /// IDisk::copyFile() can copy to the same disk only, and it cannot do the throttling. + if (!has_throttling && (getDataSourceDescription() == src_disk->getDataSourceDescription())) { - auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); }; - copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); - return; + /// IDisk::copyFile() can copy a file as a whole only. 
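        /// (Callers such as BackupImpl::writeFile() pass info.base_size as src_offset, so files
        /// partially covered by a base backup always take the buffer fallback below.)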
+ if ((src_offset == 0) && (src_size == src_disk->getFileSize(src_file_name))) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} using {} disk", src_file_name, toString(src_disk->getDataSourceDescription().type)); + auto dest_file_path = path / dest_file_name; + disk->createDirectories(dest_file_path.parent_path()); + src_disk->copyFile(src_file_name, *disk, dest_file_path); + return; + } } - auto file_path = path / dest_file_name; - disk->createDirectories(file_path.parent_path()); - src_disk->copyFile(src_file_name, *disk, file_path); + /// Fallback to copy through buffers. + IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name); } } diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index be34847000d..884282ea4e0 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -25,7 +25,6 @@ public: private: DiskPtr disk; std::filesystem::path path; - Poco::Logger * log; }; class BackupWriterDisk : public IBackupWriter @@ -38,16 +37,15 @@ public: UInt64 getFileSize(const String & file_name) override; bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; + void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; DataSourceDescription getDataSourceDescription() const override; - bool supportNativeCopy(DataSourceDescription data_source_description) const override; - void copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; - private: DiskPtr disk; std::filesystem::path path; + const bool has_throttling; }; } diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index d4c9d0cb210..b569e65284b 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -12,7 +13,9 @@ namespace fs = std::filesystem; namespace DB { -BackupReaderFile::BackupReaderFile(const String & path_) : path(path_), log(&Poco::Logger::get("BackupReaderFile")) + +BackupReaderFile::BackupReaderFile(const String & path_) + : IBackupReader(&Poco::Logger::get("BackupReaderFile")), path(path_) { } @@ -39,19 +42,20 @@ void BackupReaderFile::copyFileToDisk(const String & file_name, size_t size, Dis if (destination_disk->getDataSourceDescription() == getDataSourceDescription()) { /// Use more optimal way. - LOG_TRACE(log, "Copying {}/{} to disk {} locally", path, file_name, destination_disk->getName()); + LOG_TRACE(log, "Copying file {} locally", file_name); fs::copy(path / file_name, fullPath(destination_disk, destination_path), fs::copy_options::overwrite_existing); return; } - LOG_TRACE(log, "Copying {}/{} to disk {} through buffers", path, file_name, destination_disk->getName()); + /// Fallback to copy through buffers. 
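    /// (That is, readFile() streamed into destination_disk->writeFile() via copyData(), as
    /// implemented in IBackupReader::copyFileToDisk().)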
IBackupReader::copyFileToDisk(path / file_name, size, destination_disk, destination_path, write_mode, write_settings); } BackupWriterFile::BackupWriterFile(const String & path_, const ContextPtr & context_) - : IBackupWriter(context_) + : IBackupWriter(context_, &Poco::Logger::get("BackupWriterFile")) , path(path_) + , has_throttling(static_cast(context_->getBackupsThrottler())) { } @@ -141,29 +145,26 @@ DataSourceDescription BackupReaderFile::getDataSourceDescription() const } -bool BackupWriterFile::supportNativeCopy(DataSourceDescription data_source_description) const +void BackupWriterFile::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) { - return data_source_description == getDataSourceDescription(); -} - -void BackupWriterFile::copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) -{ - std::string abs_source_path; - if (src_disk) - abs_source_path = fullPath(src_disk, src_file_name); - else - abs_source_path = fs::absolute(src_file_name); - - if (has_throttling || (src_offset != 0) || (src_size != fs::file_size(abs_source_path))) + /// std::filesystem::copy() can copy from the filesystem only, and it cannot do the throttling. + if (!has_throttling && (getDataSourceDescription() == src_disk->getDataSourceDescription())) { - auto create_read_buffer = [this, abs_source_path] { return createReadBufferFromFileBase(abs_source_path, read_settings); }; - copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); - return; + std::string abs_source_path = fullPath(src_disk, src_file_name); + /// std::filesystem::copy() can copy a file as a whole only. + if ((src_offset == 0) && (src_size == fs::file_size(abs_source_path))) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} locally", src_file_name); + auto abs_dest_path = path / dest_file_name; + fs::create_directories(abs_dest_path.parent_path()); + fs::copy(abs_source_path, abs_dest_path, fs::copy_options::overwrite_existing); + return; + } } - auto file_path = path / dest_file_name; - fs::create_directories(file_path.parent_path()); - fs::copy(abs_source_path, file_path, fs::copy_options::overwrite_existing); + /// Fallback to copy through buffers. 
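    /// (An added note: the generic path goes through copyDataToFile() with the backup read settings,
    /// which is presumably where the throttler implied by has_throttling takes effect.)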
+ IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name); } } diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index d4b6e13d546..1f0a247c321 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -22,27 +22,26 @@ public: private: std::filesystem::path path; - Poco::Logger * log; }; class BackupWriterFile : public IBackupWriter { public: - explicit BackupWriterFile(const String & path_, const ContextPtr & context_); + BackupWriterFile(const String & path_, const ContextPtr & context_); ~BackupWriterFile() override; bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; + void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; DataSourceDescription getDataSourceDescription() const override; - bool supportNativeCopy(DataSourceDescription data_source_description) const override; - void copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; private: std::filesystem::path path; + const bool has_throttling; }; } diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 84dba63ae4e..043075d55ec 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -102,11 +102,11 @@ namespace BackupReaderS3::BackupReaderS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) - : s3_uri(s3_uri_) + : IBackupReader(&Poco::Logger::get("BackupReaderS3")) + , s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , read_settings(context_->getReadSettings()) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) - , log(&Poco::Logger::get("BackupReaderS3")) { request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint } @@ -141,8 +141,6 @@ std::unique_ptr BackupReaderS3::readFile(const String & file void BackupReaderS3::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) { - LOG_TRACE(log, "Copying {} to disk {}", file_name, destination_disk->getName()); - copyS3FileToDisk( client, s3_uri.bucket, @@ -162,11 +160,10 @@ void BackupReaderS3::copyFileToDisk(const String & file_name, size_t size, DiskP BackupWriterS3::BackupWriterS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) - : IBackupWriter(context_) + : IBackupWriter(context_, &Poco::Logger::get("BackupWriterS3")) , s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) - , log(&Poco::Logger::get("BackupWriterS3")) { request_settings.updateFromSettings(context_->getSettingsRef()); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid 
taking value for endpoint @@ -177,30 +174,31 @@ DataSourceDescription BackupWriterS3::getDataSourceDescription() const return DataSourceDescription{DataSourceType::S3, s3_uri.endpoint, false, false}; } -bool BackupWriterS3::supportNativeCopy(DataSourceDescription data_source_description) const +void BackupWriterS3::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) { - return getDataSourceDescription() == data_source_description; -} - -void BackupWriterS3::copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) -{ - if (!src_disk) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot natively copy data to disk without source disk"); - - auto objects = src_disk->getStorageObjects(src_file_name); - if (objects.size() > 1) + /// copyS3File() can copy to another S3 bucket, but it requires the same S3 URI endpoint. + /// We don't check `has_throttling` here (compare with BackupWriterDisk::copyFileFromDisk()) because + /// copyS3File() almost doesn't use network so the throttling is not needed. + if (getDataSourceDescription() == src_disk->getDataSourceDescription()) { - auto create_read_buffer = [this, src_disk, src_file_name] { return src_disk->readFile(src_file_name, read_settings); }; - copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); - } - else - { - auto object_storage = src_disk->getObjectStorage(); - std::string src_bucket = object_storage->getObjectsNamespace(); - auto file_path = fs::path(s3_uri.key) / dest_file_name; - copyS3File(client, src_bucket, objects[0].remote_path, src_offset, src_size, s3_uri.bucket, file_path, request_settings, {}, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); + /// A single file can be represented as multiple objects in S3 bucket. + /// However copyS3File() can copy only a single file into a single file. + auto objects = src_disk->getStorageObjects(src_file_name); + if (objects.size() == 1) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} using native copy", src_file_name); + auto object_storage = src_disk->getObjectStorage(); + std::string src_bucket = object_storage->getObjectsNamespace(); + auto file_path = fs::path(s3_uri.key) / dest_file_name; + copyS3File(client, src_bucket, objects[0].remote_path, src_offset, src_size, s3_uri.bucket, file_path, request_settings, {}, + threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); + return; + } } + + /// Fallback to BackupWriterS3::copyDataToFile(). 
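        /// (The generic path reads the file from the source disk and re-uploads it over the network,
        /// which is why the CopyObject-based branch above is preferred whenever the endpoints match.)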
+ IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name); } void BackupWriterS3::copyDataToFile( diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index c32a6b48660..7d53d30e8d6 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -32,7 +32,6 @@ private: std::shared_ptr client; ReadSettings read_settings; S3Settings::RequestSettings request_settings; - Poco::Logger * log; }; @@ -48,37 +47,18 @@ public: std::unique_ptr writeFile(const String & file_name) override; void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name) override; + void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; + DataSourceDescription getDataSourceDescription() const override; void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; - DataSourceDescription getDataSourceDescription() const override; - bool supportNativeCopy(DataSourceDescription data_source_description) const override; - void copyFileNative(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; - private: - void copyObjectImpl( - const String & src_bucket, - const String & src_key, - const String & dst_bucket, - const String & dst_key, - size_t size, - const std::optional & metadata = std::nullopt) const; - - void copyObjectMultipartImpl( - const String & src_bucket, - const String & src_key, - const String & dst_bucket, - const String & dst_key, - size_t size, - const std::optional & metadata = std::nullopt) const; - void removeFilesBatch(const Strings & file_names); S3::URI s3_uri; std::shared_ptr client; S3Settings::RequestSettings request_settings; - Poco::Logger * log; std::optional supports_batch_delete; }; diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index a8c6e16b268..55fb6dbfe03 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -805,72 +805,56 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) if (writing_finalized) throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized"); - std::string from_file_name = "memory buffer"; - if (auto fname = entry->getFilePath(); !fname.empty()) - from_file_name = "file " + fname; - + bool should_check_lock_file = false; { std::lock_guard lock{mutex}; ++num_files; total_size += info.size; + if (!num_entries) + should_check_lock_file = true; } + auto src_disk = entry->tryGetDiskIfExists(); + auto src_file_path = entry->getFilePath(); + String src_file_desc = src_file_path.empty() ? "memory buffer" : ("file " + src_file_path); + if (info.data_file_name.empty()) { - LOG_TRACE(log, "Writing backup for file {} from {}: skipped, {}", info.data_file_name, from_file_name, !info.size ? "empty" : "base backup has it"); + LOG_TRACE(log, "Writing backup for file {} from {}: skipped, {}", info.data_file_name, src_file_desc, !info.size ? 
"empty" : "base backup has it"); return; } if (!coordination->startWritingFile(info.data_file_index)) { - LOG_TRACE(log, "Writing backup for file {} from {}: skipped, data file #{} is already being written", info.data_file_name, from_file_name, info.data_file_index); + LOG_TRACE(log, "Writing backup for file {} from {}: skipped, data file #{} is already being written", info.data_file_name, src_file_desc, info.data_file_index); return; } - LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, from_file_name, info.data_file_index); + if (!should_check_lock_file) + checkLockFile(true); - auto writer_description = writer->getDataSourceDescription(); - auto reader_description = entry->getDataSourceDescription(); + /// NOTE: `mutex` must be unlocked during copying otherwise writing will be in one thread maximum and hence slow. - /// We need to copy whole file without archive, we can do it faster - /// if source and destination are compatible - if (!use_archive && writer->supportNativeCopy(reader_description)) + if (use_archive) { - /// Should be much faster than writing data through server. - LOG_TRACE(log, "Will copy file {} using native copy", info.data_file_name); - - /// NOTE: `mutex` must be unlocked here otherwise writing will be in one thread maximum and hence slow. - - writer->copyFileNative(entry->tryGetDiskIfExists(), entry->getFilePath(), info.base_size, info.size - info.base_size, info.data_file_name); + LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}, adding to archive", info.data_file_name, src_file_desc, info.data_file_index); + auto out = archive_writer->writeFile(info.data_file_name); + auto read_buffer = entry->getReadBuffer(); + if (info.base_size != 0) + read_buffer->seek(info.base_size, SEEK_SET); + copyData(*read_buffer, *out); + out->finalize(); + } + else if (src_disk) + { + LOG_TRACE(log, "Writing backup for file {} from {} (disk {}): data file #{}", info.data_file_name, src_file_desc, src_disk->getName(), info.data_file_index); + writer->copyFileFromDisk(src_disk, src_file_path, info.base_size, info.size - info.base_size, info.data_file_name); } else { - bool has_entries = false; - { - std::lock_guard lock{mutex}; - has_entries = num_entries > 0; - } - if (!has_entries) - checkLockFile(true); - - if (use_archive) - { - LOG_TRACE(log, "Adding file {} to archive", info.data_file_name); - auto out = archive_writer->writeFile(info.data_file_name); - auto read_buffer = entry->getReadBuffer(); - if (info.base_size != 0) - read_buffer->seek(info.base_size, SEEK_SET); - copyData(*read_buffer, *out); - out->finalize(); - } - else - { - LOG_TRACE(log, "Will copy file {}", info.data_file_name); - auto create_read_buffer = [entry] { return entry->getReadBuffer(); }; - - /// NOTE: `mutex` must be unlocked here otherwise writing will be in one thread maximum and hence slow. 
- writer->copyDataToFile(create_read_buffer, info.base_size, info.size - info.base_size, info.data_file_name); - } + LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); + auto create_read_buffer = [entry] { return entry->getReadBuffer(); }; + writer->copyDataToFile(create_read_buffer, info.base_size, info.size - info.base_size, info.data_file_name); } { diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 18f6ada6821..c19cca4126a 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -158,8 +158,6 @@ def test_restore_table(engine): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - assert instance.contains_in_log("using native copy") - instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" @@ -200,8 +198,6 @@ def test_restore_table_under_another_name(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - assert instance.contains_in_log("using native copy") - assert instance.query("EXISTS test.table2") == "0\n" instance.query(f"RESTORE TABLE test.table AS test.table2 FROM {backup_name}") @@ -215,8 +211,6 @@ def test_backup_table_under_another_name(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table AS test.table2 TO {backup_name}") - assert instance.contains_in_log("using native copy") - assert instance.query("EXISTS test.table2") == "0\n" instance.query(f"RESTORE TABLE test.table2 FROM {backup_name}") @@ -245,8 +239,6 @@ def test_incremental_backup(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - assert instance.contains_in_log("using native copy") - instance.query("INSERT INTO test.table VALUES (65, 'a'), (66, 'b')") assert instance.query("SELECT count(), sum(x) FROM test.table") == "102\t5081\n" @@ -524,8 +516,6 @@ def test_file_engine(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - assert instance.contains_in_log("using native copy") - instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" @@ -540,8 +530,6 @@ def test_database(): instance.query(f"BACKUP DATABASE test TO {backup_name}") - assert instance.contains_in_log("using native copy") - instance.query("DROP DATABASE test") instance.query(f"RESTORE DATABASE test FROM {backup_name}") diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index e42754dd2a3..6e9b6b8569e 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -141,7 +141,7 @@ def test_backup_to_s3_native_copy(): f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')" ) check_backup_and_restore(storage_policy, backup_destination) - assert node.contains_in_log("BackupImpl.*using native copy") + assert node.contains_in_log("BackupWriterS3.*using native copy") assert node.contains_in_log("copyS3FileToDisk.*using native copy") assert node.contains_in_log( f"copyS3File: Single operation copy has completed. 
Bucket: root, Key: data/backups/{backup_name}" @@ -155,7 +155,7 @@ def test_backup_to_s3_native_copy_other_bucket(): f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')" ) check_backup_and_restore(storage_policy, backup_destination) - assert node.contains_in_log("BackupImpl.*using native copy") + assert node.contains_in_log("BackupWriterS3.*using native copy") assert node.contains_in_log("copyS3FileToDisk.*using native copy") assert node.contains_in_log( f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}" @@ -167,7 +167,7 @@ def test_backup_to_s3_native_copy_multipart(): backup_name = new_backup_name() backup_destination = f"S3('http://minio1:9001/root/data/backups/multipart/{backup_name}', 'minio', 'minio123')" check_backup_and_restore(storage_policy, backup_destination, size=1000000) - assert node.contains_in_log("BackupImpl.*using native copy") + assert node.contains_in_log("BackupWriterS3.*using native copy") assert node.contains_in_log("copyS3FileToDisk.*using native copy") assert node.contains_in_log( f"copyS3File: Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}/" From 69114cb550527443e555875a91f9fe1c2731e532 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sat, 22 Apr 2023 13:48:03 +0200 Subject: [PATCH 072/127] Add function getBlobPath() to IDisk interface to allow copying to/from disks which are not built on top of IObjectStorage. --- src/Backups/BackupIO_S3.cpp | 14 ++++++-------- src/Disks/DiskEncrypted.h | 12 ++++++++++++ src/Disks/DiskLocal.cpp | 11 +++++++++++ src/Disks/DiskLocal.h | 3 +++ src/Disks/FakeDiskTransaction.h | 8 ++------ src/Disks/IDisk.cpp | 10 ---------- src/Disks/IDisk.h | 16 ++++++++++------ src/Disks/IDiskTransaction.h | 8 +++----- src/Disks/ObjectStorages/DiskObjectStorage.cpp | 16 ++++++++++------ src/Disks/ObjectStorages/DiskObjectStorage.h | 7 ++----- .../DiskObjectStorageTransaction.cpp | 11 +++++------ .../DiskObjectStorageTransaction.h | 6 +----- .../ObjectStorages/S3/copyS3FileToDisk.cpp | 18 ++++++++++++++---- 13 files changed, 79 insertions(+), 61 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 043075d55ec..901e9de7576 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -181,17 +181,15 @@ void BackupWriterS3::copyFileFromDisk(DiskPtr src_disk, const String & src_file_ /// copyS3File() almost doesn't use network so the throttling is not needed. if (getDataSourceDescription() == src_disk->getDataSourceDescription()) { - /// A single file can be represented as multiple objects in S3 bucket. - /// However copyS3File() can copy only a single file into a single file. - auto objects = src_disk->getStorageObjects(src_file_name); - if (objects.size() == 1) + /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket. + /// In this case we can't use the native copy. + if (auto blob_path = src_disk->getBlobPath(src_file_name)) { /// Use more optimal way. 
LOG_TRACE(log, "Copying file {} using native copy", src_file_name); - auto object_storage = src_disk->getObjectStorage(); - std::string src_bucket = object_storage->getObjectsNamespace(); - auto file_path = fs::path(s3_uri.key) / dest_file_name; - copyS3File(client, src_bucket, objects[0].remote_path, src_offset, src_size, s3_uri.bucket, file_path, request_settings, {}, + const auto & [src_bucket, src_key] = *blob_path; + auto dest_key = fs::path(s3_uri.key) / dest_file_name; + copyS3File(client, src_bucket, src_key, src_offset, src_size, s3_uri.bucket, dest_key, request_settings, {}, threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); return; } diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 8e824a1f7e5..d9da320e505 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -131,6 +131,18 @@ public: WriteMode mode, const WriteSettings & settings) override; + std::optional> getBlobPath(const String & path) const override + { + auto wrapped_path = wrappedPath(path); + return delegate->getBlobPath(wrapped_path); + } + + void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override + { + auto wrapped_path = wrappedPath(path); + delegate->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function)); + } + void removeFile(const String & path) override { auto wrapped_path = wrappedPath(path); diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 1abecb7af4e..a40368fae88 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -328,6 +328,17 @@ DiskLocal::writeFile(const String & path, size_t buf_size, WriteMode mode, const fs::path(disk_path) / path, buf_size, flags, settings.local_throttler); } +std::optional> DiskLocal::getBlobPath(const String & path) const +{ + return std::make_pair(fs::path(disk_path) / path, ""); +} + +void DiskLocal::writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) +{ + auto blob_path = std::make_pair(fs::path(disk_path) / path, ""); + std::move(write_blob_function)(blob_path, mode, {}); +} + void DiskLocal::removeFile(const String & path) { auto fs_path = fs::path(disk_path) / path; diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 3112198aab3..b838654925d 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -81,6 +81,9 @@ public: WriteMode mode, const WriteSettings & settings) override; + std::optional> getBlobPath(const String & path) const override; + void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override; + void removeFile(const String & path) override; void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; diff --git a/src/Disks/FakeDiskTransaction.h b/src/Disks/FakeDiskTransaction.h index 3a7ea4473b6..2cf540444be 100644 --- a/src/Disks/FakeDiskTransaction.h +++ b/src/Disks/FakeDiskTransaction.h @@ -69,13 +69,9 @@ public: return disk.writeFile(path, buf_size, mode, settings); } - void writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function) override + void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override { - disk.writeFileUsingCustomWriteObject(path, mode, std::move(custom_write_object_function)); + 
disk.writeFileUsingBlobWritingFunction(path, mode, std::move(write_blob_function)); } void removeFile(const std::string & path) override diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 22197760d88..9f39c8242f5 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -35,16 +35,6 @@ void IDisk::copyFile(const String & from_file_path, IDisk & to_disk, const Strin out->finalize(); } -void IDisk::writeFileUsingCustomWriteObject( - const String &, WriteMode, std::function &)>) -{ - throw Exception( - ErrorCodes::NOT_IMPLEMENTED, - "Method `writeFileUsingCustomWriteObject()` is not implemented for disk: {}", - getDataSourceDescription().type); -} - - DiskTransactionPtr IDisk::createTransaction() { return std::make_shared(*this); diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index b0a57afe169..238f2258c3d 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -209,14 +209,18 @@ public: WriteMode mode = WriteMode::Rewrite, const WriteSettings & settings = {}) = 0; - /// Write a file using a custom function to write an object to the disk's object storage. + /// Returns the path to a blob representing a specified file. + /// The meaning of the returned path depends on disk's type. + /// For DiskLocal it the absolute path to the file and for DiskObjectStorage it's the name of a namespace + /// combined with StoredObject::absolute_path. + virtual std::optional> getBlobPath(const String & path) const = 0; + + using WriteBlobFunction = std::function & blob_path, WriteMode mode, const std::optional & object_attributes)>; + + /// Write a file using a custom function to write a blob representing the file. /// This method is alternative to writeFile(), the difference is that writeFile() calls IObjectStorage::writeObject() /// to write an object to the object storage while this method allows to specify a callback for that. - virtual void writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function); + virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; /// Remove file. Throws exception if file doesn't exists or it's a directory. /// Return whether file was finally removed. (For remote disks it is not always removed). diff --git a/src/Disks/IDiskTransaction.h b/src/Disks/IDiskTransaction.h index 2edbe858c06..376d7bd78e6 100644 --- a/src/Disks/IDiskTransaction.h +++ b/src/Disks/IDiskTransaction.h @@ -68,12 +68,10 @@ public: const WriteSettings & settings = {}, bool autocommit = true) = 0; + using WriteBlobFunction = std::function & blob_path, WriteMode mode, const std::optional & object_attributes)>; + /// Write a file using a custom function to write an object to the disk's object storage. - virtual void writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function) = 0; + virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; /// Remove file. Throws exception if file doesn't exists or it's a directory. 
virtual void removeFile(const std::string & path) = 0; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index f832ba5b7b6..b01f1b327e7 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -579,15 +579,19 @@ std::unique_ptr DiskObjectStorage::writeFile( return result; } -void DiskObjectStorage::writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function) +std::optional> DiskObjectStorage::getBlobPath(const String & path) const +{ + auto objects = getStorageObjects(path); + if (objects.size() != 1) + return {}; + return std::make_pair(object_storage->getObjectsNamespace(), objects[0].absolute_path); +} + +void DiskObjectStorage::writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) { LOG_TEST(log, "Write file: {}", path); auto transaction = createObjectStorageTransaction(); - return transaction->writeFileUsingCustomWriteObject(path, mode, std::move(custom_write_object_function)); + return transaction->writeFileUsingBlobWritingFunction(path, mode, std::move(write_blob_function)); } void DiskObjectStorage::applyNewSettings( diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index 4372bc75950..97751edc3f5 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -149,11 +149,8 @@ public: WriteMode mode, const WriteSettings & settings) override; - void writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function) override; + std::optional> getBlobPath(const String & path) const override; + void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override; void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index 6cafc35f8fa..f578bcb9772 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -669,11 +669,8 @@ std::unique_ptr DiskObjectStorageTransaction::writeFile } -void DiskObjectStorageTransaction::writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function) +void DiskObjectStorageTransaction::writeFileUsingBlobWritingFunction( + const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) { /// This function is a simplified and adapted version of DiskObjectStorageTransaction::writeFile(). 
auto blob_name = object_storage.generateBlobNameForPath(path); @@ -694,8 +691,10 @@ void DiskObjectStorageTransaction::writeFileUsingCustomWriteObject( operations_to_execute.emplace_back(std::move(write_operation)); + auto blob_path = std::make_pair(object_storage.getObjectsNamespace(), object.absolute_path); + /// We always use mode Rewrite because we simulate append using metadata and different files - size_t object_size = std::move(custom_write_object_function)(object, WriteMode::Rewrite, object_attributes); + size_t object_size = std::move(write_blob_function)(blob_path, WriteMode::Rewrite, object_attributes); /// Create metadata (see create_metadata_callback in DiskObjectStorageTransaction::writeFile()). if (mode == WriteMode::Rewrite) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h index 080a3e42057..a4cb0ed3739 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.h @@ -100,11 +100,7 @@ public: bool autocommit = true) override; /// Write a file using a custom function to write an object to the disk's object storage. - void writeFileUsingCustomWriteObject( - const String & path, - WriteMode mode, - std::function & object_attributes)> - custom_write_object_function) override; + void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override; void removeFile(const std::string & path) override; void removeFileIfExists(const std::string & path) override; diff --git a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp index f89415f789a..dea3266ad7a 100644 --- a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp +++ b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp @@ -50,18 +50,28 @@ void copyS3FileToDisk( String dest_bucket = destination_disk->getObjectStorage()->getObjectsNamespace(); - auto custom_write_object = [&](const StoredObject & object_, WriteMode write_mode_, const std::optional & object_attributes_) -> size_t + auto write_blob_function = [&](const std::pair & blob_path_, WriteMode write_mode_, const std::optional & object_attributes_) -> size_t { /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. chassert(write_mode_ == WriteMode::Rewrite); - copyS3File(s3_client, src_bucket, src_key, *src_offset, *src_size, dest_bucket, /* dest_key= */ object_.remote_path, - request_settings, object_attributes_, scheduler, /* for_disk_s3= */ true); + copyS3File( + s3_client, + src_bucket, + src_key, + *src_offset, + *src_size, + /* dest_bucket= */ blob_path_.first, + /* dest_key= */ blob_path_.second, + request_settings, + object_attributes_, + scheduler, + /* for_disk_s3= */ true); return *src_size; }; - destination_disk->writeFileUsingCustomWriteObject(destination_path, write_mode, custom_write_object); + destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); } } From 101aa6eff09a06f3751f3f06c7fa22afb4d93134 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sat, 22 Apr 2023 17:51:14 +0200 Subject: [PATCH 073/127] Add function copyS3FileFromDisk(). 
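
The new helper chooses between a server-side copy and a buffered copy.
A condensed, illustrative sketch of the dispatch (simplified from
copyS3FileToDisk.cpp in this patch; not literal patch content):

    /// Native copy is possible only if the source disk is S3 with the same endpoint
    /// and the file maps to a single S3 object (getBlobPath() returns a value).
    if (src_disk->getDataSourceDescription()
        == DataSourceDescription{DataSourceType::S3, dest_s3_client->getInitialEndpoint(), false, false})
    {
        if (auto blob_path = src_disk->getBlobPath(src_path))
        {
            const auto & [src_bucket, src_key] = *blob_path;
            copyS3File(dest_s3_client, src_bucket, src_key, *src_offset, *src_size,
                       dest_bucket, dest_key, request_settings, {}, scheduler);
            return;
        }
    }

    /// Fallback: read the file from the disk and upload it through buffers.
    auto create_read_buffer = [src_disk, &src_path, &read_settings] { return src_disk->readFile(src_path, read_settings); };
    copyDataToS3File(create_read_buffer, *src_offset, *src_size,
                     dest_s3_client, dest_bucket, dest_key, request_settings, {}, scheduler);
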
--- src/Backups/BackupIO_S3.cpp | 24 +---- .../ObjectStorages/S3/copyS3FileToDisk.cpp | 102 ++++++++++++------ .../ObjectStorages/S3/copyS3FileToDisk.h | 20 +++- .../test_backup_restore_s3/test.py | 6 +- 4 files changed, 94 insertions(+), 58 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 901e9de7576..d2861500159 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -176,27 +176,9 @@ DataSourceDescription BackupWriterS3::getDataSourceDescription() const void BackupWriterS3::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) { - /// copyS3File() can copy to another S3 bucket, but it requires the same S3 URI endpoint. - /// We don't check `has_throttling` here (compare with BackupWriterDisk::copyFileFromDisk()) because - /// copyS3File() almost doesn't use network so the throttling is not needed. - if (getDataSourceDescription() == src_disk->getDataSourceDescription()) - { - /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket. - /// In this case we can't use the native copy. - if (auto blob_path = src_disk->getBlobPath(src_file_name)) - { - /// Use more optimal way. - LOG_TRACE(log, "Copying file {} using native copy", src_file_name); - const auto & [src_bucket, src_key] = *blob_path; - auto dest_key = fs::path(s3_uri.key) / dest_file_name; - copyS3File(client, src_bucket, src_key, src_offset, src_size, s3_uri.bucket, dest_key, request_settings, {}, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); - return; - } - } - - /// Fallback to BackupWriterS3::copyDataToFile(). - IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name); + copyS3FileFromDisk(src_disk, src_file_name, src_offset, src_size, + client, s3_uri.bucket, fs::path(s3_uri.key) / dest_file_name, read_settings, request_settings, + threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); } void BackupWriterS3::copyDataToFile( diff --git a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp index dea3266ad7a..e43d88b2519 100644 --- a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp +++ b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp @@ -13,7 +13,7 @@ namespace DB { void copyS3FileToDisk( - const std::shared_ptr & s3_client, + const std::shared_ptr & src_s3_client, const String & src_bucket, const String & src_key, const std::optional & version_id, @@ -31,47 +31,85 @@ void copyS3FileToDisk( src_offset = 0; if (!src_size) - src_size = S3::getObjectSize(*s3_client, src_bucket, src_key, version_id.value_or(""), request_settings) - *src_offset; + src_size = S3::getObjectSize(*src_s3_client, src_bucket, src_key, version_id.value_or(""), request_settings) - *src_offset; auto destination_data_source_description = destination_disk->getDataSourceDescription(); - if (destination_data_source_description != DataSourceDescription{DataSourceType::S3, s3_client->getInitialEndpoint(), false, false}) + if (destination_data_source_description == DataSourceDescription{DataSourceType::S3, src_s3_client->getInitialEndpoint(), false, false}) { - LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} through buffers", src_key, destination_disk->getName()); - ReadBufferFromS3 read_buffer{s3_client, src_bucket, src_key, {}, request_settings, read_settings}; - if (*src_offset) - read_buffer.seek(*src_offset, SEEK_SET); - auto 
write_buffer = destination_disk->writeFile(destination_path, std::min(*src_size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings); - copyData(read_buffer, *write_buffer, *src_size); - write_buffer->finalize(); + /// Use native copy, the more optimal way. + LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} using native copy", src_key, destination_disk->getName()); + auto write_blob_function = [&](const std::pair & blob_path_, WriteMode write_mode_, const std::optional & object_attributes_) -> size_t + { + /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. + chassert(write_mode_ == WriteMode::Rewrite); + + copyS3File( + src_s3_client, + src_bucket, + src_key, + *src_offset, + *src_size, + /* dest_bucket= */ blob_path_.first, + /* dest_key= */ blob_path_.second, + request_settings, + object_attributes_, + scheduler, + /* for_disk_s3= */ true); + + return *src_size; + }; + + destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); return; } - LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} using native copy", src_key, destination_disk->getName()); + /// Fallback to copy through buffers. + LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} through buffers", src_key, destination_disk->getName()); + ReadBufferFromS3 read_buffer{src_s3_client, src_bucket, src_key, {}, request_settings, read_settings}; + if (*src_offset) + read_buffer.seek(*src_offset, SEEK_SET); + auto write_buffer = destination_disk->writeFile(destination_path, std::min(*src_size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings); + copyData(read_buffer, *write_buffer, *src_size); + write_buffer->finalize(); +} - String dest_bucket = destination_disk->getObjectStorage()->getObjectsNamespace(); +void copyS3FileFromDisk( + DiskPtr src_disk, + const String & src_path, + std::optional src_offset, + std::optional src_size, + const std::shared_ptr & dest_s3_client, + const String & dest_bucket, + const String & dest_key, + const ReadSettings & read_settings, + const S3Settings::RequestSettings & request_settings, + ThreadPoolCallbackRunner scheduler) +{ + if (!src_offset) + src_offset = 0; - auto write_blob_function = [&](const std::pair & blob_path_, WriteMode write_mode_, const std::optional & object_attributes_) -> size_t + if (!src_size) + src_size = src_disk->getFileSize(src_path) - *src_offset; + + auto source_data_source_description = src_disk->getDataSourceDescription(); + if (source_data_source_description == DataSourceDescription{DataSourceType::S3, dest_s3_client->getInitialEndpoint(), false, false}) { - /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. - chassert(write_mode_ == WriteMode::Rewrite); + /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket. + /// In this case we can't use native copy. + if (auto blob_path = src_disk->getBlobPath(src_path)) + { + /// Use native copy, the more optimal way. 
+ LOG_TRACE(&Poco::Logger::get("copyS3FileFromDisk"), "Copying file {} to S3 using native copy", src_path); + const auto & [src_bucket, src_key] = *blob_path; + copyS3File(dest_s3_client, src_bucket, src_key, *src_offset, *src_size, dest_bucket, dest_key, request_settings, {}, scheduler); + return; + } + } - copyS3File( - s3_client, - src_bucket, - src_key, - *src_offset, - *src_size, - /* dest_bucket= */ blob_path_.first, - /* dest_key= */ blob_path_.second, - request_settings, - object_attributes_, - scheduler, - /* for_disk_s3= */ true); - - return *src_size; - }; - - destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); + /// Fallback to copy through buffers. + LOG_TRACE(&Poco::Logger::get("copyS3FileFromDisk"), "Copying {} to S3 through buffers", src_path); + auto create_read_buffer = [src_disk, &src_path, &read_settings] { return src_disk->readFile(src_path, read_settings); }; + copyDataToS3File(create_read_buffer, *src_offset, *src_size, dest_s3_client, dest_bucket, dest_key, request_settings, {}, scheduler); } } diff --git a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h index 21c92ec9623..78caf2f50c8 100644 --- a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h +++ b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h @@ -13,11 +13,11 @@ namespace DB { /// Copies an object from S3 bucket to a disk of any type. -/// Depending on the disk the function can either do copying though buffers +/// Depending on the disk the function can either do copying through buffers /// (i.e. download the object by portions and then write those portions to the specified disk), /// or perform a server-side copy. void copyS3FileToDisk( - const std::shared_ptr & s3_client, + const std::shared_ptr & src_s3_client, const String & src_bucket, const String & src_key, const std::optional & version_id, @@ -31,6 +31,22 @@ void copyS3FileToDisk( const S3Settings::RequestSettings & request_settings = {}, ThreadPoolCallbackRunner scheduler = {}); +/// Copies an object from a disk of any type to S3 bucket. +/// Depending on the disk the function can either do copying through buffers +/// (i.e. read the object by portions and then upload those portions to the specified disk), +/// or perform a server-side copy. +void copyS3FileFromDisk( + DiskPtr src_disk, + const String & src_path, + std::optional src_offset, + std::optional src_size, + const std::shared_ptr & dest_s3_client, + const String & dest_bucket, + const String & dest_key, + const ReadSettings & read_settings = {}, + const S3Settings::RequestSettings & request_settings = {}, + ThreadPoolCallbackRunner scheduler = {}); + } #endif diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index 6e9b6b8569e..b5ac34f0b46 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -141,7 +141,7 @@ def test_backup_to_s3_native_copy(): f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')" ) check_backup_and_restore(storage_policy, backup_destination) - assert node.contains_in_log("BackupWriterS3.*using native copy") + assert node.contains_in_log("copyS3FileFromDisk.*using native copy") assert node.contains_in_log("copyS3FileToDisk.*using native copy") assert node.contains_in_log( f"copyS3File: Single operation copy has completed. 
Bucket: root, Key: data/backups/{backup_name}" @@ -155,7 +155,7 @@ def test_backup_to_s3_native_copy_other_bucket(): f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')" ) check_backup_and_restore(storage_policy, backup_destination) - assert node.contains_in_log("BackupWriterS3.*using native copy") + assert node.contains_in_log("copyS3FileFromDisk.*using native copy") assert node.contains_in_log("copyS3FileToDisk.*using native copy") assert node.contains_in_log( f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}" @@ -167,7 +167,7 @@ def test_backup_to_s3_native_copy_multipart(): backup_name = new_backup_name() backup_destination = f"S3('http://minio1:9001/root/data/backups/multipart/{backup_name}', 'minio', 'minio123')" check_backup_and_restore(storage_policy, backup_destination, size=1000000) - assert node.contains_in_log("BackupWriterS3.*using native copy") + assert node.contains_in_log("copyS3FileFromDisk.*using native copy") assert node.contains_in_log("copyS3FileToDisk.*using native copy") assert node.contains_in_log( f"copyS3File: Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}/" From bc880db5d925a4ff57565d85b541e4d6c17c07d7 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sun, 23 Apr 2023 10:50:45 +0200 Subject: [PATCH 074/127] Add functions to read/write encrypted files from IDisk. --- src/Disks/DiskEncrypted.h | 23 +++++++++++++++++++++++ src/Disks/IDisk.cpp | 15 +++++++++++++++ src/Disks/IDisk.h | 14 ++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index d9da320e505..9c9e61275ce 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -198,6 +198,29 @@ public: delegate->removeSharedFileIfExists(wrapped_path, flag); } + std::unique_ptr readEncryptedFile( + const String & path, const ReadSettings & settings) const override + { + auto wrapped_path = wrappedPath(path); + return delegate->readFile(wrapped_path, settings); + } + + std::unique_ptr writeEncryptedFile( + const String & path, + size_t buf_size, + WriteMode mode, + const WriteSettings & settings) const override + { + auto wrapped_path = wrappedPath(path); + return delegate->writeFile(wrapped_path, buf_size, mode, settings); + } + + size_t getEncryptedFileSize(const String & path) const override + { + auto wrapped_path = wrappedPath(path); + return delegate->getFileSize(wrapped_path); + } + void setLastModified(const String & path, const Poco::Timestamp & timestamp) override { auto wrapped_path = wrappedPath(path); diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 9f39c8242f5..54e09b09d2f 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -52,6 +52,21 @@ void IDisk::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_ba } } +std::unique_ptr IDisk::readEncryptedFile(const String &, const ReadSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); +} + +std::unique_ptr IDisk::writeEncryptedFile(const String &, size_t, WriteMode, const WriteSettings &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); +} + +size_t IDisk::getEncryptedFileSize(const String &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", 
getDataSourceDescription().type); +} + using ResultsCollector = std::vector>; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 238f2258c3d..b98d8a74308 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -251,6 +251,20 @@ public: /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 virtual void removeSharedFileIfExists(const String & path, bool /* keep_shared_data */) { removeFileIfExists(path); } + /// Reads a file from an encrypted disk without decrypting it. + virtual std::unique_ptr readEncryptedFile( + const String & path, const ReadSettings & settings = ReadSettings{}) const; + + /// Writes an already encrypted file to an encrypted disk. + virtual std::unique_ptr writeEncryptedFile( + const String & path, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + WriteMode mode = WriteMode::Rewrite, + const WriteSettings & settings = {}) const; + + /// Returns the size of encrypted file on an encrypted disk. + virtual size_t getEncryptedFileSize(const String & path) const; + virtual const String & getCacheName() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "There is no cache"); } virtual bool supportsCache() const { return false; } From cc50fcc60a4102b194f30b56896d045390833d8d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sun, 23 Apr 2023 12:25:46 +0200 Subject: [PATCH 075/127] Remove the 'temporary_file_' argument from BackupEntryFromImmutableFile's constructor. --- src/Backups/BackupEntryFromAppendOnlyFile.cpp | 5 ++--- src/Backups/BackupEntryFromAppendOnlyFile.h | 3 +-- src/Backups/BackupEntryFromImmutableFile.cpp | 4 +--- src/Backups/BackupEntryFromImmutableFile.h | 5 +---- src/Backups/BackupEntryWrappedWith.h | 8 +++++++- .../MergeTree/DataPartStorageOnDiskBase.cpp | 10 +++++++--- src/Storages/StorageLog.cpp | 17 +++++++++-------- src/Storages/StorageStripeLog.cpp | 17 +++++++++-------- 8 files changed, 37 insertions(+), 32 deletions(-) diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index 5384a69d890..83117f686bf 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -10,9 +10,8 @@ BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( const String & file_path_, const ReadSettings & settings_, const std::optional & file_size_, - const std::optional & checksum_, - const std::shared_ptr & temporary_file_) - : BackupEntryFromImmutableFile(disk_, file_path_, settings_, file_size_, checksum_, temporary_file_) + const std::optional & checksum_) + : BackupEntryFromImmutableFile(disk_, file_path_, settings_, file_size_, checksum_) , limit(BackupEntryFromImmutableFile::getSize()) { } diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index b0cee38c6be..b7a39c935a9 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -18,8 +18,7 @@ public: const String & file_path_, const ReadSettings & settings_, const std::optional & file_size_ = {}, - const std::optional & checksum_ = {}, - const std::shared_ptr & temporary_file_ = {}); + const std::optional & checksum_ = {}); UInt64 getSize() const override { return limit; } std::unique_ptr getReadBuffer() const override; diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index 48783a3bb63..790ea567496 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -13,14 
+13,12 @@ BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( const String & file_path_, const ReadSettings & settings_, const std::optional & file_size_, - const std::optional & checksum_, - const std::shared_ptr & temporary_file_) + const std::optional & checksum_) : disk(disk_) , file_path(file_path_) , settings(settings_) , file_size(file_size_) , checksum(checksum_) - , temporary_file_on_disk(temporary_file_) { } diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index 66f1fade294..4f2f902d31e 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -7,7 +7,6 @@ namespace DB { -class TemporaryFileOnDisk; class IDisk; using DiskPtr = std::shared_ptr; @@ -22,8 +21,7 @@ public: const String & file_path_, const ReadSettings & settings_, const std::optional & file_size_ = {}, - const std::optional & checksum_ = {}, - const std::shared_ptr & temporary_file_ = {}); + const std::optional & checksum_ = {}); ~BackupEntryFromImmutableFile() override; @@ -43,7 +41,6 @@ private: mutable std::optional file_size TSA_GUARDED_BY(get_file_size_mutex); mutable std::mutex get_file_size_mutex; const std::optional checksum; - const std::shared_ptr temporary_file_on_disk; }; } diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h index 97244650b6b..da3b70e9ba9 100644 --- a/src/Backups/BackupEntryWrappedWith.h +++ b/src/Backups/BackupEntryWrappedWith.h @@ -27,11 +27,17 @@ private: T custom_value; }; +template +BackupEntryPtr wrapBackupEntryWith(BackupEntryPtr && backup_entry, const T & custom_value) +{ + return std::make_shared>(std::move(backup_entry), custom_value); +} + template void wrapBackupEntriesWith(std::vector> & backup_entries, const T & custom_value) { for (auto & [_, backup_entry] : backup_entries) - backup_entry = std::make_shared>(std::move(backup_entry), custom_value); + backup_entry = wrapBackupEntryWith(std::move(backup_entry), custom_value); } } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index ec00cc3d2b9..4df490f41fe 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -392,9 +393,12 @@ void DataPartStorageOnDiskBase::backup( file_hash = {it->second.file_hash.first, it->second.file_hash.second}; } - backup_entries.emplace_back( - filepath_in_backup, - std::make_unique(disk, filepath_on_disk, read_settings, file_size, file_hash, temp_dir_owner)); + BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, read_settings, file_size, file_hash); + + if (temp_dir_owner) + backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); + + backup_entries.emplace_back(filepath_in_backup, std::move(backup_entry)); } } diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 8264d67aaba..19887d6695e 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -951,10 +952,10 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c String data_file_name = fileName(data_file.path); String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file.path, hardlink_file_path); - backup_entries_collector.addBackupEntry( - data_path_in_backup_fs / 
data_file_name, - std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(data_file.path), std::nullopt, temp_dir_owner)); + BackupEntryPtr backup_entry = std::make_unique( + disk, hardlink_file_path, read_settings, file_checker.getFileSize(data_file.path)); + backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); + backup_entries_collector.addBackupEntry(data_path_in_backup_fs / data_file_name, std::move(backup_entry)); } /// __marks.mrk @@ -964,10 +965,10 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c String marks_file_name = fileName(marks_file_path); String hardlink_file_path = temp_dir / marks_file_name; disk->createHardLink(marks_file_path, hardlink_file_path); - backup_entries_collector.addBackupEntry( - data_path_in_backup_fs / marks_file_name, - std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(marks_file_path), std::nullopt, temp_dir_owner)); + BackupEntryPtr backup_entry = std::make_unique( + disk, hardlink_file_path, read_settings, file_checker.getFileSize(marks_file_path)); + backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); + backup_entries_collector.addBackupEntry(data_path_in_backup_fs / marks_file_name, std::move(backup_entry)); } /// sizes.json diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index d54725b8b39..ddb55c119c4 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -551,10 +552,10 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec String data_file_name = fileName(data_file_path); String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file_path, hardlink_file_path); - backup_entries_collector.addBackupEntry( - data_path_in_backup_fs / data_file_name, - std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(data_file_path), std::nullopt, temp_dir_owner)); + BackupEntryPtr backup_entry = std::make_unique( + disk, hardlink_file_path, read_settings, file_checker.getFileSize(data_file_path)); + backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); + backup_entries_collector.addBackupEntry(data_path_in_backup_fs / data_file_name, std::move(backup_entry)); } /// index.mrk @@ -563,10 +564,10 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec String index_file_name = fileName(index_file_path); String hardlink_file_path = temp_dir / index_file_name; disk->createHardLink(index_file_path, hardlink_file_path); - backup_entries_collector.addBackupEntry( - data_path_in_backup_fs / index_file_name, - std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(index_file_path), std::nullopt, temp_dir_owner)); + BackupEntryPtr backup_entry = std::make_unique( + disk, hardlink_file_path, read_settings, file_checker.getFileSize(index_file_path)); + backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); + backup_entries_collector.addBackupEntry(data_path_in_backup_fs / index_file_name, std::move(backup_entry)); } /// sizes.json From c92219f01b29c54c6a83b7b2ea63565ff67b7681 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 25 Apr 2023 19:44:03 +0200 Subject: [PATCH 076/127] BACKUP now writes encrypted data for tables on encrypted disks. 
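
With this change, files that reside on an encrypted disk are put into the
backup as raw (already encrypted) bytes and are restored the same way, so
the data is never decrypted on the backup path. A condensed, illustrative
sketch of the two sides (adapted from this patch; the template arguments
of the smart pointers are elided in this excerpt, so SeekableReadBuffer
and WriteBuffer are assumed here):

    /// Backup side (BackupEntryFromImmutableFile): read the file without decrypting it.
    std::unique_ptr<SeekableReadBuffer> getReadBuffer() const
    {
        if (data_source_description.is_encrypted)
            return disk->readEncryptedFile(file_path, settings);
        else
            return disk->readFile(file_path, settings);
    }

    /// Restore side (IBackupReader::copyFileToDisk): write the already encrypted
    /// bytes back without encrypting them a second time.
    std::unique_ptr<WriteBuffer> write_buffer;
    if (encrypted_in_backup)
        write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, write_settings);
    else
        write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, write_settings);
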
--- src/Backups/BackupCoordinationRemote.cpp | 2 + src/Backups/BackupEntryFromAppendOnlyFile.h | 3 +- src/Backups/BackupEntryFromImmutableFile.cpp | 28 ++--- src/Backups/BackupEntryFromImmutableFile.h | 12 +- src/Backups/BackupEntryFromMemory.h | 7 -- src/Backups/BackupEntryFromSmallFile.cpp | 25 +++- src/Backups/BackupEntryFromSmallFile.h | 18 ++- src/Backups/BackupEntryWrappedWith.h | 5 +- src/Backups/BackupFileInfo.cpp | 2 + src/Backups/BackupFileInfo.h | 3 + src/Backups/BackupIO.cpp | 43 ++++--- src/Backups/BackupIO.h | 30 +++-- src/Backups/BackupIO_Disk.cpp | 46 +++---- src/Backups/BackupIO_Disk.h | 19 +-- src/Backups/BackupIO_File.cpp | 107 ++++++++-------- src/Backups/BackupIO_File.h | 18 +-- src/Backups/BackupIO_S3.cpp | 106 ++++++++++------ src/Backups/BackupIO_S3.h | 24 ++-- src/Backups/BackupImpl.cpp | 47 +++++-- src/Backups/BackupImpl.h | 2 + src/Backups/IBackupEntriesLazyBatch.cpp | 20 +-- src/Backups/IBackupEntry.h | 9 +- src/Common/ErrorCodes.cpp | 1 + src/Disks/DiskEncrypted.h | 24 ++-- src/Disks/DiskLocal.cpp | 32 +++-- src/Disks/DiskLocal.h | 3 +- src/Disks/DiskType.cpp | 5 + src/Disks/DiskType.h | 1 + src/Disks/IDisk.h | 27 ++-- src/Disks/IDiskTransaction.h | 2 +- .../ObjectStorages/DiskObjectStorage.cpp | 11 +- src/Disks/ObjectStorages/DiskObjectStorage.h | 2 +- .../DiskObjectStorageTransaction.cpp | 6 +- .../ObjectStorages/S3/copyS3FileToDisk.cpp | 117 ------------------ .../ObjectStorages/S3/copyS3FileToDisk.h | 52 -------- src/Storages/StorageLog.cpp | 1 + src/Storages/StorageMemory.cpp | 1 + src/Storages/StorageStripeLog.cpp | 1 + 38 files changed, 413 insertions(+), 449 deletions(-) delete mode 100644 src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp delete mode 100644 src/Disks/ObjectStorages/S3/copyS3FileToDisk.h diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index c1ef353c4eb..9a2fdf5dd6b 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -115,6 +115,7 @@ namespace writeBinary(info.checksum, out); writeBinary(info.base_size, out); writeBinary(info.base_checksum, out); + writeBinary(info.encrypted_by_disk, out); /// We don't store `info.data_file_name` and `info.data_file_index` because they're determined automalically /// after reading file infos for all the hosts (see the class BackupCoordinationFileInfos). } @@ -136,6 +137,7 @@ namespace readBinary(info.checksum, in); readBinary(info.base_size, in); readBinary(info.base_checksum, in); + readBinary(info.encrypted_by_disk, in); } return res; } diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index b7a39c935a9..7c57e55923e 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -11,7 +11,6 @@ namespace DB class BackupEntryFromAppendOnlyFile : public BackupEntryFromImmutableFile { public: - /// The constructor is allowed to not set `file_size_` or `checksum_`, in that case it will be calculated from the data. 
BackupEntryFromAppendOnlyFile( const DiskPtr & disk_, @@ -23,6 +22,8 @@ public: UInt64 getSize() const override { return limit; } std::unique_ptr getReadBuffer() const override; + bool isFromImmutableFile() const override { return false; } + private: const UInt64 limit; }; diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index 790ea567496..7545134f638 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -16,9 +16,10 @@ BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( const std::optional & checksum_) : disk(disk_) , file_path(file_path_) + , data_source_description(disk->getDataSourceDescription()) , settings(settings_) - , file_size(file_size_) - , checksum(checksum_) + , file_size(data_source_description.is_encrypted ? std::optional{} : file_size_) + , checksum(data_source_description.is_encrypted ? std::optional{} : checksum_) { } @@ -28,24 +29,21 @@ UInt64 BackupEntryFromImmutableFile::getSize() const { std::lock_guard lock{get_file_size_mutex}; if (!file_size) - file_size = disk->getFileSize(file_path); + { + if (data_source_description.is_encrypted) + file_size = disk->getEncryptedFileSize(file_path); + else + file_size = disk->getFileSize(file_path); + } return *file_size; } std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer() const { - return disk->readFile(file_path, settings); -} - - -DataSourceDescription BackupEntryFromImmutableFile::getDataSourceDescription() const -{ - return disk->getDataSourceDescription(); -} - -String BackupEntryFromImmutableFile::getFilePath() const -{ - return file_path; + if (data_source_description.is_encrypted) + return disk->readEncryptedFile(file_path, settings); + else + return disk->readFile(file_path, settings); } } diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index 4f2f902d31e..f2801b67df6 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -14,7 +14,6 @@ using DiskPtr = std::shared_ptr; class BackupEntryFromImmutableFile : public IBackupEntry { public: - /// The constructor is allowed to not set `file_size_` or `checksum_`, in that case it will be calculated from the data. 
BackupEntryFromImmutableFile( const DiskPtr & disk_, @@ -28,15 +27,20 @@ public: UInt64 getSize() const override; std::optional getChecksum() const override { return checksum; } std::unique_ptr getReadBuffer() const override; + + bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } - String getFilePath() const override; - DataSourceDescription getDataSourceDescription() const override; + DataSourceDescription getDataSourceDescription() const override { return data_source_description; } - DiskPtr tryGetDiskIfExists() const override { return disk; } + bool isFromFile() const override { return true; } + bool isFromImmutableFile() const override { return true; } + DiskPtr getDisk() const override { return disk; } + String getFilePath() const override { return file_path; } private: const DiskPtr disk; const String file_path; + const DataSourceDescription data_source_description; ReadSettings settings; mutable std::optional file_size TSA_GUARDED_BY(get_file_size_mutex); mutable std::mutex get_file_size_mutex; diff --git a/src/Backups/BackupEntryFromMemory.h b/src/Backups/BackupEntryFromMemory.h index df3b9de40e3..64f46d68580 100644 --- a/src/Backups/BackupEntryFromMemory.h +++ b/src/Backups/BackupEntryFromMemory.h @@ -19,18 +19,11 @@ public: std::optional getChecksum() const override { return checksum; } std::unique_ptr getReadBuffer() const override; - String getFilePath() const override - { - return ""; - } - DataSourceDescription getDataSourceDescription() const override { return DataSourceDescription{DataSourceType::RAM, "", false, false}; } - DiskPtr tryGetDiskIfExists() const override { return nullptr; } - private: const String data; const std::optional checksum; diff --git a/src/Backups/BackupEntryFromSmallFile.cpp b/src/Backups/BackupEntryFromSmallFile.cpp index d24b3a6498d..6f7d2364031 100644 --- a/src/Backups/BackupEntryFromSmallFile.cpp +++ b/src/Backups/BackupEntryFromSmallFile.cpp @@ -1,6 +1,9 @@ #include +#include +#include #include #include +#include #include @@ -16,9 +19,9 @@ namespace return s; } - String readFile(const DiskPtr & disk, const String & file_path) + String readFile(const DiskPtr & disk, const String & file_path, bool read_encrypted) { - auto buf = disk->readFile(file_path); + auto buf = read_encrypted ? disk->readEncryptedFile(file_path) : disk->readFile(file_path); String s; readStringUntilEOF(s, *buf); return s; @@ -27,14 +30,26 @@ namespace BackupEntryFromSmallFile::BackupEntryFromSmallFile(const String & file_path_, const std::optional & checksum_) - : BackupEntryFromMemory(readFile(file_path_), checksum_), file_path(file_path_) + : file_path(file_path_) + , data_source_description(DiskLocal::getLocalDataSourceDescription(file_path_)) + , data(readFile(file_path_)) + , checksum(checksum_) { } - + BackupEntryFromSmallFile::BackupEntryFromSmallFile( const DiskPtr & disk_, const String & file_path_, const std::optional & checksum_) - : BackupEntryFromMemory(readFile(disk_, file_path_), checksum_), disk(disk_), file_path(file_path_) + : disk(disk_) + , file_path(file_path_) + , data_source_description(disk_->getDataSourceDescription()) + , data(readFile(disk_, file_path, data_source_description.is_encrypted)) + , checksum(data_source_description.is_encrypted ? 
std::optional{} : checksum_) { } +std::unique_ptr BackupEntryFromSmallFile::getReadBuffer() const +{ + return std::make_unique(data); +} + } diff --git a/src/Backups/BackupEntryFromSmallFile.h b/src/Backups/BackupEntryFromSmallFile.h index 99e319f07a0..2f7f3764571 100644 --- a/src/Backups/BackupEntryFromSmallFile.h +++ b/src/Backups/BackupEntryFromSmallFile.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -10,7 +10,7 @@ using DiskPtr = std::shared_ptr; /// Represents a file prepared to be included in a backup, /// assuming that the file is small and can be easily loaded into memory. -class BackupEntryFromSmallFile : public BackupEntryFromMemory +class BackupEntryFromSmallFile : public IBackupEntry { public: /// The constructor is allowed to not set `checksum_`, in that case it will be calculated from the data. @@ -23,12 +23,24 @@ public: const String & file_path_, const std::optional & checksum_ = {}); + UInt64 getSize() const override { return data.size(); } + std::optional getChecksum() const override { return checksum; } + std::unique_ptr getReadBuffer() const override; + + bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } + + bool isFromFile() const override { return true; } + DiskPtr getDisk() const override { return disk; } String getFilePath() const override { return file_path; } - DiskPtr tryGetDiskIfExists() const override { return disk; } + DataSourceDescription getDataSourceDescription() const override { return data_source_description; } + private: const DiskPtr disk; const String file_path; + const DataSourceDescription data_source_description; + const String data; + const std::optional checksum; }; } diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h index da3b70e9ba9..933fec39634 100644 --- a/src/Backups/BackupEntryWrappedWith.h +++ b/src/Backups/BackupEntryWrappedWith.h @@ -18,8 +18,11 @@ public: UInt64 getSize() const override { return entry->getSize(); } std::optional getChecksum() const override { return entry->getChecksum(); } std::unique_ptr getReadBuffer() const override { return entry->getReadBuffer(); } + bool isEncryptedByDisk() const override { return entry->isEncryptedByDisk(); } + bool isFromFile() const override { return entry->isFromFile(); } + bool isFromImmutableFile() const override { return entry->isFromImmutableFile(); } String getFilePath() const override { return entry->getFilePath(); } - DiskPtr tryGetDiskIfExists() const override { return entry->tryGetDiskIfExists(); } + DiskPtr getDisk() const override { return entry->getDisk(); } DataSourceDescription getDataSourceDescription() const override { return entry->getDataSourceDescription(); } private: diff --git a/src/Backups/BackupFileInfo.cpp b/src/Backups/BackupFileInfo.cpp index 5a3076d1647..91ddc52ae44 100644 --- a/src/Backups/BackupFileInfo.cpp +++ b/src/Backups/BackupFileInfo.cpp @@ -111,6 +111,7 @@ String BackupFileInfo::describe() const result += fmt::format("base_checksum: {};\n", getHexUIntLowercase(checksum)); result += fmt::format("data_file_name: {};\n", data_file_name); result += fmt::format("data_file_index: {};\n", data_file_index); + result += fmt::format("encrypted_by_disk: {};\n", encrypted_by_disk); return result; } @@ -122,6 +123,7 @@ BackupFileInfo buildFileInfoForBackupEntry(const String & file_name, const Backu BackupFileInfo info; info.file_name = adjusted_path; info.size = backup_entry->getSize(); + info.encrypted_by_disk = backup_entry->isEncryptedByDisk(); /// We don't set 
`info.data_file_name` and `info.data_file_index` in this function because they're set during backup coordination /// (see the class BackupCoordinationFileInfos). diff --git a/src/Backups/BackupFileInfo.h b/src/Backups/BackupFileInfo.h index ae6ec83a37b..a925a1e81ac 100644 --- a/src/Backups/BackupFileInfo.h +++ b/src/Backups/BackupFileInfo.h @@ -35,6 +35,9 @@ struct BackupFileInfo /// This field is set during backup coordination (see the class BackupCoordinationFileInfos). size_t data_file_index = static_cast(-1); + /// Whether this file is encrypted by an encrypted disk. + bool encrypted_by_disk = false; + struct LessByFileName { bool operator()(const BackupFileInfo & lhs, const BackupFileInfo & rhs) const { return (lhs.file_name < rhs.file_name); } diff --git a/src/Backups/BackupIO.cpp b/src/Backups/BackupIO.cpp index 7b269bd965f..4d890ed7419 100644 --- a/src/Backups/BackupIO.cpp +++ b/src/Backups/BackupIO.cpp @@ -14,13 +14,18 @@ IBackupReader::IBackupReader(Poco::Logger * log_) : log(log_) { } -void IBackupReader::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) +void IBackupReader::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) { - LOG_TRACE(log, "Copying file {} through buffers", file_name); - auto read_buffer = readFile(file_name); - auto write_buffer = destination_disk->writeFile(destination_path, std::min(size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings); - copyData(*read_buffer, *write_buffer, size); + LOG_TRACE(log, "Copying file {} to disk {} through buffers", path_in_backup, destination_disk->getName()); + auto read_buffer = readFile(path_in_backup); + auto buf_size = std::min(file_size, DBMS_DEFAULT_BUFFER_SIZE); + std::unique_ptr write_buffer; + if (encrypted_in_backup) + write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, write_settings); + else + write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, write_settings); + copyData(*read_buffer, *write_buffer, file_size); write_buffer->finalize(); } @@ -29,22 +34,28 @@ IBackupWriter::IBackupWriter(const ContextPtr & context_, Poco::Logger * log_) { } -void IBackupWriter::copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name) +void IBackupWriter::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { auto read_buffer = create_read_buffer(); - if (offset) - read_buffer->seek(offset, SEEK_SET); - auto write_buffer = writeFile(dest_file_name); - copyData(*read_buffer, *write_buffer, size); + if (start_pos) + read_buffer->seek(start_pos, SEEK_SET); + auto write_buffer = writeFile(path_in_backup); + copyData(*read_buffer, *write_buffer, length); write_buffer->finalize(); } -void IBackupWriter::copyFileFromDisk( - DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) +void IBackupWriter::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) { - LOG_TRACE(log, "Copying file {} through buffers", src_file_name); - auto create_read_buffer = [this, src_disk, src_file_name] { 
return src_disk->readFile(src_file_name, read_settings); }; - copyDataToFile(create_read_buffer, src_offset, src_size, dest_file_name); + LOG_TRACE(log, "Copying file {} from disk {} through buffers", src_path, src_disk->getName()); + auto create_read_buffer = [this, src_disk, src_path, copy_encrypted] + { + if (copy_encrypted) + return src_disk->readEncryptedFile(src_path, read_settings); + else + return src_disk->readFile(src_path, read_settings); + }; + copyDataToFile(path_in_backup, create_read_buffer, start_pos, length); } } diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index dae13422bf2..f65f0871698 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -16,14 +16,19 @@ class IBackupReader /// BackupReaderFile, BackupReaderDisk { public: explicit IBackupReader(Poco::Logger * log_); - virtual ~IBackupReader() = default; + virtual bool fileExists(const String & file_name) = 0; virtual UInt64 getFileSize(const String & file_name) = 0; + virtual std::unique_ptr readFile(const String & file_name) = 0; - virtual void copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings); - virtual DataSourceDescription getDataSourceDescription() const = 0; + + /// The function copyFileToDisk() can be much faster than reading the file with readFile() and then writing it to some disk. + /// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). + /// Parameters: + /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. + virtual void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings); protected: Poco::Logger * const log; @@ -33,28 +38,29 @@ protected: class IBackupWriter /// BackupWriterFile, BackupWriterDisk { public: - using CreateReadBufferFunction = std::function()>; - IBackupWriter(const ContextPtr & context_, Poco::Logger * log_); - virtual ~IBackupWriter() = default; + virtual bool fileExists(const String & file_name) = 0; virtual UInt64 getFileSize(const String & file_name) = 0; virtual bool fileContentsEqual(const String & file_name, const String & expected_file_contents) = 0; virtual std::unique_ptr writeFile(const String & file_name) = 0; - virtual void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name); + using CreateReadBufferFunction = std::function()>; + virtual void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length); - /// copyFileFromDisk() can be much faster than copyDataToFile() + /// The function copyFileFromDisk() can be much faster than copyDataToFile() /// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). - virtual void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name); + /// Parameters: + /// `start_pos` and `length` specify a part of the file on `src_disk` to copy to the backup. + /// `copy_encrypted` specify whether this function should copy encrypted data of the file `src_path` to the backup. 
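// Illustrative aside, not part of the patch: with ClickHouse's SeekableReadBuffer/WriteBuffer replaced
// by plain <fstream> stand-ins, the buffer-based fallback behind copyDataToFile()/copyFileFromDisk()
// boils down to "seek to `start_pos`, then stream exactly `length` bytes":

#include <algorithm>
#include <cstdint>
#include <fstream>
#include <string>
#include <vector>

void copyRangeThroughBuffers(const std::string & src, const std::string & dst, uint64_t start_pos, uint64_t length)
{
    std::ifstream in(src, std::ios::binary);
    std::ofstream out(dst, std::ios::binary | std::ios::trunc);
    in.seekg(static_cast<std::streamoff>(start_pos));

    std::vector<char> buf(1 << 20);
    while (length > 0 && in)
    {
        auto to_read = static_cast<std::streamsize>(std::min<uint64_t>(length, buf.size()));
        in.read(buf.data(), to_read);
        std::streamsize got = in.gcount();
        if (got <= 0)
            break;
        out.write(buf.data(), got);
        length -= static_cast<uint64_t>(got);
    }
}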
+ virtual void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length); virtual void removeFile(const String & file_name) = 0; virtual void removeFiles(const Strings & file_names) = 0; - virtual DataSourceDescription getDataSourceDescription() const = 0; - protected: Poco::Logger * const log; diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index 643922cf3d0..d24d90ae7ae 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -36,24 +36,19 @@ std::unique_ptr BackupReaderDisk::readFile(const String & fi return disk->readFile(path / file_name); } -void BackupReaderDisk::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) +void BackupReaderDisk::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) { - if ((write_mode == WriteMode::Rewrite) && (destination_disk->getDataSourceDescription() == getDataSourceDescription())) + if ((write_mode == WriteMode::Rewrite) && !encrypted_in_backup) { /// Use more optimal way. - LOG_TRACE(log, "Copying file {} using {} disk", file_name, toString(destination_disk->getDataSourceDescription().type)); - disk->copyFile(path / file_name, *destination_disk, destination_path, write_settings); + LOG_TRACE(log, "Copying file {} from disk {} to disk {}", path_in_backup, disk->getName(), destination_disk->getName()); + disk->copyFile(path / path_in_backup, *destination_disk, destination_path, write_settings); return; } /// Fallback to copy through buffers. - IBackupReader::copyFileToDisk(file_name, size, destination_disk, destination_path, write_mode, write_settings); -} - -DataSourceDescription BackupReaderDisk::getDataSourceDescription() const -{ - return disk->getDataSourceDescription(); + IBackupReader::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode, write_settings); } @@ -118,30 +113,21 @@ void BackupWriterDisk::removeFiles(const Strings & file_names) disk->removeDirectory(path); } -DataSourceDescription BackupWriterDisk::getDataSourceDescription() const +void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) { - return disk->getDataSourceDescription(); -} - -void BackupWriterDisk::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) -{ - /// IDisk::copyFile() can copy to the same disk only, and it cannot do the throttling. - if (!has_throttling && (getDataSourceDescription() == src_disk->getDataSourceDescription())) + if (!copy_encrypted && !start_pos && (length == src_disk->getFileSize(src_path))) { - /// IDisk::copyFile() can copy a file as a whole only. - if ((src_offset == 0) && (src_size == src_disk->getFileSize(src_file_name))) - { - /// Use more optimal way. - LOG_TRACE(log, "Copying file {} using {} disk", src_file_name, toString(src_disk->getDataSourceDescription().type)); - auto dest_file_path = path / dest_file_name; - disk->createDirectories(dest_file_path.parent_path()); - src_disk->copyFile(src_file_name, *disk, dest_file_path); - return; - } + /// Use more optimal way. 
+ LOG_TRACE(log, "Copying file {} from disk {} to disk {}", src_path, src_disk->getName(), disk->getName()); + auto dest_file_path = path / path_in_backup; + disk->createDirectories(dest_file_path.parent_path()); + src_disk->copyFile(src_path, *disk, dest_file_path); + return; } /// Fallback to copy through buffers. - IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name); + IBackupWriter::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } } diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index 884282ea4e0..d2af06668eb 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -18,13 +18,12 @@ public: bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; std::unique_ptr readFile(const String & file_name) override; - void copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) override; - DataSourceDescription getDataSourceDescription() const override; + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) override; private: - DiskPtr disk; - std::filesystem::path path; + const DiskPtr disk; + const std::filesystem::path path; }; class BackupWriterDisk : public IBackupWriter @@ -37,14 +36,16 @@ public: UInt64 getFileSize(const String & file_name) override; bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; - void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; + + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; - DataSourceDescription getDataSourceDescription() const override; private: - DiskPtr disk; - std::filesystem::path path; + const DiskPtr disk; + const std::filesystem::path path; const bool has_throttling; }; diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index b569e65284b..69d899528cd 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -15,7 +16,9 @@ namespace DB { BackupReaderFile::BackupReaderFile(const String & path_) - : IBackupReader(&Poco::Logger::get("BackupReaderFile")), path(path_) + : IBackupReader(&Poco::Logger::get("BackupReaderFile")) + , path(path_) + , data_source_description(DiskLocal::getLocalDataSourceDescription(path_)) { } @@ -36,25 +39,44 @@ std::unique_ptr BackupReaderFile::readFile(const String & fi return createReadBufferFromFileBase(path / file_name, {}); } -void BackupReaderFile::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) +void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & 
write_settings) { - if (destination_disk->getDataSourceDescription() == getDataSourceDescription()) + if (write_mode == WriteMode::Rewrite) { - /// Use more optimal way. - LOG_TRACE(log, "Copying file {} locally", file_name); - fs::copy(path / file_name, fullPath(destination_disk, destination_path), fs::copy_options::overwrite_existing); - return; + auto destination_data_source_description = destination_disk->getDataSourceDescription(); + if (destination_data_source_description.sameKind(data_source_description) + && (destination_data_source_description.is_encrypted == encrypted_in_backup)) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} to disk {} locally", path_in_backup, destination_disk->getName()); + + auto write_blob_function + = [abs_source_path = path / path_in_backup, file_size]( + const Strings & blob_path, WriteMode mode, const std::optional &) -> size_t + { + if (blob_path.size() != 1 || mode != WriteMode::Rewrite) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Blob writing function called with unexpected blob_path.size={} or mode={}", + blob_path.size(), mode); + fs::copy(abs_source_path, blob_path.at(0), fs::copy_options::overwrite_existing); + return file_size; + }; + + destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); + return; + } } /// Fallback to copy through buffers. - IBackupReader::copyFileToDisk(path / file_name, size, destination_disk, destination_path, write_mode, write_settings); + IBackupReader::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode, write_settings); } BackupWriterFile::BackupWriterFile(const String & path_, const ContextPtr & context_) : IBackupWriter(context_, &Poco::Logger::get("BackupWriterFile")) , path(path_) + , data_source_description(DiskLocal::getLocalDataSourceDescription(path_)) , has_throttling(static_cast(context_->getBackupsThrottler())) { } @@ -112,59 +134,36 @@ void BackupWriterFile::removeFiles(const Strings & file_names) fs::remove(path); } -DataSourceDescription BackupWriterFile::getDataSourceDescription() const -{ - DataSourceDescription data_source_description; - - data_source_description.type = DataSourceType::Local; - - if (auto block_device_id = tryGetBlockDeviceId(path); block_device_id.has_value()) - data_source_description.description = *block_device_id; - else - data_source_description.description = path; - data_source_description.is_encrypted = false; - data_source_description.is_cached = false; - - return data_source_description; -} - -DataSourceDescription BackupReaderFile::getDataSourceDescription() const -{ - DataSourceDescription data_source_description; - - data_source_description.type = DataSourceType::Local; - - if (auto block_device_id = tryGetBlockDeviceId(path); block_device_id.has_value()) - data_source_description.description = *block_device_id; - else - data_source_description.description = path; - data_source_description.is_encrypted = false; - data_source_description.is_cached = false; - - return data_source_description; -} - - -void BackupWriterFile::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) +void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) { /// std::filesystem::copy() can copy from the filesystem only, and it cannot do the throttling. 
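// Illustrative aside, not part of the patch: when the fast path below is taken (no throttling, a local
// source disk of the same kind, unencrypted data, a single blob, and the whole file being copied),
// the work reduces to a single std::filesystem::copy() call:

#include <filesystem>

void copyLocalFileWhole(const std::filesystem::path & abs_source_path, const std::filesystem::path & abs_dest_path)
{
    namespace fs = std::filesystem;
    fs::create_directories(abs_dest_path.parent_path());
    fs::copy(abs_source_path, abs_dest_path, fs::copy_options::overwrite_existing);
}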
- if (!has_throttling && (getDataSourceDescription() == src_disk->getDataSourceDescription())) + if (!has_throttling) { - std::string abs_source_path = fullPath(src_disk, src_file_name); - /// std::filesystem::copy() can copy a file as a whole only. - if ((src_offset == 0) && (src_size == fs::file_size(abs_source_path))) + auto source_data_source_description = src_disk->getDataSourceDescription(); + if (source_data_source_description.sameKind(data_source_description) + && (source_data_source_description.is_encrypted == copy_encrypted)) { - /// Use more optimal way. - LOG_TRACE(log, "Copying file {} locally", src_file_name); - auto abs_dest_path = path / dest_file_name; - fs::create_directories(abs_dest_path.parent_path()); - fs::copy(abs_source_path, abs_dest_path, fs::copy_options::overwrite_existing); - return; + if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 1) + { + auto abs_source_path = blob_path[0]; + + /// std::filesystem::copy() can copy a file as a whole only. + if ((start_pos == 0) && (length == fs::file_size(abs_source_path))) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} from disk {} locally", src_path, src_disk->getName()); + auto abs_dest_path = path / path_in_backup; + fs::create_directories(abs_dest_path.parent_path()); + fs::copy(abs_source_path, abs_dest_path, fs::copy_options::overwrite_existing); + return; + } + } } } /// Fallback to copy through buffers. - IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name); + IBackupWriter::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } } diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index 1f0a247c321..a8ada969ca4 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -16,12 +16,13 @@ public: bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; std::unique_ptr readFile(const String & file_name) override; - void copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) override; - DataSourceDescription getDataSourceDescription() const override; + + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) override; private: - std::filesystem::path path; + const std::filesystem::path path; + const DataSourceDescription data_source_description; }; class BackupWriterFile : public IBackupWriter @@ -34,13 +35,16 @@ public: UInt64 getFileSize(const String & file_name) override; bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; - void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; + + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; - DataSourceDescription getDataSourceDescription() const override; private: - std::filesystem::path path; + const std::filesystem::path path; + const DataSourceDescription 
data_source_description; const bool has_throttling; }; diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index d2861500159..00694a5e01d 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -2,7 +2,6 @@ #if USE_AWS_S3 #include -#include #include #include #include @@ -107,16 +106,11 @@ BackupReaderS3::BackupReaderS3( , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , read_settings(context_->getReadSettings()) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) + , data_source_description{DataSourceType::S3, s3_uri.endpoint, false, false} { request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint } -DataSourceDescription BackupReaderS3::getDataSourceDescription() const -{ - return DataSourceDescription{DataSourceType::S3, s3_uri.endpoint, false, false}; -} - - BackupReaderS3::~BackupReaderS3() = default; bool BackupReaderS3::fileExists(const String & file_name) @@ -138,23 +132,45 @@ std::unique_ptr BackupReaderS3::readFile(const String & file client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings); } -void BackupReaderS3::copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) +void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) { - copyS3FileToDisk( - client, - s3_uri.bucket, - fs::path(s3_uri.key) / file_name, - s3_uri.version_id, - 0, - size, - destination_disk, - destination_path, - write_mode, - read_settings, - write_settings, - request_settings, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupReaderS3")); + auto destination_data_source_description = destination_disk->getDataSourceDescription(); + if (destination_data_source_description.sameKind(data_source_description) + && (destination_data_source_description.is_encrypted == encrypted_in_backup)) + { + /// Use native copy, the more optimal way. + LOG_TRACE(log, "Copying {} from S3 to disk {} using native copy", path_in_backup, destination_disk->getName()); + auto write_blob_function = [&](const Strings & blob_path, WriteMode mode, const std::optional & object_attributes) -> size_t + { + /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. + if (blob_path.size() != 2 || mode != WriteMode::Rewrite) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Blob writing function called with unexpected blob_path.size={} or mode={}", + blob_path.size(), mode); + + copyS3File( + client, + s3_uri.bucket, + fs::path(s3_uri.key) / path_in_backup, + 0, + file_size, + /* dest_bucket= */ blob_path[0], + /* dest_key= */ blob_path[1], + request_settings, + object_attributes, + threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupReaderS3"), + /* for_disk_s3= */ true); + + return file_size; + }; + + destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); + return; + } + + /// Fallback to copy through buffers. 
+ IBackupReader::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode, write_settings); } @@ -164,27 +180,47 @@ BackupWriterS3::BackupWriterS3( , s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) + , data_source_description{DataSourceType::S3, s3_uri.endpoint, false, false} { request_settings.updateFromSettings(context_->getSettingsRef()); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint } -DataSourceDescription BackupWriterS3::getDataSourceDescription() const +void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) { - return DataSourceDescription{DataSourceType::S3, s3_uri.endpoint, false, false}; + auto source_data_source_description = src_disk->getDataSourceDescription(); + if (source_data_source_description.sameKind(data_source_description) + && (source_data_source_description.is_encrypted == copy_encrypted)) + { + /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket. + /// In this case we can't use native copy. + if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) + { + /// Use native copy, the more optimal way. + LOG_TRACE(log, "Copying file {} from disk {} to S3 using native copy", src_path, src_disk->getName()); + copyS3File( + client, + /* src_bucket */ blob_path[0], + /* src_key= */ blob_path[1], + start_pos, + length, + s3_uri.endpoint, + fs::path(s3_uri.key) / path_in_backup, + request_settings, + {}, + threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); + return; + } + } + + /// Fallback to copy through buffers. 
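// Illustrative aside, not part of the patch: besides their individual extra conditions (whole file,
// single blob, Rewrite mode, no throttling), the File and S3 readers/writers above all gate the
// "native" fast path on the same two checks before falling back to buffers. As a standalone helper
// (hypothetical, not introduced by this patch) the check would look like this:

bool nativeCopyPossible(const DataSourceDescription & disk_side, const DataSourceDescription & backup_side, bool data_is_encrypted)
{
    /// sameKind() compares only `type` and `description` and ignores is_encrypted / is_cached,
    /// so an encrypted disk still qualifies as long as the encryption flag of the copied data matches.
    return disk_side.sameKind(backup_side) && (disk_side.is_encrypted == data_is_encrypted);
}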
+ IBackupWriter::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } -void BackupWriterS3::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) +void BackupWriterS3::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { - copyS3FileFromDisk(src_disk, src_file_name, src_offset, src_size, - client, s3_uri.bucket, fs::path(s3_uri.key) / dest_file_name, read_settings, request_settings, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); -} - -void BackupWriterS3::copyDataToFile( - const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name) -{ - copyDataToS3File(create_read_buffer, offset, size, client, s3_uri.bucket, fs::path(s3_uri.key) / dest_file_name, request_settings, {}, + copyDataToS3File(create_read_buffer, start_pos, length, client, s3_uri.bucket, fs::path(s3_uri.key) / path_in_backup, request_settings, {}, threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); } diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 7d53d30e8d6..1db9d5f8c4c 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -23,15 +23,16 @@ public: bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; std::unique_ptr readFile(const String & file_name) override; - void copyFileToDisk(const String & file_name, size_t size, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) override; - DataSourceDescription getDataSourceDescription() const override; + + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) override; private: - S3::URI s3_uri; - std::shared_ptr client; - ReadSettings read_settings; + const S3::URI s3_uri; + const std::shared_ptr client; + const ReadSettings read_settings; S3Settings::RequestSettings request_settings; + const DataSourceDescription data_source_description; }; @@ -46,9 +47,9 @@ public: bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; - void copyDataToFile(const CreateReadBufferFunction & create_read_buffer, UInt64 offset, UInt64 size, const String & dest_file_name) override; - void copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name) override; - DataSourceDescription getDataSourceDescription() const override; + void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; @@ -56,10 +57,11 @@ public: private: void removeFilesBatch(const Strings & file_names); - S3::URI s3_uri; - std::shared_ptr client; + const S3::URI s3_uri; + const std::shared_ptr client; S3Settings::RequestSettings request_settings; std::optional supports_batch_delete; + const 
DataSourceDescription data_source_description; }; } diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 55fb6dbfe03..9bfa2d77353 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -36,6 +36,7 @@ namespace ErrorCodes extern const int WRONG_BASE_BACKUP; extern const int BACKUP_ENTRY_NOT_FOUND; extern const int BACKUP_IS_EMPTY; + extern const int CANNOT_RESTORE_TO_NONENCRYPTED_DISK; extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; extern const int LOGICAL_ERROR; } @@ -339,6 +340,8 @@ void BackupImpl::writeBackupMetadata() } if (!info.data_file_name.empty() && (info.data_file_name != info.file_name)) *out << "" << xml << info.data_file_name << ""; + if (info.encrypted_by_disk) + *out << "true"; } total_size += info.size; @@ -444,6 +447,7 @@ void BackupImpl::readBackupMetadata() { info.data_file_name = getString(file_config, "data_file", info.file_name); } + info.encrypted_by_disk = getBool(file_config, "encrypted_by_disk", false); } file_names.emplace(info.file_name, std::pair{info.size, info.checksum}); @@ -633,6 +637,11 @@ std::unique_ptr BackupImpl::readFile(const String & file_nam } std::unique_ptr BackupImpl::readFile(const SizeAndChecksum & size_and_checksum) const +{ + return readFileImpl(size_and_checksum, /* read_encrypted= */ false); +} + +std::unique_ptr BackupImpl::readFileImpl(const SizeAndChecksum & size_and_checksum, bool read_encrypted) const { if (open_mode != OpenMode::READ) throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for reading"); @@ -660,6 +669,14 @@ std::unique_ptr BackupImpl::readFile(const SizeAndChecksum & info = it->second; } + if (info.encrypted_by_disk != read_encrypted) + { + throw Exception( + ErrorCodes::CANNOT_RESTORE_TO_NONENCRYPTED_DISK, + "File {} is encrypted in the backup, it can be restored only to an encrypted disk", + info.data_file_name); + } + std::unique_ptr read_buffer; std::unique_ptr base_read_buffer; @@ -760,14 +777,21 @@ size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, Dis info = it->second; } + if (info.encrypted_by_disk && !destination_disk->getDataSourceDescription().is_encrypted) + { + throw Exception( + ErrorCodes::CANNOT_RESTORE_TO_NONENCRYPTED_DISK, + "File {} is encrypted in the backup, it can be restored only to an encrypted disk", + info.data_file_name); + } + bool file_copied = false; if (info.size && !info.base_size && !use_archive) { /// Data comes completely from this backup. - reader->copyFileToDisk(info.data_file_name, info.size, destination_disk, destination_path, write_mode, write_settings); + reader->copyFileToDisk(info.data_file_name, info.size, info.encrypted_by_disk, destination_disk, destination_path, write_mode, write_settings); file_copied = true; - } else if (info.size && (info.size == info.base_size)) { @@ -786,9 +810,13 @@ size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, Dis else { /// Use the generic way to copy data. `readFile()` will update `num_read_files`. 
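// For orientation (not part of the patch): copyFileToDisk() around this hunk has three outcomes:
// (1) the data lives entirely in this backup and not inside an archive, so it is delegated to
// reader->copyFileToDisk() above; (2) the data comes entirely from the base backup (that branch is not
// fully visible in this hunk); (3) otherwise the generic path below reads through readFileImpl() and
// writes with writeEncryptedFile() or writeFile() depending on info.encrypted_by_disk.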
- auto read_buffer = readFile(size_and_checksum); - auto write_buffer = destination_disk->writeFile(destination_path, std::min(info.size, DBMS_DEFAULT_BUFFER_SIZE), - write_mode, write_settings); + auto read_buffer = readFileImpl(size_and_checksum, /* read_encrypted= */ info.encrypted_by_disk); + std::unique_ptr write_buffer; + size_t buf_size = std::min(info.size, DBMS_DEFAULT_BUFFER_SIZE); + if (info.encrypted_by_disk) + write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, write_settings); + else + write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, write_settings); copyData(*read_buffer, *write_buffer, info.size); write_buffer->finalize(); } @@ -814,8 +842,9 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) should_check_lock_file = true; } - auto src_disk = entry->tryGetDiskIfExists(); + auto src_disk = entry->getDisk(); auto src_file_path = entry->getFilePath(); + bool from_immutable_file = entry->isFromImmutableFile(); String src_file_desc = src_file_path.empty() ? "memory buffer" : ("file " + src_file_path); if (info.data_file_name.empty()) @@ -845,16 +874,16 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) copyData(*read_buffer, *out); out->finalize(); } - else if (src_disk) + else if (src_disk && from_immutable_file) { LOG_TRACE(log, "Writing backup for file {} from {} (disk {}): data file #{}", info.data_file_name, src_file_desc, src_disk->getName(), info.data_file_index); - writer->copyFileFromDisk(src_disk, src_file_path, info.base_size, info.size - info.base_size, info.data_file_name); + writer->copyFileFromDisk(info.data_file_name, src_disk, src_file_path, info.encrypted_by_disk, info.base_size, info.size - info.base_size); } else { LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); auto create_read_buffer = [entry] { return entry->getReadBuffer(); }; - writer->copyDataToFile(create_read_buffer, info.base_size, info.size - info.base_size, info.data_file_name); + writer->copyDataToFile(info.data_file_name, create_read_buffer, info.base_size, info.size - info.base_size); } { diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index bf94926c46c..511b100c557 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -109,6 +109,8 @@ private: /// Calculates and sets `compressed_size`. 
void setCompressedSize(); + std::unique_ptr readFileImpl(const SizeAndChecksum & size_and_checksum, bool read_encrypted) const; + const String backup_name_for_logging; const bool use_archive; const ArchiveParams archive_params; diff --git a/src/Backups/IBackupEntriesLazyBatch.cpp b/src/Backups/IBackupEntriesLazyBatch.cpp index 78086015e7b..cd4d470967b 100644 --- a/src/Backups/IBackupEntriesLazyBatch.cpp +++ b/src/Backups/IBackupEntriesLazyBatch.cpp @@ -20,20 +20,12 @@ public: UInt64 getSize() const override { return getInternalBackupEntry()->getSize(); } std::optional getChecksum() const override { return getInternalBackupEntry()->getChecksum(); } std::unique_ptr getReadBuffer() const override { return getInternalBackupEntry()->getReadBuffer(); } - String getFilePath() const override - { - return getInternalBackupEntry()->getFilePath(); - } - - DiskPtr tryGetDiskIfExists() const override - { - return getInternalBackupEntry()->tryGetDiskIfExists(); - } - - DataSourceDescription getDataSourceDescription() const override - { - return getInternalBackupEntry()->getDataSourceDescription(); - } + bool isEncryptedByDisk() const override { return getInternalBackupEntry()->isEncryptedByDisk(); } + DataSourceDescription getDataSourceDescription() const override { return getInternalBackupEntry()->getDataSourceDescription(); } + bool isFromFile() const override { return getInternalBackupEntry()->isFromFile(); } + bool isFromImmutableFile() const override { return getInternalBackupEntry()->isFromImmutableFile(); } + String getFilePath() const override { return getInternalBackupEntry()->getFilePath(); } + DiskPtr getDisk() const override { return getInternalBackupEntry()->getDisk(); } private: BackupEntryPtr getInternalBackupEntry() const diff --git a/src/Backups/IBackupEntry.h b/src/Backups/IBackupEntry.h index 2a71a1e9756..7a93d4035df 100644 --- a/src/Backups/IBackupEntry.h +++ b/src/Backups/IBackupEntry.h @@ -27,9 +27,14 @@ public: /// Returns a read buffer for reading the data. virtual std::unique_ptr getReadBuffer() const = 0; - virtual String getFilePath() const = 0; + /// Returns true if the data returned by getReadBuffer() is encrypted by an encrypted disk. + virtual bool isEncryptedByDisk() const { return false; } - virtual DiskPtr tryGetDiskIfExists() const = 0; + /// Returns information about disk and file if this backup entry is generated from a file. 
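// Illustrative aside, not part of the patch: the accessors declared just below are what lets
// BackupImpl::writeFile() (changed earlier in this patch) pick the fast path. A condensed sketch of
// that dispatch, using only the IBackupEntry/IBackupWriter interfaces (the archive-writing branch is
// omitted):

void writeEntry(IBackupWriter & writer, const BackupFileInfo & info, const BackupEntryPtr & entry)
{
    if (entry->isFromImmutableFile() && entry->getDisk())
    {
        /// The entry is backed by an immutable file on a disk: copy straight from the disk,
        /// possibly taking the raw encrypted bytes (info.encrypted_by_disk).
        writer.copyFileFromDisk(info.data_file_name, entry->getDisk(), entry->getFilePath(),
                                info.encrypted_by_disk, info.base_size, info.size - info.base_size);
    }
    else
    {
        /// Anything else (memory buffers, mutable files) goes through a read buffer.
        writer.copyDataToFile(info.data_file_name, [entry] { return entry->getReadBuffer(); },
                              info.base_size, info.size - info.base_size);
    }
}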
+ virtual bool isFromFile() const { return false; } + virtual bool isFromImmutableFile() const { return false; } + virtual String getFilePath() const { return ""; } + virtual DiskPtr getDisk() const { return nullptr; } virtual DataSourceDescription getDataSourceDescription() const = 0; }; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 24e6114b26c..83a7314ac7a 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -579,6 +579,7 @@ M(694, ASYNC_LOAD_CYCLE) \ M(695, ASYNC_LOAD_FAILED) \ M(696, ASYNC_LOAD_CANCELED) \ + M(697, CANNOT_RESTORE_TO_NONENCRYPTED_DISK) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 9c9e61275ce..be726ef46b4 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -131,18 +131,6 @@ public: WriteMode mode, const WriteSettings & settings) override; - std::optional> getBlobPath(const String & path) const override - { - auto wrapped_path = wrappedPath(path); - return delegate->getBlobPath(wrapped_path); - } - - void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override - { - auto wrapped_path = wrappedPath(path); - delegate->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function)); - } - void removeFile(const String & path) override { auto wrapped_path = wrappedPath(path); @@ -198,6 +186,18 @@ public: delegate->removeSharedFileIfExists(wrapped_path, flag); } + Strings getBlobPath(const String & path) const override + { + auto wrapped_path = wrappedPath(path); + return delegate->getBlobPath(wrapped_path); + } + + void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override + { + auto wrapped_path = wrappedPath(path); + delegate->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function)); + } + std::unique_ptr readEncryptedFile( const String & path, const ReadSettings & settings) const override { diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index a40368fae88..c76ea289101 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -328,15 +328,16 @@ DiskLocal::writeFile(const String & path, size_t buf_size, WriteMode mode, const fs::path(disk_path) / path, buf_size, flags, settings.local_throttler); } -std::optional> DiskLocal::getBlobPath(const String & path) const +std::vector DiskLocal::getBlobPath(const String & path) const { - return std::make_pair(fs::path(disk_path) / path, ""); + auto fs_path = fs::path(disk_path) / path; + return {fs_path}; } void DiskLocal::writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) { - auto blob_path = std::make_pair(fs::path(disk_path) / path, ""); - std::move(write_blob_function)(blob_path, mode, {}); + auto fs_path = fs::path(disk_path) / path; + std::move(write_blob_function)({fs_path}, mode, {}); } void DiskLocal::removeFile(const String & path) @@ -466,15 +467,8 @@ DiskLocal::DiskLocal(const String & name_, const String & path_, UInt64 keep_fre , disk_path(path_) , keep_free_space_bytes(keep_free_space_bytes_) , logger(&Poco::Logger::get("DiskLocal")) + , data_source_description(getLocalDataSourceDescription(disk_path)) { - data_source_description.type = DataSourceType::Local; - - if (auto block_device_id = tryGetBlockDeviceId(disk_path); block_device_id.has_value()) - data_source_description.description = 
*block_device_id; - else - data_source_description.description = disk_path; - data_source_description.is_encrypted = false; - data_source_description.is_cached = false; } DiskLocal::DiskLocal( @@ -490,6 +484,20 @@ DataSourceDescription DiskLocal::getDataSourceDescription() const return data_source_description; } +DataSourceDescription DiskLocal::getLocalDataSourceDescription(const String & path) +{ + DataSourceDescription res; + res.type = DataSourceType::Local; + + if (auto block_device_id = tryGetBlockDeviceId(path); block_device_id.has_value()) + res.description = *block_device_id; + else + res.description = path; + res.is_encrypted = false; + res.is_cached = false; + return res; +} + void DiskLocal::shutdown() { if (disk_checker) diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index b838654925d..3d340ae40b7 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -81,7 +81,7 @@ public: WriteMode mode, const WriteSettings & settings) override; - std::optional> getBlobPath(const String & path) const override; + Strings getBlobPath(const String & path) const override; void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override; void removeFile(const String & path) override; @@ -102,6 +102,7 @@ public: void truncateFile(const String & path, size_t size) override; DataSourceDescription getDataSourceDescription() const override; + static DataSourceDescription getLocalDataSourceDescription(const String & path); bool isRemote() const override { return false; } diff --git a/src/Disks/DiskType.cpp b/src/Disks/DiskType.cpp index 92979ab505c..aa18cc6e0cb 100644 --- a/src/Disks/DiskType.cpp +++ b/src/Disks/DiskType.cpp @@ -8,4 +8,9 @@ bool DataSourceDescription::operator==(const DataSourceDescription & other) cons return std::tie(type, description, is_encrypted) == std::tie(other.type, other.description, other.is_encrypted); } +bool DataSourceDescription::sameKind(const DataSourceDescription & other) const +{ + return std::tie(type, description) == std::tie(other.type, other.description); +} + } diff --git a/src/Disks/DiskType.h b/src/Disks/DiskType.h index 840ed5549e6..7d47fa8da78 100644 --- a/src/Disks/DiskType.h +++ b/src/Disks/DiskType.h @@ -51,6 +51,7 @@ struct DataSourceDescription bool is_cached = false; bool operator==(const DataSourceDescription & other) const; + bool sameKind(const DataSourceDescription & other) const; }; } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index b98d8a74308..006f2d882a0 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -209,19 +209,6 @@ public: WriteMode mode = WriteMode::Rewrite, const WriteSettings & settings = {}) = 0; - /// Returns the path to a blob representing a specified file. - /// The meaning of the returned path depends on disk's type. - /// For DiskLocal it the absolute path to the file and for DiskObjectStorage it's the name of a namespace - /// combined with StoredObject::absolute_path. - virtual std::optional> getBlobPath(const String & path) const = 0; - - using WriteBlobFunction = std::function & blob_path, WriteMode mode, const std::optional & object_attributes)>; - - /// Write a file using a custom function to write a blob representing the file. - /// This method is alternative to writeFile(), the difference is that writeFile() calls IObjectStorage::writeObject() - /// to write an object to the object storage while this method allows to specify a callback for that. 
- virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; - /// Remove file. Throws exception if file doesn't exists or it's a directory. /// Return whether file was finally removed. (For remote disks it is not always removed). virtual void removeFile(const String & path) = 0; @@ -251,6 +238,20 @@ public: /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 virtual void removeSharedFileIfExists(const String & path, bool /* keep_shared_data */) { removeFileIfExists(path); } + /// Returns the path to a blob representing a specified file. + /// The meaning of the returned path depends on disk's type. + /// E.g. for DiskLocal it the absolute path to the file and for DiskObjectStorage it's the name of the objects' namespace + /// combined with StoredObject::absolute_path for each stored object representing a specified file. + virtual Strings getBlobPath(const String & path) const = 0; + + using WriteBlobFunction = std::function & object_attributes)>; + + /// Write a file using a custom function to write a blob representing the file. + /// This method is alternative to writeFile(), the difference is that for example for DiskObjectStorage + /// writeFile() calls IObjectStorage::writeObject() to write an object to the object storage while + /// this method allows to specify a callback for that. + virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; + /// Reads a file from an encrypted disk without decrypting it. virtual std::unique_ptr readEncryptedFile( const String & path, const ReadSettings & settings = ReadSettings{}) const; diff --git a/src/Disks/IDiskTransaction.h b/src/Disks/IDiskTransaction.h index 376d7bd78e6..f0c32e04f48 100644 --- a/src/Disks/IDiskTransaction.h +++ b/src/Disks/IDiskTransaction.h @@ -68,7 +68,7 @@ public: const WriteSettings & settings = {}, bool autocommit = true) = 0; - using WriteBlobFunction = std::function & blob_path, WriteMode mode, const std::optional & object_attributes)>; + using WriteBlobFunction = std::function & object_attributes)>; /// Write a file using a custom function to write an object to the disk's object storage. 
virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index b01f1b327e7..c080c2cd92d 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -579,12 +579,15 @@ std::unique_ptr DiskObjectStorage::writeFile( return result; } -std::optional> DiskObjectStorage::getBlobPath(const String & path) const +Strings DiskObjectStorage::getBlobPath(const String & path) const { auto objects = getStorageObjects(path); - if (objects.size() != 1) - return {}; - return std::make_pair(object_storage->getObjectsNamespace(), objects[0].absolute_path); + Strings res; + res.reserve(objects.size() + 1); + res.emplace_back(object_storage->getObjectsNamespace()); + for (const auto & object : objects) + res.emplace_back(object.absolute_path); + return res; } void DiskObjectStorage::writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index 97751edc3f5..b7dfaf67cf2 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -149,7 +149,7 @@ public: WriteMode mode, const WriteSettings & settings) override; - std::optional> getBlobPath(const String & path) const override; + Strings getBlobPath(const String & path) const override; void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override; void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index f578bcb9772..f98ac55889b 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -691,7 +691,11 @@ void DiskObjectStorageTransaction::writeFileUsingBlobWritingFunction( operations_to_execute.emplace_back(std::move(write_operation)); - auto blob_path = std::make_pair(object_storage.getObjectsNamespace(), object.absolute_path); + /// See DiskObjectStorage::getBlobPath(). 
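// For orientation (not part of the patch): after this change getBlobPath() returns a Strings whose
// layout depends on the disk. DiskLocal returns a single element, the absolute filesystem path;
// DiskObjectStorage returns the objects' namespace followed by one entry per stored object. That is
// why the backup code above checks blob_path.size() == 1 before using std::filesystem::copy() and
// blob_path.size() == 2 (namespace plus exactly one object) before using a server-side S3 copy.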
+ Strings blob_path; + blob_path.reserve(2); + blob_path.emplace_back(object_storage.getObjectsNamespace()); + blob_path.emplace_back(object.absolute_path); /// We always use mode Rewrite because we simulate append using metadata and different files size_t object_size = std::move(write_blob_function)(blob_path, WriteMode::Rewrite, object_attributes); diff --git a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp deleted file mode 100644 index e43d88b2519..00000000000 --- a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.cpp +++ /dev/null @@ -1,117 +0,0 @@ -#include - -#if USE_AWS_S3 - -#include -#include -#include -#include -#include - - -namespace DB -{ - -void copyS3FileToDisk( - const std::shared_ptr & src_s3_client, - const String & src_bucket, - const String & src_key, - const std::optional & version_id, - std::optional src_offset, - std::optional src_size, - DiskPtr destination_disk, - const String & destination_path, - WriteMode write_mode, - const ReadSettings & read_settings, - const WriteSettings & write_settings, - const S3Settings::RequestSettings & request_settings, - ThreadPoolCallbackRunner scheduler) -{ - if (!src_offset) - src_offset = 0; - - if (!src_size) - src_size = S3::getObjectSize(*src_s3_client, src_bucket, src_key, version_id.value_or(""), request_settings) - *src_offset; - - auto destination_data_source_description = destination_disk->getDataSourceDescription(); - if (destination_data_source_description == DataSourceDescription{DataSourceType::S3, src_s3_client->getInitialEndpoint(), false, false}) - { - /// Use native copy, the more optimal way. - LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} using native copy", src_key, destination_disk->getName()); - auto write_blob_function = [&](const std::pair & blob_path_, WriteMode write_mode_, const std::optional & object_attributes_) -> size_t - { - /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. - chassert(write_mode_ == WriteMode::Rewrite); - - copyS3File( - src_s3_client, - src_bucket, - src_key, - *src_offset, - *src_size, - /* dest_bucket= */ blob_path_.first, - /* dest_key= */ blob_path_.second, - request_settings, - object_attributes_, - scheduler, - /* for_disk_s3= */ true); - - return *src_size; - }; - - destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); - return; - } - - /// Fallback to copy through buffers. 
- LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} through buffers", src_key, destination_disk->getName()); - ReadBufferFromS3 read_buffer{src_s3_client, src_bucket, src_key, {}, request_settings, read_settings}; - if (*src_offset) - read_buffer.seek(*src_offset, SEEK_SET); - auto write_buffer = destination_disk->writeFile(destination_path, std::min(*src_size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings); - copyData(read_buffer, *write_buffer, *src_size); - write_buffer->finalize(); -} - -void copyS3FileFromDisk( - DiskPtr src_disk, - const String & src_path, - std::optional src_offset, - std::optional src_size, - const std::shared_ptr & dest_s3_client, - const String & dest_bucket, - const String & dest_key, - const ReadSettings & read_settings, - const S3Settings::RequestSettings & request_settings, - ThreadPoolCallbackRunner scheduler) -{ - if (!src_offset) - src_offset = 0; - - if (!src_size) - src_size = src_disk->getFileSize(src_path) - *src_offset; - - auto source_data_source_description = src_disk->getDataSourceDescription(); - if (source_data_source_description == DataSourceDescription{DataSourceType::S3, dest_s3_client->getInitialEndpoint(), false, false}) - { - /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket. - /// In this case we can't use native copy. - if (auto blob_path = src_disk->getBlobPath(src_path)) - { - /// Use native copy, the more optimal way. - LOG_TRACE(&Poco::Logger::get("copyS3FileFromDisk"), "Copying file {} to S3 using native copy", src_path); - const auto & [src_bucket, src_key] = *blob_path; - copyS3File(dest_s3_client, src_bucket, src_key, *src_offset, *src_size, dest_bucket, dest_key, request_settings, {}, scheduler); - return; - } - } - - /// Fallback to copy through buffers. - LOG_TRACE(&Poco::Logger::get("copyS3FileFromDisk"), "Copying {} to S3 through buffers", src_path); - auto create_read_buffer = [src_disk, &src_path, &read_settings] { return src_disk->readFile(src_path, read_settings); }; - copyDataToS3File(create_read_buffer, *src_offset, *src_size, dest_s3_client, dest_bucket, dest_key, request_settings, {}, scheduler); -} - -} - -#endif diff --git a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h b/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h deleted file mode 100644 index 78caf2f50c8..00000000000 --- a/src/Disks/ObjectStorages/S3/copyS3FileToDisk.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include "config.h" - -#if USE_AWS_S3 - -#include -#include -#include - - -namespace DB -{ - -/// Copies an object from S3 bucket to a disk of any type. -/// Depending on the disk the function can either do copying through buffers -/// (i.e. download the object by portions and then write those portions to the specified disk), -/// or perform a server-side copy. -void copyS3FileToDisk( - const std::shared_ptr & src_s3_client, - const String & src_bucket, - const String & src_key, - const std::optional & version_id, - std::optional src_offset, - std::optional src_size, - DiskPtr destination_disk, - const String & destination_path, - WriteMode write_mode = WriteMode::Rewrite, - const ReadSettings & read_settings = {}, - const WriteSettings & write_settings = {}, - const S3Settings::RequestSettings & request_settings = {}, - ThreadPoolCallbackRunner scheduler = {}); - -/// Copies an object from a disk of any type to S3 bucket. -/// Depending on the disk the function can either do copying through buffers -/// (i.e. 
read the object by portions and then upload those portions to the specified disk), -/// or perform a server-side copy. -void copyS3FileFromDisk( - DiskPtr src_disk, - const String & src_path, - std::optional src_offset, - std::optional src_size, - const std::shared_ptr & dest_s3_client, - const String & dest_bucket, - const String & dest_key, - const ReadSettings & read_settings = {}, - const S3Settings::RequestSettings & request_settings = {}, - ThreadPoolCallbackRunner scheduler = {}); - -} - -#endif diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 19887d6695e..31f499a7d96 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 37d9e3bc32c..68c888a2d23 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index ddb55c119c4..5b22db91631 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -31,6 +31,7 @@ #include #include +#include #include #include #include From 002fd19cb7833f1babc6345329d293ba6445155a Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 28 Apr 2023 13:15:29 +0200 Subject: [PATCH 077/127] Move the common part of BackupIO_* to BackupIO_Default. --- src/Backups/BackupIO.cpp | 61 ------------ src/Backups/BackupIO.h | 39 ++++---- src/Backups/BackupIO_Default.cpp | 95 +++++++++++++++++++ src/Backups/BackupIO_Default.h | 73 ++++++++++++++ src/Backups/BackupIO_Disk.cpp | 70 ++++++-------- src/Backups/BackupIO_Disk.h | 25 ++--- src/Backups/BackupIO_File.cpp | 84 +++++++--------- src/Backups/BackupIO_File.h | 25 ++--- src/Backups/BackupIO_S3.cpp | 40 +++----- src/Backups/BackupIO_S3.h | 13 ++- src/Backups/BackupImpl.cpp | 20 ++-- src/Backups/BackupImpl.h | 6 +- src/Backups/IBackup.h | 4 +- .../registerBackupEnginesFileAndDisk.cpp | 4 +- src/Disks/DiskEncrypted.h | 6 +- src/Disks/IDisk.cpp | 2 +- src/Disks/IDisk.h | 5 +- .../ObjectStorages/DiskObjectStorage.cpp | 4 +- 18 files changed, 324 insertions(+), 252 deletions(-) delete mode 100644 src/Backups/BackupIO.cpp create mode 100644 src/Backups/BackupIO_Default.cpp create mode 100644 src/Backups/BackupIO_Default.h diff --git a/src/Backups/BackupIO.cpp b/src/Backups/BackupIO.cpp deleted file mode 100644 index 4d890ed7419..00000000000 --- a/src/Backups/BackupIO.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include - -#include -#include -#include -#include -#include - - -namespace DB -{ - -IBackupReader::IBackupReader(Poco::Logger * log_) : log(log_) -{ -} - -void IBackupReader::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) -{ - LOG_TRACE(log, "Copying file {} to disk {} through buffers", path_in_backup, destination_disk->getName()); - auto read_buffer = readFile(path_in_backup); - auto buf_size = std::min(file_size, DBMS_DEFAULT_BUFFER_SIZE); - std::unique_ptr write_buffer; - if (encrypted_in_backup) - write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, write_settings); - else - write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, write_settings); - copyData(*read_buffer, 
*write_buffer, file_size); - write_buffer->finalize(); -} - -IBackupWriter::IBackupWriter(const ContextPtr & context_, Poco::Logger * log_) - : log(log_), read_settings(context_->getBackupReadSettings()) -{ -} - -void IBackupWriter::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) -{ - auto read_buffer = create_read_buffer(); - if (start_pos) - read_buffer->seek(start_pos, SEEK_SET); - auto write_buffer = writeFile(path_in_backup); - copyData(*read_buffer, *write_buffer, length); - write_buffer->finalize(); -} - -void IBackupWriter::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, - bool copy_encrypted, UInt64 start_pos, UInt64 length) -{ - LOG_TRACE(log, "Copying file {} from disk {} through buffers", src_path, src_disk->getName()); - auto create_read_buffer = [this, src_disk, src_path, copy_encrypted] - { - if (copy_encrypted) - return src_disk->readEncryptedFile(src_path, read_settings); - else - return src_disk->readFile(src_path, read_settings); - }; - copyDataToFile(path_in_backup, create_read_buffer, start_pos, length); -} - -} diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index f65f0871698..d522387deba 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -1,21 +1,23 @@ #pragma once #include -#include -#include -#include -#include + namespace DB { +class IDisk; +using DiskPtr = std::shared_ptr; class SeekableReadBuffer; class WriteBuffer; +enum class WriteMode; +struct WriteSettings; +struct ReadSettings; /// Represents operations of loading from disk or downloading for reading a backup. -class IBackupReader /// BackupReaderFile, BackupReaderDisk +/// See also implementations: BackupReaderFile, BackupReaderDisk. +class IBackupReader { public: - explicit IBackupReader(Poco::Logger * log_); virtual ~IBackupReader() = default; virtual bool fileExists(const String & file_name) = 0; @@ -28,17 +30,18 @@ public: /// Parameters: /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. virtual void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings); - -protected: - Poco::Logger * const log; + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) = 0; + + virtual const ReadSettings & getReadSettings() const = 0; + virtual const WriteSettings & getWriteSettings() const = 0; + virtual size_t getWriteBufferSize() const = 0; }; /// Represents operations of storing to disk or uploading for writing a backup. 
-class IBackupWriter /// BackupWriterFile, BackupWriterDisk +/// See also implementations: BackupWriterFile, BackupWriterDisk +class IBackupWriter { public: - IBackupWriter(const ContextPtr & context_, Poco::Logger * log_); virtual ~IBackupWriter() = default; virtual bool fileExists(const String & file_name) = 0; @@ -48,7 +51,7 @@ public: virtual std::unique_ptr writeFile(const String & file_name) = 0; using CreateReadBufferFunction = std::function()>; - virtual void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length); + virtual void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) = 0; /// The function copyFileFromDisk() can be much faster than copyDataToFile() /// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). @@ -56,16 +59,14 @@ public: /// `start_pos` and `length` specify a part of the file on `src_disk` to copy to the backup. /// `copy_encrypted` specify whether this function should copy encrypted data of the file `src_path` to the backup. virtual void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, - bool copy_encrypted, UInt64 start_pos, UInt64 length); + bool copy_encrypted, UInt64 start_pos, UInt64 length) = 0; virtual void removeFile(const String & file_name) = 0; virtual void removeFiles(const Strings & file_names) = 0; -protected: - Poco::Logger * const log; - - /// These read settings are used to read from the source disk in copyFileFromDisk(). - const ReadSettings read_settings; + virtual const ReadSettings & getReadSettings() const = 0; + virtual const WriteSettings & getWriteSettings() const = 0; + virtual size_t getWriteBufferSize() const = 0; }; } diff --git a/src/Backups/BackupIO_Default.cpp b/src/Backups/BackupIO_Default.cpp new file mode 100644 index 00000000000..3b4851e9441 --- /dev/null +++ b/src/Backups/BackupIO_Default.cpp @@ -0,0 +1,95 @@ +#include + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +BackupReaderDefault::BackupReaderDefault(Poco::Logger * log_, const ContextPtr & context_) + : log(log_) + , read_settings(context_->getBackupReadSettings()) + , write_settings(context_->getWriteSettings()) + , write_buffer_size(DBMS_DEFAULT_BUFFER_SIZE) +{ +} + +void BackupReaderDefault::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) +{ + LOG_TRACE(log, "Copying file {} to disk {} through buffers", path_in_backup, destination_disk->getName()); + + auto read_buffer = readFile(path_in_backup); + + std::unique_ptr write_buffer; + auto buf_size = std::min(file_size, write_buffer_size); + if (encrypted_in_backup) + write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, write_settings); + else + write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, write_settings); + + copyData(*read_buffer, *write_buffer, file_size); + write_buffer->finalize(); +} + +BackupWriterDefault::BackupWriterDefault(Poco::Logger * log_, const ContextPtr & context_) + : log(log_) + , read_settings(context_->getBackupReadSettings()) + , write_settings(context_->getWriteSettings()) + , write_buffer_size(DBMS_DEFAULT_BUFFER_SIZE) +{ +} + +bool BackupWriterDefault::fileContentsEqual(const String & file_name, 
const String & expected_file_contents) +{ + if (!fileExists(file_name)) + return false; + + try + { + auto in = readFile(file_name, expected_file_contents.size()); + String actual_file_contents(expected_file_contents.size(), ' '); + return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) + && (actual_file_contents == expected_file_contents) && in->eof(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + return false; + } +} + +void BackupWriterDefault::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) +{ + auto read_buffer = create_read_buffer(); + + if (start_pos) + read_buffer->seek(start_pos, SEEK_SET); + + auto write_buffer = writeFile(path_in_backup); + + copyData(*read_buffer, *write_buffer, length); + write_buffer->finalize(); +} + +void BackupWriterDefault::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) +{ + LOG_TRACE(log, "Copying file {} from disk {} through buffers", src_path, src_disk->getName()); + + auto create_read_buffer = [this, src_disk, src_path, file_size = start_pos + length, copy_encrypted] + { + if (copy_encrypted) + return src_disk->readEncryptedFile(src_path, read_settings, {}, file_size); + else + return src_disk->readFile(src_path, read_settings, {}, file_size); + }; + + copyDataToFile(path_in_backup, create_read_buffer, start_pos, length); +} +} diff --git a/src/Backups/BackupIO_Default.h b/src/Backups/BackupIO_Default.h new file mode 100644 index 00000000000..0fc510f9361 --- /dev/null +++ b/src/Backups/BackupIO_Default.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ +class IDisk; +using DiskPtr = std::shared_ptr; +class ReadBuffer; +class SeekableReadBuffer; +class WriteBuffer; +enum class WriteMode; + +/// Represents operations of loading from disk or downloading for reading a backup. +class BackupReaderDefault : public IBackupReader +{ +public: + BackupReaderDefault(Poco::Logger * log_, const ContextPtr & context_); + ~BackupReaderDefault() override = default; + + /// The function copyFileToDisk() can be much faster than reading the file with readFile() and then writing it to some disk. + /// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). + /// Parameters: + /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; + + const ReadSettings & getReadSettings() const override { return read_settings; } + const WriteSettings & getWriteSettings() const override { return write_settings; } + size_t getWriteBufferSize() const override { return write_buffer_size; } + +protected: + Poco::Logger * const log; + const ReadSettings read_settings; + + /// The write settings are used to write to the source disk in copyFileToDisk(). + const WriteSettings write_settings; + const size_t write_buffer_size; +}; + +/// Represents operations of storing to disk or uploading for writing a backup. 
+class BackupWriterDefault : public IBackupWriter +{ +public: + BackupWriterDefault(Poco::Logger * log_, const ContextPtr & context_); + ~BackupWriterDefault() override = default; + + bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; + void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + + const ReadSettings & getReadSettings() const override { return read_settings; } + const WriteSettings & getWriteSettings() const override { return write_settings; } + size_t getWriteBufferSize() const override { return write_buffer_size; } + +protected: + /// Here readFile() is used only to implement fileContentsEqual(). + virtual std::unique_ptr readFile(const String & file_name, size_t expected_file_size) = 0; + + Poco::Logger * const log; + + /// The read settings are used to read from the source disk in copyFileFromDisk(). + const ReadSettings read_settings; + + const WriteSettings write_settings; + const size_t write_buffer_size; +}; + +} diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index d24d90ae7ae..f64e929131c 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -3,7 +3,6 @@ #include #include #include -#include namespace DB @@ -14,8 +13,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & path_) - : IBackupReader(&Poco::Logger::get("BackupReaderDisk")), disk(disk_), path(path_) +BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & root_path_, const ContextPtr & context_) + : BackupReaderDefault(&Poco::Logger::get("BackupReaderDisk"), context_) + , disk(disk_) + , root_path(root_path_) { } @@ -23,40 +24,39 @@ BackupReaderDisk::~BackupReaderDisk() = default; bool BackupReaderDisk::fileExists(const String & file_name) { - return disk->exists(path / file_name); + return disk->exists(root_path / file_name); } UInt64 BackupReaderDisk::getFileSize(const String & file_name) { - return disk->getFileSize(path / file_name); + return disk->getFileSize(root_path / file_name); } std::unique_ptr BackupReaderDisk::readFile(const String & file_name) { - return disk->readFile(path / file_name); + return disk->readFile(root_path / file_name, read_settings); } void BackupReaderDisk::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { if ((write_mode == WriteMode::Rewrite) && !encrypted_in_backup) { /// Use more optimal way. LOG_TRACE(log, "Copying file {} from disk {} to disk {}", path_in_backup, disk->getName(), destination_disk->getName()); - disk->copyFile(path / path_in_backup, *destination_disk, destination_path, write_settings); + disk->copyFile(root_path / path_in_backup, *destination_disk, destination_path, write_settings); return; } /// Fallback to copy through buffers. 
- IBackupReader::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode, write_settings); + BackupReaderDefault::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode); } -BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_) - : IBackupWriter(context_, &Poco::Logger::get("BackupWriterDisk")) +BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & root_path_, const ContextPtr & context_) + : BackupWriterDefault(&Poco::Logger::get("BackupWriterDisk"), context_) , disk(disk_) - , path(path_) - , has_throttling(static_cast(context_->getBackupsThrottler())) + , root_path(root_path_) { } @@ -64,53 +64,39 @@ BackupWriterDisk::~BackupWriterDisk() = default; bool BackupWriterDisk::fileExists(const String & file_name) { - return disk->exists(path / file_name); + return disk->exists(root_path / file_name); } UInt64 BackupWriterDisk::getFileSize(const String & file_name) { - return disk->getFileSize(path / file_name); + return disk->getFileSize(root_path / file_name); } -bool BackupWriterDisk::fileContentsEqual(const String & file_name, const String & expected_file_contents) +std::unique_ptr BackupWriterDisk::readFile(const String & file_name, size_t expected_file_size) { - if (!disk->exists(path / file_name)) - return false; - - try - { - auto in = disk->readFile(path / file_name); - String actual_file_contents(expected_file_contents.size(), ' '); - return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) - && (actual_file_contents == expected_file_contents) && in->eof(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - return false; - } + return disk->readFile(file_name, read_settings, {}, expected_file_size); } std::unique_ptr BackupWriterDisk::writeFile(const String & file_name) { - auto file_path = path / file_name; + auto file_path = root_path / file_name; disk->createDirectories(file_path.parent_path()); - return disk->writeFile(file_path); + return disk->writeFile(file_path, write_buffer_size, WriteMode::Rewrite, write_settings); } void BackupWriterDisk::removeFile(const String & file_name) { - disk->removeFileIfExists(path / file_name); - if (disk->isDirectory(path) && disk->isDirectoryEmpty(path)) - disk->removeDirectory(path); + disk->removeFileIfExists(root_path / file_name); + if (disk->isDirectory(root_path) && disk->isDirectoryEmpty(root_path)) + disk->removeDirectory(root_path); } void BackupWriterDisk::removeFiles(const Strings & file_names) { for (const auto & file_name : file_names) - disk->removeFileIfExists(path / file_name); - if (disk->isDirectory(path) && disk->isDirectoryEmpty(path)) - disk->removeDirectory(path); + disk->removeFileIfExists(root_path / file_name); + if (disk->isDirectory(root_path) && disk->isDirectoryEmpty(root_path)) + disk->removeDirectory(root_path); } void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -120,14 +106,14 @@ void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr s { /// Use more optimal way. 
LOG_TRACE(log, "Copying file {} from disk {} to disk {}", src_path, src_disk->getName(), disk->getName()); - auto dest_file_path = path / path_in_backup; + auto dest_file_path = root_path / path_in_backup; disk->createDirectories(dest_file_path.parent_path()); - src_disk->copyFile(src_path, *disk, dest_file_path); + src_disk->copyFile(src_path, *disk, dest_file_path, write_settings); return; } /// Fallback to copy through buffers. - IBackupWriter::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); + BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } } diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index d2af06668eb..399ebeaa227 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -1,40 +1,42 @@ #pragma once +#include #include -#include -#include + namespace DB { class IDisk; using DiskPtr = std::shared_ptr; -class BackupReaderDisk : public IBackupReader +class BackupReaderDisk : public BackupReaderDefault { public: - BackupReaderDisk(const DiskPtr & disk_, const String & path_); + BackupReaderDisk(const DiskPtr & disk_, const String & root_path_, const ContextPtr & context_); ~BackupReaderDisk() override; bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) override; + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; private: const DiskPtr disk; - const std::filesystem::path path; + const std::filesystem::path root_path; }; -class BackupWriterDisk : public IBackupWriter +class BackupWriterDisk : public BackupWriterDefault { public: - BackupWriterDisk(const DiskPtr & disk_, const String & path_, const ContextPtr & context_); + BackupWriterDisk(const DiskPtr & disk_, const String & root_path_, const ContextPtr & context_); ~BackupWriterDisk() override; bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; - bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; + std::unique_ptr writeFile(const String & file_name) override; void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -44,9 +46,10 @@ public: void removeFiles(const Strings & file_names) override; private: + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + const DiskPtr disk; - const std::filesystem::path path; - const bool has_throttling; + const std::filesystem::path root_path; }; } diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 69d899528cd..69265799793 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -1,12 +1,11 @@ #include #include -#include +//#include #include #include -#include -#include +//#include +//#include #include -#include namespace fs = std::filesystem; @@ -15,32 +14,30 @@ namespace fs = std::filesystem; namespace DB { -BackupReaderFile::BackupReaderFile(const String & path_) - : IBackupReader(&Poco::Logger::get("BackupReaderFile")) - , path(path_) - , data_source_description(DiskLocal::getLocalDataSourceDescription(path_)) 
+BackupReaderFile::BackupReaderFile(const String & root_path_, const ContextPtr & context_) + : BackupReaderDefault(&Poco::Logger::get("BackupReaderFile"), context_) + , root_path(root_path_) + , data_source_description(DiskLocal::getLocalDataSourceDescription(root_path)) { } -BackupReaderFile::~BackupReaderFile() = default; - bool BackupReaderFile::fileExists(const String & file_name) { - return fs::exists(path / file_name); + return fs::exists(root_path / file_name); } UInt64 BackupReaderFile::getFileSize(const String & file_name) { - return fs::file_size(path / file_name); + return fs::file_size(root_path / file_name); } std::unique_ptr BackupReaderFile::readFile(const String & file_name) { - return createReadBufferFromFileBase(path / file_name, {}); + return createReadBufferFromFileBase(root_path / file_name, read_settings); } void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { if (write_mode == WriteMode::Rewrite) { @@ -52,7 +49,7 @@ void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file LOG_TRACE(log, "Copying file {} to disk {} locally", path_in_backup, destination_disk->getName()); auto write_blob_function - = [abs_source_path = path / path_in_backup, file_size]( + = [abs_source_path = root_path / path_in_backup, file_size]( const Strings & blob_path, WriteMode mode, const std::optional &) -> size_t { if (blob_path.size() != 1 || mode != WriteMode::Rewrite) @@ -69,69 +66,53 @@ void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file } /// Fallback to copy through buffers. 
- IBackupReader::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode, write_settings); + BackupReaderDefault::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode); } -BackupWriterFile::BackupWriterFile(const String & path_, const ContextPtr & context_) - : IBackupWriter(context_, &Poco::Logger::get("BackupWriterFile")) - , path(path_) - , data_source_description(DiskLocal::getLocalDataSourceDescription(path_)) - , has_throttling(static_cast(context_->getBackupsThrottler())) +BackupWriterFile::BackupWriterFile(const String & root_path_, const ContextPtr & context_) + : BackupWriterDefault(&Poco::Logger::get("BackupWriterFile"), context_) + , root_path(root_path_) + , data_source_description(DiskLocal::getLocalDataSourceDescription(root_path)) + , has_throttling(static_cast(read_settings.local_throttler)) { } -BackupWriterFile::~BackupWriterFile() = default; - bool BackupWriterFile::fileExists(const String & file_name) { - return fs::exists(path / file_name); + return fs::exists(root_path / file_name); } UInt64 BackupWriterFile::getFileSize(const String & file_name) { - return fs::file_size(path / file_name); + return fs::file_size(root_path / file_name); } -bool BackupWriterFile::fileContentsEqual(const String & file_name, const String & expected_file_contents) +std::unique_ptr BackupWriterFile::readFile(const String & file_name, size_t expected_file_size) { - if (!fs::exists(path / file_name)) - return false; - - try - { - auto in = createReadBufferFromFileBase(path / file_name, {}); - String actual_file_contents(expected_file_contents.size(), ' '); - return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) - && (actual_file_contents == expected_file_contents) && in->eof(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - return false; - } + return createReadBufferFromFileBase(root_path / file_name, read_settings, {}, expected_file_size); } std::unique_ptr BackupWriterFile::writeFile(const String & file_name) { - auto file_path = path / file_name; + auto file_path = root_path / file_name; fs::create_directories(file_path.parent_path()); - return std::make_unique(file_path); + return std::make_unique(file_path, write_buffer_size, -1, write_settings.local_throttler); } void BackupWriterFile::removeFile(const String & file_name) { - fs::remove(path / file_name); - if (fs::is_directory(path) && fs::is_empty(path)) - fs::remove(path); + fs::remove(root_path / file_name); + if (fs::is_directory(root_path) && fs::is_empty(root_path)) + fs::remove(root_path); } void BackupWriterFile::removeFiles(const Strings & file_names) { for (const auto & file_name : file_names) - fs::remove(path / file_name); - if (fs::is_directory(path) && fs::is_empty(path)) - fs::remove(path); + fs::remove(root_path / file_name); + if (fs::is_directory(root_path) && fs::is_empty(root_path)) + fs::remove(root_path); } void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -144,6 +125,7 @@ void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr s if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) { + /// std::filesystem::copy() can copy from a single file only. 
if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 1) { auto abs_source_path = blob_path[0]; @@ -153,7 +135,7 @@ void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr s { /// Use more optimal way. LOG_TRACE(log, "Copying file {} from disk {} locally", src_path, src_disk->getName()); - auto abs_dest_path = path / path_in_backup; + auto abs_dest_path = root_path / path_in_backup; fs::create_directories(abs_dest_path.parent_path()); fs::copy(abs_source_path, abs_dest_path, fs::copy_options::overwrite_existing); return; @@ -163,7 +145,7 @@ void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr s } /// Fallback to copy through buffers. - IBackupWriter::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); + BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } } diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index a8ada969ca4..45fc0d47115 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -1,39 +1,38 @@ #pragma once +#include +#include #include -#include -#include + namespace DB { -class BackupReaderFile : public IBackupReader +class BackupReaderFile : public BackupReaderDefault { public: - explicit BackupReaderFile(const String & path_); - ~BackupReaderFile() override; + explicit BackupReaderFile(const String & root_path_, const ContextPtr & context_); bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) override; + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; private: - const std::filesystem::path path; + const std::filesystem::path root_path; const DataSourceDescription data_source_description; }; -class BackupWriterFile : public IBackupWriter +class BackupWriterFile : public BackupWriterDefault { public: - BackupWriterFile(const String & path_, const ContextPtr & context_); - ~BackupWriterFile() override; + BackupWriterFile(const String & root_path_, const ContextPtr & context_); bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; - bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -43,7 +42,9 @@ public: void removeFiles(const Strings & file_names) override; private: - const std::filesystem::path path; + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + + const std::filesystem::path root_path; const DataSourceDescription data_source_description; const bool has_throttling; }; diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 00694a5e01d..8d9d34bf9b5 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -101,10 +102,9 @@ namespace BackupReaderS3::BackupReaderS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) - : 
IBackupReader(&Poco::Logger::get("BackupReaderS3")) + : BackupReaderDefault(&Poco::Logger::get("BackupReaderS3"), context_) , s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) - , read_settings(context_->getReadSettings()) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) , data_source_description{DataSourceType::S3, s3_uri.endpoint, false, false} { @@ -133,7 +133,7 @@ std::unique_ptr BackupReaderS3::readFile(const String & file } void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { auto destination_data_source_description = destination_disk->getDataSourceDescription(); if (destination_data_source_description.sameKind(data_source_description) @@ -170,13 +170,13 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s } /// Fallback to copy through buffers. - IBackupReader::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode, write_settings); + BackupReaderDefault::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode); } BackupWriterS3::BackupWriterS3( const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_) - : IBackupWriter(context_, &Poco::Logger::get("BackupWriterS3")) + : BackupWriterDefault(&Poco::Logger::get("BackupWriterS3"), context_) , s3_uri(s3_uri_) , client(makeS3Client(s3_uri_, access_key_id_, secret_access_key_, context_)) , request_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()).request_settings) @@ -193,8 +193,8 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) { - /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket. - /// In this case we can't use native copy. + /// getBlobPath() can return more than 2 elements if the file is stored as multiple objects in S3 bucket. + /// In this case we can't use the native copy. if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { /// Use native copy, the more optimal way. @@ -215,7 +215,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src } /// Fallback to copy through buffers. 
- IBackupWriter::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); + BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } void BackupWriterS3::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) @@ -239,24 +239,11 @@ UInt64 BackupWriterS3::getFileSize(const String & file_name) return objects[0].GetSize(); } -bool BackupWriterS3::fileContentsEqual(const String & file_name, const String & expected_file_contents) +std::unique_ptr BackupWriterS3::readFile(const String & file_name, size_t expected_file_size) { - if (listObjects(*client, s3_uri, file_name).empty()) - return false; - - try - { - auto in = std::make_unique( - client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings); - String actual_file_contents(expected_file_contents.size(), ' '); - return (in->read(actual_file_contents.data(), actual_file_contents.size()) == actual_file_contents.size()) - && (actual_file_contents == expected_file_contents) && in->eof(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - return false; - } + return std::make_unique( + client, s3_uri.bucket, fs::path(s3_uri.key) / file_name, s3_uri.version_id, request_settings, read_settings, + false, 0, 0, false, expected_file_size); } std::unique_ptr BackupWriterS3::writeFile(const String & file_name) @@ -267,7 +254,8 @@ std::unique_ptr BackupWriterS3::writeFile(const String & file_name) fs::path(s3_uri.key) / file_name, request_settings, std::nullopt, - threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); + threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3"), + write_settings); } void BackupWriterS3::removeFile(const String & file_name) diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index 1db9d5f8c4c..cca56bae6bc 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -3,8 +3,8 @@ #include "config.h" #if USE_AWS_S3 -#include -#include +#include +#include #include #include #include @@ -14,7 +14,7 @@ namespace DB { /// Represents a backup stored to AWS S3. 
-class BackupReaderS3 : public IBackupReader +class BackupReaderS3 : public BackupReaderDefault { public: BackupReaderS3(const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_); @@ -25,18 +25,17 @@ public: std::unique_ptr readFile(const String & file_name) override; void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode, const WriteSettings & write_settings) override; + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; private: const S3::URI s3_uri; const std::shared_ptr client; - const ReadSettings read_settings; S3Settings::RequestSettings request_settings; const DataSourceDescription data_source_description; }; -class BackupWriterS3 : public IBackupWriter +class BackupWriterS3 : public BackupWriterDefault { public: BackupWriterS3(const S3::URI & s3_uri_, const String & access_key_id_, const String & secret_access_key_, const ContextPtr & context_); @@ -44,7 +43,6 @@ public: bool fileExists(const String & file_name) override; UInt64 getFileSize(const String & file_name) override; - bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; @@ -55,6 +53,7 @@ public: void removeFiles(const Strings & file_names) override; private: + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; void removeFilesBatch(const Strings & file_names); const S3::URI s3_uri; diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 9bfa2d77353..715fe9e0d13 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -737,14 +737,14 @@ std::unique_ptr BackupImpl::readFileImpl(const SizeAndChecks } } -size_t BackupImpl::copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) const +size_t BackupImpl::copyFileToDisk(const String & file_name, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const { - return copyFileToDisk(getFileSizeAndChecksum(file_name), destination_disk, destination_path, write_mode, write_settings); + return copyFileToDisk(getFileSizeAndChecksum(file_name), destination_disk, destination_path, write_mode); } -size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) const +size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const { if (open_mode != OpenMode::READ) throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for reading"); @@ -790,13 +790,13 @@ size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, Dis if (info.size && !info.base_size && !use_archive) { /// Data comes completely from this backup. 
- reader->copyFileToDisk(info.data_file_name, info.size, info.encrypted_by_disk, destination_disk, destination_path, write_mode, write_settings); + reader->copyFileToDisk(info.data_file_name, info.size, info.encrypted_by_disk, destination_disk, destination_path, write_mode); file_copied = true; } else if (info.size && (info.size == info.base_size)) { /// Data comes completely from the base backup (nothing comes from this backup). - base_backup->copyFileToDisk(std::pair{info.base_size, info.base_checksum}, destination_disk, destination_path, write_mode, write_settings); + base_backup->copyFileToDisk(std::pair{info.base_size, info.base_checksum}, destination_disk, destination_path, write_mode); file_copied = true; } @@ -812,11 +812,11 @@ size_t BackupImpl::copyFileToDisk(const SizeAndChecksum & size_and_checksum, Dis /// Use the generic way to copy data. `readFile()` will update `num_read_files`. auto read_buffer = readFileImpl(size_and_checksum, /* read_encrypted= */ info.encrypted_by_disk); std::unique_ptr write_buffer; - size_t buf_size = std::min(info.size, DBMS_DEFAULT_BUFFER_SIZE); + size_t buf_size = std::min(info.size, reader->getWriteBufferSize()); if (info.encrypted_by_disk) - write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, write_settings); + write_buffer = destination_disk->writeEncryptedFile(destination_path, buf_size, write_mode, reader->getWriteSettings()); else - write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, write_settings); + write_buffer = destination_disk->writeFile(destination_path, buf_size, write_mode, reader->getWriteSettings()); copyData(*read_buffer, *write_buffer, info.size); write_buffer->finalize(); } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 511b100c557..7e95d156162 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -76,10 +76,8 @@ public: SizeAndChecksum getFileSizeAndChecksum(const String & file_name) const override; std::unique_ptr readFile(const String & file_name) const override; std::unique_ptr readFile(const SizeAndChecksum & size_and_checksum) const override; - size_t copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) const override; - size_t copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode, const WriteSettings & write_settings) const override; + size_t copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const override; + size_t copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) const override; void writeFile(const BackupFileInfo & info, BackupEntryPtr entry) override; void finalizeWriting() override; bool supportsWritingInMultipleThreads() const override { return !use_archive; } diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h index 031130fa3b4..660f7d5da22 100644 --- a/src/Backups/IBackup.h +++ b/src/Backups/IBackup.h @@ -109,10 +109,10 @@ public: /// Copies a file from the backup to a specified destination disk. Returns the number of bytes written. 
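    /// A minimal usage sketch, for illustration only (the `backup` object, destination disk and paths
    /// below are hypothetical; with the default WriteMode::Rewrite the destination file is overwritten):
    ///     size_t bytes_written = backup->copyFileToDisk("metadata/default/t.sql", destination_disk, "t.sql");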
virtual size_t copyFileToDisk(const String & file_name, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode = WriteMode::Rewrite, const WriteSettings & write_settings = {}) const = 0; + WriteMode write_mode = WriteMode::Rewrite) const = 0; virtual size_t copyFileToDisk(const SizeAndChecksum & size_and_checksum, DiskPtr destination_disk, const String & destination_path, - WriteMode write_mode = WriteMode::Rewrite, const WriteSettings & write_settings = {}) const = 0; + WriteMode write_mode = WriteMode::Rewrite) const = 0; /// Puts a new entry to the backup. virtual void writeFile(const BackupFileInfo & file_info, BackupEntryPtr entry) = 0; diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp index 46f44471e6f..daae9627759 100644 --- a/src/Backups/registerBackupEnginesFileAndDisk.cpp +++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp @@ -169,9 +169,9 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory) { std::shared_ptr reader; if (engine_name == "File") - reader = std::make_shared(path); + reader = std::make_shared(path, params.context); else - reader = std::make_shared(disk, path); + reader = std::make_shared(disk, path, params.context); return std::make_unique(backup_name_for_logging, archive_params, params.base_backup_info, reader, params.context); } else diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index be726ef46b4..db53accc062 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -199,10 +199,12 @@ public: } std::unique_ptr readEncryptedFile( - const String & path, const ReadSettings & settings) const override + const String & path, const ReadSettings & settings, + std::optional read_hint, + std::optional file_size) const override { auto wrapped_path = wrappedPath(path); - return delegate->readFile(wrapped_path, settings); + return delegate->readFile(wrapped_path, settings, read_hint, file_size); } std::unique_ptr writeEncryptedFile( diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 54e09b09d2f..3c8ca454463 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -52,7 +52,7 @@ void IDisk::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_ba } } -std::unique_ptr IDisk::readEncryptedFile(const String &, const ReadSettings &) const +std::unique_ptr IDisk::readEncryptedFile(const String &, const ReadSettings &, std::optional, std::optional) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 006f2d882a0..770eec46ce6 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -254,7 +254,10 @@ public: /// Reads a file from an encrypted disk without decrypting it. virtual std::unique_ptr readEncryptedFile( - const String & path, const ReadSettings & settings = ReadSettings{}) const; + const String & path, + const ReadSettings & settings = ReadSettings{}, + std::optional read_hint = {}, + std::optional file_size = {}) const; /// Writes an already encrypted file to an encrypted disk. 
virtual std::unique_ptr writeEncryptedFile( diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index c080c2cd92d..bfa1ed1ab26 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -584,7 +584,9 @@ Strings DiskObjectStorage::getBlobPath(const String & path) const auto objects = getStorageObjects(path); Strings res; res.reserve(objects.size() + 1); - res.emplace_back(object_storage->getObjectsNamespace()); + String objects_namespace = object_storage->getObjectsNamespace(); + if (!objects_namespace.empty()) + res.emplace_back(objects_namespace); for (const auto & object : objects) res.emplace_back(object.absolute_path); return res; From 517e119e0385079b22531032ece33f4a8ed45977 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 1 May 2023 14:23:59 +0200 Subject: [PATCH 078/127] Move checksum calculation to IBackupEntry. --- src/Backups/BackupEntryFromAppendOnlyFile.cpp | 45 +++++++++++--- src/Backups/BackupEntryFromAppendOnlyFile.h | 26 +++++--- src/Backups/BackupEntryFromImmutableFile.cpp | 60 +++++++++++++------ src/Backups/BackupEntryFromImmutableFile.h | 23 +++---- src/Backups/BackupEntryFromMemory.cpp | 7 +-- src/Backups/BackupEntryFromMemory.h | 18 ++---- src/Backups/BackupEntryFromSmallFile.cpp | 9 +-- src/Backups/BackupEntryFromSmallFile.h | 22 ++----- .../BackupEntryWithChecksumCalculation.cpp | 48 +++++++++++++++ .../BackupEntryWithChecksumCalculation.h | 22 +++++++ src/Backups/BackupEntryWrappedWith.h | 7 ++- src/Backups/BackupFileInfo.cpp | 39 +++--------- src/Backups/BackupIO_File.cpp | 3 - src/Backups/IBackupEntriesLazyBatch.cpp | 7 ++- src/Backups/IBackupEntry.h | 9 ++- src/Disks/DiskEncrypted.cpp | 7 +++ src/Disks/DiskEncrypted.h | 2 + src/Storages/StorageReplicatedMergeTree.cpp | 3 +- 18 files changed, 228 insertions(+), 129 deletions(-) create mode 100644 src/Backups/BackupEntryWithChecksumCalculation.cpp create mode 100644 src/Backups/BackupEntryWithChecksumCalculation.h diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index 83117f686bf..e3b79695a0c 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -1,25 +1,52 @@ #include +#include #include namespace DB { +namespace +{ + UInt64 calculateSize(const DiskPtr & disk, const String & file_path, const std::optional & file_size, bool disk_is_encrypted) + { + if (file_size) + { + if (disk_is_encrypted) + return DiskEncrypted::convertFileSizeToEncryptedFileSize(*file_size); + else + return *file_size; + } + else + { + if (disk_is_encrypted) + return disk->getEncryptedFileSize(file_path); + else + return disk->getFileSize(file_path); + } + } +} + BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( - const DiskPtr & disk_, - const String & file_path_, - const ReadSettings & settings_, - const std::optional & file_size_, - const std::optional & checksum_) - : BackupEntryFromImmutableFile(disk_, file_path_, settings_, file_size_, checksum_) - , limit(BackupEntryFromImmutableFile::getSize()) + const DiskPtr & disk_, const String & file_path_, const ReadSettings & settings_, const std::optional & file_size_) + : disk(disk_) + , file_path(file_path_) + , data_source_description(disk->getDataSourceDescription()) + , settings(settings_) + , size(calculateSize(disk_, file_path_, file_size_, data_source_description.is_encrypted)) { } 
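// A minimal usage sketch of the entry after this change, for illustration only (assumes an existing
// `disk`, `read_settings` and file; the file name and prefix length are hypothetical). The checksum is
// no longer passed to the constructor; it is calculated lazily by BackupEntryWithChecksumCalculation:
//
//     auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(disk, "data/part.bin", read_settings);
//     UInt64 size = entry->getSize();                   // converted to the encrypted size on an encrypted disk
//     UInt128 checksum = entry->getChecksum();          // hashed on first use and cached
//     std::optional<UInt128> prefix = entry->getPartialChecksum(1024);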
+BackupEntryFromAppendOnlyFile::~BackupEntryFromAppendOnlyFile() = default; + std::unique_ptr BackupEntryFromAppendOnlyFile::getReadBuffer() const { - auto buf = BackupEntryFromImmutableFile::getReadBuffer(); - return std::make_unique(std::move(buf), 0, limit); + std::unique_ptr buf; + if (data_source_description.is_encrypted) + buf = disk->readEncryptedFile(file_path, settings); + else + buf = disk->readFile(file_path, settings); + return std::make_unique(std::move(buf), 0, size); } } diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index 7c57e55923e..e6814c4c345 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -8,24 +8,34 @@ namespace DB /// Represents a file prepared to be included in a backup, assuming that until this backup entry is destroyed /// the file can be appended with new data, but the bytes which are already in the file won't be changed. -class BackupEntryFromAppendOnlyFile : public BackupEntryFromImmutableFile +class BackupEntryFromAppendOnlyFile : public BackupEntryWithChecksumCalculation { public: - /// The constructor is allowed to not set `file_size_` or `checksum_`, in that case it will be calculated from the data. + /// The constructor is allowed to not set `file_size_`, in that case it will be calculated from the data. BackupEntryFromAppendOnlyFile( const DiskPtr & disk_, const String & file_path_, const ReadSettings & settings_, - const std::optional & file_size_ = {}, - const std::optional & checksum_ = {}); + const std::optional & file_size_ = {}); + + ~BackupEntryFromAppendOnlyFile() override; - UInt64 getSize() const override { return limit; } std::unique_ptr getReadBuffer() const override; + UInt64 getSize() const override { return size; } - bool isFromImmutableFile() const override { return false; } + DataSourceDescription getDataSourceDescription() const override { return data_source_description; } + bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } + + bool isFromFile() const override { return true; } + DiskPtr getDisk() const override { return disk; } + String getFilePath() const override { return file_path; } private: - const UInt64 limit; + const DiskPtr disk; + const String file_path; + const DataSourceDescription data_source_description; + const ReadSettings settings; + const UInt64 size; }; } diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index 7545134f638..ad0ce477600 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -1,8 +1,6 @@ #include #include -#include -#include -#include +#include namespace DB @@ -18,26 +16,13 @@ BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( , file_path(file_path_) , data_source_description(disk->getDataSourceDescription()) , settings(settings_) - , file_size(data_source_description.is_encrypted ? std::optional{} : file_size_) - , checksum(data_source_description.is_encrypted ? 
std::optional{} : checksum_) + , file_size(file_size_) + , checksum(checksum_) { } BackupEntryFromImmutableFile::~BackupEntryFromImmutableFile() = default; -UInt64 BackupEntryFromImmutableFile::getSize() const -{ - std::lock_guard lock{get_file_size_mutex}; - if (!file_size) - { - if (data_source_description.is_encrypted) - file_size = disk->getEncryptedFileSize(file_path); - else - file_size = disk->getFileSize(file_path); - } - return *file_size; -} - std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer() const { if (data_source_description.is_encrypted) @@ -46,4 +31,43 @@ std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer( return disk->readFile(file_path, settings); } +UInt64 BackupEntryFromImmutableFile::getSize() const +{ + std::lock_guard lock{size_and_checksum_mutex}; + if (!file_size_adjusted) + { + if (!file_size) + file_size = disk->getFileSize(file_path); + if (data_source_description.is_encrypted) + *file_size = DiskEncrypted::convertFileSizeToEncryptedFileSize(*file_size); + file_size_adjusted = true; + } + return *file_size; +} + +UInt128 BackupEntryFromImmutableFile::getChecksum() const +{ + std::lock_guard lock{size_and_checksum_mutex}; + if (!checksum_adjusted) + { + /// TODO: We should not just ignore `checksum` if `data_source_description.is_encrypted == true`, we should use it somehow. + if (!checksum || data_source_description.is_encrypted) + checksum = BackupEntryWithChecksumCalculation::getChecksum(); + checksum_adjusted = true; + } + return *checksum; +} + +std::optional BackupEntryFromImmutableFile::getPartialChecksum(size_t prefix_length) const +{ + if (prefix_length == 0) + return 0; + + if (prefix_length >= getSize()) + return getChecksum(); + + /// For immutable files we don't use partial checksums. + return std::nullopt; +} + } diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index f2801b67df6..23f48e8335a 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include @@ -11,7 +11,7 @@ class IDisk; using DiskPtr = std::shared_ptr; /// Represents a file prepared to be included in a backup, assuming that until this backup entry is destroyed the file won't be changed. -class BackupEntryFromImmutableFile : public IBackupEntry +class BackupEntryFromImmutableFile : public BackupEntryWithChecksumCalculation { public: /// The constructor is allowed to not set `file_size_` or `checksum_`, in that case it will be calculated from the data. 
@@ -24,13 +24,14 @@ public: ~BackupEntryFromImmutableFile() override; - UInt64 getSize() const override; - std::optional getChecksum() const override { return checksum; } std::unique_ptr getReadBuffer() const override; - - bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } + + UInt64 getSize() const override; + UInt128 getChecksum() const override; + std::optional getPartialChecksum(size_t prefix_length) const override; DataSourceDescription getDataSourceDescription() const override { return data_source_description; } + bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } bool isFromFile() const override { return true; } bool isFromImmutableFile() const override { return true; } @@ -41,10 +42,12 @@ private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; - ReadSettings settings; - mutable std::optional file_size TSA_GUARDED_BY(get_file_size_mutex); - mutable std::mutex get_file_size_mutex; - const std::optional checksum; + const ReadSettings settings; + mutable std::optional file_size; + mutable std::optional checksum; + mutable bool file_size_adjusted = false; + mutable bool checksum_adjusted = false; + mutable std::mutex size_and_checksum_mutex; }; } diff --git a/src/Backups/BackupEntryFromMemory.cpp b/src/Backups/BackupEntryFromMemory.cpp index f59eadc2d7f..82e033caca0 100644 --- a/src/Backups/BackupEntryFromMemory.cpp +++ b/src/Backups/BackupEntryFromMemory.cpp @@ -5,13 +5,12 @@ namespace DB { -BackupEntryFromMemory::BackupEntryFromMemory(const void * data_, size_t size_, const std::optional & checksum_) - : BackupEntryFromMemory(String{reinterpret_cast(data_), size_}, checksum_) +BackupEntryFromMemory::BackupEntryFromMemory(const void * data_, size_t size_) + : BackupEntryFromMemory(String{reinterpret_cast(data_), size_}) { } -BackupEntryFromMemory::BackupEntryFromMemory(String data_, const std::optional & checksum_) - : data(std::move(data_)), checksum(checksum_) +BackupEntryFromMemory::BackupEntryFromMemory(String data_) : data(std::move(data_)) { } diff --git a/src/Backups/BackupEntryFromMemory.h b/src/Backups/BackupEntryFromMemory.h index 64f46d68580..19b72c16e53 100644 --- a/src/Backups/BackupEntryFromMemory.h +++ b/src/Backups/BackupEntryFromMemory.h @@ -1,32 +1,26 @@ #pragma once -#include -#include +#include namespace DB { /// Represents small preloaded data to be included in a backup. -class BackupEntryFromMemory : public IBackupEntry +class BackupEntryFromMemory : public BackupEntryWithChecksumCalculation { public: /// The constructor is allowed to not set `checksum_`, in that case it will be calculated from the data. 
- BackupEntryFromMemory(const void * data_, size_t size_, const std::optional & checksum_ = {}); - explicit BackupEntryFromMemory(String data_, const std::optional & checksum_ = {}); + BackupEntryFromMemory(const void * data_, size_t size_); + explicit BackupEntryFromMemory(String data_); - UInt64 getSize() const override { return data.size(); } - std::optional getChecksum() const override { return checksum; } std::unique_ptr getReadBuffer() const override; + UInt64 getSize() const override { return data.size(); } - DataSourceDescription getDataSourceDescription() const override - { - return DataSourceDescription{DataSourceType::RAM, "", false, false}; - } + DataSourceDescription getDataSourceDescription() const override { return DataSourceDescription{DataSourceType::RAM, "", false, false}; } private: const String data; - const std::optional checksum; }; } diff --git a/src/Backups/BackupEntryFromSmallFile.cpp b/src/Backups/BackupEntryFromSmallFile.cpp index 6f7d2364031..3662522c243 100644 --- a/src/Backups/BackupEntryFromSmallFile.cpp +++ b/src/Backups/BackupEntryFromSmallFile.cpp @@ -29,21 +29,18 @@ namespace } -BackupEntryFromSmallFile::BackupEntryFromSmallFile(const String & file_path_, const std::optional & checksum_) +BackupEntryFromSmallFile::BackupEntryFromSmallFile(const String & file_path_) : file_path(file_path_) , data_source_description(DiskLocal::getLocalDataSourceDescription(file_path_)) , data(readFile(file_path_)) - , checksum(checksum_) { } - -BackupEntryFromSmallFile::BackupEntryFromSmallFile( - const DiskPtr & disk_, const String & file_path_, const std::optional & checksum_) + +BackupEntryFromSmallFile::BackupEntryFromSmallFile(const DiskPtr & disk_, const String & file_path_) : disk(disk_) , file_path(file_path_) , data_source_description(disk_->getDataSourceDescription()) , data(readFile(disk_, file_path, data_source_description.is_encrypted)) - , checksum(data_source_description.is_encrypted ? std::optional{} : checksum_) { } diff --git a/src/Backups/BackupEntryFromSmallFile.h b/src/Backups/BackupEntryFromSmallFile.h index 2f7f3764571..6846ca71a55 100644 --- a/src/Backups/BackupEntryFromSmallFile.h +++ b/src/Backups/BackupEntryFromSmallFile.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -10,37 +10,27 @@ using DiskPtr = std::shared_ptr; /// Represents a file prepared to be included in a backup, /// assuming that the file is small and can be easily loaded into memory. -class BackupEntryFromSmallFile : public IBackupEntry +class BackupEntryFromSmallFile : public BackupEntryWithChecksumCalculation { public: - /// The constructor is allowed to not set `checksum_`, in that case it will be calculated from the data. 
- explicit BackupEntryFromSmallFile( - const String & file_path_, - const std::optional & checksum_ = {}); + explicit BackupEntryFromSmallFile(const String & file_path_); + BackupEntryFromSmallFile(const DiskPtr & disk_, const String & file_path_); - BackupEntryFromSmallFile( - const DiskPtr & disk_, - const String & file_path_, - const std::optional & checksum_ = {}); - - UInt64 getSize() const override { return data.size(); } - std::optional getChecksum() const override { return checksum; } std::unique_ptr getReadBuffer() const override; + UInt64 getSize() const override { return data.size(); } + DataSourceDescription getDataSourceDescription() const override { return data_source_description; } bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } bool isFromFile() const override { return true; } DiskPtr getDisk() const override { return disk; } String getFilePath() const override { return file_path; } - DataSourceDescription getDataSourceDescription() const override { return data_source_description; } - private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; const String data; - const std::optional checksum; }; } diff --git a/src/Backups/BackupEntryWithChecksumCalculation.cpp b/src/Backups/BackupEntryWithChecksumCalculation.cpp new file mode 100644 index 00000000000..bc97a907cff --- /dev/null +++ b/src/Backups/BackupEntryWithChecksumCalculation.cpp @@ -0,0 +1,48 @@ +#include +#include + + +namespace DB +{ + +template +UInt128 BackupEntryWithChecksumCalculation::getChecksum() const +{ + std::lock_guard lock{checksum_calculation_mutex}; + if (!calculated_checksum) + { + auto read_buffer = this->getReadBuffer(); + HashingReadBuffer hashing_read_buffer(*read_buffer); + hashing_read_buffer.ignoreAll(); + calculated_checksum = hashing_read_buffer.getHash(); + } + return *calculated_checksum; +} + +template +std::optional BackupEntryWithChecksumCalculation::getPartialChecksum(size_t prefix_length) const +{ + if (prefix_length == 0) + return 0; + + if (prefix_length >= this->getSize()) + return this->getChecksum(); + + auto read_buffer = this->getReadBuffer(); + HashingReadBuffer hashing_read_buffer(*read_buffer); + hashing_read_buffer.ignore(prefix_length); + auto partial_checksum = hashing_read_buffer.getHash(); + + std::lock_guard lock{checksum_calculation_mutex}; + if (!calculated_checksum) + { + hashing_read_buffer.ignoreAll(); + calculated_checksum = hashing_read_buffer.getHash(); + } + + return partial_checksum; +} + +template class BackupEntryWithChecksumCalculation; + +} diff --git a/src/Backups/BackupEntryWithChecksumCalculation.h b/src/Backups/BackupEntryWithChecksumCalculation.h new file mode 100644 index 00000000000..32701ab9952 --- /dev/null +++ b/src/Backups/BackupEntryWithChecksumCalculation.h @@ -0,0 +1,22 @@ +#pragma once + +#include + + +namespace DB +{ + +/// Calculates the checksum and the partial checksum for a backup entry based on ReadBuffer returned by getReadBuffer(). 
+template +class BackupEntryWithChecksumCalculation : public Base +{ +public: + UInt128 getChecksum() const override; + std::optional getPartialChecksum(size_t prefix_length) const override; + +private: + mutable std::optional calculated_checksum; + mutable std::mutex checksum_calculation_mutex; +}; + +} diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h index 933fec39634..01b6163039b 100644 --- a/src/Backups/BackupEntryWrappedWith.h +++ b/src/Backups/BackupEntryWrappedWith.h @@ -15,15 +15,16 @@ public: BackupEntryWrappedWith(BackupEntryPtr entry_, T && custom_value_) : entry(entry_), custom_value(std::move(custom_value_)) { } ~BackupEntryWrappedWith() override = default; - UInt64 getSize() const override { return entry->getSize(); } - std::optional getChecksum() const override { return entry->getChecksum(); } std::unique_ptr getReadBuffer() const override { return entry->getReadBuffer(); } + UInt64 getSize() const override { return entry->getSize(); } + UInt128 getChecksum() const override { return entry->getChecksum(); } + std::optional getPartialChecksum(size_t prefix_length) const override { return entry->getPartialChecksum(prefix_length); } + DataSourceDescription getDataSourceDescription() const override { return entry->getDataSourceDescription(); } bool isEncryptedByDisk() const override { return entry->isEncryptedByDisk(); } bool isFromFile() const override { return entry->isFromFile(); } bool isFromImmutableFile() const override { return entry->isFromImmutableFile(); } String getFilePath() const override { return entry->getFilePath(); } DiskPtr getDisk() const override { return entry->getDisk(); } - DataSourceDescription getDataSourceDescription() const override { return entry->getDataSourceDescription(); } private: BackupEntryPtr entry; diff --git a/src/Backups/BackupFileInfo.cpp b/src/Backups/BackupFileInfo.cpp index 91ddc52ae44..42546d1b1b8 100644 --- a/src/Backups/BackupFileInfo.cpp +++ b/src/Backups/BackupFileInfo.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include namespace DB @@ -49,44 +49,19 @@ namespace struct ChecksumsForNewEntry { UInt128 full_checksum; - UInt128 prefix_checksum; + std::optional prefix_checksum; }; /// Calculate checksum for backup entry if it's empty. /// Also able to calculate additional checksum of some prefix. ChecksumsForNewEntry calculateNewEntryChecksumsIfNeeded(const BackupEntryPtr & entry, size_t prefix_size) { + ChecksumsForNewEntry res; + /// The partial checksum should be calculated before the full checksum to enable optimization in BackupEntryWithChecksumCalculation. 
if (prefix_size > 0) - { - auto read_buffer = entry->getReadBuffer(); - HashingReadBuffer hashing_read_buffer(*read_buffer); - hashing_read_buffer.ignore(prefix_size); - auto prefix_checksum = hashing_read_buffer.getHash(); - if (entry->getChecksum() == std::nullopt) - { - hashing_read_buffer.ignoreAll(); - auto full_checksum = hashing_read_buffer.getHash(); - return ChecksumsForNewEntry{full_checksum, prefix_checksum}; - } - else - { - return ChecksumsForNewEntry{*(entry->getChecksum()), prefix_checksum}; - } - } - else - { - if (entry->getChecksum() == std::nullopt) - { - auto read_buffer = entry->getReadBuffer(); - HashingReadBuffer hashing_read_buffer(*read_buffer); - hashing_read_buffer.ignoreAll(); - return ChecksumsForNewEntry{hashing_read_buffer.getHash(), 0}; - } - else - { - return ChecksumsForNewEntry{*(entry->getChecksum()), 0}; - } - } + res.prefix_checksum = entry->getPartialChecksum(prefix_size); + res.full_checksum = entry->getChecksum(); + return res; } /// We store entries' file names in the backup without leading slashes. diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 69265799793..4268f653e30 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -1,10 +1,7 @@ #include #include -//#include #include #include -//#include -//#include #include diff --git a/src/Backups/IBackupEntriesLazyBatch.cpp b/src/Backups/IBackupEntriesLazyBatch.cpp index cd4d470967b..8913d60f2eb 100644 --- a/src/Backups/IBackupEntriesLazyBatch.cpp +++ b/src/Backups/IBackupEntriesLazyBatch.cpp @@ -17,11 +17,12 @@ class IBackupEntriesLazyBatch::BackupEntryFromBatch : public IBackupEntry public: BackupEntryFromBatch(const std::shared_ptr & batch_, size_t index_) : batch(batch_), index(index_) { } - UInt64 getSize() const override { return getInternalBackupEntry()->getSize(); } - std::optional getChecksum() const override { return getInternalBackupEntry()->getChecksum(); } std::unique_ptr getReadBuffer() const override { return getInternalBackupEntry()->getReadBuffer(); } - bool isEncryptedByDisk() const override { return getInternalBackupEntry()->isEncryptedByDisk(); } + UInt64 getSize() const override { return getInternalBackupEntry()->getSize(); } + UInt128 getChecksum() const override { return getInternalBackupEntry()->getChecksum(); } + std::optional getPartialChecksum(size_t prefix_length) const override { return getInternalBackupEntry()->getPartialChecksum(prefix_length); } DataSourceDescription getDataSourceDescription() const override { return getInternalBackupEntry()->getDataSourceDescription(); } + bool isEncryptedByDisk() const override { return getInternalBackupEntry()->isEncryptedByDisk(); } bool isFromFile() const override { return getInternalBackupEntry()->isFromFile(); } bool isFromImmutableFile() const override { return getInternalBackupEntry()->isFromImmutableFile(); } String getFilePath() const override { return getInternalBackupEntry()->getFilePath(); } diff --git a/src/Backups/IBackupEntry.h b/src/Backups/IBackupEntry.h index 7a93d4035df..249c290cbe7 100644 --- a/src/Backups/IBackupEntry.h +++ b/src/Backups/IBackupEntry.h @@ -20,9 +20,12 @@ public: /// Returns the size of the data. virtual UInt64 getSize() const = 0; - /// Returns the checksum of the data if it's precalculated. - /// Can return nullopt which means the checksum should be calculated from the read buffer. - virtual std::optional getChecksum() const { return {}; } + /// Returns the checksum of the data. 
+ virtual UInt128 getChecksum() const = 0; + + /// Returns a partial checksum, i.e. the checksum calculated for a prefix part of the data. + /// Can return nullopt if the partial checksum is too difficult to calculate. + virtual std::optional getPartialChecksum(size_t /* prefix_length */) const { return {}; } /// Returns a read buffer for reading the data. virtual std::unique_ptr getReadBuffer() const = 0; diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index db18e9652e7..15f314dcfc9 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -348,6 +348,13 @@ size_t DiskEncrypted::getFileSize(const String & path) const return size > FileEncryption::Header::kSize ? (size - FileEncryption::Header::kSize) : 0; } +size_t DiskEncrypted::convertFileSizeToEncryptedFileSize(size_t file_size) +{ + if (file_size) + return file_size + FileEncryption::Header::kSize; + return 0; +} + void DiskEncrypted::truncateFile(const String & path, size_t size) { auto wrapped_path = wrappedPath(path); diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index db53accc062..295e70a2c66 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -223,6 +223,8 @@ public: return delegate->getFileSize(wrapped_path); } + static size_t convertFileSizeToEncryptedFileSize(size_t file_size); + void setLastModified(const String & path, const Poco::Timestamp & timestamp) override { auto wrapped_path = wrappedPath(path); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 1dde057569e..fba6e6cbafc 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -9278,10 +9278,9 @@ void StorageReplicatedMergeTree::backupData( auto & hash = part_names_with_hashes_calculating[part_name]; if (relative_path.ends_with(".bin")) { - auto checksum = backup_entry->getChecksum(); hash.update(relative_path); hash.update(backup_entry->getSize()); - hash.update(*checksum); + hash.update(backup_entry->getChecksum()); } continue; } From c48c20fac83a0f7014a97e74fa81ce8ab49bdd1d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 1 May 2023 18:43:20 +0200 Subject: [PATCH 079/127] Use combined checksums for encrypted immutable files. --- src/Backups/BackupEntryFromAppendOnlyFile.cpp | 25 +++++++------------ src/Backups/BackupEntryFromAppendOnlyFile.h | 2 +- src/Backups/BackupEntryFromImmutableFile.cpp | 22 ++++++++++++---- src/Backups/BackupIO_Disk.cpp | 2 +- src/Backups/BackupIO_File.cpp | 2 +- src/Backups/BackupIO_S3.cpp | 2 +- src/Disks/DiskEncrypted.cpp | 16 +++++++++--- src/Disks/DiskEncrypted.h | 4 +++ src/Disks/IDisk.cpp | 10 ++++++++ src/Disks/IDisk.h | 10 +++++--- .../test_backup_restore_s3/test.py | 12 ++++----- 11 files changed, 70 insertions(+), 37 deletions(-) diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index e3b79695a0c..2c28abc2e2f 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -8,22 +8,15 @@ namespace DB namespace { - UInt64 calculateSize(const DiskPtr & disk, const String & file_path, const std::optional & file_size, bool disk_is_encrypted) + /// For append-only files we must calculate its size on the construction of a backup entry. 
+ UInt64 calculateSize(const DiskPtr & disk, const String & file_path, bool is_encrypted, std::optional unencrypted_file_size) { - if (file_size) - { - if (disk_is_encrypted) - return DiskEncrypted::convertFileSizeToEncryptedFileSize(*file_size); - else - return *file_size; - } + if (!unencrypted_file_size) + return is_encrypted ? disk->getEncryptedFileSize(file_path) : disk->getFileSize(file_path); + else if (is_encrypted) + return disk->getEncryptedFileSize(*unencrypted_file_size); else - { - if (disk_is_encrypted) - return disk->getEncryptedFileSize(file_path); - else - return disk->getFileSize(file_path); - } + return *unencrypted_file_size; } } @@ -32,8 +25,8 @@ BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( : disk(disk_) , file_path(file_path_) , data_source_description(disk->getDataSourceDescription()) - , settings(settings_) - , size(calculateSize(disk_, file_path_, file_size_, data_source_description.is_encrypted)) + , size(calculateSize(disk_, file_path_, data_source_description.is_encrypted, file_size_)) + , settings(settings_.adjustBufferSize(size)) { } diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index e6814c4c345..fac8c319d8c 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -34,8 +34,8 @@ private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; - const ReadSettings settings; const UInt64 size; + const ReadSettings settings; }; } diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index ad0ce477600..f377e54d9b0 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -6,6 +6,17 @@ namespace DB { +namespace +{ + /// We mix the checksum calculated for non-encrypted data with IV generated to encrypt the file + /// to generate kind of a checksum for encrypted data. Of course it differs from the CityHash properly calculated for encrypted data. + UInt128 combineChecksums(UInt128 checksum1, UInt128 checksum2) + { + chassert(std::size(checksum2.items) == 2); + return CityHash_v1_0_2::CityHash128WithSeed(reinterpret_cast(&checksum1), sizeof(checksum1), {checksum2.items[0], checksum2.items[1]}); + } +} + BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, @@ -37,9 +48,9 @@ UInt64 BackupEntryFromImmutableFile::getSize() const if (!file_size_adjusted) { if (!file_size) - file_size = disk->getFileSize(file_path); - if (data_source_description.is_encrypted) - *file_size = DiskEncrypted::convertFileSizeToEncryptedFileSize(*file_size); + file_size = data_source_description.is_encrypted ? disk->getEncryptedFileSize(file_path) : disk->getFileSize(file_path); + else if (data_source_description.is_encrypted) + file_size = disk->getEncryptedFileSize(*file_size); file_size_adjusted = true; } return *file_size; @@ -50,9 +61,10 @@ UInt128 BackupEntryFromImmutableFile::getChecksum() const std::lock_guard lock{size_and_checksum_mutex}; if (!checksum_adjusted) { - /// TODO: We should not just ignore `checksum` if `data_source_description.is_encrypted == true`, we should use it somehow. 
- if (!checksum || data_source_description.is_encrypted) + if (!checksum) checksum = BackupEntryWithChecksumCalculation::getChecksum(); + else if (data_source_description.is_encrypted) + checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path)); checksum_adjusted = true; } return *checksum; diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index f64e929131c..1e1af63cdc2 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -74,7 +74,7 @@ UInt64 BackupWriterDisk::getFileSize(const String & file_name) std::unique_ptr BackupWriterDisk::readFile(const String & file_name, size_t expected_file_size) { - return disk->readFile(file_name, read_settings, {}, expected_file_size); + return disk->readFile(root_path / file_name, read_settings.adjustBufferSize(expected_file_size)); } std::unique_ptr BackupWriterDisk::writeFile(const String & file_name) diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 4268f653e30..649637e567d 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -87,7 +87,7 @@ UInt64 BackupWriterFile::getFileSize(const String & file_name) std::unique_ptr BackupWriterFile::readFile(const String & file_name, size_t expected_file_size) { - return createReadBufferFromFileBase(root_path / file_name, read_settings, {}, expected_file_size); + return createReadBufferFromFileBase(root_path / file_name, read_settings.adjustBufferSize(expected_file_size)); } std::unique_ptr BackupWriterFile::writeFile(const String & file_name) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 8d9d34bf9b5..09d2bdbcfaf 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -205,7 +205,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src /* src_key= */ blob_path[1], start_pos, length, - s3_uri.endpoint, + s3_uri.bucket, fs::path(s3_uri.key) / path_in_backup, request_settings, {}, diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index 15f314dcfc9..72f668db00b 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -348,10 +348,20 @@ size_t DiskEncrypted::getFileSize(const String & path) const return size > FileEncryption::Header::kSize ? 
(size - FileEncryption::Header::kSize) : 0; } -size_t DiskEncrypted::convertFileSizeToEncryptedFileSize(size_t file_size) +UInt128 DiskEncrypted::getEncryptedFileIV(const String & path) const { - if (file_size) - return file_size + FileEncryption::Header::kSize; + auto wrapped_path = wrappedPath(path); + auto read_buffer = delegate->readFile(wrapped_path, ReadSettings().adjustBufferSize(FileEncryption::Header::kSize)); + if (read_buffer->eof()) + return 0; + auto header = readHeader(*read_buffer); + return header.init_vector.get(); +} + +size_t DiskEncrypted::getEncryptedFileSize(size_t unencrypted_size) const +{ + if (unencrypted_size) + return unencrypted_size + FileEncryption::Header::kSize; return 0; } diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 295e70a2c66..c494dd6a216 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -223,6 +223,10 @@ public: return delegate->getFileSize(wrapped_path); } + size_t getEncryptedFileSize(size_t unencrypted_size) const override; + + UInt128 getEncryptedFileIV(const String & path) const override; + static size_t convertFileSizeToEncryptedFileSize(size_t file_size); void setLastModified(const String & path, const Poco::Timestamp & timestamp) override diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 3c8ca454463..9a5ae997b46 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -67,6 +67,16 @@ size_t IDisk::getEncryptedFileSize(const String &) const throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); } +size_t IDisk::getEncryptedFileSize(size_t) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); +} + +UInt128 IDisk::getEncryptedFileIV(const String &) const +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); +} + using ResultsCollector = std::vector>; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 770eec46ce6..47b4ed80ebf 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -252,22 +252,26 @@ public: /// this method allows to specify a callback for that. virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; - /// Reads a file from an encrypted disk without decrypting it. + /// Reads a file from an encrypted disk without decrypting it (only for encrypted disks). virtual std::unique_ptr readEncryptedFile( const String & path, const ReadSettings & settings = ReadSettings{}, std::optional read_hint = {}, std::optional file_size = {}) const; - /// Writes an already encrypted file to an encrypted disk. + /// Writes an already encrypted file to the disk (only for encrypted disks). virtual std::unique_ptr writeEncryptedFile( const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, WriteMode mode = WriteMode::Rewrite, const WriteSettings & settings = {}) const; - /// Returns the size of encrypted file on an encrypted disk. + /// Returns the size of an encrypted file (only for encrypted disks). virtual size_t getEncryptedFileSize(const String & path) const; + virtual size_t getEncryptedFileSize(size_t unencrypted_size) const; + + /// Returns IV of an encrypted file (only for encrypted disks). 
+ virtual UInt128 getEncryptedFileIV(const String & path) const; virtual const String & getCacheName() const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "There is no cache"); } diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index b5ac34f0b46..2f60575b634 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -141,8 +141,8 @@ def test_backup_to_s3_native_copy(): f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')" ) check_backup_and_restore(storage_policy, backup_destination) - assert node.contains_in_log("copyS3FileFromDisk.*using native copy") - assert node.contains_in_log("copyS3FileToDisk.*using native copy") + assert node.contains_in_log("BackupWriterS3.*using native copy") + assert node.contains_in_log("BackupReaderS3.*using native copy") assert node.contains_in_log( f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}" ) @@ -155,8 +155,8 @@ def test_backup_to_s3_native_copy_other_bucket(): f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')" ) check_backup_and_restore(storage_policy, backup_destination) - assert node.contains_in_log("copyS3FileFromDisk.*using native copy") - assert node.contains_in_log("copyS3FileToDisk.*using native copy") + assert node.contains_in_log("BackupWriterS3.*using native copy") + assert node.contains_in_log("BackupReaderS3.*using native copy") assert node.contains_in_log( f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}" ) @@ -167,8 +167,8 @@ def test_backup_to_s3_native_copy_multipart(): backup_name = new_backup_name() backup_destination = f"S3('http://minio1:9001/root/data/backups/multipart/{backup_name}', 'minio', 'minio123')" check_backup_and_restore(storage_policy, backup_destination, size=1000000) - assert node.contains_in_log("copyS3FileFromDisk.*using native copy") - assert node.contains_in_log("copyS3FileToDisk.*using native copy") + assert node.contains_in_log("BackupWriterS3.*using native copy") + assert node.contains_in_log("BackupReaderS3.*using native copy") assert node.contains_in_log( f"copyS3File: Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}/" ) From d0fd81fa27514ff18001a25bd2a3e2b631ca9e0d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 2 May 2023 01:55:56 +0200 Subject: [PATCH 080/127] Add tests. 
--- .../configs/allow_backup_path.xml | 5 + tests/integration/test_encrypted_disk/test.py | 126 +++++++++++++++++- 2 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_encrypted_disk/configs/allow_backup_path.xml diff --git a/tests/integration/test_encrypted_disk/configs/allow_backup_path.xml b/tests/integration/test_encrypted_disk/configs/allow_backup_path.xml new file mode 100644 index 00000000000..eade3bfb744 --- /dev/null +++ b/tests/integration/test_encrypted_disk/configs/allow_backup_path.xml @@ -0,0 +1,5 @@ + + + /backups/ + + \ No newline at end of file diff --git a/tests/integration/test_encrypted_disk/test.py b/tests/integration/test_encrypted_disk/test.py index 8187f2ff6a8..90da5849c7f 100644 --- a/tests/integration/test_encrypted_disk/test.py +++ b/tests/integration/test_encrypted_disk/test.py @@ -1,6 +1,7 @@ import pytest from helpers.cluster import ClickHouseCluster from helpers.client import QueryRuntimeException +import os.path from helpers.test_tools import assert_eq_with_retry @@ -9,8 +10,9 @@ FIRST_PART_NAME = "all_1_1_0" cluster = ClickHouseCluster(__file__) node = cluster.add_instance( "node", - main_configs=["configs/storage.xml"], + main_configs=["configs/storage.xml", "configs/allow_backup_path.xml"], tmpfs=["/disk:size=100M"], + external_dirs=["/backups/"], with_minio=True, stay_alive=True, ) @@ -33,6 +35,15 @@ def cleanup_after_test(): node.query("DROP TABLE IF EXISTS encrypted_test SYNC") +backup_id_counter = 0 + + +def new_backup_name(): + global backup_id_counter + backup_id_counter += 1 + return f"backup{backup_id_counter}" + + @pytest.mark.parametrize( "policy", ["encrypted_policy", "encrypted_policy_key192b", "local_policy", "s3_policy"], @@ -295,3 +306,116 @@ def test_restart(): assert node.query(select_query) == "(0,'data'),(1,'data')" node.query("DROP TABLE encrypted_test SYNC;") + + +@pytest.mark.parametrize( + "storage_policy,backup_type,storage_policy2", + [ + ("encrypted_policy", "S3", "encrypted_policy"), + ("encrypted_policy", "S3", "s3_encrypted_default_path"), + ("s3_encrypted_default_path", "S3", "s3_encrypted_default_path"), + ("s3_encrypted_default_path", "S3", "encrypted_policy"), + ("s3_encrypted_default_path", "File", "encrypted_policy"), + ("local_policy", "File", "encrypted_policy"), + ], +) +def test_backup_restore(storage_policy, backup_type, storage_policy2): + node.query( + f""" + CREATE TABLE encrypted_test ( + id Int64, + data String + ) ENGINE=MergeTree() + ORDER BY id + SETTINGS storage_policy='{storage_policy}' + """ + ) + + node.query("INSERT INTO encrypted_test VALUES (0,'data'),(1,'data')") + select_query = "SELECT * FROM encrypted_test ORDER BY id FORMAT Values" + assert node.query(select_query) == "(0,'data'),(1,'data')" + + backup_name = new_backup_name() + if backup_type == "S3": + backup_destination = ( + f"S3('http://minio1:9001/root/backups/{backup_name}', 'minio', 'minio123')" + ) + elif backup_type == "File": + backup_destination = f"File('/backups/{backup_name}/')" + + node.query(f"BACKUP TABLE encrypted_test TO {backup_destination}") + + if backup_type == "File" and storage_policy.find("encrypted") != -1: + root_path = os.path.join(node.cluster.instances_dir, "backups", backup_name) + with open( + f"{root_path}/data/default/encrypted_test/all_1_1_0/data.bin", "rb" + ) as file: + assert file.read().startswith(b"ENC") + with open(f"{root_path}/metadata/default/encrypted_test.sql") as file: + assert file.read().startswith("CREATE TABLE default.encrypted_test") + with 
open(f"{root_path}/.backup") as file: + assert file.read().find("true") != -1 + + node.query(f"DROP TABLE encrypted_test SYNC") + + if storage_policy != storage_policy2: + node.query( + f""" + CREATE TABLE encrypted_test ( + id Int64, + data String + ) ENGINE=MergeTree() + ORDER BY id + SETTINGS storage_policy='{storage_policy2}' + """ + ) + + node.query( + f"RESTORE TABLE encrypted_test FROM {backup_destination} SETTINGS allow_different_table_def={int(storage_policy != storage_policy2)}" + ) + + assert node.query(select_query) == "(0,'data'),(1,'data')" + + +def test_cannot_restore_encrypted_files_to_unencrypted_disk(): + node.query( + """ + CREATE TABLE encrypted_test ( + id Int64, + data String + ) ENGINE=MergeTree() + ORDER BY id + SETTINGS storage_policy='encrypted_policy' + """ + ) + + node.query("INSERT INTO encrypted_test VALUES (0,'data'),(1,'data')") + assert ( + node.query("SELECT * FROM encrypted_test ORDER BY id FORMAT Values") + == "(0,'data'),(1,'data')" + ) + + backup_name = new_backup_name() + backup_destination = ( + f"S3('http://minio1:9001/root/backups/{backup_name}', 'minio', 'minio123')" + ) + node.query(f"BACKUP TABLE encrypted_test TO {backup_destination}") + + node.query(f"DROP TABLE encrypted_test SYNC") + + node.query( + f""" + CREATE TABLE encrypted_test ( + id Int64, + data String + ) ENGINE=MergeTree() + ORDER BY id + SETTINGS storage_policy='local_policy' + """ + ) + + expected_error = "can be restored only to an encrypted disk" + assert expected_error in node.query_and_get_error( + f"RESTORE TABLE encrypted_test FROM {backup_destination} SETTINGS allow_different_table_def=1" + ) +>>>>>>> 9c08fb30995 (Add tests.) From 7cea26423010a09ef6777c17ac210cd11cfb14c7 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 3 May 2023 12:33:35 +0200 Subject: [PATCH 081/127] Fix whitespaces. --- src/Backups/BackupIO.h | 4 ++-- src/Backups/BackupIO_Default.h | 4 ++-- src/Backups/BackupIO_Disk.cpp | 5 ----- src/Backups/BackupIO_File.cpp | 5 +++++ src/Backups/IBackupEntry.h | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index d522387deba..e4a82a604e8 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -28,10 +28,10 @@ public: /// The function copyFileToDisk() can be much faster than reading the file with readFile() and then writing it to some disk. /// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). /// Parameters: - /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. + /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. virtual void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) = 0; - + virtual const ReadSettings & getReadSettings() const = 0; virtual const WriteSettings & getWriteSettings() const = 0; virtual size_t getWriteBufferSize() const = 0; diff --git a/src/Backups/BackupIO_Default.h b/src/Backups/BackupIO_Default.h index 0fc510f9361..ad7bdf15d9f 100644 --- a/src/Backups/BackupIO_Default.h +++ b/src/Backups/BackupIO_Default.h @@ -25,10 +25,10 @@ public: /// The function copyFileToDisk() can be much faster than reading the file with readFile() and then writing it to some disk. 
/// (especially for S3 where it can use CopyObject to copy objects inside S3 instead of downloading and uploading them). /// Parameters: - /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. + /// `encrypted_in_backup` specify if this file is encrypted in the backup, so it shouldn't be encrypted again while restoring to an encrypted disk. void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; - + const ReadSettings & getReadSettings() const override { return read_settings; } const WriteSettings & getWriteSettings() const override { return write_settings; } size_t getWriteBufferSize() const override { return write_buffer_size; } diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index 1e1af63cdc2..b58aa1832a5 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -8,11 +8,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & root_path_, const ContextPtr & context_) : BackupReaderDefault(&Poco::Logger::get("BackupReaderDisk"), context_) , disk(disk_) diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 649637e567d..eb079623919 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -11,6 +11,11 @@ namespace fs = std::filesystem; namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + BackupReaderFile::BackupReaderFile(const String & root_path_, const ContextPtr & context_) : BackupReaderDefault(&Poco::Logger::get("BackupReaderFile"), context_) , root_path(root_path_) diff --git a/src/Backups/IBackupEntry.h b/src/Backups/IBackupEntry.h index 249c290cbe7..ca176c740c0 100644 --- a/src/Backups/IBackupEntry.h +++ b/src/Backups/IBackupEntry.h @@ -33,7 +33,7 @@ public: /// Returns true if the data returned by getReadBuffer() is encrypted by an encrypted disk. virtual bool isEncryptedByDisk() const { return false; } - /// Returns information about disk and file if this backup entry is generated from a file. + /// Returns information about disk and file if this backup entry is generated from a file. virtual bool isFromFile() const { return false; } virtual bool isFromImmutableFile() const { return false; } virtual String getFilePath() const { return ""; } From 5198997fd84dc3d2071f4b80b6888ebf8f800b81 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 3 May 2023 13:51:36 +0200 Subject: [PATCH 082/127] Remove ReadSettings from backup entries. 
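A minimal sketch of the resulting call pattern (illustrative only, not part of the diff below): backup entries no longer capture ReadSettings at construction; the reader/writer supplies its settings at the moment the buffer is created, as BackupImpl::writeFile() now does with writer->getReadSettings(). The helper name below is hypothetical and the include paths are assumed.

    #include <Backups/IBackupEntry.h>   // declares getReadBuffer(const ReadSettings &) per this patch
    #include <IO/ReadSettings.h>        // assumed location of ReadSettings

    // Hypothetical call site: the caller's read settings are applied when the
    // buffer is created instead of being stored inside every backup entry.
    void copyEntryData(const DB::BackupEntryPtr & entry, const DB::ReadSettings & writer_read_settings)
    {
        auto read_buffer = entry->getReadBuffer(writer_read_settings.adjustBufferSize(entry->getSize()));
        // ... copy *read_buffer to the destination, as BackupImpl::writeFile() does ...
    }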
--- src/Backups/BackupEntryFromAppendOnlyFile.cpp | 9 ++++----- src/Backups/BackupEntryFromAppendOnlyFile.h | 4 +--- src/Backups/BackupEntryFromImmutableFile.cpp | 8 +++----- src/Backups/BackupEntryFromImmutableFile.h | 6 ++---- src/Backups/BackupEntryFromMemory.cpp | 2 +- src/Backups/BackupEntryFromMemory.h | 2 +- src/Backups/BackupEntryFromSmallFile.cpp | 2 +- src/Backups/BackupEntryFromSmallFile.h | 2 +- src/Backups/BackupEntryWithChecksumCalculation.cpp | 4 ++-- src/Backups/BackupEntryWrappedWith.h | 2 +- src/Backups/BackupImpl.cpp | 4 ++-- src/Backups/IBackupEntriesLazyBatch.cpp | 2 +- src/Backups/IBackupEntry.h | 2 +- src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp | 3 +-- src/Storages/MergeTree/DataPartStorageOnDiskBase.h | 1 - src/Storages/MergeTree/IDataPartStorage.h | 1 - src/Storages/MergeTree/MergeTreeData.cpp | 2 -- src/Storages/StorageLog.cpp | 8 +++----- src/Storages/StorageMemory.cpp | 6 ++---- src/Storages/StorageStripeLog.cpp | 8 +++----- 20 files changed, 30 insertions(+), 48 deletions(-) diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index 2c28abc2e2f..331a4a69d06 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -21,24 +21,23 @@ namespace } BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( - const DiskPtr & disk_, const String & file_path_, const ReadSettings & settings_, const std::optional & file_size_) + const DiskPtr & disk_, const String & file_path_, const std::optional & file_size_) : disk(disk_) , file_path(file_path_) , data_source_description(disk->getDataSourceDescription()) , size(calculateSize(disk_, file_path_, data_source_description.is_encrypted, file_size_)) - , settings(settings_.adjustBufferSize(size)) { } BackupEntryFromAppendOnlyFile::~BackupEntryFromAppendOnlyFile() = default; -std::unique_ptr BackupEntryFromAppendOnlyFile::getReadBuffer() const +std::unique_ptr BackupEntryFromAppendOnlyFile::getReadBuffer(const ReadSettings & read_settings) const { std::unique_ptr buf; if (data_source_description.is_encrypted) - buf = disk->readEncryptedFile(file_path, settings); + buf = disk->readEncryptedFile(file_path, read_settings.adjustBufferSize(size)); else - buf = disk->readFile(file_path, settings); + buf = disk->readFile(file_path, read_settings.adjustBufferSize(size)); return std::make_unique(std::move(buf), 0, size); } diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index fac8c319d8c..8a78478dcc5 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -15,12 +15,11 @@ public: BackupEntryFromAppendOnlyFile( const DiskPtr & disk_, const String & file_path_, - const ReadSettings & settings_, const std::optional & file_size_ = {}); ~BackupEntryFromAppendOnlyFile() override; - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer(const ReadSettings & read_settings) const override; UInt64 getSize() const override { return size; } DataSourceDescription getDataSourceDescription() const override { return data_source_description; } @@ -35,7 +34,6 @@ private: const String file_path; const DataSourceDescription data_source_description; const UInt64 size; - const ReadSettings settings; }; } diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index f377e54d9b0..24965f09356 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ 
b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -20,13 +20,11 @@ namespace BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, - const ReadSettings & settings_, const std::optional & file_size_, const std::optional & checksum_) : disk(disk_) , file_path(file_path_) , data_source_description(disk->getDataSourceDescription()) - , settings(settings_) , file_size(file_size_) , checksum(checksum_) { @@ -34,12 +32,12 @@ BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( BackupEntryFromImmutableFile::~BackupEntryFromImmutableFile() = default; -std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer() const +std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer(const ReadSettings & read_settings) const { if (data_source_description.is_encrypted) - return disk->readEncryptedFile(file_path, settings); + return disk->readEncryptedFile(file_path, read_settings); else - return disk->readFile(file_path, settings); + return disk->readFile(file_path, read_settings); } UInt64 BackupEntryFromImmutableFile::getSize() const diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index 23f48e8335a..5cdb4adc7c4 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -1,10 +1,10 @@ #pragma once #include -#include #include #include + namespace DB { class IDisk; @@ -18,13 +18,12 @@ public: BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, - const ReadSettings & settings_, const std::optional & file_size_ = {}, const std::optional & checksum_ = {}); ~BackupEntryFromImmutableFile() override; - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer(const ReadSettings & read_settings) const override; UInt64 getSize() const override; UInt128 getChecksum() const override; @@ -42,7 +41,6 @@ private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; - const ReadSettings settings; mutable std::optional file_size; mutable std::optional checksum; mutable bool file_size_adjusted = false; diff --git a/src/Backups/BackupEntryFromMemory.cpp b/src/Backups/BackupEntryFromMemory.cpp index 82e033caca0..1dd911afbbb 100644 --- a/src/Backups/BackupEntryFromMemory.cpp +++ b/src/Backups/BackupEntryFromMemory.cpp @@ -14,7 +14,7 @@ BackupEntryFromMemory::BackupEntryFromMemory(String data_) : data(std::move(data { } -std::unique_ptr BackupEntryFromMemory::getReadBuffer() const +std::unique_ptr BackupEntryFromMemory::getReadBuffer(const ReadSettings &) const { return std::make_unique(data); } diff --git a/src/Backups/BackupEntryFromMemory.h b/src/Backups/BackupEntryFromMemory.h index 19b72c16e53..d8bc0eb966d 100644 --- a/src/Backups/BackupEntryFromMemory.h +++ b/src/Backups/BackupEntryFromMemory.h @@ -14,7 +14,7 @@ public: BackupEntryFromMemory(const void * data_, size_t size_); explicit BackupEntryFromMemory(String data_); - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer(const ReadSettings &) const override; UInt64 getSize() const override { return data.size(); } DataSourceDescription getDataSourceDescription() const override { return DataSourceDescription{DataSourceType::RAM, "", false, false}; } diff --git a/src/Backups/BackupEntryFromSmallFile.cpp b/src/Backups/BackupEntryFromSmallFile.cpp index 3662522c243..3dcee7147fc 100644 --- a/src/Backups/BackupEntryFromSmallFile.cpp +++ b/src/Backups/BackupEntryFromSmallFile.cpp @@ -44,7 
+44,7 @@ BackupEntryFromSmallFile::BackupEntryFromSmallFile(const DiskPtr & disk_, const { } -std::unique_ptr BackupEntryFromSmallFile::getReadBuffer() const +std::unique_ptr BackupEntryFromSmallFile::getReadBuffer(const ReadSettings &) const { return std::make_unique(data); } diff --git a/src/Backups/BackupEntryFromSmallFile.h b/src/Backups/BackupEntryFromSmallFile.h index 6846ca71a55..4f936718fbb 100644 --- a/src/Backups/BackupEntryFromSmallFile.h +++ b/src/Backups/BackupEntryFromSmallFile.h @@ -16,7 +16,7 @@ public: explicit BackupEntryFromSmallFile(const String & file_path_); BackupEntryFromSmallFile(const DiskPtr & disk_, const String & file_path_); - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer(const ReadSettings &) const override; UInt64 getSize() const override { return data.size(); } DataSourceDescription getDataSourceDescription() const override { return data_source_description; } diff --git a/src/Backups/BackupEntryWithChecksumCalculation.cpp b/src/Backups/BackupEntryWithChecksumCalculation.cpp index bc97a907cff..2c0c2eab8ff 100644 --- a/src/Backups/BackupEntryWithChecksumCalculation.cpp +++ b/src/Backups/BackupEntryWithChecksumCalculation.cpp @@ -11,7 +11,7 @@ UInt128 BackupEntryWithChecksumCalculation::getChecksum() const std::lock_guard lock{checksum_calculation_mutex}; if (!calculated_checksum) { - auto read_buffer = this->getReadBuffer(); + auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(this->getSize())); HashingReadBuffer hashing_read_buffer(*read_buffer); hashing_read_buffer.ignoreAll(); calculated_checksum = hashing_read_buffer.getHash(); @@ -28,7 +28,7 @@ std::optional BackupEntryWithChecksumCalculation::getPartialCheck if (prefix_length >= this->getSize()) return this->getChecksum(); - auto read_buffer = this->getReadBuffer(); + auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(prefix_length)); HashingReadBuffer hashing_read_buffer(*read_buffer); hashing_read_buffer.ignore(prefix_length); auto partial_checksum = hashing_read_buffer.getHash(); diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h index 01b6163039b..f865d529206 100644 --- a/src/Backups/BackupEntryWrappedWith.h +++ b/src/Backups/BackupEntryWrappedWith.h @@ -15,7 +15,7 @@ public: BackupEntryWrappedWith(BackupEntryPtr entry_, T && custom_value_) : entry(entry_), custom_value(std::move(custom_value_)) { } ~BackupEntryWrappedWith() override = default; - std::unique_ptr getReadBuffer() const override { return entry->getReadBuffer(); } + std::unique_ptr getReadBuffer(const ReadSettings & read_settings) const override { return entry->getReadBuffer(read_settings); } UInt64 getSize() const override { return entry->getSize(); } UInt128 getChecksum() const override { return entry->getChecksum(); } std::optional getPartialChecksum(size_t prefix_length) const override { return entry->getPartialChecksum(prefix_length); } diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 715fe9e0d13..7fcb42ec378 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -868,7 +868,7 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) { LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}, adding to archive", info.data_file_name, src_file_desc, info.data_file_index); auto out = archive_writer->writeFile(info.data_file_name); - auto read_buffer = entry->getReadBuffer(); + auto read_buffer = entry->getReadBuffer(writer->getReadSettings()); if 
(info.base_size != 0) read_buffer->seek(info.base_size, SEEK_SET); copyData(*read_buffer, *out); @@ -882,7 +882,7 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) else { LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); - auto create_read_buffer = [entry] { return entry->getReadBuffer(); }; + auto create_read_buffer = [entry, read_settings = writer->getReadSettings()] { return entry->getReadBuffer(read_settings); }; writer->copyDataToFile(info.data_file_name, create_read_buffer, info.base_size, info.size - info.base_size); } diff --git a/src/Backups/IBackupEntriesLazyBatch.cpp b/src/Backups/IBackupEntriesLazyBatch.cpp index 8913d60f2eb..7c6bb891981 100644 --- a/src/Backups/IBackupEntriesLazyBatch.cpp +++ b/src/Backups/IBackupEntriesLazyBatch.cpp @@ -17,7 +17,7 @@ class IBackupEntriesLazyBatch::BackupEntryFromBatch : public IBackupEntry public: BackupEntryFromBatch(const std::shared_ptr & batch_, size_t index_) : batch(batch_), index(index_) { } - std::unique_ptr getReadBuffer() const override { return getInternalBackupEntry()->getReadBuffer(); } + std::unique_ptr getReadBuffer(const ReadSettings & read_settings) const override { return getInternalBackupEntry()->getReadBuffer(read_settings); } UInt64 getSize() const override { return getInternalBackupEntry()->getSize(); } UInt128 getChecksum() const override { return getInternalBackupEntry()->getChecksum(); } std::optional getPartialChecksum(size_t prefix_length) const override { return getInternalBackupEntry()->getPartialChecksum(prefix_length); } diff --git a/src/Backups/IBackupEntry.h b/src/Backups/IBackupEntry.h index ca176c740c0..7e952e9b568 100644 --- a/src/Backups/IBackupEntry.h +++ b/src/Backups/IBackupEntry.h @@ -28,7 +28,7 @@ public: virtual std::optional getPartialChecksum(size_t /* prefix_length */) const { return {}; } /// Returns a read buffer for reading the data. - virtual std::unique_ptr getReadBuffer() const = 0; + virtual std::unique_ptr getReadBuffer(const ReadSettings & read_settings) const = 0; /// Returns true if the data returned by getReadBuffer() is encrypted by an encrypted disk. 
virtual bool isEncryptedByDisk() const { return false; } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 4df490f41fe..c1e2c5b8cf8 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -319,7 +319,6 @@ DataPartStorageOnDiskBase::getReplicatedFilesDescriptionForRemoteDisk(const Name } void DataPartStorageOnDiskBase::backup( - const ReadSettings & read_settings, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, @@ -393,7 +392,7 @@ void DataPartStorageOnDiskBase::backup( file_hash = {it->second.file_hash.first, it->second.file_hash.second}; } - BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, read_settings, file_size, file_hash); + BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, file_size, file_hash); if (temp_dir_owner) backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index fcc771f1250..52544bb2457 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -50,7 +50,6 @@ public: ReplicatedFilesDescription getReplicatedFilesDescriptionForRemoteDisk(const NameSet & file_names) const override; void backup( - const ReadSettings & read_settings, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index b0b42b331cd..9c267d94e63 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -197,7 +197,6 @@ public: /// Also creates a new tmp_dir for internal disk (if disk is mentioned the first time). 
using TemporaryFilesOnDisks = std::map>; virtual void backup( - const ReadSettings & read_settings, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a53eb417fcd..e36bc8baeb4 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5086,7 +5086,6 @@ BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, con BackupEntries backup_entries_from_part; part->getDataPartStorage().backup( - read_settings, part->checksums, part->getFileNamesWithoutChecksums(), data_path_in_backup, @@ -5098,7 +5097,6 @@ BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, con for (const auto & [projection_name, projection_part] : projection_parts) { projection_part->getDataPartStorage().backup( - read_settings, projection_part->checksums, projection_part->getFileNamesWithoutChecksums(), fs::path{data_path_in_backup} / part->name, diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 31f499a7d96..6d77d2b97b0 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -928,10 +928,8 @@ std::optional StorageLog::totalBytes(const Settings &) const void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & /* partitions */) { - auto local_context = backup_entries_collector.getContext(); - ReadSettings read_settings = local_context->getBackupReadSettings(); + auto lock_timeout = getLockTimeout(backup_entries_collector.getContext()); - auto lock_timeout = getLockTimeout(local_context); loadMarks(lock_timeout); ReadLock lock{rwlock, lock_timeout}; @@ -954,7 +952,7 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file.path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(data_file.path)); + disk, hardlink_file_path, file_checker.getFileSize(data_file.path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / data_file_name, std::move(backup_entry)); } @@ -967,7 +965,7 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c String hardlink_file_path = temp_dir / marks_file_name; disk->createHardLink(marks_file_path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(marks_file_path)); + disk, hardlink_file_path, file_checker.getFileSize(marks_file_path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / marks_file_name, std::move(backup_entry)); } diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 68c888a2d23..ebc780f5ab1 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -308,8 +308,6 @@ namespace BackupEntries generate() override { - ReadSettings read_settings = context->getBackupReadSettings(); - BackupEntries backup_entries; backup_entries.resize(file_paths.size()); @@ -326,7 +324,7 @@ namespace NativeWriter block_out{data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; for 
(const auto & block : *blocks) block_out.write(block); - backup_entries[data_bin_pos] = {file_paths[data_bin_pos], std::make_shared(temp_disk, data_file_path, read_settings)}; + backup_entries[data_bin_pos] = {file_paths[data_bin_pos], std::make_shared(temp_disk, data_file_path)}; } /// Writing index.mrk @@ -335,7 +333,7 @@ namespace auto index_mrk_out_compressed = temp_disk->writeFile(index_mrk_path); CompressedWriteBuffer index_mrk_out{*index_mrk_out_compressed}; index.write(index_mrk_out); - backup_entries[index_mrk_pos] = {file_paths[index_mrk_pos], std::make_shared(temp_disk, index_mrk_path, read_settings)}; + backup_entries[index_mrk_pos] = {file_paths[index_mrk_pos], std::make_shared(temp_disk, index_mrk_path)}; } /// Writing columns.txt diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index 5b22db91631..23bf88b9db5 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -529,10 +529,8 @@ std::optional StorageStripeLog::totalBytes(const Settings &) const void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & /* partitions */) { - auto local_context = backup_entries_collector.getContext(); - ReadSettings read_settings = local_context->getBackupReadSettings(); + auto lock_timeout = getLockTimeout(backup_entries_collector.getContext()); - auto lock_timeout = getLockTimeout(local_context); loadIndices(lock_timeout); ReadLock lock{rwlock, lock_timeout}; @@ -554,7 +552,7 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file_path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(data_file_path)); + disk, hardlink_file_path, file_checker.getFileSize(data_file_path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / data_file_name, std::move(backup_entry)); } @@ -566,7 +564,7 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec String hardlink_file_path = temp_dir / index_file_name; disk->createHardLink(index_file_path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, read_settings, file_checker.getFileSize(index_file_path)); + disk, hardlink_file_path, file_checker.getFileSize(index_file_path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / index_file_name, std::move(backup_entry)); } From 019493efa370f4c7aceadc6cb6a33485bf022254 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 3 May 2023 15:56:28 +0200 Subject: [PATCH 083/127] Fix throttling in backups. 
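The core of the fix as a minimal sketch (the helper name is made up for illustration; in the diff below the same guard is inlined into the Disk/File readers and writers, which additionally check data source kind and encryption): the optimized IDisk::copyFile()/fs::copy() path bypasses the read buffers and therefore ignores any configured throttler, so it is now taken only when no throttler is set; otherwise the copy falls back to buffers, where throttling applies.

    #include <IO/ReadSettings.h>   // assumed location of ReadSettings

    // Hypothetical helper mirroring the guard added in this commit.
    static bool mayUseNativeCopy(const DB::ReadSettings & read_settings, bool disk_is_remote)
    {
        const bool has_throttling = disk_is_remote
            ? static_cast<bool>(read_settings.remote_throttler)
            : static_cast<bool>(read_settings.local_throttler);
        return !has_throttling;   // with throttling configured, use the buffered copy instead
    }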
--- src/Backups/BackupIO_Disk.cpp | 39 ++++++++++++++++++++++++----------- src/Backups/BackupIO_Disk.h | 3 +++ src/Backups/BackupIO_File.cpp | 18 +++++++++------- src/Backups/BackupIO_File.h | 1 - src/Backups/BackupIO_S3.cpp | 11 ++++++---- 5 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index b58aa1832a5..3b1651bb223 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -12,6 +12,7 @@ BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & root_pa : BackupReaderDefault(&Poco::Logger::get("BackupReaderDisk"), context_) , disk(disk_) , root_path(root_path_) + , data_source_description(disk->getDataSourceDescription()) { } @@ -35,12 +36,18 @@ std::unique_ptr BackupReaderDisk::readFile(const String & fi void BackupReaderDisk::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { - if ((write_mode == WriteMode::Rewrite) && !encrypted_in_backup) + /// Use IDisk::copyFile() as a more optimal way to copy a file if it's possible. + bool has_throttling = disk->isRemote() ? static_cast(read_settings.remote_throttler) : static_cast(read_settings.local_throttler); + if (!has_throttling && (write_mode == WriteMode::Rewrite) && !encrypted_in_backup) { - /// Use more optimal way. - LOG_TRACE(log, "Copying file {} from disk {} to disk {}", path_in_backup, disk->getName(), destination_disk->getName()); - disk->copyFile(root_path / path_in_backup, *destination_disk, destination_path, write_settings); - return; + auto destination_data_source_description = destination_disk->getDataSourceDescription(); + if (destination_data_source_description.sameKind(data_source_description) && !data_source_description.is_encrypted) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} from disk {} to disk {}", path_in_backup, disk->getName(), destination_disk->getName()); + disk->copyFile(root_path / path_in_backup, *destination_disk, destination_path, write_settings); + return; /// copied! + } } /// Fallback to copy through buffers. @@ -52,6 +59,7 @@ BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & root_pa : BackupWriterDefault(&Poco::Logger::get("BackupWriterDisk"), context_) , disk(disk_) , root_path(root_path_) + , data_source_description(disk->getDataSourceDescription()) { } @@ -97,14 +105,21 @@ void BackupWriterDisk::removeFiles(const Strings & file_names) void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) { - if (!copy_encrypted && !start_pos && (length == src_disk->getFileSize(src_path))) + /// Use IDisk::copyFile() as a more optimal way to copy a file if it's possible. + bool has_throttling = src_disk->isRemote() ? static_cast(read_settings.remote_throttler) : static_cast(read_settings.local_throttler); + if (!has_throttling && !start_pos && !copy_encrypted) { - /// Use more optimal way. 
- LOG_TRACE(log, "Copying file {} from disk {} to disk {}", src_path, src_disk->getName(), disk->getName()); - auto dest_file_path = root_path / path_in_backup; - disk->createDirectories(dest_file_path.parent_path()); - src_disk->copyFile(src_path, *disk, dest_file_path, write_settings); - return; + auto source_data_source_description = src_disk->getDataSourceDescription(); + if (source_data_source_description.sameKind(data_source_description) && !source_data_source_description.is_encrypted + && (length == src_disk->getFileSize(src_path))) + { + /// Use more optimal way. + LOG_TRACE(log, "Copying file {} from disk {} to disk {}", src_path, src_disk->getName(), disk->getName()); + auto dest_file_path = root_path / path_in_backup; + disk->createDirectories(dest_file_path.parent_path()); + src_disk->copyFile(src_path, *disk, dest_file_path, write_settings); + return; /// copied! + } } /// Fallback to copy through buffers. diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index 399ebeaa227..faf4ef03447 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -26,6 +27,7 @@ public: private: const DiskPtr disk; const std::filesystem::path root_path; + const DataSourceDescription data_source_description; }; class BackupWriterDisk : public BackupWriterDefault @@ -50,6 +52,7 @@ private: const DiskPtr disk; const std::filesystem::path root_path; + const DataSourceDescription data_source_description; }; } diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index eb079623919..7ffae26d16f 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -41,7 +41,9 @@ std::unique_ptr BackupReaderFile::readFile(const String & fi void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { - if (write_mode == WriteMode::Rewrite) + /// std::filesystem::copy() can copy from the filesystem only, and it can't do the throttling or appending. + bool has_throttling = static_cast(read_settings.local_throttler); + if (!has_throttling && (write_mode == WriteMode::Rewrite)) { auto destination_data_source_description = destination_disk->getDataSourceDescription(); if (destination_data_source_description.sameKind(data_source_description) @@ -50,10 +52,10 @@ void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file /// Use more optimal way. LOG_TRACE(log, "Copying file {} to disk {} locally", path_in_backup, destination_disk->getName()); - auto write_blob_function - = [abs_source_path = root_path / path_in_backup, file_size]( - const Strings & blob_path, WriteMode mode, const std::optional &) -> size_t + auto write_blob_function = [abs_source_path = root_path / path_in_backup, file_size]( + const Strings & blob_path, WriteMode mode, const std::optional &) -> size_t { + /// For local disks the size of a blob path is expected to be 1. if (blob_path.size() != 1 || mode != WriteMode::Rewrite) throw Exception(ErrorCodes::LOGICAL_ERROR, "Blob writing function called with unexpected blob_path.size={} or mode={}", @@ -63,7 +65,7 @@ void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file }; destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); - return; + return; /// copied! 
} } @@ -76,7 +78,6 @@ BackupWriterFile::BackupWriterFile(const String & root_path_, const ContextPtr & : BackupWriterDefault(&Poco::Logger::get("BackupWriterFile"), context_) , root_path(root_path_) , data_source_description(DiskLocal::getLocalDataSourceDescription(root_path)) - , has_throttling(static_cast(read_settings.local_throttler)) { } @@ -120,7 +121,8 @@ void BackupWriterFile::removeFiles(const Strings & file_names) void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) { - /// std::filesystem::copy() can copy from the filesystem only, and it cannot do the throttling. + /// std::filesystem::copy() can copy from the filesystem only, and it can't do the throttling or copy a part of the file. + bool has_throttling = static_cast(read_settings.local_throttler); if (!has_throttling) { auto source_data_source_description = src_disk->getDataSourceDescription(); @@ -140,7 +142,7 @@ void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr s auto abs_dest_path = root_path / path_in_backup; fs::create_directories(abs_dest_path.parent_path()); fs::copy(abs_source_path, abs_dest_path, fs::copy_options::overwrite_existing); - return; + return; /// copied! } } } diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index 45fc0d47115..fd2c0b07158 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -46,7 +46,6 @@ private: const std::filesystem::path root_path; const DataSourceDescription data_source_description; - const bool has_throttling; }; } diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 09d2bdbcfaf..69f56078f9d 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -135,6 +135,8 @@ std::unique_ptr BackupReaderS3::readFile(const String & file void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { + /// Use the native copy as a more optimal way to copy a file from S3 to S3 if it's possible. + /// We don't check for `has_throttling` here because the native copy almost doesn't use network. auto destination_data_source_description = destination_disk->getDataSourceDescription(); if (destination_data_source_description.sameKind(data_source_description) && (destination_data_source_description.is_encrypted == encrypted_in_backup)) @@ -166,7 +168,7 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s }; destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); - return; + return; /// copied! } /// Fallback to copy through buffers. @@ -189,9 +191,10 @@ BackupWriterS3::BackupWriterS3( void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) { + /// Use the native copy as a more optimal way to copy a file from S3 to S3 if it's possible. + /// We don't check for `has_throttling` here because the native copy almost doesn't use network. 
auto source_data_source_description = src_disk->getDataSourceDescription(); - if (source_data_source_description.sameKind(data_source_description) - && (source_data_source_description.is_encrypted == copy_encrypted)) + if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) { /// getBlobPath() can return more than 2 elements if the file is stored as multiple objects in S3 bucket. /// In this case we can't use the native copy. @@ -210,7 +213,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src request_settings, {}, threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupWriterS3")); - return; + return; /// copied! } } From 943707963feff55dbf07ae47a7955253cc6adde7 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 4 May 2023 01:27:16 +0200 Subject: [PATCH 084/127] Add backup setting "decrypt_files_from_encrypted_disks" --- src/Backups/BackupEntryFromAppendOnlyFile.cpp | 15 +-- src/Backups/BackupEntryFromAppendOnlyFile.h | 4 +- src/Backups/BackupEntryFromImmutableFile.cpp | 11 +- src/Backups/BackupEntryFromImmutableFile.h | 4 +- src/Backups/BackupEntryFromSmallFile.cpp | 9 +- src/Backups/BackupEntryFromSmallFile.h | 5 +- .../BackupEntryWithChecksumCalculation.cpp | 12 +- src/Backups/BackupIO_Default.cpp | 6 +- src/Backups/BackupSettings.cpp | 1 + src/Backups/BackupSettings.h | 3 + src/Disks/IDisk.h | 4 +- .../MergeTree/DataPartStorageOnDiskBase.cpp | 10 +- .../MergeTree/DataPartStorageOnDiskBase.h | 3 +- src/Storages/MergeTree/IDataPartStorage.h | 4 +- src/Storages/MergeTree/MergeTreeData.cpp | 12 +- src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/StorageLog.cpp | 8 +- src/Storages/StorageMemory.cpp | 26 ++-- src/Storages/StorageMergeTree.cpp | 3 +- src/Storages/StorageReplicatedMergeTree.cpp | 4 +- src/Storages/StorageStripeLog.cpp | 8 +- tests/integration/test_encrypted_disk/test.py | 119 ++++++++---------- 22 files changed, 152 insertions(+), 121 deletions(-) diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index 331a4a69d06..1d73ab52820 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include @@ -9,11 +9,11 @@ namespace DB namespace { /// For append-only files we must calculate its size on the construction of a backup entry. - UInt64 calculateSize(const DiskPtr & disk, const String & file_path, bool is_encrypted, std::optional unencrypted_file_size) + UInt64 calculateSize(const DiskPtr & disk, const String & file_path, bool copy_encrypted, std::optional unencrypted_file_size) { if (!unencrypted_file_size) - return is_encrypted ? disk->getEncryptedFileSize(file_path) : disk->getFileSize(file_path); - else if (is_encrypted) + return copy_encrypted ? 
disk->getEncryptedFileSize(file_path) : disk->getFileSize(file_path); + else if (copy_encrypted) return disk->getEncryptedFileSize(*unencrypted_file_size); else return *unencrypted_file_size; @@ -21,11 +21,12 @@ namespace } BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( - const DiskPtr & disk_, const String & file_path_, const std::optional & file_size_) + const DiskPtr & disk_, const String & file_path_, bool copy_encrypted_, const std::optional & file_size_) : disk(disk_) , file_path(file_path_) , data_source_description(disk->getDataSourceDescription()) - , size(calculateSize(disk_, file_path_, data_source_description.is_encrypted, file_size_)) + , copy_encrypted(copy_encrypted_ && data_source_description.is_encrypted) + , size(calculateSize(disk_, file_path_, copy_encrypted, file_size_)) { } @@ -34,7 +35,7 @@ BackupEntryFromAppendOnlyFile::~BackupEntryFromAppendOnlyFile() = default; std::unique_ptr BackupEntryFromAppendOnlyFile::getReadBuffer(const ReadSettings & read_settings) const { std::unique_ptr buf; - if (data_source_description.is_encrypted) + if (copy_encrypted) buf = disk->readEncryptedFile(file_path, read_settings.adjustBufferSize(size)); else buf = disk->readFile(file_path, read_settings.adjustBufferSize(size)); diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index 8a78478dcc5..257c392f24c 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -15,6 +15,7 @@ public: BackupEntryFromAppendOnlyFile( const DiskPtr & disk_, const String & file_path_, + bool copy_encrypted_ = false, const std::optional & file_size_ = {}); ~BackupEntryFromAppendOnlyFile() override; @@ -23,7 +24,7 @@ public: UInt64 getSize() const override { return size; } DataSourceDescription getDataSourceDescription() const override { return data_source_description; } - bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } + bool isEncryptedByDisk() const override { return copy_encrypted; } bool isFromFile() const override { return true; } DiskPtr getDisk() const override { return disk; } @@ -33,6 +34,7 @@ private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; + const bool copy_encrypted; const UInt64 size; }; diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp b/src/Backups/BackupEntryFromImmutableFile.cpp index 24965f09356..cc635dd8541 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -1,6 +1,5 @@ #include #include -#include namespace DB @@ -20,11 +19,13 @@ namespace BackupEntryFromImmutableFile::BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, + bool copy_encrypted_, const std::optional & file_size_, const std::optional & checksum_) : disk(disk_) , file_path(file_path_) , data_source_description(disk->getDataSourceDescription()) + , copy_encrypted(copy_encrypted_ && data_source_description.is_encrypted) , file_size(file_size_) , checksum(checksum_) { @@ -34,7 +35,7 @@ BackupEntryFromImmutableFile::~BackupEntryFromImmutableFile() = default; std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer(const ReadSettings & read_settings) const { - if (data_source_description.is_encrypted) + if (copy_encrypted) return disk->readEncryptedFile(file_path, read_settings); else return disk->readFile(file_path, read_settings); @@ -46,8 +47,8 @@ UInt64 BackupEntryFromImmutableFile::getSize() const if 
(!file_size_adjusted) { if (!file_size) - file_size = data_source_description.is_encrypted ? disk->getEncryptedFileSize(file_path) : disk->getFileSize(file_path); - else if (data_source_description.is_encrypted) + file_size = copy_encrypted ? disk->getEncryptedFileSize(file_path) : disk->getFileSize(file_path); + else if (copy_encrypted) file_size = disk->getEncryptedFileSize(*file_size); file_size_adjusted = true; } @@ -61,7 +62,7 @@ UInt128 BackupEntryFromImmutableFile::getChecksum() const { if (!checksum) checksum = BackupEntryWithChecksumCalculation::getChecksum(); - else if (data_source_description.is_encrypted) + else if (copy_encrypted) checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path)); checksum_adjusted = true; } diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index 5cdb4adc7c4..850a86a3264 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -18,6 +18,7 @@ public: BackupEntryFromImmutableFile( const DiskPtr & disk_, const String & file_path_, + bool copy_encrypted_ = false, const std::optional & file_size_ = {}, const std::optional & checksum_ = {}); @@ -30,7 +31,7 @@ public: std::optional getPartialChecksum(size_t prefix_length) const override; DataSourceDescription getDataSourceDescription() const override { return data_source_description; } - bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } + bool isEncryptedByDisk() const override { return copy_encrypted; } bool isFromFile() const override { return true; } bool isFromImmutableFile() const override { return true; } @@ -41,6 +42,7 @@ private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; + const bool copy_encrypted; mutable std::optional file_size; mutable std::optional checksum; mutable bool file_size_adjusted = false; diff --git a/src/Backups/BackupEntryFromSmallFile.cpp b/src/Backups/BackupEntryFromSmallFile.cpp index 3dcee7147fc..22487767689 100644 --- a/src/Backups/BackupEntryFromSmallFile.cpp +++ b/src/Backups/BackupEntryFromSmallFile.cpp @@ -19,9 +19,9 @@ namespace return s; } - String readFile(const DiskPtr & disk, const String & file_path, bool read_encrypted) + String readFile(const DiskPtr & disk, const String & file_path, bool copy_encrypted) { - auto buf = read_encrypted ? disk->readEncryptedFile(file_path) : disk->readFile(file_path); + auto buf = copy_encrypted ? 
disk->readEncryptedFile(file_path) : disk->readFile(file_path); String s; readStringUntilEOF(s, *buf); return s; @@ -36,11 +36,12 @@ BackupEntryFromSmallFile::BackupEntryFromSmallFile(const String & file_path_) { } -BackupEntryFromSmallFile::BackupEntryFromSmallFile(const DiskPtr & disk_, const String & file_path_) +BackupEntryFromSmallFile::BackupEntryFromSmallFile(const DiskPtr & disk_, const String & file_path_, bool copy_encrypted_) : disk(disk_) , file_path(file_path_) , data_source_description(disk_->getDataSourceDescription()) - , data(readFile(disk_, file_path, data_source_description.is_encrypted)) + , copy_encrypted(copy_encrypted_ && data_source_description.is_encrypted) + , data(readFile(disk_, file_path, copy_encrypted)) { } diff --git a/src/Backups/BackupEntryFromSmallFile.h b/src/Backups/BackupEntryFromSmallFile.h index 4f936718fbb..d6651ab8cb5 100644 --- a/src/Backups/BackupEntryFromSmallFile.h +++ b/src/Backups/BackupEntryFromSmallFile.h @@ -14,13 +14,13 @@ class BackupEntryFromSmallFile : public BackupEntryWithChecksumCalculation getReadBuffer(const ReadSettings &) const override; UInt64 getSize() const override { return data.size(); } DataSourceDescription getDataSourceDescription() const override { return data_source_description; } - bool isEncryptedByDisk() const override { return data_source_description.is_encrypted; } + bool isEncryptedByDisk() const override { return copy_encrypted; } bool isFromFile() const override { return true; } DiskPtr getDisk() const override { return disk; } @@ -30,6 +30,7 @@ private: const DiskPtr disk; const String file_path; const DataSourceDescription data_source_description; + const bool copy_encrypted = false; const String data; }; diff --git a/src/Backups/BackupEntryWithChecksumCalculation.cpp b/src/Backups/BackupEntryWithChecksumCalculation.cpp index 2c0c2eab8ff..1e634e6bb73 100644 --- a/src/Backups/BackupEntryWithChecksumCalculation.cpp +++ b/src/Backups/BackupEntryWithChecksumCalculation.cpp @@ -25,15 +25,21 @@ std::optional BackupEntryWithChecksumCalculation::getPartialCheck if (prefix_length == 0) return 0; - if (prefix_length >= this->getSize()) + size_t size = this->getSize(); + if (prefix_length >= size) return this->getChecksum(); - auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(prefix_length)); + std::lock_guard lock{checksum_calculation_mutex}; + + ReadSettings read_settings; + if (calculated_checksum) + read_settings.adjustBufferSize(calculated_checksum ? 
prefix_length : size); + + auto read_buffer = this->getReadBuffer(read_settings); HashingReadBuffer hashing_read_buffer(*read_buffer); hashing_read_buffer.ignore(prefix_length); auto partial_checksum = hashing_read_buffer.getHash(); - std::lock_guard lock{checksum_calculation_mutex}; if (!calculated_checksum) { hashing_read_buffer.ignoreAll(); diff --git a/src/Backups/BackupIO_Default.cpp b/src/Backups/BackupIO_Default.cpp index 3b4851e9441..f7ba061cf3a 100644 --- a/src/Backups/BackupIO_Default.cpp +++ b/src/Backups/BackupIO_Default.cpp @@ -82,12 +82,12 @@ void BackupWriterDefault::copyFileFromDisk(const String & path_in_backup, DiskPt { LOG_TRACE(log, "Copying file {} from disk {} through buffers", src_path, src_disk->getName()); - auto create_read_buffer = [this, src_disk, src_path, file_size = start_pos + length, copy_encrypted] + auto create_read_buffer = [src_disk, src_path, copy_encrypted, settings = read_settings.adjustBufferSize(start_pos + length)] { if (copy_encrypted) - return src_disk->readEncryptedFile(src_path, read_settings, {}, file_size); + return src_disk->readEncryptedFile(src_path, settings); else - return src_disk->readFile(src_path, read_settings, {}, file_size); + return src_disk->readFile(src_path, settings); }; copyDataToFile(path_in_backup, create_read_buffer, start_pos, length); diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 57d85305e25..882342467fe 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -23,6 +23,7 @@ namespace ErrorCodes M(String, password) \ M(Bool, structure_only) \ M(Bool, async) \ + M(Bool, decrypt_files_from_encrypted_disks) \ M(Bool, deduplicate_files) \ M(UInt64, shard_num) \ M(UInt64, replica_num) \ diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index 1b97256c75b..2c899687e6e 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -32,6 +32,9 @@ struct BackupSettings /// Whether the BACKUP command must return immediately without waiting until the backup has completed. bool async = false; + /// Whether the BACKUP command should decrypt files stored on encrypted disks. + bool decrypt_files_from_encrypted_disks = false; + /// Whether the BACKUP will omit similar files (within one backup only). bool deduplicate_files = true; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 47b4ed80ebf..4d74fe8bbab 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -253,14 +253,14 @@ public: virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; /// Reads a file from an encrypted disk without decrypting it (only for encrypted disks). - virtual std::unique_ptr readEncryptedFile( + virtual std::unique_ptr readEncryptedFile( /// NOLINT const String & path, const ReadSettings & settings = ReadSettings{}, std::optional read_hint = {}, std::optional file_size = {}) const; /// Writes an already encrypted file to the disk (only for encrypted disks). 
- virtual std::unique_ptr writeEncryptedFile( + virtual std::unique_ptr writeEncryptedFile( /// NOLINT const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, WriteMode mode = WriteMode::Rewrite, diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index c1e2c5b8cf8..ebe55ea7dc7 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -322,8 +323,9 @@ void DataPartStorageOnDiskBase::backup( const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries, + const BackupSettings & backup_settings, bool make_temporary_hard_links, + BackupEntries & backup_entries, TemporaryFilesOnDisks * temp_dirs) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; @@ -364,6 +366,8 @@ void DataPartStorageOnDiskBase::backup( files_to_backup = getActualFileNamesOnDisk(files_to_backup); + bool copy_encrypted = !backup_settings.decrypt_files_from_encrypted_disks; + for (const auto & filepath : files_to_backup) { auto filepath_on_disk = part_path_on_disk / filepath; @@ -371,7 +375,7 @@ void DataPartStorageOnDiskBase::backup( if (files_without_checksums.contains(filepath)) { - backup_entries.emplace_back(filepath_in_backup, std::make_unique(disk, filepath_on_disk)); + backup_entries.emplace_back(filepath_in_backup, std::make_unique(disk, filepath_on_disk, copy_encrypted)); continue; } @@ -392,7 +396,7 @@ void DataPartStorageOnDiskBase::backup( file_hash = {it->second.file_hash.first, it->second.file_hash.second}; } - BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, file_size, file_hash); + BackupEntryPtr backup_entry = std::make_unique(disk, filepath_on_disk, copy_encrypted, file_size, file_hash); if (temp_dir_owner) backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 52544bb2457..09eb7f008bc 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -53,8 +53,9 @@ public: const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries, + const BackupSettings & backup_settings, bool make_temporary_hard_links, + BackupEntries & backup_entries, TemporaryFilesOnDisks * temp_dirs) const override; MutableDataPartStoragePtr freeze( diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 9c267d94e63..7c85469d890 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -65,6 +65,7 @@ using SyncGuardPtr = std::unique_ptr; class IBackupEntry; using BackupEntryPtr = std::shared_ptr; using BackupEntries = std::vector>; +struct BackupSettings; struct WriteSettings; @@ -200,8 +201,9 @@ public: const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries, + const BackupSettings & backup_settings, bool make_temporary_hard_links, + BackupEntries & backup_entries, TemporaryFilesOnDisks * temp_dirs) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. 
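To see how the new setting threads through the hunks above (a hedged sketch, not actual patch content): the BACKUP-level flag is inverted once into a per-call copy_encrypted flag, and each backup entry then only treats itself as "encrypted by disk" when the backup asked to keep raw files and the disk really is encrypted. The snippet below is hypothetical and condensed from several of the files touched here:

    /// Hypothetical, condensed illustration of the propagation.
    bool copy_encrypted = !backup_settings.decrypt_files_from_encrypted_disks;

    /// Inside each entry's constructor the flag is narrowed further:
    ///     copy_encrypted(copy_encrypted_ && data_source_description.is_encrypted)
    backup_entries.emplace_back(
        filepath_in_backup,
        std::make_unique<BackupEntryFromSmallFile>(disk, filepath_on_disk, copy_encrypted));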
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e36bc8baeb4..b8208052f19 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5051,7 +5051,11 @@ Pipe MergeTreeData::alterPartition( } -BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const ContextPtr & local_context) +BackupEntries MergeTreeData::backupParts( + const DataPartsVector & data_parts, + const String & data_path_in_backup, + const BackupSettings & backup_settings, + const ContextPtr & local_context) { BackupEntries backup_entries; std::map> temp_dirs; @@ -5089,8 +5093,9 @@ BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, con part->checksums, part->getFileNamesWithoutChecksums(), data_path_in_backup, - backup_entries_from_part, + backup_settings, make_temporary_hard_links, + backup_entries_from_part, &temp_dirs); auto projection_parts = part->getProjectionParts(); @@ -5100,8 +5105,9 @@ BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, con projection_part->checksums, projection_part->getFileNamesWithoutChecksums(), fs::path{data_path_in_backup} / part->name, - backup_entries_from_part, + backup_settings, make_temporary_hard_links, + backup_entries_from_part, &temp_dirs); } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index ce5c6a730e9..04b008b623c 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1322,7 +1322,7 @@ protected: MovePartsOutcome movePartsToSpace(const DataPartsVector & parts, SpacePtr space); /// Makes backup entries to backup the parts of this table. - BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const ContextPtr & local_context); + BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const BackupSettings & backup_settings, const ContextPtr & local_context); class RestoredPartsHolder; diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index 6d77d2b97b0..f698f1881fa 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -944,6 +944,8 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c fs::path temp_dir = temp_dir_owner->getPath(); disk->createDirectories(temp_dir); + bool copy_encrypted = !backup_entries_collector.getBackupSettings().decrypt_files_from_encrypted_disks; + /// *.bin for (const auto & data_file : data_files) { @@ -952,7 +954,7 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file.path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, file_checker.getFileSize(data_file.path)); + disk, hardlink_file_path, copy_encrypted, file_checker.getFileSize(data_file.path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / data_file_name, std::move(backup_entry)); } @@ -965,7 +967,7 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c String hardlink_file_path = temp_dir / marks_file_name; disk->createHardLink(marks_file_path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, 
file_checker.getFileSize(marks_file_path)); + disk, hardlink_file_path, copy_encrypted, file_checker.getFileSize(marks_file_path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / marks_file_name, std::move(backup_entry)); } @@ -973,7 +975,7 @@ void StorageLog::backupData(BackupEntriesCollector & backup_entries_collector, c /// sizes.json String files_info_path = file_checker.getPath(); backup_entries_collector.addBackupEntry( - data_path_in_backup_fs / fileName(files_info_path), std::make_unique(disk, files_info_path)); + data_path_in_backup_fs / fileName(files_info_path), std::make_unique(disk, files_info_path, copy_encrypted)); /// columns.txt backup_entries_collector.addBackupEntry( diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index ebc780f5ab1..c9654cfd105 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -319,21 +319,25 @@ namespace IndexForNativeFormat index; { auto data_file_path = temp_dir / fs::path{file_paths[data_bin_pos]}.filename(); - auto data_out_compressed = temp_disk->writeFile(data_file_path); - CompressedWriteBuffer data_out{*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size}; - NativeWriter block_out{data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; - for (const auto & block : *blocks) - block_out.write(block); - backup_entries[data_bin_pos] = {file_paths[data_bin_pos], std::make_shared(temp_disk, data_file_path)}; + { + auto data_out_compressed = temp_disk->writeFile(data_file_path); + CompressedWriteBuffer data_out{*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size}; + NativeWriter block_out{data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; + for (const auto & block : *blocks) + block_out.write(block); + } + backup_entries[data_bin_pos] = {file_paths[data_bin_pos], std::make_shared(temp_disk, data_file_path)}; } /// Writing index.mrk { auto index_mrk_path = temp_dir / fs::path{file_paths[index_mrk_pos]}.filename(); - auto index_mrk_out_compressed = temp_disk->writeFile(index_mrk_path); - CompressedWriteBuffer index_mrk_out{*index_mrk_out_compressed}; - index.write(index_mrk_out); - backup_entries[index_mrk_pos] = {file_paths[index_mrk_pos], std::make_shared(temp_disk, index_mrk_path)}; + { + auto index_mrk_out_compressed = temp_disk->writeFile(index_mrk_path); + CompressedWriteBuffer index_mrk_out{*index_mrk_out_compressed}; + index.write(index_mrk_out); + } + backup_entries[index_mrk_pos] = {file_paths[index_mrk_pos], std::make_shared(temp_disk, index_mrk_path)}; } /// Writing columns.txt diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index b4dc2830bd6..2c19d3ba122 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2142,6 +2142,7 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ void StorageMergeTree::backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional & partitions) { + const auto & backup_settings = backup_entries_collector.getBackupSettings(); auto local_context = backup_entries_collector.getContext(); DataPartsVector data_parts; @@ -2154,7 +2155,7 @@ void StorageMergeTree::backupData(BackupEntriesCollector & 
backup_entries_collec for (const auto & data_part : data_parts) min_data_version = std::min(min_data_version, data_part->info.getDataVersion() + 1); - backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup, local_context)); + backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup, backup_settings, local_context)); backup_entries_collector.addBackupEntries(backupMutations(min_data_version, data_path_in_backup)); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index fba6e6cbafc..fcb7adbd69f 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -9252,6 +9252,8 @@ void StorageReplicatedMergeTree::backupData( /// First we generate backup entries in the same way as an ordinary MergeTree does. /// But then we don't add them to the BackupEntriesCollector right away, /// because we need to coordinate them with other replicas (other replicas can have better parts). + + const auto & backup_settings = backup_entries_collector.getBackupSettings(); auto local_context = backup_entries_collector.getContext(); DataPartsVector data_parts; @@ -9260,7 +9262,7 @@ void StorageReplicatedMergeTree::backupData( else data_parts = getVisibleDataPartsVector(local_context); - auto backup_entries = backupParts(data_parts, /* data_path_in_backup */ "", local_context); + auto backup_entries = backupParts(data_parts, /* data_path_in_backup */ "", backup_settings, local_context); auto coordination = backup_entries_collector.getBackupCoordination(); String shared_id = getTableSharedID(); diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index 23bf88b9db5..b2e7c202800 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -545,6 +545,8 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec fs::path temp_dir = temp_dir_owner->getPath(); disk->createDirectories(temp_dir); + bool copy_encrypted = !backup_entries_collector.getBackupSettings().decrypt_files_from_encrypted_disks; + /// data.bin { /// We make a copy of the data file because it can be changed later in write() or in truncate(). 
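/// [Illustrative aside, not part of the patch] Log-family storages snapshot each file for a backup by
/// hard-linking it into a temporary directory and pinning the size FileChecker last recorded, so data
/// appended or truncated after the backup started does not leak into the entry. A hedged, simplified
/// sketch (the names mirror the surrounding diff; this exact block is not repository code):
String hardlink_file_path = temp_dir / data_file_name;
disk->createHardLink(data_file_path, hardlink_file_path);
BackupEntryPtr backup_entry = std::make_unique<BackupEntryFromAppendOnlyFile>(
    disk, hardlink_file_path, copy_encrypted, file_checker.getFileSize(data_file_path));
backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner);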
@@ -552,7 +554,7 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec String hardlink_file_path = temp_dir / data_file_name; disk->createHardLink(data_file_path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, file_checker.getFileSize(data_file_path)); + disk, hardlink_file_path, copy_encrypted, file_checker.getFileSize(data_file_path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / data_file_name, std::move(backup_entry)); } @@ -564,7 +566,7 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec String hardlink_file_path = temp_dir / index_file_name; disk->createHardLink(index_file_path, hardlink_file_path); BackupEntryPtr backup_entry = std::make_unique( - disk, hardlink_file_path, file_checker.getFileSize(index_file_path)); + disk, hardlink_file_path, copy_encrypted, file_checker.getFileSize(index_file_path)); backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries_collector.addBackupEntry(data_path_in_backup_fs / index_file_name, std::move(backup_entry)); } @@ -572,7 +574,7 @@ void StorageStripeLog::backupData(BackupEntriesCollector & backup_entries_collec /// sizes.json String files_info_path = file_checker.getPath(); backup_entries_collector.addBackupEntry( - data_path_in_backup_fs / fileName(files_info_path), std::make_unique(disk, files_info_path)); + data_path_in_backup_fs / fileName(files_info_path), std::make_unique(disk, files_info_path, copy_encrypted)); /// columns.txt backup_entries_collector.addBackupEntry( diff --git a/tests/integration/test_encrypted_disk/test.py b/tests/integration/test_encrypted_disk/test.py index 90da5849c7f..66ff073f02b 100644 --- a/tests/integration/test_encrypted_disk/test.py +++ b/tests/integration/test_encrypted_disk/test.py @@ -309,17 +309,24 @@ def test_restart(): @pytest.mark.parametrize( - "storage_policy,backup_type,storage_policy2", + "backup_type,old_storage_policy,new_storage_policy,decrypt_files_from_encrypted_disks", [ - ("encrypted_policy", "S3", "encrypted_policy"), - ("encrypted_policy", "S3", "s3_encrypted_default_path"), - ("s3_encrypted_default_path", "S3", "s3_encrypted_default_path"), - ("s3_encrypted_default_path", "S3", "encrypted_policy"), - ("s3_encrypted_default_path", "File", "encrypted_policy"), - ("local_policy", "File", "encrypted_policy"), + ("S3", "encrypted_policy", "encrypted_policy", False), + ("S3", "encrypted_policy", "s3_encrypted_default_path", False), + ("S3", "s3_encrypted_default_path", "s3_encrypted_default_path", False), + ("S3", "s3_encrypted_default_path", "encrypted_policy", False), + ("File", "s3_encrypted_default_path", "encrypted_policy", False), + ("File", "local_policy", "encrypted_policy", False), + ("File", "encrypted_policy", "local_policy", False), + ("File", "encrypted_policy", "local_policy", True), ], ) -def test_backup_restore(storage_policy, backup_type, storage_policy2): +def test_backup_restore( + backup_type, + old_storage_policy, + new_storage_policy, + decrypt_files_from_encrypted_disks, +): node.query( f""" CREATE TABLE encrypted_test ( @@ -327,7 +334,7 @@ def test_backup_restore(storage_policy, backup_type, storage_policy2): data String ) ENGINE=MergeTree() ORDER BY id - SETTINGS storage_policy='{storage_policy}' + SETTINGS storage_policy='{old_storage_policy}' """ ) @@ -343,22 +350,38 @@ def test_backup_restore(storage_policy, backup_type, 
storage_policy2): elif backup_type == "File": backup_destination = f"File('/backups/{backup_name}/')" - node.query(f"BACKUP TABLE encrypted_test TO {backup_destination}") + node.query( + f"BACKUP TABLE encrypted_test TO {backup_destination} SETTINGS decrypt_files_from_encrypted_disks={int(decrypt_files_from_encrypted_disks)}" + ) - if backup_type == "File" and storage_policy.find("encrypted") != -1: + storage_policy_changed = old_storage_policy != new_storage_policy + old_disk_encrypted = old_storage_policy.find("encrypted") != -1 + new_disk_encrypted = new_storage_policy.find("encrypted") != -1 + + if backup_type == "File": root_path = os.path.join(node.cluster.instances_dir, "backups", backup_name) + expect_encrypted_in_backup = ( + old_disk_encrypted and not decrypt_files_from_encrypted_disks + ) + + with open(f"{root_path}/metadata/default/encrypted_test.sql") as file: + assert file.read().startswith("CREATE TABLE default.encrypted_test") + + with open(f"{root_path}/.backup") as file: + found_encrypted_in_backup = ( + file.read().find("true") != -1 + ) + assert found_encrypted_in_backup == expect_encrypted_in_backup + with open( f"{root_path}/data/default/encrypted_test/all_1_1_0/data.bin", "rb" ) as file: - assert file.read().startswith(b"ENC") - with open(f"{root_path}/metadata/default/encrypted_test.sql") as file: - assert file.read().startswith("CREATE TABLE default.encrypted_test") - with open(f"{root_path}/.backup") as file: - assert file.read().find("true") != -1 + found_encrypted_in_backup = file.read().startswith(b"ENC") + assert found_encrypted_in_backup == expect_encrypted_in_backup node.query(f"DROP TABLE encrypted_test SYNC") - if storage_policy != storage_policy2: + if storage_policy_changed: node.query( f""" CREATE TABLE encrypted_test ( @@ -366,56 +389,22 @@ def test_backup_restore(storage_policy, backup_type, storage_policy2): data String ) ENGINE=MergeTree() ORDER BY id - SETTINGS storage_policy='{storage_policy2}' + SETTINGS storage_policy='{new_storage_policy}' """ ) - node.query( - f"RESTORE TABLE encrypted_test FROM {backup_destination} SETTINGS allow_different_table_def={int(storage_policy != storage_policy2)}" - ) + restore_command = f"RESTORE TABLE encrypted_test FROM {backup_destination} SETTINGS allow_different_table_def={int(storage_policy_changed)}" - assert node.query(select_query) == "(0,'data'),(1,'data')" + expect_error = None + if ( + old_disk_encrypted + and not new_disk_encrypted + and not decrypt_files_from_encrypted_disks + ): + expect_error = "can be restored only to an encrypted disk" - -def test_cannot_restore_encrypted_files_to_unencrypted_disk(): - node.query( - """ - CREATE TABLE encrypted_test ( - id Int64, - data String - ) ENGINE=MergeTree() - ORDER BY id - SETTINGS storage_policy='encrypted_policy' - """ - ) - - node.query("INSERT INTO encrypted_test VALUES (0,'data'),(1,'data')") - assert ( - node.query("SELECT * FROM encrypted_test ORDER BY id FORMAT Values") - == "(0,'data'),(1,'data')" - ) - - backup_name = new_backup_name() - backup_destination = ( - f"S3('http://minio1:9001/root/backups/{backup_name}', 'minio', 'minio123')" - ) - node.query(f"BACKUP TABLE encrypted_test TO {backup_destination}") - - node.query(f"DROP TABLE encrypted_test SYNC") - - node.query( - f""" - CREATE TABLE encrypted_test ( - id Int64, - data String - ) ENGINE=MergeTree() - ORDER BY id - SETTINGS storage_policy='local_policy' - """ - ) - - expected_error = "can be restored only to an encrypted disk" - assert expected_error in node.query_and_get_error( - 
f"RESTORE TABLE encrypted_test FROM {backup_destination} SETTINGS allow_different_table_def=1" - ) ->>>>>>> 9c08fb30995 (Add tests.) + if expect_error: + assert expect_error in node.query_and_get_error(restore_command) + else: + node.query(restore_command) + assert node.query(select_query) == "(0,'data'),(1,'data')" From 2ec94a42b728a7daa6ab0c2c72fab1119e5f2a1d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 5 May 2023 10:40:12 +0200 Subject: [PATCH 085/127] Remove default parameters from virtual functions. --- src/Disks/DiskEncrypted.h | 7 ++----- src/Disks/IDisk.cpp | 2 +- src/Disks/IDisk.h | 13 +++---------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index c494dd6a216..530d9b2dc02 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -198,13 +198,10 @@ public: delegate->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function)); } - std::unique_ptr readEncryptedFile( - const String & path, const ReadSettings & settings, - std::optional read_hint, - std::optional file_size) const override + std::unique_ptr readEncryptedFile(const String & path, const ReadSettings & settings) const override { auto wrapped_path = wrappedPath(path); - return delegate->readFile(wrapped_path, settings, read_hint, file_size); + return delegate->readFile(wrapped_path, settings); } std::unique_ptr writeEncryptedFile( diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 9a5ae997b46..88dd65bfde7 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -52,7 +52,7 @@ void IDisk::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_ba } } -std::unique_ptr IDisk::readEncryptedFile(const String &, const ReadSettings &, std::optional, std::optional) const +std::unique_ptr IDisk::readEncryptedFile(const String &, const ReadSettings &) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "File encryption is not implemented for disk of type {}", getDataSourceDescription().type); } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 4d74fe8bbab..e3ac790d2b7 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -253,18 +253,11 @@ public: virtual void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) = 0; /// Reads a file from an encrypted disk without decrypting it (only for encrypted disks). - virtual std::unique_ptr readEncryptedFile( /// NOLINT - const String & path, - const ReadSettings & settings = ReadSettings{}, - std::optional read_hint = {}, - std::optional file_size = {}) const; + virtual std::unique_ptr readEncryptedFile(const String & path, const ReadSettings & settings) const; /// Writes an already encrypted file to the disk (only for encrypted disks). - virtual std::unique_ptr writeEncryptedFile( /// NOLINT - const String & path, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - WriteMode mode = WriteMode::Rewrite, - const WriteSettings & settings = {}) const; + virtual std::unique_ptr writeEncryptedFile( + const String & path, size_t buf_size, WriteMode mode, const WriteSettings & settings) const; /// Returns the size of an encrypted file (only for encrypted disks). virtual size_t getEncryptedFileSize(const String & path) const; From b068f0b619ceb83023ba116c66a3fcafec7e8305 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 5 May 2023 11:40:33 +0200 Subject: [PATCH 086/127] Fix build. 
--- src/Backups/BackupEntryFromSmallFile.cpp | 2 +- src/Backups/BackupIO_S3.cpp | 10 +++++----- src/Disks/IDisk.h | 4 ++-- src/Disks/ObjectStorages/DiskObjectStorage.cpp | 4 ++-- .../ObjectStorages/DiskObjectStorageTransaction.cpp | 6 ++++-- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/Backups/BackupEntryFromSmallFile.cpp b/src/Backups/BackupEntryFromSmallFile.cpp index 22487767689..d0a99056b59 100644 --- a/src/Backups/BackupEntryFromSmallFile.cpp +++ b/src/Backups/BackupEntryFromSmallFile.cpp @@ -21,7 +21,7 @@ namespace String readFile(const DiskPtr & disk, const String & file_path, bool copy_encrypted) { - auto buf = copy_encrypted ? disk->readEncryptedFile(file_path) : disk->readFile(file_path); + auto buf = copy_encrypted ? disk->readEncryptedFile(file_path, {}) : disk->readFile(file_path); String s; readStringUntilEOF(s, *buf); return s; diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 69f56078f9d..40ecde71173 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -157,8 +157,8 @@ void BackupReaderS3::copyFileToDisk(const String & path_in_backup, size_t file_s fs::path(s3_uri.key) / path_in_backup, 0, file_size, - /* dest_bucket= */ blob_path[0], - /* dest_key= */ blob_path[1], + /* dest_bucket= */ blob_path[1], + /* dest_key= */ blob_path[0], request_settings, object_attributes, threadPoolCallbackRunner(BackupsIOThreadPool::get(), "BackupReaderS3"), @@ -196,7 +196,7 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src auto source_data_source_description = src_disk->getDataSourceDescription(); if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) { - /// getBlobPath() can return more than 2 elements if the file is stored as multiple objects in S3 bucket. + /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in S3 bucket. /// In this case we can't use the native copy. if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { @@ -204,8 +204,8 @@ void BackupWriterS3::copyFileFromDisk(const String & path_in_backup, DiskPtr src LOG_TRACE(log, "Copying file {} from disk {} to S3 using native copy", src_path, src_disk->getName()); copyS3File( client, - /* src_bucket */ blob_path[0], - /* src_key= */ blob_path[1], + /* src_bucket */ blob_path[1], + /* src_key= */ blob_path[0], start_pos, length, s3_uri.bucket, diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index e3ac790d2b7..6bbd7c26bec 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -240,8 +240,8 @@ public: /// Returns the path to a blob representing a specified file. /// The meaning of the returned path depends on disk's type. - /// E.g. for DiskLocal it the absolute path to the file and for DiskObjectStorage it's the name of the objects' namespace - /// combined with StoredObject::absolute_path for each stored object representing a specified file. + /// E.g. for DiskLocal it's the absolute path to the file and for DiskObjectStorage it's + /// StoredObject::remote_path for each stored object combined with the name of the objects' namespace. 
virtual Strings getBlobPath(const String & path) const = 0; using WriteBlobFunction = std::function & object_attributes)>; diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index bfa1ed1ab26..129f1ab1ef7 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -584,11 +584,11 @@ Strings DiskObjectStorage::getBlobPath(const String & path) const auto objects = getStorageObjects(path); Strings res; res.reserve(objects.size() + 1); + for (const auto & object : objects) + res.emplace_back(object.remote_path); String objects_namespace = object_storage->getObjectsNamespace(); if (!objects_namespace.empty()) res.emplace_back(objects_namespace); - for (const auto & object : objects) - res.emplace_back(object.absolute_path); return res; } diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index f98ac55889b..2c22df64d90 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -694,8 +694,10 @@ void DiskObjectStorageTransaction::writeFileUsingBlobWritingFunction( /// See DiskObjectStorage::getBlobPath(). Strings blob_path; blob_path.reserve(2); - blob_path.emplace_back(object_storage.getObjectsNamespace()); - blob_path.emplace_back(object.absolute_path); + blob_path.emplace_back(object.remote_path); + String objects_namespace = object_storage.getObjectsNamespace(); + if (!objects_namespace.empty()) + blob_path.emplace_back(objects_namespace); /// We always use mode Rewrite because we simulate append using metadata and different files size_t object_size = std::move(write_blob_function)(blob_path, WriteMode::Rewrite, object_attributes); From 50a536bba80fac997c856939573532c2adfe38ed Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 May 2023 15:26:24 +0200 Subject: [PATCH 087/127] Remove unused code --- .../ReplicatedMergeTreePartCheckThread.h | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h index e7b0e224d9b..b86191dbf50 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h @@ -36,30 +36,6 @@ public: void start(); void stop(); - /// Don't create more than one instance of this object simultaneously. - struct TemporarilyStop : private boost::noncopyable - { - ReplicatedMergeTreePartCheckThread * parent; - - explicit TemporarilyStop(ReplicatedMergeTreePartCheckThread * parent_) : parent(parent_) - { - parent->stop(); - } - - TemporarilyStop(TemporarilyStop && old) noexcept : parent(old.parent) - { - old.parent = nullptr; - } - - ~TemporarilyStop() - { - if (parent) - parent->start(); - } - }; - - TemporarilyStop temporarilyStop() { return TemporarilyStop(this); } - /// Add a part (for which there are suspicions that it is missing, damaged or not needed) in the queue for check. /// delay_to_check_seconds - check no sooner than the specified number of seconds. 
void enqueuePart(const String & name, time_t delay_to_check_seconds = 0); From a7bb8f412fdc9e293cd07ffaa71aa29d0ee48a82 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 12 May 2023 11:51:39 +0000 Subject: [PATCH 088/127] Allow ASOF JOIN over nullable right column --- src/Interpreters/HashJoin.cpp | 37 +++++++- src/Interpreters/TableJoin.cpp | 4 - tests/broken_tests.txt | 2 - .../01428_nullable_asof_join.reference | 28 ++++++ .../0_stateless/01428_nullable_asof_join.sql | 38 ++++++-- .../02735_asof_join_right_null.reference | 95 +++++++++++++++++++ .../02735_asof_join_right_null.sql | 32 +++++++ 7 files changed, 217 insertions(+), 19 deletions(-) create mode 100644 tests/queries/0_stateless/02735_asof_join_right_null.reference create mode 100644 tests/queries/0_stateless/02735_asof_join_right_null.sql diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index c9843dca825..c58120c3da9 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -710,15 +710,46 @@ Block HashJoin::prepareRightBlock(const Block & block) const return prepareRightBlock(block, savedBlockSample()); } -bool HashJoin::addJoinedBlock(const Block & source_block, bool check_limits) +bool HashJoin::addJoinedBlock(const Block & source_block_, bool check_limits) { if (!data) throw Exception(ErrorCodes::LOGICAL_ERROR, "Join data was released"); /// RowRef::SizeT is uint32_t (not size_t) for hash table Cell memory efficiency. /// It's possible to split bigger blocks and insert them by parts here. But it would be a dead code. - if (unlikely(source_block.rows() > std::numeric_limits::max())) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Too many rows in right table block for HashJoin: {}", source_block.rows()); + if (unlikely(source_block_.rows() > std::numeric_limits::max())) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Too many rows in right table block for HashJoin: {}", source_block_.rows()); + + Block source_block = source_block_; + if (strictness == JoinStrictness::Asof) + { + chassert(kind == JoinKind::Left || kind == JoinKind::Inner); + + // Filter out rows with NULLs in asof key + const auto & asof_key_name = table_join->getOnlyClause().key_names_right.back(); + auto & asof_column = source_block.getByName(asof_key_name); + + if (asof_column.type->isNullable()) + { + /// filter rows with nulls in asof key + if (const auto * asof_const_column = typeid_cast(asof_column.column.get())) + { + if (asof_const_column->isNullAt(0)) + return false; + } + else + { + const auto & asof_column_nullable = assert_cast(*asof_column.column).getNullMapData(); + + NullMap negative_null_map(asof_column_nullable.size()); + for (size_t i = 0; i < asof_column_nullable.size(); ++i) + negative_null_map[i] = !asof_column_nullable[i]; + + for (auto & column : source_block) + column.column = column.column->filter(negative_null_map, -1); + } + } + } size_t rows = source_block.rows(); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 2d882083f3d..5a23fbd00ff 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -492,10 +492,6 @@ void TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const Rig { if (clauses.size() != 1) throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "ASOF join over multiple keys is not supported"); - - auto asof_key_type = right_types.find(clauses.back().key_names_right.back()); - if (asof_key_type != right_types.end() && asof_key_type->second->isNullable()) - throw 
DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "ASOF join over right table Nullable column is not implemented"); } forAllKeys(clauses, [&](const auto & left_key_name, const auto & right_key_name) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 0b4efacba0b..7db123bf467 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -37,8 +37,6 @@ 01268_shard_avgweighted 01270_optimize_skip_unused_shards_low_cardinality 01319_optimize_skip_unused_shards_nesting -01353_low_cardinality_join_types -01428_nullable_asof_join 01455_shard_leaf_max_rows_bytes_to_read 01476_right_full_join_switch 01477_lc_in_merge_join_left_key diff --git a/tests/queries/0_stateless/01428_nullable_asof_join.reference b/tests/queries/0_stateless/01428_nullable_asof_join.reference index f04655fefaa..73825dce725 100644 --- a/tests/queries/0_stateless/01428_nullable_asof_join.reference +++ b/tests/queries/0_stateless/01428_nullable_asof_join.reference @@ -5,6 +5,15 @@ left asof using 0 \N 0 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) 1 \N 1 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) 1 1 2 2 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +0 \N 0 \N UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) +1 1 1 0 UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) +1 1 2 0 UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) +0 \N 0 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +1 1 1 0 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +0 \N 0 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +1 1 1 0 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) left asof on 0 \N 0 \N UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) 1 \N 1 \N UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) @@ -12,9 +21,28 @@ left asof on 0 \N 0 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) 1 \N 1 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) 1 1 2 2 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +0 \N 0 \N UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) +1 1 1 0 UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) +1 1 2 0 UInt8 Nullable(UInt8) UInt8 Nullable(UInt8) +0 \N 0 \N UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +1 1 1 0 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 Nullable(UInt8) Nullable(UInt8) Nullable(UInt8) asof using 1 1 2 2 UInt8 UInt8 UInt8 UInt8 1 1 2 2 UInt8 UInt8 Nullable(UInt8) UInt8 +1 1 2 2 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 1 0 UInt8 UInt8 UInt8 Nullable(UInt8) +1 1 2 0 UInt8 UInt8 UInt8 Nullable(UInt8) +1 1 1 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 1 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) asof on 1 1 2 2 UInt8 UInt8 UInt8 UInt8 1 1 2 2 UInt8 UInt8 Nullable(UInt8) UInt8 +1 1 1 0 UInt8 UInt8 UInt8 Nullable(UInt8) +1 1 2 0 UInt8 UInt8 UInt8 Nullable(UInt8) +1 1 1 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 1 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) +1 1 2 0 UInt8 UInt8 Nullable(UInt8) Nullable(UInt8) diff --git a/tests/queries/0_stateless/01428_nullable_asof_join.sql b/tests/queries/0_stateless/01428_nullable_asof_join.sql index e1b00158d68..f07a26edd97 100644 --- a/tests/queries/0_stateless/01428_nullable_asof_join.sql +++ b/tests/queries/0_stateless/01428_nullable_asof_join.sql @@ -18,13 +18,19 @@ SELECT a.pk, b.pk, 
a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(ma FROM (SELECT toUInt8(number) > 0 as pk, toUInt8(number) as dt FROM numbers(3)) a ASOF LEFT JOIN (SELECT 1 as pk, toNullable(0) as dt) b USING(pk, dt) -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt SETTINGS allow_experimental_analyzer = 0; + +SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) +FROM (SELECT toUInt8(number) > 0 as pk, toUInt8(number) as dt FROM numbers(3)) a +ASOF LEFT JOIN (SELECT 1 as pk, toNullable(0) as dt) b +USING(pk, dt) +ORDER BY a.dt SETTINGS allow_experimental_analyzer = 1; SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a ASOF LEFT JOIN (SELECT 1 as pk, toNullable(0) as dt) b USING(pk, dt) -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; select 'left asof on'; @@ -44,13 +50,13 @@ SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(ma FROM (SELECT toUInt8(number) > 0 as pk, toUInt8(number) as dt FROM numbers(3)) a ASOF LEFT JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.pk = b.pk AND a.dt >= b.dt -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a ASOF LEFT JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.dt >= b.dt AND a.pk = b.pk -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; select 'asof using'; @@ -64,19 +70,31 @@ SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(ma FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a ASOF JOIN (SELECT 1 as pk, 2 as dt) b USING(pk, dt) -ORDER BY a.dt; +ORDER BY a.dt SETTINGS allow_experimental_analyzer = 0; + +SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) +FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a +ASOF JOIN (SELECT 1 as pk, 2 as dt) b +USING(pk, dt) +ORDER BY a.dt SETTINGS allow_experimental_analyzer = 1; SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) FROM (SELECT toUInt8(number) > 0 as pk, toUInt8(number) as dt FROM numbers(3)) a ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b USING(pk, dt) -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt SETTINGS allow_experimental_analyzer = 0; + +SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) +FROM (SELECT toUInt8(number) > 0 as pk, toUInt8(number) as dt FROM numbers(3)) a +ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b +USING(pk, dt) +ORDER BY a.dt SETTINGS allow_experimental_analyzer = 1; SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b USING(pk, dt) -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; select 'asof on'; @@ -96,19 +114,19 @@ SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(ma FROM (SELECT 
toUInt8(number) > 0 as pk, toUInt8(number) as dt FROM numbers(3)) a ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.pk = b.pk AND a.dt >= b.dt -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.pk = b.pk AND a.dt >= b.dt -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; SELECT a.pk, b.pk, a.dt, b.dt, toTypeName(a.pk), toTypeName(b.pk), toTypeName(materialize(a.dt)), toTypeName(materialize(b.dt)) FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM numbers(3)) a ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.dt >= b.dt AND a.pk = b.pk -ORDER BY a.dt; -- { serverError 48 } +ORDER BY a.dt; SELECT * FROM (SELECT NULL AS y, 1 AS x, '2020-01-01 10:10:10' :: DateTime64 AS t) AS t1 diff --git a/tests/queries/0_stateless/02735_asof_join_right_null.reference b/tests/queries/0_stateless/02735_asof_join_right_null.reference new file mode 100644 index 00000000000..d4332556cb5 --- /dev/null +++ b/tests/queries/0_stateless/02735_asof_join_right_null.reference @@ -0,0 +1,95 @@ +-- { echoOn } +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 2 +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 1 +1 2 1 2 +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +1 2 1 1 +1 3 1 2 +1 4 1 2 +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; +1 1 1 1 +1 2 1 2 +1 3 1 2 +1 4 1 2 +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 2 +1 2 0 \N +1 3 0 \N +1 4 0 \N +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 1 +1 2 1 2 +1 3 0 \N +1 4 0 \N +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +1 -1 0 \N +1 0 0 \N +1 1 0 \N +1 2 1 1 +1 3 1 2 +1 4 1 2 +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; +1 -1 0 \N +1 0 0 \N +1 1 1 1 +1 2 1 2 +1 3 1 2 +1 4 1 2 +SET join_use_nulls = 1; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 2 +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 1 +1 2 1 2 +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +1 2 1 1 +1 3 1 2 +1 4 1 2 +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; +1 1 1 1 +1 2 1 2 +1 3 1 2 +1 4 1 2 +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 2 +1 2 \N \N +1 3 \N \N +1 4 \N \N +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +1 -1 1 1 +1 0 1 1 +1 1 1 1 +1 2 1 2 +1 3 \N \N +1 4 \N \N +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +1 -1 \N \N +1 0 \N \N +1 1 \N \N +1 2 1 1 +1 3 1 2 +1 4 1 2 +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; +1 -1 \N \N +1 0 \N \N +1 1 1 1 +1 2 1 2 +1 3 1 2 +1 4 1 2 +DROP TABLE t1; diff --git a/tests/queries/0_stateless/02735_asof_join_right_null.sql b/tests/queries/0_stateless/02735_asof_join_right_null.sql new file mode 100644 index 00000000000..997d33a0570 --- 
/dev/null +++ b/tests/queries/0_stateless/02735_asof_join_right_null.sql @@ -0,0 +1,32 @@ + +CREATE TABLE t1 (a Int, b Int) ENGINE = Memory; +INSERT INTO t1 VALUES (1, -1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4); + +CREATE TABLE t2 (a Int, b Nullable(Int)) ENGINE = Memory; +INSERT INTO t2 VALUES (1, 1), (1, NULL), (1, 2); + +-- { echoOn } +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; + +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; + +SET join_use_nulls = 1; + +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; + +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b < t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b <= t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ORDER BY t1.b; +SELECT * FROM t1 ASOF LEFT JOIN t2 ON t1.a = t2.a AND t1.b >= t2.b ORDER BY t1.b; + +DROP TABLE t1; + From ca005ecea14635126f0d2d4d835fcaeaab25081c Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 15 May 2023 16:57:27 +0000 Subject: [PATCH 089/127] Update comment about filtering nulls in asof join --- src/Interpreters/HashJoin.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index c58120c3da9..0af33a8bd20 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -725,7 +725,9 @@ bool HashJoin::addJoinedBlock(const Block & source_block_, bool check_limits) { chassert(kind == JoinKind::Left || kind == JoinKind::Inner); - // Filter out rows with NULLs in asof key + /// Filter out rows with NULLs in ASOF key, nulls are not joined with anything since they are not comparable + /// We support only INNER/LEFT ASOF join, so rows with NULLs never return from the right joined table. + /// So filter them out here not to handle in implementation. const auto & asof_key_name = table_join->getOnlyClause().key_names_right.back(); auto & asof_column = source_block.getByName(asof_key_name); From 1f55c320b42267efd2dc94ef024b03bba5b1ba0b Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 15 May 2023 21:41:45 +0200 Subject: [PATCH 090/127] Fix style --- src/Interpreters/HashJoin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 79fac60d8bc..50eda4482bd 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -155,7 +155,7 @@ public: /** Add block of data from right hand of JOIN to the map. * Returns false, if some limit was exceeded and you should not insert more data. 
*/ - bool addJoinedBlock(const Block & block, bool check_limits) override; + bool addJoinedBlock(const Block & source_block_, bool check_limits) override; void checkTypesOfKeys(const Block & block) const override; From 1308c29747be863b951dbe17bb7cc2e4f3e0aaa8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 16 May 2023 11:42:19 +0200 Subject: [PATCH 091/127] Update tests/broken_tests.txt --- tests/broken_tests.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 7db123bf467..d408b45f188 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -37,6 +37,7 @@ 01268_shard_avgweighted 01270_optimize_skip_unused_shards_low_cardinality 01319_optimize_skip_unused_shards_nesting +01353_low_cardinality_join_types 01455_shard_leaf_max_rows_bytes_to_read 01476_right_full_join_switch 01477_lc_in_merge_join_left_key From 77adb7c8bc4c680503d2937a8448f097fef1af3d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 16 May 2023 17:00:05 +0200 Subject: [PATCH 092/127] Update 02535_analyzer_group_by_use_nulls reference --- .../02535_analyzer_group_by_use_nulls.reference | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/queries/0_stateless/02535_analyzer_group_by_use_nulls.reference b/tests/queries/0_stateless/02535_analyzer_group_by_use_nulls.reference index 50755627996..63610604ddd 100644 --- a/tests/queries/0_stateless/02535_analyzer_group_by_use_nulls.reference +++ b/tests/queries/0_stateless/02535_analyzer_group_by_use_nulls.reference @@ -5,15 +5,25 @@ GROUP BY ROLLUP(number, number % 2) ORDER BY (number, number % 2, val) SETTINGS group_by_use_nulls=1; 0 0 0 +0 \N 0 1 1 1 +1 \N 1 2 0 2 +2 \N 2 3 1 3 +3 \N 3 4 0 4 +4 \N 4 5 1 5 +5 \N 5 6 0 6 +6 \N 6 7 1 7 +7 \N 7 8 0 8 +8 \N 8 9 1 9 +9 \N 9 \N \N 45 set optimize_group_by_function_keys = 0; SELECT number, number % 2, sum(number) AS val From 4bc5a76fa787e67be8b46fa6dba23f014d783ee8 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 16 May 2023 17:20:06 +0200 Subject: [PATCH 093/127] Add Compose request for GCS (#49693) * Add compose request * Check if outcome is successful --------- Co-authored-by: Nikita Mikhaylov --- src/IO/S3/Client.cpp | 57 +++++++++++++++++++++- src/IO/S3/Client.h | 3 ++ src/IO/S3/Requests.cpp | 101 +++++++++++++++++++++++++++++++++++++++ src/IO/S3/Requests.h | 36 ++++++++++++++ src/IO/S3/copyS3File.cpp | 46 ++++++------------ 5 files changed, 211 insertions(+), 32 deletions(-) diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 3c39893b44e..93ef30a927e 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -2,6 +2,7 @@ #if USE_AWS_S3 +#include #include #include #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include @@ -349,8 +351,32 @@ Model::CreateMultipartUploadOutcome Client::CreateMultipartUpload(const CreateMu Model::CompleteMultipartUploadOutcome Client::CompleteMultipartUpload(const CompleteMultipartUploadRequest & request) const { - return doRequest( + auto outcome = doRequest( request, [this](const Model::CompleteMultipartUploadRequest & req) { return CompleteMultipartUpload(req); }); + + if (!outcome.IsSuccess() || provider_type != ProviderType::GCS) + return outcome; + + const auto & key = request.GetKey(); + const auto & bucket = request.GetBucket(); + + /// For GCS we will try to compose object at the end, otherwise we cannot do a native copy + /// for the object (e.g. 
for backups) + /// We don't care if the compose fails, because the upload was still successful, only the + /// performance for copying the object will be affected + S3::ComposeObjectRequest compose_req; + compose_req.SetBucket(bucket); + compose_req.SetKey(key); + compose_req.SetComponentNames({key}); + compose_req.SetContentType("binary/octet-stream"); + auto compose_outcome = ComposeObject(compose_req); + + if (compose_outcome.IsSuccess()) + LOG_TRACE(log, "Composing object was successful"); + else + LOG_INFO(log, "Failed to compose object. Message: {}, Key: {}, Bucket: {}", compose_outcome.GetError().GetMessage(), key, bucket); + + return outcome; } Model::CopyObjectOutcome Client::CopyObject(const CopyObjectRequest & request) const @@ -383,6 +409,35 @@ Model::DeleteObjectsOutcome Client::DeleteObjects(const DeleteObjectsRequest & r return doRequest(request, [this](const Model::DeleteObjectsRequest & req) { return DeleteObjects(req); }); } +Client::ComposeObjectOutcome Client::ComposeObject(const ComposeObjectRequest & request) const +{ + auto request_fn = [this](const ComposeObjectRequest & req) + { + auto & endpoint_provider = const_cast(*this).accessEndpointProvider(); + AWS_OPERATION_CHECK_PTR(endpoint_provider, ComposeObject, Aws::Client::CoreErrors, Aws::Client::CoreErrors::ENDPOINT_RESOLUTION_FAILURE); + + if (!req.BucketHasBeenSet()) + { + AWS_LOGSTREAM_ERROR("ComposeObject", "Required field: Bucket, is not set") + return ComposeObjectOutcome(Aws::Client::AWSError(Aws::S3::S3Errors::MISSING_PARAMETER, "MISSING_PARAMETER", "Missing required field [Bucket]", false)); + } + + if (!req.KeyHasBeenSet()) + { + AWS_LOGSTREAM_ERROR("ComposeObject", "Required field: Key, is not set") + return ComposeObjectOutcome(Aws::Client::AWSError(Aws::S3::S3Errors::MISSING_PARAMETER, "MISSING_PARAMETER", "Missing required field [Key]", false)); + } + + auto endpointResolutionOutcome = endpoint_provider->ResolveEndpoint(req.GetEndpointContextParams()); + AWS_OPERATION_CHECK_SUCCESS(endpointResolutionOutcome, ComposeObject, Aws::Client::CoreErrors, Aws::Client::CoreErrors::ENDPOINT_RESOLUTION_FAILURE, endpointResolutionOutcome.GetError().GetMessage()); + endpointResolutionOutcome.GetResult().AddPathSegments(req.GetKey()); + endpointResolutionOutcome.GetResult().SetQueryString("?compose"); + return ComposeObjectOutcome(MakeRequest(req, endpointResolutionOutcome.GetResult(), Aws::Http::HttpMethod::HTTP_PUT)); + }; + + return doRequest(request, request_fn); +} + template std::invoke_result_t Client::doRequest(const RequestType & request, RequestFn request_fn) const diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 330c85c418a..36edb443681 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -192,6 +192,9 @@ public: Model::DeleteObjectOutcome DeleteObject(const DeleteObjectRequest & request) const; Model::DeleteObjectsOutcome DeleteObjects(const DeleteObjectsRequest & request) const; + using ComposeObjectOutcome = Aws::Utils::Outcome; + ComposeObjectOutcome ComposeObject(const ComposeObjectRequest & request) const; + using Aws::S3::S3Client::EnableRequestProcessing; using Aws::S3::S3Client::DisableRequestProcessing; diff --git a/src/IO/S3/Requests.cpp b/src/IO/S3/Requests.cpp index 1972b39d2af..56d2e44a2c4 100644 --- a/src/IO/S3/Requests.cpp +++ b/src/IO/S3/Requests.cpp @@ -3,6 +3,8 @@ #if USE_AWS_S3 #include +#include +#include namespace DB::S3 { @@ -50,6 +52,105 @@ Aws::Http::HeaderValueCollection CopyObjectRequest::GetRequestSpecificHeaders() return headers; } +Aws::String 
ComposeObjectRequest::SerializePayload() const +{ + if (component_names.empty()) + return {}; + + Aws::Utils::Xml::XmlDocument payload_doc = Aws::Utils::Xml::XmlDocument::CreateWithRootNode("ComposeRequest"); + auto root_node = payload_doc.GetRootElement(); + + for (const auto & name : component_names) + { + auto component_node = root_node.CreateChildElement("Component"); + auto name_node = component_node.CreateChildElement("Name"); + name_node.SetText(name); + } + + return payload_doc.ConvertToString(); +} + +void ComposeObjectRequest::AddQueryStringParameters(Aws::Http::URI & /*uri*/) const +{ +} + +Aws::Http::HeaderValueCollection ComposeObjectRequest::GetRequestSpecificHeaders() const +{ + if (content_type.empty()) + return {}; + + return {Aws::Http::HeaderValuePair(Aws::Http::CONTENT_TYPE_HEADER, content_type)}; +} + +Aws::Endpoint::EndpointParameters ComposeObjectRequest::GetEndpointContextParams() const +{ + EndpointParameters parameters; + if (BucketHasBeenSet()) + parameters.emplace_back("Bucket", GetBucket(), Aws::Endpoint::EndpointParameter::ParameterOrigin::OPERATION_CONTEXT); + + return parameters; +} + +const Aws::String & ComposeObjectRequest::GetBucket() const +{ + return bucket; +} + +bool ComposeObjectRequest::BucketHasBeenSet() const +{ + return !bucket.empty(); +} + +void ComposeObjectRequest::SetBucket(const Aws::String & value) +{ + bucket = value; +} + +void ComposeObjectRequest::SetBucket(Aws::String && value) +{ + bucket = std::move(value); +} + +void ComposeObjectRequest::SetBucket(const char * value) +{ + bucket.assign(value); +} + +const Aws::String & ComposeObjectRequest::GetKey() const +{ + return key; +} + +bool ComposeObjectRequest::KeyHasBeenSet() const +{ + return !key.empty(); +} + +void ComposeObjectRequest::SetKey(const Aws::String & value) +{ + key = value; +} + +void ComposeObjectRequest::SetKey(Aws::String && value) +{ + key = std::move(value); +} + +void ComposeObjectRequest::SetKey(const char * value) +{ + key.assign(value); +} + +void ComposeObjectRequest::SetComponentNames(std::vector component_names_) +{ + component_names = std::move(component_names_); +} + +void ComposeObjectRequest::SetContentType(Aws::String value) +{ + content_type = std::move(value); +} + } #endif diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h index 7b6e911c104..560ba9b2775 100644 --- a/src/IO/S3/Requests.h +++ b/src/IO/S3/Requests.h @@ -94,6 +94,42 @@ using PutObjectRequest = ExtendedRequest; using DeleteObjectRequest = ExtendedRequest; using DeleteObjectsRequest = ExtendedRequest; + +class ComposeObjectRequest : public ExtendedRequest +{ +public: + inline const char * GetServiceRequestName() const override { return "ComposeObject"; } + + AWS_S3_API Aws::String SerializePayload() const override; + + AWS_S3_API void AddQueryStringParameters(Aws::Http::URI & uri) const override; + + AWS_S3_API Aws::Http::HeaderValueCollection GetRequestSpecificHeaders() const override; + + AWS_S3_API EndpointParameters GetEndpointContextParams() const override; + + const Aws::String & GetBucket() const; + bool BucketHasBeenSet() const; + void SetBucket(const Aws::String & value); + void SetBucket(Aws::String && value); + void SetBucket(const char* value); + + const Aws::String & GetKey() const; + bool KeyHasBeenSet() const; + void SetKey(const Aws::String & value); + void SetKey(Aws::String && value); + void SetKey(const char * value); + + void SetComponentNames(std::vector component_names_); + + void SetContentType(Aws::String value); +private: + Aws::String bucket; + Aws::String 
key; + std::vector component_names; + Aws::String content_type; +}; + } #endif diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 3d99a584933..3a2fd513392 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -601,29 +601,10 @@ namespace void performCopy() { - if (size <= upload_settings.max_single_operation_copy_size) - { + if (!supports_multipart_copy || size <= upload_settings.max_single_operation_copy_size) performSingleOperationCopy(); - } - else if (!supports_multipart_copy) - { - LOG_INFO(&Poco::Logger::get("copyS3File"), "Multipart upload using copy is not supported, will use regular upload"); - copyDataToS3File( - getSourceObjectReadBuffer(), - offset, - size, - client_ptr, - dest_bucket, - dest_key, - request_settings, - object_metadata, - schedule, - for_disk_s3); - } else - { performMultipartUploadCopy(); - } if (request_settings.check_objects_after_upload) checkObjectAfterUpload(); @@ -696,19 +677,12 @@ namespace if (outcome.GetError().GetExceptionName() == "EntityTooLarge" || outcome.GetError().GetExceptionName() == "InvalidRequest" || outcome.GetError().GetExceptionName() == "InvalidArgument") { - // Can't come here with MinIO, MinIO allows single part upload for large objects. - LOG_INFO( - log, - "Single operation copy failed with error {} for Bucket: {}, Key: {}, Object size: {}, will retry with multipart " - "upload copy", - outcome.GetError().GetExceptionName(), - dest_bucket, - dest_key, - size); - if (!supports_multipart_copy) { - LOG_INFO(log, "Multipart upload using copy is not supported, will try regular upload"); + LOG_INFO(log, "Multipart upload using copy is not supported, will try regular upload for Bucket: {}, Key: {}, Object size: {}", + dest_bucket, + dest_key, + size); copyDataToS3File( getSourceObjectReadBuffer(), offset, @@ -724,6 +698,16 @@ namespace } else { + // Can't come here with MinIO, MinIO allows single part upload for large objects. 
+ LOG_INFO( + log, + "Single operation copy failed with error {} for Bucket: {}, Key: {}, Object size: {}, will retry with multipart " + "upload copy", + outcome.GetError().GetExceptionName(), + dest_bucket, + dest_key, + size); + performMultipartUploadCopy(); break; } From 353791b2dd9e7f309f2a4b300c701cd32c1bf9df Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 16 May 2023 17:26:22 +0200 Subject: [PATCH 094/127] Try to fix flaky test_distributed_load_balancing tests (#49912) * Try to fix flaky test_distributed_load_balancing tests * Automatic style fix --------- Co-authored-by: robot-clickhouse Co-authored-by: Alexander Tokmakov --- .../test_distributed_load_balancing/test.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_distributed_load_balancing/test.py b/tests/integration/test_distributed_load_balancing/test.py index 90771c027dc..1dba6a30bc4 100644 --- a/tests/integration/test_distributed_load_balancing/test.py +++ b/tests/integration/test_distributed_load_balancing/test.py @@ -9,9 +9,21 @@ from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) -n1 = cluster.add_instance("n1", main_configs=["configs/remote_servers.xml"]) -n2 = cluster.add_instance("n2", main_configs=["configs/remote_servers.xml"]) -n3 = cluster.add_instance("n3", main_configs=["configs/remote_servers.xml"]) +n1 = cluster.add_instance( + "n1", + main_configs=["configs/remote_servers.xml"], + user_configs=["configs/users.xml"], +) +n2 = cluster.add_instance( + "n2", + main_configs=["configs/remote_servers.xml"], + user_configs=["configs/users.xml"], +) +n3 = cluster.add_instance( + "n3", + main_configs=["configs/remote_servers.xml"], + user_configs=["configs/users.xml"], +) nodes = len(cluster.instances) queries = nodes * 10 From 724949927b0da0d1150f271f51e0fe11e214e472 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 16 May 2023 17:36:48 +0200 Subject: [PATCH 095/127] Add logging --- src/Interpreters/Cache/FileCache.cpp | 15 ++++++++++++--- src/Interpreters/Cache/LRUFileCachePriority.cpp | 7 ++++++- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 2d5744b630e..9ab7943e263 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -573,8 +573,6 @@ bool FileCache::tryReserve(FileSegment & file_segment, size_t size) else queue_size += 1; - size_t removed_size = 0; - class EvictionCandidates final : public std::vector { public: @@ -600,6 +598,7 @@ bool FileCache::tryReserve(FileSegment & file_segment, size_t size) std::unordered_map to_delete; + size_t removed_size = 0; auto iterate_func = [&](LockedKey & locked_key, FileSegmentMetadataPtr segment_metadata) { chassert(segment_metadata->file_segment->assertCorrectness()); @@ -655,8 +654,18 @@ bool FileCache::tryReserve(FileSegment & file_segment, size_t size) { /// max_size == 0 means unlimited cache size, /// max_element_size means unlimited number of cache elements. 
- return (main_priority->getSizeLimit() != 0 && main_priority->getSize(cache_lock) + size - removed_size > main_priority->getSizeLimit()) + const bool is_overflow = (main_priority->getSizeLimit() != 0 + && main_priority->getSize(cache_lock) + size - removed_size > main_priority->getSizeLimit()) || (main_priority->getElementsLimit() != 0 && queue_size > main_priority->getElementsLimit()); + + LOG_TEST( + log, "Overflow: {}, size: {}, ready to remove: {}, current cache size: {}/{}, elements: {}/{}, while reserving for {}:{}", + is_overflow, size, removed_size, + main_priority->getSize(cache_lock), main_priority->getSizeLimit(), + main_priority->getElementsCount(cache_lock), main_priority->getElementsLimit(), + file_segment.key(), file_segment.offset()); + + return is_overflow; }; main_priority->iterate( diff --git a/src/Interpreters/Cache/LRUFileCachePriority.cpp b/src/Interpreters/Cache/LRUFileCachePriority.cpp index 5e05acd9680..3c36962a0e5 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/LRUFileCachePriority.cpp @@ -44,7 +44,7 @@ IFileCachePriority::Iterator LRUFileCachePriority::add( throw Exception( ErrorCodes::LOGICAL_ERROR, "Not enough space to add {}:{} with size {}: current size: {}/{}", - key, offset, size, current_size, getSizeLimit()); + key, offset, size, current_size, size_limit); } auto iter = queue.insert(queue.end(), Entry(key, offset, size, key_metadata)); @@ -161,6 +161,11 @@ void LRUFileCachePriority::LRUFileCacheIterator::annul() void LRUFileCachePriority::LRUFileCacheIterator::updateSize(int64_t size) { + LOG_TEST( + cache_priority->log, + "Update size with {} in LRU queue for key: {}, offset: {}, previous size: {}", + size, queue_iter->key, queue_iter->offset, queue_iter->size); + cache_priority->current_size += size; queue_iter->size += size; From 0da82945ac2df6c79d7a47ae628346b3d1fda6e0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 16 May 2023 18:18:48 +0200 Subject: [PATCH 096/127] fix --- src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp | 3 ++- .../0_stateless/02440_mutations_finalization.reference | 2 +- .../queries/0_stateless/02440_mutations_finalization.sql | 5 +++-- .../0_stateless/02441_alter_delete_and_drop_column.sql | 8 +++++++- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 9f7ae3222a4..e3c9a54023c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1841,7 +1841,8 @@ MutationCommands ReplicatedMergeTreeQueue::getMutationCommands( MutationCommands commands; for (auto it = begin; it != end; ++it) { - chassert(mutation_pointer < it->second->entry->znode_name); + /// FIXME uncomment this assertion after relesing 23.5 (currently it fails in Upgrade check) + /// chassert(mutation_pointer < it->second->entry->znode_name); mutation_ids.push_back(it->second->entry->znode_name); const auto & commands_from_entry = it->second->entry->commands; commands.insert(commands.end(), commands_from_entry.begin(), commands_from_entry.end()); diff --git a/tests/queries/0_stateless/02440_mutations_finalization.reference b/tests/queries/0_stateless/02440_mutations_finalization.reference index a8b9c2acdce..c4bad0a3806 100644 --- a/tests/queries/0_stateless/02440_mutations_finalization.reference +++ b/tests/queries/0_stateless/02440_mutations_finalization.reference @@ -2,4 +2,4 @@ 1 0000000000 UPDATE n = 2 WHERE n = 1 
['all_0_0_0'] 0 2 -0000000000 UPDATE n = 2 WHERE n = 1 [] 1 +0000000000 UPDATE n = 2 WHERE n = 1 [] diff --git a/tests/queries/0_stateless/02440_mutations_finalization.sql b/tests/queries/0_stateless/02440_mutations_finalization.sql index 796dcde8e4e..c522d8ab9df 100644 --- a/tests/queries/0_stateless/02440_mutations_finalization.sql +++ b/tests/queries/0_stateless/02440_mutations_finalization.sql @@ -6,6 +6,7 @@ system stop merges mut; alter table mut update n = 2 where n = 1; -- it will create MUTATE_PART entry, but will not execute it +system sync replica mut pull; select mutation_id, command, parts_to_do_names, is_done from system.mutations where database=currentDatabase() and table='mut'; -- merges (and mutations) will start again after detach/attach, we need to avoid this somehow... @@ -26,8 +27,8 @@ select mutation_id, command, parts_to_do_names, is_done from system.mutations wh alter table mut modify setting max_number_of_mutations_for_replica=100; system sync replica mut; --- and now it should +-- and now it should (is_done may be 0, but it's okay) select * from mut; -select mutation_id, command, parts_to_do_names, is_done from system.mutations where database=currentDatabase() and table='mut'; +select mutation_id, command, parts_to_do_names from system.mutations where database=currentDatabase() and table='mut'; drop table tmp; -- btw, it will check that mutation can be cancelled between blocks on shutdown diff --git a/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql index d274fae1a4f..b9b1b645e8e 100644 --- a/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql +++ b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql @@ -5,11 +5,17 @@ insert into mut values (1, 2, 3), (10, 20, 30); system stop merges mut; alter table mut delete where n = 10; + +-- a funny way to wait for a MUTATE_PART to be assigned +select sleepEachRow(2) from url('http://localhost:8123/?param_tries={1..10}&query=' || encodeURLComponent( + 'select 1 where ''MUTATE_PART'' not in (select type from system.replication_queue where database=''' || currentDatabase() || ''' and table=''mut'')' + ), 'LineAsString', 's String') settings max_threads=1 format Null; + alter table mut drop column k settings alter_sync=0; system sync replica mut pull; -- a funny way to wait for ALTER_METADATA to disappear from the replication queue -select sleepEachRow(1) from url('http://localhost:8123/?param_tries={1..30}&query=' || encodeURLComponent( +select sleepEachRow(2) from url('http://localhost:8123/?param_tries={1..10}&query=' || encodeURLComponent( 'select * from system.replication_queue where database=''' || currentDatabase() || ''' and table=''mut'' and type=''ALTER_METADATA''' ), 'LineAsString', 's String') settings max_threads=1 format Null; From 15cb6276883288ce57966e067bd1efdae428ed54 Mon Sep 17 00:00:00 2001 From: Thom O'Connor Date: Tue, 16 May 2023 16:35:41 -0600 Subject: [PATCH 097/127] Update postgresql.md The type cannot be optional, so we need to be clear in the docs that [type1] [type2] cannot be in square brackets, so removed them --- docs/en/engines/table-engines/integrations/postgresql.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index fbd6d944363..f27d4d48f75 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ 
b/docs/en/engines/table-engines/integrations/postgresql.md @@ -13,8 +13,8 @@ The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data th ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], - name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], + name1 type1 [DEFAULT|MATERIALIZED|ALIAS expr1] [TTL expr1], + name2 type2 [DEFAULT|MATERIALIZED|ALIAS expr2] [TTL expr2], ... ) ENGINE = PostgreSQL('host:port', 'database', 'table', 'user', 'password'[, `schema`]); ``` From 282297b677d0650b5d268f895d1eaa5c233e3eb1 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 16 May 2023 23:46:01 +0000 Subject: [PATCH 098/127] binary encoding of IPv6 in protobuf --- src/Formats/ProtobufSerializer.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 8bdef0e7d3f..ee33d2d9991 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -1,4 +1,5 @@ #include +#include "Common/formatIPv6.h" #if USE_PROTOBUF # include @@ -1852,25 +1853,26 @@ namespace write_function = [this](IPv6 value) { - ipToString(value, text_buffer); + text_buffer = String(IPV6_BINARY_LENGTH, '\0'); + memcpy(text_buffer.data(), &value.toUnderType(), IPV6_BINARY_LENGTH); writeStr(text_buffer); }; read_function = [this]() -> IPv6 { readStr(text_buffer); - return parse(text_buffer); + if (text_buffer.size() != IPV6_BINARY_LENGTH) + throw Exception(ErrorCodes::PROTOBUF_BAD_CAST, + "Could not convert bytes field {} to IPv6 for inserting into column {} - field size {} is not equal to IPv6 size {}", + field_descriptor.full_name(), column_name, text_buffer.size(), IPV6_BINARY_LENGTH); + IPv6 value; + memcpy(&value.toUnderType(), text_buffer.data(), IPV6_BINARY_LENGTH); + return value; }; default_function = [this]() -> IPv6 { return parse(field_descriptor.default_value_string()); }; } - static void ipToString(const IPv6 & ip, String & str) - { - WriteBufferFromString buf{str}; - writeText(ip, buf); - } - std::function write_function; std::function read_function; std::function default_function; From 0a44a69dc82bac8f490399b8197c6220d673e1d3 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Wed, 17 May 2023 00:22:13 +0000 Subject: [PATCH 099/127] remove unnecessary header --- src/Formats/ProtobufSerializer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index ee33d2d9991..f690800d145 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -1,5 +1,4 @@ #include -#include "Common/formatIPv6.h" #if USE_PROTOBUF # include From f4ac4c3f9d783a83e86185373d90cdf4bbe6e1ba Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 17 May 2023 03:14:30 +0200 Subject: [PATCH 100/127] Corrections after review. 
--- src/Backups/BackupFileInfo.cpp | 12 +++++++----- src/Backups/BackupIO_Disk.cpp | 2 ++ src/Backups/BackupIO_File.cpp | 4 ++-- src/Storages/StorageMemory.cpp | 28 ++++++++++++++++------------ 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/Backups/BackupFileInfo.cpp b/src/Backups/BackupFileInfo.cpp index 42546d1b1b8..d539ada55c4 100644 --- a/src/Backups/BackupFileInfo.cpp +++ b/src/Backups/BackupFileInfo.cpp @@ -36,7 +36,7 @@ namespace { /// We cannot reuse base backup because our file is smaller /// than file stored in previous backup - if (new_entry_info.size < base_backup_info.first) + if ((new_entry_info.size < base_backup_info.first) || !base_backup_info.first) return CheckBackupResult::HasNothing; if (base_backup_info.first == new_entry_info.size) @@ -48,7 +48,10 @@ namespace struct ChecksumsForNewEntry { - UInt128 full_checksum; + /// 0 is the valid checksum of empty data. + UInt128 full_checksum = 0; + + /// std::nullopt here means that it's too difficult to calculate a partial checksum so it shouldn't be used. std::optional prefix_checksum; }; @@ -58,8 +61,7 @@ namespace { ChecksumsForNewEntry res; /// The partial checksum should be calculated before the full checksum to enable optimization in BackupEntryWithChecksumCalculation. - if (prefix_size > 0) - res.prefix_checksum = entry->getPartialChecksum(prefix_size); + res.prefix_checksum = entry->getPartialChecksum(prefix_size); res.full_checksum = entry->getChecksum(); return res; } @@ -116,7 +118,7 @@ BackupFileInfo buildFileInfoForBackupEntry(const String & file_name, const Backu /// We have info about this file in base backup /// If file has no checksum -- calculate and fill it. - if (base_backup_file_info.has_value()) + if (base_backup_file_info) { LOG_TRACE(log, "File {} found in base backup, checking for equality", adjusted_path); CheckBackupResult check_base = checkBaseBackupForFile(*base_backup_file_info, info); diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index 3b1651bb223..1514b4c24c7 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -37,6 +37,7 @@ void BackupReaderDisk::copyFileToDisk(const String & path_in_backup, size_t file DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { /// Use IDisk::copyFile() as a more optimal way to copy a file if it's possible. + /// However IDisk::copyFile() can't use throttling for reading, and can't copy an encrypted file or do appending. bool has_throttling = disk->isRemote() ? static_cast(read_settings.remote_throttler) : static_cast(read_settings.local_throttler); if (!has_throttling && (write_mode == WriteMode::Rewrite) && !encrypted_in_backup) { @@ -106,6 +107,7 @@ void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr s bool copy_encrypted, UInt64 start_pos, UInt64 length) { /// Use IDisk::copyFile() as a more optimal way to copy a file if it's possible. + /// However IDisk::copyFile() can't use throttling for reading, and can't copy an encrypted file or copy a part of the file. bool has_throttling = src_disk->isRemote() ? 
static_cast(read_settings.remote_throttler) : static_cast(read_settings.local_throttler); if (!has_throttling && !start_pos && !copy_encrypted) { diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 7ffae26d16f..e1a3f336521 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -41,7 +41,7 @@ std::unique_ptr BackupReaderFile::readFile(const String & fi void BackupReaderFile::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) { - /// std::filesystem::copy() can copy from the filesystem only, and it can't do the throttling or appending. + /// std::filesystem::copy() can copy from the filesystem only, and can't do throttling or appending. bool has_throttling = static_cast(read_settings.local_throttler); if (!has_throttling && (write_mode == WriteMode::Rewrite)) { @@ -121,7 +121,7 @@ void BackupWriterFile::removeFiles(const Strings & file_names) void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) { - /// std::filesystem::copy() can copy from the filesystem only, and it can't do the throttling or copy a part of the file. + /// std::filesystem::copy() can copy from the filesystem only, and can't do throttling or copy a part of the file. bool has_throttling = static_cast(read_settings.local_throttler); if (!has_throttling) { diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index c9654cfd105..31e45db55cb 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -319,24 +319,28 @@ namespace IndexForNativeFormat index; { auto data_file_path = temp_dir / fs::path{file_paths[data_bin_pos]}.filename(); - { - auto data_out_compressed = temp_disk->writeFile(data_file_path); - CompressedWriteBuffer data_out{*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size}; - NativeWriter block_out{data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; - for (const auto & block : *blocks) - block_out.write(block); - } + auto data_out_compressed = temp_disk->writeFile(data_file_path); + auto data_out = std::make_unique(*data_out_compressed, CompressionCodecFactory::instance().getDefaultCodec(), max_compress_block_size); + NativeWriter block_out{*data_out, 0, metadata_snapshot->getSampleBlock(), false, &index}; + for (const auto & block : *blocks) + block_out.write(block); + data_out->finalize(); + data_out.reset(); + data_out_compressed->finalize(); + data_out_compressed.reset(); backup_entries[data_bin_pos] = {file_paths[data_bin_pos], std::make_shared(temp_disk, data_file_path)}; } /// Writing index.mrk { auto index_mrk_path = temp_dir / fs::path{file_paths[index_mrk_pos]}.filename(); - { - auto index_mrk_out_compressed = temp_disk->writeFile(index_mrk_path); - CompressedWriteBuffer index_mrk_out{*index_mrk_out_compressed}; - index.write(index_mrk_out); - } + auto index_mrk_out_compressed = temp_disk->writeFile(index_mrk_path); + auto index_mrk_out = std::make_unique(*index_mrk_out_compressed); + index.write(*index_mrk_out); + index_mrk_out->finalize(); + index_mrk_out.reset(); + index_mrk_out_compressed->finalize(); + index_mrk_out_compressed.reset(); backup_entries[index_mrk_pos] = {file_paths[index_mrk_pos], std::make_shared(temp_disk, index_mrk_path)}; } From 3c80e30f02f40acd372942506d8d57ec803ef2b6 Mon Sep 17 
00:00:00 2001
From: Azat Khuzhin
Date: Thu, 11 May 2023 16:59:19 +0200
Subject: [PATCH 101/127] Fix per-query IO/BACKUPs throttling settings (when default profile has them)

When some of these settings were set for the default profile (in users.xml/users.yml), they would always be used regardless of what the user passed.

Fix this by not inheriting per-query throttlers: they should be reset before making the query context, and they should no longer be initialized in Context::makeQueryContext(), since makeQueryContext() is called too early, when user settings have not been read yet.

That place also initialized per-server throttling; move this into ContextSharedPart::configureServerWideThrottling() and call it once ServerSettings are set.

Also note that this patch turns the following settings into server settings:
- max_replicated_fetches_network_bandwidth_for_server
- max_replicated_sends_network_bandwidth_for_server

This change should not affect anybody, since it is done in a backward-compatible way (i.e. if such a setting is set in the user's profile, it will still be read from there as a fallback).

Signed-off-by: Azat Khuzhin
---
 src/Core/ServerSettings.h                     |   2 +
 src/Core/Settings.h                           |   4 +-
 src/Interpreters/Context.cpp                  | 150 +++++++-----------
 .../configs/limit_replication_config.xml      |   8 +-
 .../test_replicated_fetches_bandwidth/test.py |   2 +-
 5 files changed, 65 insertions(+), 101 deletions(-)

diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h
index ee3482414af..2a73930836a 100644
--- a/src/Core/ServerSettings.h
+++ b/src/Core/ServerSettings.h
@@ -23,6 +23,8 @@ namespace DB
     M(UInt64, io_thread_pool_queue_size, 10000, "Queue size for IO thread pool.", 0) \
     M(UInt64, max_outdated_parts_loading_thread_pool_size, 32, "The maximum number of threads that would be used for loading outdated data parts on startup", 0) \
     M(UInt64, outdated_part_loading_thread_pool_queue_size, 10000, "Queue size for parts loading thread pool.", 0) \
+    M(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.", 0) \
+    M(UInt64, max_replicated_sends_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited.", 0) \
     M(UInt64, max_remote_read_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for read. Zero means unlimited.", 0) \
     M(UInt64, max_remote_write_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for write. Zero means unlimited.", 0) \
     M(UInt64, max_local_read_bandwidth_for_server, 0, "The maximum speed of local reads in bytes per second. Zero means unlimited.", 0) \
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 610c7135a75..874e31b8d37 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -101,8 +101,6 @@ class IColumn;
     M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \
     M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \
     M(Bool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.", 0) \
-    M(UInt64, max_replicated_fetches_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited. 
Only has meaning at server startup.", 0) \ - M(UInt64, max_replicated_sends_network_bandwidth_for_server, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited. Only has meaning at server startup.", 0) \ M(UInt64, max_remote_read_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for read.", 0) \ M(UInt64, max_remote_write_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for write.", 0) \ M(UInt64, max_local_read_bandwidth, 0, "The maximum speed of local reads in bytes per second.", 0) \ @@ -791,6 +789,8 @@ class IColumn; MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_distributed_schedule_pool_size, 16) \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, max_remote_read_network_bandwidth_for_server, 0) \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, max_remote_write_network_bandwidth_for_server, 0) \ + MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, max_replicated_fetches_network_bandwidth_for_server, 0) \ + MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, max_replicated_sends_network_bandwidth_for_server, 0) \ /* ---- */ \ MAKE_OBSOLETE(M, DefaultDatabaseEngine, default_database_engine, DefaultDatabaseEngine::Atomic) \ MAKE_OBSOLETE(M, UInt64, max_pipeline_depth, 0) \ diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b4bdb7cf233..0ef9ea53ee8 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -631,6 +631,30 @@ struct ContextSharedPart : boost::noncopyable log->warning(message); warnings.push_back(message); } + + void configureServerWideThrottling() + { + if (auto bandwidth = server_settings.max_replicated_fetches_network_bandwidth_for_server) + replicated_fetches_throttler = std::make_shared(bandwidth); + + if (auto bandwidth = server_settings.max_replicated_sends_network_bandwidth_for_server) + replicated_sends_throttler = std::make_shared(bandwidth); + + if (auto bandwidth = server_settings.max_remote_read_network_bandwidth_for_server) + remote_read_throttler = std::make_shared(bandwidth); + + if (auto bandwidth = server_settings.max_remote_write_network_bandwidth_for_server) + remote_write_throttler = std::make_shared(bandwidth); + + if (auto bandwidth = server_settings.max_local_read_bandwidth_for_server) + local_read_throttler = std::make_shared(bandwidth); + + if (auto bandwidth = server_settings.max_local_write_bandwidth_for_server) + local_write_throttler = std::make_shared(bandwidth); + + if (auto bandwidth = server_settings.max_backup_bandwidth_for_server) + backups_server_throttler = std::make_shared(bandwidth); + } }; @@ -1897,16 +1921,22 @@ void Context::makeQueryContext() { query_context = shared_from_this(); - /// Create throttlers, to inherit the ThrottlePtr in the context copies. - { - getRemoteReadThrottler(); - getRemoteWriteThrottler(); - - getLocalReadThrottler(); - getLocalWriteThrottler(); - - getBackupsThrottler(); - } + /// Throttling should not be inherited, otherwise if you will set + /// throttling for default profile you will not able to overwrite it + /// per-user/query. 
+ /// + /// Note, that if you need to set it server-wide, you should use + /// per-server settings, i.e.: + /// - max_backup_bandwidth_for_server + /// - max_remote_read_network_bandwidth_for_server + /// - max_remote_write_network_bandwidth_for_server + /// - max_local_read_bandwidth_for_server + /// - max_local_write_bandwidth_for_server + remote_read_query_throttler.reset(); + remote_write_query_throttler.reset(); + local_read_query_throttler.reset(); + local_write_query_throttler.reset(); + backups_query_throttler.reset(); } void Context::makeSessionContext() @@ -2438,143 +2468,76 @@ BackgroundSchedulePool & Context::getMessageBrokerSchedulePool() const ThrottlerPtr Context::getReplicatedFetchesThrottler() const { - auto lock = getLock(); - if (!shared->replicated_fetches_throttler) - shared->replicated_fetches_throttler = std::make_shared( - settings.max_replicated_fetches_network_bandwidth_for_server); - return shared->replicated_fetches_throttler; } ThrottlerPtr Context::getReplicatedSendsThrottler() const { - auto lock = getLock(); - if (!shared->replicated_sends_throttler) - shared->replicated_sends_throttler = std::make_shared( - settings.max_replicated_sends_network_bandwidth_for_server); - return shared->replicated_sends_throttler; } ThrottlerPtr Context::getRemoteReadThrottler() const { - ThrottlerPtr throttler; - - const auto & query_settings = getSettingsRef(); - UInt64 bandwidth_for_server = shared->server_settings.max_remote_read_network_bandwidth_for_server; - if (bandwidth_for_server) - { - auto lock = getLock(); - if (!shared->remote_read_throttler) - shared->remote_read_throttler = std::make_shared(bandwidth_for_server); - throttler = shared->remote_read_throttler; - } - - if (query_settings.max_remote_read_network_bandwidth) + ThrottlerPtr throttler = shared->remote_read_throttler; + if (auto bandwidth = getSettingsRef().max_remote_read_network_bandwidth) { auto lock = getLock(); if (!remote_read_query_throttler) - remote_read_query_throttler = std::make_shared(query_settings.max_remote_read_network_bandwidth, throttler); + remote_read_query_throttler = std::make_shared(bandwidth, throttler); throttler = remote_read_query_throttler; } - return throttler; } ThrottlerPtr Context::getRemoteWriteThrottler() const { - ThrottlerPtr throttler; - - const auto & query_settings = getSettingsRef(); - UInt64 bandwidth_for_server = shared->server_settings.max_remote_write_network_bandwidth_for_server; - if (bandwidth_for_server) - { - auto lock = getLock(); - if (!shared->remote_write_throttler) - shared->remote_write_throttler = std::make_shared(bandwidth_for_server); - throttler = shared->remote_write_throttler; - } - - if (query_settings.max_remote_write_network_bandwidth) + ThrottlerPtr throttler = shared->remote_write_throttler; + if (auto bandwidth = getSettingsRef().max_remote_write_network_bandwidth) { auto lock = getLock(); if (!remote_write_query_throttler) - remote_write_query_throttler = std::make_shared(query_settings.max_remote_write_network_bandwidth, throttler); + remote_write_query_throttler = std::make_shared(bandwidth, throttler); throttler = remote_write_query_throttler; } - return throttler; } ThrottlerPtr Context::getLocalReadThrottler() const { - ThrottlerPtr throttler; - - if (shared->server_settings.max_local_read_bandwidth_for_server) - { - auto lock = getLock(); - if (!shared->local_read_throttler) - shared->local_read_throttler = std::make_shared(shared->server_settings.max_local_read_bandwidth_for_server); - throttler = shared->local_read_throttler; - 
} - - const auto & query_settings = getSettingsRef(); - if (query_settings.max_local_read_bandwidth) + ThrottlerPtr throttler = shared->local_read_throttler; + if (auto bandwidth = getSettingsRef().max_local_read_bandwidth) { auto lock = getLock(); if (!local_read_query_throttler) - local_read_query_throttler = std::make_shared(query_settings.max_local_read_bandwidth, throttler); + local_read_query_throttler = std::make_shared(bandwidth, throttler); throttler = local_read_query_throttler; } - return throttler; } ThrottlerPtr Context::getLocalWriteThrottler() const { - ThrottlerPtr throttler; - - if (shared->server_settings.max_local_write_bandwidth_for_server) - { - auto lock = getLock(); - if (!shared->local_write_throttler) - shared->local_write_throttler = std::make_shared(shared->server_settings.max_local_write_bandwidth_for_server); - throttler = shared->local_write_throttler; - } - - const auto & query_settings = getSettingsRef(); - if (query_settings.max_local_write_bandwidth) + ThrottlerPtr throttler = shared->local_write_throttler; + if (auto bandwidth = getSettingsRef().max_local_write_bandwidth) { auto lock = getLock(); if (!local_write_query_throttler) - local_write_query_throttler = std::make_shared(query_settings.max_local_write_bandwidth, throttler); + local_write_query_throttler = std::make_shared(bandwidth, throttler); throttler = local_write_query_throttler; } - return throttler; } ThrottlerPtr Context::getBackupsThrottler() const { - ThrottlerPtr throttler; - - if (shared->server_settings.max_backup_bandwidth_for_server) - { - auto lock = getLock(); - if (!shared->backups_server_throttler) - shared->backups_server_throttler = std::make_shared(shared->server_settings.max_backup_bandwidth_for_server); - throttler = shared->backups_server_throttler; - } - - const auto & query_settings = getSettingsRef(); - if (query_settings.max_backup_bandwidth) + ThrottlerPtr throttler = shared->backups_server_throttler; + if (auto bandwidth = getSettingsRef().max_backup_bandwidth) { auto lock = getLock(); if (!backups_query_throttler) - backups_query_throttler = std::make_shared(query_settings.max_backup_bandwidth, throttler); + backups_query_throttler = std::make_shared(bandwidth, throttler); throttler = backups_query_throttler; } - return throttler; } @@ -3633,7 +3596,10 @@ void Context::setApplicationType(ApplicationType type) shared->application_type = type; if (type == ApplicationType::SERVER) + { shared->server_settings.loadSettingsFromConfig(Poco::Util::Application::instance().config()); + shared->configureServerWideThrottling(); + } } void Context::setDefaultProfiles(const Poco::Util::AbstractConfiguration & config) diff --git a/tests/integration/test_replicated_fetches_bandwidth/configs/limit_replication_config.xml b/tests/integration/test_replicated_fetches_bandwidth/configs/limit_replication_config.xml index b18f0fbc93f..2a7b47e3560 100644 --- a/tests/integration/test_replicated_fetches_bandwidth/configs/limit_replication_config.xml +++ b/tests/integration/test_replicated_fetches_bandwidth/configs/limit_replication_config.xml @@ -1,8 +1,4 @@ - - - 5242880 - 10485760 - - + 5242880 + 10485760 diff --git a/tests/integration/test_replicated_fetches_bandwidth/test.py b/tests/integration/test_replicated_fetches_bandwidth/test.py index 059102f8683..cd969746c31 100644 --- a/tests/integration/test_replicated_fetches_bandwidth/test.py +++ b/tests/integration/test_replicated_fetches_bandwidth/test.py @@ -12,7 +12,7 @@ cluster = ClickHouseCluster(__file__) node1 = 
cluster.add_instance("node1", with_zookeeper=True) node2 = cluster.add_instance("node2", with_zookeeper=True) node3 = cluster.add_instance( - "node3", user_configs=["configs/limit_replication_config.xml"], with_zookeeper=True + "node3", main_configs=["configs/limit_replication_config.xml"], with_zookeeper=True ) From 7383da0c526399963ff3496c22b6a59f062ff98f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 11 May 2023 21:43:43 +0200 Subject: [PATCH 102/127] Fix per-query remote throttler remote throttler by some reason had been overwritten by the global one during reloads, likely this is for graceful reload of this option, but it breaks per-query throttling, remove this logic. Signed-off-by: Azat Khuzhin --- .../AzureBlobStorage/AzureObjectStorage.cpp | 1 - src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp | 5 ----- src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h | 5 ----- src/Disks/ObjectStorages/IObjectStorage.cpp | 11 ----------- src/Disks/ObjectStorages/IObjectStorage.h | 11 ++++------- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 1 - 6 files changed, 4 insertions(+), 30 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 9db5d13a7f8..62c3216ad3f 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -282,7 +282,6 @@ void AzureObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguratio { auto new_settings = getAzureBlobStorageSettings(config, config_prefix, context); settings.set(std::move(new_settings)); - applyRemoteThrottlingSettings(context); /// We don't update client } diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index e50e410823d..38c088ab213 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -156,11 +156,6 @@ void HDFSObjectStorage::copyObject( /// NOLINT } -void HDFSObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration &, const std::string &, ContextPtr context) -{ - applyRemoteThrottlingSettings(context); -} - std::unique_ptr HDFSObjectStorage::cloneObjectStorage(const std::string &, const Poco::Util::AbstractConfiguration &, const std::string &, ContextPtr) { throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "HDFS object storage doesn't support cloning"); diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h index 4064a5c5b7f..fdc47ad16a6 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.h @@ -105,11 +105,6 @@ public: void startup() override; - void applyNewSettings( - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - ContextPtr context) override; - String getObjectsNamespace() const override { return ""; } std::unique_ptr cloneObjectStorage( diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 52e8b1a465d..1ee55a7b342 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -55,27 +55,16 @@ const std::string & IObjectStorage::getCacheName() const throw Exception(ErrorCodes::NOT_IMPLEMENTED, "getCacheName() is not implemented for object storage"); } -void IObjectStorage::applyRemoteThrottlingSettings(ContextPtr 
context)
-{
-    std::unique_lock lock{throttlers_mutex};
-    remote_read_throttler = context->getRemoteReadThrottler();
-    remote_write_throttler = context->getRemoteWriteThrottler();
-}
-
 ReadSettings IObjectStorage::patchSettings(const ReadSettings & read_settings) const
 {
-    std::unique_lock lock{throttlers_mutex};
     ReadSettings settings{read_settings};
-    settings.remote_throttler = remote_read_throttler;
     settings.for_object_storage = true;
     return settings;
 }
 
 WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings) const
 {
-    std::unique_lock lock{throttlers_mutex};
     WriteSettings settings{write_settings};
-    settings.remote_throttler = remote_write_throttler;
     settings.for_object_storage = true;
     return settings;
 }
diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h
index f4f1b063ade..58a31fdc8c3 100644
--- a/src/Disks/ObjectStorages/IObjectStorage.h
+++ b/src/Disks/ObjectStorages/IObjectStorage.h
@@ -165,9 +165,10 @@ public:
     /// Apply new settings, in most cases reiniatilize client and some other staff
     virtual void applyNewSettings(
-        const Poco::Util::AbstractConfiguration & config,
-        const std::string & config_prefix,
-        ContextPtr context) = 0;
+        const Poco::Util::AbstractConfiguration &,
+        const std::string & /*config_prefix*/,
+        ContextPtr)
+    {}
 
     /// Sometimes object storages have something similar to chroot or namespace, for example
     /// buckets in S3. If object storage doesn't have any namepaces return empty string.
@@ -205,10 +206,6 @@ public:
 
     virtual WriteSettings patchSettings(const WriteSettings & write_settings) const;
 
-protected:
-    /// Should be called from implementation of applyNewSettings()
-    void applyRemoteThrottlingSettings(ContextPtr context);
-
 private:
     mutable std::mutex throttlers_mutex;
     ThrottlerPtr remote_read_throttler;
diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
index 79b3d3a2b8b..9faae3a6c62 100644
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@@ -443,7 +443,6 @@ void S3ObjectStorage::applyNewSettings(const Poco::Util::AbstractConfiguration &
     auto new_client = getClient(config, config_prefix, context, *new_s3_settings);
     s3_settings.set(std::move(new_s3_settings));
     client.set(std::move(new_client));
-    applyRemoteThrottlingSettings(context);
 }
 
 std::unique_ptr S3ObjectStorage::cloneObjectStorage(

From fdfb1eda55b0c2ee95dc7284629f7ded6f4c9196 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 11 May 2023 22:15:05 +0200
Subject: [PATCH 103/127] Fix {Local,Remote}ReadThrottlerSleepMicroseconds metric values

Also update the test, since there may now be slightly fewer sleep
intervals if the query spends some time in other places. What is
important is that query_duration_ms does not exceed the calculated
delay.
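As a rough illustration of the expected numbers (the bandwidth limit is
configured in the tests themselves and is not shown here, so a limit of
about 1 MB/s is only an assumed example): reading a bit more than 8e6
bytes at ~1 MB/s (8e6 bytes / 1e6 bytes-per-second ≈ 8 s, almost all of
it throttler sleep) yields roughly 7e6 microseconds of
LocalReadThrottlerSleepMicroseconds and at least 7e3 ms of
query_duration_ms. Because part of that wall-clock time may now be spent
outside the throttler, the accumulated sleep counter can land slightly
below 7e6, which is why the assertions below compare it against 7e6*0.9
while the query_duration_ms threshold stays at 7e3.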
Signed-off-by: Azat Khuzhin --- src/Common/Throttler.cpp | 16 ++++++++-------- src/Common/Throttler.h | 8 ++++---- .../02703_max_local_read_bandwidth.sh | 2 +- .../02703_max_local_write_bandwidth.sh | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Common/Throttler.cpp b/src/Common/Throttler.cpp index 4f99c24cc8d..4c1320db27a 100644 --- a/src/Common/Throttler.cpp +++ b/src/Common/Throttler.cpp @@ -61,20 +61,20 @@ UInt64 Throttler::add(size_t amount) throw Exception::createDeprecated(limit_exceeded_exception_message + std::string(" Maximum: ") + toString(limit), ErrorCodes::LIMIT_EXCEEDED); /// Wait unless there is positive amount of tokens - throttling - Int64 sleep_time = 0; + Int64 sleep_time_ns = 0; if (max_speed && tokens_value < 0) { - sleep_time = static_cast(-tokens_value / max_speed * NS); - accumulated_sleep += sleep_time; - sleepForNanoseconds(sleep_time); - accumulated_sleep -= sleep_time; - ProfileEvents::increment(ProfileEvents::ThrottlerSleepMicroseconds, sleep_time / 1000UL); + sleep_time_ns = static_cast(-tokens_value / max_speed * NS); + accumulated_sleep += sleep_time_ns; + sleepForNanoseconds(sleep_time_ns); + accumulated_sleep -= sleep_time_ns; + ProfileEvents::increment(ProfileEvents::ThrottlerSleepMicroseconds, sleep_time_ns / 1000UL); } if (parent) - sleep_time += parent->add(amount); + sleep_time_ns += parent->add(amount); - return static_cast(sleep_time); + return static_cast(sleep_time_ns); } void Throttler::reset() diff --git a/src/Common/Throttler.h b/src/Common/Throttler.h index 4b117ae7637..7508065096b 100644 --- a/src/Common/Throttler.h +++ b/src/Common/Throttler.h @@ -34,15 +34,15 @@ public: const std::shared_ptr & parent_ = nullptr); /// Use `amount` tokens, sleeps if required or throws exception on limit overflow. 
- /// Returns duration of sleep in microseconds (to distinguish sleeping on different kinds of throttlers for metrics) + /// Returns duration of sleep in nanoseconds (to distinguish sleeping on different kinds of throttlers for metrics) UInt64 add(size_t amount); UInt64 add(size_t amount, ProfileEvents::Event event_amount, ProfileEvents::Event event_sleep_us) { - UInt64 sleep_us = add(amount); + UInt64 sleep_ns = add(amount); ProfileEvents::increment(event_amount, amount); - ProfileEvents::increment(event_sleep_us, sleep_us); - return sleep_us; + ProfileEvents::increment(event_sleep_us, sleep_ns / 1000UL); + return sleep_ns; } /// Not thread safe diff --git a/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh b/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh index 130f3a29ade..d47e2f363bd 100755 --- a/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh +++ b/tests/queries/0_stateless/02703_max_local_read_bandwidth.sh @@ -32,7 +32,7 @@ for read_method in "${read_methods[@]}"; do query_duration_ms >= 7e3, ProfileEvents['ReadBufferFromFileDescriptorReadBytes'] > 8e6, ProfileEvents['LocalReadThrottlerBytes'] > 8e6, - ProfileEvents['LocalReadThrottlerSleepMicroseconds'] > 7e6 + ProfileEvents['LocalReadThrottlerSleepMicroseconds'] > 7e6*0.9 FROM system.query_log WHERE current_database = '$CLICKHOUSE_DATABASE' AND query_id = '$query_id' AND type != 'QueryStart' " diff --git a/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh b/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh index 80713e90169..41165d35d37 100755 --- a/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh +++ b/tests/queries/0_stateless/02703_max_local_write_bandwidth.sh @@ -19,7 +19,7 @@ $CLICKHOUSE_CLIENT -nm -q " query_duration_ms >= 7e3, ProfileEvents['WriteBufferFromFileDescriptorWriteBytes'] > 8e6, ProfileEvents['LocalWriteThrottlerBytes'] > 8e6, - ProfileEvents['LocalWriteThrottlerSleepMicroseconds'] > 7e6 + ProfileEvents['LocalWriteThrottlerSleepMicroseconds'] > 7e6*0.9 FROM system.query_log WHERE current_database = '$CLICKHOUSE_DATABASE' AND query_id = '$query_id' AND type != 'QueryStart' " From 9fe4f1a934c86d5d42fef6bbd7c3d19db4e7f97b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 11 May 2023 21:58:07 +0200 Subject: [PATCH 104/127] Extensive coverage for bandwidth limiting settings Signed-off-by: Azat Khuzhin --- tests/integration/test_throttling/__init__.py | 0 .../configs/server_backups.xml | 34 ++ .../configs/server_overrides.xml | 3 + .../configs/users_overrides.xml | 3 + tests/integration/test_throttling/test.py | 413 ++++++++++++++++++ 5 files changed, 453 insertions(+) create mode 100644 tests/integration/test_throttling/__init__.py create mode 100644 tests/integration/test_throttling/configs/server_backups.xml create mode 100644 tests/integration/test_throttling/configs/server_overrides.xml create mode 100644 tests/integration/test_throttling/configs/users_overrides.xml create mode 100644 tests/integration/test_throttling/test.py diff --git a/tests/integration/test_throttling/__init__.py b/tests/integration/test_throttling/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_throttling/configs/server_backups.xml b/tests/integration/test_throttling/configs/server_backups.xml new file mode 100644 index 00000000000..d25c67a779c --- /dev/null +++ b/tests/integration/test_throttling/configs/server_backups.xml @@ -0,0 +1,34 @@ + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + + +
+ s3 +
+
+
+
+
+ + + + http://minio1:9001/root/data/ + minio + minio123 + + + + + default + /backups/ + +
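Note: the server_overrides.xml and users_overrides.xml files added below are placeholder configs that the test rewrites at runtime (see node_update_config) and reverts after each test. For the per-query counterpart of these server/profile limits, a minimal sketch (the table name, backup destination and the '500K' value are illustrative, mirroring test_backup_throttling_override further below):

    -- The per-query value takes precedence over whatever the user profile sets.
    SET max_backup_bandwidth = '500K';
    BACKUP TABLE data TO Disk('default', '1/');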
diff --git a/tests/integration/test_throttling/configs/server_overrides.xml b/tests/integration/test_throttling/configs/server_overrides.xml new file mode 100644 index 00000000000..197bf660500 --- /dev/null +++ b/tests/integration/test_throttling/configs/server_overrides.xml @@ -0,0 +1,3 @@ + + + diff --git a/tests/integration/test_throttling/configs/users_overrides.xml b/tests/integration/test_throttling/configs/users_overrides.xml new file mode 100644 index 00000000000..197bf660500 --- /dev/null +++ b/tests/integration/test_throttling/configs/users_overrides.xml @@ -0,0 +1,3 @@ + + + diff --git a/tests/integration/test_throttling/test.py b/tests/integration/test_throttling/test.py new file mode 100644 index 00000000000..ff8e7154d0d --- /dev/null +++ b/tests/integration/test_throttling/test.py @@ -0,0 +1,413 @@ +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name +# pylint: disable=line-too-long + +# This test covers the following options: +# - max_backup_bandwidth +# - max_backup_bandwidth_for_server +# - max_local_read_bandwidth +# - max_local_read_bandwidth_for_server +# - max_local_write_bandwidth +# - max_local_write_bandwidth_for_server +# - max_remote_read_network_bandwidth +# - max_remote_read_network_bandwidth_for_server +# - max_remote_write_network_bandwidth +# - max_remote_write_network_bandwidth_for_server +# - and that max_backup_bandwidth from the query will override setting from the user profile + +import time +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + + +def elapsed(func, *args, **kwargs): + start = time.time() + ret = func(*args, **kwargs) + end = time.time() + return ret, end - start + + +node = cluster.add_instance( + "node", + stay_alive=True, + main_configs=["configs/server_backups.xml", "configs/server_overrides.xml"], + user_configs=["configs/users_overrides.xml"], + with_minio=True, +) + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield + finally: + cluster.shutdown() + + +@pytest.fixture(scope="function", autouse=True) +def revert_config(): + # Revert configs after the test, not before + yield + node.exec_in_container( + [ + "bash", + "-c", + f"echo '' > /etc/clickhouse-server/config.d/server_overrides.xml", + ] + ) + node.exec_in_container( + [ + "bash", + "-c", + f"echo '' > /etc/clickhouse-server/users.d/users_overrides.xml", + ] + ) + node.restart_clickhouse() + + +backup_id_counter = 0 + + +def next_backup_name(storage): + global backup_id_counter + if storage == "local": + backup_id_counter += 1 + return f"Disk('default', '{backup_id_counter}/')" + elif storage == "remote": + backup_id_counter += 1 + return f"S3(s3, '{backup_id_counter}/')" + else: + raise Exception(storage) + + +def node_update_config(mode, setting, value=None): + if mode is None: + return + if mode == "server": + config_path = "/etc/clickhouse-server/config.d/server_overrides.xml" + config_content = f""" + <{setting}>{value} + """ + else: + config_path = "/etc/clickhouse-server/users.d/users_overrides.xml" + config_content = f""" + + + + <{setting}>{value} + + + + """ + node.exec_in_container( + [ + "bash", + "-c", + f"echo '{config_content}' > {config_path}", + ] + ) + node.restart_clickhouse() + + +def assert_took(took, should_took): + assert took >= should_took[0] * 0.9 and took < should_took[1] + + +@pytest.mark.parametrize( + "policy,backup_name,mode,setting,value,should_took", + [ + # + # Local -> Local + # + pytest.param( + "default", + 
next_backup_name("local"), + None, + None, + None, + (0, 3), + id="no_local_throttling", + ), + # reading 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds + pytest.param( + "default", + next_backup_name("local"), + "user", + "max_backup_bandwidth", + "1M", + (7, 14), + id="user_local_throttling", + ), + # reading 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds + pytest.param( + "default", + next_backup_name("local"), + "server", + "max_backup_bandwidth_for_server", + "2M", + (3, 7), + id="server_local_throttling", + ), + # + # Remote -> Local + # + pytest.param( + "s3", + next_backup_name("local"), + None, + None, + None, + (0, 3), + id="no_remote_to_local_throttling", + ), + # reading 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds + pytest.param( + "s3", + next_backup_name("local"), + "user", + "max_backup_bandwidth", + "1M", + (7, 14), + id="user_remote_to_local_throttling", + ), + # reading 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds + pytest.param( + "s3", + next_backup_name("local"), + "server", + "max_backup_bandwidth_for_server", + "2M", + (3, 7), + id="server_remote_to_local_throttling", + ), + # + # Remote -> Remote + # + pytest.param( + "s3", + next_backup_name("remote"), + None, + None, + None, + (0, 3), + id="no_remote_to_remote_throttling", + ), + # No throttling for S3-to-S3, uses native copy + pytest.param( + "s3", + next_backup_name("remote"), + "user", + "max_backup_bandwidth", + "1M", + (0, 3), + id="user_remote_to_remote_throttling", + ), + # No throttling for S3-to-S3, uses native copy + pytest.param( + "s3", + next_backup_name("remote"), + "server", + "max_backup_bandwidth_for_server", + "2M", + (0, 3), + id="server_remote_to_remote_throttling", + ), + # + # Local -> Remote + # + # NOTE: S3 is complex, it will read file 3 times: + # - first for calculating the checksum + # - second for calculating the signature + # - and finally to write the payload to S3 + # Hence the value should be multipled by 3. 
+ pytest.param( + "default", + next_backup_name("remote"), + None, + None, + None, + (0, 3), + id="no_local_to_remote_throttling", + ), + # reading 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds, but for S3Client it is 2x more + pytest.param( + "default", + next_backup_name("remote"), + "user", + "max_backup_bandwidth", + "1M", + (7 * 3, 7 * 4 - 1), + id="user_local_to_remote_throttling", + ), + # reading 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds, but for S3Client it is 2x more + pytest.param( + "default", + next_backup_name("remote"), + "server", + "max_backup_bandwidth_for_server", + "2M", + (3 * 3, 3 * 5), + id="server_local_to_remote_throttling", + ), + ], +) +def test_backup_throttling(policy, backup_name, mode, setting, value, should_took): + node_update_config(mode, setting, value) + node.query( + f""" + drop table if exists data; + create table data (key UInt64 CODEC(NONE)) engine=MergeTree() order by tuple() settings min_bytes_for_wide_part=1e9, storage_policy='{policy}'; + insert into data select * from numbers(1e6); + """ + ) + _, took = elapsed(node.query, f"backup table data to {backup_name}") + assert_took(took, should_took) + + +def test_backup_throttling_override(): + node_update_config("user", "max_backup_bandwidth", "1M") + node.query( + """ + drop table if exists data; + create table data (key UInt64 CODEC(NONE)) engine=MergeTree() order by tuple() settings min_bytes_for_wide_part=1e9; + insert into data select * from numbers(1e6); + """ + ) + + backup_name = next_backup_name("local") + _, took = elapsed( + node.query, + f"backup table data to {backup_name}", + settings={ + "max_backup_bandwidth": "500K", + }, + ) + # reading 1e6*8 bytes with 500Ki default bandwith should take (8-0.5)/0.5=15 seconds + assert_took(took, (15, 20)) + + +@pytest.mark.parametrize( + "policy,mode,setting,value,should_took", + [ + # + # Local + # + pytest.param("default", None, None, None, (0, 3), id="no_local_throttling"), + # reading 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds + pytest.param( + "default", + "user", + "max_local_read_bandwidth", + "1M", + (7, 14), + id="user_local_throttling", + ), + # reading 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds + pytest.param( + "default", + "server", + "max_local_read_bandwidth_for_server", + "2M", + (3, 7), + id="server_local_throttling", + ), + # + # Remote + # + pytest.param("s3", None, None, None, (0, 3), id="no_remote_throttling"), + # reading 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds + pytest.param( + "s3", + "user", + "max_remote_read_network_bandwidth", + "1M", + (7, 14), + id="user_remote_throttling", + ), + # reading 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds + pytest.param( + "s3", + "server", + "max_remote_read_network_bandwidth_for_server", + "2M", + (3, 7), + id="server_remote_throttling", + ), + ], +) +def test_read_throttling(policy, mode, setting, value, should_took): + node_update_config(mode, setting, value) + node.query( + f""" + drop table if exists data; + create table data (key UInt64 CODEC(NONE)) engine=MergeTree() order by tuple() settings min_bytes_for_wide_part=1e9, storage_policy='{policy}'; + insert into data select * from numbers(1e6); + """ + ) + _, took = elapsed(node.query, f"select * from data") + assert_took(took, should_took) + + +@pytest.mark.parametrize( + "policy,mode,setting,value,should_took", + [ + # + # Local + # + pytest.param("default", None, None, None, (0, 3), 
id="no_local_throttling"), + # reading 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds + pytest.param( + "default", + "user", + "max_local_write_bandwidth", + "1M", + (7, 14), + id="local_user_throttling", + ), + # reading 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds + pytest.param( + "default", + "server", + "max_local_write_bandwidth_for_server", + "2M", + (3, 7), + id="local_server_throttling", + ), + # + # Remote + # + pytest.param("s3", None, None, None, (0, 3), id="no_remote_throttling"), + # writeing 1e6*8 bytes with 1M default bandwith should take (8-1)/1=7 seconds + pytest.param( + "s3", + "user", + "max_remote_write_network_bandwidth", + "1M", + (7, 14), + id="user_remote_throttling", + ), + # writeing 1e6*8 bytes with 2M default bandwith should take (8-2)/2=3 seconds + pytest.param( + "s3", + "server", + "max_remote_write_network_bandwidth_for_server", + "2M", + (3, 7), + id="server_remote_throttling", + ), + ], +) +def test_write_throttling(policy, mode, setting, value, should_took): + node_update_config(mode, setting, value) + node.query( + f""" + drop table if exists data; + create table data (key UInt64 CODEC(NONE)) engine=MergeTree() order by tuple() settings min_bytes_for_wide_part=1e9, storage_policy='{policy}'; + """ + ) + _, took = elapsed(node.query, f"insert into data select * from numbers(1e6)") + assert_took(took, should_took) From 3787b7f127750b573a98920f8e7243e67fe15d68 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 17 May 2023 12:16:18 +0200 Subject: [PATCH 105/127] Update Metadata.cpp --- src/Interpreters/Cache/Metadata.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index 01cdc7f1d1b..c87eaabbbf2 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -1,7 +1,6 @@ #include #include #include -#include "Common/Exception.h" #include #include From 36c31e1d795412b2d3107f58409d4c8806505a1a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 17 May 2023 14:07:34 +0300 Subject: [PATCH 106/127] Improve concurrent parts removal with zero copy replication (#49630) * improve concurrent parts removal * fix * fix --- src/Storages/MergeTree/MergeTreeData.cpp | 154 ++++++++++++++++++--- src/Storages/MergeTree/MergeTreeSettings.h | 2 + 2 files changed, 133 insertions(+), 23 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b8208052f19..6ddfc3b806e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -2429,9 +2430,13 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t } /// Parallel parts removal. 
- size_t num_threads = std::min(settings->max_part_removal_threads, parts_to_remove.size()); + size_t num_threads = settings->max_part_removal_threads; + if (!num_threads) + num_threads = getNumberOfPhysicalCPUCores() * 2; + num_threads = std::min(num_threads, parts_to_remove.size()); std::mutex part_names_mutex; - ThreadPool pool(CurrentMetrics::MergeTreePartsCleanerThreads, CurrentMetrics::MergeTreePartsCleanerThreadsActive, num_threads); + ThreadPool pool(CurrentMetrics::MergeTreePartsCleanerThreads, CurrentMetrics::MergeTreePartsCleanerThreadsActive, + num_threads, num_threads, /* unlimited queue size */ 0); /// This flag disallow straightforward concurrent parts removal. It's required only in case /// when we have parts on zero-copy disk + at least some of them were mutated. @@ -2490,29 +2495,62 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t /// We remove disjoint subsets of parts in parallel. /// The problem is that it's not trivial to divide Outdated parts into disjoint subsets, /// because Outdated parts legally can be intersecting (but intersecting parts must be separated by a DROP_RANGE). - /// So we ignore level and version and use block numbers only. - ActiveDataPartSet independent_ranges_set(format_version); - for (const auto & part : parts_to_remove) + /// So we ignore level and version and use block numbers only (they cannot intersect by block numbers unless we have a bug). + + struct RemovalRanges { - MergeTreePartInfo range_info = part->info; - range_info.level = static_cast(range_info.max_block - range_info.min_block); - range_info.mutation = 0; - independent_ranges_set.add(range_info, range_info.getPartNameV1()); - } + std::vector infos; + std::vector parts; + std::vector split_times; + }; - auto independent_ranges_infos = independent_ranges_set.getPartInfos(); - size_t sum_of_ranges = 0; - for (auto range : independent_ranges_infos) + auto split_into_independent_ranges = [this](const DataPartsVector & parts_to_remove_, size_t split_times) -> RemovalRanges { - range.level = MergeTreePartInfo::MAX_LEVEL; - range.mutation = MergeTreePartInfo::MAX_BLOCK_NUMBER; + if (parts_to_remove_.empty()) + return {}; - DataPartsVector parts_in_range; - for (const auto & part : parts_to_remove) - if (range.contains(part->info)) - parts_in_range.push_back(part); - sum_of_ranges += parts_in_range.size(); + ActiveDataPartSet independent_ranges_set(format_version); + for (const auto & part : parts_to_remove_) + { + MergeTreePartInfo range_info = part->info; + range_info.level = static_cast(range_info.max_block - range_info.min_block); + range_info.mutation = 0; + independent_ranges_set.add(range_info, range_info.getPartNameV1()); + } + RemovalRanges independent_ranges; + independent_ranges.infos = independent_ranges_set.getPartInfos(); + size_t num_ranges = independent_ranges.infos.size(); + independent_ranges.parts.resize(num_ranges); + independent_ranges.split_times.resize(num_ranges, split_times); + size_t avg_range_size = parts_to_remove_.size() / num_ranges; + + size_t sum_of_ranges = 0; + for (size_t i = 0; i < num_ranges; ++i) + { + MergeTreePartInfo & range = independent_ranges.infos[i]; + DataPartsVector & parts_in_range = independent_ranges.parts[i]; + range.level = MergeTreePartInfo::MAX_LEVEL; + range.mutation = MergeTreePartInfo::MAX_BLOCK_NUMBER; + + parts_in_range.reserve(avg_range_size * 2); + for (const auto & part : parts_to_remove_) + if (range.contains(part->info)) + parts_in_range.push_back(part); + sum_of_ranges += 
parts_in_range.size(); + } + + if (parts_to_remove_.size() != sum_of_ranges) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of removed parts is not equal to number of parts in independent ranges " + "({} != {}), it's a bug", parts_to_remove_.size(), sum_of_ranges); + + return independent_ranges; + }; + + auto schedule_parts_removal = [this, &pool, &part_names_mutex, part_names_succeed]( + const MergeTreePartInfo & range, DataPartsVector && parts_in_range) + { + /// Below, range should be captured by copy to avoid use-after-scope on exception from pool pool.scheduleOrThrowOnError( [this, range, &part_names_mutex, part_names_succeed, thread_group = CurrentThread::getGroup(), batch = std::move(parts_in_range)] { @@ -2535,13 +2573,83 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t } } }); + }; + + RemovalRanges independent_ranges = split_into_independent_ranges(parts_to_remove, /* split_times */ 0); + DataPartsVector excluded_parts; + size_t num_ranges = independent_ranges.infos.size(); + size_t sum_of_ranges = 0; + for (size_t i = 0; i < num_ranges; ++i) + { + MergeTreePartInfo & range = independent_ranges.infos[i]; + DataPartsVector & parts_in_range = independent_ranges.parts[i]; + UInt64 split_times = independent_ranges.split_times[i]; + + /// It may happen that we have a huge part covering thousands small parts. + /// In this case, we will get a huge range that will be process by only one thread causing really long tail latency. + /// Let's try to exclude such parts in order to get smaller tasks for thread pool and more uniform distribution. + if (settings->concurrent_part_removal_threshold < parts_in_range.size() && + split_times < settings->zero_copy_concurrent_part_removal_max_split_times) + { + auto smaller_parts_pred = [&range](const DataPartPtr & part) + { + return !(part->info.min_block == range.min_block && part->info.max_block == range.max_block); + }; + + size_t covered_parts_count = std::count_if(parts_in_range.begin(), parts_in_range.end(), smaller_parts_pred); + size_t top_level_count = parts_in_range.size() - covered_parts_count; + chassert(top_level_count); + Float32 parts_to_exclude_ratio = static_cast(top_level_count) / parts_in_range.size(); + if (settings->zero_copy_concurrent_part_removal_max_postpone_ratio < parts_to_exclude_ratio) + { + /// Most likely we have a long mutations chain here + LOG_DEBUG(log, "Block range {} contains {} parts including {} top-level parts, will not try to split it", + range.getPartNameForLogs(), parts_in_range.size(), top_level_count); + } + else + { + auto new_end_it = std::partition(parts_in_range.begin(), parts_in_range.end(), smaller_parts_pred); + std::move(new_end_it, parts_in_range.end(), std::back_inserter(excluded_parts)); + parts_in_range.erase(new_end_it, parts_in_range.end()); + + RemovalRanges subranges = split_into_independent_ranges(parts_in_range, split_times + 1); + + LOG_DEBUG(log, "Block range {} contained {} parts, it was split into {} independent subranges after excluding {} top-level parts", + range.getPartNameForLogs(), parts_in_range.size() + top_level_count, subranges.infos.size(), top_level_count); + + std::move(subranges.infos.begin(), subranges.infos.end(), std::back_inserter(independent_ranges.infos)); + std::move(subranges.parts.begin(), subranges.parts.end(), std::back_inserter(independent_ranges.parts)); + std::move(subranges.split_times.begin(), subranges.split_times.end(), std::back_inserter(independent_ranges.split_times)); + num_ranges += subranges.infos.size(); + 
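+                /// Do not schedule this range as-is: its smaller parts were appended above as new
+                /// subranges and will be picked up (and possibly split again) by later iterations
+                /// of this loop; the excluded top-level parts are removed separately afterwards.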
continue; + } + } + + sum_of_ranges += parts_in_range.size(); + + schedule_parts_removal(range, std::move(parts_in_range)); + } + + /// Remove excluded parts as well. They were reordered, so sort them again + std::sort(excluded_parts.begin(), excluded_parts.end(), [](const auto & x, const auto & y) { return x->info < y->info; }); + LOG_TRACE(log, "Will remove {} big parts separately: {}", excluded_parts.size(), fmt::join(excluded_parts, ", ")); + + independent_ranges = split_into_independent_ranges(excluded_parts, /* split_times */ 0); + pool.wait(); + + for (size_t i = 0; i < independent_ranges.infos.size(); ++i) + { + MergeTreePartInfo & range = independent_ranges.infos[i]; + DataPartsVector & parts_in_range = independent_ranges.parts[i]; + schedule_parts_removal(range, std::move(parts_in_range)); } pool.wait(); - if (parts_to_remove.size() != sum_of_ranges) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Number of removed parts is not equal to number of parts in independent ranges " - "({} != {}), it's a bug", parts_to_remove.size(), sum_of_ranges); + if (parts_to_remove.size() != sum_of_ranges + excluded_parts.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Number of parts to remove was not equal to number of parts in independent ranges and excluded parts" + "({} != {} + {}), it's a bug", parts_to_remove.size(), sum_of_ranges, excluded_parts.size()); } size_t MergeTreeData::clearOldBrokenPartsFromDetachedDirectory() diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 9c7488cb6a6..c9e81ce9103 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -146,6 +146,8 @@ struct Settings; M(MaxThreads, max_part_loading_threads, 0, "The number of threads to load data parts at startup.", 0) \ M(MaxThreads, max_part_removal_threads, 0, "The number of threads for concurrent removal of inactive data parts. One is usually enough, but in 'Google Compute Environment SSD Persistent Disks' file removal (unlink) operation is extraordinarily slow and you probably have to increase this number (recommended is up to 16).", 0) \ M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ + M(UInt64, zero_copy_concurrent_part_removal_max_split_times, 5, "Max recursion depth for splitting independent Outdated parts ranges into smaller subranges (highly not recommended to change)", 0) \ + M(Float, zero_copy_concurrent_part_removal_max_postpone_ratio, static_cast(0.05), "Max percentage of top level parts to postpone removal in order to get smaller independent ranges (highly not recommended to change)", 0) \ M(String, storage_policy, "default", "Name of storage disk policy", 0) \ M(String, disk, "", "Name of storage disk. 
Can be specified instead of storage policy.", 0) \ M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ From f2dbcb514673e439b0b11efa16575dd4fb8be164 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 May 2023 16:25:31 +0200 Subject: [PATCH 107/127] Better fix --- src/Interpreters/Cache/Metadata.cpp | 38 +++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index c87eaabbbf2..e0b82763a08 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -257,30 +257,42 @@ void CacheMetadata::doCleanup() } locked_metadata->markAsRemoved(); + erase(it); + LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key); + + const fs::path key_directory = getPathInLocalCache(cleanup_key); + const fs::path key_prefix_directory = key_directory.parent_path(); try { - const fs::path key_directory = getPathInLocalCache(cleanup_key); if (fs::exists(key_directory)) fs::remove_all(key_directory); - - const fs::path key_prefix_directory = key_directory.parent_path(); - if (fs::exists(key_prefix_directory) && fs::is_empty(key_prefix_directory)) - fs::remove_all(key_prefix_directory); } catch (...) { - LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(false)); + LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true)); chassert(false); + continue; } - /// Remove key from metadata AFTER deleting key directory, because otherwise key lock is - /// released before we delete directory from fs and there might be a race: - /// a key, which we just removed, can be added back to cache before we start removing key directory, - /// which makes key directory either non-empty (and we get exception in try catch above) - /// or we removed directory while another thread thinks it exists. - erase(it); - LOG_DEBUG(log, "Key {} is removed from metadata", cleanup_key); + try + { + if (fs::exists(key_prefix_directory) && fs::is_empty(key_prefix_directory)) + fs::remove_all(key_prefix_directory); + } + catch (const fs::filesystem_error & e) + { + /// Key prefix directory can become non-empty just now, it is expected. + if (e.code() == std::errc::directory_not_empty) + return; + LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true)); + chassert(false); + } + catch (...) 
+ { + LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true)); + chassert(false); + } } } From 1c04085e8fcbb916292635a6297ae45fde7da49d Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 17 May 2023 18:15:51 +0200 Subject: [PATCH 108/127] Update MergeTreeWriteAheadLog.h --- src/Storages/MergeTree/MergeTreeWriteAheadLog.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWriteAheadLog.h b/src/Storages/MergeTree/MergeTreeWriteAheadLog.h index eba7698b9f9..f5398a24e7d 100644 --- a/src/Storages/MergeTree/MergeTreeWriteAheadLog.h +++ b/src/Storages/MergeTree/MergeTreeWriteAheadLog.h @@ -60,7 +60,6 @@ public: ~MergeTreeWriteAheadLog(); - void addPart(DataPartInMemoryPtr & part); void dropPart(const String & part_name); std::vector restore( const StorageMetadataPtr & metadata_snapshot, @@ -77,7 +76,6 @@ public: private: void init(); void rotate(const std::unique_lock & lock); - void sync(std::unique_lock & lock); const MergeTreeData & storage; DiskPtr disk; From c7ab59302f3ae1850619a79f138dee865a92bbb6 Mon Sep 17 00:00:00 2001 From: Timur Solodovnikov Date: Wed, 17 May 2023 11:03:42 -0700 Subject: [PATCH 109/127] Set allow_experimental_query_cache setting as obsolete (#49934) * set allow_experimental_query_cache as obsolete * add tsolodov to trusted contributors * CI linter --------- Co-authored-by: Nikita Mikhaylov --- src/Core/Settings.h | 1 + tests/ci/workflow_approve_rerun_lambda/app.py | 1 + ...726_set_allow_experimental_query_cache_as_obsolete.reference | 0 .../02726_set_allow_experimental_query_cache_as_obsolete.sql | 2 ++ 4 files changed, 4 insertions(+) create mode 100644 tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.reference create mode 100644 tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 610c7135a75..c65958b86b7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -779,6 +779,7 @@ class IColumn; MAKE_OBSOLETE(M, UInt64, partial_merge_join_optimizations, 0) \ MAKE_OBSOLETE(M, MaxThreads, max_alter_threads, 0) \ MAKE_OBSOLETE(M, Bool, allow_experimental_projection_optimization, true) \ + MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \ /* moved to config.xml: see also src/Core/ServerSettings.h */ \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_buffer_flush_schedule_pool_size, 16) \ MAKE_DEPRECATED_BY_SERVER_CONFIG(M, UInt64, background_pool_size, 16) \ diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index 7d234bdcfc4..2cdbdecca6e 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -125,6 +125,7 @@ TRUSTED_CONTRIBUTORS = { "thevar1able", # ClickHouse Employee "aalexfvk", "MikhailBurdukov", + "tsolodov", # ClickHouse Employee ] } diff --git a/tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.reference b/tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.sql b/tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.sql new file mode 100644 index 00000000000..244ba4e959a --- /dev/null +++ 
b/tests/queries/0_stateless/02726_set_allow_experimental_query_cache_as_obsolete.sql @@ -0,0 +1,2 @@ +SET allow_experimental_query_cache = 0; +SET allow_experimental_query_cache = 1; From 6a136897e375e61bf59a25c6dc65e4e2570e7641 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Wed, 17 May 2023 13:23:53 -0600 Subject: [PATCH 110/127] Create reddit-comments.md --- .../example-datasets/reddit-comments.md | 636 ++++++++++++++++++ 1 file changed, 636 insertions(+) create mode 100644 docs/en/getting-started/example-datasets/reddit-comments.md diff --git a/docs/en/getting-started/example-datasets/reddit-comments.md b/docs/en/getting-started/example-datasets/reddit-comments.md new file mode 100644 index 00000000000..e1e372746c9 --- /dev/null +++ b/docs/en/getting-started/example-datasets/reddit-comments.md @@ -0,0 +1,636 @@ +--- +slug: /en/getting-started/example-datasets/reddit-comments +sidebar_label: Reddit comments +--- + +# Reddit comments dataset + +This dataset contains publicly-available comments on Reddit that go back to December, 2005, to March, 2023, and contains over 7B rows of data. The raw data is in JSON format in compressed `.zst` files and the rows look like the following: + +```json +{"controversiality":0,"body":"A look at Vietnam and Mexico exposes the myth of market liberalisation.","subreddit_id":"t5_6","link_id":"t3_17863","stickied":false,"subreddit":"reddit.com","score":2,"ups":2,"author_flair_css_class":null,"created_utc":1134365188,"author_flair_text":null,"author":"frjo","id":"c13","edited":false,"parent_id":"t3_17863","gilded":0,"distinguished":null,"retrieved_on":1473738411} +{"created_utc":1134365725,"author_flair_css_class":null,"score":1,"ups":1,"subreddit":"reddit.com","stickied":false,"link_id":"t3_17866","subreddit_id":"t5_6","controversiality":0,"body":"The site states \"What can I use it for? Meeting notes, Reports, technical specs Sign-up sheets, proposals and much more...\", just like any other new breeed of sites that want us to store everything we have on the web. And they even guarantee multiple levels of security and encryption etc. But what prevents these web site operators fom accessing and/or stealing Meeting notes, Reports, technical specs Sign-up sheets, proposals and much more, for competitive or personal gains...? I am pretty sure that most of them are honest, but what's there to prevent me from setting up a good useful site and stealing all your data? 
Call me paranoid - I am.","retrieved_on":1473738411,"distinguished":null,"gilded":0,"id":"c14","edited":false,"parent_id":"t3_17866","author":"zse7zse","author_flair_text":null} +{"gilded":0,"distinguished":null,"retrieved_on":1473738411,"author":"[deleted]","author_flair_text":null,"edited":false,"id":"c15","parent_id":"t3_17869","subreddit":"reddit.com","score":0,"ups":0,"created_utc":1134366848,"author_flair_css_class":null,"body":"Jython related topics by Frank Wierzbicki","controversiality":0,"subreddit_id":"t5_6","stickied":false,"link_id":"t3_17869"} +{"gilded":0,"retrieved_on":1473738411,"distinguished":null,"author_flair_text":null,"author":"[deleted]","edited":false,"parent_id":"t3_17870","id":"c16","subreddit":"reddit.com","created_utc":1134367660,"author_flair_css_class":null,"score":1,"ups":1,"body":"[deleted]","controversiality":0,"stickied":false,"link_id":"t3_17870","subreddit_id":"t5_6"} +{"gilded":0,"retrieved_on":1473738411,"distinguished":null,"author_flair_text":null,"author":"rjoseph","edited":false,"id":"c17","parent_id":"t3_17817","subreddit":"reddit.com","author_flair_css_class":null,"created_utc":1134367754,"score":1,"ups":1,"body":"Saft is by far the best extension you could tak onto your Safari","controversiality":0,"link_id":"t3_17817","stickied":false,"subreddit_id":"t5_6"} +``` + +A shoutout to Percona for the [motivation behind ingesting this dataset](https://www.percona.com/blog/big-data-set-reddit-comments-analyzing-clickhouse/), which we have downloaded and stored in an S3 bucket. + +:::note +The following commands were executed on ClickHouse Cloud. To run this on your own cluster, replace `default` in the `s3Cluster` function call with the name of your cluster. If you do not have a cluster, then replace the `s3Cluster` function with the `s3` function. +::: + +1. Let's create a table for the Reddit data: + +```sql +CREATE TABLE reddit +( + subreddit LowCardinality(String), + subreddit_id LowCardinality(String), + subreddit_type Enum('public' = 1, 'restricted' = 2, 'user' = 3, 'archived' = 4, 'gold_restricted' = 5, 'private' = 6), + author LowCardinality(String), + body String CODEC(ZSTD(6)), + created_date Date DEFAULT toDate(created_utc), + created_utc DateTime, + retrieved_on DateTime, + id String, + parent_id String, + link_id String, + score Int32, + total_awards_received UInt16, + controversiality UInt8, + gilded UInt8, + collapsed_because_crowd_control UInt8, + collapsed_reason Enum('' = 0, 'comment score below threshold' = 1, 'may be sensitive content' = 2, 'potentially toxic' = 3, 'potentially toxic content' = 4), + distinguished Enum('' = 0, 'moderator' = 1, 'admin' = 2, 'special' = 3), + removal_reason Enum('' = 0, 'legal' = 1), + author_created_utc DateTime, + author_fullname LowCardinality(String), + author_patreon_flair UInt8, + author_premium UInt8, + can_gild UInt8, + can_mod_post UInt8, + collapsed UInt8, + is_submitter UInt8, + _edited String, + locked UInt8, + quarantined UInt8, + no_follow UInt8, + send_replies UInt8, + stickied UInt8, + author_flair_text LowCardinality(String) +) +ENGINE = MergeTree +ORDER BY (subreddit, created_date, author); +``` + +:::note +The names of the files in S3 start with `RC_YYYY-MM` where `YYYY-MM` goes from `2005-12` to `2023-02`. The compression changes a couple of times though, so the file extensions are not consistent. 
For example: + +- the file names are initially `RC_2005-12.bz2` to `RC_2017-11.bz2` +- then they look like `RC_2017-12.xz` to `RC_2018-09.xz` +- and finally `RC_2018-10.zst` to `RC_2023-02.zst` +::: + +2. We are going to start with one month of data, but if you want to simply insert every row - skip ahead to step 8 below. The following file has 86M records from December, 2017: + +```sql +INSERT INTO reddit + SELECT * + FROM s3Cluster( + 'default', + 'https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/reddit/original/RC_2017-12.xz', + 'JSONEachRow' + ); +``` + +If you do not have a cluster, use `s3` instead of `s3Cluster`: + +```sql +INSERT INTO reddit + SELECT * + FROM s3( + 'https://clickhouse-public-datasets.s3.eu-central-1.amazonaws.com/reddit/original/RC_2017-12.xz', + 'JSONEachRow' + ); +``` + +3. It will take a while depending on your resources, but when it's done verify it worked: + +```sql +SELECT formatReadableQuantity(count()) +FROM reddit; +``` + +```response +┌─formatReadableQuantity(count())─┐ +│ 85.97 million │ +└─────────────────────────────────┘ +``` + +4. Let's see how many unique subreddits were in December of 2017: + +```sql +SELECT uniqExact(subreddit) +FROM reddit; +``` + +```response +┌─uniqExact(subreddit)─┐ +│ 91613 │ +└──────────────────────┘ + +1 row in set. Elapsed: 1.572 sec. Processed 85.97 million rows, 367.43 MB (54.71 million rows/s., 233.80 MB/s.) +``` + +5. This query returns the top 10 subreddits (in terms of number of comments): + +```sql +SELECT + subreddit, + count() AS c +FROM reddit +GROUP BY subreddit +ORDER BY c DESC +LIMIT 20; +``` + +```response +┌─subreddit───────┬───────c─┐ +│ AskReddit │ 5245881 │ +│ politics │ 1753120 │ +│ nfl │ 1220266 │ +│ nba │ 960388 │ +│ The_Donald │ 931857 │ +│ news │ 796617 │ +│ worldnews │ 765709 │ +│ CFB │ 710360 │ +│ gaming │ 602761 │ +│ movies │ 601966 │ +│ soccer │ 590628 │ +│ Bitcoin │ 583783 │ +│ pics │ 563408 │ +│ StarWars │ 562514 │ +│ funny │ 547563 │ +│ leagueoflegends │ 517213 │ +│ teenagers │ 492020 │ +│ DestinyTheGame │ 477377 │ +│ todayilearned │ 472650 │ +│ videos │ 450581 │ +└─────────────────┴─────────┘ + +20 rows in set. Elapsed: 0.368 sec. Processed 85.97 million rows, 367.43 MB (233.34 million rows/s., 997.25 MB/s.) +``` + +6. Here are the top 10 authors in December of 2017, in terms of number of comments posted: + +```sql +SELECT + author, + count() AS c +FROM reddit +GROUP BY author +ORDER BY c DESC +LIMIT 10; +``` + +```response +┌─author──────────┬───────c─┐ +│ [deleted] │ 5913324 │ +│ AutoModerator │ 784886 │ +│ ImagesOfNetwork │ 83241 │ +│ BitcoinAllBot │ 54484 │ +│ imguralbumbot │ 45822 │ +│ RPBot │ 29337 │ +│ WikiTextBot │ 25982 │ +│ Concise_AMA_Bot │ 19974 │ +│ MTGCardFetcher │ 19103 │ +│ TotesMessenger │ 19057 │ +└─────────────────┴─────────┘ + +10 rows in set. Elapsed: 8.143 sec. Processed 85.97 million rows, 711.05 MB (10.56 million rows/s., 87.32 MB/s.) +``` + +7. We already inserted some data, but we will start over: + +```sql +TRUNCATE TABLE reddit; +``` + +8. This is a fun dataset and it looks like we can find some great information, so let's go ahead and insert the entire dataset from 2005 to 2023. When you're ready, run this command to insert all the rows. (It takes a while - up to 17 hours!) + +```sql +INSERT INTO reddit +SELECT * +FROM s3Cluster( + 'default', + 'https://clickhouse-public-datasets.s3.amazonaws.com/reddit/original/RC*', + 'JSONEachRow' + ) +SETTINGS zstd_window_log_max = 31; +``` + +The response looks like: + +```response +0 rows in set. 
Elapsed: 61187.839 sec. Processed 6.74 billion rows, 2.06 TB (110.17 thousand rows/s., 33.68 MB/s.) +``` + +8. Let's see how many rows were inserted and how much disk space the table is using: + + +```sql +SELECT + sum(rows) AS count, + formatReadableQuantity(count), + formatReadableSize(sum(bytes)) AS disk_size, + formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size +FROM system.parts +WHERE (table = 'reddit') AND active +``` + +Notice the compression of disk storage is about 1/3 of the uncompressed size: + +```response +┌──────count─┬─formatReadableQuantity(sum(rows))─┬─disk_size──┬─uncompressed_size─┐ +│ 6739503568 │ 6.74 billion │ 501.10 GiB │ 1.51 TiB │ +└────────────┴───────────────────────────────────┴────────────┴───────────────────┘ + +1 row in set. Elapsed: 0.010 sec. +``` + +9. The following query shows how many comments, authors and subreddits we have for each month: + +```sql +SELECT + toStartOfMonth(created_utc) AS firstOfMonth, + count() AS c, + bar(c, 0, 50000000, 25) AS bar_count, + uniq(author) AS authors, + bar(authors, 0, 5000000, 25) AS bar_authors, + uniq(subreddit) AS subreddits, + bar(subreddits, 0, 100000, 25) AS bar_subreddits +FROM reddit +GROUP BY firstOfMonth +ORDER BY firstOfMonth ASC; +``` + +This is a substantial query that has to process all 6.74 billion rows, but we still get an impressive response time (about 3 minutes): + +```response +┌─firstOfMonth─┬─────────c─┬─bar_count─────────────────┬─authors─┬─bar_authors───────────────┬─subreddits─┬─bar_subreddits────────────┐ +│ 2005-12-01 │ 1075 │ │ 394 │ │ 1 │ │ +│ 2006-01-01 │ 3666 │ │ 791 │ │ 2 │ │ +│ 2006-02-01 │ 9095 │ │ 1464 │ │ 18 │ │ +│ 2006-03-01 │ 13859 │ │ 1958 │ │ 15 │ │ +│ 2006-04-01 │ 19090 │ │ 2334 │ │ 21 │ │ +│ 2006-05-01 │ 26859 │ │ 2698 │ │ 21 │ │ +│ 2006-06-01 │ 29163 │ │ 3043 │ │ 19 │ │ +│ 2006-07-01 │ 37031 │ │ 3532 │ │ 22 │ │ +│ 2006-08-01 │ 50559 │ │ 4750 │ │ 24 │ │ +│ 2006-09-01 │ 50675 │ │ 4908 │ │ 21 │ │ +│ 2006-10-01 │ 54148 │ │ 5654 │ │ 31 │ │ +│ 2006-11-01 │ 62021 │ │ 6490 │ │ 23 │ │ +│ 2006-12-01 │ 61018 │ │ 6707 │ │ 24 │ │ +│ 2007-01-01 │ 81341 │ │ 7931 │ │ 23 │ │ +│ 2007-02-01 │ 95634 │ │ 9020 │ │ 21 │ │ +│ 2007-03-01 │ 112444 │ │ 10842 │ │ 23 │ │ +│ 2007-04-01 │ 126773 │ │ 10701 │ │ 26 │ │ +│ 2007-05-01 │ 170097 │ │ 11365 │ │ 25 │ │ +│ 2007-06-01 │ 178800 │ │ 11267 │ │ 22 │ │ +│ 2007-07-01 │ 203319 │ │ 12482 │ │ 25 │ │ +│ 2007-08-01 │ 225111 │ │ 14124 │ │ 30 │ │ +│ 2007-09-01 │ 259497 │ ▏ │ 15416 │ │ 33 │ │ +│ 2007-10-01 │ 274170 │ ▏ │ 15302 │ │ 36 │ │ +│ 2007-11-01 │ 372983 │ ▏ │ 15134 │ │ 43 │ │ +│ 2007-12-01 │ 363390 │ ▏ │ 15915 │ │ 31 │ │ +│ 2008-01-01 │ 452990 │ ▏ │ 18857 │ │ 126 │ │ +│ 2008-02-01 │ 441768 │ ▏ │ 18266 │ │ 173 │ │ +│ 2008-03-01 │ 463728 │ ▏ │ 18947 │ │ 292 │ │ +│ 2008-04-01 │ 468317 │ ▏ │ 18590 │ │ 323 │ │ +│ 2008-05-01 │ 536380 │ ▎ │ 20861 │ │ 375 │ │ +│ 2008-06-01 │ 577684 │ ▎ │ 22557 │ │ 575 │ ▏ │ +│ 2008-07-01 │ 592610 │ ▎ │ 23123 │ │ 657 │ ▏ │ +│ 2008-08-01 │ 595959 │ ▎ │ 23729 │ │ 707 │ ▏ │ +│ 2008-09-01 │ 680892 │ ▎ │ 26374 │ ▏ │ 801 │ ▏ │ +│ 2008-10-01 │ 789874 │ ▍ │ 28970 │ ▏ │ 893 │ ▏ │ +│ 2008-11-01 │ 792310 │ ▍ │ 30272 │ ▏ │ 1024 │ ▎ │ +│ 2008-12-01 │ 850359 │ ▍ │ 34073 │ ▏ │ 1103 │ ▎ │ +│ 2009-01-01 │ 1051649 │ ▌ │ 38978 │ ▏ │ 1316 │ ▎ │ +│ 2009-02-01 │ 944711 │ ▍ │ 43390 │ ▏ │ 1132 │ ▎ │ +│ 2009-03-01 │ 1048643 │ ▌ │ 46516 │ ▏ │ 1203 │ ▎ │ +│ 2009-04-01 │ 1094599 │ ▌ │ 48284 │ ▏ │ 1334 │ ▎ │ +│ 2009-05-01 │ 1201257 │ ▌ │ 52512 │ ▎ │ 1395 │ ▎ │ +│ 2009-06-01 │ 1258750 │ ▋ │ 57728 │ ▎ │ 1473 │ ▎ │ +│ 2009-07-01 │ 1470290 │ ▋ │ 
60098 │ ▎ │ 1686 │ ▍ │ +│ 2009-08-01 │ 1750688 │ ▉ │ 67347 │ ▎ │ 1777 │ ▍ │ +│ 2009-09-01 │ 2032276 │ █ │ 78051 │ ▍ │ 1784 │ ▍ │ +│ 2009-10-01 │ 2242017 │ █ │ 93409 │ ▍ │ 2071 │ ▌ │ +│ 2009-11-01 │ 2207444 │ █ │ 95940 │ ▍ │ 2141 │ ▌ │ +│ 2009-12-01 │ 2560510 │ █▎ │ 104239 │ ▌ │ 2141 │ ▌ │ +│ 2010-01-01 │ 2884096 │ █▍ │ 114314 │ ▌ │ 2313 │ ▌ │ +│ 2010-02-01 │ 2687779 │ █▎ │ 115683 │ ▌ │ 2522 │ ▋ │ +│ 2010-03-01 │ 3228254 │ █▌ │ 125775 │ ▋ │ 2890 │ ▋ │ +│ 2010-04-01 │ 3209898 │ █▌ │ 128936 │ ▋ │ 3170 │ ▊ │ +│ 2010-05-01 │ 3267363 │ █▋ │ 131851 │ ▋ │ 3166 │ ▊ │ +│ 2010-06-01 │ 3532867 │ █▊ │ 139522 │ ▋ │ 3301 │ ▊ │ +│ 2010-07-01 │ 4032737 │ ██ │ 153451 │ ▊ │ 3662 │ ▉ │ +│ 2010-08-01 │ 4247982 │ ██ │ 164071 │ ▊ │ 3653 │ ▉ │ +│ 2010-09-01 │ 4704069 │ ██▎ │ 186613 │ ▉ │ 4009 │ █ │ +│ 2010-10-01 │ 5032368 │ ██▌ │ 203800 │ █ │ 4154 │ █ │ +│ 2010-11-01 │ 5689002 │ ██▊ │ 226134 │ █▏ │ 4383 │ █ │ +│ 2010-12-01 │ 5972642 │ ██▉ │ 245824 │ █▏ │ 4692 │ █▏ │ +│ 2011-01-01 │ 6603329 │ ███▎ │ 270025 │ █▎ │ 5141 │ █▎ │ +│ 2011-02-01 │ 6363114 │ ███▏ │ 277593 │ █▍ │ 5202 │ █▎ │ +│ 2011-03-01 │ 7556165 │ ███▊ │ 314748 │ █▌ │ 5445 │ █▎ │ +│ 2011-04-01 │ 7571398 │ ███▊ │ 329920 │ █▋ │ 6128 │ █▌ │ +│ 2011-05-01 │ 8803949 │ ████▍ │ 365013 │ █▊ │ 6834 │ █▋ │ +│ 2011-06-01 │ 9766511 │ ████▉ │ 393945 │ █▉ │ 7519 │ █▉ │ +│ 2011-07-01 │ 10557466 │ █████▎ │ 424235 │ ██ │ 8293 │ ██ │ +│ 2011-08-01 │ 12316144 │ ██████▏ │ 475326 │ ██▍ │ 9657 │ ██▍ │ +│ 2011-09-01 │ 12150412 │ ██████ │ 503142 │ ██▌ │ 10278 │ ██▌ │ +│ 2011-10-01 │ 13470278 │ ██████▋ │ 548801 │ ██▋ │ 10922 │ ██▋ │ +│ 2011-11-01 │ 13621533 │ ██████▊ │ 574435 │ ██▊ │ 11572 │ ██▉ │ +│ 2011-12-01 │ 14509469 │ ███████▎ │ 622849 │ ███ │ 12335 │ ███ │ +│ 2012-01-01 │ 16350205 │ ████████▏ │ 696110 │ ███▍ │ 14281 │ ███▌ │ +│ 2012-02-01 │ 16015695 │ ████████ │ 722892 │ ███▌ │ 14949 │ ███▋ │ +│ 2012-03-01 │ 17881943 │ ████████▉ │ 789664 │ ███▉ │ 15795 │ ███▉ │ +│ 2012-04-01 │ 19044534 │ █████████▌ │ 842491 │ ████▏ │ 16440 │ ████ │ +│ 2012-05-01 │ 20388260 │ ██████████▏ │ 886176 │ ████▍ │ 16974 │ ████▏ │ +│ 2012-06-01 │ 21897913 │ ██████████▉ │ 946798 │ ████▋ │ 17952 │ ████▍ │ +│ 2012-07-01 │ 24087517 │ ████████████ │ 1018636 │ █████ │ 19069 │ ████▊ │ +│ 2012-08-01 │ 25703326 │ ████████████▊ │ 1094445 │ █████▍ │ 20553 │ █████▏ │ +│ 2012-09-01 │ 23419524 │ ███████████▋ │ 1088491 │ █████▍ │ 20831 │ █████▏ │ +│ 2012-10-01 │ 24788236 │ ████████████▍ │ 1131885 │ █████▋ │ 21868 │ █████▍ │ +│ 2012-11-01 │ 24648302 │ ████████████▎ │ 1167608 │ █████▊ │ 21791 │ █████▍ │ +│ 2012-12-01 │ 26080276 │ █████████████ │ 1218402 │ ██████ │ 22622 │ █████▋ │ +│ 2013-01-01 │ 30365867 │ ███████████████▏ │ 1341703 │ ██████▋ │ 24696 │ ██████▏ │ +│ 2013-02-01 │ 27213960 │ █████████████▌ │ 1304756 │ ██████▌ │ 24514 │ ██████▏ │ +│ 2013-03-01 │ 30771274 │ ███████████████▍ │ 1391703 │ ██████▉ │ 25730 │ ██████▍ │ +│ 2013-04-01 │ 33259557 │ ████████████████▋ │ 1485971 │ ███████▍ │ 27294 │ ██████▊ │ +│ 2013-05-01 │ 33126225 │ ████████████████▌ │ 1506473 │ ███████▌ │ 27299 │ ██████▊ │ +│ 2013-06-01 │ 32648247 │ ████████████████▎ │ 1506650 │ ███████▌ │ 27450 │ ██████▊ │ +│ 2013-07-01 │ 34922133 │ █████████████████▍ │ 1561771 │ ███████▊ │ 28294 │ ███████ │ +│ 2013-08-01 │ 34766579 │ █████████████████▍ │ 1589781 │ ███████▉ │ 28943 │ ███████▏ │ +│ 2013-09-01 │ 31990369 │ ███████████████▉ │ 1570342 │ ███████▊ │ 29408 │ ███████▎ │ +│ 2013-10-01 │ 35940040 │ █████████████████▉ │ 1683770 │ ████████▍ │ 30273 │ ███████▌ │ +│ 2013-11-01 │ 37396497 │ ██████████████████▋ │ 1757467 │ ████████▊ │ 31173 │ ███████▊ │ +│ 
2013-12-01 │ 39810216 │ ███████████████████▉ │ 1846204 │ █████████▏ │ 32326 │ ████████ │ +│ 2014-01-01 │ 42420655 │ █████████████████████▏ │ 1927229 │ █████████▋ │ 35603 │ ████████▉ │ +│ 2014-02-01 │ 38703362 │ ███████████████████▎ │ 1874067 │ █████████▎ │ 37007 │ █████████▎ │ +│ 2014-03-01 │ 42459956 │ █████████████████████▏ │ 1959888 │ █████████▊ │ 37948 │ █████████▍ │ +│ 2014-04-01 │ 42440735 │ █████████████████████▏ │ 1951369 │ █████████▊ │ 38362 │ █████████▌ │ +│ 2014-05-01 │ 42514094 │ █████████████████████▎ │ 1970197 │ █████████▊ │ 39078 │ █████████▊ │ +│ 2014-06-01 │ 41990650 │ ████████████████████▉ │ 1943850 │ █████████▋ │ 38268 │ █████████▌ │ +│ 2014-07-01 │ 46868899 │ ███████████████████████▍ │ 2059346 │ ██████████▎ │ 40634 │ ██████████▏ │ +│ 2014-08-01 │ 46990813 │ ███████████████████████▍ │ 2117335 │ ██████████▌ │ 41764 │ ██████████▍ │ +│ 2014-09-01 │ 44992201 │ ██████████████████████▍ │ 2124708 │ ██████████▌ │ 41890 │ ██████████▍ │ +│ 2014-10-01 │ 47497520 │ ███████████████████████▋ │ 2206535 │ ███████████ │ 43109 │ ██████████▊ │ +│ 2014-11-01 │ 46118074 │ ███████████████████████ │ 2239747 │ ███████████▏ │ 43718 │ ██████████▉ │ +│ 2014-12-01 │ 48807699 │ ████████████████████████▍ │ 2372945 │ ███████████▊ │ 43823 │ ██████████▉ │ +│ 2015-01-01 │ 53851542 │ █████████████████████████ │ 2499536 │ ████████████▍ │ 47172 │ ███████████▊ │ +│ 2015-02-01 │ 48342747 │ ████████████████████████▏ │ 2448496 │ ████████████▏ │ 47229 │ ███████████▊ │ +│ 2015-03-01 │ 54564441 │ █████████████████████████ │ 2550534 │ ████████████▊ │ 48156 │ ████████████ │ +│ 2015-04-01 │ 55005780 │ █████████████████████████ │ 2609443 │ █████████████ │ 49865 │ ████████████▍ │ +│ 2015-05-01 │ 54504410 │ █████████████████████████ │ 2585535 │ ████████████▉ │ 50137 │ ████████████▌ │ +│ 2015-06-01 │ 54258492 │ █████████████████████████ │ 2595129 │ ████████████▉ │ 49598 │ ████████████▍ │ +│ 2015-07-01 │ 58451788 │ █████████████████████████ │ 2720026 │ █████████████▌ │ 55022 │ █████████████▊ │ +│ 2015-08-01 │ 58075327 │ █████████████████████████ │ 2743994 │ █████████████▋ │ 55302 │ █████████████▊ │ +│ 2015-09-01 │ 55574825 │ █████████████████████████ │ 2672793 │ █████████████▎ │ 53960 │ █████████████▍ │ +│ 2015-10-01 │ 59494045 │ █████████████████████████ │ 2816426 │ ██████████████ │ 70210 │ █████████████████▌ │ +│ 2015-11-01 │ 57117500 │ █████████████████████████ │ 2847146 │ ██████████████▏ │ 71363 │ █████████████████▊ │ +│ 2015-12-01 │ 58523312 │ █████████████████████████ │ 2854840 │ ██████████████▎ │ 94559 │ ███████████████████████▋ │ +│ 2016-01-01 │ 61991732 │ █████████████████████████ │ 2920366 │ ██████████████▌ │ 108438 │ █████████████████████████ │ +│ 2016-02-01 │ 59189875 │ █████████████████████████ │ 2854683 │ ██████████████▎ │ 109916 │ █████████████████████████ │ +│ 2016-03-01 │ 63918864 │ █████████████████████████ │ 2969542 │ ██████████████▊ │ 84787 │ █████████████████████▏ │ +│ 2016-04-01 │ 64271256 │ █████████████████████████ │ 2999086 │ ██████████████▉ │ 61647 │ ███████████████▍ │ +│ 2016-05-01 │ 65212004 │ █████████████████████████ │ 3034674 │ ███████████████▏ │ 67465 │ ████████████████▊ │ +│ 2016-06-01 │ 65867743 │ █████████████████████████ │ 3057604 │ ███████████████▎ │ 75170 │ ██████████████████▊ │ +│ 2016-07-01 │ 66974735 │ █████████████████████████ │ 3199374 │ ███████████████▉ │ 77732 │ ███████████████████▍ │ +│ 2016-08-01 │ 69654819 │ █████████████████████████ │ 3239957 │ ████████████████▏ │ 63080 │ ███████████████▊ │ +│ 2016-09-01 │ 67024973 │ █████████████████████████ │ 3190864 │ ███████████████▉ │ 
62324 │ ███████████████▌ │ +│ 2016-10-01 │ 71826553 │ █████████████████████████ │ 3284340 │ ████████████████▍ │ 62549 │ ███████████████▋ │ +│ 2016-11-01 │ 71022319 │ █████████████████████████ │ 3300822 │ ████████████████▌ │ 69718 │ █████████████████▍ │ +│ 2016-12-01 │ 72942967 │ █████████████████████████ │ 3430324 │ █████████████████▏ │ 71705 │ █████████████████▉ │ +│ 2017-01-01 │ 78946585 │ █████████████████████████ │ 3572093 │ █████████████████▊ │ 78198 │ ███████████████████▌ │ +│ 2017-02-01 │ 70609487 │ █████████████████████████ │ 3421115 │ █████████████████ │ 69823 │ █████████████████▍ │ +│ 2017-03-01 │ 79723106 │ █████████████████████████ │ 3638122 │ ██████████████████▏ │ 73865 │ ██████████████████▍ │ +│ 2017-04-01 │ 77478009 │ █████████████████████████ │ 3620591 │ ██████████████████ │ 74387 │ ██████████████████▌ │ +│ 2017-05-01 │ 79810360 │ █████████████████████████ │ 3650820 │ ██████████████████▎ │ 74356 │ ██████████████████▌ │ +│ 2017-06-01 │ 79901711 │ █████████████████████████ │ 3737614 │ ██████████████████▋ │ 72114 │ ██████████████████ │ +│ 2017-07-01 │ 81798725 │ █████████████████████████ │ 3872330 │ ███████████████████▎ │ 76052 │ ███████████████████ │ +│ 2017-08-01 │ 84658503 │ █████████████████████████ │ 3960093 │ ███████████████████▊ │ 77798 │ ███████████████████▍ │ +│ 2017-09-01 │ 83165192 │ █████████████████████████ │ 3880501 │ ███████████████████▍ │ 78402 │ ███████████████████▌ │ +│ 2017-10-01 │ 85828912 │ █████████████████████████ │ 3980335 │ ███████████████████▉ │ 80685 │ ████████████████████▏ │ +│ 2017-11-01 │ 84965681 │ █████████████████████████ │ 4026749 │ ████████████████████▏ │ 82659 │ ████████████████████▋ │ +│ 2017-12-01 │ 85973810 │ █████████████████████████ │ 4196354 │ ████████████████████▉ │ 91984 │ ██████████████████████▉ │ +│ 2018-01-01 │ 91558594 │ █████████████████████████ │ 4364443 │ █████████████████████▊ │ 102577 │ █████████████████████████ │ +│ 2018-02-01 │ 86467179 │ █████████████████████████ │ 4277899 │ █████████████████████▍ │ 104610 │ █████████████████████████ │ +│ 2018-03-01 │ 96490262 │ █████████████████████████ │ 4422470 │ ██████████████████████ │ 112559 │ █████████████████████████ │ +│ 2018-04-01 │ 98101232 │ █████████████████████████ │ 4572434 │ ██████████████████████▊ │ 105284 │ █████████████████████████ │ +│ 2018-05-01 │ 100109100 │ █████████████████████████ │ 4698908 │ ███████████████████████▍ │ 103910 │ █████████████████████████ │ +│ 2018-06-01 │ 100009462 │ █████████████████████████ │ 4697426 │ ███████████████████████▍ │ 101107 │ █████████████████████████ │ +│ 2018-07-01 │ 108151359 │ █████████████████████████ │ 5099492 │ █████████████████████████ │ 106184 │ █████████████████████████ │ +│ 2018-08-01 │ 107330940 │ █████████████████████████ │ 5084082 │ █████████████████████████ │ 109985 │ █████████████████████████ │ +│ 2018-09-01 │ 104473929 │ █████████████████████████ │ 5011953 │ █████████████████████████ │ 109710 │ █████████████████████████ │ +│ 2018-10-01 │ 112346556 │ █████████████████████████ │ 5320405 │ █████████████████████████ │ 112533 │ █████████████████████████ │ +│ 2018-11-01 │ 112573001 │ █████████████████████████ │ 5353282 │ █████████████████████████ │ 112211 │ █████████████████████████ │ +│ 2018-12-01 │ 121953600 │ █████████████████████████ │ 5611543 │ █████████████████████████ │ 118291 │ █████████████████████████ │ +│ 2019-01-01 │ 129386587 │ █████████████████████████ │ 6016687 │ █████████████████████████ │ 125725 │ █████████████████████████ │ +│ 2019-02-01 │ 120645639 │ █████████████████████████ │ 5974488 │ 
█████████████████████████ │ 125420 │ █████████████████████████ │ +│ 2019-03-01 │ 137650471 │ █████████████████████████ │ 6410197 │ █████████████████████████ │ 135924 │ █████████████████████████ │ +│ 2019-04-01 │ 138473643 │ █████████████████████████ │ 6416384 │ █████████████████████████ │ 139844 │ █████████████████████████ │ +│ 2019-05-01 │ 142463421 │ █████████████████████████ │ 6574836 │ █████████████████████████ │ 142012 │ █████████████████████████ │ +│ 2019-06-01 │ 134172939 │ █████████████████████████ │ 6601267 │ █████████████████████████ │ 140997 │ █████████████████████████ │ +│ 2019-07-01 │ 145965083 │ █████████████████████████ │ 6901822 │ █████████████████████████ │ 147802 │ █████████████████████████ │ +│ 2019-08-01 │ 146854393 │ █████████████████████████ │ 6993882 │ █████████████████████████ │ 151888 │ █████████████████████████ │ +│ 2019-09-01 │ 137540219 │ █████████████████████████ │ 7001362 │ █████████████████████████ │ 148839 │ █████████████████████████ │ +│ 2019-10-01 │ 129771456 │ █████████████████████████ │ 6825690 │ █████████████████████████ │ 144453 │ █████████████████████████ │ +│ 2019-11-01 │ 107990259 │ █████████████████████████ │ 6368286 │ █████████████████████████ │ 141768 │ █████████████████████████ │ +│ 2019-12-01 │ 112895934 │ █████████████████████████ │ 6640902 │ █████████████████████████ │ 148277 │ █████████████████████████ │ +│ 2020-01-01 │ 54354879 │ █████████████████████████ │ 4782339 │ ███████████████████████▉ │ 111658 │ █████████████████████████ │ +│ 2020-02-01 │ 22696923 │ ███████████▎ │ 3135175 │ ███████████████▋ │ 79521 │ ███████████████████▉ │ +│ 2020-03-01 │ 3466677 │ █▋ │ 987960 │ ████▉ │ 40901 │ ██████████▏ │ +└──────────────┴───────────┴───────────────────────────┴─────────┴───────────────────────────┴────────────┴───────────────────────────┘ + +172 rows in set. Elapsed: 184.809 sec. Processed 6.74 billion rows, 89.56 GB (36.47 million rows/s., 484.62 MB/s.) +``` + +10. Here are the top 10 subreddits of 2022: + +```sql +SELECT + subreddit, + count() AS count +FROM reddit +WHERE toYear(created_utc) = 2022 +GROUP BY subreddit +ORDER BY count DESC +LIMIT 10; +``` + +The response is: + +```response +┌─subreddit────────┬───count─┐ +│ AskReddit │ 3858203 │ +│ politics │ 1356782 │ +│ memes │ 1249120 │ +│ nfl │ 883667 │ +│ worldnews │ 866065 │ +│ teenagers │ 777095 │ +│ AmItheAsshole │ 752720 │ +│ dankmemes │ 657932 │ +│ nba │ 514184 │ +│ unpopularopinion │ 473649 │ +└──────────────────┴─────────┘ + +10 rows in set. Elapsed: 27.824 sec. Processed 6.74 billion rows, 53.26 GB (242.22 million rows/s., 1.91 GB/s.) +``` + +11. 
Let's see which subreddits had the biggest increase in comments from 2018 to 2019:
+
+```sql
+SELECT
+    subreddit,
+    newcount - oldcount AS diff
+FROM
+(
+    SELECT
+        subreddit,
+        count(*) AS newcount
+    FROM reddit
+    WHERE toYear(created_utc) = 2019
+    GROUP BY subreddit
+)
+ALL INNER JOIN
+(
+    SELECT
+        subreddit,
+        count(*) AS oldcount
+    FROM reddit
+    WHERE toYear(created_utc) = 2018
+    GROUP BY subreddit
+) USING (subreddit)
+ORDER BY diff DESC
+LIMIT 50
+SETTINGS joined_subquery_requires_alias = 0;
+```
+
+It looks like memes and teenagers were busy on Reddit in 2019:
+
+```response
+┌─subreddit────────────┬─────diff─┐
+│ memes                │ 15368369 │
+│ AskReddit            │ 14663662 │
+│ teenagers            │ 12266991 │
+│ AmItheAsshole        │ 11561538 │
+│ dankmemes            │ 11305158 │
+│ unpopularopinion     │  6332772 │
+│ PewdiepieSubmissions │  5930818 │
+│ Market76             │  5014668 │
+│ relationship_advice  │  3776383 │
+│ freefolk             │  3169236 │
+│ Minecraft            │  3160241 │
+│ classicwow           │  2907056 │
+│ Animemes             │  2673398 │
+│ gameofthrones        │  2402835 │
+│ PublicFreakout       │  2267605 │
+│ ShitPostCrusaders    │  2207266 │
+│ RoastMe              │  2195715 │
+│ gonewild             │  2148649 │
+│ AnthemTheGame        │  1803818 │
+│ entitledparents      │  1706270 │
+│ MortalKombat         │  1679508 │
+│ Cringetopia          │  1620555 │
+│ pokemon              │  1615266 │
+│ HistoryMemes         │  1608289 │
+│ Brawlstars           │  1574977 │
+│ iamatotalpieceofshit │  1558315 │
+│ trashy               │  1518549 │
+│ ChapoTrapHouse       │  1505748 │
+│ Pikabu               │  1501001 │
+│ Showerthoughts       │  1475101 │
+│ cursedcomments       │  1465607 │
+│ ukpolitics           │  1386043 │
+│ wallstreetbets       │  1384431 │
+│ interestingasfuck    │  1378900 │
+│ wholesomememes       │  1353333 │
+│ AskOuija             │  1233263 │
+│ borderlands3         │  1197192 │
+│ aww                  │  1168257 │
+│ insanepeoplefacebook │  1155473 │
+│ FortniteCompetitive  │  1122778 │
+│ EpicSeven            │  1117380 │
+│ FreeKarma4U          │  1116423 │
+│ YangForPresidentHQ   │  1086700 │
+│ SquaredCircle        │  1044089 │
+│ MurderedByWords      │  1042511 │
+│ AskMen               │  1024434 │
+│ thedivision          │  1016634 │
+│ barstoolsports       │   985032 │
+│ nfl                  │   978340 │
+│ BattlefieldV         │   971408 │
+└──────────────────────┴──────────┘
+
+50 rows in set. Elapsed: 65.954 sec. Processed 13.48 billion rows, 79.67 GB (204.37 million rows/s., 1.21 GB/s.)
+```
+
+12. One more query: let's compare ClickHouse mentions to other technologies like Snowflake and Postgres.
This query is a big one because it has to search all the comments three times for a substring, and unfortunately ClickHouse users are obviously not very active on Reddit yet:
+
+```sql
+SELECT
+    toStartOfQuarter(created_utc) AS quarter,
+    sum(if(positionCaseInsensitive(body, 'clickhouse') > 0, 1, 0)) AS clickhouse,
+    sum(if(positionCaseInsensitive(body, 'snowflake') > 0, 1, 0)) AS snowflake,
+    sum(if(positionCaseInsensitive(body, 'postgres') > 0, 1, 0)) AS postgres
+FROM reddit
+GROUP BY quarter
+ORDER BY quarter ASC;
+```
+
+```response
+┌────Quarter─┬─clickhouse─┬─snowflake─┬─postgres─┐
+│ 2005-10-01 │          0 │         0 │        0 │
+│ 2006-01-01 │          0 │         2 │       23 │
+│ 2006-04-01 │          0 │         2 │       24 │
+│ 2006-07-01 │          0 │         4 │       13 │
+│ 2006-10-01 │          0 │        23 │       73 │
+│ 2007-01-01 │          0 │        14 │       91 │
+│ 2007-04-01 │          0 │        10 │       59 │
+│ 2007-07-01 │          0 │        39 │      116 │
+│ 2007-10-01 │          0 │        45 │      125 │
+│ 2008-01-01 │          0 │        53 │      234 │
+│ 2008-04-01 │          0 │        79 │      303 │
+│ 2008-07-01 │          0 │       102 │      174 │
+│ 2008-10-01 │          0 │       156 │      323 │
+│ 2009-01-01 │          0 │       206 │      208 │
+│ 2009-04-01 │          0 │       178 │      417 │
+│ 2009-07-01 │          0 │       300 │      295 │
+│ 2009-10-01 │          0 │       633 │      589 │
+│ 2010-01-01 │          0 │       555 │      501 │
+│ 2010-04-01 │          0 │       587 │      469 │
+│ 2010-07-01 │          0 │       770 │      821 │
+│ 2010-10-01 │          0 │      1480 │      550 │
+│ 2011-01-01 │          0 │      1482 │      568 │
+│ 2011-04-01 │          0 │      1558 │      406 │
+│ 2011-07-01 │          0 │      2163 │      628 │
+│ 2011-10-01 │          0 │      4064 │      566 │
+│ 2012-01-01 │          0 │      4621 │      662 │
+│ 2012-04-01 │          0 │      5737 │      785 │
+│ 2012-07-01 │          0 │      6097 │     1127 │
+│ 2012-10-01 │          0 │      7986 │      600 │
+│ 2013-01-01 │          0 │      9704 │      839 │
+│ 2013-04-01 │          0 │      8161 │      853 │
+│ 2013-07-01 │          0 │      9704 │     1028 │
+│ 2013-10-01 │          0 │     12879 │     1404 │
+│ 2014-01-01 │          0 │     12317 │     1548 │
+│ 2014-04-01 │          0 │     13181 │     1577 │
+│ 2014-07-01 │          0 │     15640 │     1710 │
+│ 2014-10-01 │          0 │     19479 │     1959 │
+│ 2015-01-01 │          0 │     20411 │     2104 │
+│ 2015-04-01 │          1 │     20309 │     9112 │
+│ 2015-07-01 │          0 │     20325 │     4771 │
+│ 2015-10-01 │          0 │     25087 │     3030 │
+│ 2016-01-01 │          0 │     23462 │     3126 │
+│ 2016-04-01 │          3 │     25496 │     2757 │
+│ 2016-07-01 │          4 │     28233 │     2928 │
+│ 2016-10-01 │          2 │     45445 │     2449 │
+│ 2017-01-01 │          9 │     76019 │     2808 │
+│ 2017-04-01 │          9 │     67919 │     2803 │
+│ 2017-07-01 │         13 │     68974 │     2771 │
+│ 2017-10-01 │         12 │     69730 │     2906 │
+│ 2018-01-01 │         17 │     67476 │     3152 │
+│ 2018-04-01 │          3 │     67139 │     3986 │
+│ 2018-07-01 │         14 │     67979 │     3609 │
+│ 2018-10-01 │         28 │     74147 │     3850 │
+│ 2019-01-01 │         14 │     80250 │     4305 │
+│ 2019-04-01 │         30 │     70307 │     3872 │
+│ 2019-07-01 │         33 │     77149 │     4164 │
+│ 2019-10-01 │         13 │     76746 │     3541 │
+│ 2020-01-01 │         16 │     54475 │      846 │
+└────────────┴────────────┴───────────┴──────────┘
+
+58 rows in set. Elapsed: 2663.751 sec. Processed 6.74 billion rows, 1.21 TB (2.53 million rows/s., 454.37 MB/s.)
+``` \ No newline at end of file From e2e3a03dbe1e1dc5a4f2d39532bf072c9e6bffa3 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 17 May 2023 22:33:30 +0200 Subject: [PATCH 111/127] Revert "`groupArray` returns cannot be nullable" --- src/AggregateFunctions/AggregateFunctionGroupArray.cpp | 2 +- src/AggregateFunctions/AggregateFunctionNull.cpp | 2 +- tests/queries/0_stateless/00529_orantius.reference | 2 +- tests/queries/0_stateless/01664_array_slice_ubsan.reference | 2 +- .../queries/0_stateless/02713_group_array_nullable.reference | 3 --- tests/queries/0_stateless/02713_group_array_nullable.sql | 5 ----- 6 files changed, 4 insertions(+), 12 deletions(-) delete mode 100644 tests/queries/0_stateless/02713_group_array_nullable.reference delete mode 100644 tests/queries/0_stateless/02713_group_array_nullable.sql diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index 67cfa3f7356..15f500b8bb6 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -121,7 +121,7 @@ AggregateFunctionPtr createAggregateFunctionGroupArraySample( void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory) { - AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = true }; + AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true }; factory.registerFunction("groupArray", { createAggregateFunctionGroupArray, properties }); factory.registerFunction("groupArraySample", { createAggregateFunctionGroupArraySample, properties }); diff --git a/src/AggregateFunctions/AggregateFunctionNull.cpp b/src/AggregateFunctions/AggregateFunctionNull.cpp index 19c66db98cd..3d3d7af3026 100644 --- a/src/AggregateFunctions/AggregateFunctionNull.cpp +++ b/src/AggregateFunctions/AggregateFunctionNull.cpp @@ -72,7 +72,7 @@ public: { /// Currently the only functions that returns not-NULL on all NULL arguments are count and uniq, and they returns UInt64. 
if (properties.returns_default_when_only_null) - return std::make_shared(arguments, params, nested_function->getResultType()); + return std::make_shared(arguments, params, std::make_shared()); else return std::make_shared(arguments, params, std::make_shared(std::make_shared())); } diff --git a/tests/queries/0_stateless/00529_orantius.reference b/tests/queries/0_stateless/00529_orantius.reference index 1deecf44752..865659048cc 100644 --- a/tests/queries/0_stateless/00529_orantius.reference +++ b/tests/queries/0_stateless/00529_orantius.reference @@ -3,7 +3,7 @@ 1 1 [[1],[-1]] -[] +\N 1 42 42 [NULL,'','',NULL] diff --git a/tests/queries/0_stateless/01664_array_slice_ubsan.reference b/tests/queries/0_stateless/01664_array_slice_ubsan.reference index beb31c4a02b..abae410d95c 100644 --- a/tests/queries/0_stateless/01664_array_slice_ubsan.reference +++ b/tests/queries/0_stateless/01664_array_slice_ubsan.reference @@ -1 +1 @@ -['\0','\0','\0'] +[0,0,0] diff --git a/tests/queries/0_stateless/02713_group_array_nullable.reference b/tests/queries/0_stateless/02713_group_array_nullable.reference deleted file mode 100644 index a163ccade3a..00000000000 --- a/tests/queries/0_stateless/02713_group_array_nullable.reference +++ /dev/null @@ -1,3 +0,0 @@ -[1] -[0,1,2,3,4,5,6,7,8,9] -[8,9] diff --git a/tests/queries/0_stateless/02713_group_array_nullable.sql b/tests/queries/0_stateless/02713_group_array_nullable.sql deleted file mode 100644 index a0cb3075252..00000000000 --- a/tests/queries/0_stateless/02713_group_array_nullable.sql +++ /dev/null @@ -1,5 +0,0 @@ -SET aggregate_functions_null_for_empty = 1; - -SELECT groupArray(1); -SELECT groupArray(number) FROM numbers(10); -SELECT groupArrayLast(2)(number) FROM numbers(10); From 612b79868b6a5e9f8c91ddf0caea6f72e97c0320 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Wed, 17 May 2023 20:40:51 +0000 Subject: [PATCH 112/127] test added --- .../0_stateless/02751_protobuf_ipv6.reference | 2 ++ tests/queries/0_stateless/02751_protobuf_ipv6.sh | 14 ++++++++++++++ .../format_schemas/02751_protobuf_ipv6.proto | 6 ++++++ 3 files changed, 22 insertions(+) create mode 100644 tests/queries/0_stateless/02751_protobuf_ipv6.reference create mode 100755 tests/queries/0_stateless/02751_protobuf_ipv6.sh create mode 100644 tests/queries/0_stateless/format_schemas/02751_protobuf_ipv6.proto diff --git a/tests/queries/0_stateless/02751_protobuf_ipv6.reference b/tests/queries/0_stateless/02751_protobuf_ipv6.reference new file mode 100644 index 00000000000..0318b49c77e --- /dev/null +++ b/tests/queries/0_stateless/02751_protobuf_ipv6.reference @@ -0,0 +1,2 @@ +::ffff:1.2.3.4 +::ffff:1.2.3.4 diff --git a/tests/queries/0_stateless/02751_protobuf_ipv6.sh b/tests/queries/0_stateless/02751_protobuf_ipv6.sh new file mode 100755 index 00000000000..ecf565d9db4 --- /dev/null +++ b/tests/queries/0_stateless/02751_protobuf_ipv6.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +SCHEMADIR=$CURDIR/format_schemas + + +echo 121a1000000000000000000000ffff01020304 | xxd -r -p | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02751_protobuf_ipv6:Message" --structure="ipv6_bytes IPv6" -q "select * from table" + +$CLICKHOUSE_LOCAL -q "select '::ffff:1.2.3.4'::IPv6 as ipv6_bytes format Protobuf settings format_schema = '$SCHEMADIR/02751_protobuf_ipv6:Message'" | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02751_protobuf_ipv6:Message" --structure="ipv6_bytes IPv6" -q "select * from table" + diff --git a/tests/queries/0_stateless/format_schemas/02751_protobuf_ipv6.proto b/tests/queries/0_stateless/format_schemas/02751_protobuf_ipv6.proto new file mode 100644 index 00000000000..8e6f115f2d7 --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02751_protobuf_ipv6.proto @@ -0,0 +1,6 @@ +syntax = "proto3"; + +message Message +{ + bytes ipv6_bytes = 3; +} From 855c95f6268b1f31cdadb4b55b4b4ada5db7847a Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 17 May 2023 22:46:09 +0200 Subject: [PATCH 113/127] Update src/Interpreters/Cache/Metadata.cpp Co-authored-by: Igor Nikonov <954088+devcrafter@users.noreply.github.com> --- src/Interpreters/Cache/Metadata.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp index e0b82763a08..843ffd45b63 100644 --- a/src/Interpreters/Cache/Metadata.cpp +++ b/src/Interpreters/Cache/Metadata.cpp @@ -284,7 +284,7 @@ void CacheMetadata::doCleanup() { /// Key prefix directory can become non-empty just now, it is expected. if (e.code() == std::errc::directory_not_empty) - return; + continue; LOG_ERROR(log, "Error while removing key {}: {}", cleanup_key, getCurrentExceptionMessage(true)); chassert(false); } From d294ecbc162e129e6f63a1d9ec4c9c80b994cddb Mon Sep 17 00:00:00 2001 From: libin Date: Thu, 18 May 2023 15:50:19 +0800 Subject: [PATCH 114/127] Update grant.md docs: Modifying grant example --- docs/zh/sql-reference/statements/grant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/sql-reference/statements/grant.md b/docs/zh/sql-reference/statements/grant.md index 12ad2e0fe25..7e7cdbff350 100644 --- a/docs/zh/sql-reference/statements/grant.md +++ b/docs/zh/sql-reference/statements/grant.md @@ -55,7 +55,7 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION 同样 `john` 有权执行 `GRANT OPTION`,因此他能给其它账号进行和自己账号权限范围相同的授权。 -可以使用`*` 号代替表或库名进行授权操作。例如, `GRANT SELECT ONdb.* TO john` 操作运行 `john`对 `db`库的所有表执行 `SELECT`查询。同样,你可以忽略库名。在这种情形下,权限将指向当前的数据库。例如, `GRANT SELECT ON* to john` 对当前数据库的所有表指定授权, `GARNT SELECT ON mytable to john`对当前数据库的 `mytable`表进行授权。 +可以使用`*` 号代替表或库名进行授权操作。例如, `GRANT SELECT ONdb.* TO john` 操作运行 `john`对 `db`库的所有表执行 `SELECT`查询。同样,你可以忽略库名。在这种情形下,权限将指向当前的数据库。例如, `GRANT SELECT ON* to john` 对当前数据库的所有表指定授权, `GRANT SELECT ON mytable to john`对当前数据库的 `mytable`表进行授权。 访问 `systen`数据库总是被允许的(因为这个数据库用来处理sql操作) 可以一次给多个账号进行多种授权操作。 `GRANT SELECT,INSERT ON *.* TO john,robin` 允许 `john`和`robin` 账号对任意数据库的任意表执行 `INSERT`和 `SELECT`操作。 From f98c337d2f9580fa65c8d21de447ab6e8fe3d781 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 18 May 2023 14:53:46 +0200 Subject: [PATCH 115/127] Fix stack-use-after-scope in resource manager test (#49908) * Fix stack-use-after-scope in resource manager test * fix --- .../gtest_resource_manager_hierarchical.cpp | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 
deletions(-) diff --git a/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp b/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp index b113da31d59..43773559f03 100644 --- a/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp +++ b/src/IO/Resource/tests/gtest_resource_manager_hierarchical.cpp @@ -47,9 +47,18 @@ TEST(IOResourceDynamicResourceManager, Smoke) TEST(IOResourceDynamicResourceManager, Fairness) { - constexpr size_t T = 3; // threads per queue - int N = 100; // requests per thread - ResourceTest t(2 * T + 1); + // Total cost for A and B cannot differ for more than 1 (every request has cost equal to 1). + // Requests from A use `value = 1` and from B `value = -1` is used. + std::atomic unfairness = 0; + auto fairness_diff = [&] (Int64 value) + { + Int64 cur_unfairness = unfairness.fetch_add(value, std::memory_order_relaxed) + value; + EXPECT_NEAR(cur_unfairness, 0, 1); + }; + + constexpr size_t threads_per_queue = 3; + int requests_per_thread = 100; + ResourceTest t(2 * threads_per_queue + 1); t.update(R"CONFIG( @@ -70,24 +79,14 @@ TEST(IOResourceDynamicResourceManager, Fairness) )CONFIG"); - - // Total cost for A and B cannot differ for more than 1 (every request has cost equal to 1). - // Requests from A use `value = 1` and from B `value = -1` is used. - std::atomic unfairness = 0; - auto fairness_diff = [&] (Int64 value) - { - Int64 cur_unfairness = unfairness.fetch_add(value, std::memory_order_relaxed) + value; - EXPECT_NEAR(cur_unfairness, 0, 1); - }; - - for (int thr = 0; thr < T; thr++) + for (int thread = 0; thread < threads_per_queue; thread++) { t.threads.emplace_back([&] { ClassifierPtr c = t.manager->acquire("A"); ResourceLink link = c->get("res1"); - t.startBusyPeriod(link, 1, N); - for (int req = 0; req < N; req++) + t.startBusyPeriod(link, 1, requests_per_thread); + for (int request = 0; request < requests_per_thread; request++) { TestGuard g(t, link, 1); fairness_diff(1); @@ -95,14 +94,14 @@ TEST(IOResourceDynamicResourceManager, Fairness) }); } - for (int thr = 0; thr < T; thr++) + for (int thread = 0; thread < threads_per_queue; thread++) { t.threads.emplace_back([&] { ClassifierPtr c = t.manager->acquire("B"); ResourceLink link = c->get("res1"); - t.startBusyPeriod(link, 1, N); - for (int req = 0; req < N; req++) + t.startBusyPeriod(link, 1, requests_per_thread); + for (int request = 0; request < requests_per_thread; request++) { TestGuard g(t, link, 1); fairness_diff(-1); From 30083351f5769d781a2dacccc8c259e846518956 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Thu, 18 May 2023 14:42:48 +0000 Subject: [PATCH 116/127] test fix --- tests/queries/0_stateless/02751_protobuf_ipv6.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02751_protobuf_ipv6.sh b/tests/queries/0_stateless/02751_protobuf_ipv6.sh index ecf565d9db4..f93963aa6c6 100755 --- a/tests/queries/0_stateless/02751_protobuf_ipv6.sh +++ b/tests/queries/0_stateless/02751_protobuf_ipv6.sh @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) SCHEMADIR=$CURDIR/format_schemas -echo 121a1000000000000000000000ffff01020304 | xxd -r -p | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02751_protobuf_ipv6:Message" --structure="ipv6_bytes IPv6" -q "select * from table" +echo -ne '\x12\x1a\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x01\x02\x03\x04' | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02751_protobuf_ipv6:Message" --structure="ipv6_bytes IPv6" -q "select 
* from table" $CLICKHOUSE_LOCAL -q "select '::ffff:1.2.3.4'::IPv6 as ipv6_bytes format Protobuf settings format_schema = '$SCHEMADIR/02751_protobuf_ipv6:Message'" | $CLICKHOUSE_LOCAL --input-format Protobuf --format_schema="$SCHEMADIR/02751_protobuf_ipv6:Message" --structure="ipv6_bytes IPv6" -q "select * from table" From 6b4dcbd3ed5f3f00322b86cf82780509f93ea038 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 18 May 2023 23:23:39 +0800 Subject: [PATCH 117/127] Use PROJECT_*_DIR instead of CMAKE_*_DIR. --- CMakeLists.txt | 8 ++++---- cmake/add_check.cmake | 4 ++-- cmake/git.cmake | 10 +++++----- cmake/print_flags.cmake | 6 +++--- cmake/sanitize.cmake | 6 +++--- cmake/version.cmake | 2 +- contrib/avro-cmake/CMakeLists.txt | 2 +- contrib/cassandra-cmake/CMakeLists.txt | 2 +- contrib/cctz-cmake/CMakeLists.txt | 2 +- contrib/libuv-cmake/CMakeLists.txt | 4 ++-- contrib/mariadb-connector-c-cmake/CMakeLists.txt | 2 +- contrib/snappy-cmake/CMakeLists.txt | 2 +- contrib/zlib-ng-cmake/CMakeLists.txt | 2 +- programs/self-extracting/CMakeLists.txt | 6 +++--- src/CMakeLists.txt | 2 +- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26188cb7110..56bf3e1c3f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,8 +259,8 @@ endif () option (ENABLE_BUILD_PATH_MAPPING "Enable remapping of file source paths in debug info, predefined preprocessor macros, and __builtin_FILE(). It's used to generate reproducible builds. See https://reproducible-builds.org/docs/build-path" ${ENABLE_BUILD_PATH_MAPPING_DEFAULT}) if (ENABLE_BUILD_PATH_MAPPING) - set (COMPILER_FLAGS "${COMPILER_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.") - set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.") + set (COMPILER_FLAGS "${COMPILER_FLAGS} -ffile-prefix-map=${PROJECT_SOURCE_DIR}=.") + set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ffile-prefix-map=${PROJECT_SOURCE_DIR}=.") endif () option (ENABLE_BUILD_PROFILING "Enable profiling of build time" OFF) @@ -557,7 +557,7 @@ if (NATIVE_BUILD_TARGETS ) message (STATUS "Building native targets...") - set (NATIVE_BUILD_DIR "${CMAKE_BINARY_DIR}/native") + set (NATIVE_BUILD_DIR "${PROJECT_BINARY_DIR}/native") execute_process( COMMAND ${CMAKE_COMMAND} -E make_directory "${NATIVE_BUILD_DIR}" @@ -571,7 +571,7 @@ if (NATIVE_BUILD_TARGETS # Avoid overriding .cargo/config.toml with native toolchain. 
"-DENABLE_RUST=OFF" "-DENABLE_CLICKHOUSE_SELF_EXTRACTING=${ENABLE_CLICKHOUSE_SELF_EXTRACTING}" - ${CMAKE_SOURCE_DIR} + ${PROJECT_SOURCE_DIR} WORKING_DIRECTORY "${NATIVE_BUILD_DIR}" COMMAND_ECHO STDOUT) diff --git a/cmake/add_check.cmake b/cmake/add_check.cmake index c6abbcdb321..ba30ee8676f 100644 --- a/cmake/add_check.cmake +++ b/cmake/add_check.cmake @@ -5,11 +5,11 @@ if (NOT TARGET check) if (CMAKE_CONFIGURATION_TYPES) add_custom_target (check COMMAND ${CMAKE_CTEST_COMMAND} --force-new-ctest-process --output-on-failure --build-config "$" - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) else () add_custom_target (check COMMAND ${CMAKE_CTEST_COMMAND} --force-new-ctest-process --output-on-failure - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) endif () endif () diff --git a/cmake/git.cmake b/cmake/git.cmake index 397ec3cd081..a4b3bd4bdab 100644 --- a/cmake/git.cmake +++ b/cmake/git.cmake @@ -5,14 +5,14 @@ if (Git_FOUND) # Commit hash + whether the building workspace was dirty or not execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GIT_HASH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # Branch name execute_process(COMMAND "${GIT_EXECUTABLE}" rev-parse --abbrev-ref HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -20,14 +20,14 @@ if (Git_FOUND) SET(ENV{TZ} "UTC") execute_process(COMMAND "${GIT_EXECUTABLE}" log -1 --format=%ad --date=iso-local - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GIT_DATE ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) # Subject of the commit execute_process(COMMAND "${GIT_EXECUTABLE}" log -1 --format=%s - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_SUBJECT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) @@ -35,7 +35,7 @@ if (Git_FOUND) execute_process( COMMAND ${GIT_EXECUTABLE} status - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} OUTPUT_STRIP_TRAILING_WHITESPACE) + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_STRIP_TRAILING_WHITESPACE) else() message(STATUS "Git could not be found.") endif() diff --git a/cmake/print_flags.cmake b/cmake/print_flags.cmake index 66f2a8bfbc7..869764602d4 100644 --- a/cmake/print_flags.cmake +++ b/cmake/print_flags.cmake @@ -7,6 +7,6 @@ message (STATUS "compiler CXX = ${CMAKE_CXX_COMPILER} ${FULL_CXX_FLAGS}") message (STATUS "LINKER_FLAGS = ${FULL_EXE_LINKER_FLAGS}") # Reproducible builds -string (REPLACE "${CMAKE_SOURCE_DIR}" "." FULL_C_FLAGS_NORMALIZED "${FULL_C_FLAGS}") -string (REPLACE "${CMAKE_SOURCE_DIR}" "." FULL_CXX_FLAGS_NORMALIZED "${FULL_CXX_FLAGS}") -string (REPLACE "${CMAKE_SOURCE_DIR}" "." FULL_EXE_LINKER_FLAGS_NORMALIZED "${FULL_EXE_LINKER_FLAGS}") +string (REPLACE "${PROJECT_SOURCE_DIR}" "." FULL_C_FLAGS_NORMALIZED "${FULL_C_FLAGS}") +string (REPLACE "${PROJECT_SOURCE_DIR}" "." FULL_CXX_FLAGS_NORMALIZED "${FULL_CXX_FLAGS}") +string (REPLACE "${PROJECT_SOURCE_DIR}" "." FULL_EXE_LINKER_FLAGS_NORMALIZED "${FULL_EXE_LINKER_FLAGS}") diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index b2fbdb256fd..17ce8a7db29 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -29,14 +29,14 @@ if (SANITIZE) # Linking can fail due to relocation overflows (see #49145), caused by too big object files / libraries. 
# Work around this with position-independent builds (-fPIC and -fpie), this is slightly slower than non-PIC/PIE but that's okay. - set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-use-after-dtor -fsanitize-memory-track-origins -fno-optimize-sibling-calls -fPIC -fpie -fsanitize-blacklist=${CMAKE_SOURCE_DIR}/tests/msan_suppressions.txt") + set (MSAN_FLAGS "-fsanitize=memory -fsanitize-memory-use-after-dtor -fsanitize-memory-track-origins -fno-optimize-sibling-calls -fPIC -fpie -fsanitize-blacklist=${PROJECT_SOURCE_DIR}/tests/msan_suppressions.txt") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${MSAN_FLAGS}") elseif (SANITIZE STREQUAL "thread") set (TSAN_FLAGS "-fsanitize=thread") if (COMPILER_CLANG) - set (TSAN_FLAGS "${TSAN_FLAGS} -fsanitize-blacklist=${CMAKE_SOURCE_DIR}/tests/tsan_suppressions.txt") + set (TSAN_FLAGS "${TSAN_FLAGS} -fsanitize-blacklist=${PROJECT_SOURCE_DIR}/tests/tsan_suppressions.txt") endif() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${TSAN_FLAGS}") @@ -54,7 +54,7 @@ if (SANITIZE) set(UBSAN_FLAGS "${UBSAN_FLAGS} -fno-sanitize=unsigned-integer-overflow") endif() if (COMPILER_CLANG) - set (UBSAN_FLAGS "${UBSAN_FLAGS} -fsanitize-blacklist=${CMAKE_SOURCE_DIR}/tests/ubsan_suppressions.txt") + set (UBSAN_FLAGS "${UBSAN_FLAGS} -fsanitize-blacklist=${PROJECT_SOURCE_DIR}/tests/ubsan_suppressions.txt") endif() set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${UBSAN_FLAGS}") diff --git a/cmake/version.cmake b/cmake/version.cmake index acaa772ff2f..9ca21556f4d 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -1,4 +1,4 @@ -include(${CMAKE_SOURCE_DIR}/cmake/autogenerated_versions.txt) +include(${PROJECT_SOURCE_DIR}/cmake/autogenerated_versions.txt) set(VERSION_EXTRA "" CACHE STRING "") set(VERSION_TWEAK "" CACHE STRING "") diff --git a/contrib/avro-cmake/CMakeLists.txt b/contrib/avro-cmake/CMakeLists.txt index 25474650d0e..63b3854eef9 100644 --- a/contrib/avro-cmake/CMakeLists.txt +++ b/contrib/avro-cmake/CMakeLists.txt @@ -6,7 +6,7 @@ if (NOT ENABLE_AVRO) return() endif() -set(AVROCPP_ROOT_DIR "${CMAKE_SOURCE_DIR}/contrib/avro/lang/c++") +set(AVROCPP_ROOT_DIR "${PROJECT_SOURCE_DIR}/contrib/avro/lang/c++") set(AVROCPP_INCLUDE_DIR "${AVROCPP_ROOT_DIR}/api") set(AVROCPP_SOURCE_DIR "${AVROCPP_ROOT_DIR}/impl") diff --git a/contrib/cassandra-cmake/CMakeLists.txt b/contrib/cassandra-cmake/CMakeLists.txt index 59ff908b63a..32611e0e151 100644 --- a/contrib/cassandra-cmake/CMakeLists.txt +++ b/contrib/cassandra-cmake/CMakeLists.txt @@ -18,7 +18,7 @@ endif() # Need to use C++17 since the compilation is not possible with C++20 currently. 
set (CMAKE_CXX_STANDARD 17) -set(CASS_ROOT_DIR ${CMAKE_SOURCE_DIR}/contrib/cassandra) +set(CASS_ROOT_DIR ${PROJECT_SOURCE_DIR}/contrib/cassandra) set(CASS_SRC_DIR "${CASS_ROOT_DIR}/src") set(CASS_INCLUDE_DIR "${CASS_ROOT_DIR}/include") diff --git a/contrib/cctz-cmake/CMakeLists.txt b/contrib/cctz-cmake/CMakeLists.txt index f1ef9b53f7d..10070fbd949 100644 --- a/contrib/cctz-cmake/CMakeLists.txt +++ b/contrib/cctz-cmake/CMakeLists.txt @@ -26,7 +26,7 @@ endif () # StorageSystemTimeZones.generated.cpp is autogenerated each time during a build # data in this file will be used to populate the system.time_zones table, this is specific to OS_LINUX # as the library that's built using embedded tzdata is also specific to OS_LINUX -set(SYSTEM_STORAGE_TZ_FILE "${CMAKE_BINARY_DIR}/src/Storages/System/StorageSystemTimeZones.generated.cpp") +set(SYSTEM_STORAGE_TZ_FILE "${PROJECT_BINARY_DIR}/src/Storages/System/StorageSystemTimeZones.generated.cpp") # remove existing copies so that its generated fresh on each build. file(REMOVE ${SYSTEM_STORAGE_TZ_FILE}) diff --git a/contrib/libuv-cmake/CMakeLists.txt b/contrib/libuv-cmake/CMakeLists.txt index fb88799ed38..928fdcdd7e6 100644 --- a/contrib/libuv-cmake/CMakeLists.txt +++ b/contrib/libuv-cmake/CMakeLists.txt @@ -1,7 +1,7 @@ # This file is a modified version of contrib/libuv/CMakeLists.txt -set (SOURCE_DIR "${CMAKE_SOURCE_DIR}/contrib/libuv") -set (BINARY_DIR "${CMAKE_BINARY_DIR}/contrib/libuv") +set (SOURCE_DIR "${PROJECT_SOURCE_DIR}/contrib/libuv") +set (BINARY_DIR "${PROJECT_BINARY_DIR}/contrib/libuv") set(uv_sources src/fs-poll.c diff --git a/contrib/mariadb-connector-c-cmake/CMakeLists.txt b/contrib/mariadb-connector-c-cmake/CMakeLists.txt index 50287c54ac1..18d1510a57b 100644 --- a/contrib/mariadb-connector-c-cmake/CMakeLists.txt +++ b/contrib/mariadb-connector-c-cmake/CMakeLists.txt @@ -15,7 +15,7 @@ endif() # This is the LGPL libmariadb project. 
-set(CC_SOURCE_DIR ${CMAKE_SOURCE_DIR}/contrib/mariadb-connector-c) +set(CC_SOURCE_DIR ${PROJECT_SOURCE_DIR}/contrib/mariadb-connector-c) set(CC_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(WITH_SSL ON) diff --git a/contrib/snappy-cmake/CMakeLists.txt b/contrib/snappy-cmake/CMakeLists.txt index 50cdc8732a1..f406de0e343 100644 --- a/contrib/snappy-cmake/CMakeLists.txt +++ b/contrib/snappy-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -set (SOURCE_DIR "${CMAKE_SOURCE_DIR}/contrib/snappy") +set (SOURCE_DIR "${PROJECT_SOURCE_DIR}/contrib/snappy") if (ARCH_S390X) set (SNAPPY_IS_BIG_ENDIAN 1) diff --git a/contrib/zlib-ng-cmake/CMakeLists.txt b/contrib/zlib-ng-cmake/CMakeLists.txt index aa067ba37e0..79f343bfc75 100644 --- a/contrib/zlib-ng-cmake/CMakeLists.txt +++ b/contrib/zlib-ng-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -set (SOURCE_DIR ${CMAKE_SOURCE_DIR}/contrib/zlib-ng) +set (SOURCE_DIR ${PROJECT_SOURCE_DIR}/contrib/zlib-ng) add_definitions(-DZLIB_COMPAT) add_definitions(-DWITH_GZFILEOP) diff --git a/programs/self-extracting/CMakeLists.txt b/programs/self-extracting/CMakeLists.txt index 2cc26926b38..f3ff0bbcd78 100644 --- a/programs/self-extracting/CMakeLists.txt +++ b/programs/self-extracting/CMakeLists.txt @@ -4,10 +4,10 @@ if (NOT( AND CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL CMAKE_SYSTEM_PROCESSOR ) ) - set (COMPRESSOR "${CMAKE_BINARY_DIR}/native/utils/self-extracting-executable/pre_compressor") - set (DECOMPRESSOR "--decompressor=${CMAKE_BINARY_DIR}/utils/self-extracting-executable/decompressor") + set (COMPRESSOR "${PROJECT_BINARY_DIR}/native/utils/self-extracting-executable/pre_compressor") + set (DECOMPRESSOR "--decompressor=${PROJECT_BINARY_DIR}/utils/self-extracting-executable/decompressor") else () - set (COMPRESSOR "${CMAKE_BINARY_DIR}/utils/self-extracting-executable/compressor") + set (COMPRESSOR "${PROJECT_BINARY_DIR}/utils/self-extracting-executable/compressor") endif () add_custom_target (self-extracting ALL diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3f4fbb7420..87a2979ecd1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -527,7 +527,7 @@ target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::fast_float) if (USE_ORC) dbms_target_link_libraries(PUBLIC ${ORC_LIBRARIES}) - dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ORC_INCLUDE_DIR} "${CMAKE_BINARY_DIR}/contrib/orc/c++/include") + dbms_target_include_directories(SYSTEM BEFORE PUBLIC ${ORC_INCLUDE_DIR} "${PROJECT_BINARY_DIR}/contrib/orc/c++/include") endif () if (TARGET ch_contrib::rocksdb) From 73661c3a4635398de2d4783f0e2c17fe59265258 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 May 2023 18:18:30 +0200 Subject: [PATCH 118/127] Move tunnings for woboq codebrowser to cmake out from build.sh Signed-off-by: Azat Khuzhin --- CMakeLists.txt | 6 ++++++ docker/test/codebrowser/build.sh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26188cb7110..54d1ae2bfcb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,12 @@ if (ENABLE_FUZZING) set (ENABLE_PROTOBUF 1) endif() +option (ENABLE_WOBOQ_CODEBROWSER "Build for woboq codebrowser" OFF) + +if (ENABLE_WOBOQ_CODEBROWSER) + set (ENABLE_EMBEDDED_COMPILER 0) +endif() + # Global libraries # See: # - default_libs.cmake diff --git a/docker/test/codebrowser/build.sh b/docker/test/codebrowser/build.sh index 5ab9de5a453..d76d0c3a039 100755 --- a/docker/test/codebrowser/build.sh +++ b/docker/test/codebrowser/build.sh @@ -15,7 +15,7 @@ nproc=$(($(nproc) + 2)) # increase parallelism read -ra 
CMAKE_FLAGS <<< "${CMAKE_FLAGS:-}" mkdir -p "$BUILD_DIRECTORY" && cd "$BUILD_DIRECTORY" -cmake "$SOURCE_DIRECTORY" -DCMAKE_CXX_COMPILER="/usr/bin/clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="/usr/bin/clang-${LLVM_VERSION}" -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DENABLE_EMBEDDED_COMPILER=0 "${CMAKE_FLAGS[@]}" +cmake "$SOURCE_DIRECTORY" -DCMAKE_CXX_COMPILER="/usr/bin/clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="/usr/bin/clang-${LLVM_VERSION}" -DENABLE_WOBOQ_CODEBROWSER=ON "${CMAKE_FLAGS[@]}" mkdir -p "$HTML_RESULT_DIRECTORY" echo 'Filter out too noisy "Error: filename" lines and keep them in full codebrowser_generator.log' /woboq_codebrowser/generator/codebrowser_generator -b "$BUILD_DIRECTORY" -a \ From 0f7a310a6775728fd71c4784c5e7a8e776338ca5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 May 2023 18:21:19 +0200 Subject: [PATCH 119/127] Fix woboq codebrowser build with -Wno-poison-system-directories woboq codebrowser uses clang tooling, which adds clang system includes (in Linux::AddClangSystemIncludeArgs()), because none of (-nostdinc, -nobuiltininc) is set. And later it will complain with -Wpoison-system-directories for added by itself includes in InitHeaderSearch::AddUnmappedPath(), because they are starts from one of the following: - /usr/include - /usr/local/include The interesting thing here is that it got broken only after upgrading to llvm 16 (in #49678), and the reason for this is that clang 15 build has system includes that does not trigger the warning - "/usr/lib/clang/15.0.7/include", while clang 16 has "/usr/include/clang/16.0.4/include" So let's simply disable this warning, but only for woboq. Signed-off-by: Azat Khuzhin --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 54d1ae2bfcb..ef3f34204b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,11 @@ option (ENABLE_WOBOQ_CODEBROWSER "Build for woboq codebrowser" OFF) if (ENABLE_WOBOQ_CODEBROWSER) set (ENABLE_EMBEDDED_COMPILER 0) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-poison-system-directories") + # woboq codebrowser uses clang tooling, and they could add default system + # clang includes, and later clang will warn for those added by itself + # includes. 
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-poison-system-directories") endif() # Global libraries From e7b6056bbbab56b2fe8d0e9f6243bba355744f31 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 18 May 2023 15:18:55 -0300 Subject: [PATCH 120/127] test for #46128 --- ...with_short_circuit_functins_mutations.reference | 3 +++ ...ality_with_short_circuit_functins_mutations.sql | 14 ++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.reference create mode 100644 tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.sql diff --git a/tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.reference b/tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.reference new file mode 100644 index 00000000000..9a6e97d4503 --- /dev/null +++ b/tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.reference @@ -0,0 +1,3 @@ +0 xxxx yyyy +1 yyyy yyyy +2 xxxx yyyy diff --git a/tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.sql b/tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.sql new file mode 100644 index 00000000000..9d183dde91d --- /dev/null +++ b/tests/queries/0_stateless/02481_low_cardinality_with_short_circuit_functins_mutations.sql @@ -0,0 +1,14 @@ +drop table if exists issue_46128; + +create table issue_46128 ( + id Int64, + a LowCardinality(Nullable(String)), + b LowCardinality(Nullable(String)) +) Engine = MergeTree order by id +as select number%100, 'xxxx', 'yyyy' from numbers(10); + +ALTER TABLE issue_46128 UPDATE a = b WHERE id= 1 settings mutations_sync=2; + +select * from issue_46128 where id <= 2 order by id; + +drop table issue_46128; From 8dc59c1efea2dfaa56ec5a0728a362714e2ab206 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 18 May 2023 21:40:20 +0000 Subject: [PATCH 121/127] Fix test_insert_same_partition_and_merge failing if one Azure request attempt fails --- src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp | 2 +- tests/integration/test_merge_tree_azure_blob_storage/test.py | 4 ++-- tests/integration/test_merge_tree_s3/test.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index f8ca6b9ab07..44185f74f60 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -51,7 +51,7 @@ void WriteBufferFromAzureBlobStorage::execWithRetry(std::function func, if (i == num_tries - 1) throw; - LOG_DEBUG(log, "Write at attempt {} for blob `{}` failed: {}", i + 1, blob_path, e.Message); + LOG_DEBUG(log, "Write at attempt {} for blob `{}` failed: {} {}", i + 1, blob_path, e.what(), e.Message); }; for (size_t i = 0; i < num_tries; ++i) diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index bcb62c3181d..8bf4df17c39 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -203,7 +203,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical): node.query(f"SYSTEM START MERGES {TABLE_NAME}") # Wait for merges and old parts deletion - for attempt in range(0, 10): + for attempt in range(0, 60): parts_count = azure_query( node, 
f"SELECT COUNT(*) FROM system.parts WHERE table = '{TABLE_NAME}' FORMAT Values", @@ -211,7 +211,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical): if parts_count == "(1)": break - if attempt == 9: + if attempt == 59: assert parts_count == "(1)" time.sleep(1) diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 7fbe8c8e99b..ee774f6632b 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -232,7 +232,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical, node_name): node.query("SYSTEM START MERGES s3_test") # Wait for merges and old parts deletion - for attempt in range(0, 10): + for attempt in range(0, 60): parts_count = node.query( "SELECT COUNT(*) FROM system.parts WHERE table = 's3_test' and active = 1 FORMAT Values" ) @@ -240,7 +240,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical, node_name): if parts_count == "(1)": break - if attempt == 9: + if attempt == 59: assert parts_count == "(1)" time.sleep(1) From e84f0895e72f25c7200993a1fd8ac8e11bd7f9b9 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 16 May 2023 19:27:30 +0000 Subject: [PATCH 122/127] Support hardlinking parts transactionally --- .../MergeTree/DataPartStorageOnDiskBase.cpp | 27 +++++++--- .../MergeTree/DataPartStorageOnDiskBase.h | 3 +- src/Storages/MergeTree/IDataPartStorage.h | 8 ++- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/MergeTree/localBackup.cpp | 50 ++++++++++++++----- src/Storages/MergeTree/localBackup.h | 8 +-- tests/integration/test_grpc_protocol/test.py | 2 +- .../test_grpc_protocol_ssl/test.py | 2 +- tests/integration/test_server_reload/test.py | 2 +- 9 files changed, 76 insertions(+), 28 deletions(-) diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index ebe55ea7dc7..cfc3ff58f81 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -411,25 +411,38 @@ MutableDataPartStoragePtr DataPartStorageOnDiskBase::freeze( bool make_source_readonly, std::function save_metadata_callback, bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks) const + const NameSet & files_to_copy_instead_of_hardlinks, + DiskTransactionPtr external_transaction) const { auto disk = volume->getDisk(); - disk->createDirectories(to); + if (external_transaction) + external_transaction->createDirectories(to); + else + disk->createDirectories(to); - localBackup(disk, getRelativePath(), fs::path(to) / dir_path, make_source_readonly, {}, copy_instead_of_hardlink, files_to_copy_instead_of_hardlinks); + localBackup(disk, getRelativePath(), fs::path(to) / dir_path, make_source_readonly, {}, copy_instead_of_hardlink, files_to_copy_instead_of_hardlinks, external_transaction); if (save_metadata_callback) save_metadata_callback(disk); - disk->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); - disk->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); - disk->removeFileIfExists(fs::path(to) / dir_path / IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); + if (external_transaction) + { + external_transaction->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); + external_transaction->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); + external_transaction->removeFileIfExists(fs::path(to) / dir_path / 
IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); + } + else + { + disk->removeFileIfExists(fs::path(to) / dir_path / "delete-on-destroy.txt"); + disk->removeFileIfExists(fs::path(to) / dir_path / "txn_version.txt"); + disk->removeFileIfExists(fs::path(to) / dir_path / IMergeTreeDataPart::METADATA_VERSION_FILE_NAME); + } auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); /// Do not initialize storage in case of DETACH because part may be broken. bool to_detached = dir_path.starts_with("detached/"); - return create(single_disk_volume, to, dir_path, /*initialize=*/ !to_detached); + return create(single_disk_volume, to, dir_path, /*initialize=*/ !to_detached && !external_transaction); } MutableDataPartStoragePtr DataPartStorageOnDiskBase::clonePart( diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 09eb7f008bc..6b27b7296fc 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -64,7 +64,8 @@ public: bool make_source_readonly, std::function save_metadata_callback, bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks) const override; + const NameSet & files_to_copy_instead_of_hardlinks, + DiskTransactionPtr external_transaction) const override; MutableDataPartStoragePtr clonePart( const std::string & to, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 7c85469d890..f160254350d 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace DB { @@ -212,13 +213,18 @@ public: /// implementation which relies on paths of some blobs in S3. For example if we want to hardlink /// the whole part during mutation we shouldn't hardlink checksums.txt, because otherwise /// zero-copy locks for different parts will be on the same path in zookeeper. + /// + /// If `external_transaction` is provided, the disk operations (creating directories, hardlinking, + /// etc) won't be applied immediately; instead, they'll be added to external_transaction, which the + /// caller then needs to commit. virtual std::shared_ptr freeze( const std::string & to, const std::string & dir_path, bool make_source_readonly, std::function save_metadata_callback, bool copy_instead_of_hardlink, - const NameSet & files_to_copy_instead_of_hardlinks) const = 0; + const NameSet & files_to_copy_instead_of_hardlinks, + DiskTransactionPtr external_transaction = nullptr) const = 0; /// Make a full copy of a data part into 'to/dir_path' (possibly to a different disk). virtual std::shared_ptr clonePart( diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index ff40c1da8d1..388d96314c0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -161,7 +161,7 @@ public: void remove(); /// Initialize columns (from columns.txt if exists, or create from column files if not). - /// Load checksums from checksums.txt if exists. Load index if required. + /// Load various metadata into memory: checksums from checksums.txt, index if required, etc. 
void loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency); void appendFilesOfColumnsChecksumsIndexes(Strings & files, bool include_projection = false) const; diff --git a/src/Storages/MergeTree/localBackup.cpp b/src/Storages/MergeTree/localBackup.cpp index 3b05e3df8d3..6faacf3c066 100644 --- a/src/Storages/MergeTree/localBackup.cpp +++ b/src/Storages/MergeTree/localBackup.cpp @@ -17,9 +17,10 @@ namespace { void localBackupImpl( - const DiskPtr & disk, const String & source_path, + const DiskPtr & disk, IDiskTransaction * transaction, const String & source_path, const String & destination_path, bool make_source_readonly, size_t level, - std::optional max_level, const NameSet & files_to_copy_instead_of_hardlinks) + std::optional max_level, bool copy_instead_of_hardlinks, + const NameSet & files_to_copy_instead_of_hardlinks) { if (max_level && level > *max_level) return; @@ -27,7 +28,10 @@ void localBackupImpl( if (level >= 1000) throw DB::Exception(DB::ErrorCodes::TOO_DEEP_RECURSION, "Too deep recursion"); - disk->createDirectories(destination_path); + if (transaction) + transaction->createDirectories(destination_path); + else + disk->createDirectories(destination_path); for (auto it = disk->iterateDirectory(source_path); it->isValid(); it->next()) { @@ -37,15 +41,36 @@ void localBackupImpl( if (!disk->isDirectory(source)) { if (make_source_readonly) - disk->setReadOnly(source); - if (files_to_copy_instead_of_hardlinks.contains(it->name())) - disk->copyFile(source, *disk, destination); + { + if (transaction) + transaction->setReadOnly(source); + else + disk->setReadOnly(source); + } + if (copy_instead_of_hardlinks || files_to_copy_instead_of_hardlinks.contains(it->name())) + { + if (transaction) + { + transaction->copyFile(source, destination); + } + else + { + disk->copyFile(source, *disk, destination); + } + } else - disk->createHardLink(source, destination); + { + if (transaction) + transaction->createHardLink(source, destination); + else + disk->createHardLink(source, destination); + } } else { - localBackupImpl(disk, source, destination, make_source_readonly, level + 1, max_level, files_to_copy_instead_of_hardlinks); + localBackupImpl( + disk, transaction, source, destination, make_source_readonly, level + 1, max_level, + copy_instead_of_hardlinks, files_to_copy_instead_of_hardlinks); } } } @@ -89,7 +114,7 @@ private: void localBackup( const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, - std::optional max_level, bool copy_instead_of_hardlinks, const NameSet & files_to_copy_intead_of_hardlinks) + std::optional max_level, bool copy_instead_of_hardlinks, const NameSet & files_to_copy_intead_of_hardlinks, DiskTransactionPtr disk_transaction) { if (disk->exists(destination_path) && !disk->isDirectoryEmpty(destination_path)) { @@ -100,7 +125,8 @@ void localBackup( size_t try_no = 0; const size_t max_tries = 10; - CleanupOnFail cleanup([disk, destination_path]() { disk->removeRecursive(destination_path); }); + CleanupOnFail cleanup(disk_transaction ? std::function([]{}) : + [disk, destination_path]() { disk->removeRecursive(destination_path); }); /** Files in the directory can be permanently added and deleted. 
* If some file is deleted during an attempt to make a backup, then try again, @@ -110,10 +136,10 @@ void localBackup( { try { - if (copy_instead_of_hardlinks) + if (copy_instead_of_hardlinks && !disk_transaction) disk->copyDirectoryContent(source_path, disk, destination_path); else - localBackupImpl(disk, source_path, destination_path, make_source_readonly, 0, max_level, files_to_copy_intead_of_hardlinks); + localBackupImpl(disk, disk_transaction.get(), source_path, destination_path, make_source_readonly, 0, max_level, copy_instead_of_hardlinks, files_to_copy_intead_of_hardlinks); } catch (const DB::ErrnoException & e) { diff --git a/src/Storages/MergeTree/localBackup.h b/src/Storages/MergeTree/localBackup.h index 74b188daff6..89906bf1d75 100644 --- a/src/Storages/MergeTree/localBackup.h +++ b/src/Storages/MergeTree/localBackup.h @@ -9,7 +9,7 @@ namespace DB /** Creates a local (at the same mount point) backup (snapshot) directory. * - * In the specified destination directory, it creates a hard links on all source-directory files + * In the specified destination directory, it creates hard links on all source-directory files * and in all nested directories, with saving (creating) all relative paths; * and also `chown`, removing the write permission. * @@ -17,9 +17,11 @@ namespace DB * and is intended to be used as a simple means of protection against a human or program error, * but not from a hardware failure. * - * If max_level is specified, than only files which depth relative source_path less or equal max_level will be copied. + * If max_level is specified, than only files with depth relative source_path less or equal max_level will be copied. * So, if max_level=0 than only direct file child are copied. + * + * If `transaction` is provided, the changes will be added to it instead of performend on disk. 
*/ - void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional max_level = {}, bool copy_instead_of_hardlinks = false, const NameSet & files_to_copy_intead_of_hardlinks = {}); + void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional max_level = {}, bool copy_instead_of_hardlinks = false, const NameSet & files_to_copy_intead_of_hardlinks = {}, DiskTransactionPtr disk_transaction = nullptr); } diff --git a/tests/integration/test_grpc_protocol/test.py b/tests/integration/test_grpc_protocol/test.py index 254b78667d5..dead4d447ec 100644 --- a/tests/integration/test_grpc_protocol/test.py +++ b/tests/integration/test_grpc_protocol/test.py @@ -42,7 +42,7 @@ node = cluster.add_instance( main_configs=["configs/grpc_config.xml"], # Bug in TSAN reproduces in this test https://github.com/grpc/grpc/issues/29550#issuecomment-1188085387 env_variables={ - "TSAN_OPTIONS": "report_atomic_races=0 " + os.getenv("TSAN_OPTIONS") + "TSAN_OPTIONS": "report_atomic_races=0 " + os.getenv("TSAN_OPTIONS", default="") }, ) main_channel = None diff --git a/tests/integration/test_grpc_protocol_ssl/test.py b/tests/integration/test_grpc_protocol_ssl/test.py index f1a4475c1a5..4c7fe99f49d 100644 --- a/tests/integration/test_grpc_protocol_ssl/test.py +++ b/tests/integration/test_grpc_protocol_ssl/test.py @@ -44,7 +44,7 @@ node = cluster.add_instance( ], # Bug in TSAN reproduces in this test https://github.com/grpc/grpc/issues/29550#issuecomment-1188085387 env_variables={ - "TSAN_OPTIONS": "report_atomic_races=0 " + os.getenv("TSAN_OPTIONS") + "TSAN_OPTIONS": "report_atomic_races=0 " + os.getenv("TSAN_OPTIONS", default="") }, ) diff --git a/tests/integration/test_server_reload/test.py b/tests/integration/test_server_reload/test.py index 1429713cb84..da5208799ac 100644 --- a/tests/integration/test_server_reload/test.py +++ b/tests/integration/test_server_reload/test.py @@ -36,7 +36,7 @@ instance = cluster.add_instance( with_zookeeper=True, # Bug in TSAN reproduces in this test https://github.com/grpc/grpc/issues/29550#issuecomment-1188085387 env_variables={ - "TSAN_OPTIONS": "report_atomic_races=0 " + os.getenv("TSAN_OPTIONS") + "TSAN_OPTIONS": "report_atomic_races=0 " + os.getenv("TSAN_OPTIONS", default="") }, ) From e37e8f83bb28dc752c0264cee52dcbb7c4f24352 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 16 May 2023 15:57:26 +0200 Subject: [PATCH 123/127] Fix flakiness of test_distributed_load_balancing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I saw the following in the logs for the failed test: 2023.05.16 07:12:12.894051 [ 262 ] {74575ac0-b296-4fdc-bc8e-3476a305e6ea} ConnectionPoolWithFailover: Connection failed at try №1, reason: Timeout exceeded while reading from socket (socket (172.16.3.2:9000), receive timeout 2000 ms) And I think that the culprit is the test_distributed_replica_max_ignored_errors for which it is normal, however not for others, and this should not affect other tests. So fix this by calling SYSTEM RELOAD CONFIG, which should reset error count. 
CI: https://s3.amazonaws.com/clickhouse-test-reports/49380/5abc1a1c68ee204c9024493be1d19835cf5630f7/integration_tests__release__[3_4].html Signed-off-by: Azat Khuzhin --- .../test_distributed_load_balancing/test.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_distributed_load_balancing/test.py b/tests/integration/test_distributed_load_balancing/test.py index 1dba6a30bc4..271828f433e 100644 --- a/tests/integration/test_distributed_load_balancing/test.py +++ b/tests/integration/test_distributed_load_balancing/test.py @@ -29,23 +29,19 @@ nodes = len(cluster.instances) queries = nodes * 10 +# SYSTEM RELOAD CONFIG will reset some attributes of the nodes in cluster +# - error_count +# - last_used (round_robing) +# +# This is required to avoid interference results of one test to another +@pytest.fixture(scope="function", autouse=True) +def test_setup(): + for n in list(cluster.instances.values()): + n.query("SYSTEM RELOAD CONFIG") + + def bootstrap(): for n in list(cluster.instances.values()): - # At startup, server loads configuration files. - # - # However ConfigReloader does not know about already loaded files - # (files is empty()), hence it will always reload the configuration - # just after server starts (+ 2 seconds, reload timeout). - # - # And on configuration reload the clusters will be re-created, so some - # internal stuff will be reset: - # - error_count - # - last_used (round_robing) - # - # And if the reload will happen during round_robin test it will start - # querying from the beginning, so let's issue config reload just after - # start to avoid reload in the middle of the test execution. - n.query("SYSTEM RELOAD CONFIG") n.query("DROP TABLE IF EXISTS data") n.query("DROP TABLE IF EXISTS dist") n.query("CREATE TABLE data (key Int) Engine=Memory()") From dc353faf44c466cd89391a50076979d6f933e1d1 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 16 May 2023 15:59:22 +0200 Subject: [PATCH 124/127] Simplify obtaining query shard in test_distributed_load_balancing Signed-off-by: Azat Khuzhin --- .../test_distributed_load_balancing/test.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_distributed_load_balancing/test.py b/tests/integration/test_distributed_load_balancing/test.py index 271828f433e..e879f09ccc1 100644 --- a/tests/integration/test_distributed_load_balancing/test.py +++ b/tests/integration/test_distributed_load_balancing/test.py @@ -109,19 +109,14 @@ def get_node(query_node, table="dist", *args, **kwargs): rows = query_node.query( """ - SELECT c.host_name - FROM ( - SELECT _shard_num - FROM cluster(shards_cluster, system.query_log) - WHERE - initial_query_id = '{query_id}' AND - is_initial_query = 0 AND - type = 'QueryFinish' - ORDER BY event_date DESC, event_time DESC - LIMIT 1 - ) a - JOIN system.clusters c - ON a._shard_num = c.shard_num WHERE cluster = 'shards_cluster' + SELECT hostName() + FROM cluster(shards_cluster, system.query_log) + WHERE + initial_query_id = '{query_id}' AND + is_initial_query = 0 AND + type = 'QueryFinish' + ORDER BY event_date DESC, event_time DESC + LIMIT 1 """.format( query_id=query_id ) From 55fc4adf055324cc9b359e57a8056025f744a317 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 May 2023 16:42:15 +0300 Subject: [PATCH 125/127] Update 02441_alter_delete_and_drop_column.sql --- tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql index b9b1b645e8e..9c4697362df 100644 --- a/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql +++ b/tests/queries/0_stateless/02441_alter_delete_and_drop_column.sql @@ -1,3 +1,4 @@ +-- Tags: no-replicated-database create table mut (n int, m int, k int) engine=ReplicatedMergeTree('/test/02441/{database}/mut', '1') order by n; set insert_keeper_fault_injection_probability=0; From 5237dd02451cb23d4101c0cecfe8bf133292103e Mon Sep 17 00:00:00 2001 From: mateng915 Date: Fri, 19 May 2023 22:06:43 +0800 Subject: [PATCH 126/127] New system table zookeeper connection (#45245) * Feature: Support new system table to show which zookeeper node be connected Description: ============ Currently we have no place to check which zk node be connected otherwise using lsof command. It not convenient Solution: ========= Implemented a new system table, system.zookeeper_host when CK Server has zk this table will show the zk node dir which connected by current CK server Noted: This table can support multi-zookeeper cluster scenario. * fixed review comments * added test case * update test cases * remove unused code * fixed review comments and removed unused code * updated test cases for print host, port and is_expired * modify the code comments * fixed CI Failed * fixed code style check failure * updated test cases by added Tags * update test reference * update test cases * added system.zookeeper_connection doc * Update docs/en/operations/system-tables/zookeeper_connection.md * Update docs/en/operations/system-tables/zookeeper_connection.md * Update docs/en/operations/system-tables/zookeeper_connection.md --------- Co-authored-by: Alexander Tokmakov --- .../system-tables/zookeeper_connection.md | 29 +++++++++++ src/Common/ZooKeeper/IKeeper.h | 2 + src/Common/ZooKeeper/TestKeeper.h | 3 ++ src/Common/ZooKeeper/ZooKeeper.cpp | 20 +++++++ src/Common/ZooKeeper/ZooKeeper.h | 8 +++ src/Common/ZooKeeper/ZooKeeperImpl.cpp | 4 ++ src/Common/ZooKeeper/ZooKeeperImpl.h | 3 ++ src/Interpreters/Context.cpp | 11 ++++ src/Interpreters/Context.h | 8 ++- .../StorageSystemZooKeeperConnection.cpp | 52 +++++++++++++++++++ .../System/StorageSystemZooKeeperConnection.h | 28 ++++++++++ src/Storages/System/attachSystemTables.cpp | 4 ++ ...2735_system_zookeeper_connection.reference | 1 + .../02735_system_zookeeper_connection.sql | 13 +++++ 14 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 docs/en/operations/system-tables/zookeeper_connection.md create mode 100644 src/Storages/System/StorageSystemZooKeeperConnection.cpp create mode 100644 src/Storages/System/StorageSystemZooKeeperConnection.h create mode 100644 tests/queries/0_stateless/02735_system_zookeeper_connection.reference create mode 100644 tests/queries/0_stateless/02735_system_zookeeper_connection.sql diff --git a/docs/en/operations/system-tables/zookeeper_connection.md b/docs/en/operations/system-tables/zookeeper_connection.md new file mode 100644 index 00000000000..9438cda1808 --- /dev/null +++ b/docs/en/operations/system-tables/zookeeper_connection.md @@ -0,0 +1,29 @@ +--- +slug: /en/operations/system-tables/zookeeper_connection +--- +#zookeeper_connection + +This table does not exist if ZooKeeper is not configured. The 'system.zookeeper_connection' table shows current connections to ZooKeeper (including auxiliary ZooKeepers). Each row shows information about one connection. 
+ +Columns: + +- `name` ([String](../../sql-reference/data-types/string.md)) — ZooKeeper cluster's name. +- `host` ([String](../../sql-reference/data-types/string.md)) — The hostname/IP of the ZooKeeper node that ClickHouse connected to. +- `port` ([String](../../sql-reference/data-types/string.md)) — The port of the ZooKeeper node that ClickHouse connected to. +- `index` ([UInt8](../../sql-reference/data-types/int-uint.md)) — The index of the ZooKeeper node that ClickHouse connected to. The index is from ZooKeeper config. +- `connected_time` ([String](../../sql-reference/data-types/string.md)) — When the connection was established +- `is_expired` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Is the current connection expired. +- `keeper_api_version` ([String](../../sql-reference/data-types/string.md)) — Keeper API version. +- `client_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Session id of the connection. + +Example: + +``` sql +SELECT * FROM system.zookeeper_connection; +``` + +``` text +┌─name──────────────┬─host─────────┬─port─┬─index─┬──────connected_time─┬─is_expired─┬─keeper_api_version─┬──────────client_id─┐ +│ default_zookeeper │ 127.0.0.1 │ 2181 │ 0 │ 2023-05-19 14:30:16 │ 0 │ 0 │ 216349144108826660 │ +└───────────────────┴──────────────┴──────┴───────┴─────────────────────┴────────────┴────────────────────┴────────────────────┘ +``` diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index b09f096d761..86f9a388644 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -466,6 +466,8 @@ public: /// Useful to check owner of ephemeral node. virtual int64_t getSessionID() const = 0; + virtual String getConnectedAddress() const = 0; + /// If the method will throw an exception, callbacks won't be called. 
/// /// After the method is executed successfully, you must wait for callbacks diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index 27405d8d571..11e56daf6b4 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -39,6 +39,7 @@ public: bool isExpired() const override { return expired; } int64_t getSessionID() const override { return 0; } + String getConnectedAddress() const override { return connected_zk_address; } void create( @@ -126,6 +127,8 @@ private: zkutil::ZooKeeperArgs args; + String connected_zk_address; + std::mutex push_request_mutex; std::atomic expired{false}; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 09047b5b232..c423e4fd498 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -111,6 +111,26 @@ void ZooKeeper::init(ZooKeeperArgs args_) LOG_TRACE(log, "Initialized, hosts: {}", fmt::join(args.hosts, ",")); else LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(args.hosts, ","), args.chroot); + + String address = impl->getConnectedAddress(); + + size_t colon_pos = address.find(':'); + connected_zk_host = address.substr(0, colon_pos); + connected_zk_port = address.substr(colon_pos + 1); + + connected_zk_index = 0; + + if (args.hosts.size() > 1) + { + for (size_t i = 0; i < args.hosts.size(); i++) + { + if (args.hosts[i] == address) + { + connected_zk_index = i; + break; + } + } + } } else if (args.implementation == "testkeeper") { diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index ca6a44c4cbc..9b85938c726 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -523,6 +523,10 @@ public: void setServerCompletelyStarted(); + String getConnectedZooKeeperHost() const { return connected_zk_host; } + String getConnectedZooKeeperPort() const { return connected_zk_port; } + size_t getConnectedZooKeeperIndex() const { return connected_zk_index; } + private: void init(ZooKeeperArgs args_); @@ -586,6 +590,10 @@ private: ZooKeeperArgs args; + String connected_zk_host; + String connected_zk_port; + size_t connected_zk_index; + std::mutex mutex; Poco::Logger * log = nullptr; diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 6c79fc4f178..34be8aa1332 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -433,6 +433,8 @@ void ZooKeeper::connect( } connected = true; + connected_zk_address = node.address.toString(); + break; } catch (...) @@ -448,6 +450,8 @@ void ZooKeeper::connect( if (!connected) { WriteBufferFromOwnString message; + connected_zk_address = ""; + message << "All connection tries failed while connecting to ZooKeeper. nodes: "; bool first = true; for (const auto & node : nodes) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index c0c57d3f719..6715607ca88 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -125,6 +125,8 @@ public: /// Useful to check owner of ephemeral node. 
int64_t getSessionID() const override { return session_id; } + String getConnectedAddress() const override { return connected_zk_address; } + void executeGenericRequest( const ZooKeeperRequestPtr & request, ResponseCallback callback); @@ -201,6 +203,7 @@ public: private: ACLs default_acls; + String connected_zk_address; zkutil::ZooKeeperArgs args; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 0ef9ea53ee8..d9f450191bc 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2778,6 +2778,17 @@ zkutil::ZooKeeperPtr Context::getAuxiliaryZooKeeper(const String & name) const return zookeeper->second; } + +std::map Context::getAuxiliaryZooKeepers() const +{ + std::lock_guard lock(shared->auxiliary_zookeepers_mutex); + + if (!shared->auxiliary_zookeepers.empty()) + return shared->auxiliary_zookeepers; + else + return std::map(); +} + #if USE_ROCKSDB MergeTreeMetadataCachePtr Context::getMergeTreeMetadataCache() const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 87843a458e8..15f2ff625ef 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -32,7 +32,11 @@ namespace Poco::Net { class IPAddress; } -namespace zkutil { class ZooKeeper; } +namespace zkutil +{ + class ZooKeeper; + using ZooKeeperPtr = std::shared_ptr; +} struct OvercommitTracker; @@ -827,6 +831,8 @@ public: std::shared_ptr getZooKeeper() const; /// Same as above but return a zookeeper connection from auxiliary_zookeepers configuration entry. std::shared_ptr getAuxiliaryZooKeeper(const String & name) const; + /// return Auxiliary Zookeeper map + std::map getAuxiliaryZooKeepers() const; /// Try to connect to Keeper using get(Auxiliary)ZooKeeper. Useful for /// internal Keeper start (check connection to some other node). 
Return true diff --git a/src/Storages/System/StorageSystemZooKeeperConnection.cpp b/src/Storages/System/StorageSystemZooKeeperConnection.cpp new file mode 100644 index 00000000000..f249097654e --- /dev/null +++ b/src/Storages/System/StorageSystemZooKeeperConnection.cpp @@ -0,0 +1,52 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +NamesAndTypesList StorageSystemZooKeeperConnection::getNamesAndTypes() +{ + return { + {"name", std::make_shared()}, + {"host", std::make_shared()}, + {"port", std::make_shared()}, + {"index", std::make_shared()}, + {"connected_time", std::make_shared()}, + {"is_expired", std::make_shared()}, + {"keeper_api_version", std::make_shared()}, + {"client_id", std::make_shared()} + }; +} + +void StorageSystemZooKeeperConnection::fillData(MutableColumns & res_columns, ContextPtr context, + const SelectQueryInfo &) const +{ + res_columns[0]->insert("default_zookeeper"); + res_columns[1]->insert(context->getZooKeeper()->getConnectedZooKeeperHost()); + res_columns[2]->insert(context->getZooKeeper()->getConnectedZooKeeperPort()); + res_columns[3]->insert(context->getZooKeeper()->getConnectedZooKeeperIndex()); + res_columns[4]->insert(context->getZooKeeperSessionUptime()); + res_columns[5]->insert(context->getZooKeeper()->expired()); + res_columns[6]->insert(context->getZooKeeper()->getApiVersion()); + res_columns[7]->insert(context->getZooKeeper()->getClientID()); + + for (const auto & elem : context->getAuxiliaryZooKeepers()) + { + res_columns[0]->insert(elem.first); + res_columns[1]->insert(elem.second->getConnectedZooKeeperHost()); + res_columns[1]->insert(elem.second->getConnectedZooKeeperHost()); + res_columns[2]->insert(elem.second->getConnectedZooKeeperPort()); + res_columns[3]->insert(elem.second->getConnectedZooKeeperIndex()); + res_columns[4]->insert(elem.second->getSessionUptime()); + res_columns[5]->insert(elem.second->expired()); + res_columns[6]->insert(elem.second->getApiVersion()); + res_columns[7]->insert(elem.second->getClientID()); + } + +} + +} diff --git a/src/Storages/System/StorageSystemZooKeeperConnection.h b/src/Storages/System/StorageSystemZooKeeperConnection.h new file mode 100644 index 00000000000..dd4c293c112 --- /dev/null +++ b/src/Storages/System/StorageSystemZooKeeperConnection.h @@ -0,0 +1,28 @@ +#pragma once + +#include + + +namespace DB +{ + +class Context; + + +/** Implements `zookeeper_connection` system table, which allows you to get information about the connected zookeeper info. 
+ */ +class StorageSystemZooKeeperConnection final : public IStorageSystemOneBlock +{ +public: + std::string getName() const override { return "SystemZooKeeperConnection"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: + using IStorageSystemOneBlock::IStorageSystemOneBlock; + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} + diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index d6982ba30d5..424c74662ec 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -80,6 +80,7 @@ #include #include #include +#include #ifdef OS_LINUX #include @@ -186,7 +187,10 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b attach(context, system_database, "named_collections"); if (has_zookeeper) + { attach(context, system_database, "zookeeper"); + attach(context, system_database, "zookeeper_connection"); + } if (context->getConfigRef().getInt("allow_experimental_transactions", 0)) attach(context, system_database, "transactions"); diff --git a/tests/queries/0_stateless/02735_system_zookeeper_connection.reference b/tests/queries/0_stateless/02735_system_zookeeper_connection.reference new file mode 100644 index 00000000000..c9cc8adede8 --- /dev/null +++ b/tests/queries/0_stateless/02735_system_zookeeper_connection.reference @@ -0,0 +1 @@ +[ :1]:9181 0 diff --git a/tests/queries/0_stateless/02735_system_zookeeper_connection.sql b/tests/queries/0_stateless/02735_system_zookeeper_connection.sql new file mode 100644 index 00000000000..10f12177b2e --- /dev/null +++ b/tests/queries/0_stateless/02735_system_zookeeper_connection.sql @@ -0,0 +1,13 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS test_zk_connection_table; + +CREATE TABLE test_zk_connection_table ( + key UInt64 +) +ENGINE ReplicatedMergeTree('/clickhouse/{database}/02731_zk_connection/{shard}', '{replica}') +ORDER BY tuple(); + +select host, port, is_expired from system.zookeeper_connection where name='default_zookeeper'; + +DROP TABLE IF EXISTS test_zk_connection_table; From 90872c2671f468cef899834d8fb15af6e5896960 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Fri, 19 May 2023 17:22:37 +0200 Subject: [PATCH 127/127] Update query_log.md Fix links to data types --- docs/en/operations/system-tables/query_log.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/system-tables/query_log.md b/docs/en/operations/system-tables/query_log.md index 42247e6fba2..1bcecfeb161 100644 --- a/docs/en/operations/system-tables/query_log.md +++ b/docs/en/operations/system-tables/query_log.md @@ -97,8 +97,8 @@ Columns: - `forwarded_for` ([String](../../sql-reference/data-types/string.md)) — HTTP header `X-Forwarded-For` passed in the HTTP query. - `quota_key` ([String](../../sql-reference/data-types/string.md)) — The `quota key` specified in the [quotas](../../operations/quotas.md) setting (see `keyed`). - `revision` ([UInt32](../../sql-reference/data-types/int-uint.md)) — ClickHouse revision. -- `ProfileEvents` ([Map(String, UInt64)](../../sql-reference/data-types/array.md)) — ProfileEvents that measure different metrics. The description of them could be found in the table [system.events](../../operations/system-tables/events.md#system_tables-events) -- `Settings` ([Map(String, String)](../../sql-reference/data-types/array.md)) — Settings that were changed when the client ran the query. 
To enable logging changes to settings, set the `log_query_settings` parameter to 1. +- `ProfileEvents` ([Map(String, UInt64)](../../sql-reference/data-types/map.md)) — ProfileEvents that measure different metrics. The description of them could be found in the table [system.events](../../operations/system-tables/events.md#system_tables-events) +- `Settings` ([Map(String, String)](../../sql-reference/data-types/map.md)) — Settings that were changed when the client ran the query. To enable logging changes to settings, set the `log_query_settings` parameter to 1. - `log_comment` ([String](../../sql-reference/data-types/string.md)) — Log comment. It can be set to arbitrary string no longer than [max_query_size](../../operations/settings/settings.md#settings-max_query_size). An empty string if it is not defined. - `thread_ids` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — Thread ids that are participating in query execution. - `used_aggregate_functions` ([Array(String)](../../sql-reference/data-types/array.md)) — Canonical names of `aggregate functions`, which were used during query execution.
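For illustration only (not part of the patch): a query of roughly this shape reads the map-typed columns documented above. The `log_comment` value is a hypothetical tag, and the example assumes query logging is enabled (`log_queries = 1`) so that `system.query_log` is populated.

```sql
-- Read profile events and changed settings of recently finished queries.
-- ProfileEvents is Map(String, UInt64) and Settings is Map(String, String),
-- so individual entries are accessed with the [] operator.
SELECT
    query_id,
    ProfileEvents['SelectedRows'] AS selected_rows,
    Settings['max_threads'] AS max_threads,
    thread_ids,
    used_aggregate_functions
FROM system.query_log
WHERE type = 'QueryFinish'
  AND log_comment = 'my-tagged-query'  -- hypothetical value set earlier via SET log_comment = '...'
ORDER BY event_time DESC
LIMIT 5;
```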