From c7402c2a3998974fb5e9bf3fde2c609595d6a584 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 20:10:14 -0400 Subject: [PATCH 01/94] [IO] Version ID provided during buffer read The Version ID, if provided, is to be set in the request to read the buffer from S3. If not provided the latest version of the object must be retrieved. Version ID set to last parameter to allow defaulting and backward compatibility with calls. --- src/IO/ReadBufferFromS3.cpp | 38 +++++++++++++++++++++++++++++++++---- src/IO/ReadBufferFromS3.h | 2 ++ 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 93bbe02c9cd..18144950f31 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -39,6 +39,7 @@ ReadBufferFromS3::ReadBufferFromS3( std::shared_ptr client_ptr_, const String & bucket_, const String & key_, + const String & version_id_, UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, @@ -48,6 +49,7 @@ ReadBufferFromS3::ReadBufferFromS3( , client_ptr(std::move(client_ptr_)) , bucket(bucket_) , key(key_) + , version_id(version_id_) , max_single_read_retries(max_single_read_retries_) , read_settings(settings_) , use_external_buffer(use_external_buffer_) @@ -125,8 +127,15 @@ bool ReadBufferFromS3::nextImpl() ProfileEvents::increment(ProfileEvents::S3ReadMicroseconds, watch.elapsedMicroseconds()); ProfileEvents::increment(ProfileEvents::S3ReadRequestsErrors, 1); - LOG_DEBUG(log, "Caught exception while reading S3 object. Bucket: {}, Key: {}, Offset: {}, Attempt: {}, Message: {}", - bucket, key, getPosition(), attempt, e.message()); + LOG_DEBUG( + log, + "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}", + bucket, + key, + version_id.empty() ? "Latest" : version_id, + getPosition(), + attempt, + e.message()); if (attempt + 1 == max_single_read_retries) throw; @@ -213,6 +222,10 @@ std::optional ReadBufferFromS3::getTotalSize() Aws::S3::Model::HeadObjectRequest request; request.SetBucket(bucket); request.SetKey(key); + if (!version_id.empty()) + { + request.SetVersionId(version_id); + } auto outcome = client_ptr->HeadObject(request); auto head_result = outcome.GetResultWithOwnership(); @@ -239,6 +252,10 @@ std::unique_ptr ReadBufferFromS3::initialize() Aws::S3::Model::GetObjectRequest req; req.SetBucket(bucket); req.SetKey(key); + if (!version_id.empty()) + { + req.SetVersionId(version_id); + } /** * If remote_filesystem_read_method = 'threadpool', then for MergeTree family tables @@ -250,13 +267,26 @@ std::unique_ptr ReadBufferFromS3::initialize() throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); req.SetRange(fmt::format("bytes={}-{}", offset, read_until_position - 1)); - LOG_TEST(log, "Read S3 object. Bucket: {}, Key: {}, Range: {}-{}", bucket, key, offset, read_until_position - 1); + LOG_TEST( + log, + "Read S3 object. Bucket: {}, Key: {}, Version: {}, Range: {}-{}", + bucket, + key, + version_id.empty() ? "Latest" : version_id, + offset, + read_until_position - 1); } else { if (offset) req.SetRange(fmt::format("bytes={}-", offset)); - LOG_TEST(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset); + LOG_TEST( + log, + "Read S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}", + bucket, + key, + version_id.empty() ? 
"Latest" : version_id, + offset); } Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req); diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 157b6d46b6d..51b11e1c8c3 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -29,6 +29,7 @@ private: std::shared_ptr client_ptr; String bucket; String key; + String version_id; UInt64 max_single_read_retries; off_t offset = 0; @@ -42,6 +43,7 @@ public: std::shared_ptr client_ptr_, const String & bucket_, const String & key_, + const String & version_id_, UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer = false, From f299bf46e24003576d9793b8f0e14baa41f86770 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 20:11:20 -0400 Subject: [PATCH 02/94] [Disks] readSchemaVersion supplies empty Version ID Empty version ID supplied to retrieve version ID of object. --- src/Disks/S3/DiskS3.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index de63f3ed82f..ead449c6a7c 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -402,6 +402,7 @@ int DiskS3::readSchemaVersion(const String & source_bucket, const String & sourc ReadBufferFromS3 buffer( settings->client, source_bucket, + "" /* version_id empty and to be extract */, source_path + SCHEMA_VERSION_OBJECT, settings->s3_max_single_read_retries, context->getReadSettings()); From e3a96393a1171101294a9b462f6195538c0b4f26 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 21:25:00 -0400 Subject: [PATCH 03/94] [Disks] Version ID added. Version ID added to constructor. --- src/Disks/S3/DiskS3.cpp | 2 ++ src/Disks/S3/DiskS3.h | 3 +++ 2 files changed, 5 insertions(+) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index ead449c6a7c..7580bb9f874 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -151,6 +151,7 @@ void logIfError(const Aws::Utils::Outcome & response, Fnthread_pool_size) , bucket(std::move(bucket_)) + , version_id(std::move(version_id_)) , current_settings(std::move(settings_)) , settings_getter(settings_getter_) , context(context_) diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 2de1600d906..664cb05d66b 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -72,6 +72,7 @@ public: DiskS3( String name_, String bucket_, + String version_id_, String s3_root_path_, DiskPtr metadata_disk_, FileCachePtr cache_, @@ -163,6 +164,8 @@ private: const String bucket; + const String version_id; + MultiVersion current_settings; /// Gets disk settings from context. GetDiskSettings settings_getter; From 98b2994352b542a62c01f00a39366365987ad75d Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 20:18:10 -0400 Subject: [PATCH 04/94] [Disks] ReadBufferFromS3 requires a version id. 
--- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- src/Disks/IO/ReadBufferFromRemoteFSGather.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 8f91804bbbe..12c800d2e4e 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -43,7 +43,7 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S auto remote_file_reader_creator = [=, this]() { return std::make_unique( - client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, + client_ptr, bucket, version_id, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, settings, /* use_external_buffer */true, read_until_position, /* restricted_seek */true); }; diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index 25bfe0b7e16..aa874aaccb0 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -104,12 +104,14 @@ public: const String & path_, std::shared_ptr client_ptr_, const String & bucket_, + const String & version_id_, IDiskRemote::Metadata metadata_, size_t max_single_read_retries_, const ReadSettings & settings_) : ReadBufferFromRemoteFSGather(metadata_, settings_, path_) , client_ptr(std::move(client_ptr_)) , bucket(bucket_) + , version_id(version_id_) , max_single_read_retries(max_single_read_retries_) { } @@ -119,6 +121,7 @@ public: private: std::shared_ptr client_ptr; String bucket; + String version_id; UInt64 max_single_read_retries; }; #endif From e181207f2d3d592b48cd7fb7ed3b0de7a9bab611 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 20:19:30 -0400 Subject: [PATCH 05/94] [Disks] Version ID parameter added to readFile --- src/Disks/S3/DiskS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 7580bb9f874..723b62222ef 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -231,7 +231,7 @@ std::unique_ptr DiskS3::readFile(const String & path, co disk_read_settings.remote_fs_cache = cache; auto s3_impl = std::make_unique( - path, settings->client, bucket, metadata, + path, settings->client, bucket, version_id, metadata, settings->s3_max_single_read_retries, disk_read_settings); if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) From 171260f40bfa16e623b16299d1cd815ebbb81f10 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 22:03:58 -0400 Subject: [PATCH 06/94] [Storages] Added Version ID parameter for S3. 
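StorageS3 now carries the version ID parsed from the table URI down to its read
buffers. ReadBufferFromS3 (PATCH 01) only sets the version on the S3 request when the
string is non-empty, so passing an empty client_auth.uri.version_id preserves the old
behaviour of reading the latest object version:

    Aws::S3::Model::GetObjectRequest req;
    req.SetBucket(bucket);
    req.SetKey(key);
    /// With no VersionId set, S3 serves the latest version of the object.
    if (!version_id.empty())
        req.SetVersionId(version_id);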
--- src/Storages/StorageS3.cpp | 9 +++++++-- src/Storages/StorageS3.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index ec506ad0cd0..f9a88d135a7 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -274,7 +274,7 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + std::make_unique(client, bucket, version_id, current_key, max_single_read_retries, getContext()->getReadSettings()), chooseCompressionMethod(current_key, compression_hint)); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -927,7 +927,12 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl( return wrapReadBufferWithCompressionMethod( std::make_unique( - client_auth.client, client_auth.uri.bucket, current_key, max_single_read_retries, ctx->getReadSettings()), + client_auth.client, + client_auth.uri.bucket, + client_auth.uri.version_id, + current_key, + max_single_read_retries, + ctx->getReadSettings()), chooseCompressionMethod(current_key, compression_method)); }; diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index b2283687e2b..e1c1807976c 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -85,6 +85,7 @@ public: private: String name; String bucket; + String version_id; String file_path; String format; ColumnsDescription columns_desc; From fea8824489f343b0e5238f31ef93809897ac5f2e Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 22:06:09 -0400 Subject: [PATCH 07/94] [Disks] Added Version ID parameter to registerDisksS3. --- src/Disks/S3/registerDiskS3.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index 2b5fe3c5a81..12a3c1cf96e 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -192,6 +192,7 @@ void registerDiskS3(DiskFactory & factory) name, uri.bucket, uri.key, + uri.version_id, metadata_disk, std::move(cache), context, From ec28af055f6cb3a07eceb1d4703a697476ee305a Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 22:11:18 -0400 Subject: [PATCH 08/94] [IO] S3 URI data struct contains Version ID Added Version ID field to S3::URI struct. TODO: Extract version id if present. 
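One possible shape for that extraction (a sketch only; the manual query-string scan
that actually lands in PATCH 09 behaves the same for well-formed URIs) is to walk
Poco's decoded query parameters:

    String version_id;
    for (const auto & [key, value] : uri.getQueryParameters())
    {
        /// Like the rfind()-based scan, keep the last versionId occurrence.
        if (key == "versionId")
            version_id = value;
    }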
--- src/IO/S3Common.h | 1 + src/IO/tests/gtest_s3_uri.cpp | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 97cb4f74f90..d4d36df091e 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -66,6 +66,7 @@ struct URI String endpoint; String bucket; String key; + String version_id; String storage_name; bool is_virtual_hosted_style; diff --git a/src/IO/tests/gtest_s3_uri.cpp b/src/IO/tests/gtest_s3_uri.cpp index 7ee72069e57..4e513186730 100644 --- a/src/IO/tests/gtest_s3_uri.cpp +++ b/src/IO/tests/gtest_s3_uri.cpp @@ -20,6 +20,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3.yandexcloud.net", uri.endpoint); ASSERT_EQ("jokserfn", uri.bucket); ASSERT_EQ("", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(true, uri.is_virtual_hosted_style); } { @@ -27,6 +28,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3.yandexcloud.net", uri.endpoint); ASSERT_EQ("jokserfn", uri.bucket); ASSERT_EQ("", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(false, uri.is_virtual_hosted_style); } { @@ -34,6 +36,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://yandexcloud.net", uri.endpoint); ASSERT_EQ("bucket", uri.bucket); ASSERT_EQ("", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(false, uri.is_virtual_hosted_style); } { @@ -41,6 +44,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3.yandexcloud.net", uri.endpoint); ASSERT_EQ("jokserfn", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(true, uri.is_virtual_hosted_style); } { @@ -48,6 +52,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://storage.yandexcloud.net", uri.endpoint); ASSERT_EQ("jokserfn", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(false, uri.is_virtual_hosted_style); } { @@ -55,6 +60,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://cos.ap-beijing.myqcloud.com", uri.endpoint); ASSERT_EQ("bucketname", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(true, uri.is_virtual_hosted_style); } { @@ -62,6 +68,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3.us-east-2.amazonaws.com", uri.endpoint); ASSERT_EQ("bucketname", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(true, uri.is_virtual_hosted_style); } { @@ -69,6 +76,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3.us-east-2.amazonaws.com", uri.endpoint); ASSERT_EQ("bucketname", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(false, uri.is_virtual_hosted_style); } { @@ -76,6 +84,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3-us-east-2.amazonaws.com", uri.endpoint); ASSERT_EQ("bucketname", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(true, uri.is_virtual_hosted_style); } { @@ -83,6 +92,7 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("https://s3-us-east-2.amazonaws.com", uri.endpoint); ASSERT_EQ("bucketname", uri.bucket); ASSERT_EQ("data", uri.key); + ASSERT_EQ("", uri.version_id); ASSERT_EQ(false, uri.is_virtual_hosted_style); } } From 1a85e9b60d0bdecacf5a45de20d31496c5ab68d9 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 22:16:24 -0400 Subject: [PATCH 09/94] [IO] Extracting Version ID in S3::URI Extracting Version ID form query string in URI. 
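The expected behaviour is pinned down by the gtest cases added below, e.g.:

    /// "?versionId=testVersionId"                  -> "testVersionId"
    /// "?firstKey=someKey&versionId=testVersionId" -> "testVersionId"
    /// "?versionId=", "?versionId", no query       -> "" (read latest)
    ASSERT_EQ(
        "testVersionId",
        S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId")).version_id);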
--- src/IO/S3Common.cpp | 22 ++++++++++ src/IO/tests/gtest_s3_uri.cpp | 81 ++++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 59a4dab837b..80852b880a7 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -780,6 +780,28 @@ namespace S3 if (uri.getHost().empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI."); + // Extract object version ID from query string. + { + const String version_key = "versionId"; + const auto query_string = uri.getQuery(); + + auto start = query_string.rfind(version_key); + if (start == std::string::npos) + { + version_id = ""; + } + else + { + start = query_string.find_first_of('=', start); + start = start == std::string::npos ? query_string.length() : start + 1; + + auto end = query_string.find_first_of('&', start); + end = end == std::string::npos ? query_string.length() : end - start; + + version_id = query_string.substr(start, end); + } + } + String name; String endpoint_authority_from_uri; diff --git a/src/IO/tests/gtest_s3_uri.cpp b/src/IO/tests/gtest_s3_uri.cpp index 4e513186730..7908e2809d9 100644 --- a/src/IO/tests/gtest_s3_uri.cpp +++ b/src/IO/tests/gtest_s3_uri.cpp @@ -3,12 +3,79 @@ #if USE_AWS_S3 -# include +#include namespace { using namespace DB; +struct TestCase +{ + S3::URI uri; + String endpoint; + String bucket; + String key; + String version_id; + bool is_virtual_hosted_style; +}; + +const TestCase TestCases[] = { + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?firstKey=someKey&secondKey=anotherKey")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId&anotherKey=someOtherKey")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "testVersionId", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?firstKey=someKey&versionId=testVersionId&anotherKey=someOtherKey")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "testVersionId", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?anotherKey=someOtherKey&versionId=testVersionId")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "testVersionId", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=testVersionId")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "testVersionId", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId=")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId&")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "", + true}, + {S3::URI(Poco::URI("https://bucketname.s3.us-east-2.amazonaws.com/data?versionId")), + "https://s3.us-east-2.amazonaws.com", + "bucketname", + "data", + "", + true}, +}; + class S3UriTest : public testing::TestWithParam { }; @@ -102,6 +169,18 @@ TEST_P(S3UriTest, invalidPatterns) ASSERT_ANY_THROW(S3::URI(Poco::URI(GetParam()))); } +TEST(S3UriTest, versionIdChecks) +{ + for (const auto& test_case : TestCases) + { + ASSERT_EQ(test_case.endpoint, test_case.uri.endpoint); + 
ASSERT_EQ(test_case.bucket, test_case.uri.bucket);
+        ASSERT_EQ(test_case.key, test_case.uri.key);
+        ASSERT_EQ(test_case.version_id, test_case.uri.version_id);
+        ASSERT_EQ(test_case.is_virtual_hosted_style, test_case.uri.is_virtual_hosted_style);
+    }
+}
+
 INSTANTIATE_TEST_SUITE_P(
     S3,
     S3UriTest,

From d9b370cf1def1d37bb68682c1e24f36f990261f2 Mon Sep 17 00:00:00 2001
From: Saad Ur Rahman
Date: Mon, 14 Mar 2022 22:17:50 -0400
Subject: [PATCH 10/94] [IO] S3 URI versionId extraction performance tweak.

---
 src/IO/S3Common.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp
index 80852b880a7..aaf48c38bd6 100644
--- a/src/IO/S3Common.cpp
+++ b/src/IO/S3Common.cpp
@@ -782,7 +782,7 @@ namespace S3
 
     // Extract object version ID from query string.
     {
-        const String version_key = "versionId";
+        const String version_key = "versionId=";
         const auto query_string = uri.getQuery();
 
         auto start = query_string.rfind(version_key);
@@ -792,7 +792,7 @@ namespace S3
         }
         else
         {
-            start = query_string.find_first_of('=', start);
+            start += version_key.length();
             start = start == std::string::npos ? query_string.length() : start + 1;
 
             auto end = query_string.find_first_of('&', start);

From fb450b706815506276b4b83c53f4bc8a542b3382 Mon Sep 17 00:00:00 2001
From: Saad Ur Rahman
Date: Mon, 14 Mar 2022 22:20:02 -0400
Subject: [PATCH 11/94] [Storages] bugfix

Corrected order of parameters by swapping key and versionId.

---
 src/Storages/StorageS3.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index f9a88d135a7..366e69c814b 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -274,7 +274,7 @@ bool StorageS3Source::initialize()
     file_path = fs::path(bucket) / current_key;
 
     read_buf = wrapReadBufferWithCompressionMethod(
-        std::make_unique<ReadBufferFromS3>(client, bucket, version_id, current_key, max_single_read_retries, getContext()->getReadSettings()),
+        std::make_unique<ReadBufferFromS3>(client, bucket, current_key, version_id, max_single_read_retries, getContext()->getReadSettings()),
         chooseCompressionMethod(current_key, compression_hint));
     auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings);
     QueryPipelineBuilder builder;
@@ -929,8 +929,8 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl(
             std::make_unique<ReadBufferFromS3>(
                 client_auth.client,
                 client_auth.uri.bucket,
-                client_auth.uri.version_id,
                 current_key,
+                client_auth.uri.version_id,
                 max_single_read_retries,
                 ctx->getReadSettings()),
             chooseCompressionMethod(current_key, compression_method));

From 679abaafdc4fd0b51df7bf299ea301e52bc109b7 Mon Sep 17 00:00:00 2001
From: Saad Ur Rahman
Date: Mon, 14 Mar 2022 22:25:40 -0400
Subject: [PATCH 12/94] [Storages] bugfix

Extracting Version Id for S3::URI.
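With this, the version ID is threaded end to end: S3::URI parses "?versionId=..."
(PATCHES 08/09), StorageS3::read() hands it to each source, and the source passes it
to its read buffers. Roughly (call-site sketch; unrelated constructor arguments
elided):

    pipes.emplace_back(std::make_shared<StorageS3Source>(
        /* ..., */ compression_method, client_auth.client,
        client_auth.uri.bucket, client_auth.uri.version_id,
        iterator_wrapper));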
--- src/Storages/StorageS3.cpp | 3 +++ src/Storages/StorageS3.h | 1 + 2 files changed, 4 insertions(+) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 366e69c814b..a20c01df091 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -236,11 +236,13 @@ StorageS3Source::StorageS3Source( String compression_hint_, const std::shared_ptr & client_, const String & bucket_, + const String & version_id_, std::shared_ptr file_iterator_) : SourceWithProgress(getHeader(sample_block_, need_path, need_file)) , WithContext(context_) , name(std::move(name_)) , bucket(bucket_) + , version_id(version_id_) , format(format_) , columns_desc(columns_) , max_block_size(max_block_size_) @@ -674,6 +676,7 @@ Pipe StorageS3::read( compression_method, client_auth.client, client_auth.uri.bucket, + client_auth.uri.version_id, iterator_wrapper)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index e1c1807976c..4b4b9050d03 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -74,6 +74,7 @@ public: String compression_hint_, const std::shared_ptr & client_, const String & bucket, + const String & version_id, std::shared_ptr file_iterator_); String getName() const override; From 7b35c79b04670670483a88796041f74c4357a835 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 22:27:58 -0400 Subject: [PATCH 13/94] [IO] versionId extraction simplified. Reducing cognitive load. --- src/IO/S3Common.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index aaf48c38bd6..8d90e516b75 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -782,15 +782,12 @@ namespace S3 // Extract object version ID from query string. { + version_id = ""; const String version_key = "versionId="; const auto query_string = uri.getQuery(); auto start = query_string.rfind(version_key); - if (start == std::string::npos) - { - version_id = ""; - } - else + if (start != std::string::npos) { start += version_key.length(); start = start == std::string::npos ? query_string.length() : start + 1; From 40fbea49f71b3387205a064cfaec78c4b5bf941e Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Mon, 14 Mar 2022 22:37:47 -0400 Subject: [PATCH 14/94] [Disks] bugfix Fixed order of parameters to constructor: - name - bucket - key - versionId - ... --- src/Disks/S3/DiskS3.cpp | 2 +- src/Disks/S3/DiskS3.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 723b62222ef..666b50fd7dd 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -151,8 +151,8 @@ void logIfError(const Aws::Utils::Outcome & response, Fn Date: Mon, 14 Mar 2022 22:42:09 -0400 Subject: [PATCH 15/94] =?UTF-8?q?[Disks]=20bugfix=20=F0=9F=A4=A6=E2=80=8D?= =?UTF-8?q?=E2=99=82=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correcting order of parameters for in . 
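The ReadBufferFromS3 constructor (PATCH 01) takes the key before the version ID, so
DiskS3::readSchemaVersion must pass them in that order:

    ReadBufferFromS3 buffer(
        settings->client,
        source_bucket,
        source_path + SCHEMA_VERSION_OBJECT,   /// the key comes third,
        "" /* version_id empty */,             /// the version ID fourth
        settings->s3_max_single_read_retries,
        context->getReadSettings());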
--- src/Disks/S3/DiskS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 666b50fd7dd..d8986998cfe 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -404,8 +404,8 @@ int DiskS3::readSchemaVersion(const String & source_bucket, const String & sourc ReadBufferFromS3 buffer( settings->client, source_bucket, - "" /* version_id empty and to be extract */, source_path + SCHEMA_VERSION_OBJECT, + "" /* version_id empty */, settings->s3_max_single_read_retries, context->getReadSettings()); From 275bf0ef8164937831ac5027eba571e8a65905f2 Mon Sep 17 00:00:00 2001 From: Saad Ur Rahman Date: Tue, 15 Mar 2022 12:13:44 -0400 Subject: [PATCH 16/94] [IO] fixed bug in versionId extraction introduced during rebase. --- src/IO/S3Common.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 8d90e516b75..61deeab16b5 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -780,7 +780,7 @@ namespace S3 if (uri.getHost().empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI."); - // Extract object version ID from query string. + /// Extract object version ID from query string. { version_id = ""; const String version_key = "versionId="; @@ -790,7 +790,6 @@ namespace S3 if (start != std::string::npos) { start += version_key.length(); - start = start == std::string::npos ? query_string.length() : start + 1; auto end = query_string.find_first_of('&', start); end = end == std::string::npos ? query_string.length() : end - start; From 755d5c55f96ac724561b33d5381e2ae943fb005c Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 7 Apr 2022 11:57:45 +0200 Subject: [PATCH 17/94] resolve conflict --- src/Storages/StorageS3.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index c68f816dec6..d4d13eea120 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -728,13 +728,9 @@ Pipe StorageS3::read( compression_method, client_auth.client, client_auth.uri.bucket, -<<<<<<< HEAD client_auth.uri.version_id, - iterator_wrapper)); -======= iterator_wrapper, max_download_threads)); ->>>>>>> origin/master } auto pipe = Pipe::unitePipes(std::move(pipes)); From 03f76dda1d9cd6960b81ecc68668d2a94344b201 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 7 Apr 2022 12:03:54 +0200 Subject: [PATCH 18/94] better --- src/IO/S3Common.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 0f6cd554c37..4e0e50a7c51 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -779,7 +779,6 @@ namespace S3 static constexpr auto OBS = "OBS"; static constexpr auto OSS = "OSS"; - uri = uri_; storage_name = S3; @@ -796,11 +795,8 @@ namespace S3 if (start != std::string::npos) { start += version_key.length(); - auto end = query_string.find_first_of('&', start); - end = end == std::string::npos ? query_string.length() : end - start; - - version_id = query_string.substr(start, end); + version_id = query_string.substr(start, end == std::string::npos ? 
std::string::npos : end - start); } } From 01425df7585f2e823a03f391b6262d0ebaf3e225 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 7 Apr 2022 16:07:12 +0200 Subject: [PATCH 19/94] fix build --- src/IO/ReadBufferFromS3.cpp | 2 +- src/Storages/StorageS3.cpp | 4 ++-- src/Storages/StorageS3.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 5a95f3009d4..5030e8f2733 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -319,7 +319,7 @@ SeekableReadBufferPtr ReadBufferS3Factory::getReader() client_ptr, bucket, key, - verion_id, + version_id, s3_max_single_read_retries, read_settings, false /*use_external_buffer*/, diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index d4d13eea120..a370f85954a 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -272,7 +272,7 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key, version_id), chooseCompressionMethod(current_key, compression_hint)); + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -291,7 +291,7 @@ bool StorageS3Source::initialize() return true; } -std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key, const String & version_id) +std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key) { const size_t object_size = DB::S3::getObjectSize(client, bucket, key, version_id, false); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 8000f8802d8..198f75e3e81 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -112,7 +112,7 @@ private: /// Recreate ReadBuffer and BlockInputStream for each file. 
bool initialize(); - std::unique_ptr createS3ReadBuffer(const String & key, const String & version_id); + std::unique_ptr createS3ReadBuffer(const String & key); }; /** From fd001b5c80282fbbb1783d94b8fe28c8876b6a1f Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Fri, 8 Apr 2022 15:51:53 +0200 Subject: [PATCH 20/94] fix param order --- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 388340dd6e2..ede2ed1f47c 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -43,7 +43,7 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S auto remote_file_reader_creator = [=, this]() { return std::make_unique( - client_ptr, bucket, version_id, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, + client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, version_id, max_single_read_retries, settings, /* use_external_buffer */true, /* offset */ 0, read_until_position, /* restricted_seek */true); }; From b3adf150b569468b72d0a90bb6f42b3be9e0e339 Mon Sep 17 00:00:00 2001 From: Memo Date: Mon, 18 Apr 2022 12:15:41 +0800 Subject: [PATCH 21/94] add_query_level_settings --- src/Core/Settings.h | 2 ++ src/Storages/MergeTree/MergeTreeData.cpp | 14 +++++++++----- src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/MergeTree/MergeTreeSink.cpp | 2 +- .../MergeTree/ReplicatedMergeTreeSink.cpp | 2 +- .../02280_add_query_level_settings.reference | 0 .../02280_add_query_level_settings.sql | 17 +++++++++++++++++ 7 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/02280_add_query_level_settings.reference create mode 100644 tests/queries/0_stateless/02280_add_query_level_settings.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index aa78456702c..07d718b13cf 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -247,6 +247,8 @@ class IColumn; M(Bool, fallback_to_stale_replicas_for_distributed_queries, true, "Suppose max_replica_delay_for_distributed_queries is set and all replicas for the queried table are stale. If this setting is enabled, the query will be performed anyway, otherwise the error will be reported.", 0) \ M(UInt64, preferred_max_column_in_block_size_bytes, 0, "Limit on max column size in block while reading. Helps to decrease cache misses count. Should be close to L2 cache size.", 0) \ \ + M(UInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table.", 0) \ + M(UInt64, parts_to_throw_insert, 300, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ M(Bool, insert_distributed_sync, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster.", 0) \ M(UInt64, insert_distributed_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) \ M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. 
Zero means async mode.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 29b3083c38f..d614af0d6b9 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3227,9 +3227,10 @@ std::optional MergeTreeData::getMinPartDataVersion() const } -void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until) const +void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, ContextPtr query_context) const { const auto settings = getSettings(); + const auto & query_settings = query_context->getSettingsRef(); const size_t parts_count_in_total = getPartsCount(); if (parts_count_in_total >= settings->max_parts_in_total) { @@ -3252,8 +3253,11 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until) const } k_inactive = ssize_t(inactive_parts_count_in_partition) - ssize_t(settings->inactive_parts_to_delay_insert); } + + auto parts_to_delay_insert = query_settings.parts_to_delay_insert.changed ? query_settings.parts_to_delay_insert : settings->parts_to_delay_insert; + auto parts_to_throw_insert = query_settings.parts_to_throw_insert.changed ? query_settings.parts_to_throw_insert : settings->parts_to_throw_insert; - if (parts_count_in_partition >= settings->parts_to_throw_insert) + if (parts_count_in_partition >= parts_to_throw_insert) { ProfileEvents::increment(ProfileEvents::RejectedInserts); throw Exception( @@ -3262,15 +3266,15 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until) const parts_count_in_partition); } - if (k_inactive < 0 && parts_count_in_partition < settings->parts_to_delay_insert) + if (k_inactive < 0 && parts_count_in_partition < parts_to_delay_insert) return; - const ssize_t k_active = ssize_t(parts_count_in_partition) - ssize_t(settings->parts_to_delay_insert); + const ssize_t k_active = ssize_t(parts_count_in_partition) - ssize_t(parts_to_delay_insert); size_t max_k; size_t k; if (k_active > k_inactive) { - max_k = settings->parts_to_throw_insert - settings->parts_to_delay_insert; + max_k = parts_to_throw_insert - parts_to_delay_insert; k = k_active + 1; } else diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 1601a6714d3..2b75999d24c 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -528,7 +528,7 @@ public: /// If the table contains too many active parts, sleep for a while to give them time to merge. /// If until is non-null, wake up from the sleep earlier if the event happened. - void delayInsertOrThrowIfNeeded(Poco::Event * until = nullptr) const; + void delayInsertOrThrowIfNeeded(Poco::Event * until = nullptr, ContextPtr query_context = nullptr) const; /// Renames temporary part to a permanent part and adds it to the parts set. /// It is assumed that the part does not intersect with existing parts. diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 7e8ee3dcbef..93b9f356595 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -26,7 +26,7 @@ void MergeTreeSink::onStart() { /// Only check "too many parts" before write, /// because interrupting long-running INSERT query in the middle is not convenient for users. 
- storage.delayInsertOrThrowIfNeeded(); + storage.delayInsertOrThrowIfNeeded(nullptr, context); } void MergeTreeSink::onFinish() diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 187e4eb96c5..fb29b3f30aa 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -604,7 +604,7 @@ void ReplicatedMergeTreeSink::onStart() { /// Only check "too many parts" before write, /// because interrupting long-running INSERT query in the middle is not convenient for users. - storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event); + storage.delayInsertOrThrowIfNeeded(&storage.partial_shutdown_event, context); } void ReplicatedMergeTreeSink::onFinish() diff --git a/tests/queries/0_stateless/02280_add_query_level_settings.reference b/tests/queries/0_stateless/02280_add_query_level_settings.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02280_add_query_level_settings.sql b/tests/queries/0_stateless/02280_add_query_level_settings.sql new file mode 100644 index 00000000000..030fc8e1565 --- /dev/null +++ b/tests/queries/0_stateless/02280_add_query_level_settings.sql @@ -0,0 +1,17 @@ +DROP TABLE IF EXISTS table_for_alter; + +CREATE TABLE table_for_alter ( + id UInt64, + Data String +) ENGINE = MergeTree() ORDER BY id SETTINGS parts_to_throw_insert = 1, parts_to_delay_insert = 1; + +INSERT INTO table_for_alter VALUES (1, '1'); +INSERT INTO table_for_alter VALUES (2, '2'); -- { serverError 252 } + +INSERT INTO table_for_alter settings parts_to_throw_insert = 100, parts_to_delay_insert = 100 VALUES (2, '2'); + +INSERT INTO table_for_alter VALUES (3, '3'); -- { serverError 252 } + +ALTER TABLE table_for_alter MODIFY SETTING parts_to_throw_insert = 100, parts_to_delay_insert = 100; + +INSERT INTO table_for_alter VALUES (3, '3'); \ No newline at end of file From d03621115a2edcc3acd93ae86e0bcec242104a18 Mon Sep 17 00:00:00 2001 From: Memo Date: Mon, 18 Apr 2022 12:21:48 +0800 Subject: [PATCH 22/94] fix code style --- .../0_stateless/02280_add_query_level_settings.reference | 2 ++ tests/queries/0_stateless/02280_add_query_level_settings.sql | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02280_add_query_level_settings.reference b/tests/queries/0_stateless/02280_add_query_level_settings.reference index e69de29bb2d..139597f9cb0 100644 --- a/tests/queries/0_stateless/02280_add_query_level_settings.reference +++ b/tests/queries/0_stateless/02280_add_query_level_settings.reference @@ -0,0 +1,2 @@ + + diff --git a/tests/queries/0_stateless/02280_add_query_level_settings.sql b/tests/queries/0_stateless/02280_add_query_level_settings.sql index 030fc8e1565..a44f8eb854e 100644 --- a/tests/queries/0_stateless/02280_add_query_level_settings.sql +++ b/tests/queries/0_stateless/02280_add_query_level_settings.sql @@ -14,4 +14,4 @@ INSERT INTO table_for_alter VALUES (3, '3'); -- { serverError 252 } ALTER TABLE table_for_alter MODIFY SETTING parts_to_throw_insert = 100, parts_to_delay_insert = 100; -INSERT INTO table_for_alter VALUES (3, '3'); \ No newline at end of file +INSERT INTO table_for_alter VALUES (3, '3'); From a17f102fe63ba12464e5b77afdf3830957f5d758 Mon Sep 17 00:00:00 2001 From: Memo Date: Mon, 18 Apr 2022 20:31:40 +0800 Subject: [PATCH 23/94] fix style --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index d614af0d6b9..e06513a1ed9 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3253,7 +3253,7 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until, ContextPtr q } k_inactive = ssize_t(inactive_parts_count_in_partition) - ssize_t(settings->inactive_parts_to_delay_insert); } - + auto parts_to_delay_insert = query_settings.parts_to_delay_insert.changed ? query_settings.parts_to_delay_insert : settings->parts_to_delay_insert; auto parts_to_throw_insert = query_settings.parts_to_throw_insert.changed ? query_settings.parts_to_throw_insert : settings->parts_to_throw_insert; From 1b533aa5831e90fdca6ca00f569ab21a8f227982 Mon Sep 17 00:00:00 2001 From: lthaooo <354210402@qq.com> Date: Mon, 18 Apr 2022 21:13:08 +0800 Subject: [PATCH 24/94] fix dictionary reload bug --- src/Dictionaries/ClickHouseDictionarySource.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Dictionaries/ClickHouseDictionarySource.cpp b/src/Dictionaries/ClickHouseDictionarySource.cpp index 5a18dcffb22..9751cf01d6f 100644 --- a/src/Dictionaries/ClickHouseDictionarySource.cpp +++ b/src/Dictionaries/ClickHouseDictionarySource.cpp @@ -169,9 +169,13 @@ Pipe ClickHouseDictionarySource::createStreamForQuery(const String & query, std: /// Sample block should not contain first row default values auto empty_sample_block = sample_block.cloneEmpty(); + auto context_copy = Context::createCopy(context); + context_copy->makeQueryContext(); + if (configuration.is_local) { - builder.init(executeQuery(query, context, true).pipeline); + + builder.init(executeQuery(query, context_copy, true).pipeline); auto converting = ActionsDAG::makeConvertingActions( builder.getHeader().getColumnsWithTypeAndName(), empty_sample_block.getColumnsWithTypeAndName(), @@ -185,7 +189,7 @@ Pipe ClickHouseDictionarySource::createStreamForQuery(const String & query, std: else { builder.init(Pipe(std::make_shared( - std::make_shared(pool, query, empty_sample_block, context), false, false))); + std::make_shared(pool, query, empty_sample_block, context_copy), false, false))); } if (result_size_hint) From bb4bc17af1f6469f270746de7f5562e36703b22e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 7 Apr 2022 09:25:01 +0000 Subject: [PATCH 25/94] Force recover from configuration --- src/Coordination/KeeperServer.cpp | 34 +++++++++++++++++++++---- src/Coordination/KeeperServer.h | 3 +++ src/Coordination/KeeperStateMachine.cpp | 3 ++- src/Coordination/KeeperStateManager.cpp | 1 + src/Coordination/KeeperStateManager.h | 9 +++++++ 5 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 30db486dd1b..25d985a7ea3 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -102,20 +102,35 @@ KeeperServer::KeeperServer( checkAndGetSuperdigest(configuration_and_settings_->super_digest))) , state_manager(nuraft::cs_new(server_id, "keeper_server", configuration_and_settings_->log_storage_path, config, coordination_settings)) , log(&Poco::Logger::get("KeeperServer")) + , recover(config.getBool("recover")) { if (coordination_settings->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); } -void KeeperServer::startup(bool enable_ipv6) +void KeeperServer::loadLatestConfig() { - state_machine->init(); - - 
state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); - auto latest_snapshot_config = state_machine->getClusterConfig(); auto latest_log_store_config = state_manager->getLatestConfigFromLogStore(); + if (recover) + { + auto local_cluster_config = state_manager->getLocalConfig(); + latest_log_store_config = std::make_shared(0, latest_log_store_config ? latest_log_store_config->get_log_idx() : 0); + latest_log_store_config->get_servers() = local_cluster_config->get_servers(); + latest_log_store_config->set_log_idx(state_manager->getLogStore()->next_slot()); + + for (auto & server : latest_log_store_config->get_servers()) + { + LOG_INFO(log, "Having server {} with log idx {}", server->get_id(), latest_log_store_config->get_log_idx()); + } + + + state_manager->save_config(*latest_log_store_config); + state_machine->commit_config(latest_log_store_config->get_log_idx(), latest_log_store_config); + return; + } + if (latest_snapshot_config && latest_log_store_config) { if (latest_snapshot_config->get_log_idx() > latest_log_store_config->get_log_idx()) @@ -143,6 +158,13 @@ void KeeperServer::startup(bool enable_ipv6) { LOG_INFO(log, "No config in log store and snapshot, probably it's initial run. Will use config from .xml on disk"); } +} + +void KeeperServer::startup(bool enable_ipv6) +{ + state_machine->init(); + + state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); nuraft::raft_params params; params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); @@ -204,6 +226,8 @@ void KeeperServer::launchRaftServer( nuraft::ptr casted_state_manager = state_manager; nuraft::ptr casted_state_machine = state_machine; + loadLatestConfig(); + /// raft_server creates unique_ptr from it nuraft::context * ctx = new nuraft::context( casted_state_manager, casted_state_machine, diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 4ed88ceb855..6aa5e8f3c77 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -48,6 +48,9 @@ private: void shutdownRaftServer(); + void loadLatestConfig(); + bool recover = false; + public: KeeperServer( const KeeperConfigurationAndSettingsPtr & settings_, diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 315882ee988..08f137cd1a8 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -175,11 +175,12 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) } -void KeeperStateMachine::commit_config(const uint64_t /*log_idx*/, nuraft::ptr & new_conf) +void KeeperStateMachine::commit_config(const uint64_t log_idx, nuraft::ptr & new_conf) { std::lock_guard lock(cluster_config_lock); auto tmp = new_conf->serialize(); cluster_config = ClusterConfig::deserialize(*tmp); + last_committed_idx = log_idx; } nuraft::ptr KeeperStateMachine::last_snapshot() diff --git a/src/Coordination/KeeperStateManager.cpp b/src/Coordination/KeeperStateManager.cpp index f9bfea5e69a..909ad15a02c 100644 --- a/src/Coordination/KeeperStateManager.cpp +++ b/src/Coordination/KeeperStateManager.cpp @@ -248,6 +248,7 @@ void KeeperStateManager::flushLogStore() void KeeperStateManager::save_config(const nuraft::cluster_config & config) { + LOG_INFO(&Poco::Logger::get("State Manager"), "Save config called"); std::lock_guard 
lock(configuration_wrapper_mutex); nuraft::ptr buf = config.serialize(); configuration_wrapper.cluster_config = nuraft::cluster_config::deserialize(*buf); diff --git a/src/Coordination/KeeperStateManager.h b/src/Coordination/KeeperStateManager.h index 66c6cc03b87..2beb9670355 100644 --- a/src/Coordination/KeeperStateManager.h +++ b/src/Coordination/KeeperStateManager.h @@ -6,6 +6,7 @@ #include #include #include +#include "Coordination/KeeperStateMachine.h" #include namespace DB @@ -103,9 +104,17 @@ public: /// Read all log entries in log store from the begging and return latest config (with largest log_index) ClusterConfigPtr getLatestConfigFromLogStore() const; + ClusterConfigPtr getLocalConfig() const + { + std::lock_guard lock{configuration_wrapper_mutex}; + return configuration_wrapper.cluster_config; + } + /// Get configuration diff between proposed XML and current state in RAFT ConfigUpdateActions getConfigurationDiff(const Poco::Util::AbstractConfiguration & config) const; + void applyConfigsToLogEntry(); + private: /// Wrapper struct for Keeper cluster config. We parse this /// info from XML files. From 0e1ba927bdb0c07fbb056979267144339ef4ae95 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 8 Apr 2022 07:18:18 +0000 Subject: [PATCH 26/94] Add argument for force recovery --- programs/keeper/Keeper.cpp | 18 ++++++++++++++++++ programs/keeper/Keeper.h | 2 ++ src/Coordination/KeeperServer.cpp | 13 +++++++++---- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 1d9bbef58a5..d5af2d6148e 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -239,6 +239,18 @@ std::string Keeper::getDefaultConfigFileName() const return "keeper_config.xml"; } +void Keeper::handleCustomArguments(const std::string & arg, const std::string & value) +{ + if (arg == "force-recovery") + { + assert(value.empty()); + config().setBool("keeper_server.recover", true); + return; + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid argument {} provided", arg); +} + void Keeper::defineOptions(Poco::Util::OptionSet & options) { options.addOption( @@ -251,6 +263,12 @@ void Keeper::defineOptions(Poco::Util::OptionSet & options) .required(false) .repeatable(false) .binding("version")); + options.addOption( + Poco::Util::Option("force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration") + .required(false) + .repeatable(false) + .noArgument() + .callback(Poco::Util::OptionCallback(this, &Keeper::handleCustomArguments))); BaseDaemon::defineOptions(options); } diff --git a/programs/keeper/Keeper.h b/programs/keeper/Keeper.h index 5b8fbadd0a2..db7a117c2e2 100644 --- a/programs/keeper/Keeper.h +++ b/programs/keeper/Keeper.h @@ -43,6 +43,8 @@ public: protected: void logRevision() const override; + void handleCustomArguments(const std::string & arg, const std::string & value); + int run() override; void initialize(Application & self) override; diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 25d985a7ea3..13d7f67e697 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -102,7 +102,7 @@ KeeperServer::KeeperServer( checkAndGetSuperdigest(configuration_and_settings_->super_digest))) , state_manager(nuraft::cs_new(server_id, "keeper_server", configuration_and_settings_->log_storage_path, config, coordination_settings)) , log(&Poco::Logger::get("KeeperServer")) - , recover(config.getBool("recover")) + , 
recover(config.has("keeper_server.recover") && config.getBool("keeper_server.recover")) { if (coordination_settings->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); @@ -127,7 +127,6 @@ void KeeperServer::loadLatestConfig() state_manager->save_config(*latest_log_store_config); - state_machine->commit_config(latest_log_store_config->get_log_idx(), latest_log_store_config); return; } @@ -182,6 +181,13 @@ void KeeperServer::startup(bool enable_ipv6) params.return_method_ = nuraft::raft_params::async_handler; + if (recover) + { + LOG_INFO(log, "Custom quorum size"); + params.with_custom_commit_quorum_size(1); + params.with_custom_election_quorum_size(1); + } + nuraft::asio_service::options asio_opts{}; if (state_manager->isSecure()) { @@ -226,13 +232,12 @@ void KeeperServer::launchRaftServer( nuraft::ptr casted_state_manager = state_manager; nuraft::ptr casted_state_machine = state_machine; - loadLatestConfig(); - /// raft_server creates unique_ptr from it nuraft::context * ctx = new nuraft::context( casted_state_manager, casted_state_machine, asio_listener, logger, rpc_cli_factory, scheduler, params); + loadLatestConfig(); raft_instance = nuraft::cs_new(ctx, init_options); raft_instance->start_server(init_options.skip_initial_election_timeout_); From dbd88a5acb0a0b60378e59b5e18a578957c40341 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 11 Apr 2022 06:41:46 +0000 Subject: [PATCH 27/94] Use 1 leader quorum for recovery --- src/Coordination/KeeperDispatcher.cpp | 2 +- src/Coordination/KeeperServer.cpp | 39 ++++++++++++++------------- src/Coordination/KeeperServer.h | 3 ++- src/Coordination/KeeperStateManager.h | 1 + 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 4d71c11221e..65a6fbb287a 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -278,7 +278,7 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf try { LOG_DEBUG(log, "Waiting server to initialize"); - server->startup(configuration_and_settings->enable_ipv6); + server->startup(config, configuration_and_settings->enable_ipv6); LOG_DEBUG(log, "Server initialized, waiting for quorum"); if (!start_async) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 13d7f67e697..39df5ff61fa 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -113,23 +114,6 @@ void KeeperServer::loadLatestConfig() auto latest_snapshot_config = state_machine->getClusterConfig(); auto latest_log_store_config = state_manager->getLatestConfigFromLogStore(); - if (recover) - { - auto local_cluster_config = state_manager->getLocalConfig(); - latest_log_store_config = std::make_shared(0, latest_log_store_config ? 
latest_log_store_config->get_log_idx() : 0); - latest_log_store_config->get_servers() = local_cluster_config->get_servers(); - latest_log_store_config->set_log_idx(state_manager->getLogStore()->next_slot()); - - for (auto & server : latest_log_store_config->get_servers()) - { - LOG_INFO(log, "Having server {} with log idx {}", server->get_id(), latest_log_store_config->get_log_idx()); - } - - - state_manager->save_config(*latest_log_store_config); - return; - } - if (latest_snapshot_config && latest_log_store_config) { if (latest_snapshot_config->get_log_idx() > latest_log_store_config->get_log_idx()) @@ -159,7 +143,7 @@ void KeeperServer::loadLatestConfig() } } -void KeeperServer::startup(bool enable_ipv6) +void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) { state_machine->init(); @@ -201,6 +185,24 @@ void KeeperServer::startup(bool enable_ipv6) launchRaftServer(enable_ipv6, params, asio_opts); + if (recover) + { + auto configuration = state_manager->parseServersConfiguration(config, false); + auto local_cluster_config = configuration.cluster_config; + auto latest_log_store_config = std::make_shared(0, local_cluster_config ? local_cluster_config->get_log_idx() : 0); + latest_log_store_config->get_servers() = local_cluster_config->get_servers(); + latest_log_store_config->set_log_idx(state_manager->getLogStore()->next_slot()); + + for (auto & server : latest_log_store_config->get_servers()) + { + LOG_INFO(log, "Having server {} with log idx {}", server->get_id(), latest_log_store_config->get_log_idx()); + } + + + state_manager->save_config(*latest_log_store_config); + return; + } + if (!raft_instance) throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); } @@ -242,6 +244,7 @@ void KeeperServer::launchRaftServer( raft_instance->start_server(init_options.skip_initial_election_timeout_); asio_listener->listen(raft_instance); + } void KeeperServer::shutdownRaftServer() diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 6aa5e8f3c77..210777a4086 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -59,7 +60,7 @@ public: SnapshotsQueue & snapshots_queue_); /// Load state machine from the latest snapshot and load log storage. Start NuRaft with required settings. - void startup(bool enable_ipv6 = true); + void startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6 = true); /// Put local read request and execute in state machine directly and response into /// responses queue diff --git a/src/Coordination/KeeperStateManager.h b/src/Coordination/KeeperStateManager.h index 2beb9670355..f1830ee4b79 100644 --- a/src/Coordination/KeeperStateManager.h +++ b/src/Coordination/KeeperStateManager.h @@ -140,6 +140,7 @@ private: nuraft::ptr log_store; nuraft::ptr server_state; +public: /// Parse configuration from xml config. 
KeeperConfigurationWrapper parseServersConfiguration(const Poco::Util::AbstractConfiguration & config, bool allow_without_us) const; }; From ff2ebe113cb0d86306535ff81bf76878859b15d8 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 12 Apr 2022 14:08:32 +0000 Subject: [PATCH 28/94] WIP --- src/Coordination/FourLetterCommand.cpp | 2 +- src/Coordination/KeeperDispatcher.h | 10 ++++ src/Coordination/KeeperServer.cpp | 63 +++++++++++++++---------- src/Coordination/KeeperServer.h | 39 +++++++++++++-- src/Coordination/KeeperStateMachine.cpp | 3 +- src/Coordination/KeeperStateManager.cpp | 1 - src/Coordination/KeeperStateManager.h | 8 ---- src/Server/KeeperTCPHandler.cpp | 4 +- 8 files changed, 87 insertions(+), 43 deletions(-) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 9553279d955..35ee0eb8d15 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -205,7 +205,7 @@ String MonitorCommand::run() auto & stats = keeper_dispatcher.getKeeperConnectionStats(); Keeper4LWInfo keeper_info = keeper_dispatcher.getKeeper4LWInfo(); - if (!keeper_info.has_leader) + if (!keeper_dispatcher.isServerActive()) return "This instance is not currently serving requests"; const auto & state_machine = keeper_dispatcher.getStateMachine(); diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 5db1c4ddf36..aa34f44a39e 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -114,6 +114,11 @@ public: return server && server->checkInit(); } + bool isServerActive() const + { + return checkInit() && hasLeader() && !server->inRecover(); + } + /// Registered in ConfigReloader callback. Add new configuration changes to /// update_configuration_queue. Keeper Dispatcher apply them asynchronously. 
void updateConfiguration(const Poco::Util::AbstractConfiguration & config); @@ -147,6 +152,11 @@ public: return server->isLeaderAlive(); } + bool inRecover() const + { + return server->inRecover(); + } + bool isObserver() const { return server->isObserver(); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 39df5ff61fa..d1c0d8c59a8 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include namespace DB { @@ -149,6 +150,8 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); + loadLatestConfig(); + nuraft::raft_params params; params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); params.election_timeout_lower_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); @@ -165,13 +168,6 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo params.return_method_ = nuraft::raft_params::async_handler; - if (recover) - { - LOG_INFO(log, "Custom quorum size"); - params.with_custom_commit_quorum_size(1); - params.with_custom_election_quorum_size(1); - } - nuraft::asio_service::options asio_opts{}; if (state_manager->isSecure()) { @@ -183,26 +179,25 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo #endif } - launchRaftServer(enable_ipv6, params, asio_opts); - if (recover) { + LOG_WARNING(log, "This instance was started in recovery mode. Until the quorum is restored, no requests should be sent to any " + "of the cluster instances. This instance will start accepting requests only when the recovery is finished."); + params.with_custom_commit_quorum_size(1); + params.with_custom_election_quorum_size(1); + + auto latest_config = state_manager->load_config(); auto configuration = state_manager->parseServersConfiguration(config, false); auto local_cluster_config = configuration.cluster_config; - auto latest_log_store_config = std::make_shared(0, local_cluster_config ? local_cluster_config->get_log_idx() : 0); - latest_log_store_config->get_servers() = local_cluster_config->get_servers(); - latest_log_store_config->set_log_idx(state_manager->getLogStore()->next_slot()); + auto new_config = std::make_shared(0, latest_config ? 
latest_config->get_log_idx() : 0); + new_config->get_servers() = local_cluster_config->get_servers(); + new_config->set_log_idx(state_manager->getLogStore()->next_slot()); - for (auto & server : latest_log_store_config->get_servers()) - { - LOG_INFO(log, "Having server {} with log idx {}", server->get_id(), latest_log_store_config->get_log_idx()); - } - - - state_manager->save_config(*latest_log_store_config); - return; + state_manager->save_config(*new_config); } + launchRaftServer(enable_ipv6, params, asio_opts); + if (!raft_instance) throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); } @@ -221,7 +216,7 @@ void KeeperServer::launchRaftServer( return callbackFunc(type, param); }; - nuraft::ptr logger = nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level); + nuraft::ptr logger = nuraft::cs_new("RaftInstance", DB::LogsLevel::information); asio_service = nuraft::cs_new(asio_opts, logger); asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger, enable_ipv6); @@ -239,12 +234,12 @@ void KeeperServer::launchRaftServer( casted_state_manager, casted_state_machine, asio_listener, logger, rpc_cli_factory, scheduler, params); - loadLatestConfig(); - raft_instance = nuraft::cs_new(ctx, init_options); + raft_instance = nuraft::cs_new(ctx, init_options); - raft_instance->start_server(init_options.skip_initial_election_timeout_); - asio_listener->listen(raft_instance); + raft_instance->start_server(state_manager->shouldStartAsFollower()); + auto raft_server_ptr = std::static_pointer_cast(raft_instance); + asio_listener->listen(raft_server_ptr); } void KeeperServer::shutdownRaftServer() @@ -369,6 +364,18 @@ uint64_t KeeperServer::getSyncedFollowerCount() const nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param) { + if (type == nuraft::cb_func::HeartBeat && recover && raft_instance->isClusterHealthy()) + { + auto new_params = raft_instance->get_current_params(); + new_params.custom_commit_quorum_size_ = 0; + new_params.custom_election_quorum_size_ = 0; + raft_instance->update_params(new_params); + + LOG_INFO(log, "Recovery is done"); + recover = false; + return nuraft::cb_func::ReturnCode::Ok; + } + if (initialized_flag) return nuraft::cb_func::ReturnCode::Ok; @@ -448,6 +455,12 @@ ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::Abstrac void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) { + if (recover) + { + LOG_INFO(log, "Config update ignored because we are in recovery mode"); + return; + } + size_t sleep_ms = 500; if (task.action_type == ConfigUpdateActionType::AddServer) { diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 210777a4086..30ad2f8774c 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -1,12 +1,10 @@ #pragma once -#include #include #include #include #include #include -#include #include namespace DB @@ -17,6 +15,33 @@ using RaftAppendResult = nuraft::ptrelapsedMilliseconds() < expiry) + return false; + } + + const size_t voting_members = get_num_voting_members(); + const auto not_responding_peers = get_not_responding_peers(); + const auto quorum_size = voting_members / 2 + 1; + const auto max_not_responding_peers = voting_members - quorum_size; + + return not_responding_peers <= max_not_responding_peers; + } + + using nuraft::raft_server::raft_server; + + std::optional timer_from_init = std::make_optional(); + }; + const int server_id; 
    CoordinationSettingsPtr coordination_settings;
@@ -25,7 +50,7 @@ private:
 
     nuraft::ptr<KeeperStateManager> state_manager;
 
-    nuraft::ptr<nuraft::raft_server> raft_instance;
+    nuraft::ptr<KeeperRaftServer> raft_instance;
     nuraft::ptr<nuraft::asio_service> asio_service;
     nuraft::ptr<nuraft::rpc_listener> asio_listener;
 
@@ -50,7 +75,8 @@ private:
     void shutdownRaftServer();
 
     void loadLatestConfig();
-    bool recover = false;
+
+    std::atomic_bool recover = false;
 
 public:
     KeeperServer(
@@ -66,6 +92,11 @@ public:
     /// responses queue
     void putLocalReadRequest(const KeeperStorage::RequestForSession & request);
 
+    bool inRecover() const
+    {
+        return recover;
+    }
+
     /// Put batch of requests into Raft and get result of put. Responses will be set separately into
     /// responses_queue.
     RaftAppendResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests);
diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp
index 08f137cd1a8..be7110fa841 100644
--- a/src/Coordination/KeeperStateMachine.cpp
+++ b/src/Coordination/KeeperStateMachine.cpp
@@ -175,12 +175,11 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
 }
 
 
-void KeeperStateMachine::commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf)
+void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr<nuraft::cluster_config> & new_conf)
 {
     std::lock_guard lock(cluster_config_lock);
     auto tmp = new_conf->serialize();
     cluster_config = ClusterConfig::deserialize(*tmp);
-    last_committed_idx = log_idx;
 }
 
 nuraft::ptr<nuraft::snapshot> KeeperStateMachine::last_snapshot()
diff --git a/src/Coordination/KeeperStateManager.cpp b/src/Coordination/KeeperStateManager.cpp
index 909ad15a02c..f9bfea5e69a 100644
--- a/src/Coordination/KeeperStateManager.cpp
+++ b/src/Coordination/KeeperStateManager.cpp
@@ -248,7 +248,6 @@ void KeeperStateManager::flushLogStore()
 
 void KeeperStateManager::save_config(const nuraft::cluster_config & config)
 {
-    LOG_INFO(&Poco::Logger::get("State Manager"), "Save config called");
     std::lock_guard lock(configuration_wrapper_mutex);
     nuraft::ptr<nuraft::buffer> buf = config.serialize();
     configuration_wrapper.cluster_config = nuraft::cluster_config::deserialize(*buf);
diff --git a/src/Coordination/KeeperStateManager.h b/src/Coordination/KeeperStateManager.h
index f1830ee4b79..7de9e90673e 100644
--- a/src/Coordination/KeeperStateManager.h
+++ b/src/Coordination/KeeperStateManager.h
@@ -104,17 +104,9 @@ public:
     /// Read all log entries in log store from the beginning and return latest config (with largest log_index)
     ClusterConfigPtr getLatestConfigFromLogStore() const;
 
-    ClusterConfigPtr getLocalConfig() const
-    {
-        std::lock_guard lock{configuration_wrapper_mutex};
-        return configuration_wrapper.cluster_config;
-    }
-
     /// Get configuration diff between proposed XML and current state in RAFT
    ConfigUpdateActions getConfigurationDiff(const Poco::Util::AbstractConfiguration & config) const;
 
-    void applyConfigsToLogEntry();
-
 private:
    /// Wrapper struct for Keeper cluster config. We parse this
    /// info from XML files.
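A note on the quorum arithmetic this patch relies on: KeeperRaftServer::isClusterHealthy() (added to KeeperServer.h above) treats the cluster as healthy while a majority of voting members still responds, and forced recovery temporarily overrides both the commit and the election quorum size to 1 so that a single surviving node can persist a new cluster configuration. A minimal standalone sketch of that majority check follows; clusterHealthy is a hypothetical helper written for illustration, not a function from the patch:

#include <cassert>
#include <cstddef>

// Mirrors the check in KeeperRaftServer::isClusterHealthy(): a cluster of
// N voting members needs a quorum of N / 2 + 1 and therefore tolerates at
// most N - (N / 2 + 1) unresponsive peers.
bool clusterHealthy(size_t voting_members, size_t not_responding_peers)
{
    const size_t quorum_size = voting_members / 2 + 1;
    const size_t max_not_responding_peers = voting_members - quorum_size;
    return not_responding_peers <= max_not_responding_peers;
}

int main()
{
    assert(clusterHealthy(3, 1));   // a 3-node cluster survives one lost peer
    assert(!clusterHealthy(3, 2));  // losing two of three breaks the quorum
    assert(clusterHealthy(5, 2));   // a 5-node cluster survives two lost peers
    assert(!clusterHealthy(5, 3));
    return 0;
}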
diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index d6eca9bec2f..b3d115c8acd 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -345,7 +345,7 @@ void KeeperTCPHandler::runImpl() return; } - if (keeper_dispatcher->checkInit() && keeper_dispatcher->hasLeader()) + if (keeper_dispatcher->isServerActive()) { try { @@ -366,7 +366,7 @@ void KeeperTCPHandler::runImpl() else { String reason; - if (!keeper_dispatcher->checkInit() && !keeper_dispatcher->hasLeader()) + if (!keeper_dispatcher->checkInit() && !keeper_dispatcher->hasLeader() && !keeper_dispatcher->inRecover()) reason = "server is not initialized yet and no alive leader exists"; else if (!keeper_dispatcher->checkInit()) reason = "server is not initialized yet"; From 4ecb66c1ad0d4308eb984c5a6f3639330ecbfc5d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 13 Apr 2022 14:08:13 +0000 Subject: [PATCH 29/94] Support recovery with four letter commands --- src/Coordination/CoordinationSettings.cpp | 2 +- src/Coordination/FourLetterCommand.cpp | 9 +++++ src/Coordination/FourLetterCommand.h | 11 ++++++ src/Coordination/KeeperDispatcher.cpp | 5 +++ src/Coordination/KeeperDispatcher.h | 4 +++ src/Coordination/KeeperServer.cpp | 43 ++++++++++++++--------- src/Coordination/KeeperServer.h | 5 +++ 7 files changed, 62 insertions(+), 17 deletions(-) diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index dd7cf1d2c8c..12be3efada8 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -37,7 +37,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 35ee0eb8d15..6ed8895592e 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -129,6 +129,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr watch_command = std::make_shared(keeper_dispatcher); factory.registerCommand(watch_command); + FourLetterCommandPtr recovery_command = std::make_shared(keeper_dispatcher); + factory.registerCommand(recovery_command); + factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -419,4 +422,10 @@ String IsReadOnlyCommand::run() return "rw"; } +String RecoveryCommand::run() +{ + keeper_dispatcher.forceRecovery(); + return ""; +} + } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index d190c6c7d9b..b5d08f4c250 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -304,4 +304,15 @@ struct IsReadOnlyCommand : public IFourLetterCommand ~IsReadOnlyCommand() override = default; }; +struct RecoveryCommand : public IFourLetterCommand +{ + explicit RecoveryCommand(KeeperDispatcher & keeper_dispatcher_) + : IFourLetterCommand(keeper_dispatcher_) + { + } + + String name() override { return "rcvr"; } + String run() override; + ~RecoveryCommand() override = default; +}; } diff --git 
a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 65a6fbb287a..59bf42cdcf4 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -367,6 +367,11 @@ void KeeperDispatcher::shutdown() LOG_DEBUG(log, "Dispatcher shut down"); } +void KeeperDispatcher::forceRecovery() +{ + server->forceRecovery(); +} + KeeperDispatcher::~KeeperDispatcher() { shutdown(); diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index aa34f44a39e..12811d6815c 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -109,6 +109,8 @@ public: /// standalone_keeper -- we are standalone keeper application (not inside clickhouse server) void initialize(const Poco::Util::AbstractConfiguration & config, bool standalone_keeper, bool start_async); + void startServer(); + bool checkInit() const { return server && server->checkInit(); @@ -126,6 +128,8 @@ public: /// Shutdown internal keeper parts (server, state machine, log storage, etc) void shutdown(); + void forceRecovery(); + /// Put request to ClickHouse Keeper bool putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index d1c0d8c59a8..824e85f9196 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -144,14 +144,15 @@ void KeeperServer::loadLatestConfig() } } -void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) +void KeeperServer::forceRecovery() { - state_machine->init(); - - state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); - - loadLatestConfig(); + shutdownRaftServer(); + recover = true; + startupRaftServer(true); +} +void KeeperServer::startupRaftServer(bool enable_ipv6) +{ nuraft::raft_params params; params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); params.election_timeout_lower_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); @@ -187,10 +188,8 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo params.with_custom_election_quorum_size(1); auto latest_config = state_manager->load_config(); - auto configuration = state_manager->parseServersConfiguration(config, false); - auto local_cluster_config = configuration.cluster_config; auto new_config = std::make_shared(0, latest_config ? 
latest_config->get_log_idx() : 0); - new_config->get_servers() = local_cluster_config->get_servers(); + new_config->get_servers() = last_read_config->get_servers(); new_config->set_log_idx(state_manager->getLogStore()->next_slot()); state_manager->save_config(*new_config); @@ -202,6 +201,19 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); } +void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) +{ + state_machine->init(); + + state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); + + loadLatestConfig(); + + last_read_config = state_manager->parseServersConfiguration(config, true).cluster_config; + + startupRaftServer(enable_ipv6); +} + void KeeperServer::launchRaftServer( bool enable_ipv6, const nuraft::raft_params & params, @@ -450,17 +462,16 @@ std::vector KeeperServer::getDeadSessions() ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::AbstractConfiguration & config) { - return state_manager->getConfigurationDiff(config); + auto diff = state_manager->getConfigurationDiff(config); + + if (!diff.empty()) + last_read_config = state_manager->parseServersConfiguration(config, true).cluster_config; + + return diff; } void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) { - if (recover) - { - LOG_INFO(log, "Config update ignored because we are in recovery mode"); - return; - } - size_t sleep_ms = 500; if (task.action_type == ConfigUpdateActionType::AddServer) { diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 30ad2f8774c..7b8de39d214 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -59,12 +59,15 @@ private: std::condition_variable initialized_cv; std::atomic initial_batch_committed = false; + nuraft::ptr last_read_config; + Poco::Logger * log; /// Callback func which is called by NuRaft on all internal events. /// Used to determine the moment when raft is ready to server new requests nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); + void startupRaftServer(bool enable_ipv6); /// Almost copy-paste from nuraft::launcher, but with separated server init and start /// Allows to avoid race conditions. 
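+    /// (startupRaftServer above assembles the raft_params, applies the
+    /// recovery-mode override of the commit/election quorum size when recovery
+    /// is requested, and then delegates to launchRaftServer below.)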
void launchRaftServer( @@ -109,6 +112,8 @@ public: return state_machine; } + void forceRecovery(); + bool isLeader() const; bool isFollower() const; From c968353ee9d209122fef8a87f24c6deb14a70972 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 14 Apr 2022 10:30:35 +0000 Subject: [PATCH 30/94] Define tests for 3 node cluster --- .../test_keeper_force_recovery/__init__.py | 1 + .../configs/enable_keeper1.xml | 33 +++ .../configs/enable_keeper2.xml | 33 +++ .../configs/enable_keeper3.xml | 33 +++ .../configs/enable_keeper4.xml | 35 ++++ .../configs/enable_keeper5.xml | 33 +++ .../configs/recovered_keeper1.xml | 33 +++ .../configs/use_keeper.xml | 24 +++ .../test_keeper_force_recovery/test.py | 198 ++++++++++++++++++ 9 files changed, 423 insertions(+) create mode 100644 tests/integration/test_keeper_force_recovery/__init__.py create mode 100644 tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/use_keeper.xml create mode 100644 tests/integration/test_keeper_force_recovery/test.py diff --git a/tests/integration/test_keeper_force_recovery/__init__.py b/tests/integration/test_keeper_force_recovery/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml new file mode 100644 index 00000000000..441c1bc185d --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml @@ -0,0 +1,33 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml new file mode 100644 index 00000000000..e2e2c1fd7db --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml @@ -0,0 +1,33 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml new file mode 100644 index 00000000000..e2ac0400d88 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml @@ -0,0 +1,33 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml 
b/tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml new file mode 100644 index 00000000000..b123d9970e4 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml @@ -0,0 +1,35 @@ + + + 9181 + 4 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + true + 3 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml new file mode 100644 index 00000000000..dfcd44cf6ed --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml @@ -0,0 +1,33 @@ + + + 9181 + 5 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml new file mode 100644 index 00000000000..8ead85ce501 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml @@ -0,0 +1,33 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/use_keeper.xml b/tests/integration/test_keeper_force_recovery/configs/use_keeper.xml new file mode 100644 index 00000000000..5555ed1dc7b --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/use_keeper.xml @@ -0,0 +1,24 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + node3 + 9181 + + + node3 + 9181 + + + diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py new file mode 100644 index 00000000000..7ca9254c2a9 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -0,0 +1,198 @@ +import os +import pytest +import socket +from helpers.cluster import ClickHouseCluster +import time + +cluster = ClickHouseCluster(__file__) +CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") + +node1 = cluster.add_instance( + "node1", + main_configs=["configs/enable_keeper1.xml", "configs/use_keeper.xml"], + stay_alive=True, +) +node2 = cluster.add_instance( + "node2", + main_configs=["configs/enable_keeper2.xml", "configs/use_keeper.xml"], + stay_alive=True, +) +node3 = cluster.add_instance( + "node3", + main_configs=["configs/enable_keeper3.xml", "configs/use_keeper.xml"], + stay_alive=True, +) +node4 = cluster.add_instance( + "node4", + main_configs=["configs/enable_keeper4.xml", "configs/use_keeper.xml"], + stay_alive=True, +) +node5 = cluster.add_instance( + "node5", + main_configs=["configs/enable_keeper5.xml", "configs/use_keeper.xml"], + stay_alive=True, +) + +from kazoo.client import KazooClient, KazooState + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def smaller_exception(ex): + return "\n".join(str(ex).split("\n")[0:2]) + + +def wait_node(node): + for _ in range(100): + zk = None + try: + node.query("SELECT * FROM system.zookeeper WHERE path = '/'") + zk = get_fake_zk(node.name, 
timeout=30.0) + zk.create("/test", sequence=True) + print("node", node.name, "ready") + break + except Exception as ex: + time.sleep(0.2) + print("Waiting until", node.name, "will be ready, exception", ex) + finally: + if zk: + zk.stop() + zk.close() + else: + raise Exception("Can't wait node", node.name, "to become ready") + + +def wait_nodes(nodes): + for node in nodes: + wait_node(node) + + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient( + hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout + ) + _fake_zk_instance.start() + return _fake_zk_instance + +def get_keeper_socket(node_name): + hosts = cluster.get_instance_ip(node_name) + client = socket.socket() + client.settimeout(10) + client.connect((hosts, 9181)) + return client + +def send_4lw_cmd(node_name=node1.name, cmd="ruok"): + client = None + try: + client = get_keeper_socket(node_name) + client.send(cmd.encode()) + data = client.recv(100_000) + data = data.decode() + return data + finally: + if client is not None: + client.close() + +def wait_until_connected(node_name): + while send_4lw_cmd(node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG: + time.sleep(0.2) + +def wait_and_assert_data(zk, path, data): + while zk.exists(path) is None: + time.sleep(0.2) + assert zk.get(path)[0] == data.encode() + + +NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests" + +def test_three_node_recovery(started_cluster): + try: + # initial cluster is node1 <-> node2 <-> node3 + node4.stop_clickhouse(); + node5.stop_clickhouse(); + + wait_nodes([node1, node2, node3]) + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + node4_zk = None + node5_zk = None + + for i, zk in enumerate([node1_zk, node2_zk, node3_zk]): + zk.create(f"/test_force_recovery_node{i+1}", f"somedata{i+1}".encode()) + + for zk in [node1_zk, node2_zk, node3_zk]: + for i in range(1, 4): + wait_and_assert_data(zk, f"/test_force_recovery_node{i}", f"somedata{i}") + + node1.stop_clickhouse() + + node2_zk.create("/test_force_recovery_extra", b"someexstradata") + wait_and_assert_data(node3_zk, "/test_force_recovery_extra", "someexstradata") + + node1.start_clickhouse() + wait_and_assert_data(node1_zk, "/test_force_recovery_extra", "someexstradata") + + node2.stop_clickhouse() + node3.stop_clickhouse() + + # wait for node1 to lose quorum + while send_4lw_cmd(node1.name, "mntr") != NOT_SERVING_REQUESTS_ERROR_MSG: + time.sleep(0.2) + + node1.copy_file_to_container( + os.path.join(CONFIG_DIR, "recovered_keeper1.xml"), + "/etc/clickhouse-server/config.d/enable_keeper1.xml") + + node1.query("SYSTEM RELOAD CONFIG") + + assert send_4lw_cmd(node1.name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG + send_4lw_cmd(node1.name, "rcvr") + assert send_4lw_cmd(node1.name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG + + node4.start_clickhouse() + wait_node(node4) + wait_until_connected(node4.name) + + # node1 should have quorum now and accept requests + wait_until_connected(node1.name) + + node5.start_clickhouse() + wait_node(node5) + wait_until_connected(node5.name) + + node4_zk = get_fake_zk("node4") + node5_zk = get_fake_zk("node5") + + for zk in [node1_zk, node4_zk, node5_zk]: + for i in range(1, 4): + wait_and_assert_data(zk, f"/test_force_recovery_node{i}", f"somedata{i}") + wait_and_assert_data(zk, "/test_force_recovery_extra", "someexstradata") + + # new nodes can achieve quorum without the recovery node (cluster should work properly from now on) + node1.stop_clickhouse() + + 
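+        # With the recovered configuration (node1, node4, node5), stopping node1
+        # leaves two of the three voting members alive, which still satisfies
+        # the quorum of 3 // 2 + 1 == 2, so node4 and node5 must keep serving
+        # requests on their own.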
node4_zk.create("/test_force_recovery_node4", b"somedata4") + wait_and_assert_data(node5_zk, "/test_force_recovery_node4", "somedata4") + + node1.start_clickhouse() + for i in range(1, 5): + wait_and_assert_data(node1_zk, f"/test_force_recovery_node{i}", f"somedata{i}") + wait_and_assert_data(node1_zk, "/test_force_recovery_extra", "someexstradata") + finally: + try: + for zk_conn in [node1_zk, node2_zk, node3_zk, node4_zk, node5_zk]: + if zk_conn: + zk_conn.stop() + zk_conn.close() + except: + pass From 272965fc44a8432db77ae379cacbf9d34dc6e58b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 14 Apr 2022 12:00:47 +0000 Subject: [PATCH 31/94] Refactoring --- src/Coordination/KeeperDispatcher.cpp | 5 ++ src/Coordination/KeeperDispatcher.h | 11 ++- src/Coordination/KeeperServer.cpp | 98 +++++++++++++++++---------- src/Coordination/KeeperServer.h | 43 ++---------- 4 files changed, 77 insertions(+), 80 deletions(-) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 59bf42cdcf4..73e39183bc1 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -579,6 +579,11 @@ void KeeperDispatcher::updateConfigurationThread() } } +bool KeeperDispatcher::isServerActive() const +{ + return checkInit() && hasLeader() && !server->isRecovering(); +} + void KeeperDispatcher::updateConfiguration(const Poco::Util::AbstractConfiguration & config) { auto diff = server->getConfigurationDiff(config); diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 12811d6815c..67973a42be7 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -116,9 +116,11 @@ public: return server && server->checkInit(); } - bool isServerActive() const + bool isServerActive() const; + + bool serverIsRecovering() const { - return checkInit() && hasLeader() && !server->inRecover(); + return server->isRecovering(); } /// Registered in ConfigReloader callback. 
Add new configuration changes to
@@ -156,11 +158,6 @@ public:
         return server->isLeaderAlive();
     }
 
-    bool inRecover() const
-    {
-        return server->inRecover();
-    }
-
     bool isObserver() const
     {
         return server->isObserver();
diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp
index 824e85f9196..6d571f0392f 100644
--- a/src/Coordination/KeeperServer.cpp
+++ b/src/Coordination/KeeperServer.cpp
@@ -104,12 +104,44 @@ KeeperServer::KeeperServer(
         checkAndGetSuperdigest(configuration_and_settings_->super_digest)))
     , state_manager(nuraft::cs_new<KeeperStateManager>(server_id, "keeper_server", configuration_and_settings_->log_storage_path, config, coordination_settings))
     , log(&Poco::Logger::get("KeeperServer"))
-    , recover(config.has("keeper_server.recover") && config.getBool("keeper_server.recover"))
+    , is_recovering(config.has("keeper_server.recover") && config.getBool("keeper_server.recover"))
 {
     if (coordination_settings->quorum_reads)
         LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower.");
 }
 
+struct KeeperServer::KeeperRaftServer : public nuraft::raft_server
+{
+    bool isClusterHealthy()
+    {
+        if (timer_from_init)
+        {
+            size_t expiry = get_current_params().heart_beat_interval_ *
+                raft_server::raft_limits_.response_limit_;
+
+            if (timer_from_init->elapsedMilliseconds() < expiry)
+                return false;
+
+            timer_from_init.reset();
+        }
+
+        const size_t voting_members = get_num_voting_members();
+        const auto not_responding_peers = get_not_responding_peers();
+        const auto quorum_size = voting_members / 2 + 1;
+        const auto max_not_responding_peers = voting_members - quorum_size;
+
+        return not_responding_peers <= max_not_responding_peers;
+    }
+
+    using nuraft::raft_server::raft_server;
+
+    // peers are initially marked as responding because at least one cycle
+    // of heartbeat * response_limit (20) needs to pass for them to be marked
+    // as not responding
+    // until that time passes we can't say that the cluster is healthy
+    std::optional<Stopwatch> timer_from_init = std::make_optional<Stopwatch>();
+};
+
 void KeeperServer::loadLatestConfig()
 {
     auto latest_snapshot_config = state_machine->getClusterConfig();
@@ -147,11 +179,11 @@ void KeeperServer::loadLatestConfig()
 void KeeperServer::forceRecovery()
 {
     shutdownRaftServer();
-    recover = true;
-    startupRaftServer(true);
+    is_recovering = true;
+    launchRaftServer(true);
 }
 
-void KeeperServer::startupRaftServer(bool enable_ipv6)
+void KeeperServer::launchRaftServer(bool enable_ipv6)
 {
     nuraft::raft_params params;
     params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log);
@@ -180,45 +212,21 @@ void KeeperServer::startupRaftServer(bool enable_ipv6)
 #endif
     }
 
-    if (recover)
+    if (is_recovering)
     {
-        LOG_WARNING(log, "This instance was started in recovery mode. Until the quorum is restored, no requests should be sent to any "
+        LOG_WARNING(log, "This instance is in recovery mode. Until the quorum is restored, no requests should be sent to any "
                     "of the cluster instances. This instance will start accepting requests only when the recovery is finished.");
         params.with_custom_commit_quorum_size(1);
         params.with_custom_election_quorum_size(1);
 
         auto latest_config = state_manager->load_config();
         auto new_config = std::make_shared<nuraft::cluster_config>(0, latest_config ?
latest_config->get_log_idx() : 0); - new_config->get_servers() = last_read_config->get_servers(); + new_config->get_servers() = last_local_config->get_servers(); new_config->set_log_idx(state_manager->getLogStore()->next_slot()); state_manager->save_config(*new_config); } - launchRaftServer(enable_ipv6, params, asio_opts); - - if (!raft_instance) - throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); -} - -void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) -{ - state_machine->init(); - - state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); - - loadLatestConfig(); - - last_read_config = state_manager->parseServersConfiguration(config, true).cluster_config; - - startupRaftServer(enable_ipv6); -} - -void KeeperServer::launchRaftServer( - bool enable_ipv6, - const nuraft::raft_params & params, - const nuraft::asio_service::options & asio_opts) -{ nuraft::raft_server::init_options init_options; init_options.skip_initial_election_timeout_ = state_manager->shouldStartAsFollower(); @@ -250,8 +258,24 @@ void KeeperServer::launchRaftServer( raft_instance->start_server(state_manager->shouldStartAsFollower()); - auto raft_server_ptr = std::static_pointer_cast(raft_instance); - asio_listener->listen(raft_server_ptr); + nuraft::ptr casted_raft_server = raft_instance; + asio_listener->listen(casted_raft_server); + + if (!raft_instance) + throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); +} + +void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) +{ + state_machine->init(); + + state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); + + loadLatestConfig(); + + last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config; + + launchRaftServer(enable_ipv6); } void KeeperServer::shutdownRaftServer() @@ -376,15 +400,15 @@ uint64_t KeeperServer::getSyncedFollowerCount() const nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param) { - if (type == nuraft::cb_func::HeartBeat && recover && raft_instance->isClusterHealthy()) + if (type == nuraft::cb_func::HeartBeat && is_recovering && raft_instance->isClusterHealthy()) { auto new_params = raft_instance->get_current_params(); new_params.custom_commit_quorum_size_ = 0; new_params.custom_election_quorum_size_ = 0; raft_instance->update_params(new_params); - LOG_INFO(log, "Recovery is done"); - recover = false; + LOG_INFO(log, "Recovery is done. 
You can continue using cluster normally."); + is_recovering = false; return nuraft::cb_func::ReturnCode::Ok; } @@ -465,7 +489,7 @@ ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::Abstrac auto diff = state_manager->getConfigurationDiff(config); if (!diff.empty()) - last_read_config = state_manager->parseServersConfiguration(config, true).cluster_config; + last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config; return diff; } diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 7b8de39d214..4a0a535975d 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB { @@ -15,33 +16,6 @@ using RaftAppendResult = nuraft::ptrelapsedMilliseconds() < expiry) - return false; - } - - const size_t voting_members = get_num_voting_members(); - const auto not_responding_peers = get_not_responding_peers(); - const auto quorum_size = voting_members / 2 + 1; - const auto max_not_responding_peers = voting_members - quorum_size; - - return not_responding_peers <= max_not_responding_peers; - } - - using nuraft::raft_server::raft_server; - - std::optional timer_from_init = std::make_optional(); - }; - const int server_id; CoordinationSettingsPtr coordination_settings; @@ -50,6 +24,7 @@ private: nuraft::ptr state_manager; + struct KeeperRaftServer; nuraft::ptr raft_instance; nuraft::ptr asio_service; nuraft::ptr asio_listener; @@ -59,7 +34,7 @@ private: std::condition_variable initialized_cv; std::atomic initial_batch_committed = false; - nuraft::ptr last_read_config; + nuraft::ptr last_local_config; Poco::Logger * log; @@ -67,19 +42,15 @@ private: /// Used to determine the moment when raft is ready to server new requests nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); - void startupRaftServer(bool enable_ipv6); /// Almost copy-paste from nuraft::launcher, but with separated server init and start /// Allows to avoid race conditions. - void launchRaftServer( - bool enable_ipv6, - const nuraft::raft_params & params, - const nuraft::asio_service::options & asio_opts); + void launchRaftServer(bool enable_ipv6); void shutdownRaftServer(); void loadLatestConfig(); - std::atomic_bool recover = false; + std::atomic_bool is_recovering = false; public: KeeperServer( @@ -95,9 +66,9 @@ public: /// responses queue void putLocalReadRequest(const KeeperStorage::RequestForSession & request); - bool inRecover() const + bool isRecovering() const { - return recover; + return is_recovering; } /// Put batch of requests into Raft and get result of put. 
Responses will be set separately into From 6c3bf0a5d3b6d6553fa05e1777be15b8a9c6709d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 14 Apr 2022 12:07:33 +0000 Subject: [PATCH 32/94] Format files --- programs/keeper/Keeper.cpp | 4 +- src/Coordination/KeeperServer.cpp | 123 ++++++++++++++++++------------ src/Coordination/KeeperServer.h | 25 ++---- src/Server/KeeperTCPHandler.cpp | 10 +-- 4 files changed, 87 insertions(+), 75 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index d5af2d6148e..3c938bb31c8 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -244,7 +244,7 @@ void Keeper::handleCustomArguments(const std::string & arg, const std::string & if (arg == "force-recovery") { assert(value.empty()); - config().setBool("keeper_server.recover", true); + config().setBool("keeper_server.force_recovery", true); return; } @@ -264,7 +264,7 @@ void Keeper::defineOptions(Poco::Util::OptionSet & options) .repeatable(false) .binding("version")); options.addOption( - Poco::Util::Option("force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration") + Poco::Util::Option("force-recovery", "force-recovery", "Force recovery mode allowing Keeper to overwrite cluster configuration without quorum") .required(false) .repeatable(false) .noArgument() diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 6d571f0392f..30735940062 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -1,23 +1,23 @@ -#include #include +#include #include "config_core.h" -#include +#include +#include +#include #include #include -#include +#include #include +#include #include #include -#include -#include -#include -#include -#include -#include #include #include +#include +#include +#include namespace DB { @@ -63,7 +63,6 @@ void setSSLParams(nuraft::asio_service::options & asio_opts) } #endif - std::string checkAndGetSuperdigest(const String & user_and_digest) { if (user_and_digest.empty()) @@ -72,7 +71,8 @@ std::string checkAndGetSuperdigest(const String & user_and_digest) std::vector scheme_and_id; boost::split(scheme_and_id, user_and_digest, [](char c) { return c == ':'; }); if (scheme_and_id.size() != 2 || scheme_and_id[0] != "super") - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Incorrect superdigest in keeper_server config. Must be 'super:base64string'"); + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, "Incorrect superdigest in keeper_server config. 
Must be 'super:base64string'"); return user_and_digest; } @@ -81,7 +81,12 @@ int32_t getValueOrMaxInt32AndLogWarning(uint64_t value, const std::string & name { if (value > std::numeric_limits::max()) { - LOG_WARNING(log, "Got {} value for setting '{}' which is bigger than int32_t max value, lowering value to {}.", value, name, std::numeric_limits::max()); + LOG_WARNING( + log, + "Got {} value for setting '{}' which is bigger than int32_t max value, lowering value to {}.", + value, + name, + std::numeric_limits::max()); return std::numeric_limits::max(); } @@ -98,13 +103,15 @@ KeeperServer::KeeperServer( : server_id(configuration_and_settings_->server_id) , coordination_settings(configuration_and_settings_->coordination_settings) , state_machine(nuraft::cs_new( - responses_queue_, snapshots_queue_, - configuration_and_settings_->snapshot_storage_path, - coordination_settings, - checkAndGetSuperdigest(configuration_and_settings_->super_digest))) - , state_manager(nuraft::cs_new(server_id, "keeper_server", configuration_and_settings_->log_storage_path, config, coordination_settings)) + responses_queue_, + snapshots_queue_, + configuration_and_settings_->snapshot_storage_path, + coordination_settings, + checkAndGetSuperdigest(configuration_and_settings_->super_digest))) + , state_manager(nuraft::cs_new( + server_id, "keeper_server", configuration_and_settings_->log_storage_path, config, coordination_settings)) , log(&Poco::Logger::get("KeeperServer")) - , is_recovering(config.has("keeper_server.recover") && config.getBool("keeper_server.recover")) + , is_recovering(config.has("keeper_server.force_recovery") && config.getBool("keeper_server.force_recovery")) { if (coordination_settings->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); @@ -116,8 +123,7 @@ struct KeeperServer::KeeperRaftServer : public nuraft::raft_server { if (timer_from_init) { - size_t expiry = get_current_params().heart_beat_interval_ * - raft_server::raft_limits_.response_limit_; + size_t expiry = get_current_params().heart_beat_interval_ * raft_server::raft_limits_.response_limit_; if (timer_from_init->elapsedMilliseconds() < expiry) return false; @@ -186,18 +192,25 @@ void KeeperServer::forceRecovery() void KeeperServer::launchRaftServer(bool enable_ipv6) { nuraft::raft_params params; - params.heart_beat_interval_ = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); - params.election_timeout_lower_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); - params.election_timeout_upper_bound_ = getValueOrMaxInt32AndLogWarning(coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(), "election_timeout_upper_bound_ms", log); + params.heart_beat_interval_ + = getValueOrMaxInt32AndLogWarning(coordination_settings->heart_beat_interval_ms.totalMilliseconds(), "heart_beat_interval_ms", log); + params.election_timeout_lower_bound_ = getValueOrMaxInt32AndLogWarning( + coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(), "election_timeout_lower_bound_ms", log); + params.election_timeout_upper_bound_ = getValueOrMaxInt32AndLogWarning( + coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(), "election_timeout_upper_bound_ms", log); params.reserved_log_items_ = getValueOrMaxInt32AndLogWarning(coordination_settings->reserved_log_items, "reserved_log_items", log); 
params.snapshot_distance_ = getValueOrMaxInt32AndLogWarning(coordination_settings->snapshot_distance, "snapshot_distance", log); params.stale_log_gap_ = getValueOrMaxInt32AndLogWarning(coordination_settings->stale_log_gap, "stale_log_gap", log); params.fresh_log_gap_ = getValueOrMaxInt32AndLogWarning(coordination_settings->fresh_log_gap, "fresh_log_gap", log); - params.client_req_timeout_ = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds(), "operation_timeout_ms", log); + params.client_req_timeout_ + = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds(), "operation_timeout_ms", log); params.auto_forwarding_ = coordination_settings->auto_forwarding; - params.auto_forwarding_req_timeout_ = std::max(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, std::numeric_limits::max()); - params.auto_forwarding_req_timeout_ = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, "operation_timeout_ms", log); - params.max_append_size_ = getValueOrMaxInt32AndLogWarning(coordination_settings->max_requests_batch_size, "max_requests_batch_size", log); + params.auto_forwarding_req_timeout_ + = std::max(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, std::numeric_limits::max()); + params.auto_forwarding_req_timeout_ + = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, "operation_timeout_ms", log); + params.max_append_size_ + = getValueOrMaxInt32AndLogWarning(coordination_settings->max_requests_batch_size, "max_requests_batch_size", log); params.return_method_ = nuraft::raft_params::async_handler; @@ -207,15 +220,17 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) #if USE_SSL setSSLParams(asio_opts); #else - throw Exception{"SSL support for NuRaft is disabled because ClickHouse was built without SSL support.", - ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception{ + "SSL support for NuRaft is disabled because ClickHouse was built without SSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; #endif } if (is_recovering) { - LOG_WARNING(log, "This instance is in recovery mode. Until the quorum is restored, no requests should be sent to any " - "of the cluster instances. This instance will start accepting requests only when the recovery is finished."); + LOG_WARNING( + log, + "This instance is in recovery mode. Until the quorum is restored, no requests should be sent to any " + "of the cluster instances. 
This instance will start accepting requests only when the recovery is finished."); params.with_custom_commit_quorum_size(1); params.with_custom_election_quorum_size(1); @@ -231,10 +246,7 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) init_options.skip_initial_election_timeout_ = state_manager->shouldStartAsFollower(); init_options.start_server_in_constructor_ = false; - init_options.raft_callback_ = [this] (nuraft::cb_func::Type type, nuraft::cb_func::Param * param) - { - return callbackFunc(type, param); - }; + init_options.raft_callback_ = [this](nuraft::cb_func::Type type, nuraft::cb_func::Param * param) { return callbackFunc(type, param); }; nuraft::ptr logger = nuraft::cs_new("RaftInstance", DB::LogsLevel::information); asio_service = nuraft::cs_new(asio_opts, logger); @@ -250,9 +262,8 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) nuraft::ptr casted_state_machine = state_machine; /// raft_server creates unique_ptr from it - nuraft::context * ctx = new nuraft::context( - casted_state_manager, casted_state_machine, - asio_listener, logger, rpc_cli_factory, scheduler, params); + nuraft::context * ctx + = new nuraft::context(casted_state_manager, casted_state_machine, asio_listener, logger, rpc_cli_factory, scheduler, params); raft_instance = nuraft::cs_new(ctx, init_options); @@ -345,7 +356,6 @@ void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions) { - std::vector> entries; for (const auto & [session_id, time, request] : requests_for_sessions) entries.push_back(getZooKeeperLogEntry(session_id, time, request)); @@ -421,7 +431,7 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ if (next_index < last_commited || next_index - last_commited <= 1) commited_store = true; - auto set_initialized = [this] () + auto set_initialized = [this]() { std::unique_lock lock(initialized_mutex); initialized_flag = true; @@ -518,12 +528,21 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) auto result = raft_instance->add_srv(*task.server); if (!result->get_accepted()) - LOG_INFO(log, "Command to add server {} was not accepted for the {} time, will sleep for {} ms and retry", task.server->get_id(), i + 1, sleep_ms * (i + 1)); + LOG_INFO( + log, + "Command to add server {} was not accepted for the {} time, will sleep for {} ms and retry", + task.server->get_id(), + i + 1, + sleep_ms * (i + 1)); std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms * (i + 1))); } if (!added) - throw Exception(ErrorCodes::RAFT_ERROR, "Configuration change to add server (id {}) was not accepted by RAFT after all {} retries", task.server->get_id(), coordination_settings->configuration_change_tries_count); + throw Exception( + ErrorCodes::RAFT_ERROR, + "Configuration change to add server (id {}) was not accepted by RAFT after all {} retries", + task.server->get_id(), + coordination_settings->configuration_change_tries_count); } else if (task.action_type == ConfigUpdateActionType::RemoveServer) { @@ -532,8 +551,10 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) bool removed = false; if (task.server->get_id() == state_manager->server_id()) { - LOG_INFO(log, "Trying to remove leader node (ourself), so will yield leadership and some other node (new leader) will try remove us. 
" - "Probably you will have to run SYSTEM RELOAD CONFIG on the new leader node"); + LOG_INFO( + log, + "Trying to remove leader node (ourself), so will yield leadership and some other node (new leader) will try remove us. " + "Probably you will have to run SYSTEM RELOAD CONFIG on the new leader node"); raft_instance->yield_leadership(); return; @@ -556,12 +577,21 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) auto result = raft_instance->remove_srv(task.server->get_id()); if (!result->get_accepted()) - LOG_INFO(log, "Command to remove server {} was not accepted for the {} time, will sleep for {} ms and retry", task.server->get_id(), i + 1, sleep_ms * (i + 1)); + LOG_INFO( + log, + "Command to remove server {} was not accepted for the {} time, will sleep for {} ms and retry", + task.server->get_id(), + i + 1, + sleep_ms * (i + 1)); std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms * (i + 1))); } if (!removed) - throw Exception(ErrorCodes::RAFT_ERROR, "Configuration change to remove server (id {}) was not accepted by RAFT after all {} retries", task.server->get_id(), coordination_settings->configuration_change_tries_count); + throw Exception( + ErrorCodes::RAFT_ERROR, + "Configuration change to remove server (id {}) was not accepted by RAFT after all {} retries", + task.server->get_id(), + coordination_settings->configuration_change_tries_count); } else if (task.action_type == ConfigUpdateActionType::UpdatePriority) raft_instance->set_priority(task.server->get_id(), task.server->get_priority()); @@ -572,7 +602,6 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) bool KeeperServer::waitConfigurationUpdate(const ConfigUpdateAction & task) { - size_t sleep_ms = 500; if (task.action_type == ConfigUpdateActionType::AddServer) { diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 4a0a535975d..fda867f6153 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -1,12 +1,12 @@ #pragma once -#include -#include -#include -#include #include -#include +#include +#include +#include +#include #include +#include namespace DB { @@ -66,10 +66,7 @@ public: /// responses queue void putLocalReadRequest(const KeeperStorage::RequestForSession & request); - bool isRecovering() const - { - return is_recovering; - } + bool isRecovering() const { return is_recovering; } /// Put batch of requests into Raft and get result of put. Responses will be set separately into /// responses_queue. 
@@ -78,10 +75,7 @@ public: /// Return set of the non-active sessions std::vector getDeadSessions(); - nuraft::ptr getKeeperStateMachine() const - { - return state_machine; - } + nuraft::ptr getKeeperStateMachine() const { return state_machine; } void forceRecovery(); @@ -103,10 +97,7 @@ public: void waitInit(); /// Return true if KeeperServer initialized - bool checkInit() const - { - return initialized_flag; - } + bool checkInit() const { return initialized_flag; } void shutdown(); diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index b3d115c8acd..9f444d25b15 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -365,15 +365,7 @@ void KeeperTCPHandler::runImpl() } else { - String reason; - if (!keeper_dispatcher->checkInit() && !keeper_dispatcher->hasLeader() && !keeper_dispatcher->inRecover()) - reason = "server is not initialized yet and no alive leader exists"; - else if (!keeper_dispatcher->checkInit()) - reason = "server is not initialized yet"; - else - reason = "no alive leader exists"; - - LOG_WARNING(log, "Ignoring user request, because {}", reason); + LOG_WARNING(log, "Ignoring user request, because the server is not acitve yet"); sendHandshake(false); return; } From 4bd24e9fd3dd02bd3088d5aa1f37b5771a9ac2d6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 14 Apr 2022 12:36:46 +0000 Subject: [PATCH 33/94] Update NuRaft --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 1707a7572aa..b6050a4e349 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d +Subproject commit b6050a4e34995f7023912d8e9e03a061f5d4235c From cd548aeb3026b3edd6eba5aa99204855d919deff Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 15 Apr 2022 08:52:34 +0000 Subject: [PATCH 34/94] Refactor test for multiple cluster size --- programs/keeper/Keeper.cpp | 3 +- src/Server/KeeperTCPHandler.cpp | 2 +- .../five_node_cluster/enable_keeper1.xml | 43 +++ .../five_node_cluster/enable_keeper2.xml | 43 +++ .../five_node_cluster/enable_keeper3.xml | 43 +++ .../five_node_cluster/enable_keeper4.xml | 43 +++ .../five_node_cluster/enable_keeper5.xml | 43 +++ .../five_node_cluster/enable_keeper6.xml | 43 +++ .../five_node_cluster/enable_keeper7.xml | 43 +++ .../five_node_cluster/enable_keeper8.xml | 43 +++ .../five_node_cluster/recovered_keeper1.xml | 43 +++ .../configs/five_node_cluster/use_keeper.xml | 36 +++ .../enable_keeper1.xml | 0 .../enable_keeper2.xml | 0 .../enable_keeper3.xml | 0 .../enable_keeper4.xml | 0 .../enable_keeper5.xml | 0 .../recovered_keeper1.xml | 0 .../{ => three_node_cluster}/use_keeper.xml | 4 +- .../test_keeper_force_recovery/test.py | 245 ++++++++++-------- 20 files changed, 570 insertions(+), 107 deletions(-) create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper1.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper2.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper3.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper4.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper5.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper6.xml create mode 100644 
tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper7.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper8.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/recovered_keeper1.xml create mode 100644 tests/integration/test_keeper_force_recovery/configs/five_node_cluster/use_keeper.xml rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/enable_keeper1.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/enable_keeper2.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/enable_keeper3.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/enable_keeper4.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/enable_keeper5.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/recovered_keeper1.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{ => three_node_cluster}/use_keeper.xml (89%) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 3c938bb31c8..b61df4527e5 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -62,6 +62,7 @@ namespace ErrorCodes extern const int NETWORK_ERROR; extern const int MISMATCHING_USERS_FOR_PROCESS_AND_DATA; extern const int FAILED_TO_GETPWUID; + extern const int LOGICAL_ERROR; } namespace @@ -239,7 +240,7 @@ std::string Keeper::getDefaultConfigFileName() const return "keeper_config.xml"; } -void Keeper::handleCustomArguments(const std::string & arg, const std::string & value) +void Keeper::handleCustomArguments(const std::string & arg, [[maybe_unused]] const std::string & value) // NOLINT { if (arg == "force-recovery") { diff --git a/src/Server/KeeperTCPHandler.cpp b/src/Server/KeeperTCPHandler.cpp index 9f444d25b15..41092f61124 100644 --- a/src/Server/KeeperTCPHandler.cpp +++ b/src/Server/KeeperTCPHandler.cpp @@ -365,7 +365,7 @@ void KeeperTCPHandler::runImpl() } else { - LOG_WARNING(log, "Ignoring user request, because the server is not acitve yet"); + LOG_WARNING(log, "Ignoring user request, because the server is not active yet"); sendHandshake(false); return; } diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper1.xml new file mode 100644 index 00000000000..b7f9d1b058e --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper1.xml @@ -0,0 +1,43 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper2.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper2.xml new file mode 100644 index 00000000000..b773d59f259 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper2.xml @@ -0,0 +1,43 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 
+ + + 3 + node3 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper3.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper3.xml new file mode 100644 index 00000000000..d4c2befc10f --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper3.xml @@ -0,0 +1,43 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper4.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper4.xml new file mode 100644 index 00000000000..c039e709c9e --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper4.xml @@ -0,0 +1,43 @@ + + + 9181 + 4 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper5.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper5.xml new file mode 100644 index 00000000000..fb43b6524c8 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper5.xml @@ -0,0 +1,43 @@ + + + 9181 + 5 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 3 + node3 + 9234 + + + 4 + node4 + 9234 + + + 5 + node5 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper6.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper6.xml new file mode 100644 index 00000000000..430e662bf36 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper6.xml @@ -0,0 +1,43 @@ + + + 9181 + 6 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 6 + node6 + 9234 + + + 7 + node7 + 9234 + + + 8 + node8 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper7.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper7.xml new file mode 100644 index 00000000000..aa10774ad7d --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper7.xml @@ -0,0 +1,43 @@ + + + 9181 + 7 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 6 + node6 + 9234 + + + 7 + node7 + 9234 + + + 8 + node8 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper8.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper8.xml new file mode 100644 index 00000000000..4f1c21a1084 --- /dev/null +++ 
b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper8.xml @@ -0,0 +1,43 @@ + + + 9181 + 8 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 6 + node6 + 9234 + + + 7 + node7 + 9234 + + + 8 + node8 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/recovered_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/recovered_keeper1.xml new file mode 100644 index 00000000000..eaf0f01afc9 --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/recovered_keeper1.xml @@ -0,0 +1,43 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + 6 + node6 + 9234 + + + 7 + node7 + 9234 + + + 8 + node8 + 9234 + + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/use_keeper.xml b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/use_keeper.xml new file mode 100644 index 00000000000..f41e8c6e49c --- /dev/null +++ b/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/use_keeper.xml @@ -0,0 +1,36 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + node4 + 9181 + + + node5 + 9181 + + + node6 + 9181 + + + node7 + 9181 + + + node8 + 9181 + + + diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper1.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper1.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper2.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper2.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper3.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper3.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper4.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper4.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper5.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper5.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml 
b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/recovered_keeper1.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/recovered_keeper1.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/use_keeper.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml similarity index 89% rename from tests/integration/test_keeper_force_recovery/configs/use_keeper.xml rename to tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml index 5555ed1dc7b..22bebe41aa0 100644 --- a/tests/integration/test_keeper_force_recovery/configs/use_keeper.xml +++ b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml @@ -13,11 +13,11 @@ 9181 - node3 + node4 9181 - node3 + node5 9181 diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py index 7ca9254c2a9..1e4121ad931 100644 --- a/tests/integration/test_keeper_force_recovery/test.py +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -4,59 +4,57 @@ import socket from helpers.cluster import ClickHouseCluster import time -cluster = ClickHouseCluster(__file__) -CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") +BASE_DIR = os.path.dirname(os.path.realpath(__file__)) -node1 = cluster.add_instance( - "node1", - main_configs=["configs/enable_keeper1.xml", "configs/use_keeper.xml"], - stay_alive=True, -) -node2 = cluster.add_instance( - "node2", - main_configs=["configs/enable_keeper2.xml", "configs/use_keeper.xml"], - stay_alive=True, -) -node3 = cluster.add_instance( - "node3", - main_configs=["configs/enable_keeper3.xml", "configs/use_keeper.xml"], - stay_alive=True, -) -node4 = cluster.add_instance( - "node4", - main_configs=["configs/enable_keeper4.xml", "configs/use_keeper.xml"], - stay_alive=True, -) -node5 = cluster.add_instance( - "node5", - main_configs=["configs/enable_keeper5.xml", "configs/use_keeper.xml"], - stay_alive=True, -) - -from kazoo.client import KazooClient, KazooState +from kazoo.client import KazooClient -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() +def get_quorum_size(cluster_size): + return cluster_size // 2 + 1 - yield cluster - finally: - cluster.shutdown() +def get_config_dir(cluster_size): + if cluster_size == 3: + return "configs/three_node_cluster" + elif cluster_size == 5: + return "configs/five_node_cluster" + else: + raise Exception("Invalid cluster size {}", cluster_size) + + +def create_and_start_cluster(cluster_size): + cluster = ClickHouseCluster(__file__) + config_dir = get_config_dir(cluster_size) + + quorum_size = get_quorum_size(cluster_size) + + nodes = [] + for i in range(1, cluster_size + quorum_size + 1): + nodes.append( + cluster.add_instance( + f"node{i}", + main_configs=[ + f"{config_dir}/enable_keeper{i}.xml", + f"{config_dir}/use_keeper.xml", + ], + stay_alive=True, + ) + ) + + cluster.start() + return cluster, nodes def smaller_exception(ex): return "\n".join(str(ex).split("\n")[0:2]) -def wait_node(node): +def wait_node(cluster, node): for _ in range(100): zk = None try: node.query("SELECT * FROM system.zookeeper WHERE path = '/'") - zk = get_fake_zk(node.name, timeout=30.0) + zk = get_fake_zk(cluster, node.name, timeout=30.0) zk.create("/test", sequence=True) print("node", node.name, "ready") break 
@@ -71,29 +69,31 @@ def wait_node(node): raise Exception("Can't wait node", node.name, "to become ready") -def wait_nodes(nodes): +def wait_nodes(cluster, nodes): for node in nodes: - wait_node(node) + wait_node(cluster, node) -def get_fake_zk(nodename, timeout=30.0): +def get_fake_zk(cluster, nodename, timeout=30.0): _fake_zk_instance = KazooClient( hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout ) _fake_zk_instance.start() return _fake_zk_instance -def get_keeper_socket(node_name): + +def get_keeper_socket(cluster, node_name): hosts = cluster.get_instance_ip(node_name) client = socket.socket() client.settimeout(10) client.connect((hosts, 9181)) return client -def send_4lw_cmd(node_name=node1.name, cmd="ruok"): + +def send_4lw_cmd(cluster, node_name, cmd="ruok"): client = None try: - client = get_keeper_socket(node_name) + client = get_keeper_socket(cluster, node_name) client.send(cmd.encode()) data = client.recv(100_000) data = data.decode() @@ -101,98 +101,137 @@ def send_4lw_cmd(node_name=node1.name, cmd="ruok"): finally: if client is not None: client.close() - -def wait_until_connected(node_name): - while send_4lw_cmd(node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG: - time.sleep(0.2) + + +def wait_until_connected(cluster, node_name): + while send_4lw_cmd(cluster, node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG: + time.sleep(0.1) + def wait_and_assert_data(zk, path, data): while zk.exists(path) is None: - time.sleep(0.2) + time.sleep(0.1) assert zk.get(path)[0] == data.encode() +def close_zk(zk): + zk.stop() + zk.close() + + NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests" -def test_three_node_recovery(started_cluster): + +@pytest.mark.parametrize("cluster_size", [3, 5]) +def test_three_node_recovery(cluster_size): + cluster, nodes = create_and_start_cluster(3) + quorum_size = get_quorum_size(cluster_size) + node_zks = [] try: - # initial cluster is node1 <-> node2 <-> node3 - node4.stop_clickhouse(); - node5.stop_clickhouse(); + # initial cluster of `cluster_size` nodes + for node in nodes[cluster_size:]: + node.stop_clickhouse() - wait_nodes([node1, node2, node3]) - node1_zk = get_fake_zk("node1") - node2_zk = get_fake_zk("node2") - node3_zk = get_fake_zk("node3") - node4_zk = None - node5_zk = None + wait_nodes(cluster, nodes[:cluster_size]) - for i, zk in enumerate([node1_zk, node2_zk, node3_zk]): - zk.create(f"/test_force_recovery_node{i+1}", f"somedata{i+1}".encode()) + node_zks = [get_fake_zk(cluster, node.name) for node in nodes[:cluster_size]] - for zk in [node1_zk, node2_zk, node3_zk]: - for i in range(1, 4): - wait_and_assert_data(zk, f"/test_force_recovery_node{i}", f"somedata{i}") + data_in_cluster = [] - node1.stop_clickhouse() + def add_data(zk, path, data): + zk.create(path, data.encode()) + data_in_cluster.append((path, data)) - node2_zk.create("/test_force_recovery_extra", b"someexstradata") - wait_and_assert_data(node3_zk, "/test_force_recovery_extra", "someexstradata") + def assert_all_data(zk): + for path, data in data_in_cluster: + wait_and_assert_data(zk, path, data) - node1.start_clickhouse() - wait_and_assert_data(node1_zk, "/test_force_recovery_extra", "someexstradata") + for i, zk in enumerate(node_zks): + add_data(zk, f"/test_force_recovery_node{i+1}", f"somedata{i+1}") - node2.stop_clickhouse() - node3.stop_clickhouse() + for zk in node_zks: + assert_all_data(zk) + + nodes[0].stop_clickhouse() + + add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") + + for node_zk in 
node_zks[2:cluster_size]: + wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra") + + nodes[0].start_clickhouse() + wait_and_assert_data(node_zks[0], "/test_force_recovery_extra", "somedataextra") + + # stop last quorum size nodes + nodes_left = cluster_size - quorum_size + for node_zk in node_zks[nodes_left:cluster_size]: + close_zk(node_zk) + + node_zks = node_zks[:nodes_left] + + for node in nodes[nodes_left:cluster_size]: + node.stop_clickhouse() # wait for node1 to lose quorum - while send_4lw_cmd(node1.name, "mntr") != NOT_SERVING_REQUESTS_ERROR_MSG: + while ( + send_4lw_cmd(cluster, nodes[0].name, "mntr") + != NOT_SERVING_REQUESTS_ERROR_MSG + ): time.sleep(0.2) - node1.copy_file_to_container( - os.path.join(CONFIG_DIR, "recovered_keeper1.xml"), - "/etc/clickhouse-server/config.d/enable_keeper1.xml") + nodes[0].copy_file_to_container( + os.path.join( + BASE_DIR, get_config_dir(cluster_size), "recovered_keeper1.xml" + ), + "/etc/clickhouse-server/config.d/enable_keeper1.xml", + ) - node1.query("SYSTEM RELOAD CONFIG") + nodes[0].query("SYSTEM RELOAD CONFIG") - assert send_4lw_cmd(node1.name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG - send_4lw_cmd(node1.name, "rcvr") - assert send_4lw_cmd(node1.name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG + assert ( + send_4lw_cmd(cluster, nodes[0].name, "mntr") + == NOT_SERVING_REQUESTS_ERROR_MSG + ) + send_4lw_cmd(cluster, nodes[0].name, "rcvr") + assert ( + send_4lw_cmd(cluster, nodes[0].name, "mntr") + == NOT_SERVING_REQUESTS_ERROR_MSG + ) - node4.start_clickhouse() - wait_node(node4) - wait_until_connected(node4.name) + # add one node to restore the quorum + nodes[cluster_size].start_clickhouse() + wait_node(cluster, nodes[cluster_size]) + wait_until_connected(cluster, nodes[cluster_size].name) # node1 should have quorum now and accept requests - wait_until_connected(node1.name) + wait_until_connected(cluster, nodes[0].name) - node5.start_clickhouse() - wait_node(node5) - wait_until_connected(node5.name) + node_zks.append(get_fake_zk(cluster, nodes[cluster_size].name)) - node4_zk = get_fake_zk("node4") - node5_zk = get_fake_zk("node5") + # add rest of the nodes + for node in nodes[cluster_size + 1 :]: + node.start_clickhouse() + wait_node(cluster, node) + wait_until_connected(cluster, node.name) + node_zks.append(get_fake_zk(cluster, node.name)) - for zk in [node1_zk, node4_zk, node5_zk]: - for i in range(1, 4): - wait_and_assert_data(zk, f"/test_force_recovery_node{i}", f"somedata{i}") - wait_and_assert_data(zk, "/test_force_recovery_extra", "someexstradata") + for zk in node_zks: + assert_all_data(zk) # new nodes can achieve quorum without the recovery node (cluster should work properly from now on) - node1.stop_clickhouse() + nodes[0].stop_clickhouse() - node4_zk.create("/test_force_recovery_node4", b"somedata4") - wait_and_assert_data(node5_zk, "/test_force_recovery_node4", "somedata4") + add_data(node_zks[-2], "/test_force_recovery_last", "somedatalast") + wait_and_assert_data(node_zks[-1], "/test_force_recovery_last", "somedatalast") - node1.start_clickhouse() - for i in range(1, 5): - wait_and_assert_data(node1_zk, f"/test_force_recovery_node{i}", f"somedata{i}") - wait_and_assert_data(node1_zk, "/test_force_recovery_extra", "someexstradata") + nodes[0].start_clickhouse() + for zk in node_zks[:nodes_left]: + assert_all_data(zk) finally: try: - for zk_conn in [node1_zk, node2_zk, node3_zk, node4_zk, node5_zk]: - if zk_conn: - zk_conn.stop() - zk_conn.close() + for zk_conn in node_zks: + close_zk(zk_conn) except: 
pass + + cluster.shutdown() From 608c0996d01fa4a0ee1cf6277c6c2ca6fd97483f Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Apr 2022 08:08:13 +0000 Subject: [PATCH 35/94] Fix tests --- src/Coordination/KeeperDispatcher.cpp | 10 ++ src/Coordination/KeeperServer.cpp | 116 +++++++++++++----- src/Coordination/KeeperServer.h | 4 + .../test_keeper_force_recovery/test.py | 42 +++++-- 4 files changed, 130 insertions(+), 42 deletions(-) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 73e39183bc1..4ab26968ed3 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -547,6 +547,13 @@ void KeeperDispatcher::updateConfigurationThread() continue; } + if (server->isRecovering()) + { + LOG_INFO(log, "Server is recovering, will not apply configuration until recovery is finished"); + std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + continue; + } + ConfigUpdateAction action; if (!update_configuration_queue.pop(action)) break; @@ -556,6 +563,9 @@ void KeeperDispatcher::updateConfigurationThread() bool done = false; while (!done) { + if (server->isRecovering()) + break; + if (shutdown_called) return; diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 30735940062..e919d4afe79 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -139,6 +140,20 @@ struct KeeperServer::KeeperRaftServer : public nuraft::raft_server return not_responding_peers <= max_not_responding_peers; } + // Manually set the internal config of the raft server + // This should be used only for recovery + void setConfig(const nuraft::ptr & new_config) + { + set_config(new_config); + } + + // Manually reconfigure the cluster + // This should be used only for recovery + void forceReconfigure(const nuraft::ptr & new_config) + { + reconfigure(new_config); + } + using nuraft::raft_server::raft_server; // peers are initially marked as responding because at least one cycle @@ -182,11 +197,33 @@ void KeeperServer::loadLatestConfig() } } +void KeeperServer::recoveryMode(nuraft::raft_params & params) +{ + LOG_WARNING( + log, + "This instance is in recovery mode. Until the quorum is restored, no requests should be sent to any " + "of the cluster instances. This instance will start accepting requests only when the recovery is finished."); + + auto latest_config = state_manager->load_config(); + + nuraft::ptr new_config = std::make_shared(0, latest_config ? 
latest_config->get_log_idx() : 0); + new_config->set_log_idx(state_manager->load_log_store()->next_slot()); + + new_config->get_servers() = last_local_config->get_servers(); + + state_manager->save_config(*new_config); + params.with_custom_commit_quorum_size(1); + params.with_custom_election_quorum_size(1); +} + void KeeperServer::forceRecovery() { - shutdownRaftServer(); is_recovering = true; - launchRaftServer(true); + std::lock_guard lock{server_mutex}; + auto params = raft_instance->get_current_params(); + recoveryMode(params); + raft_instance->setConfig(state_manager->load_config()); + raft_instance->update_params(params); } void KeeperServer::launchRaftServer(bool enable_ipv6) @@ -220,26 +257,14 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) #if USE_SSL setSSLParams(asio_opts); #else - throw Exception{ - "SSL support for NuRaft is disabled because ClickHouse was built without SSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception( + "SSL support for NuRaft is disabled because ClickHouse was built without SSL support.", ErrorCodes::SUPPORT_IS_DISABLED); #endif } if (is_recovering) { - LOG_WARNING( - log, - "This instance is in recovery mode. Until the quorum is restored, no requests should be sent to any " - "of the cluster instances. This instance will start accepting requests only when the recovery is finished."); - params.with_custom_commit_quorum_size(1); - params.with_custom_election_quorum_size(1); - - auto latest_config = state_manager->load_config(); - auto new_config = std::make_shared(0, latest_config ? latest_config->get_log_idx() : 0); - new_config->get_servers() = last_local_config->get_servers(); - new_config->set_log_idx(state_manager->getLogStore()->next_slot()); - - state_manager->save_config(*new_config); + recoveryMode(params); } nuraft::raft_server::init_options init_options; @@ -248,7 +273,7 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) init_options.start_server_in_constructor_ = false; init_options.raft_callback_ = [this](nuraft::cb_func::Type type, nuraft::cb_func::Param * param) { return callbackFunc(type, param); }; - nuraft::ptr logger = nuraft::cs_new("RaftInstance", DB::LogsLevel::information); + nuraft::ptr logger = nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level); asio_service = nuraft::cs_new(asio_opts, logger); asio_listener = asio_service->create_rpc_listener(state_manager->getPort(), logger, enable_ipv6); @@ -360,6 +385,10 @@ RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForS for (const auto & [session_id, time, request] : requests_for_sessions) entries.push_back(getZooKeeperLogEntry(session_id, time, request)); + std::lock_guard lock{server_mutex}; + if (is_recovering) + return nullptr; + return raft_instance->append_entries(entries); } @@ -410,16 +439,32 @@ uint64_t KeeperServer::getSyncedFollowerCount() const nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param) { - if (type == nuraft::cb_func::HeartBeat && is_recovering && raft_instance->isClusterHealthy()) + if (is_recovering) { - auto new_params = raft_instance->get_current_params(); - new_params.custom_commit_quorum_size_ = 0; - new_params.custom_election_quorum_size_ = 0; - raft_instance->update_params(new_params); + if (type == nuraft::cb_func::HeartBeat && raft_instance->isClusterHealthy()) + { + auto new_params = raft_instance->get_current_params(); + new_params.custom_commit_quorum_size_ = 0; + new_params.custom_election_quorum_size_ = 0; + 
raft_instance->update_params(new_params); - LOG_INFO(log, "Recovery is done. You can continue using cluster normally."); - is_recovering = false; - return nuraft::cb_func::ReturnCode::Ok; + LOG_INFO(log, "Recovery is done. You can continue using cluster normally."); + is_recovering = false; + return nuraft::cb_func::ReturnCode::Ok; + } + + if (type == nuraft::cb_func::NewConfig) + { + // Apply the manually set config when in recovery mode + // NuRaft will commit but skip the reconfigure if the current + // config is the same as the committed one + // Because we manually set the config to commit + // we need to call the reconfigure also + uint64_t log_idx = *static_cast(param->ctx); + if (log_idx == state_manager->load_config()->get_log_idx()) + raft_instance->forceReconfigure(state_manager->load_config()); + return nuraft::cb_func::ReturnCode::Ok; + } } if (initialized_flag) @@ -499,19 +544,26 @@ ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::Abstrac auto diff = state_manager->getConfigurationDiff(config); if (!diff.empty()) + { + std::lock_guard lock{server_mutex}; last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config; + } return diff; } void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) { + std::lock_guard lock{server_mutex}; + if (is_recovering) + return; + size_t sleep_ms = 500; if (task.action_type == ConfigUpdateActionType::AddServer) { LOG_INFO(log, "Will try to add server with id {}", task.server->get_id()); bool added = false; - for (size_t i = 0; i < coordination_settings->configuration_change_tries_count; ++i) + for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) { if (raft_instance->get_srv_config(task.server->get_id()) != nullptr) { @@ -560,7 +612,7 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) return; } - for (size_t i = 0; i < coordination_settings->configuration_change_tries_count; ++i) + for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) { if (raft_instance->get_srv_config(task.server->get_id()) == nullptr) { @@ -602,11 +654,15 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) bool KeeperServer::waitConfigurationUpdate(const ConfigUpdateAction & task) { + std::lock_guard lock{server_mutex}; + if (is_recovering) + return false; + size_t sleep_ms = 500; if (task.action_type == ConfigUpdateActionType::AddServer) { LOG_INFO(log, "Will try to wait server with id {} to be added", task.server->get_id()); - for (size_t i = 0; i < coordination_settings->configuration_change_tries_count; ++i) + for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) { if (raft_instance->get_srv_config(task.server->get_id()) != nullptr) { @@ -628,7 +684,7 @@ bool KeeperServer::waitConfigurationUpdate(const ConfigUpdateAction & task) { LOG_INFO(log, "Will try to wait remove of server with id {}", task.server->get_id()); - for (size_t i = 0; i < coordination_settings->configuration_change_tries_count; ++i) + for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) { if (raft_instance->get_srv_config(task.server->get_id()) == nullptr) { diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index fda867f6153..19d346de345 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -5,6 +5,7 @@ #include #include 
#include +#include #include #include @@ -28,6 +29,7 @@ private: nuraft::ptr raft_instance; nuraft::ptr asio_service; nuraft::ptr asio_listener; + mutable std::mutex server_mutex; std::mutex initialized_mutex; std::atomic initialized_flag = false; @@ -50,6 +52,8 @@ private: void loadLatestConfig(); + void recoveryMode(nuraft::raft_params & params); + std::atomic_bool is_recovering = false; public: diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py index 1e4121ad931..a0f7e87e3f6 100644 --- a/tests/integration/test_keeper_force_recovery/test.py +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -29,26 +29,27 @@ def create_and_start_cluster(cluster_size): quorum_size = get_quorum_size(cluster_size) nodes = [] - for i in range(1, cluster_size + quorum_size + 1): + for i in range(cluster_size): nodes.append( cluster.add_instance( - f"node{i}", + f"node{i+1}", main_configs=[ - f"{config_dir}/enable_keeper{i}.xml", + f"{config_dir}/enable_keeper{i+1}.xml", f"{config_dir}/use_keeper.xml", ], stay_alive=True, ) ) + for i in range(cluster_size, cluster_size + quorum_size): + nodes.append( + cluster.add_instance(f"node{i+1}", main_configs=[], stay_alive=True) + ) + cluster.start() return cluster, nodes -def smaller_exception(ex): - return "\n".join(str(ex).split("\n")[0:2]) - - def wait_node(cluster, node): for _ in range(100): zk = None @@ -123,8 +124,8 @@ NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving request @pytest.mark.parametrize("cluster_size", [3, 5]) -def test_three_node_recovery(cluster_size): - cluster, nodes = create_and_start_cluster(3) +def test_cluster_recovery(cluster_size): + cluster, nodes = create_and_start_cluster(cluster_size) quorum_size = get_quorum_size(cluster_size) node_zks = [] try: @@ -160,6 +161,8 @@ def test_three_node_recovery(cluster_size): wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra") nodes[0].start_clickhouse() + wait_node(cluster, nodes[0]) + node_zks[0] = get_fake_zk(cluster, nodes[0].name) wait_and_assert_data(node_zks[0], "/test_force_recovery_extra", "somedataextra") # stop last quorum size nodes @@ -199,8 +202,16 @@ def test_three_node_recovery(cluster_size): ) # add one node to restore the quorum + nodes[cluster_size].copy_file_to_container( + os.path.join( + BASE_DIR, + get_config_dir(cluster_size), + f"enable_keeper{cluster_size+1}.xml", + ), + f"/etc/clickhouse-server/config.d/enable_keeper{cluster_size+1}.xml", + ) + nodes[cluster_size].start_clickhouse() - wait_node(cluster, nodes[cluster_size]) wait_until_connected(cluster, nodes[cluster_size].name) # node1 should have quorum now and accept requests @@ -209,9 +220,15 @@ def test_three_node_recovery(cluster_size): node_zks.append(get_fake_zk(cluster, nodes[cluster_size].name)) # add rest of the nodes - for node in nodes[cluster_size + 1 :]: + for i in range(cluster_size + 1, len(nodes)): + node = nodes[i] + node.copy_file_to_container( + os.path.join( + BASE_DIR, get_config_dir(cluster_size), f"enable_keeper{i+1}.xml" + ), + f"/etc/clickhouse-server/config.d/enable_keeper{i+1}.xml", + ) node.start_clickhouse() - wait_node(cluster, node) wait_until_connected(cluster, node.name) node_zks.append(get_fake_zk(cluster, node.name)) @@ -225,6 +242,7 @@ def test_three_node_recovery(cluster_size): wait_and_assert_data(node_zks[-1], "/test_force_recovery_last", "somedatalast") nodes[0].start_clickhouse() + node_zks[0] = get_fake_zk(cluster, nodes[0].name) for zk in 
node_zks[:nodes_left]: assert_all_data(zk) finally: From 73db184ad69378d72da5c1860fef2f7302ef7538 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Apr 2022 10:23:54 +0000 Subject: [PATCH 36/94] Ignore requests on leader while in recovery mode --- src/Coordination/KeeperServer.cpp | 53 +++++++++++++++++++------------ 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index e919d4afe79..73bfc359181 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -441,29 +441,40 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ { if (is_recovering) { - if (type == nuraft::cb_func::HeartBeat && raft_instance->isClusterHealthy()) + switch (type) { - auto new_params = raft_instance->get_current_params(); - new_params.custom_commit_quorum_size_ = 0; - new_params.custom_election_quorum_size_ = 0; - raft_instance->update_params(new_params); + case nuraft::cb_func::HeartBeat: + { + if (raft_instance->isClusterHealthy()) + { + auto new_params = raft_instance->get_current_params(); + new_params.custom_commit_quorum_size_ = 0; + new_params.custom_election_quorum_size_ = 0; + raft_instance->update_params(new_params); - LOG_INFO(log, "Recovery is done. You can continue using cluster normally."); - is_recovering = false; - return nuraft::cb_func::ReturnCode::Ok; - } - - if (type == nuraft::cb_func::NewConfig) - { - // Apply the manually set config when in recovery mode - // NuRaft will commit but skip the reconfigure if the current - // config is the same as the committed one - // Because we manually set the config to commit - // we need to call the reconfigure also - uint64_t log_idx = *static_cast(param->ctx); - if (log_idx == state_manager->load_config()->get_log_idx()) - raft_instance->forceReconfigure(state_manager->load_config()); - return nuraft::cb_func::ReturnCode::Ok; + LOG_INFO(log, "Recovery is done. 
You can continue using cluster normally."); + is_recovering = false; + } + break; + } + case nuraft::cb_func::NewConfig: + { + // Apply the manually set config when in recovery mode + // NuRaft will commit but skip the reconfigure if the current + // config is the same as the committed one + // Because we manually set the config to commit + // we need to call the reconfigure also + uint64_t log_idx = *static_cast(param->ctx); + if (log_idx == state_manager->load_config()->get_log_idx()) + raft_instance->forceReconfigure(state_manager->load_config()); + break; + } + case nuraft::cb_func::ProcessReq: + // we don't accept requests from our peers or clients + // while in recovery mode + return nuraft::cb_func::ReturnCode::ReturnNull; + default: + break; } } From e129f8ce004edd253fcf11d2c5ea62434a917b86 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Apr 2022 12:39:34 +0000 Subject: [PATCH 37/94] Small fix --- src/Coordination/FourLetterCommand.cpp | 6 +++--- src/Coordination/KeeperDispatcher.h | 5 ----- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index ebe54c634fd..5ea3b1124f3 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -207,14 +207,14 @@ constexpr auto * SERVER_NOT_ACTIVE_MSG = "This instance is not currently serving String MonitorCommand::run() { + if (!keeper_dispatcher.isServerActive()) + return SERVER_NOT_ACTIVE_MSG; + auto & stats = keeper_dispatcher.getKeeperConnectionStats(); Keeper4LWInfo keeper_info = keeper_dispatcher.getKeeper4LWInfo(); const auto & state_machine = keeper_dispatcher.getStateMachine(); - if (!keeper_dispatcher.isServerActive()) - return SERVER_NOT_ACTIVE_MSG; - StringBuffer ret; print(ret, "version", String(VERSION_DESCRIBE) + "-" + VERSION_GITHASH); diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index 67973a42be7..af7b132ac3a 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -118,11 +118,6 @@ public: bool isServerActive() const; - bool serverIsRecovering() const - { - return server->isRecovering(); - } - /// Registered in ConfigReloader callback. Add new configuration changes to /// update_configuration_queue. Keeper Dispatcher apply them asynchronously. void updateConfiguration(const Poco::Util::AbstractConfiguration & config); From 7e6a0bc8ae3116b6f5d724241419d96ad9f663b5 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Apr 2022 12:42:40 +0000 Subject: [PATCH 38/94] Add steps for force recovery of cluster --- docs/en/operations/clickhouse-keeper.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 26d61dabaf9..f54cfee601e 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -318,3 +318,28 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 -- 4. Copy snapshot to ClickHouse server nodes with a configured `keeper` or start ClickHouse Keeper instead of ZooKeeper. The snapshot must persist on all nodes, otherwise, empty nodes can be faster and one of them can become a leader. [Original article](https://clickhouse.com/docs/en/operations/clickhouse-keeper/) + +## Recovering after losing quorum + +Because ClickHouse Keeper uses Raft, it can tolerate a certain amount of node crashes depending on the cluster size. \ +E.g.
for a 3-node cluster, it will continue working correctly if only 1 node crashes. + +Cluster configuration can be dynamically configured but there are some limitations. Reconfiguration also relies on Raft, +so to add/remove a node from the cluster you need to have a quorum. If you lose too many nodes in your cluster at the same time without any chance +of starting them again, Raft will stop working and will not allow you to reconfigure your cluster in the conventional way. + +Nevertheless, ClickHouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node. +This should be done only as your last resort if you cannot start your nodes again, or start a new instance on the same endpoint. + +Important things to note before continuing: +- Make sure that the failed nodes cannot connect to the cluster again. +- Do not start any of the new nodes until the steps tell you to. + +After making sure that the above things are true, you need to do the following: +1. Pick a single Keeper node to be your new leader. +2. Before doing anything else, make a backup of the `log_storage_path` folder of the picked node. +3. Reconfigure the cluster on all of the nodes you want to use. +4. Send the four-letter command `rcvr` to the node you picked, which will move the node to the recovery mode. +5. One by one, start Keeper instances on the new nodes, making sure that `mntr` returns `follower` for the `zk_server_state` before starting the next one. +6. While in the recovery mode, the leader node will return an error message for the `mntr` command until it achieves quorum with the new nodes, and it will refuse any requests from the clients and the followers. +7. After the quorum is achieved, the leader node will return to the normal mode of operation, accepting all requests using Raft - verify with `mntr`, which should return `leader` for the `zk_server_state`. From bb0d941adddd4726c6151e0ac8db5c84fbd6a223 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 20 Apr 2022 07:23:21 +0000 Subject: [PATCH 39/94] Run tests on a single cluster --- src/Coordination/KeeperServer.cpp | 4 +- .../enable_keeper1.xml | 0 .../enable_keeper2.xml | 0 .../enable_keeper3.xml | 0 .../enable_keeper4.xml | 0 .../enable_keeper5.xml | 0 .../enable_keeper6.xml | 0 .../enable_keeper7.xml | 0 .../enable_keeper8.xml | 0 .../recovered_keeper1.xml | 0 .../three_node_cluster/enable_keeper1.xml | 33 ---- .../three_node_cluster/enable_keeper2.xml | 33 ---- .../three_node_cluster/enable_keeper3.xml | 33 ---- .../three_node_cluster/enable_keeper4.xml | 35 ---- .../three_node_cluster/enable_keeper5.xml | 33 ---- .../three_node_cluster/recovered_keeper1.xml | 33 ---- .../configs/three_node_cluster/use_keeper.xml | 24 --- .../{five_node_cluster => }/use_keeper.xml | 0 .../test_keeper_force_recovery/test.py | 159 +++++++----------- 19 files changed, 63 insertions(+), 324 deletions(-) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper1.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper2.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper3.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper4.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper5.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper6.xml (100%)
rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper7.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/enable_keeper8.xml (100%) rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/recovered_keeper1.xml (100%) delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper1.xml delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper2.xml delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper3.xml delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper4.xml delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper5.xml delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/recovered_keeper1.xml delete mode 100644 tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml rename tests/integration/test_keeper_force_recovery/configs/{five_node_cluster => }/use_keeper.xml (100%) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 73bfc359181..c4ef8d7e7c4 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -263,9 +263,7 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) } if (is_recovering) - { recoveryMode(params); - } nuraft::raft_server::init_options init_options; @@ -292,7 +290,7 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) raft_instance = nuraft::cs_new(ctx, init_options); - raft_instance->start_server(state_manager->shouldStartAsFollower()); + raft_instance->start_server(init_options.skip_initial_election_timeout_); nuraft::ptr casted_raft_server = raft_instance; asio_listener->listen(casted_raft_server); diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper1.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper1.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper2.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper2.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper2.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper3.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper3.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper3.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper4.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper4.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper4.xml diff --git 
a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper5.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper5.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper5.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper6.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper6.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper6.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper6.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper7.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper7.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper7.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper7.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper8.xml b/tests/integration/test_keeper_force_recovery/configs/enable_keeper8.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/enable_keeper8.xml rename to tests/integration/test_keeper_force_recovery/configs/enable_keeper8.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/recovered_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/recovered_keeper1.xml rename to tests/integration/test_keeper_force_recovery/configs/recovered_keeper1.xml diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper1.xml deleted file mode 100644 index 441c1bc185d..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper1.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - 9181 - 1 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - - - 2 - node2 - 9234 - - - 3 - node3 - 9234 - - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper2.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper2.xml deleted file mode 100644 index e2e2c1fd7db..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper2.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - 9181 - 2 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - - - 2 - node2 - 9234 - - - 3 - node3 - 9234 - - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper3.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper3.xml deleted file mode 100644 index e2ac0400d88..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper3.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - 9181 - 3 - /var/lib/clickhouse/coordination/log - 
/var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - - - 2 - node2 - 9234 - - - 3 - node3 - 9234 - - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper4.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper4.xml deleted file mode 100644 index b123d9970e4..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper4.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - 9181 - 4 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - true - 3 - - - 4 - node4 - 9234 - - - 5 - node5 - 9234 - - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper5.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper5.xml deleted file mode 100644 index dfcd44cf6ed..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/enable_keeper5.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - 9181 - 5 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - - - 4 - node4 - 9234 - - - 5 - node5 - 9234 - - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/recovered_keeper1.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/recovered_keeper1.xml deleted file mode 100644 index 8ead85ce501..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/recovered_keeper1.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - 9181 - 1 - /var/lib/clickhouse/coordination/log - /var/lib/clickhouse/coordination/snapshots - - - 5000 - 10000 - 75 - trace - - - - - 1 - node1 - 9234 - - - 4 - node4 - 9234 - - - 5 - node5 - 9234 - - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml b/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml deleted file mode 100644 index 22bebe41aa0..00000000000 --- a/tests/integration/test_keeper_force_recovery/configs/three_node_cluster/use_keeper.xml +++ /dev/null @@ -1,24 +0,0 @@ - - - - node1 - 9181 - - - node2 - 9181 - - - node3 - 9181 - - - node4 - 9181 - - - node5 - 9181 - - - diff --git a/tests/integration/test_keeper_force_recovery/configs/five_node_cluster/use_keeper.xml b/tests/integration/test_keeper_force_recovery/configs/use_keeper.xml similarity index 100% rename from tests/integration/test_keeper_force_recovery/configs/five_node_cluster/use_keeper.xml rename to tests/integration/test_keeper_force_recovery/configs/use_keeper.xml diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py index a0f7e87e3f6..181c4189443 100644 --- a/tests/integration/test_keeper_force_recovery/test.py +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -4,78 +4,53 @@ import socket from helpers.cluster import ClickHouseCluster import time -BASE_DIR = os.path.dirname(os.path.realpath(__file__)) from kazoo.client import KazooClient - -def get_quorum_size(cluster_size): - return cluster_size // 2 + 1 +CLUSTER_SIZE = 5 +QUORUM_SIZE = CLUSTER_SIZE // 2 + 1 -def get_config_dir(cluster_size): - if cluster_size == 3: - return "configs/three_node_cluster" - elif cluster_size == 5: - return "configs/five_node_cluster" - else: - raise 
Exception("Invalid cluster size {}", cluster_size) +cluster = ClickHouseCluster(__file__) +CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") -def create_and_start_cluster(cluster_size): - cluster = ClickHouseCluster(__file__) - config_dir = get_config_dir(cluster_size) - - quorum_size = get_quorum_size(cluster_size) - +def get_nodes(): nodes = [] - for i in range(cluster_size): + for i in range(CLUSTER_SIZE): nodes.append( cluster.add_instance( f"node{i+1}", main_configs=[ - f"{config_dir}/enable_keeper{i+1}.xml", - f"{config_dir}/use_keeper.xml", + f"configs/enable_keeper{i+1}.xml", + f"configs/use_keeper.xml", ], stay_alive=True, ) ) - for i in range(cluster_size, cluster_size + quorum_size): + for i in range(CLUSTER_SIZE, CLUSTER_SIZE + QUORUM_SIZE): nodes.append( cluster.add_instance(f"node{i+1}", main_configs=[], stay_alive=True) ) - cluster.start() - return cluster, nodes + return nodes -def wait_node(cluster, node): - for _ in range(100): - zk = None - try: - node.query("SELECT * FROM system.zookeeper WHERE path = '/'") - zk = get_fake_zk(cluster, node.name, timeout=30.0) - zk.create("/test", sequence=True) - print("node", node.name, "ready") - break - except Exception as ex: - time.sleep(0.2) - print("Waiting until", node.name, "will be ready, exception", ex) - finally: - if zk: - zk.stop() - zk.close() - else: - raise Exception("Can't wait node", node.name, "to become ready") +nodes = get_nodes() -def wait_nodes(cluster, nodes): - for node in nodes: - wait_node(cluster, node) +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + finally: + cluster.shutdown() -def get_fake_zk(cluster, nodename, timeout=30.0): +def get_fake_zk(nodename, timeout=30.0): _fake_zk_instance = KazooClient( hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout ) @@ -83,7 +58,7 @@ def get_fake_zk(cluster, nodename, timeout=30.0): return _fake_zk_instance -def get_keeper_socket(cluster, node_name): +def get_keeper_socket(node_name): hosts = cluster.get_instance_ip(node_name) client = socket.socket() client.settimeout(10) @@ -91,10 +66,10 @@ def get_keeper_socket(cluster, node_name): return client -def send_4lw_cmd(cluster, node_name, cmd="ruok"): +def send_4lw_cmd(node_name, cmd="ruok"): client = None try: - client = get_keeper_socket(cluster, node_name) + client = get_keeper_socket(node_name) client.send(cmd.encode()) data = client.recv(100_000) data = data.decode() @@ -104,11 +79,16 @@ def send_4lw_cmd(cluster, node_name, cmd="ruok"): client.close() -def wait_until_connected(cluster, node_name): - while send_4lw_cmd(cluster, node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG: +def wait_until_connected(node_name): + while send_4lw_cmd(node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG: time.sleep(0.1) +def wait_nodes(nodes): + for node in nodes: + wait_until_connected(node.name) + + def wait_and_assert_data(zk, path, data): while zk.exists(path) is None: time.sleep(0.1) @@ -123,19 +103,16 @@ def close_zk(zk): NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests" -@pytest.mark.parametrize("cluster_size", [3, 5]) -def test_cluster_recovery(cluster_size): - cluster, nodes = create_and_start_cluster(cluster_size) - quorum_size = get_quorum_size(cluster_size) +def test_cluster_recovery(started_cluster): node_zks = [] try: # initial cluster of `cluster_size` nodes - for node in nodes[cluster_size:]: + for node in nodes[CLUSTER_SIZE:]: node.stop_clickhouse() - wait_nodes(cluster, 
nodes[:cluster_size]) + wait_nodes(nodes[:CLUSTER_SIZE]) - node_zks = [get_fake_zk(cluster, node.name) for node in nodes[:cluster_size]] + node_zks = [get_fake_zk(node.name) for node in nodes[:CLUSTER_SIZE]] data_in_cluster = [] @@ -157,80 +134,70 @@ def test_cluster_recovery(cluster_size): add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra") - for node_zk in node_zks[2:cluster_size]: + for node_zk in node_zks[2:CLUSTER_SIZE]: wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra") nodes[0].start_clickhouse() - wait_node(cluster, nodes[0]) - node_zks[0] = get_fake_zk(cluster, nodes[0].name) + wait_until_connected(nodes[0].name) + node_zks[0] = get_fake_zk(nodes[0].name) wait_and_assert_data(node_zks[0], "/test_force_recovery_extra", "somedataextra") # stop last quorum size nodes - nodes_left = cluster_size - quorum_size - for node_zk in node_zks[nodes_left:cluster_size]: + nodes_left = CLUSTER_SIZE - QUORUM_SIZE + for node_zk in node_zks[nodes_left:CLUSTER_SIZE]: close_zk(node_zk) node_zks = node_zks[:nodes_left] - for node in nodes[nodes_left:cluster_size]: + for node in nodes[nodes_left:CLUSTER_SIZE]: node.stop_clickhouse() # wait for node1 to lose quorum - while ( - send_4lw_cmd(cluster, nodes[0].name, "mntr") - != NOT_SERVING_REQUESTS_ERROR_MSG - ): + while send_4lw_cmd(nodes[0].name, "mntr") != NOT_SERVING_REQUESTS_ERROR_MSG: time.sleep(0.2) nodes[0].copy_file_to_container( - os.path.join( - BASE_DIR, get_config_dir(cluster_size), "recovered_keeper1.xml" - ), + os.path.join(CONFIG_DIR, "recovered_keeper1.xml"), "/etc/clickhouse-server/config.d/enable_keeper1.xml", ) nodes[0].query("SYSTEM RELOAD CONFIG") - assert ( - send_4lw_cmd(cluster, nodes[0].name, "mntr") - == NOT_SERVING_REQUESTS_ERROR_MSG - ) - send_4lw_cmd(cluster, nodes[0].name, "rcvr") - assert ( - send_4lw_cmd(cluster, nodes[0].name, "mntr") - == NOT_SERVING_REQUESTS_ERROR_MSG - ) + assert send_4lw_cmd(nodes[0].name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG + send_4lw_cmd(nodes[0].name, "rcvr") + assert send_4lw_cmd(nodes[0].name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG # add one node to restore the quorum - nodes[cluster_size].copy_file_to_container( + nodes[CLUSTER_SIZE].copy_file_to_container( os.path.join( - BASE_DIR, - get_config_dir(cluster_size), - f"enable_keeper{cluster_size+1}.xml", + CONFIG_DIR, + f"enable_keeper{CLUSTER_SIZE+1}.xml", ), - f"/etc/clickhouse-server/config.d/enable_keeper{cluster_size+1}.xml", + f"/etc/clickhouse-server/config.d/enable_keeper{CLUSTER_SIZE+1}.xml", ) - nodes[cluster_size].start_clickhouse() - wait_until_connected(cluster, nodes[cluster_size].name) + nodes[CLUSTER_SIZE].start_clickhouse() + wait_until_connected(nodes[CLUSTER_SIZE].name) # node1 should have quorum now and accept requests - wait_until_connected(cluster, nodes[0].name) + wait_until_connected(nodes[0].name) - node_zks.append(get_fake_zk(cluster, nodes[cluster_size].name)) + node_zks.append(get_fake_zk(nodes[CLUSTER_SIZE].name)) # add rest of the nodes - for i in range(cluster_size + 1, len(nodes)): + for i in range(CLUSTER_SIZE + 1, len(nodes)): node = nodes[i] node.copy_file_to_container( - os.path.join( - BASE_DIR, get_config_dir(cluster_size), f"enable_keeper{i+1}.xml" - ), + os.path.join(CONFIG_DIR, f"enable_keeper{i+1}.xml"), f"/etc/clickhouse-server/config.d/enable_keeper{i+1}.xml", ) node.start_clickhouse() - wait_until_connected(cluster, node.name) - node_zks.append(get_fake_zk(cluster, node.name)) + wait_until_connected(node.name) + node_zks.append(get_fake_zk(node.name)) 
+
+        # refresh old zk sessions
+        for i, node in enumerate(nodes[:nodes_left]):
+            node_zks[i] = get_fake_zk(node.name)

         for zk in node_zks:
             assert_all_data(zk)
@@ -242,7 +209,7 @@ def test_cluster_recovery(started_cluster):
         wait_and_assert_data(node_zks[-1], "/test_force_recovery_last", "somedatalast")

         nodes[0].start_clickhouse()
-        node_zks[0] = get_fake_zk(cluster, nodes[0].name)
+        node_zks[0] = get_fake_zk(nodes[0].name)
         for zk in node_zks[:nodes_left]:
             assert_all_data(zk)
     finally:
@@ -251,5 +218,3 @@ def test_cluster_recovery(started_cluster):
                 close_zk(zk_conn)
             except:
                 pass
-
-    cluster.shutdown()

From 6c95a2c195d452dec06c33a88b0e9a3b74b74d7f Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Wed, 20 Apr 2022 10:35:35 +0000
Subject: [PATCH 40/94] Wait for node1 to connect

---
 tests/integration/test_keeper_force_recovery/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py
index 181c4189443..e4f42ba21f6 100644
--- a/tests/integration/test_keeper_force_recovery/test.py
+++ b/tests/integration/test_keeper_force_recovery/test.py
@@ -209,6 +209,7 @@ def test_cluster_recovery(started_cluster):
         wait_and_assert_data(node_zks[-1], "/test_force_recovery_last", "somedatalast")

         nodes[0].start_clickhouse()
+        wait_until_connected(nodes[0].name)
         node_zks[0] = get_fake_zk(nodes[0].name)
         for zk in node_zks[:nodes_left]:
             assert_all_data(zk)

From f261291fa676745558feda529c4851ef43a43283 Mon Sep 17 00:00:00 2001
From: xinhuitian
Date: Thu, 21 Apr 2022 14:10:29 +0800
Subject: [PATCH 41/94] fix benchmark json report info

---
 programs/benchmark/Benchmark.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp
index 60e5ca92f77..3170bde4747 100644
--- a/programs/benchmark/Benchmark.cpp
+++ b/programs/benchmark/Benchmark.cpp
@@ -548,11 +548,13 @@ private:
             json_out << double_quote << connections[i]->getDescription() << ": {\n";
             json_out << double_quote << "statistics" << ": {\n";

-            print_key_value("QPS", info->queries / info->work_time);
-            print_key_value("RPS", info->read_rows / info->work_time);
-            print_key_value("MiBPS", info->read_bytes / info->work_time);
-            print_key_value("RPS_result", info->result_rows / info->work_time);
-            print_key_value("MiBPS_result", info->result_bytes / info->work_time);
+            double seconds = info->work_time / concurrency;
+
+            print_key_value("QPS", info->queries.load() / seconds);
+            print_key_value("RPS", info->read_rows / seconds);
+            print_key_value("MiBPS", info->read_bytes / seconds);
+            print_key_value("RPS_result", info->result_rows / seconds);
+            print_key_value("MiBPS_result", info->result_bytes / seconds);
             print_key_value("num_queries", info->queries.load());
             print_key_value("num_errors", info->errors, false);
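For context on the fix above: `work_time` in this report accumulates busy time across all worker threads, so per-second rates have to divide by `concurrency` to approximate wall-clock time. A minimal sketch with hypothetical numbers (not part of the patch):

```python
# Hypothetical numbers illustrating the rate fix in PATCH 41 (and the MiB
# divisor added later in PATCH 45).
concurrency = 4        # worker threads
work_time = 40.0       # busy seconds summed over all 4 workers
queries = 1000
read_bytes = 524_288_000

seconds = work_time / concurrency        # ~10 s of wall-clock time
print(queries / seconds)                 # QPS = 100.0 (the old code reported 25.0)
print(read_bytes / seconds / 1048576)    # MiBPS = 50.0 (bytes converted to MiB)
```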
From e2866b0541025b04df6dbc05b8379a36f77d2790 Mon Sep 17 00:00:00 2001
From: Vasily Nemkov
Date: Thu, 21 Apr 2022 12:09:10 +0300
Subject: [PATCH 42/94] Fixed missing enum values for ClientInfo::Interface

---
 src/Interpreters/SessionLog.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/Interpreters/SessionLog.cpp b/src/Interpreters/SessionLog.cpp
index a0c29c07d38..1744c582b28 100644
--- a/src/Interpreters/SessionLog.cpp
+++ b/src/Interpreters/SessionLog.cpp
@@ -97,6 +97,7 @@ NamesAndTypesList SessionLogElement::getNamesAndTypes()
             AUTH_TYPE_NAME_AND_VALUE(AuthType::KERBEROS)
         });
 #undef AUTH_TYPE_NAME_AND_VALUE
+    static_assert(static_cast<int>(AuthenticationType::MAX) == 7);

     auto interface_type_column = std::make_shared<DataTypeEnum8>(
         DataTypeEnum8::Values
@@ -105,8 +106,11 @@ NamesAndTypesList SessionLogElement::getNamesAndTypes()
             {"HTTP", static_cast<Int8>(Interface::HTTP)},
             {"gRPC", static_cast<Int8>(Interface::GRPC)},
             {"MySQL", static_cast<Int8>(Interface::MYSQL)},
-            {"PostgreSQL", static_cast<Int8>(Interface::POSTGRESQL)}
+            {"PostgreSQL", static_cast<Int8>(Interface::POSTGRESQL)},
+            {"Local", static_cast<Int8>(Interface::LOCAL)},
+            {"TCP_Interserver", static_cast<Int8>(Interface::TCP_INTERSERVER)}
         });
+    static_assert(magic_enum::enum_count<ClientInfo::Interface>() == 7);

     auto lc_string_datatype = std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>());

From b107cc989aa7f12ea0ce14c39cbadab60ddea270 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Thu, 21 Apr 2022 12:36:44 +0000
Subject: [PATCH 43/94] Update docs with argument option

---
 docs/en/operations/clickhouse-keeper.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md
index 3365bba3fbe..58f4a0fc16e 100644
--- a/docs/en/operations/clickhouse-keeper.md
+++ b/docs/en/operations/clickhouse-keeper.md
@@ -340,7 +340,7 @@ After making sure that the above things are true, you need to do following:
 1. Pick a single Keeper node to be your new leader.
 2. Before doing anything else, make a backup of the `log_storage_path` folder of the picked node
 3. Reconfigure the cluster on all of the nodes you want to use
-4. Send the four letter command `rcvr` to the node you picked which will move the node to the recovery mode
+4. Send the four letter command `rcvr` to the node you picked, which will move the node into recovery mode, OR stop the Keeper instance on the picked node and start it again with the `--force-recovery` argument
 5. One by one, start Keeper instances on the new nodes making sure that `mntr` returns `follower` for the `zk_server_state` before starting the next one
 6. While in the recovery mode, the leader node will return error message for `mntr` command until it achieves quorum with the new nodes and refuse any requests from the client and the followers
 7. After quorum is achieved, the leader node will return to the normal mode of operation, accepting all the requests using Raft - verify with `mntr` which should return `leader` for the `zk_server_state`
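For reference, step 4 can be driven the same way the integration tests in this series send four-letter commands: a minimal sketch over a raw socket, assuming a Keeper node on the default test port 9181.

```python
# Sketch: trigger recovery by sending the `rcvr` four-letter command,
# mirroring send_4lw_cmd() from test_keeper_force_recovery/test.py.
# Host and port are assumptions for illustration.
import socket

def send_4lw_cmd(host="localhost", port=9181, cmd="ruok"):
    client = socket.socket()
    client.settimeout(10)
    client.connect((host, port))
    client.send(cmd.encode())
    data = client.recv(100_000).decode()
    client.close()
    return data

send_4lw_cmd(cmd="rcvr")
# `mntr` keeps returning the not-serving error until the new quorum forms (step 6):
print(send_4lw_cmd(cmd="mntr"))
```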
From 85071f8e32ee4f035c0374e5b899aa434a907974 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Thu, 21 Apr 2022 12:37:01 +0000
Subject: [PATCH 44/94] update NuRaft

---
 contrib/NuRaft | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/NuRaft b/contrib/NuRaft
index b6050a4e349..aed15187ef0 160000
--- a/contrib/NuRaft
+++ b/contrib/NuRaft
@@ -1 +1 @@
-Subproject commit b6050a4e34995f7023912d8e9e03a061f5d4235c
+Subproject commit aed15187ef0f051f5b7ea5628176824e91f6ecb1

From 164647cc05cd0cd1568b03e4ec2c2cdd5b4c89ed Mon Sep 17 00:00:00 2001
From: Tian Xinhui
Date: Fri, 22 Apr 2022 18:31:07 +0800
Subject: [PATCH 45/94] Update programs/benchmark/Benchmark.cpp

Co-authored-by: Vladimir C
---
 programs/benchmark/Benchmark.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp
index 3170bde4747..71429cbc6a7 100644
--- a/programs/benchmark/Benchmark.cpp
+++ b/programs/benchmark/Benchmark.cpp
@@ -552,9 +552,9 @@ private:

             print_key_value("QPS", info->queries.load() / seconds);
             print_key_value("RPS", info->read_rows / seconds);
-            print_key_value("MiBPS", info->read_bytes / seconds);
+            print_key_value("MiBPS", info->read_bytes / seconds / 1048576);
             print_key_value("RPS_result", info->result_rows / seconds);
-            print_key_value("MiBPS_result", info->result_bytes / seconds);
+            print_key_value("MiBPS_result", info->result_bytes / seconds / 1048576);
             print_key_value("num_queries", info->queries.load());
             print_key_value("num_errors", info->errors, false);

From 0d2393a687f86a2f0c879f4724164f53899b7d7a Mon Sep 17 00:00:00 2001
From: xinhuitian
Date: Sun, 24 Apr 2022 10:25:52 +0800
Subject: [PATCH 46/94] restart ci

From d66951b0676d027d638b73655e58f8098c915e61 Mon Sep 17 00:00:00 2001
From: rfraposa
Date: Sun, 24 Apr 2022 20:27:22 -0600
Subject: [PATCH 47/94] /zh version of clickhouse-keeper.md

---
 docs/zh/operations/clickhouse-keeper.md | 326 +++++++++++++++++++++++-
 1 file changed, 325 insertions(+), 1 deletion(-)
 mode change 120000 => 100644 docs/zh/operations/clickhouse-keeper.md

diff --git a/docs/zh/operations/clickhouse-keeper.md b/docs/zh/operations/clickhouse-keeper.md
deleted file mode 120000
index 528ba82fea4..00000000000
--- a/docs/zh/operations/clickhouse-keeper.md
+++ /dev/null
@@ -1 +0,0 @@
-../../en/operations/clickhouse-keeper.md
\ No newline at end of file
diff --git a/docs/zh/operations/clickhouse-keeper.md b/docs/zh/operations/clickhouse-keeper.md
new file mode 100644
index 00000000000..a18930f32ab
--- /dev/null
+++ b/docs/zh/operations/clickhouse-keeper.md
@@ -0,0 +1,325 @@
+---
+sidebar_position: 66
+sidebar_label: ClickHouse Keeper
+---
+
+# [预发生产] ClickHouse Keeper {#clickHouse-keeper}
+
+ClickHouse 服务使用 [ZooKeeper](https://zookeeper.apache.org/) 协调系统来执行 [副本](../engines/table-engines/mergetree-family/replication.md) 复制和 [分布式DDL](../sql-reference/distributed-ddl.md) 查询. ClickHouse Keeper 与 ZooKeeper 相互兼容, 可互相替代.
+
+:::danger "警告"
+这个功能当前还在预发生产阶段. 我们目前仅在内部生产环境和测试 CI 中部分使用.
+:::
+
+## 实现细节 {#implementation-details}
+
+ZooKeeper 是最早的著名开源协调系统之一. 它通过 Java 语言实现, 有一个简单而强大的数据模型. ZooKeeper 的协调算法叫做 ZAB (ZooKeeper Atomic Broadcast). ZAB 不保证读取的线性一致性, 因为每个 ZooKeeper 节点都是在本地处理读请求的.
+ClickHouse Keeper 是用 C++ 编写的; 与 ZooKeeper 不同, ClickHouse Keeper 使用 [RAFT 算法](https://raft.github.io/) 的一个 [实现](https://github.com/eBay/NuRaft). 该算法支持线性一致的读和写, 并且已经有几种不同语言的开源实现.
+
+ClickHouse Keeper 默认提供与 ZooKeeper 相同的保证 (线性写, 非线性读). ClickHouse Keeper 有一个兼容的客户端-服务端协议, 所以任何标准的 ZooKeeper 客户端都可以用来与 ClickHouse Keeper 交互. 快照和日志的格式与 ZooKeeper 不兼容, 但可以通过 `clickhouse-keeper-converter` 把 ZooKeeper 数据转换为 ClickHouse Keeper 快照. ClickHouse Keeper 的服务器间协议与 ZooKeeper 也不兼容, 所以无法部署 ZooKeeper / ClickHouse Keeper 混合集群.
+
+ClickHouse Keeper 支持访问控制列表 (ACL) 的方式和 [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) 一样. ClickHouse Keeper 支持相同的权限集合, 并且有完全相同的内置方案: `world`, `auth`, `digest`, `host` 和 `ip`. 权限校验使用 `用户名:密码` 配对的方式, 密码通过 Base64 编码.
+
+:::info "注意"
+不支持外部集成
+:::
+
+## 配置 {#configuration}
+
+ClickHouse Keeper 完全可以作为 ZooKeeper 的独立替代品, 或者作为 ClickHouse server 服务的内部组件, 这两种方式下的配置几乎相同, 都使用 `.xml` 格式. ClickHouse Keeper 配置的标签是 `<keeper_server>`. Keeper 配置有以下这些参数:
+
+- `tcp_port` — 客户端连接的端口 (ZooKeeper 默认是 `2181`).
+- `tcp_port_secure` — client 和 keeper-server 之间 SSL 连接的安全端口.
+- `server_id` — 唯一的服务器 ID, ClickHouse Keeper 集群的每个参与组件都必须有一个唯一的编号 (1, 2, 3, 等等).
+- `log_storage_path` — 协调日志的路径, 最好存放在不繁忙的机器上 (和 ZooKeeper 一样).
+- `snapshot_storage_path` — 协调快照的路径.
+
+其他常见参数继承自 ClickHouse server 的配置 (`listen_host`, `logger`, 等等).
+
+内部协调配置位于 `<keeper_server>.<coordination_settings>` 部分:
+
+- `operation_timeout_ms` — 单个客户端操作的超时时间(ms)(默认值:10000)。
+- `min_session_timeout_ms` — 客户端会话的最小超时时间(ms)(默认值:10000)。
+- `session_timeout_ms` — 客户端会话最大超时时间(ms)(默认100000)。
+- `dead_session_check_period_ms` — ClickHouse Keeper检查死会话并删除它们的频率(毫秒)(默认值:500)。
+- `heart_beat_interval_ms` — ClickHouse Keeper的leader发送心跳频率(毫秒)(默认为500)。
+- `election_timeout_lower_bound_ms` — 如果follower在此间隔内没有收到leader的心跳,那么它可以启动leader选举(默认为1000)。
+- `election_timeout_upper_bound_ms` — 如果follower在此间隔内没有收到leader的心跳,那么它必须启动leader选举(默认为2000)。
+- `rotate_log_storage_interval` — 单个文件中存储的日志记录数量(默认100000条)。
+- `reserved_log_items` — 在压缩之前需要存储多少协调日志记录(默认100000)。
+- `snapshot_distance` — ClickHouse Keeper创建新快照的频率(以日志记录的数量为单位)(默认100000)。
+- `snapshots_to_keep` — 保留多少个快照(默认值:3)。
+- `stale_log_gap` — 当leader认为follower过时并发送快照给follower而不是日志时的阈值(默认值:10000)。
+- `fresh_log_gap` — 当节点变成新鲜时的间隔(默认值:200)。
+- `max_requests_batch_size` — 发送到RAFT之前的最大批量请求数(默认值:100)。
+- `force_sync` — 在每次写入协调日志时是否调用 `fsync`(默认值:true)。
+- `quorum_reads` — 通过整个RAFT共识以类似的速度执行读请求和写请求(默认值:false)。
+- `raft_logs_level` — 关于协调的文本日志级别 (trace, debug, 等等) (默认: system default)。
+- `auto_forwarding` — 允许将follower的请求转发到leader (默认: true)。
+- `shutdown_timeout` — 等待内部连接完成并关闭(ms)(默认值:5000)。
+- `startup_timeout` — 如果服务器在指定的超时时间内没有连接到其他仲裁参与者,它将终止(ms)(默认值:30000)。
+- `four_letter_word_white_list` — 四字母命令的白名单列表 (默认: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro")。
+
+仲裁配置位于 `<keeper_server>.<raft_configuration>` 部分, 包含各个参与服务器的描述。
+
+整个仲裁层面的唯一参数是 `secure`, 它为仲裁参与者之间的通信启用加密连接。如果节点之间的内部通信需要 SSL 连接, 则该参数可以设置为 `true`, 否则不指定。
+
+每个 `<server>` 的主要参数是:
+
+- `id` — 仲裁中的服务器标识符。
+- `hostname` — 放置该服务器的主机名。
+- `port` — 服务器监听连接的端口。
+
+
+在 [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) 中可以找到带有 `test_keeper_` 前缀的 3 个节点的仲裁配置示例.
+服务器 #1 的配置举例如下:
+
+```xml
+<keeper_server>
+    <tcp_port>2181</tcp_port>
+    <server_id>1</server_id>
+    <log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
+    <snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
+
+    <coordination_settings>
+        <operation_timeout_ms>10000</operation_timeout_ms>
+        <session_timeout_ms>30000</session_timeout_ms>
+        <raft_logs_level>trace</raft_logs_level>
+    </coordination_settings>
+
+    <raft_configuration>
+        <server>
+            <id>1</id>
+            <hostname>zoo1</hostname>
+            <port>9444</port>
+        </server>
+        <server>
+            <id>2</id>
+            <hostname>zoo2</hostname>
+            <port>9444</port>
+        </server>
+        <server>
+            <id>3</id>
+            <hostname>zoo3</hostname>
+            <port>9444</port>
+        </server>
+    </raft_configuration>
+</keeper_server>
+```
+
+## 如何运行 {#how-to-run}
+
+ClickHouse Keeper 被捆绑在 ClickHouse 服务器安装包中, 只需添加 `<keeper_server>` 配置, 并像往常一样启动 ClickHouse 服务器。如果你想运行独立的 ClickHouse Keeper, 可以用类似的方式启动它:
+
+```bash
+clickhouse-keeper --config /etc/your_path_to_config/config.xml
+```
+
+如果你没有符号链接 (`clickhouse-keeper`), 可以创建它, 或将 `keeper` 指定为参数:
+
+```bash
+clickhouse keeper --config /etc/your_path_to_config/config.xml
+```
+
+## 四字母命令 {#four-letter-word-commands}
+
+ClickHouse Keeper 还提供了与 ZooKeeper 几乎相同的四字母命令 (4lw)。每个命令由 4 个字母组成, 如 `mntr`、`stat` 等。还有一些更有趣的命令: `stat` 给出服务器和连接客户端的一般信息, 而 `srvr` 和 `cons` 分别给出服务器和连接的详细信息。
+
+4lw 命令有一个白名单配置 `four_letter_word_white_list`, 它的默认值为 "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro"。
+
+您可以通过 telnet 或 nc 在客户端端口向 ClickHouse Keeper 发出命令。
+
+```
+echo mntr | nc localhost 9181
+```
+
+下面是各个 4lw 命令的详细说明:
+
+- `ruok`: 测试服务器运行时是否处于无错误状态。如果服务器正在运行, 它将用 `imok` 响应, 否则将完全不响应。`imok` 的响应并不一定表明服务器已加入仲裁, 只是表明服务器进程处于活动状态并绑定到指定的客户端端口。使用 `stat` 获取仲裁状态和客户端连接信息的详细信息。
+
+```
+imok
+```
+
+- `mntr`: 输出可用于监视集群运行状况的变量列表。
+
+```
+zk_version	v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
+zk_avg_latency	0
+zk_max_latency	0
+zk_min_latency	0
+zk_packets_received	68
+zk_packets_sent	68
+zk_num_alive_connections	1
+zk_outstanding_requests	0
+zk_server_state	leader
+zk_znode_count	4
+zk_watch_count	1
+zk_ephemerals_count	0
+zk_approximate_data_size	723
+zk_open_file_descriptor_count	310
+zk_max_file_descriptor_count	10240
+zk_followers	0
+zk_synced_followers	0
+```
+
+- `srvr`: 列出服务器的完整详细信息。
+
+```
+ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
+Latency min/avg/max: 0/0/0
+Received: 2
+Sent : 2
+Connections: 1
+Outstanding: 0
+Zxid: 34
+Mode: leader
+Node count: 4
+```
+
+- `stat`: 列出服务器和连接客户端的简要详细信息。
+
+```
+ClickHouse Keeper version: v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7
+Clients:
+ 192.168.1.1:52852(recved=0,sent=0)
+ 192.168.1.1:52042(recved=24,sent=48)
+Latency min/avg/max: 0/0/0
+Received: 4
+Sent : 4
+Connections: 1
+Outstanding: 0
+Zxid: 36
+Mode: leader
+Node count: 4
+```
+
+- `srst`: 重置服务器统计数据。该命令将影响 `srvr`、`mntr` 和 `stat` 的结果。
+
+```
+Server stats reset.
+``` + +- `conf`: 打印服务配置详细信息。 + +``` +server_id=1 +tcp_port=2181 +four_letter_word_white_list=* +log_storage_path=./coordination/logs +snapshot_storage_path=./coordination/snapshots +max_requests_batch_size=100 +session_timeout_ms=30000 +operation_timeout_ms=10000 +dead_session_check_period_ms=500 +heart_beat_interval_ms=500 +election_timeout_lower_bound_ms=1000 +election_timeout_upper_bound_ms=2000 +reserved_log_items=1000000000000000 +snapshot_distance=10000 +auto_forwarding=true +shutdown_timeout=5000 +startup_timeout=240000 +raft_logs_level=information +snapshots_to_keep=3 +rotate_log_storage_interval=100000 +stale_log_gap=10000 +fresh_log_gap=200 +max_requests_batch_size=100 +quorum_reads=false +force_sync=false +compress_logs=true +compress_snapshots_with_zstd_format=true +configuration_change_tries_count=20 +``` + +- `cons`: 列出所有连接到此服务器的客户端的完整连接/会话详细信息。包括接收/发送的包数、会话id、操作延迟、最后执行的操作等信息。 + +``` + 192.168.1.1:52163(recved=0,sent=0,sid=0xffffffffffffffff,lop=NA,est=1636454787393,to=30000,lzxid=0xffffffffffffffff,lresp=0,llat=0,minlat=0,avglat=0,maxlat=0) + 192.168.1.1:52042(recved=9,sent=18,sid=0x0000000000000001,lop=List,est=1636454739887,to=30000,lcxid=0x0000000000000005,lzxid=0x0000000000000005,lresp=1636454739892,llat=0,minlat=0,avglat=0,maxlat=0) +``` + +- `crst`: 重置所有连接的连接/会话统计信息。 + +``` +Connection stats reset. +``` + +- `envi`: 打印服务环境详细信息 + +``` +Environment: +clickhouse.keeper.version=v21.11.1.1-prestable-7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7 +host.name=ZBMAC-C02D4054M.local +os.name=Darwin +os.arch=x86_64 +os.version=19.6.0 +cpu.count=12 +user.name=root +user.home=/Users/JackyWoo/ +user.dir=/Users/JackyWoo/project/jd/clickhouse/cmake-build-debug/programs/ +user.tmp=/var/folders/b4/smbq5mfj7578f2jzwn602tt40000gn/T/ +``` + + +- `dirs`: 以字节为单位显示快照和日志文件的总大小 + +``` +snapshot_dir_size: 0 +log_dir_size: 3875 +``` + +- `isro`: 测试服务器是否以只读模式运行。如果处于只读模式,服务器将响应“ro”,如果不是只读模式,则响应“rw”。 + +``` +rw +``` + +- `wchs`: 列出服务器的监视的简要信息。 + +``` +1 connections watching 1 paths +Total watches:1 +``` + +- `wchc`: 按会话列出服务器的监视的详细信息。这将输出一个会话(连接)列表和相关的监视(路径)。注意,根据监视的数量,此操作可能会很昂贵(即影响服务器性能),请谨慎使用。 + +``` +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +- `wchp`: 按路径列出有关服务器的监视的详细信息。这将输出一个带有关联会话的路径(znode)列表。注意,根据监视的数量,此操作可能昂贵(即影响服务器性能),请谨慎使用。 + +``` +/clickhouse/task_queue/ddl + 0x0000000000000001 +``` + +- `dump`: 列出未完成的会话和临时节点。这只对领导者有效。 + +``` +Sessions dump (2): +0x0000000000000001 +0x0000000000000002 +Sessions with Ephemerals (1): +0x0000000000000001 + /clickhouse/task_queue/ddl +``` + +## [实现] 从ZooKeeper迁移 {#migration-from-zookeeper} + +从ZooKeeper无缝迁移到ClickHouse Keeper是不可能的,你必须停止你的ZooKeeper集群,转换数据并启动ClickHouse Keeper。' ClickHouse - Keeper -converter '工具允许将ZooKeeper日志和快照转换为ClickHouse Keeper快照。它只适用于ZooKeeper 大于 3.4。迁移的步骤: + +1. 停掉ZooKeeper节点. + +2. 可选,但建议:找到ZooKeeper leader节点,重新启停。它会强制ZooKeeper创建一致的快照。 + +3. 在leader节点运行`clickhouse-keeper-converter`, 如下: + +```bash +clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --zookeeper-snapshots-dir /var/lib/zookeeper/version-2 --output-dir /path/to/clickhouse/keeper/snapshots +``` + +4. 将快照复制到配置了“keeper”的ClickHouse服务器节点,或者启动ClickHouse keeper而不是ZooKeeper。快照必须在所有节点上持久保存,否则,空节点可能更快,其中一个节点可能成为leader. + +[Original article](https://clickhouse.com/docs/en/operations/clickhouse-keeper/) \ No newline at end of file From 6743eb6befc01f90283ea9b3c79dca7e8bf28716 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sun, 10 Apr 2022 17:18:44 +0200 Subject: [PATCH 48/94] Correct error messages shown by File & Disk backup engines. 
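The reworded messages quote configuration keys with single quotes and pass user-supplied
names through quoteString(). A minimal sketch of how this surfaces in an integration
test, in the style of tests/integration/test_backup_restore_new/test.py (the instance
fixture and the disk name 'unlisted_disk' are assumptions for illustration):

```python
# Sketch: assert on the reworded error text from an integration test.
# `instance` is a ClickHouseInstance created via cluster.add_instance(...),
# and 'unlisted_disk' is a hypothetical disk absent from backups.allowed_disk.
error = instance.query_and_get_error(
    "BACKUP TABLE test.table TO Disk('unlisted_disk', '1.zip')"
)
assert "see the 'backups.allowed_disk' configuration parameter" in error
```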
--- src/Backups/registerBackupEnginesFileAndDisk.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp index fa1786c6350..c4e51797f9e 100644 --- a/src/Backups/registerBackupEnginesFileAndDisk.cpp +++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp @@ -29,14 +29,14 @@ namespace { String key = "backups.allowed_disk"; if (!config.has(key)) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "The \"backups.allowed_disk\" configuration parameter is not set, cannot use Disk() backup engine"); + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "The 'backups.allowed_disk' configuration parameter is not set, cannot use 'Disk' backup engine"); size_t counter = 0; while (config.getString(key) != disk_name) { key = "backups.allowed_disk[" + std::to_string(++counter) + "]"; if (!config.has(key)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk {} is not allowed for backups, see the \"backups.allowed_disk\" configuration parameter", disk_name); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk {} is not allowed for backups, see the 'backups.allowed_disk' configuration parameter", quoteString(disk_name)); } } @@ -49,7 +49,7 @@ namespace bool path_ok = path.empty() || (path.is_relative() && (*path.begin() != "..")); if (!path_ok) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path {} to backup must be inside the specified disk {}", quoteString(path.c_str()), disk_name); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Path {} to backup must be inside the specified disk {}", quoteString(path.c_str()), quoteString(disk_name)); } /// Checks that a path specified as parameters of File() is valid. @@ -62,7 +62,7 @@ namespace String key = "backups.allowed_path"; if (!config.has(key)) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, - "The \"backups.allowed_path\" configuration parameter is not set, cannot use File() backup engine"); + "The 'backups.allowed_path' configuration parameter is not set, cannot use 'File' backup engine"); if (path.is_relative()) { @@ -86,7 +86,7 @@ namespace key = "backups.allowed_path[" + std::to_string(++counter) + "]"; if (!config.has(key)) throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Path {} is not allowed for backups, see the \"backups.allowed_path\" configuration parameter", + "Path {} is not allowed for backups, see the 'backups.allowed_path' configuration parameter", quoteString(path.c_str())); } } @@ -104,7 +104,7 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory) { throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Backup engine '{}' requires first argument to be a string", + "Backup engine '{}' requires its first argument to be a string", engine_name); } From 88b0cf3ca016063cac7cb00fc0f2dcfc178b59d6 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 12 Apr 2022 17:09:09 +0200 Subject: [PATCH 49/94] Fix TemporaryFileOnDisk --- src/Disks/TemporaryFileOnDisk.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Disks/TemporaryFileOnDisk.cpp b/src/Disks/TemporaryFileOnDisk.cpp index 90e51dcc369..6251bff23a1 100644 --- a/src/Disks/TemporaryFileOnDisk.cpp +++ b/src/Disks/TemporaryFileOnDisk.cpp @@ -18,10 +18,14 @@ TemporaryFileOnDisk::TemporaryFileOnDisk(const DiskPtr & disk_, const String & p TemporaryFileOnDisk::~TemporaryFileOnDisk() { -#if 1 - if (disk && !filepath.empty()) - disk->removeRecursive(filepath); -#endif + try + { + if (disk && !filepath.empty() && disk->exists(filepath)) 
+ disk->removeRecursive(filepath); + } + catch (...) + { + } } } From d20b3d78c5b8faf673076165faf4e2c8e3676b2c Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 12 Apr 2022 22:38:21 +0200 Subject: [PATCH 50/94] Rename some restore settings. --- src/Backups/RestoreSettings.cpp | 42 +++++++++++++++---- src/Backups/RestoreSettings.h | 37 +++++++++++----- src/Backups/RestoreUtils.cpp | 25 ++++++----- src/Common/ErrorCodes.cpp | 1 + .../test_backup_restore_new/test.py | 4 +- 5 files changed, 78 insertions(+), 31 deletions(-) diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index 485650e39f0..b1d3c157e13 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -1,8 +1,10 @@ #include #include +#include #include #include #include +#include namespace DB @@ -10,6 +12,30 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_SETTING; + extern const int CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE; +} + +namespace +{ + RestoreTableCreationMode parseRestoreTableCreationMode(const Field & field) + { + if (field.getType() == Field::Types::String) + { + String str = field.get(); + if (str == "1" || boost::iequals(str, "true")) + return RestoreTableCreationMode::kCreate; + if (str == "0" || boost::iequals(str, "false")) + return RestoreTableCreationMode::kMustExist; + if (boost::iequals(str, "if not exists")) + return RestoreTableCreationMode::kCreateIfNotExists; + throw Exception("Cannot parse creation mode from string '" + str + "'", + ErrorCodes::CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE); + } + if (applyVisitor(FieldVisitorConvertToNumber(), field)) + return RestoreTableCreationMode::kCreate; + else + return RestoreTableCreationMode::kMustExist; + } } RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query) @@ -28,14 +54,14 @@ RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query) res.password = SettingFieldString{setting.value}; else if (setting.name == "structure_only") res.structure_only = SettingFieldBool{setting.value}; - else if (setting.name == "throw_if_database_exists") - res.throw_if_database_exists = SettingFieldBool{setting.value}; - else if (setting.name == "throw_if_table_exists") - res.throw_if_table_exists = SettingFieldBool{setting.value}; - else if (setting.name == "throw_if_database_def_differs") - res.throw_if_database_def_differs = SettingFieldBool{setting.value}; - else if (setting.name == "throw_if_table_def_differs") - res.throw_if_table_def_differs = SettingFieldBool{setting.value}; + else if (setting.name == "create_table") + res.create_table = parseRestoreTableCreationMode(setting.value); + else if (setting.name == "create_database") + res.create_database = parseRestoreTableCreationMode(setting.value); + else if (setting.name == "allow_different_table_def") + res.allow_different_table_def = SettingFieldBool{setting.value}; + else if (setting.name == "allow_different_database_def") + res.allow_different_database_def = SettingFieldBool{setting.value}; else throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); } diff --git a/src/Backups/RestoreSettings.h b/src/Backups/RestoreSettings.h index b129224943b..9f4951862c7 100644 --- a/src/Backups/RestoreSettings.h +++ b/src/Backups/RestoreSettings.h @@ -12,6 +12,21 @@ struct StorageRestoreSettings { }; +/// How the RESTORE command will handle table/database existence. 
+enum class RestoreTableCreationMode +{ + /// RESTORE TABLE always tries to create a table and it throws an exception if the table already exists. + kCreate, + + /// RESTORE TABLE never tries to create a table and it throws an exception if the table doesn't exist. + kMustExist, + + /// RESTORE TABLE tries to create a table if it doesn't exist. + kCreateIfNotExists, +}; + +using RestoreDatabaseCreationMode = RestoreTableCreationMode; + /// Settings specified in the "SETTINGS" clause of a RESTORE query. struct RestoreSettings : public StorageRestoreSettings { @@ -27,19 +42,21 @@ struct RestoreSettings : public StorageRestoreSettings /// without the data of tables. bool structure_only = false; - /// Whether RESTORE DATABASE must throw an exception if a destination database already exists. - bool throw_if_database_exists = true; + /// How RESTORE command should work if a table to restore already exists. + RestoreTableCreationMode create_table = RestoreTableCreationMode::kCreateIfNotExists; - /// Whether RESTORE TABLE must throw an exception if a destination table already exists. - bool throw_if_table_exists = true; + /// How RESTORE command should work if a database to restore already exists. + RestoreDatabaseCreationMode create_database = RestoreDatabaseCreationMode::kCreateIfNotExists; - /// Whether RESTORE DATABASE must throw an exception if a destination database has - /// a different definition comparing with the definition read from backup. - bool throw_if_database_def_differs = true; + /// Normally RESTORE command throws an exception if a destination table exists but has a different definition + /// (i.e. create query) comparing with its definition extracted from backup. + /// Set `allow_different_table_def` to true to skip this check. + bool allow_different_table_def = false; - /// Whether RESTORE TABLE must throw an exception if a destination table has - /// a different definition comparing with the definition read from backup. - bool throw_if_table_def_differs = true; + /// Normally RESTORE command throws an exception if a destination database exists but has a different definition + /// (i.e. create query) comparing with its definition extracted from backup. + /// Set `allow_different_database_def` to true to skip this check. + bool allow_different_database_def = false; static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query); }; diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index 8073b6d0818..f439860e61c 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -70,9 +70,12 @@ namespace private: void createDatabase() { - /// We need to call clone() for `create_query` because the interpreter can decide - /// to change a passed AST a little bit. 
- InterpreterCreateQuery create_interpreter{create_query->clone(), context}; + if (restore_settings->create_database == RestoreDatabaseCreationMode::kMustExist) + return; + + auto cloned_create_query = typeid_cast>(create_query->clone()); + cloned_create_query->if_not_exists = (restore_settings->create_database == RestoreDatabaseCreationMode::kCreateIfNotExists); + InterpreterCreateQuery create_interpreter{cloned_create_query, context}; create_interpreter.execute(); } @@ -92,7 +95,7 @@ namespace void checkDatabaseCreateQuery() { - if (ignore_if_database_def_differs || !restore_settings->throw_if_database_def_differs) + if (ignore_if_database_def_differs || restore_settings->allow_different_database_def) return; getDatabaseCreateQuery(); @@ -153,9 +156,12 @@ namespace private: void createStorage() { - /// We need to call clone() for `create_query` because the interpreter can decide - /// to change a passed AST a little bit. - InterpreterCreateQuery create_interpreter{create_query->clone(), context}; + if (restore_settings->create_table == RestoreTableCreationMode::kMustExist) + return; + + auto cloned_create_query = typeid_cast>(create_query->clone()); + cloned_create_query->if_not_exists = (restore_settings->create_table == RestoreTableCreationMode::kCreateIfNotExists); + InterpreterCreateQuery create_interpreter{cloned_create_query, context}; create_interpreter.execute(); } @@ -178,7 +184,7 @@ namespace void checkStorageCreateQuery() { - if (!restore_settings->throw_if_table_def_differs) + if (restore_settings->allow_different_table_def) return; getStorageCreateQuery(); @@ -330,7 +336,6 @@ namespace /// Make a create query for this table. auto create_query = renameInCreateQuery(readCreateQueryFromBackup(table_name_)); - create_query->if_not_exists = !restore_settings.throw_if_table_exists; CreateTableInfo info; info.create_query = create_query; @@ -416,8 +421,6 @@ namespace db_name_in_backup.clear(); } - create_db_query->if_not_exists = !restore_settings.throw_if_database_exists; - CreateDatabaseInfo info_db; info_db.create_query = create_db_query; info_db.name_in_backup = std::move(db_name_in_backup); diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index c7282939556..e2298e04b44 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -621,6 +621,7 @@ M(650, SERIALIZATION_ERROR) \ M(651, CAPN_PROTO_BAD_TYPE) \ M(652, ONLY_NULLS_WHILE_READING_SCHEMA) \ + M(653, CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 32ad0fbebbc..2bc72c30bdc 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -78,12 +78,12 @@ def test_restore_table_into_existing_table(engine): instance.query(f"BACKUP TABLE test.table TO {backup_name}") instance.query( - f"RESTORE TABLE test.table INTO test.table FROM {backup_name} SETTINGS throw_if_table_exists=0" + f"RESTORE TABLE test.table INTO test.table FROM {backup_name}" ) assert instance.query("SELECT count(), sum(x) FROM test.table") == "200\t9900\n" instance.query( - f"RESTORE TABLE test.table INTO test.table FROM {backup_name} SETTINGS throw_if_table_exists=0" + f"RESTORE TABLE test.table INTO test.table FROM {backup_name}" ) assert instance.query("SELECT count(), sum(x) FROM test.table") == "300\t14850\n" From 6c3333b50b7d63dbdedca4edbe62d3a8c22199c8 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov 
Date: Wed, 13 Apr 2022 00:34:17 +0200 Subject: [PATCH 51/94] RESTORE TABLE doesn't create containing database anymore --- src/Backups/BackupUtils.cpp | 80 +++++--------------------- src/Backups/RestoreUtils.cpp | 107 ++++++----------------------------- 2 files changed, 30 insertions(+), 157 deletions(-) diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp index 9e2da6f34f8..8d6c57181f0 100644 --- a/src/Backups/BackupUtils.cpp +++ b/src/Backups/BackupUtils.cpp @@ -80,15 +80,6 @@ namespace /// Makes backup entries, should be called after prepare(). BackupEntries makeBackupEntries() const { - /// Check that there are not `different_create_query`. (If it's set it means error.) - for (const auto & info : databases | boost::adaptors::map_values) - { - if (info.different_create_query) - throw Exception(ErrorCodes::CANNOT_BACKUP_DATABASE, - "Cannot backup a database because two different create queries were generated for it: {} and {}", - serializeAST(*info.create_query), serializeAST(*info.different_create_query)); - } - BackupEntries res; for (const auto & info : databases | boost::adaptors::map_values) res.push_back(makeBackupEntryForMetadata(*info.create_query)); @@ -138,9 +129,9 @@ namespace database->getEngineName()); /// Check that we are not trying to backup the same table again. - DatabaseAndTableName new_table_name = renaming_settings.getNewTableName(table_name_); - if (tables.contains(new_table_name)) - throw Exception(ErrorCodes::CANNOT_BACKUP_TABLE, "Cannot backup the {} twice", formatTableNameOrTemporaryTableName(new_table_name)); + DatabaseAndTableName name_in_backup = renaming_settings.getNewTableName(table_name_); + if (tables.contains(name_in_backup)) + throw Exception(ErrorCodes::CANNOT_BACKUP_TABLE, "Cannot backup the {} twice", formatTableNameOrTemporaryTableName(name_in_backup)); /// Make a create query for this table. auto create_query = prepareCreateQueryForBackup(database->getCreateTableQuery(table_name_.second, context)); @@ -155,40 +146,9 @@ namespace CreateTableInfo info; info.create_query = create_query; info.storage = storage; - info.name_in_backup = new_table_name; info.partitions = partitions_; info.has_data = has_data; - tables[new_table_name] = std::move(info); - - /// If it's not system or temporary database then probably we need to backup the database's definition too. - if (!isSystemOrTemporaryDatabase(table_name_.first)) - { - if (!databases.contains(new_table_name.first)) - { - /// Add a create query to backup the database if we haven't done it yet. - auto create_db_query = prepareCreateQueryForBackup(database->getCreateDatabaseQuery()); - create_db_query->setDatabase(new_table_name.first); - - CreateDatabaseInfo info_db; - info_db.create_query = create_db_query; - info_db.original_name = table_name_.first; - info_db.is_explicit = false; - databases[new_table_name.first] = std::move(info_db); - } - else - { - /// We already have added a create query to backup the database, - /// set `different_create_query` if it's not the same. 
- auto & info_db = databases[new_table_name.first]; - if (!info_db.is_explicit && (info_db.original_name != table_name_.first) && !info_db.different_create_query) - { - auto create_db_query = prepareCreateQueryForBackup(table_.first->getCreateDatabaseQuery()); - create_db_query->setDatabase(new_table_name.first); - if (!areDatabaseDefinitionsSame(*info_db.create_query, *create_db_query)) - info_db.different_create_query = create_db_query; - } - } - } + tables[name_in_backup] = std::move(info); } /// Prepares to restore a database and all tables in it. @@ -203,21 +163,19 @@ namespace context->checkAccess(AccessType::SHOW_DATABASES, database_name_); /// Check that we are not trying to restore the same database again. - String new_database_name = renaming_settings.getNewDatabaseName(database_name_); - if (databases.contains(new_database_name) && databases[new_database_name].is_explicit) - throw Exception(ErrorCodes::CANNOT_BACKUP_DATABASE, "Cannot backup the database {} twice", backQuoteIfNeed(new_database_name)); + String name_in_backup = renaming_settings.getNewDatabaseName(database_name_); + if (databases.contains(name_in_backup)) + throw Exception(ErrorCodes::CANNOT_BACKUP_DATABASE, "Cannot backup the database {} twice", backQuoteIfNeed(name_in_backup)); /// Of course we're not going to backup the definition of the system or the temporary database. if (!isSystemOrTemporaryDatabase(database_name_)) { /// Make a create query for this database. - auto create_db_query = prepareCreateQueryForBackup(database_->getCreateDatabaseQuery()); + auto create_query = prepareCreateQueryForBackup(database_->getCreateDatabaseQuery()); - CreateDatabaseInfo info_db; - info_db.create_query = create_db_query; - info_db.original_name = database_name_; - info_db.is_explicit = true; - databases[new_database_name] = std::move(info_db); + CreateDatabaseInfo info; + info.create_query = create_query; + databases[name_in_backup] = std::move(info); } /// Backup tables in this database. @@ -273,7 +231,6 @@ namespace { ASTPtr create_query; StoragePtr storage; - DatabaseAndTableName name_in_backup; ASTs partitions; bool has_data = false; }; @@ -282,24 +239,13 @@ namespace struct CreateDatabaseInfo { ASTPtr create_query; - String original_name; - - /// Whether the creation of this database is specified explicitly, via RESTORE DATABASE or - /// RESTORE ALL DATABASES. - /// It's false if the creation of this database is caused by creating a table contained in it. - bool is_explicit = false; - - /// If this is set it means the following error: - /// it means that for implicitly created database there were two different create query - /// generated so we cannot restore the database. 
- ASTPtr different_create_query; }; ContextPtr context; BackupSettings backup_settings; DDLRenamingSettings renaming_settings; - std::map databases; - std::map tables; + std::map databases; + std::map tables; }; } diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index f439860e61c..9a976c4753a 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -48,12 +48,10 @@ namespace RestoreDatabaseTask( ContextMutablePtr context_, const ASTPtr & create_query_, - const RestoreSettingsPtr & restore_settings_, - bool ignore_if_database_def_differs_) + const RestoreSettingsPtr & restore_settings_) : context(context_) , create_query(typeid_cast>(create_query_)) , restore_settings(restore_settings_) - , ignore_if_database_def_differs(ignore_if_database_def_differs_) { } @@ -95,7 +93,7 @@ namespace void checkDatabaseCreateQuery() { - if (ignore_if_database_def_differs || restore_settings->allow_different_database_def) + if (restore_settings->allow_different_database_def) return; getDatabaseCreateQuery(); @@ -114,7 +112,6 @@ namespace ContextMutablePtr context; std::shared_ptr create_query; RestoreSettingsPtr restore_settings; - bool ignore_if_database_def_differs = false; DatabasePtr database; ASTPtr database_create_query; }; @@ -302,21 +299,11 @@ namespace /// Makes tasks for restoring, should be called after prepare(). RestoreTasks makeTasks() const { - /// Check that there are not `different_create_query`. (If it's set it means error.) - for (const auto & info : databases | boost::adaptors::map_values) - { - if (info.different_create_query) - throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, - "Cannot restore a database because two different create queries were generated for it: {} and {}", - serializeAST(*info.create_query), serializeAST(*info.different_create_query)); - } - auto restore_settings_ptr = std::make_shared(restore_settings); RestoreTasks res; for (const auto & info : databases | boost::adaptors::map_values) - res.push_back(std::make_unique(context, info.create_query, restore_settings_ptr, - /* ignore_if_database_def_differs = */ !info.is_explicit)); + res.push_back(std::make_unique(context, info.create_query, restore_settings_ptr)); /// TODO: We need to restore tables according to their dependencies. for (const auto & info : tables | boost::adaptors::map_values) @@ -342,52 +329,6 @@ namespace info.name_in_backup = table_name_; info.partitions = partitions_; tables[new_table_name] = std::move(info); - - /// If it's not system or temporary database then probably we need to restore the database's definition too. - if (!isSystemOrTemporaryDatabase(new_table_name.first)) - { - if (!databases.contains(new_table_name.first)) - { - /// Add a create query for restoring the database if we haven't done it yet. 
- std::shared_ptr create_db_query; - String db_name_in_backup = table_name_.first; - if (hasCreateQueryInBackup(db_name_in_backup)) - { - create_db_query = renameInCreateQuery(readCreateQueryFromBackup(db_name_in_backup)); - } - else - { - create_db_query = std::make_shared(); - db_name_in_backup.clear(); - } - create_db_query->setDatabase(new_table_name.first); - create_db_query->if_not_exists = true; - - CreateDatabaseInfo info_db; - info_db.create_query = create_db_query; - info_db.name_in_backup = std::move(db_name_in_backup); - info_db.is_explicit = false; - databases[new_table_name.first] = std::move(info_db); - } - else - { - /// We already have added a create query for restoring the database, - /// set `different_create_query` if it's not the same. - auto & info_db = databases[new_table_name.first]; - if (!info_db.is_explicit && (info_db.name_in_backup != table_name_.first) && !info_db.different_create_query) - { - std::shared_ptr create_db_query; - if (hasCreateQueryInBackup(table_name_.first)) - create_db_query = renameInCreateQuery(readCreateQueryFromBackup(table_name_.first)); - else - create_db_query = std::make_shared(); - create_db_query->setDatabase(new_table_name.first); - create_db_query->if_not_exists = true; - if (!areDatabaseDefinitionsSame(*info_db.create_query, *create_db_query)) - info_db.different_create_query = create_db_query; - } - } - } } /// Prepares to restore a database and all tables in it. @@ -395,37 +336,34 @@ namespace { /// Check that we are not trying to restore the same database again. String new_database_name = renaming_settings.getNewDatabaseName(database_name_); - if (databases.contains(new_database_name) && databases[new_database_name].is_explicit) + if (databases.contains(new_database_name)) throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} twice", backQuoteIfNeed(new_database_name)); Strings table_metadata_filenames = backup->listFiles("metadata/" + escapeForFileName(database_name_) + "/", "/"); + bool has_tables_in_backup = !table_metadata_filenames.empty(); + bool has_create_query_in_backup = hasCreateQueryInBackup(database_name_); - bool throw_if_no_create_database_query = table_metadata_filenames.empty(); - if (throw_if_no_create_database_query && !hasCreateQueryInBackup(database_name_)) + if (!has_create_query_in_backup && !has_tables_in_backup) throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} because there is no such database in the backup", backQuoteIfNeed(database_name_)); /// Of course we're not going to restore the definition of the system or the temporary database. if (!isSystemOrTemporaryDatabase(new_database_name)) { /// Make a create query for this database. 
- std::shared_ptr create_db_query; - String db_name_in_backup = database_name_; - if (hasCreateQueryInBackup(db_name_in_backup)) + std::shared_ptr create_query; + if (has_create_query_in_backup) { - create_db_query = renameInCreateQuery(readCreateQueryFromBackup(db_name_in_backup)); + create_query = renameInCreateQuery(readCreateQueryFromBackup(database_name_)); } else { - create_db_query = std::make_shared(); - create_db_query->setDatabase(database_name_); - db_name_in_backup.clear(); + create_query = std::make_shared(); + create_query->setDatabase(database_name_); } - CreateDatabaseInfo info_db; - info_db.create_query = create_db_query; - info_db.name_in_backup = std::move(db_name_in_backup); - info_db.is_explicit = true; - databases[new_database_name] = std::move(info_db); + CreateDatabaseInfo info; + info.create_query = create_query; + databases[new_database_name] = std::move(info); } /// Restore tables in this database. @@ -513,25 +451,14 @@ namespace struct CreateDatabaseInfo { ASTPtr create_query; - String name_in_backup; - - /// Whether the creation of this database is specified explicitly, via RESTORE DATABASE or - /// RESTORE ALL DATABASES. - /// It's false if the creation of this database is caused by creating a table contained in it. - bool is_explicit = false; - - /// If this is set it means the following error: - /// it means that for implicitly created database there were two different create query - /// generated so we cannot restore the database. - ASTPtr different_create_query; }; ContextMutablePtr context; BackupPtr backup; RestoreSettings restore_settings; DDLRenamingSettings renaming_settings; - std::map databases; - std::map tables; + std::map databases; + std::map tables; }; From acd28d8a1dda9b07d6c4541cd7f29a2108af9859 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 13 Apr 2022 15:26:17 +0200 Subject: [PATCH 52/94] Implement RESTORE for replicated tables. 
--- src/Storages/StorageReplicatedMergeTree.cpp | 161 ++++++++++++++++++ src/Storages/StorageReplicatedMergeTree.h | 3 + .../__init__.py | 0 .../configs/backups_disk.xml | 14 ++ .../configs/remote_servers.xml | 16 ++ .../test_backup_restore_replicated/test.py | 105 ++++++++++++ 6 files changed, 299 insertions(+) create mode 100644 tests/integration/test_backup_restore_replicated/__init__.py create mode 100644 tests/integration/test_backup_restore_replicated/configs/backups_disk.xml create mode 100644 tests/integration/test_backup_restore_replicated/configs/remote_servers.xml create mode 100644 tests/integration/test_backup_restore_replicated/test.py diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index ea69f658aa9..17150dce78f 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -68,6 +68,11 @@ #include #include +#include +#include +#include +#include + #include #include @@ -8195,4 +8200,160 @@ void StorageReplicatedMergeTree::createAndStoreFreezeMetadata(DiskPtr disk, Data } +class ReplicatedMergeTreeRestoreTask : public IRestoreTask +{ +public: + ReplicatedMergeTreeRestoreTask( + const ContextPtr & query_context_, + const std::shared_ptr & storage_, + const std::unordered_set & partition_ids_, + const BackupPtr & backup_, + const String & data_path_in_backup_) + : query_context(query_context_) + , storage(storage_) + , partition_ids(partition_ids_) + , backup(backup_) + , data_path_in_backup(data_path_in_backup_) + { + } + + RestoreTasks run() override + { + RestoreTasks restore_part_tasks; + Strings part_names = backup->listFiles(data_path_in_backup); + + auto metadata_snapshot = storage->getInMemoryMetadataPtr(); + auto sink = std::make_shared(*storage, metadata_snapshot, 0, 0, 0, false, false, query_context, /*is_attach*/true); + + for (const String & part_name : part_names) + { + const auto part_info = MergeTreePartInfo::tryParsePartName(part_name, storage->format_version); + if (!part_info) + continue; + + if (!partition_ids.empty() && !partition_ids.contains(part_info->partition_id)) + continue; + + restore_part_tasks.push_back( + std::make_unique(storage, sink, part_name, *part_info, backup, data_path_in_backup)); + } + return restore_part_tasks; + } + +private: + ContextPtr query_context; + std::shared_ptr storage; + std::unordered_set partition_ids; + BackupPtr backup; + String data_path_in_backup; + + class RestorePartTask : public IRestoreTask + { + public: + RestorePartTask( + const std::shared_ptr & storage_, + const std::shared_ptr & sink_, + const String & part_name_, + const MergeTreePartInfo & part_info_, + const BackupPtr & backup_, + const String & data_path_in_backup_) + : storage(storage_) + , sink(sink_) + , part_name(part_name_) + , part_info(part_info_) + , backup(backup_) + , data_path_in_backup(data_path_in_backup_) + { + } + + RestoreTasks run() override + { + UInt64 total_size_of_part = 0; + Strings filenames = backup->listFiles(data_path_in_backup + part_name + "/", ""); + for (const String & filename : filenames) + total_size_of_part += backup->getFileSize(data_path_in_backup + part_name + "/" + filename); + + std::shared_ptr reservation = storage->getStoragePolicy()->reserveAndCheck(total_size_of_part); + auto disk = reservation->getDisk(); + String relative_data_path = storage->getRelativeDataPath(); + + auto temp_part_dir_owner = std::make_shared(disk, relative_data_path + "restoring_" + part_name + "_"); + String temp_part_dir = 
temp_part_dir_owner->getPath(); + disk->createDirectories(temp_part_dir); + + assert(temp_part_dir.starts_with(relative_data_path)); + String relative_temp_part_dir = temp_part_dir.substr(relative_data_path.size()); + + for (const String & filename : filenames) + { + auto backup_entry = backup->readFile(fs::path(data_path_in_backup) / part_name / filename); + auto read_buffer = backup_entry->getReadBuffer(); + auto write_buffer = disk->writeFile(fs::path(temp_part_dir) / filename); + copyData(*read_buffer, *write_buffer); + reservation->update(reservation->getSize() - backup_entry->getSize()); + } + + auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); + auto part = storage->createPart(part_name, part_info, single_disk_volume, relative_temp_part_dir); + /// TODO Transactions: Decide what to do with version metadata (if any). Let's just remove it for now. + disk->removeFileIfExists(fs::path(temp_part_dir) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); + part->version.setCreationTID(Tx::PrehistoricTID, nullptr); + part->loadColumnsChecksumsIndexes(false, true); + sink->writeExistingPart(part); + return {}; + } + + private: + std::shared_ptr storage; + std::shared_ptr sink; + String part_name; + MergeTreePartInfo part_info; + BackupPtr backup; + String data_path_in_backup; + }; +}; + + +#if 0 +PartsTemporaryRename renamed_parts(*this, "detached/"); +MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts); + +/// TODO Allow to use quorum here. +ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, query_context, + /*is_attach*/true); + +for (size_t i = 0; i < loaded_parts.size(); ++i) +{ + const String old_name = loaded_parts[i]->name; + + output.writeExistingPart(loaded_parts[i]); + + renamed_parts.old_and_new_names[i].old_name.clear(); + + LOG_DEBUG(log, "Attached part {} as {}", old_name, loaded_parts[i]->name); + + results.push_back(PartitionCommandResultInfo{ + .partition_id = loaded_parts[i]->info.partition_id, + .part_name = loaded_parts[i]->name, + .old_part_name = old_name, + }); +} +#endif + + +RestoreTaskPtr StorageReplicatedMergeTree::restoreData( + ContextMutablePtr local_context, + const ASTs & partitions, + const BackupPtr & backup, + const String & data_path_in_backup, + const StorageRestoreSettings &) +{ + return std::make_unique( + local_context, + std::static_pointer_cast(shared_from_this()), + getPartitionIDsFromQuery(partitions, local_context), + backup, + data_path_in_backup); +} + } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 096ef20cf7f..99cdac36314 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -225,6 +225,9 @@ public: static bool removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); + /// Extract data from the backup and put it to the storage. 
+    RestoreTaskPtr restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings) override;
+
     /// Schedules job to execute in background pool (merge, mutate, drop range and so on)
     bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override;

diff --git a/tests/integration/test_backup_restore_replicated/__init__.py b/tests/integration/test_backup_restore_replicated/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_backup_restore_replicated/configs/backups_disk.xml b/tests/integration/test_backup_restore_replicated/configs/backups_disk.xml
new file mode 100644
index 00000000000..beb8d605f39
--- /dev/null
+++ b/tests/integration/test_backup_restore_replicated/configs/backups_disk.xml
@@ -0,0 +1,14 @@
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <backups>
+                <type>local</type>
+                <path>/backups/</path>
+            </backups>
+        </disks>
+    </storage_configuration>
+
+    <backups>
+        <allowed_disk>backups</allowed_disk>
+    </backups>
+</clickhouse>
diff --git a/tests/integration/test_backup_restore_replicated/configs/remote_servers.xml b/tests/integration/test_backup_restore_replicated/configs/remote_servers.xml
new file mode 100644
index 00000000000..84d16206080
--- /dev/null
+++ b/tests/integration/test_backup_restore_replicated/configs/remote_servers.xml
@@ -0,0 +1,16 @@
+<clickhouse>
+    <remote_servers>
+        <cluster>
+            <shard>
+                <replica>
+                    <host>node1</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>node2</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </cluster>
+    </remote_servers>
+</clickhouse>
diff --git a/tests/integration/test_backup_restore_replicated/test.py b/tests/integration/test_backup_restore_replicated/test.py
new file mode 100644
index 00000000000..c0c7bd91b6e
--- /dev/null
+++ b/tests/integration/test_backup_restore_replicated/test.py
@@ -0,0 +1,105 @@
+import pytest
+import os.path
+from helpers.cluster import ClickHouseCluster
+from helpers.test_tools import TSV
+
+
+cluster = ClickHouseCluster(__file__)
+
+node1 = cluster.add_instance(
+    "node1",
+    main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"],
+    external_dirs=["/backups/"],
+    macros={"replica": "node1"},
+    with_zookeeper=True,
+)
+
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"],
+    external_dirs=["/backups/"],
+    macros={"replica": "node2"},
+    with_zookeeper=True,
+)
+
+
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+        node1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' NO DELAY")
+    finally:
+        cluster.shutdown()
+
+
+def create_table(instance=None):
+    on_cluster_clause = "" if instance else "ON CLUSTER 'cluster'"
+    instance_to_execute = instance if instance else node1
+    instance_to_execute.query(
+        "CREATE TABLE tbl " + on_cluster_clause + " ("
+        "x UInt8, y String"
+        ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')"
+        "ORDER BY x"
+    )
+
+
+def drop_table(instance=None):
+    on_cluster_clause = "" if instance else "ON CLUSTER 'cluster'"
+    instance_to_execute = instance if instance else node1
+    instance_to_execute.query(f"DROP TABLE tbl {on_cluster_clause} NO DELAY")
+
+
+def insert_data(instance=None):
+    instance1_to_execute = instance if instance else node1
+    instance2_to_execute = instance if instance else node2
+    instance1_to_execute.query("INSERT INTO tbl VALUES (1, 'Don''t')")
+    instance2_to_execute.query("INSERT INTO tbl VALUES (2, 'count')")
+    instance1_to_execute.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (3, 'your')")
+    instance2_to_execute.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (4, 'chickens')")
+
+
+backup_id_counter = 0
+
+
+def new_backup_name():
+    global backup_id_counter
+    backup_id_counter += 1
+    return f"Disk('backups', '{backup_id_counter}.zip')"
+
+
+def get_path_to_backup(instance, backup_name):
+    return os.path.join(
+        instance.path,
+        "backups",
+        backup_name.removeprefix("Disk('backups', '").removesuffix("')"),
+    )
+
+
+def test_backup_and_restore():
+    create_table()
+    insert_data()
+
+    backup_name = new_backup_name()
+
+    # Make backup on node 1.
+    node1.query(f"BACKUP TABLE tbl TO {backup_name}")
+
+    # Drop table on both nodes.
+    drop_table()
+
+    # Restore from backup on node2.
+    os.link(
+        get_path_to_backup(node1, backup_name), get_path_to_backup(node2, backup_name)
+    )
+    node2.query(f"RESTORE TABLE tbl FROM {backup_name}")
+
+    assert node2.query("SELECT * FROM tbl ORDER BY x") == TSV(
+        [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
+    )
+
+    # Data should be replicated to node1.
+    create_table(node1)
+    assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV(
+        [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
+    )
From f14613f4337640824a9263347824c27ef940c27d Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Sat, 16 Apr 2022 22:18:42 +0200
Subject: [PATCH 53/94] Implemented backup version 2: now files in backups are
 named by their checksums. This allows storing duplicate files only once.

---
 src/Backups/BackupImpl.cpp | 326 +++++++++++-------
 src/Backups/BackupImpl.h | 25 +-
 src/Backups/IBackup.h | 6 +-
 .../test_backup_restore_new/test.py | 16 +-
 4 files changed, 233 insertions(+), 140 deletions(-)

diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp
index 071ed35bc83..f1bb8830a68 100644
--- a/src/Backups/BackupImpl.cpp
+++ b/src/Backups/BackupImpl.cpp
@@ -35,7 +35,10 @@ namespace ErrorCodes
 
 namespace
 {
-    const UInt64 BACKUP_VERSION = 1;
+    const UInt64 INITIAL_BACKUP_VERSION = 1;
+    const UInt64 CURRENT_BACKUP_VERSION = 2;
+
+    const UInt64 BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES = 2;
 
     UInt128 unhexChecksum(const String & checksum)
     {
@@ -51,18 +54,18 @@ class BackupImpl::BackupEntryFromBackupImpl : public IBackupEntry
 public:
     BackupEntryFromBackupImpl(
         const std::shared_ptr<const BackupImpl> & backup_,
-        const String & file_name_,
+        const String & data_file_name_,
         UInt64 size_,
-        const std::optional<UInt128> checksum_,
+        const UInt128 checksum_,
         BackupEntryPtr base_backup_entry_ = {})
-        : backup(backup_), file_name(file_name_), size(size_), checksum(checksum_),
+        : backup(backup_), data_file_name(data_file_name_), size(size_), checksum(checksum_),
           base_backup_entry(std::move(base_backup_entry_))
     {
     }
 
     std::unique_ptr<ReadBuffer> getReadBuffer() const override
     {
-        auto read_buffer = backup->readFileImpl(file_name);
+        auto read_buffer = backup->readFileImpl(data_file_name);
         if (base_backup_entry)
         {
             auto base_backup_read_buffer = base_backup_entry->getReadBuffer();
@@ -76,15 +79,93 @@ public:
 
 private:
     const std::shared_ptr<const BackupImpl> backup;
-    const String file_name;
+    const String data_file_name;
     const UInt64 size;
-    const std::optional<UInt128> checksum;
+    const UInt128 checksum;
     BackupEntryPtr base_backup_entry;
 };
 
+class BackupImpl::LocalFileInfos : public IFileInfos
+{
+public:
+    LocalFileInfos() = default;
+    ~LocalFileInfos() override = default;
+
+    void add(FileInfo && file_info, bool & is_new_checksum) override
+    {
+        file_names.emplace(file_info.file_name, file_info.checksum);
+        is_new_checksum = (file_info.checksum && !file_infos.contains(file_info.checksum));
+        if (is_new_checksum)
+            file_infos.emplace(file_info.checksum, std::move(file_info));
+    }
+
+    std::vector<FileInfo> getAllFileInfos() override
+    {
+        std::vector<FileInfo> res;
+        for (const auto & [file_name,
checksum] : file_names) + { + FileInfo info = file_infos.at(checksum); + info.file_name = file_name; + res.push_back(std::move(info)); + } + return res; + } + + Strings listFiles(const String & prefix, const String & terminator) override + { + Strings elements; + for (auto it = file_names.lower_bound(prefix); it != file_names.end(); ++it) + { + const String & name = it->first; + if (!name.starts_with(prefix)) + break; + size_t start_pos = prefix.length(); + size_t end_pos = String::npos; + if (!terminator.empty()) + end_pos = name.find(terminator, start_pos); + std::string_view new_element = std::string_view{name}.substr(start_pos, end_pos - start_pos); + if (!elements.empty() && (elements.back() == new_element)) + continue; + elements.push_back(String{new_element}); + } + return elements; + } + + std::optional getChecksumByFileName(const String & file_name) override + { + auto it = file_names.find(file_name); + if (it == file_names.end()) + return std::nullopt; + return it->second; + } + + std::optional getFileInfoByChecksum(const UInt128 & checksum) override + { + auto it = file_infos.find(checksum); + if (it == file_infos.end()) + return std::nullopt; + return it->second; + } + + std::optional getFileInfoByFileName(const String & file_name) override + { + auto it = file_names.find(file_name); + if (it == file_names.end()) + return std::nullopt; + FileInfo info = file_infos.at(it->second); + info.file_name = file_name; + return info; + } + +private: + std::map file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0. + std::unordered_map file_infos; /// Information about files. Without empty files. +}; + + BackupImpl::BackupImpl(const String & backup_name_, const ContextPtr & context_, const std::optional & base_backup_info_) - : backup_name(backup_name_), context(context_), base_backup_info_param(base_backup_info_) + : backup_name(backup_name_), context(context_), base_backup_info_param(base_backup_info_), file_infos(std::make_unique()) { } @@ -107,7 +188,6 @@ void BackupImpl::open(OpenMode open_mode_) timestamp = std::time(nullptr); uuid = UUIDHelpers::generateV4(); writing_finalized = false; - written_files.clear(); } if (open_mode_ == OpenMode::READ) @@ -146,14 +226,14 @@ void BackupImpl::close() if (open_mode == OpenMode::NONE) return; - closeImpl(written_files, writing_finalized); + closeImpl({}/*written_files*/, writing_finalized); uuid = UUIDHelpers::Nil; timestamp = 0; base_backup_info.reset(); base_backup.reset(); base_backup_uuid.reset(); - file_infos.clear(); + //file_infos.clear(); open_mode = OpenMode::NONE; } @@ -172,14 +252,14 @@ time_t BackupImpl::getTimestamp() const void BackupImpl::writeBackupMetadata() { Poco::AutoPtr config{new Poco::Util::XMLConfiguration()}; - config->setUInt("version", BACKUP_VERSION); + config->setUInt("version", CURRENT_BACKUP_VERSION); config->setString("timestamp", toString(LocalDateTime{timestamp})); config->setString("uuid", toString(uuid)); if (base_backup_info) { bool base_backup_in_use = false; - for (const auto & [name, info] : file_infos) + for (const auto & info : file_infos->getAllFileInfos()) { if (info.base_size) base_backup_in_use = true; @@ -193,10 +273,10 @@ void BackupImpl::writeBackupMetadata() } size_t index = 0; - for (const auto & [name, info] : file_infos) + for (const auto & info : file_infos->getAllFileInfos()) { String prefix = index ? "contents.file[" + std::to_string(index) + "]." 
: "contents.file."; - config->setString(prefix + "name", name); + config->setString(prefix + "name", info.file_name); config->setUInt(prefix + "size", info.size); if (info.size) { @@ -217,7 +297,7 @@ void BackupImpl::writeBackupMetadata() std::ostringstream stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM config->save(stream); String str = stream.str(); - written_files.push_back(".backup"); + //written_files.push_back(".backup"); auto out = writeFileImpl(".backup"); out->write(str.data(), str.size()); } @@ -231,8 +311,8 @@ void BackupImpl::readBackupMetadata() Poco::AutoPtr config{new Poco::Util::XMLConfiguration()}; config->load(stream); - UInt64 version = config->getUInt("version"); - if (version != BACKUP_VERSION) + version = config->getUInt("version"); + if ((version < INITIAL_BACKUP_VERSION) || (version > CURRENT_BACKUP_VERSION)) throw Exception(ErrorCodes::BACKUP_VERSION_NOT_SUPPORTED, "Backup {}: Version {} is not supported", getName(), version); timestamp = parse(config->getString("timestamp")).to_time_t(); @@ -244,7 +324,7 @@ void BackupImpl::readBackupMetadata() if (config->has("base_backup_uuid")) base_backup_uuid = parse(config->getString("base_backup_uuid")); - file_infos.clear(); + //file_infos.clear(); Poco::Util::AbstractConfiguration::Keys keys; config->keys("contents", keys); for (const auto & key : keys) @@ -252,24 +332,25 @@ void BackupImpl::readBackupMetadata() if ((key == "file") || key.starts_with("file[")) { String prefix = "contents." + key + "."; - String name = config->getString(prefix + "name"); FileInfo info; + info.file_name = config->getString(prefix + "name"); info.size = config->getUInt(prefix + "size"); - if (info.size) + info.checksum = info.size ? unhexChecksum(config->getString(prefix + "checksum")) : UInt128{0}; + + bool use_base = config->getBool(prefix + "use_base", false); + info.base_size = config->getUInt(prefix + "base_size", use_base ? info.size : 0); + if (info.base_size) + use_base = true; + + if (use_base) { - info.checksum = unhexChecksum(config->getString(prefix + "checksum")); - bool use_base = config->getBool(prefix + "use_base", false); - info.base_size = config->getUInt(prefix + "base_size", use_base ? 
info.size : 0); - if (info.base_size) - { - if (info.base_size == info.size) - info.base_checksum = info.checksum; - else - info.base_checksum = unhexChecksum(config->getString(prefix + "base_checksum")); - } + if (info.base_size == info.size) + info.base_checksum = info.checksum; + else + info.base_checksum = unhexChecksum(config->getString(prefix + "base_checksum")); } - file_infos.emplace(name, info); - file_checksums.emplace(info.checksum, name); + + file_infos->add(std::move(info)); } } } @@ -278,73 +359,64 @@ Strings BackupImpl::listFiles(const String & prefix, const String & terminator) { if (!prefix.ends_with('/') && !prefix.empty()) throw Exception("prefix should end with '/'", ErrorCodes::BAD_ARGUMENTS); - std::lock_guard lock{mutex}; - Strings elements; - for (auto it = file_infos.lower_bound(prefix); it != file_infos.end(); ++it) - { - const String & name = it->first; - if (!name.starts_with(prefix)) - break; - size_t start_pos = prefix.length(); - size_t end_pos = String::npos; - if (!terminator.empty()) - end_pos = name.find(terminator, start_pos); - std::string_view new_element = std::string_view{name}.substr(start_pos, end_pos - start_pos); - if (!elements.empty() && (elements.back() == new_element)) - continue; - elements.push_back(String{new_element}); - } - return elements; + return file_infos->listFiles(prefix, terminator); } bool BackupImpl::fileExists(const String & file_name) const { std::lock_guard lock{mutex}; - return file_infos.contains(file_name); + return file_infos->getChecksumByFileName(file_name).has_value(); +} + +bool BackupImpl::fileExistsByChecksum(const UInt128 & checksum) const +{ + return file_infos->getFileInfoByChecksum(checksum).has_value(); } size_t BackupImpl::getFileSize(const String & file_name) const { - std::lock_guard lock{mutex}; - auto it = file_infos.find(file_name); - if (it == file_infos.end()) + auto info = file_infos->getFileInfoByFileName(file_name); + if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); - return it->second.size; + return info->size; +} + +size_t BackupImpl::getFileSizeByChecksum(const UInt128 & checksum) const +{ + auto info = file_infos->getFileInfoByChecksum(checksum); + if (!info) + throw Exception( + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), getHexUIntLowercase(checksum)); + return info->size; } UInt128 BackupImpl::getFileChecksum(const String & file_name) const { - std::lock_guard lock{mutex}; - auto it = file_infos.find(file_name); - if (it == file_infos.end()) + auto info = file_infos->getFileInfoByFileName(file_name); + if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); - return it->second.checksum; + return info->checksum; } -std::optional BackupImpl::findFileByChecksum(const UInt128 & checksum) const -{ - std::lock_guard lock{mutex}; - auto it = file_checksums.find(checksum); - if (it == file_checksums.end()) - return std::nullopt; - return it->second; -} - - BackupEntryPtr BackupImpl::readFile(const String & file_name) const +{ + return readFileByChecksum(getFileChecksum(file_name)); +} + +BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const { std::lock_guard lock{mutex}; if (open_mode != OpenMode::READ) throw Exception("Backup is not opened for reading", ErrorCodes::LOGICAL_ERROR); - auto it = file_infos.find(file_name); - if (it == 
file_infos.end()) + auto info_opt = file_infos->getFileInfoByChecksum(checksum); + if (!info_opt) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), getHexUIntLowercase(checksum)); - const auto & info = it->second; + const auto & info = *info_opt; if (!info.size) { /// Entry's data is empty. @@ -354,8 +426,9 @@ BackupEntryPtr BackupImpl::readFile(const String & file_name) const if (!info.base_size) { /// Data goes completely from this backup, the base backup isn't used. + String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? getHexUIntLowercase(checksum) : info.file_name; return std::make_unique( - std::static_pointer_cast(shared_from_this()), file_name, info.size, info.checksum); + std::static_pointer_cast(shared_from_this()), data_file_name, info.size, info.checksum); } if (info.size < info.base_size) @@ -363,7 +436,7 @@ BackupEntryPtr BackupImpl::readFile(const String & file_name) const throw Exception( ErrorCodes::BACKUP_DAMAGED, "Backup {}: Entry {} has its data size less than in the base backup {}: {} < {}", - getName(), quoteString(file_name), base_backup->getName(), info.size, info.base_size); + getName(), getHexUIntLowercase(checksum), base_backup->getName(), info.size, info.base_size); } if (!base_backup) @@ -371,26 +444,25 @@ BackupEntryPtr BackupImpl::readFile(const String & file_name) const throw Exception( ErrorCodes::NO_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but there is no base backup specified", - getName(), quoteString(file_name)); + getName(), getHexUIntLowercase(checksum)); } - auto base_file_name = base_backup->findFileByChecksum(info.base_checksum); - if (!base_file_name) + if (!base_backup->fileExistsByChecksum(info.base_checksum)) { throw Exception( ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but doesn't exist there", - getName(), quoteString(file_name)); + getName(), getHexUIntLowercase(checksum)); } - auto base_entry = base_backup->readFile(*base_file_name); + auto base_entry = base_backup->readFileByChecksum(info.base_checksum); auto base_size = base_entry->getSize(); if (base_size != info.base_size) { throw Exception( ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: Entry {} has unexpected size in the base backup {}: {} (expected size: {})", - getName(), quoteString(file_name), base_backup->getName(), base_size, info.base_size); + getName(), getHexUIntLowercase(checksum), base_backup->getName(), base_size, info.base_size); } if (info.size == info.base_size) @@ -399,10 +471,13 @@ BackupEntryPtr BackupImpl::readFile(const String & file_name) const return base_entry; } - /// The beginning of the data goes from the base backup, - /// and the ending goes from this backup. - return std::make_unique( - static_pointer_cast(shared_from_this()), file_name, info.size, info.checksum, std::move(base_entry)); + { + /// The beginning of the data goes from the base backup, + /// and the ending goes from this backup. + String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? 
getHexUIntLowercase(checksum) : info.file_name; + return std::make_unique( + static_pointer_cast(shared_from_this()), data_file_name, info.size, info.checksum, std::move(base_entry)); + } } @@ -412,17 +487,28 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); - if (file_infos.contains(file_name)) + if (file_infos->getChecksumByFileName(file_name)) throw Exception( ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Backup {}: Entry {} already exists", getName(), quoteString(file_name)); - UInt64 size = entry->getSize(); - std::optional checksum = entry->getChecksum(); + FileInfo info; + info.file_name = file_name; + size_t size = entry->getSize(); + info.size = size; /// Check if the entry's data is empty. - if (!size) + if (!info.size) { - file_infos.emplace(file_name, FileInfo{}); + file_infos->add(std::move(info)); + return; + } + + /// Maybe we have a copy of this file in the backup already. + std::optional checksum = entry->getChecksum(); + if (checksum && file_infos->getFileInfoByChecksum(*checksum)) + { + info.checksum = *checksum; + file_infos->add(std::move(info)); return; } @@ -477,36 +563,43 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) checksum = hashing_read_buffer->getHash(); } hashing_read_buffer.reset(); + info.checksum = *checksum; + + /// Maybe we have a copy of this file in the backup already. + if (file_infos->getFileInfoByChecksum(*checksum)) + { + file_infos->add(std::move(info)); + return; + } /// Check if a entry with the same checksum exists in the base backup. - if (base_backup && !use_base) + if (base_backup && !use_base && base_backup->fileExistsByChecksum(*checksum)) { - if (auto base_file_name = base_backup->findFileByChecksum(*checksum)) - { - if (size == base_backup->getFileSize(*base_file_name)) - { - /// The entry's data has not changed since the base backup, - /// but the entry itself has been moved or renamed. - base_size = size; - base_checksum = *checksum; - use_base = true; - } - } + /// The entry's data has not changed since the base backup, + /// but the entry itself has been moved or renamed. + base_size = size; + base_checksum = *checksum; + use_base = true; + } + + if (use_base) + { + info.base_size = base_size; + info.base_checksum = base_checksum; } if (use_base && (size == base_size)) { /// The entry's data has not been changed since the base backup. - FileInfo info; - info.size = size; - info.checksum = *checksum; - info.base_size = base_size; - info.base_checksum = base_checksum; - file_infos.emplace(file_name, info); - file_checksums.emplace(*checksum, file_name); + file_infos->add(std::move(info)); return; } + bool is_new_checksum; + file_infos->add(std::move(info), is_new_checksum); + if (!is_new_checksum) + return; /// We copy data only if it's a new checksum. + /// Either the entry wasn't exist in the base backup /// or the entry has data appended to the end of the data from the base backup. /// In both those cases we have to copy data to this backup. @@ -527,21 +620,8 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) } /// Copy the entry's data after `copy_pos`. - written_files.push_back(file_name); - auto out = writeFileImpl(file_name); + auto out = writeFileImpl(getHexUIntLowercase(*checksum)); copyData(*read_buffer, *out); - - /// Done! 
-    FileInfo info;
-    info.size = size;
-    info.checksum = *checksum;
-    if (use_base)
-    {
-        info.base_size = base_size;
-        info.base_checksum = base_checksum;
-    }
-    file_infos.emplace(file_name, info);
-    file_checksums.emplace(*checksum, file_name);
 }
 
diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h
index 597b025d0ef..3fba3762b35 100644
--- a/src/Backups/BackupImpl.h
+++ b/src/Backups/BackupImpl.h
@@ -33,10 +33,12 @@ public:
     UUID getUUID() const override { return uuid; }
     Strings listFiles(const String & prefix, const String & terminator) const override;
     bool fileExists(const String & file_name) const override;
+    bool fileExistsByChecksum(const UInt128 & checksum) const override;
     size_t getFileSize(const String & file_name) const override;
+    size_t getFileSizeByChecksum(const UInt128 & checksum) const override;
     UInt128 getFileChecksum(const String & file_name) const override;
-    std::optional<String> findFileByChecksum(const UInt128 & checksum) const override;
     BackupEntryPtr readFile(const String & file_name) const override;
+    BackupEntryPtr readFileByChecksum(const UInt128 & checksum) const override;
     void writeFile(const String & file_name, BackupEntryPtr entry) override;
     void finalizeWriting() override;
 
@@ -65,6 +67,8 @@ private:
     struct FileInfo
     {
+        String file_name;
+
         UInt64 size = 0;
         UInt128 checksum{0, 0};
 
@@ -73,7 +77,21 @@ private:
         UInt128 base_checksum{0, 0};
     };
 
+    class IFileInfos
+    {
+    public:
+        virtual ~IFileInfos() {}
+        virtual void add(FileInfo && file_info, bool & is_new_checksum) = 0;
+        void add(FileInfo && file_info) { bool dummy; add(std::move(file_info), dummy); }
+        virtual std::vector<FileInfo> getAllFileInfos() = 0;
+        virtual Strings listFiles(const String & prefix, const String & terminator) = 0;
+        virtual std::optional<UInt128> getChecksumByFileName(const String & file_name) = 0;
+        virtual std::optional<FileInfo> getFileInfoByChecksum(const UInt128 & checksum) = 0;
+        virtual std::optional<FileInfo> getFileInfoByFileName(const String & file_name) = 0;
+    };
+
     class BackupEntryFromBackupImpl;
+    class LocalFileInfos;
 
     const String backup_name;
     ContextPtr context;
@@ -81,12 +99,11 @@ private:
     OpenMode open_mode = OpenMode::NONE;
     UUID uuid = {};
     time_t timestamp = 0;
+    UInt64 version = 1;
     std::optional<BackupInfo> base_backup_info;
     std::shared_ptr<const IBackup> base_backup;
     std::optional<UUID> base_backup_uuid;
-    std::map<String, FileInfo> file_infos; /// Should be ordered alphabetically, see listFiles().
-    std::unordered_map<UInt128, String> file_checksums;
-    Strings written_files;
+    std::unique_ptr<IFileInfos> file_infos;
     bool writing_finalized = false;
 };
 
diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h
index 5e13ff88575..727480923cf 100644
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@@ -53,20 +53,20 @@ public:
     /// Checks if an entry with a specified name exists.
     virtual bool fileExists(const String & file_name) const = 0;
+    virtual bool fileExistsByChecksum(const UInt128 & checksum) const = 0;
 
     /// Returns the size of the entry's data.
     /// This function does the same as `read(file_name)->getSize()` but faster.
     virtual size_t getFileSize(const String & file_name) const = 0;
+    virtual size_t getFileSizeByChecksum(const UInt128 & checksum) const = 0;
 
     /// Returns the checksum of the entry's data.
     /// This function does the same as `read(file_name)->getChecksum()` but faster.
     virtual UInt128 getFileChecksum(const String & file_name) const = 0;
 
-    /// Finds a file by its checksum, returns nullopt if not found.
-    virtual std::optional<String> findFileByChecksum(const UInt128 & checksum) const = 0;
-
     /// Reads an entry from the backup.
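+    /// Since backup version 2 an entry can also be addressed by the checksum of its data,
+    /// which is how deduplicated and renamed files are looked up in a base backup.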
     virtual BackupEntryPtr readFile(const String & file_name) const = 0;
+    virtual BackupEntryPtr readFileByChecksum(const UInt128 & checksum) const = 0;
 
     /// Puts a new entry to the backup.
     virtual void writeFile(const String & file_name, BackupEntryPtr entry) = 0;
diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py
index 2bc72c30bdc..9937ce692bc 100644
--- a/tests/integration/test_backup_restore_new/test.py
+++ b/tests/integration/test_backup_restore_new/test.py
@@ -162,16 +162,12 @@ def test_incremental_backup_after_renaming_table():
 
     # Files in a base backup can be searched by checksum, so an incremental backup with a renamed table actually
     # contains only its changed metadata.
-    assert os.path.isdir(os.path.join(get_backup_dir(backup_name), "metadata")) == True
-    assert os.path.isdir(os.path.join(get_backup_dir(backup_name), "data")) == True
-    assert (
-        os.path.isdir(os.path.join(get_backup_dir(incremental_backup_name), "metadata"))
-        == True
-    )
-    assert (
-        os.path.isdir(os.path.join(get_backup_dir(incremental_backup_name), "data"))
-        == False
-    )
+    contents = os.listdir(get_backup_dir(incremental_backup_name))
+    assert '.backup' in contents
+    contents.remove('.backup')
+    assert len(contents) == 1
+    with open(os.path.join(get_backup_dir(incremental_backup_name), contents[0])) as table_def_in_backup:
+        assert table_def_in_backup.read().startswith('CREATE TABLE test.table2')
 
     instance.query("DROP TABLE test.table2")
     instance.query(f"RESTORE TABLE test.table2 FROM {incremental_backup_name}")
From 3966ee1e30b853681cbb1df2032d6cfdf5ed2ddd Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Mon, 18 Apr 2022 00:53:45 +0200
Subject: [PATCH 54/94] Use SeekableReadBuffer instead of ReadBuffer in
 IBackupEntry.
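
An incremental backup entry can be assembled from two pieces: a prefix of
base_size bytes that lives in the base backup and a suffix that lives in this
backup. Seekable buffers let the reading side compose those pieces with random
access, and let the writing side jump straight to the start of the suffix
instead of skipping over it with ignore(). A rough sketch of the idea, using
the classes this patch adds (simplified, not the verbatim implementation;
readDataFile() is a stand-in for BackupImpl::readFileImpl, and error handling
is omitted):

    /// Reading: concatenate the prefix from the base backup with the suffix from this backup.
    std::unique_ptr<SeekableReadBuffer> base = base_entry->getReadBuffer();
    std::unique_ptr<SeekableReadBuffer> suffix = readDataFile(checksum);
    auto whole = std::make_unique<ConcatSeekableReadBuffer>(
        std::move(base), base_size, std::move(suffix), size - base_size);

    /// Writing: the first base_size bytes are already stored in the base backup,
    /// so seek directly to the new data instead of read_buffer->ignore(base_size).
    auto read_buffer = entry->getReadBuffer();
    read_buffer->seek(base_size, SEEK_SET);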
--- src/Backups/ArchiveBackup.cpp | 2 +- src/Backups/ArchiveBackup.h | 2 +- src/Backups/BackupEntryFromAppendOnlyFile.cpp | 6 +- src/Backups/BackupEntryFromAppendOnlyFile.h | 2 +- src/Backups/BackupEntryFromImmutableFile.cpp | 2 +- src/Backups/BackupEntryFromImmutableFile.h | 2 +- src/Backups/BackupEntryFromMemory.cpp | 2 +- src/Backups/BackupEntryFromMemory.h | 2 +- src/Backups/BackupImpl.cpp | 22 +-- src/Backups/BackupImpl.h | 3 +- src/Backups/DirectoryBackup.cpp | 2 +- src/Backups/DirectoryBackup.h | 2 +- src/Backups/IBackupEntriesBatch.cpp | 4 +- src/Backups/IBackupEntriesBatch.h | 2 +- src/Backups/IBackupEntry.h | 4 +- src/IO/ConcatSeekableReadBuffer.cpp | 139 ++++++++++++++++++ src/IO/ConcatSeekableReadBuffer.h | 46 ++++++ src/IO/LimitSeekableReadBuffer.cpp | 131 +++++++++++++++++ src/IO/LimitSeekableReadBuffer.h | 33 +++++ src/Storages/StorageMemory.cpp | 2 +- 20 files changed, 377 insertions(+), 33 deletions(-) create mode 100644 src/IO/ConcatSeekableReadBuffer.cpp create mode 100644 src/IO/ConcatSeekableReadBuffer.h create mode 100644 src/IO/LimitSeekableReadBuffer.cpp create mode 100644 src/IO/LimitSeekableReadBuffer.h diff --git a/src/Backups/ArchiveBackup.cpp b/src/Backups/ArchiveBackup.cpp index 0c4b0c3cd40..69194622e5a 100644 --- a/src/Backups/ArchiveBackup.cpp +++ b/src/Backups/ArchiveBackup.cpp @@ -86,7 +86,7 @@ void ArchiveBackup::closeImpl(const Strings &, bool writing_finalized_) fs::remove(path); } -std::unique_ptr ArchiveBackup::readFileImpl(const String & file_name) const +std::unique_ptr ArchiveBackup::readFileImpl(const String & file_name) const { /// mutex is already locked return reader->readFile(file_name); diff --git a/src/Backups/ArchiveBackup.h b/src/Backups/ArchiveBackup.h index d947fa16beb..4e935efbddc 100644 --- a/src/Backups/ArchiveBackup.h +++ b/src/Backups/ArchiveBackup.h @@ -37,7 +37,7 @@ private: void openImpl(OpenMode open_mode_) override; void closeImpl(const Strings & written_files_, bool writing_finalized_) override; bool supportsWritingInMultipleThreads() const override { return false; } - std::unique_ptr readFileImpl(const String & file_name) const override; + std::unique_ptr readFileImpl(const String & file_name) const override; std::unique_ptr writeFileImpl(const String & file_name) override; const DiskPtr disk; diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.cpp b/src/Backups/BackupEntryFromAppendOnlyFile.cpp index d7f9d5624c8..fa816091bdf 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.cpp +++ b/src/Backups/BackupEntryFromAppendOnlyFile.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace DB @@ -26,10 +26,10 @@ BackupEntryFromAppendOnlyFile::BackupEntryFromAppendOnlyFile( { } -std::unique_ptr BackupEntryFromAppendOnlyFile::getReadBuffer() const +std::unique_ptr BackupEntryFromAppendOnlyFile::getReadBuffer() const { auto buf = BackupEntryFromImmutableFile::getReadBuffer(); - return std::make_unique(std::move(buf), limit, false); + return std::make_unique(std::move(buf), limit); } } diff --git a/src/Backups/BackupEntryFromAppendOnlyFile.h b/src/Backups/BackupEntryFromAppendOnlyFile.h index c1de6930483..d868f82d45f 100644 --- a/src/Backups/BackupEntryFromAppendOnlyFile.h +++ b/src/Backups/BackupEntryFromAppendOnlyFile.h @@ -26,7 +26,7 @@ public: const std::shared_ptr & temporary_file_ = {}); UInt64 getSize() const override { return limit; } - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer() const override; private: const UInt64 limit; diff --git a/src/Backups/BackupEntryFromImmutableFile.cpp 
b/src/Backups/BackupEntryFromImmutableFile.cpp index 4323682950d..088324f364a 100644 --- a/src/Backups/BackupEntryFromImmutableFile.cpp +++ b/src/Backups/BackupEntryFromImmutableFile.cpp @@ -36,7 +36,7 @@ UInt64 BackupEntryFromImmutableFile::getSize() const return *file_size; } -std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer() const +std::unique_ptr BackupEntryFromImmutableFile::getReadBuffer() const { if (disk) return disk->readFile(file_path); diff --git a/src/Backups/BackupEntryFromImmutableFile.h b/src/Backups/BackupEntryFromImmutableFile.h index bc1d08aa180..4d5f47b4f61 100644 --- a/src/Backups/BackupEntryFromImmutableFile.h +++ b/src/Backups/BackupEntryFromImmutableFile.h @@ -33,7 +33,7 @@ public: UInt64 getSize() const override; std::optional getChecksum() const override { return checksum; } - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer() const override; String getFilePath() const { return file_path; } DiskPtr getDisk() const { return disk; } diff --git a/src/Backups/BackupEntryFromMemory.cpp b/src/Backups/BackupEntryFromMemory.cpp index 96493e7962e..f59eadc2d7f 100644 --- a/src/Backups/BackupEntryFromMemory.cpp +++ b/src/Backups/BackupEntryFromMemory.cpp @@ -15,7 +15,7 @@ BackupEntryFromMemory::BackupEntryFromMemory(String data_, const std::optional BackupEntryFromMemory::getReadBuffer() const +std::unique_ptr BackupEntryFromMemory::getReadBuffer() const { return std::make_unique(data); } diff --git a/src/Backups/BackupEntryFromMemory.h b/src/Backups/BackupEntryFromMemory.h index d497ff1b439..2226112c9c3 100644 --- a/src/Backups/BackupEntryFromMemory.h +++ b/src/Backups/BackupEntryFromMemory.h @@ -17,7 +17,7 @@ public: UInt64 getSize() const override { return data.size(); } std::optional getChecksum() const override { return checksum; } - std::unique_ptr getReadBuffer() const override; + std::unique_ptr getReadBuffer() const override; private: const String data; diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index f1bb8830a68..7b8c6418367 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include #include @@ -63,13 +63,14 @@ public: { } - std::unique_ptr getReadBuffer() const override + std::unique_ptr getReadBuffer() const override { auto read_buffer = backup->readFileImpl(data_file_name); if (base_backup_entry) { - auto base_backup_read_buffer = base_backup_entry->getReadBuffer(); - read_buffer = std::make_unique(std::move(base_backup_read_buffer), std::move(read_buffer)); + size_t base_size = base_backup_entry->getSize(); + read_buffer = std::make_unique( + base_backup_entry->getReadBuffer(), base_size, std::move(read_buffer), size - base_size); } return read_buffer; } @@ -522,7 +523,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) base_checksum = base_backup->getFileChecksum(file_name); } - std::unique_ptr read_buffer; /// We'll set that later. + std::unique_ptr read_buffer; /// We'll set that later. std::optional hashing_read_buffer; UInt64 hashing_pos = 0; /// Current position in `hashing_read_buffer`. @@ -608,16 +609,9 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) auto copy_pos = use_base ? base_size : 0; /// Move the current read position to the start position to copy data. - /// If `read_buffer` is seekable it's easier, otherwise we can use ignore(). 
- if (auto * seekable_buffer = dynamic_cast(read_buffer.get())) - { - seekable_buffer->seek(copy_pos, SEEK_SET); - } - else - { + if (!read_buffer) read_buffer = entry->getReadBuffer(); - read_buffer->ignore(copy_pos); - } + read_buffer->seek(copy_pos, SEEK_SET); /// Copy the entry's data after `copy_pos`. auto out = writeFileImpl(getHexUIntLowercase(*checksum)); diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 3fba3762b35..38fd7e7e7d3 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -11,6 +11,7 @@ namespace DB { class Context; using ContextPtr = std::shared_ptr; +class SeekableReadBuffer; /// Base implementation of IBackup. /// Along with passed files it also stores backup metadata - a single file named ".backup" in XML format @@ -53,7 +54,7 @@ protected: /// Read a file from the backup. /// Low level: the function doesn't check base backup or checksums. - virtual std::unique_ptr readFileImpl(const String & file_name) const = 0; + virtual std::unique_ptr readFileImpl(const String & file_name) const = 0; /// Add a file to the backup. /// Low level: the function doesn't check base backup or checksums. diff --git a/src/Backups/DirectoryBackup.cpp b/src/Backups/DirectoryBackup.cpp index 0deb41c200d..6a60cbdd1ef 100644 --- a/src/Backups/DirectoryBackup.cpp +++ b/src/Backups/DirectoryBackup.cpp @@ -56,7 +56,7 @@ void DirectoryBackup::closeImpl(const Strings & written_files_, bool writing_fin } } -std::unique_ptr DirectoryBackup::readFileImpl(const String & file_name) const +std::unique_ptr DirectoryBackup::readFileImpl(const String & file_name) const { auto file_path = path / file_name; return disk->readFile(file_path); diff --git a/src/Backups/DirectoryBackup.h b/src/Backups/DirectoryBackup.h index 499a1893dca..d9dbc81fa78 100644 --- a/src/Backups/DirectoryBackup.h +++ b/src/Backups/DirectoryBackup.h @@ -27,7 +27,7 @@ private: bool backupExists() const override; void openImpl(OpenMode open_mode_) override; void closeImpl(const Strings & written_files_, bool writing_finalized_) override; - std::unique_ptr readFileImpl(const String & file_name) const override; + std::unique_ptr readFileImpl(const String & file_name) const override; std::unique_ptr writeFileImpl(const String & file_name) override; DiskPtr disk; diff --git a/src/Backups/IBackupEntriesBatch.cpp b/src/Backups/IBackupEntriesBatch.cpp index bf6bc6cce83..34a91668023 100644 --- a/src/Backups/IBackupEntriesBatch.cpp +++ b/src/Backups/IBackupEntriesBatch.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace DB @@ -15,7 +15,7 @@ public: UInt64 getSize() const override { return batch->getSize(index); } std::optional getChecksum() const override { return batch->getChecksum(index); } - std::unique_ptr getReadBuffer() const override { return batch->getReadBuffer(index); } + std::unique_ptr getReadBuffer() const override { return batch->getReadBuffer(index); } private: const std::shared_ptr batch; diff --git a/src/Backups/IBackupEntriesBatch.h b/src/Backups/IBackupEntriesBatch.h index 0d8c8d5aa26..7fceb793c00 100644 --- a/src/Backups/IBackupEntriesBatch.h +++ b/src/Backups/IBackupEntriesBatch.h @@ -17,7 +17,7 @@ public: protected: IBackupEntriesBatch(const Strings & entry_names_) : entry_names(entry_names_) {} - virtual std::unique_ptr getReadBuffer(size_t index) = 0; + virtual std::unique_ptr getReadBuffer(size_t index) = 0; virtual UInt64 getSize(size_t index) = 0; virtual std::optional getChecksum(size_t) { return {}; } diff --git a/src/Backups/IBackupEntry.h b/src/Backups/IBackupEntry.h 
index 719e03ae6f5..55f03f1a710 100644 --- a/src/Backups/IBackupEntry.h +++ b/src/Backups/IBackupEntry.h @@ -7,7 +7,7 @@ namespace DB { -class ReadBuffer; +class SeekableReadBuffer; /// A backup entry represents some data which should be written to the backup or has been read from the backup. class IBackupEntry @@ -23,7 +23,7 @@ public: virtual std::optional getChecksum() const { return {}; } /// Returns a read buffer for reading the data. - virtual std::unique_ptr getReadBuffer() const = 0; + virtual std::unique_ptr getReadBuffer() const = 0; }; using BackupEntryPtr = std::unique_ptr; diff --git a/src/IO/ConcatSeekableReadBuffer.cpp b/src/IO/ConcatSeekableReadBuffer.cpp new file mode 100644 index 00000000000..c5d48376e2f --- /dev/null +++ b/src/IO/ConcatSeekableReadBuffer.cpp @@ -0,0 +1,139 @@ +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; +} + +ConcatSeekableReadBuffer::BufferInfo::~BufferInfo() +{ + if (own_in) + delete in; +} + +ConcatSeekableReadBuffer::ConcatSeekableReadBuffer(std::unique_ptr buf1, size_t size1, std::unique_ptr buf2, size_t size2) : ConcatSeekableReadBuffer() +{ + appendBuffer(std::move(buf1), size1); + appendBuffer(std::move(buf2), size2); +} + +ConcatSeekableReadBuffer::ConcatSeekableReadBuffer(SeekableReadBuffer & buf1, size_t size1, SeekableReadBuffer & buf2, size_t size2) : ConcatSeekableReadBuffer() +{ + appendBuffer(buf1, size1); + appendBuffer(buf2, size2); +} + +void ConcatSeekableReadBuffer::appendBuffer(std::unique_ptr buffer, size_t size) +{ + appendBuffer(buffer.release(), true, size); +} + +void ConcatSeekableReadBuffer::appendBuffer(SeekableReadBuffer & buffer, size_t size) +{ + appendBuffer(&buffer, false, size); +} + +void ConcatSeekableReadBuffer::appendBuffer(SeekableReadBuffer * buffer, bool own, size_t size) +{ + BufferInfo info; + info.in = buffer; + info.own_in = own; + info.size = size; + + if (!size) + return; + + buffers.emplace_back(std::move(info)); + total_size += size; + + if (current == buffers.size() - 1) + { + working_buffer = buffers[current].in->buffer(); + pos = buffers[current].in->position(); + } +} + +bool ConcatSeekableReadBuffer::nextImpl() +{ + if (current < buffers.size()) + { + buffers[current].in->position() = pos; + while ((current < buffers.size()) && buffers[current].in->eof()) + { + current_start_pos += buffers[current++].size; + if (current < buffers.size()) + buffers[current].in->seek(0, SEEK_SET); + } + } + + if (current >= buffers.size()) + { + current_start_pos = total_size; + set(nullptr, 0); + return false; + } + + working_buffer = buffers[current].in->buffer(); + pos = buffers[current].in->position(); + return true; +} + +off_t ConcatSeekableReadBuffer::getPosition() +{ + size_t current_pos = current_start_pos; + if (current < buffers.size()) + current_pos += buffers[current].in->getPosition() + offset(); + return current_pos; +} + +off_t ConcatSeekableReadBuffer::seek(off_t off, int whence) +{ + off_t new_position; + off_t current_position = getPosition(); + if (whence == SEEK_SET) + new_position = off; + else if (whence == SEEK_CUR) + new_position = current_position + off; + else + throw Exception("ConcatSeekableReadBuffer::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + if (new_position < 0) + throw Exception("SEEK_SET underflow: off = " + std::to_string(off), ErrorCodes::ARGUMENT_OUT_OF_BOUND); + if (static_cast(new_position) > total_size) + throw Exception("SEEK_CUR shift out of bounds", 
ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + if (static_cast(new_position) == total_size) + { + current = buffers.size(); + current_start_pos = total_size; + set(nullptr, 0); + return new_position; + } + + off_t change_position = new_position - current_position; + if ((working_buffer.begin() <= pos + change_position) && (pos + change_position <= working_buffer.end())) + { + /// Position is still inside the same working buffer. + pos += change_position; + assert(pos >= working_buffer.begin()); + assert(pos <= working_buffer.end()); + return new_position; + } + + while (new_position < static_cast(current_start_pos)) + current_start_pos -= buffers[--current].size; + + while (new_position >= static_cast(current_start_pos + buffers[current].size)) + current_start_pos += buffers[current++].size; + + buffers[current].in->seek(new_position - current_start_pos, SEEK_SET); + working_buffer = buffers[current].in->buffer(); + pos = buffers[current].in->position(); + return new_position; +} + +} diff --git a/src/IO/ConcatSeekableReadBuffer.h b/src/IO/ConcatSeekableReadBuffer.h new file mode 100644 index 00000000000..26314a218ea --- /dev/null +++ b/src/IO/ConcatSeekableReadBuffer.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +/// Reads from the concatenation of multiple SeekableReadBuffer's +class ConcatSeekableReadBuffer : public SeekableReadBufferWithSize +{ +public: + ConcatSeekableReadBuffer() : SeekableReadBufferWithSize(nullptr, 0) { } + ConcatSeekableReadBuffer(std::unique_ptr buf1, size_t size1, std::unique_ptr buf2, size_t size2); + ConcatSeekableReadBuffer(SeekableReadBuffer & buf1, size_t size1, SeekableReadBuffer & buf2, size_t size2); + + void appendBuffer(std::unique_ptr buffer, size_t size); + void appendBuffer(SeekableReadBuffer & buffer, size_t size); + + off_t seek(off_t off, int whence) override; + off_t getPosition() override; + + std::optional getTotalSize() override { return total_size; } + +private: + bool nextImpl() override; + void appendBuffer(SeekableReadBuffer * buffer, bool own, size_t size); + + struct BufferInfo + { + BufferInfo() = default; + BufferInfo(BufferInfo &&) = default; + ~BufferInfo(); + SeekableReadBuffer * in = nullptr; + bool own_in = false; + size_t size = 0; + }; + + std::vector buffers; + size_t total_size = 0; + size_t current = 0; + size_t current_start_pos = 0; /// Position of the current buffer's begin. +}; + +} diff --git a/src/IO/LimitSeekableReadBuffer.cpp b/src/IO/LimitSeekableReadBuffer.cpp new file mode 100644 index 00000000000..fc3300e71ca --- /dev/null +++ b/src/IO/LimitSeekableReadBuffer.cpp @@ -0,0 +1,131 @@ +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int LIMIT_EXCEEDED; +} + +bool LimitSeekableReadBuffer::nextImpl() +{ + if (end_position >= static_cast(limit)) + { + /// Limit reached. + set(in->position(), 0); + return false; + } + + assert(position() >= in->position()); + in->position() = position(); + + if (!in->next()) + { + /// EOF reached. 
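+        /// (Leaving an empty working buffer anchored at in->position() keeps position()
+        /// consistent with the underlying buffer; the destructor relies on that when it
+        /// syncs in->position() back.)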
+ set(in->position(), 0); + return false; + } + + working_buffer = in->buffer(); + pos = in->position(); + end_position = in->getPosition() + in->available(); + + if (end_position > static_cast(limit)) + { + working_buffer.resize(working_buffer.size() - end_position + limit); + end_position = limit; + } + + return true; +} + + +off_t LimitSeekableReadBuffer::seek(off_t off, int whence) +{ + off_t new_position; + off_t current_position = getPosition(); + if (whence == SEEK_SET) + new_position = off; + else if (whence == SEEK_CUR) + new_position = current_position + off; + else + throw Exception("LimitSeekableReadBuffer::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + if (new_position < 0) + throw Exception("SEEK_SET underflow: off = " + std::to_string(off), ErrorCodes::ARGUMENT_OUT_OF_BOUND); + if (static_cast(new_position) > limit) + throw Exception("SEEK_CUR shift out of bounds", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + + off_t change_position = new_position - current_position; + if ((working_buffer.begin() <= pos + change_position) && (pos + change_position <= working_buffer.end())) + { + /// Position is still inside buffer. + pos += change_position; + assert(pos >= working_buffer.begin()); + assert(pos <= working_buffer.end()); + return new_position; + } + + in->seek(new_position, SEEK_SET); + working_buffer = in->buffer(); + pos = in->position(); + end_position = in->getPosition() + in->available(); + + if (end_position > static_cast(limit)) + { + working_buffer.resize(working_buffer.size() - end_position + limit); + end_position = limit; + } + + return new_position; +} + + +LimitSeekableReadBuffer::LimitSeekableReadBuffer(SeekableReadBuffer * in_, bool owns, UInt64 limit_) + : SeekableReadBuffer(in_ ? in_->position() : nullptr, 0) + , in(in_) + , owns_in(owns) + , limit(limit_) +{ + assert(in); + + off_t current_position = in->getPosition(); + if (current_position > static_cast(limit)) + throw Exception("Limit for LimitSeekableReadBuffer exceeded", ErrorCodes::LIMIT_EXCEEDED); + + working_buffer = in->buffer(); + pos = in->position(); + end_position = current_position + in->available(); + + if (end_position > static_cast(limit)) + { + working_buffer.resize(working_buffer.size() - end_position + limit); + end_position = limit; + } +} + + +LimitSeekableReadBuffer::LimitSeekableReadBuffer(SeekableReadBuffer & in_, UInt64 limit_) + : LimitSeekableReadBuffer(&in_, false, limit_) +{ +} + + +LimitSeekableReadBuffer::LimitSeekableReadBuffer(std::unique_ptr in_, UInt64 limit_) + : LimitSeekableReadBuffer(in_.release(), true, limit_) +{ +} + + +LimitSeekableReadBuffer::~LimitSeekableReadBuffer() +{ + /// Update underlying buffer's position in case when limit wasn't reached. + in->position() = position(); + if (owns_in) + delete in; +} + +} diff --git a/src/IO/LimitSeekableReadBuffer.h b/src/IO/LimitSeekableReadBuffer.h new file mode 100644 index 00000000000..c6399f574c1 --- /dev/null +++ b/src/IO/LimitSeekableReadBuffer.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +/** Allows to read from another SeekableReadBuffer no far than the specified offset. + * Note that the nested SeekableReadBuffer may read slightly more data internally to fill its buffer. 
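+ * Seeking is supported with SEEK_SET and SEEK_CUR within [0, limit]; an attempt to
+ * seek past the limit throws ARGUMENT_OUT_OF_BOUND.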
+ */ +class LimitSeekableReadBuffer : public SeekableReadBuffer +{ +public: + LimitSeekableReadBuffer(SeekableReadBuffer & in_, UInt64 limit_); + LimitSeekableReadBuffer(std::unique_ptr in_, UInt64 limit_); + ~LimitSeekableReadBuffer() override; + + off_t seek(off_t off, int whence) override; + off_t getPosition() override { return end_position - available(); } + +private: + SeekableReadBuffer * in; + bool owns_in; + UInt64 limit; + off_t end_position; /// Offset of the end of working_buffer. + + LimitSeekableReadBuffer(SeekableReadBuffer * in_, bool owns, UInt64 limit_); + bool nextImpl() override; +}; + +} diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 3e2fe996fe8..26b435f98a0 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -448,7 +448,7 @@ private: }); } - std::unique_ptr getReadBuffer(size_t index) override + std::unique_ptr getReadBuffer(size_t index) override { initialize(); return createReadBufferFromFileBase(file_paths[index], {}); From c92a8925e35bc906897fc6fa19d960e8221f17ad Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Sun, 17 Apr 2022 14:11:43 +0200 Subject: [PATCH 55/94] Refactoring of backup implementation to support archives more easily. --- src/Backups/ArchiveBackup.cpp | 120 ------ src/Backups/ArchiveBackup.h | 52 --- src/Backups/BackupFactory.cpp | 4 +- src/Backups/BackupIO.h | 30 ++ src/Backups/BackupIO_Disk.cpp | 58 +++ src/Backups/BackupIO_Disk.h | 41 ++ src/Backups/BackupIO_File.cpp | 57 +++ src/Backups/BackupIO_File.h | 37 ++ src/Backups/BackupImpl.cpp | 396 +++++++++++------- src/Backups/BackupImpl.h | 107 +++-- src/Backups/DirectoryBackup.cpp | 72 ---- src/Backups/DirectoryBackup.h | 37 -- src/Backups/DistributedBackupCoordination.cpp | 237 +++++++++++ src/Backups/DistributedBackupCoordination.h | 39 ++ src/Backups/IBackup.h | 8 +- src/Backups/IBackupCoordination.h | 63 +++ src/Backups/LocalBackupCoordination.cpp | 104 +++++ src/Backups/LocalBackupCoordination.h | 36 ++ .../registerBackupEnginesFileAndDisk.cpp | 35 +- 19 files changed, 1021 insertions(+), 512 deletions(-) delete mode 100644 src/Backups/ArchiveBackup.cpp delete mode 100644 src/Backups/ArchiveBackup.h create mode 100644 src/Backups/BackupIO.h create mode 100644 src/Backups/BackupIO_Disk.cpp create mode 100644 src/Backups/BackupIO_Disk.h create mode 100644 src/Backups/BackupIO_File.cpp create mode 100644 src/Backups/BackupIO_File.h delete mode 100644 src/Backups/DirectoryBackup.cpp delete mode 100644 src/Backups/DirectoryBackup.h create mode 100644 src/Backups/DistributedBackupCoordination.cpp create mode 100644 src/Backups/DistributedBackupCoordination.h create mode 100644 src/Backups/IBackupCoordination.h create mode 100644 src/Backups/LocalBackupCoordination.cpp create mode 100644 src/Backups/LocalBackupCoordination.h diff --git a/src/Backups/ArchiveBackup.cpp b/src/Backups/ArchiveBackup.cpp deleted file mode 100644 index 69194622e5a..00000000000 --- a/src/Backups/ArchiveBackup.cpp +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - - -ArchiveBackup::ArchiveBackup( - const String & backup_name_, - const DiskPtr & disk_, - const String & path_, - const ContextPtr & context_, - const std::optional & base_backup_info_) - : BackupImpl(backup_name_, context_, base_backup_info_), disk(disk_), path(path_) -{ -} - -ArchiveBackup::~ArchiveBackup() -{ - close(); -} - -bool 
ArchiveBackup::backupExists() const -{ - return disk ? disk->exists(path) : fs::exists(path); -} - -void ArchiveBackup::openImpl(OpenMode open_mode_) -{ - /// mutex is already locked - if (open_mode_ == OpenMode::WRITE) - { - /// Create a directory to contain the archive. - auto dir_path = fs::path(path).parent_path(); - if (disk) - disk->createDirectories(dir_path); - else - std::filesystem::create_directories(dir_path); - - /// Start writing the archive. - if (disk) - writer = createArchiveWriter(path, disk->writeFile(path)); - else - writer = createArchiveWriter(path); - - writer->setCompression(compression_method, compression_level); - writer->setPassword(password); - } - else if (open_mode_ == OpenMode::READ) - { - if (disk) - { - auto archive_read_function = [d = disk, p = path]() -> std::unique_ptr { return d->readFile(p); }; - size_t archive_size = disk->getFileSize(path); - reader = createArchiveReader(path, archive_read_function, archive_size); - } - else - reader = createArchiveReader(path); - - reader->setPassword(password); - } -} - -void ArchiveBackup::closeImpl(const Strings &, bool writing_finalized_) -{ - /// mutex is already locked - if (writer && writer->isWritingFile()) - throw Exception("There is some writing unfinished on close", ErrorCodes::LOGICAL_ERROR); - - writer.reset(); - reader.reset(); - - if ((getOpenModeNoLock() == OpenMode::WRITE) && !writing_finalized_) - fs::remove(path); -} - -std::unique_ptr ArchiveBackup::readFileImpl(const String & file_name) const -{ - /// mutex is already locked - return reader->readFile(file_name); -} - -std::unique_ptr ArchiveBackup::writeFileImpl(const String & file_name) -{ - /// mutex is already locked - return writer->writeFile(file_name); -} - -void ArchiveBackup::setCompression(const String & compression_method_, int compression_level_) -{ - std::lock_guard lock{mutex}; - compression_method = compression_method_; - compression_level = compression_level_; - if (writer) - writer->setCompression(compression_method, compression_level); -} - -void ArchiveBackup::setPassword(const String & password_) -{ - std::lock_guard lock{mutex}; - password = password_; - if (writer) - writer->setPassword(password); - if (reader) - reader->setPassword(password); -} - -} diff --git a/src/Backups/ArchiveBackup.h b/src/Backups/ArchiveBackup.h deleted file mode 100644 index 4e935efbddc..00000000000 --- a/src/Backups/ArchiveBackup.h +++ /dev/null @@ -1,52 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ -class IDisk; -using DiskPtr = std::shared_ptr; -class IArchiveReader; -class IArchiveWriter; - -/// Stores a backup as a single .zip file. -class ArchiveBackup : public BackupImpl -{ -public: - /// `disk`_ is allowed to be nullptr and that means the `path_` is a path in the local filesystem. - ArchiveBackup( - const String & backup_name_, - const DiskPtr & disk_, - const String & path_, - const ContextPtr & context_, - const std::optional & base_backup_info_ = {}); - - ~ArchiveBackup() override; - - static constexpr const int kDefaultCompressionLevel = -1; - - /// Sets compression method and level. - void setCompression(const String & compression_method_, int compression_level_ = kDefaultCompressionLevel); - - /// Sets password. 
- void setPassword(const String & password_); - -private: - bool backupExists() const override; - void openImpl(OpenMode open_mode_) override; - void closeImpl(const Strings & written_files_, bool writing_finalized_) override; - bool supportsWritingInMultipleThreads() const override { return false; } - std::unique_ptr readFileImpl(const String & file_name) const override; - std::unique_ptr writeFileImpl(const String & file_name) override; - - const DiskPtr disk; - const String path; - std::shared_ptr reader; - std::shared_ptr writer; - String compression_method; - int compression_level = kDefaultCompressionLevel; - String password; -}; - -} diff --git a/src/Backups/BackupFactory.cpp b/src/Backups/BackupFactory.cpp index d64c2bd0318..a23cc70658b 100644 --- a/src/Backups/BackupFactory.cpp +++ b/src/Backups/BackupFactory.cpp @@ -21,9 +21,7 @@ BackupMutablePtr BackupFactory::createBackup(const CreateParams & params) const auto it = creators.find(engine_name); if (it == creators.end()) throw Exception(ErrorCodes::BACKUP_ENGINE_NOT_FOUND, "Not found backup engine {}", engine_name); - BackupMutablePtr backup = (it->second)(params); - backup->open(params.open_mode); - return backup; + return (it->second)(params); } void BackupFactory::registerBackupEngine(const String & engine_name, const CreatorFn & creator_fn) diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h new file mode 100644 index 00000000000..ec0b2301800 --- /dev/null +++ b/src/Backups/BackupIO.h @@ -0,0 +1,30 @@ +#pragma once + +#include + +namespace DB +{ +class SeekableReadBuffer; +class WriteBuffer; + +/// Represents operations of loading from disk or downloading for reading a backup. +class IBackupReader /// BackupReaderFile, BackupReaderDisk, BackupReaderS3 +{ +public: + virtual ~IBackupReader() = default; + virtual bool fileExists(const String & file_name) = 0; + virtual size_t getFileSize(const String & file_name) = 0; + virtual std::unique_ptr readFile(const String & file_name) = 0; +}; + +/// Represents operations of storing to disk or uploading for writing a backup. 
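+/// (Splitting reading and writing out of the backup object means BackupImpl no longer needs
+/// a subclass per storage backend: a local directory, a disk, or - later - S3 can each be
+/// plugged in through these two small interfaces.)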
+class IBackupWriter /// BackupWriterFile, BackupWriterDisk, BackupWriterS3 +{ +public: + virtual ~IBackupWriter() = default; + virtual bool fileExists(const String & file_name) = 0; + virtual std::unique_ptr writeFile(const String & file_name) = 0; + virtual void removeFilesAfterFailure(const Strings & file_names) = 0; +}; + +} diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp new file mode 100644 index 00000000000..ff5ad13897c --- /dev/null +++ b/src/Backups/BackupIO_Disk.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include + +namespace fs = std::filesystem; + + +namespace DB +{ +BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & path_) : disk(disk_), path(path_) +{ +} + +BackupReaderDisk::~BackupReaderDisk() = default; + +bool BackupReaderDisk::fileExists(const String & file_name) +{ + return disk->exists(path / file_name); +} + +size_t BackupReaderDisk::getFileSize(const String & file_name) +{ + return disk->getFileSize(path / file_name); +} + +std::unique_ptr BackupReaderDisk::readFile(const String & file_name) +{ + return disk->readFile(path / file_name); +} + +BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_) : disk(disk_), path(path_) +{ +} + +BackupWriterDisk::~BackupWriterDisk() = default; + +bool BackupWriterDisk::fileExists(const String & file_name) +{ + return disk->exists(path / file_name); +} + +std::unique_ptr BackupWriterDisk::writeFile(const String & file_name) +{ + auto file_path = path / file_name; + disk->createDirectories(file_path.parent_path()); + return disk->writeFile(file_path); +} + +void BackupWriterDisk::removeFilesAfterFailure(const Strings & file_names) +{ + for (const auto & file_name : file_names) + disk->removeFile(path / file_name); + if (disk->isDirectory(path) && disk->isDirectoryEmpty(path)) + disk->removeDirectory(path); +} + +} diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h new file mode 100644 index 00000000000..88d70b0f1db --- /dev/null +++ b/src/Backups/BackupIO_Disk.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +namespace DB +{ +class IDisk; +using DiskPtr = std::shared_ptr; + +class BackupReaderDisk : public IBackupReader +{ +public: + BackupReaderDisk(const DiskPtr & disk_, const String & path_); + ~BackupReaderDisk() override; + + bool fileExists(const String & file_name) override; + size_t getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; + +private: + DiskPtr disk; + std::filesystem::path path; +}; + +class BackupWriterDisk : public IBackupWriter +{ +public: + BackupWriterDisk(const DiskPtr & disk_, const String & path_); + ~BackupWriterDisk() override; + + bool fileExists(const String & file_name) override; + std::unique_ptr writeFile(const String & file_name) override; + void removeFilesAfterFailure(const Strings & file_names) override; + +private: + DiskPtr disk; + std::filesystem::path path; +}; + +} diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp new file mode 100644 index 00000000000..8e7bfb5b83e --- /dev/null +++ b/src/Backups/BackupIO_File.cpp @@ -0,0 +1,57 @@ +#include +#include +#include + +namespace fs = std::filesystem; + + +namespace DB +{ +BackupReaderFile::BackupReaderFile(const String & path_) : path(path_) +{ +} + +BackupReaderFile::~BackupReaderFile() = default; + +bool BackupReaderFile::fileExists(const String & file_name) +{ + return fs::exists(path / file_name); +} + +size_t BackupReaderFile::getFileSize(const 
diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp
new file mode 100644
index 00000000000..ff5ad13897c
--- /dev/null
+++ b/src/Backups/BackupIO_Disk.cpp
@@ -0,0 +1,58 @@
+#include <Backups/BackupIO_Disk.h>
+#include <Disks/IDisk.h>
+#include <IO/ReadBufferFromFileBase.h>
+#include <IO/WriteBufferFromFileBase.h>
+
+namespace fs = std::filesystem;
+
+
+namespace DB
+{
+BackupReaderDisk::BackupReaderDisk(const DiskPtr & disk_, const String & path_) : disk(disk_), path(path_)
+{
+}
+
+BackupReaderDisk::~BackupReaderDisk() = default;
+
+bool BackupReaderDisk::fileExists(const String & file_name)
+{
+    return disk->exists(path / file_name);
+}
+
+size_t BackupReaderDisk::getFileSize(const String & file_name)
+{
+    return disk->getFileSize(path / file_name);
+}
+
+std::unique_ptr<SeekableReadBuffer> BackupReaderDisk::readFile(const String & file_name)
+{
+    return disk->readFile(path / file_name);
+}
+
+BackupWriterDisk::BackupWriterDisk(const DiskPtr & disk_, const String & path_) : disk(disk_), path(path_)
+{
+}
+
+BackupWriterDisk::~BackupWriterDisk() = default;
+
+bool BackupWriterDisk::fileExists(const String & file_name)
+{
+    return disk->exists(path / file_name);
+}
+
+std::unique_ptr<WriteBuffer> BackupWriterDisk::writeFile(const String & file_name)
+{
+    auto file_path = path / file_name;
+    disk->createDirectories(file_path.parent_path());
+    return disk->writeFile(file_path);
+}
+
+void BackupWriterDisk::removeFilesAfterFailure(const Strings & file_names)
+{
+    for (const auto & file_name : file_names)
+        disk->removeFile(path / file_name);
+    if (disk->isDirectory(path) && disk->isDirectoryEmpty(path))
+        disk->removeDirectory(path);
+}
+
+}
diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h
new file mode 100644
index 00000000000..88d70b0f1db
--- /dev/null
+++ b/src/Backups/BackupIO_Disk.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <filesystem>
+#include <Backups/BackupIO.h>
+
+namespace DB
+{
+class IDisk;
+using DiskPtr = std::shared_ptr<IDisk>;
+
+class BackupReaderDisk : public IBackupReader
+{
+public:
+    BackupReaderDisk(const DiskPtr & disk_, const String & path_);
+    ~BackupReaderDisk() override;
+
+    bool fileExists(const String & file_name) override;
+    size_t getFileSize(const String & file_name) override;
+    std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) override;
+
+private:
+    DiskPtr disk;
+    std::filesystem::path path;
+};
+
+class BackupWriterDisk : public IBackupWriter
+{
+public:
+    BackupWriterDisk(const DiskPtr & disk_, const String & path_);
+    ~BackupWriterDisk() override;
+
+    bool fileExists(const String & file_name) override;
+    std::unique_ptr<WriteBuffer> writeFile(const String & file_name) override;
+    void removeFilesAfterFailure(const Strings & file_names) override;
+
+private:
+    DiskPtr disk;
+    std::filesystem::path path;
+};
+
+}
diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp
new file mode 100644
index 00000000000..8e7bfb5b83e
--- /dev/null
+++ b/src/Backups/BackupIO_File.cpp
@@ -0,0 +1,57 @@
+#include <Backups/BackupIO_File.h>
+#include <Disks/IO/createReadBufferFromFileBase.h>
+#include <IO/WriteBufferFromFile.h>
+
+namespace fs = std::filesystem;
+
+
+namespace DB
+{
+BackupReaderFile::BackupReaderFile(const String & path_) : path(path_)
+{
+}
+
+BackupReaderFile::~BackupReaderFile() = default;
+
+bool BackupReaderFile::fileExists(const String & file_name)
+{
+    return fs::exists(path / file_name);
+}
+
+size_t BackupReaderFile::getFileSize(const String & file_name)
+{
+    return fs::file_size(path / file_name);
+}
+
+std::unique_ptr<SeekableReadBuffer> BackupReaderFile::readFile(const String & file_name)
+{
+    return createReadBufferFromFileBase(path / file_name, {});
+}
+
+BackupWriterFile::BackupWriterFile(const String & path_) : path(path_)
+{
+}
+
+BackupWriterFile::~BackupWriterFile() = default;
+
+bool BackupWriterFile::fileExists(const String & file_name)
+{
+    return fs::exists(path / file_name);
+}
+
+std::unique_ptr<WriteBuffer> BackupWriterFile::writeFile(const String & file_name)
+{
+    auto file_path = path / file_name;
+    fs::create_directories(file_path.parent_path());
+    return std::make_unique<WriteBufferFromFile>(file_path);
+}
+
+void BackupWriterFile::removeFilesAfterFailure(const Strings & file_names)
+{
+    for (const auto & file_name : file_names)
+        fs::remove(path / file_name);
+    if (fs::is_directory(path) && fs::is_empty(path))
+        fs::remove(path);
+}
+
+}
diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h
new file mode 100644
index 00000000000..c4aa20718a9
--- /dev/null
+++ b/src/Backups/BackupIO_File.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <filesystem>
+#include <Backups/BackupIO.h>
+
+namespace DB
+{
+
+class BackupReaderFile : public IBackupReader
+{
+public:
+    BackupReaderFile(const String & path_);
+    ~BackupReaderFile() override;
+
+    bool fileExists(const String & file_name) override;
+    size_t getFileSize(const String & file_name) override;
+    std::unique_ptr<SeekableReadBuffer> readFile(const String & file_name) override;
+
+private:
+    std::filesystem::path path;
+};
+
+class BackupWriterFile : public IBackupWriter
+{
+public:
+    BackupWriterFile(const String & path_);
+    ~BackupWriterFile() override;
+
+    bool fileExists(const String & file_name) override;
+    std::unique_ptr<WriteBuffer> writeFile(const String & file_name) override;
+    void removeFilesAfterFailure(const Strings & file_names) override;
+
+private:
+    std::filesystem::path path;
+};
+
+}
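A minimal round trip through the File backend above, assuming the usual ClickHouse IO helpers (ReadBufferFromString, copyData, readStrict); the path and file names here are arbitrary examples:

    BackupWriterFile writer("/tmp/backup_example");
    {
        ReadBufferFromString src("hello");
        auto out = writer.writeFile("data/part-0.bin");  /// creates parent directories
        copyData(src, *out);
        out->finalize();
    }

    BackupReaderFile reader("/tmp/backup_example");
    if (reader.fileExists("data/part-0.bin"))
    {
        auto in = reader.readFile("data/part-0.bin");
        String content(reader.getFileSize("data/part-0.bin"), '\0');
        in->readStrict(content.data(), content.size());  /// content == "hello"
    }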
diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp
index 7b8c6418367..b08012287f3 100644
--- a/src/Backups/BackupImpl.cpp
+++ b/src/Backups/BackupImpl.cpp
@@ -1,10 +1,16 @@
 #include
 #include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -37,15 +43,21 @@
 namespace
 {
     const UInt64 INITIAL_BACKUP_VERSION = 1;
     const UInt64 CURRENT_BACKUP_VERSION = 2;
-
     const UInt64 BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES = 2;
+
+    String hexChecksum(UInt128 checksum)
+    {
+        return getHexUIntLowercase(checksum);
+    }
+
     UInt128 unhexChecksum(const String & checksum)
     {
         if (checksum.size() != sizeof(UInt128) * 2)
             throw Exception(ErrorCodes::BACKUP_DAMAGED, "Unexpected size of checksum: {}, must be {}", checksum.size(), sizeof(UInt128) * 2);
         return unhexUInt<UInt128>(checksum.data());
     }
+
+    using FileInfo = IBackupCoordination::FileInfo;
 }
@@ -54,18 +66,23 @@ class BackupImpl::BackupEntryFromBackupImpl : public IBackupEntry
 public:
     BackupEntryFromBackupImpl(
         const std::shared_ptr<const BackupImpl> & backup_,
+        const String & archive_suffix_,
         const String & data_file_name_,
         UInt64 size_,
         const UInt128 checksum_,
         BackupEntryPtr base_backup_entry_ = {})
-        : backup(backup_), data_file_name(data_file_name_), size(size_), checksum(checksum_),
+        : backup(backup_), archive_suffix(archive_suffix_), data_file_name(data_file_name_), size(size_), checksum(checksum_),
           base_backup_entry(std::move(base_backup_entry_))
     {
     }
 
     std::unique_ptr<SeekableReadBuffer> getReadBuffer() const override
     {
-        auto read_buffer = backup->readFileImpl(data_file_name);
+        std::unique_ptr<SeekableReadBuffer> read_buffer;
+        if (backup->use_archives)
+            read_buffer = backup->getArchiveReader(archive_suffix)->readFile(data_file_name);
+        else
+            read_buffer = backup->reader->readFile(data_file_name);
         if (base_backup_entry)
         {
             size_t base_size = base_backup_entry->getSize();
@@ -80,6 +97,7 @@ public:
 
 private:
     const std::shared_ptr<const BackupImpl> backup;
+    const String archive_suffix;
     const String data_file_name;
     const UInt64 size;
     const UInt128 checksum;
@@ -87,120 +105,85 @@ private:
 };
 
 
-class BackupImpl::LocalFileInfos : public IFileInfos
-{
-public:
-    LocalFileInfos() = default;
-    ~LocalFileInfos() override = default;
-
-    void add(FileInfo && file_info, bool & is_new_checksum) override
-    {
-        file_names.emplace(file_info.file_name, file_info.checksum);
-        is_new_checksum = (file_info.checksum && !file_infos.contains(file_info.checksum));
-        if (is_new_checksum)
-            file_infos.emplace(file_info.checksum, std::move(file_info));
-    }
-
-    std::vector<FileInfo> getAllFileInfos() override
-    {
-        std::vector<FileInfo> res;
-        for (const auto & [file_name, checksum] : file_names)
-        {
-            FileInfo info = file_infos.at(checksum);
-            info.file_name = file_name;
-            res.push_back(std::move(info));
-        }
-        return res;
-    }
-
-    Strings listFiles(const String & prefix, const String & terminator) override
-    {
-        Strings elements;
-        for (auto it = file_names.lower_bound(prefix); it != file_names.end(); ++it)
-        {
-            const String & name = it->first;
-            if (!name.starts_with(prefix))
-                break;
-            size_t start_pos = prefix.length();
-            size_t end_pos = String::npos;
-            if (!terminator.empty())
-                end_pos = name.find(terminator, start_pos);
-            std::string_view new_element = std::string_view{name}.substr(start_pos, end_pos - start_pos);
-            if (!elements.empty() && (elements.back() == new_element))
-                continue;
-            elements.push_back(String{new_element});
-        }
-        return elements;
-    }
-
-    std::optional<UInt128> getChecksumByFileName(const String & file_name) override
-    {
-        auto it = file_names.find(file_name);
-        if (it == file_names.end())
-            return std::nullopt;
-        return it->second;
-    }
-
-    std::optional<FileInfo> getFileInfoByChecksum(const UInt128 & checksum) override
-    {
-        auto it = file_infos.find(checksum);
-        if (it == file_infos.end())
-            return std::nullopt;
-        return it->second;
-    }
-
-    std::optional<FileInfo> getFileInfoByFileName(const String & file_name) override
-    {
-        auto it = file_names.find(file_name);
-        if (it == file_names.end())
-            return std::nullopt;
-        FileInfo info = file_infos.at(it->second);
-        info.file_name = file_name;
-        return info;
-    }
-
-private:
-    std::map<String, UInt128> file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0.
-    std::unordered_map<UInt128, FileInfo> file_infos; /// Information about files. Without empty files.
-};
-
-
-BackupImpl::BackupImpl(const String & backup_name_, const ContextPtr & context_, const std::optional<BackupInfo> & base_backup_info_)
-    : backup_name(backup_name_), context(context_), base_backup_info_param(base_backup_info_), file_infos(std::make_unique<LocalFileInfos>())
+BackupImpl::BackupImpl(
+    const String & backup_name_,
+    const ArchiveParams & archive_params_,
+    const std::optional<BackupInfo> & base_backup_info_,
+    std::shared_ptr<IBackupWriter> writer_,
+    std::shared_ptr<IBackupCoordination> coordination_,
+    bool is_helper_backup_,
+    const ContextPtr & context_)
+    : backup_name(backup_name_)
+    , archive_params(archive_params_)
+    , use_archives(!archive_params.archive_name.empty())
+    , base_backup_info_initial(base_backup_info_)
+    , open_mode(OpenMode::WRITE)
+    , writer(std::move(writer_))
+    , coordination(coordination_ ?
coordination_ : std::make_shared()) + , is_helper_backup(is_helper_backup_) + , context(context_) { + open(); } -BackupImpl::~BackupImpl() = default; -void BackupImpl::open(OpenMode open_mode_) +BackupImpl::BackupImpl( + const String & backup_name_, + const ArchiveParams & archive_params_, + const std::optional & base_backup_info_, + std::shared_ptr reader_, + const ContextPtr & context_) + : backup_name(backup_name_) + , archive_params(archive_params_) + , use_archives(!archive_params.archive_name.empty()) + , base_backup_info_initial(base_backup_info_) + , open_mode(OpenMode::READ) + , reader(std::move(reader_)) + , coordination(std::make_shared()) + , is_helper_backup(false) + , context(context_) +{ + open(); +} + + +BackupImpl::~BackupImpl() +{ + close(); +} + + +void BackupImpl::open() { std::lock_guard lock{mutex}; - if (open_mode == open_mode_) - return; - if (open_mode != OpenMode::NONE) - throw Exception("Backup is already opened", ErrorCodes::LOGICAL_ERROR); + String file_name_to_check_existence; + if (use_archives) + file_name_to_check_existence = archive_params.archive_name; + else + file_name_to_check_existence = ".backup"; + bool backup_exists = (open_mode == OpenMode::WRITE) ? writer->fileExists(file_name_to_check_existence) : reader->fileExists(file_name_to_check_existence); - if (open_mode_ == OpenMode::WRITE) + if (open_mode == OpenMode::WRITE) { - if (backupExists()) + if (backup_exists) throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} already exists", getName()); + } + else + { + if (!backup_exists) + throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", getName()); + } + if (open_mode == OpenMode::WRITE) + { timestamp = std::time(nullptr); uuid = UUIDHelpers::generateV4(); writing_finalized = false; } - if (open_mode_ == OpenMode::READ) - { - if (!backupExists()) - throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", getName()); - } - - openImpl(open_mode_); - - base_backup_info = base_backup_info_param; - if (open_mode_ == OpenMode::READ) + base_backup_info = base_backup_info_initial; + if (open_mode == OpenMode::READ) readBackupMetadata(); if (base_backup_info) @@ -211,37 +194,30 @@ void BackupImpl::open(OpenMode open_mode_) params.context = context; base_backup = BackupFactory::instance().createBackup(params); - if (open_mode_ == OpenMode::WRITE) + if (open_mode == OpenMode::WRITE) base_backup_uuid = base_backup->getUUID(); else if (base_backup_uuid != base_backup->getUUID()) throw Exception(ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: The base backup {} has different UUID ({} != {})", getName(), base_backup->getName(), toString(base_backup->getUUID()), (base_backup_uuid ? 
toString(*base_backup_uuid) : "")); } - - open_mode = open_mode_; } void BackupImpl::close() { std::lock_guard lock{mutex}; - if (open_mode == OpenMode::NONE) - return; - closeImpl({}/*written_files*/, writing_finalized); + if (!is_helper_backup && writing_finalized) + writeBackupMetadata(); - uuid = UUIDHelpers::Nil; - timestamp = 0; - base_backup_info.reset(); - base_backup.reset(); - base_backup_uuid.reset(); - //file_infos.clear(); - open_mode = OpenMode::NONE; -} + archive_readers.clear(); + archive_writer_with_empty_suffix.reset(); + current_archive_writer.reset(); -IBackup::OpenMode BackupImpl::getOpenMode() const -{ - std::lock_guard lock{mutex}; - return open_mode; + if (!is_helper_backup && writer && !writing_finalized) + removeAllFilesAfterFailure(); + + if (!is_helper_backup) + coordination->drop(); } time_t BackupImpl::getTimestamp() const @@ -260,7 +236,7 @@ void BackupImpl::writeBackupMetadata() if (base_backup_info) { bool base_backup_in_use = false; - for (const auto & info : file_infos->getAllFileInfos()) + for (const auto & info : coordination->getAllFileInfos()) { if (info.base_size) base_backup_in_use = true; @@ -274,23 +250,25 @@ void BackupImpl::writeBackupMetadata() } size_t index = 0; - for (const auto & info : file_infos->getAllFileInfos()) + for (const auto & info : coordination->getAllFileInfos()) { String prefix = index ? "contents.file[" + std::to_string(index) + "]." : "contents.file."; config->setString(prefix + "name", info.file_name); config->setUInt(prefix + "size", info.size); if (info.size) { - config->setString(prefix + "checksum", getHexUIntLowercase(info.checksum)); + config->setString(prefix + "checksum", hexChecksum(info.checksum)); if (info.base_size) { config->setBool(prefix + "use_base", true); if (info.base_size != info.size) { config->setUInt(prefix + "base_size", info.base_size); - config->setString(prefix + "base_checksum", getHexUIntLowercase(info.base_checksum)); + config->setString(prefix + "base_checksum", hexChecksum(info.base_checksum)); } } + if (!info.archive_suffix.empty()) + config->setString(prefix + "archive_suffix", info.archive_suffix); } ++index; } @@ -298,14 +276,23 @@ void BackupImpl::writeBackupMetadata() std::ostringstream stream; // STYLE_CHECK_ALLOW_STD_STRING_STREAM config->save(stream); String str = stream.str(); - //written_files.push_back(".backup"); - auto out = writeFileImpl(".backup"); + + std::unique_ptr out; + if (use_archives) + out = getArchiveWriter("")->writeFile(".backup"); + else + out = writer->writeFile(".backup"); out->write(str.data(), str.size()); } void BackupImpl::readBackupMetadata() { - auto in = readFileImpl(".backup"); + std::unique_ptr in; + if (use_archives) + in = getArchiveReader("")->readFile(".backup"); + else + in = reader->readFile(".backup"); + String str; readStringUntilEOF(str, *in); std::istringstream stream(str); // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -325,7 +312,6 @@ void BackupImpl::readBackupMetadata() if (config->has("base_backup_uuid")) base_backup_uuid = parse(config->getString("base_backup_uuid")); - //file_infos.clear(); Poco::Util::AbstractConfiguration::Keys keys; config->keys("contents", keys); for (const auto & key : keys) @@ -351,32 +337,37 @@ void BackupImpl::readBackupMetadata() info.base_checksum = unhexChecksum(config->getString(prefix + "base_checksum")); } - file_infos->add(std::move(info)); + info.archive_suffix = config->getString(prefix + "archive_suffix", ""); + + coordination->addFileInfo(info); } } } Strings BackupImpl::listFiles(const String & prefix, 
const String & terminator) const { + std::lock_guard lock{mutex}; if (!prefix.ends_with('/') && !prefix.empty()) throw Exception("prefix should end with '/'", ErrorCodes::BAD_ARGUMENTS); - return file_infos->listFiles(prefix, terminator); + return coordination->listFiles(prefix, terminator); } bool BackupImpl::fileExists(const String & file_name) const { std::lock_guard lock{mutex}; - return file_infos->getChecksumByFileName(file_name).has_value(); + return coordination->getChecksumByFileName(file_name).has_value(); } bool BackupImpl::fileExistsByChecksum(const UInt128 & checksum) const { - return file_infos->getFileInfoByChecksum(checksum).has_value(); + std::lock_guard lock{mutex}; + return coordination->getFileInfoByChecksum(checksum).has_value(); } size_t BackupImpl::getFileSize(const String & file_name) const { - auto info = file_infos->getFileInfoByFileName(file_name); + std::lock_guard lock{mutex}; + auto info = coordination->getFileInfoByFileName(file_name); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); @@ -385,16 +376,18 @@ size_t BackupImpl::getFileSize(const String & file_name) const size_t BackupImpl::getFileSizeByChecksum(const UInt128 & checksum) const { - auto info = file_infos->getFileInfoByChecksum(checksum); + std::lock_guard lock{mutex}; + auto info = coordination->getFileInfoByChecksum(checksum); if (!info) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), getHexUIntLowercase(checksum)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), hexChecksum(checksum)); return info->size; } UInt128 BackupImpl::getFileChecksum(const String & file_name) const { - auto info = file_infos->getFileInfoByFileName(file_name); + std::lock_guard lock{mutex}; + auto info = coordination->getFileInfoByFileName(file_name); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); @@ -412,10 +405,10 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const if (open_mode != OpenMode::READ) throw Exception("Backup is not opened for reading", ErrorCodes::LOGICAL_ERROR); - auto info_opt = file_infos->getFileInfoByChecksum(checksum); + auto info_opt = coordination->getFileInfoByChecksum(checksum); if (!info_opt) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), getHexUIntLowercase(checksum)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), hexChecksum(checksum)); const auto & info = *info_opt; if (!info.size) @@ -427,9 +420,9 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const if (!info.base_size) { /// Data goes completely from this backup, the base backup isn't used. - String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? getHexUIntLowercase(checksum) : info.file_name; + String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? 
hexChecksum(checksum) : info.file_name; return std::make_unique( - std::static_pointer_cast(shared_from_this()), data_file_name, info.size, info.checksum); + std::static_pointer_cast(shared_from_this()), info.archive_suffix, data_file_name, info.size, info.checksum); } if (info.size < info.base_size) @@ -437,7 +430,8 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const throw Exception( ErrorCodes::BACKUP_DAMAGED, "Backup {}: Entry {} has its data size less than in the base backup {}: {} < {}", - getName(), getHexUIntLowercase(checksum), base_backup->getName(), info.size, info.base_size); + getName(), + hexChecksum(checksum), base_backup->getName(), info.size, info.base_size); } if (!base_backup) @@ -445,7 +439,7 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const throw Exception( ErrorCodes::NO_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but there is no base backup specified", - getName(), getHexUIntLowercase(checksum)); + getName(), hexChecksum(checksum)); } if (!base_backup->fileExistsByChecksum(info.base_checksum)) @@ -453,7 +447,7 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const throw Exception( ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but doesn't exist there", - getName(), getHexUIntLowercase(checksum)); + getName(), hexChecksum(checksum)); } auto base_entry = base_backup->readFileByChecksum(info.base_checksum); @@ -463,7 +457,7 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const throw Exception( ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: Entry {} has unexpected size in the base backup {}: {} (expected size: {})", - getName(), getHexUIntLowercase(checksum), base_backup->getName(), base_size, info.base_size); + getName(), hexChecksum(checksum), base_backup->getName(), base_size, info.base_size); } if (info.size == info.base_size) @@ -475,9 +469,9 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const { /// The beginning of the data goes from the base backup, /// and the ending goes from this backup. - String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? getHexUIntLowercase(checksum) : info.file_name; + String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? hexChecksum(checksum) : info.file_name; return std::make_unique( - static_pointer_cast(shared_from_this()), data_file_name, info.size, info.checksum, std::move(base_entry)); + static_pointer_cast(shared_from_this()), info.archive_suffix, data_file_name, info.size, info.checksum, std::move(base_entry)); } } @@ -488,7 +482,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); - if (file_infos->getChecksumByFileName(file_name)) + if (coordination->getChecksumByFileName(file_name)) throw Exception( ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Backup {}: Entry {} already exists", getName(), quoteString(file_name)); @@ -500,16 +494,16 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) /// Check if the entry's data is empty. if (!info.size) { - file_infos->add(std::move(info)); + coordination->addFileInfo(info); return; } /// Maybe we have a copy of this file in the backup already. 
std::optional checksum = entry->getChecksum(); - if (checksum && file_infos->getFileInfoByChecksum(*checksum)) + if (checksum && coordination->getFileInfoByChecksum(*checksum)) { info.checksum = *checksum; - file_infos->add(std::move(info)); + coordination->addFileInfo(info); return; } @@ -567,9 +561,9 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) info.checksum = *checksum; /// Maybe we have a copy of this file in the backup already. - if (file_infos->getFileInfoByChecksum(*checksum)) + if (coordination->getFileInfoByChecksum(*checksum)) { - file_infos->add(std::move(info)); + coordination->addFileInfo(info); return; } @@ -592,12 +586,12 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (use_base && (size == base_size)) { /// The entry's data has not been changed since the base backup. - file_infos->add(std::move(info)); + coordination->addFileInfo(info); return; } bool is_new_checksum; - file_infos->add(std::move(info), is_new_checksum); + coordination->addFileInfo(info, is_new_checksum); if (!is_new_checksum) return; /// We copy data only if it's a new checksum. @@ -614,7 +608,32 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) read_buffer->seek(copy_pos, SEEK_SET); /// Copy the entry's data after `copy_pos`. - auto out = writeFileImpl(getHexUIntLowercase(*checksum)); + String data_file_name = hexChecksum(*checksum); + + std::unique_ptr out; + if (use_archives) + { + String archive_suffix = current_archive_suffix; + bool next_suffix = false; + if (info.archive_suffix.empty() && is_helper_backup) + next_suffix = true; + /*if (archive_params.max_volume_size && current_archive_writer + && (current_archive_writer->getTotalSize() + size - base_size > archive_params.max_volume_size)) + next_suffix = true;*/ + if (next_suffix) + archive_suffix = coordination->getNextArchiveSuffix(); + if (info.archive_suffix != archive_suffix) + { + info.archive_suffix = archive_suffix; + coordination->updateFileInfo(info); + } + out = getArchiveWriter(info.archive_suffix)->writeFile(data_file_name); + } + else + { + out = writer->writeFile(data_file_name); + } + copyData(*read_buffer, *out); } @@ -622,14 +641,75 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) void BackupImpl::finalizeWriting() { std::lock_guard lock{mutex}; - if (writing_finalized) - return; - if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); - writeBackupMetadata(); writing_finalized = true; } + +String BackupImpl::getArchiveNameWithSuffix(const String & suffix) const +{ + return archive_params.archive_name + (suffix.empty() ? 
"" : ".") + suffix; +} + + +std::shared_ptr BackupImpl::getArchiveReader(const String & suffix) const +{ + auto it = archive_readers.find(suffix); + if (it != archive_readers.end()) + return it->second; + String archive_name_with_suffix = getArchiveNameWithSuffix(suffix); + size_t archive_size = reader->getFileSize(archive_name_with_suffix); + auto new_archive_reader = createArchiveReader(archive_params.archive_name, [reader=reader, archive_name_with_suffix]{ return reader->readFile(archive_name_with_suffix); }, + archive_size); + new_archive_reader->setPassword(archive_params.password); + archive_readers.emplace(suffix, new_archive_reader); + return new_archive_reader; +} + +std::shared_ptr BackupImpl::getArchiveWriter(const String & suffix) +{ + if (suffix.empty() && archive_writer_with_empty_suffix) + return archive_writer_with_empty_suffix; + if ((current_archive_suffix == suffix) && current_archive_writer) + return current_archive_writer; + + String archive_name_with_suffix = getArchiveNameWithSuffix(suffix); + auto new_archive_writer = createArchiveWriter(archive_params.archive_name, writer->writeFile(archive_name_with_suffix)); + new_archive_writer->setPassword(archive_params.password); + + current_archive_writer = new_archive_writer; + current_archive_suffix = suffix; + if (suffix.empty()) + archive_writer_with_empty_suffix = new_archive_writer; + return new_archive_writer; +} + + +void BackupImpl::removeAllFilesAfterFailure() +{ + Strings files_to_remove; + if (use_archives) + { + files_to_remove.push_back(archive_params.archive_name); + for (const auto & suffix : coordination->getAllArchiveSuffixes()) + { + String archive_name_with_suffix = getArchiveNameWithSuffix(suffix); + files_to_remove.push_back(std::move(archive_name_with_suffix)); + } + } + else + { + files_to_remove.push_back(".backup"); + for (const auto & file_info : coordination->getAllFileInfos()) + { + String data_file_name = hexChecksum(file_info.checksum); + files_to_remove.push_back(std::move(data_file_name)); + } + } + + writer->removeFilesAfterFailure(files_to_remove); +} + } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 38fd7e7e7d3..9544a25af46 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -2,34 +2,57 @@ #include #include -#include #include #include namespace DB { +class IBackupCoordination; +class IBackupReader; +class IBackupWriter; +class SeekableReadBuffer; +class IArchiveReader; +class IArchiveWriter; class Context; using ContextPtr = std::shared_ptr; -class SeekableReadBuffer; -/// Base implementation of IBackup. +/// Implementation of IBackup. /// Along with passed files it also stores backup metadata - a single file named ".backup" in XML format /// which contains a list of all files in the backup with their sizes and checksums and information /// whether the base backup should be used for each entry. 
class BackupImpl : public IBackup { public: + struct ArchiveParams + { + String archive_name; + String password; + String compression_method; + int compression_level = 0; + size_t max_volume_size = 0; + }; + BackupImpl( const String & backup_name_, - const ContextPtr & context_, - const std::optional & base_backup_info_ = {}); + const ArchiveParams & archive_params_, + const std::optional & base_backup_info_, + std::shared_ptr writer_, + std::shared_ptr coordination_, + bool is_helper_backup_, + const ContextPtr & context_); + + BackupImpl( + const String & backup_name_, + const ArchiveParams & archive_params_, + const std::optional & base_backup_info_, + std::shared_ptr reader_, + const ContextPtr & context_); + ~BackupImpl() override; const String & getName() const override { return backup_name; } - void open(OpenMode open_mode_) override; - OpenMode getOpenMode() const override; - void close() override; + OpenMode getOpenMode() const override { return open_mode; } time_t getTimestamp() const override; UUID getUUID() const override { return uuid; } Strings listFiles(const String & prefix, const String & terminator) const override; @@ -43,68 +66,40 @@ public: void writeFile(const String & file_name, BackupEntryPtr entry) override; void finalizeWriting() override; -protected: - /// Checks if this backup exists. - virtual bool backupExists() const = 0; - - virtual void openImpl(OpenMode open_mode_) = 0; - OpenMode getOpenModeNoLock() const { return open_mode; } - - virtual void closeImpl(const Strings & written_files_, bool writing_finalized_) = 0; - - /// Read a file from the backup. - /// Low level: the function doesn't check base backup or checksums. - virtual std::unique_ptr readFileImpl(const String & file_name) const = 0; - - /// Add a file to the backup. - /// Low level: the function doesn't check base backup or checksums. 
- virtual std::unique_ptr writeFileImpl(const String & file_name) = 0; - - mutable std::mutex mutex; - private: + void open(); + void close(); void writeBackupMetadata(); void readBackupMetadata(); - - struct FileInfo - { - String file_name; - - UInt64 size = 0; - UInt128 checksum{0, 0}; - - /// for incremental backups - UInt64 base_size = 0; - UInt128 base_checksum{0, 0}; - }; - - class IFileInfos - { - public: - virtual ~IFileInfos() {} - virtual void add(FileInfo && file_info, bool & is_new_checksum) = 0; - void add(FileInfo && file_info) { bool dummy; add(std::move(file_info), dummy); } - virtual std::vector getAllFileInfos() = 0; - virtual Strings listFiles(const String & prefix, const String & terminator) = 0; - virtual std::optional getChecksumByFileName(const String & file_name) = 0; - virtual std::optional getFileInfoByChecksum(const UInt128 & checksum) = 0; - virtual std::optional getFileInfoByFileName(const String & file_name) = 0; - }; + String getArchiveNameWithSuffix(const String & suffix) const; + std::shared_ptr getArchiveReader(const String & suffix) const; + std::shared_ptr getArchiveWriter(const String & suffix); + void removeAllFilesAfterFailure(); class BackupEntryFromBackupImpl; - class LocalFileInfos; const String backup_name; + const ArchiveParams archive_params; + const bool use_archives; + const std::optional base_backup_info_initial; + const OpenMode open_mode; + std::shared_ptr writer; + std::shared_ptr reader; + std::shared_ptr coordination; + const bool is_helper_backup; ContextPtr context; - const std::optional base_backup_info_param; - OpenMode open_mode = OpenMode::NONE; + + mutable std::mutex mutex; UUID uuid = {}; time_t timestamp = 0; UInt64 version = 1; std::optional base_backup_info; std::shared_ptr base_backup; std::optional base_backup_uuid; - std::unique_ptr file_infos; + mutable std::unordered_map> archive_readers; + std::shared_ptr archive_writer_with_empty_suffix; + std::shared_ptr current_archive_writer; + String current_archive_suffix; bool writing_finalized = false; }; diff --git a/src/Backups/DirectoryBackup.cpp b/src/Backups/DirectoryBackup.cpp deleted file mode 100644 index 6a60cbdd1ef..00000000000 --- a/src/Backups/DirectoryBackup.cpp +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include - - -namespace DB -{ - -DirectoryBackup::DirectoryBackup( - const String & backup_name_, - const DiskPtr & disk_, - const String & path_, - const ContextPtr & context_, - const std::optional & base_backup_info_) - : BackupImpl(backup_name_, context_, base_backup_info_) - , disk(disk_) -{ - /// Remove terminating slash. - path = (std::filesystem::path(path_) / "").parent_path(); - - /// If `disk` is not specified, we create an internal instance of `DiskLocal` here. - if (!disk) - { - disk = std::make_shared(path, path, 0); - path = "."; - } -} - - -DirectoryBackup::~DirectoryBackup() -{ - close(); -} - -bool DirectoryBackup::backupExists() const -{ - return disk->isDirectory(path); -} - -void DirectoryBackup::openImpl(OpenMode open_mode_) -{ - if (open_mode_ == OpenMode::WRITE) - disk->createDirectories(path); -} - -void DirectoryBackup::closeImpl(const Strings & written_files_, bool writing_finalized_) -{ - if ((getOpenModeNoLock() == OpenMode::WRITE) && !writing_finalized_ && !written_files_.empty()) - { - /// Creating of the backup wasn't finished correctly, - /// so the backup cannot be used and it's better to remove its files. 
- const auto & files_to_delete = written_files_; - for (const String & file_name : files_to_delete) - disk->removeFileIfExists(path / file_name); - if (disk->isDirectory(path) && disk->isDirectoryEmpty(path)) - disk->removeDirectory(path); - } -} - -std::unique_ptr DirectoryBackup::readFileImpl(const String & file_name) const -{ - auto file_path = path / file_name; - return disk->readFile(file_path); -} - -std::unique_ptr DirectoryBackup::writeFileImpl(const String & file_name) -{ - auto file_path = path / file_name; - disk->createDirectories(fs::path(file_path).parent_path()); - return disk->writeFile(file_path); -} - -} diff --git a/src/Backups/DirectoryBackup.h b/src/Backups/DirectoryBackup.h deleted file mode 100644 index d9dbc81fa78..00000000000 --- a/src/Backups/DirectoryBackup.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ -class IDisk; -using DiskPtr = std::shared_ptr; - -/// Represents a backup stored on a disk. -/// A backup is stored as a directory, each entry is stored as a file in that directory. -class DirectoryBackup : public BackupImpl -{ -public: - /// `disk`_ is allowed to be nullptr and that means the `path_` is a path in the local filesystem. - DirectoryBackup( - const String & backup_name_, - const DiskPtr & disk_, - const String & path_, - const ContextPtr & context_, - const std::optional & base_backup_info_ = {}); - ~DirectoryBackup() override; - -private: - bool backupExists() const override; - void openImpl(OpenMode open_mode_) override; - void closeImpl(const Strings & written_files_, bool writing_finalized_) override; - std::unique_ptr readFileImpl(const String & file_name) const override; - std::unique_ptr writeFileImpl(const String & file_name) override; - - DiskPtr disk; - std::filesystem::path path; -}; - -} diff --git a/src/Backups/DistributedBackupCoordination.cpp b/src/Backups/DistributedBackupCoordination.cpp new file mode 100644 index 00000000000..d49ea025f56 --- /dev/null +++ b/src/Backups/DistributedBackupCoordination.cpp @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNEXPECTED_NODE_IN_ZOOKEEPER; +} + +/// zookeeper_path/file_names/file_name->checksum +/// zookeeper_path/file_infos/checksum->info +/// zookeeper_path/archive_suffixes +/// zookeeper_path/current_archive_suffix + +namespace +{ + using FileInfo = IBackupCoordination::FileInfo; + + String serializeFileInfo(const FileInfo & info) + { + WriteBufferFromOwnString out; + writeBinary(info.file_name, out); + writeBinary(info.size, out); + writeBinary(info.checksum, out); + writeBinary(info.base_size, out); + writeBinary(info.base_checksum, out); + writeBinary(info.archive_suffix, out); + writeBinary(info.pos_in_archive, out); + return out.str(); + } + + FileInfo deserializeFileInfo(const String & str) + { + FileInfo info; + ReadBufferFromString in{str}; + readBinary(info.file_name, in); + readBinary(info.size, in); + readBinary(info.checksum, in); + readBinary(info.base_size, in); + readBinary(info.base_checksum, in); + readBinary(info.archive_suffix, in); + readBinary(info.pos_in_archive, in); + return info; + } + + String hexChecksum(UInt128 checksum) + { + return getHexUIntLowercase(checksum); + } + + UInt128 unhexChecksum(const String & checksum) + { + if (checksum.size() != sizeof(UInt128) * 2) + throw Exception( + ErrorCodes::UNEXPECTED_NODE_IN_ZOOKEEPER, + "Unexpected size of checksum: {}, must be {}", + checksum.size(), + 
sizeof(UInt128) * 2); + return unhexUInt(checksum.data()); + } + + constexpr size_t NUM_ATTEMPTS = 10; +} + +DistributedBackupCoordination::DistributedBackupCoordination(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) + : zookeeper_path(zookeeper_path_), get_zookeeper(get_zookeeper_) +{ + createRootNodes(); +} + +DistributedBackupCoordination::~DistributedBackupCoordination() = default; + +void DistributedBackupCoordination::createRootNodes() +{ + auto zookeeper = get_zookeeper(); + zookeeper->createAncestors(zookeeper_path); + zookeeper->createIfNotExists(zookeeper_path, ""); + zookeeper->createIfNotExists(zookeeper_path + "/file_names", ""); + zookeeper->createIfNotExists(zookeeper_path + "/file_infos", ""); + zookeeper->createIfNotExists(zookeeper_path + "/archive_suffixes", ""); + zookeeper->createIfNotExists(zookeeper_path + "/current_archive_suffix", "0"); +} + +void DistributedBackupCoordination::removeAllNodes() +{ + auto zookeeper = get_zookeeper(); + zookeeper->removeRecursive(zookeeper_path); +} + +void DistributedBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_new_checksum) +{ + auto zookeeper = get_zookeeper(); + + String full_path = zookeeper_path + "/file_names/" + escapeForFileName(file_info.file_name); + String checksum_str = hexChecksum(file_info.checksum); + zookeeper->create(full_path, checksum_str, zkutil::CreateMode::Persistent); + + full_path = zookeeper_path + "/file_infos/" + checksum_str; + auto code = zookeeper->tryCreate(full_path, serializeFileInfo(file_info), zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException(code, full_path); + + is_new_checksum = (code == Coordination::Error::ZOK); +} + +void DistributedBackupCoordination::updateFileInfo(const FileInfo & file_info) +{ + auto zookeeper = get_zookeeper(); + String checksum_str = hexChecksum(file_info.checksum); + String full_path = zookeeper_path + "/file_infos/" + checksum_str; + for (size_t attempt = 0; attempt < NUM_ATTEMPTS; ++attempt) + { + Coordination::Stat stat; + auto new_info = deserializeFileInfo(zookeeper->get(full_path, &stat)); + new_info.archive_suffix = file_info.archive_suffix; + auto code = zookeeper->trySet(full_path, serializeFileInfo(new_info), stat.version); + if (code == Coordination::Error::ZOK) + return; + bool is_last_attempt = (attempt == NUM_ATTEMPTS - 1); + if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt) + throw zkutil::KeeperException(code, full_path); + } +} + +std::vector DistributedBackupCoordination::getAllFileInfos() +{ + auto zookeeper = get_zookeeper(); + std::vector file_infos; + Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + for (const String & escaped_name : escaped_names) + { + String checksum = zookeeper->get(zookeeper_path + "/file_names/" + escaped_name); + FileInfo file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + checksum)); + file_info.file_name = unescapeForFileName(escaped_name); + file_infos.emplace_back(std::move(file_info)); + } + return file_infos; +} + +Strings DistributedBackupCoordination::listFiles(const String & prefix, const String & terminator) +{ + auto zookeeper = get_zookeeper(); + Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + + Strings elements; + for (const String & escaped_name : escaped_names) + { + String name = unescapeForFileName(escaped_name); + if (!name.starts_with(prefix)) + 
continue; + size_t start_pos = prefix.length(); + size_t end_pos = String::npos; + if (!terminator.empty()) + end_pos = name.find(terminator, start_pos); + std::string_view new_element = std::string_view{name}.substr(start_pos, end_pos - start_pos); + if (!elements.empty() && (elements.back() == new_element)) + continue; + elements.push_back(String{new_element}); + } + + std::sort(elements.begin(), elements.end()); + return elements; +} + +std::optional DistributedBackupCoordination::getChecksumByFileName(const String & file_name) +{ + auto zookeeper = get_zookeeper(); + String checksum; + if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), checksum)) + return std::nullopt; + return unhexChecksum(checksum); +} + +std::optional DistributedBackupCoordination::getFileInfoByChecksum(const UInt128 & checksum) +{ + auto zookeeper = get_zookeeper(); + String file_info_str; + if (!zookeeper->tryGet(zookeeper_path + "/file_infos/" + hexChecksum(checksum), file_info_str)) + return std::nullopt; + return deserializeFileInfo(file_info_str); +} + +std::optional DistributedBackupCoordination::getFileInfoByFileName(const String & file_name) +{ + auto zookeeper = get_zookeeper(); + String checksum; + if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), checksum)) + return std::nullopt; + FileInfo file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + checksum)); + file_info.file_name = file_name; + return file_info; +} + +String DistributedBackupCoordination::getNextArchiveSuffix() +{ + auto zookeeper = get_zookeeper(); + for (size_t attempt = 0; attempt != NUM_ATTEMPTS; ++attempt) + { + Coordination::Stat stat; + String current_suffix_str = zookeeper->get(zookeeper_path + "/current_archive_suffix", &stat); + UInt64 current_suffix = parseFromString(current_suffix_str); + current_suffix_str = fmt::format("{:03}", ++current_suffix); /// Outputs 001, 002, 003, ... + Coordination::Requests ops; + ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/current_archive_suffix", current_suffix_str, stat.version)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/archive_suffixes/" + current_suffix_str, "", zkutil::CreateMode::Persistent)); + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (code == Coordination::Error::ZOK) + return current_suffix_str; + bool is_last_attempt = (attempt == NUM_ATTEMPTS - 1); + if ((responses[0]->error != Coordination::Error::ZBADVERSION) || is_last_attempt) + throw zkutil::KeeperMultiException(code, ops, responses); + } + __builtin_unreachable(); +} + +Strings DistributedBackupCoordination::getAllArchiveSuffixes() +{ + auto zookeeper = get_zookeeper(); + return zookeeper->getChildren(zookeeper_path + "/archive_suffixes"); +} + +void DistributedBackupCoordination::drop() +{ + removeAllNodes(); +} + +} diff --git a/src/Backups/DistributedBackupCoordination.h b/src/Backups/DistributedBackupCoordination.h new file mode 100644 index 00000000000..7b1a2cb02ba --- /dev/null +++ b/src/Backups/DistributedBackupCoordination.h @@ -0,0 +1,39 @@ +#include +#include +#include +#include + + +namespace DB +{ + +/// Stores backup contents information in Zookeeper, useful for distributed backups. 
+class DistributedBackupCoordination : public IBackupCoordination
+{
+public:
+    DistributedBackupCoordination(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_);
+    ~DistributedBackupCoordination() override;
+
+    void addFileInfo(const FileInfo & file_info, bool & is_new_checksum) override;
+    void updateFileInfo(const FileInfo & file_info) override;
+
+    std::vector<FileInfo> getAllFileInfos() override;
+    Strings listFiles(const String & prefix, const String & terminator) override;
+    std::optional<UInt128> getChecksumByFileName(const String & file_name) override;
+    std::optional<FileInfo> getFileInfoByChecksum(const UInt128 & checksum) override;
+    std::optional<FileInfo> getFileInfoByFileName(const String & file_name) override;
+
+    String getNextArchiveSuffix() override;
+    Strings getAllArchiveSuffixes() override;
+
+    void drop() override;
+
+private:
+    void createRootNodes();
+    void removeAllNodes();
+
+    String zookeeper_path;
+    zkutil::GetZooKeeper get_zookeeper;
+};
+
+}
diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h
index 727480923cf..58e537809db 100644
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@@ -16,7 +16,6 @@ using BackupEntryPtr = std::unique_ptr<IBackupEntry>;
 class IBackup : public std::enable_shared_from_this<IBackup>
 {
 public:
-    IBackup() = default;
     virtual ~IBackup() = default;
 
     /// Name of the backup.
@@ -24,18 +23,13 @@ public:
 
     enum class OpenMode
     {
-        NONE,
         READ,
         WRITE,
     };
 
-    /// Opens the backup and start its reading or writing depending on `open_mode`.
-    virtual void open(OpenMode open_mode) = 0;
+    /// Returns whether the backup was opened for reading or writing.
     virtual OpenMode getOpenMode() const = 0;
 
-    /// Closes the backup and ends its reading or writing.
-    virtual void close() = 0;
-
     /// Returns the time point when this backup was created.
     virtual time_t getTimestamp() const = 0;
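The listFiles(prefix, terminator) contract that both IBackup and the coordination classes implement behaves like a one-level directory listing: the prefix is stripped, each remaining name is cut at the first terminator, and adjacent duplicates are dropped. A behavioral sketch (not code from this patch), assuming three stored names:

    /// Stored names: "data/t1/part-0.bin", "data/t1/part-1.bin", "data/t2/part-0.bin"
    Strings subdirs = backup->listFiles("data/", "/");   /// {"t1", "t2"}
    Strings parts = backup->listFiles("data/t1/", "");   /// {"part-0.bin", "part-1.bin"}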
diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h
new file mode 100644
index 00000000000..2f869ef1470
--- /dev/null
+++ b/src/Backups/IBackupCoordination.h
@@ -0,0 +1,63 @@
+#pragma once
+
+#include <Core/Types.h>
+
+
+namespace DB
+{
+
+/// Keeps information about files contained in a backup.
+class IBackupCoordination
+{
+public:
+    struct FileInfo
+    {
+        String file_name;
+
+        UInt64 size = 0;
+        UInt128 checksum{0};
+
+        /// for incremental backups
+        UInt64 base_size = 0;
+        UInt128 base_checksum{0};
+
+        /// Suffix of an archive if the backup is stored as a series of archives.
+        String archive_suffix;
+
+        /// Position in the archive.
+        UInt64 pos_in_archive = static_cast<UInt64>(-1);
+    };
+
+    virtual ~IBackupCoordination() { }
+
+    /// Adds file information.
+    /// If a specified checksum is new for this IBackupCoordination the function sets `is_new_checksum`.
+    virtual void addFileInfo(const FileInfo & file_info, bool & is_new_checksum) = 0;
+
+    void addFileInfo(const FileInfo & file_info)
+    {
+        bool is_new_checksum;
+        addFileInfo(file_info, is_new_checksum);
+    }
+
+    /// Updates some fields (currently only `archive_suffix`) of a stored file's information.
+    virtual void updateFileInfo(const FileInfo & file_info) = 0;
+
+    virtual std::vector<FileInfo> getAllFileInfos() = 0;
+    virtual Strings listFiles(const String & prefix, const String & terminator) = 0;
+
+    virtual std::optional<UInt128> getChecksumByFileName(const String & file_name) = 0;
+    virtual std::optional<FileInfo> getFileInfoByChecksum(const UInt128 & checksum) = 0;
+    virtual std::optional<FileInfo> getFileInfoByFileName(const String & file_name) = 0;
+
+    /// Generates a new archive suffix, e.g. "001", "002", "003", ...
+    virtual String getNextArchiveSuffix() = 0;
+
+    /// Returns the list of all the archive suffixes which were generated.
+    virtual Strings getAllArchiveSuffixes() = 0;
+
+    /// Removes remotely stored information.
+    virtual void drop() {}
+};
+
+}
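The addFileInfo() contract is what drives deduplication in BackupImpl::writeFile(): only the first occurrence of a checksum causes bytes to be copied, later occurrences just record another name for the same data. A small sketch against the in-memory implementation that follows (the checksum value is an arbitrary illustration):

    LocalBackupCoordination coordination;

    IBackupCoordination::FileInfo info;
    info.file_name = "a.bin";
    info.size = 5;
    info.checksum = 42;   /// arbitrary non-zero UInt128 checksum

    bool is_new = false;
    coordination.addFileInfo(info, is_new);   /// is_new == true: the data must be copied

    info.file_name = "b.bin";                 /// same size and checksum
    coordination.addFileInfo(info, is_new);   /// is_new == false: the data is already stored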
diff --git a/src/Backups/LocalBackupCoordination.cpp b/src/Backups/LocalBackupCoordination.cpp
new file mode 100644
index 00000000000..495f3ac483b
--- /dev/null
+++ b/src/Backups/LocalBackupCoordination.cpp
@@ -0,0 +1,104 @@
+#include <Backups/LocalBackupCoordination.h>
+#include <fmt/format.h>
+
+
+namespace DB
+{
+using FileInfo = IBackupCoordination::FileInfo;
+
+LocalBackupCoordination::LocalBackupCoordination() = default;
+LocalBackupCoordination::~LocalBackupCoordination() = default;
+
+void LocalBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_new_checksum)
+{
+    std::lock_guard lock{mutex};
+    file_names.emplace(file_info.file_name, file_info.checksum);
+    is_new_checksum = (file_info.checksum && !file_infos.contains(file_info.checksum));
+    if (is_new_checksum)
+        file_infos.emplace(file_info.checksum, std::move(file_info));
+}
+
+void LocalBackupCoordination::updateFileInfo(const FileInfo & file_info)
+{
+    std::lock_guard lock{mutex};
+    auto & dest = file_infos.at(file_info.checksum);
+    dest.archive_suffix = file_info.archive_suffix;
+}
+
+std::vector<FileInfo> LocalBackupCoordination::getAllFileInfos()
+{
+    std::lock_guard lock{mutex};
+    std::vector<FileInfo> res;
+    for (const auto & [file_name, checksum] : file_names)
+    {
+        FileInfo info = file_infos.at(checksum);
+        info.file_name = file_name;
+        res.push_back(std::move(info));
+    }
+    return res;
+}
+
+Strings LocalBackupCoordination::listFiles(const String & prefix, const String & terminator)
+{
+    std::lock_guard lock{mutex};
+    Strings elements;
+    for (auto it = file_names.lower_bound(prefix); it != file_names.end(); ++it)
+    {
+        const String & name = it->first;
+        if (!name.starts_with(prefix))
+            break;
+        size_t start_pos = prefix.length();
+        size_t end_pos = String::npos;
+        if (!terminator.empty())
+            end_pos = name.find(terminator, start_pos);
+        std::string_view new_element = std::string_view{name}.substr(start_pos, end_pos - start_pos);
+        if (!elements.empty() && (elements.back() == new_element))
+            continue;
+        elements.push_back(String{new_element});
+    }
+    return elements;
+}
+
+std::optional<UInt128> LocalBackupCoordination::getChecksumByFileName(const String & file_name)
+{
+    std::lock_guard lock{mutex};
+    auto it = file_names.find(file_name);
+    if (it == file_names.end())
+        return std::nullopt;
+    return it->second;
+}
+
+std::optional<FileInfo> LocalBackupCoordination::getFileInfoByChecksum(const UInt128 & checksum)
+{
+    std::lock_guard lock{mutex};
+    auto it = file_infos.find(checksum);
+    if (it == file_infos.end())
+        return std::nullopt;
+    return it->second;
+}
+
+std::optional<FileInfo> LocalBackupCoordination::getFileInfoByFileName(const String & file_name)
+{
+    std::lock_guard lock{mutex};
+    auto it = file_names.find(file_name);
+    if (it == file_names.end())
+        return std::nullopt;
+    FileInfo info = file_infos.at(it->second);
+    info.file_name = file_name;
+    return info;
+}
+
+String LocalBackupCoordination::getNextArchiveSuffix()
+{
+    String new_archive_suffix = fmt::format("{:03}", ++current_archive_suffix); /// Outputs 001, 002, 003, ...
+    archive_suffixes.push_back(new_archive_suffix);
+    return new_archive_suffix;
+}
+
+Strings LocalBackupCoordination::getAllArchiveSuffixes()
+{
+    std::lock_guard lock{mutex};
+    return archive_suffixes;
+}
+
+}
diff --git a/src/Backups/LocalBackupCoordination.h b/src/Backups/LocalBackupCoordination.h
new file mode 100644
index 00000000000..3f299a688fc
--- /dev/null
+++ b/src/Backups/LocalBackupCoordination.h
@@ -0,0 +1,36 @@
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+/// Stores backup contents information in memory.
+class LocalBackupCoordination : public IBackupCoordination
+{
+public:
+    LocalBackupCoordination();
+    ~LocalBackupCoordination() override;
+
+    void addFileInfo(const FileInfo & file_info, bool & is_new_checksum) override;
+    void updateFileInfo(const FileInfo & file_info) override;
+
+    std::vector<FileInfo> getAllFileInfos() override;
+    Strings listFiles(const String & prefix, const String & terminator) override;
+    std::optional<UInt128> getChecksumByFileName(const String & file_name) override;
+    std::optional<FileInfo> getFileInfoByChecksum(const UInt128 & checksum) override;
+    std::optional<FileInfo> getFileInfoByFileName(const String & file_name) override;
+
+    String getNextArchiveSuffix() override;
+    Strings getAllArchiveSuffixes() override;
+
+private:
+    std::mutex mutex;
+    std::map<String, UInt128> file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0.
+    std::unordered_map<UInt128, FileInfo> file_infos; /// Information about files. Without empty files.
+    Strings archive_suffixes;
+    size_t current_archive_suffix = 0;
+};
+
+}
diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp
index c4e51797f9e..679209e643e 100644
--- a/src/Backups/registerBackupEnginesFileAndDisk.cpp
+++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp
@@ -1,6 +1,7 @@
 #include
-#include
-#include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -145,18 +146,38 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory)
         else
             throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected backup engine '{}'", engine_name);
 
+        BackupImpl::ArchiveParams archive_params;
         if (hasRegisteredArchiveFileExtension(path))
         {
-            auto archive_backup = std::make_unique<ArchiveBackup>(backup_name, disk, path, params.context, params.base_backup_info);
-            archive_backup->setCompression(params.compression_method, params.compression_level);
-            archive_backup->setPassword(params.password);
-            return archive_backup;
+            archive_params.archive_name = path.filename();
+            path = path.parent_path();
+            archive_params.compression_method = params.compression_method;
+            archive_params.compression_level = params.compression_level;
+            archive_params.password = params.password;
         }
         else
         {
             if (!params.password.empty())
                 throw Exception(ErrorCodes::BAD_ARGUMENTS, "Password is not applicable, backup cannot be encrypted");
-            return std::make_unique<DirectoryBackup>(backup_name, disk, path, params.context, params.base_backup_info);
+        }
+
+        if (params.open_mode == IBackup::OpenMode::READ)
+        {
+            std::shared_ptr<IBackupReader> reader;
+            if (engine_name == "File")
+                reader = std::make_shared<BackupReaderFile>(path);
+            else
+                reader = std::make_shared<BackupReaderDisk>(disk, path);
+            return std::make_unique<BackupImpl>(backup_name, archive_params, params.base_backup_info, reader, params.context);
+        }
+        else
+        {
+            std::shared_ptr<IBackupWriter> writer;
+            if (engine_name == "File")
+                writer = std::make_shared<BackupWriterFile>(path);
+            else
+                writer = std::make_shared<BackupWriterDisk>(disk, path);
+            return std::make_unique<BackupImpl>(backup_name, archive_params, params.base_backup_info, writer, nullptr, false, params.context);
+        }
     };

From
ec5b89a6accf50004362fd3c645d0b15fda1c569 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 19 Apr 2022 11:02:34 +0200 Subject: [PATCH 56/94] Revert backup version 2. Add parameter data_file_path to handle duplicate files more easily without introducing a new backup format. --- src/Backups/BackupImpl.cpp | 168 +++++++++--------- src/Backups/BackupImpl.h | 17 +- src/Backups/DistributedBackupCoordination.cpp | 82 +++++---- src/Backups/DistributedBackupCoordination.h | 8 +- src/Backups/IBackup.h | 14 +- src/Backups/IBackupCoordination.h | 19 +- src/Backups/LocalBackupCoordination.cpp | 48 +++-- src/Backups/LocalBackupCoordination.h | 14 +- .../test_backup_restore_new/test.py | 16 +- 9 files changed, 219 insertions(+), 167 deletions(-) diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index b08012287f3..060c2d9d3f1 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -20,7 +20,6 @@ #include #include #include -#include namespace DB @@ -42,22 +41,28 @@ namespace ErrorCodes namespace { const UInt64 INITIAL_BACKUP_VERSION = 1; - const UInt64 CURRENT_BACKUP_VERSION = 2; - const UInt64 BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES = 2; + const UInt64 CURRENT_BACKUP_VERSION = 1; + + using SizeAndChecksum = IBackup::SizeAndChecksum; + using FileInfo = IBackupCoordination::FileInfo; String hexChecksum(UInt128 checksum) { - return getHexUIntLowercase(checksum); + return getHexUIntLowercase(checksum); } UInt128 unhexChecksum(const String & checksum) { - if (checksum.size() != sizeof(UInt128) * 2) - throw Exception(ErrorCodes::BACKUP_DAMAGED, "Unexpected size of checksum: {}, must be {}", checksum.size(), sizeof(UInt128) * 2); + constexpr size_t num_chars_in_checksum = sizeof(UInt128) * 2; + if (checksum.size() != num_chars_in_checksum) + throw Exception(ErrorCodes::BACKUP_DAMAGED, "Unexpected size of checksum: {}, must be {}", checksum.size(), num_chars_in_checksum); return unhexUInt(checksum.data()); } - using FileInfo = IBackupCoordination::FileInfo; + String formatSizeAndChecksum(const SizeAndChecksum & size_and_checksum) + { + return hexChecksum(size_and_checksum.second) + std::to_string(size_and_checksum.first); + } } @@ -122,6 +127,7 @@ BackupImpl::BackupImpl( , coordination(coordination_ ? coordination_ : std::make_shared()) , is_helper_backup(is_helper_backup_) , context(context_) + , version(CURRENT_BACKUP_VERSION) { open(); } @@ -142,6 +148,7 @@ BackupImpl::BackupImpl( , coordination(std::make_shared()) , is_helper_backup(false) , context(context_) + , version(INITIAL_BACKUP_VERSION) { open(); } @@ -253,10 +260,10 @@ void BackupImpl::writeBackupMetadata() for (const auto & info : coordination->getAllFileInfos()) { String prefix = index ? "contents.file[" + std::to_string(index) + "]." 
: "contents.file."; - config->setString(prefix + "name", info.file_name); config->setUInt(prefix + "size", info.size); if (info.size) { + config->setString(prefix + "name", info.file_name); config->setString(prefix + "checksum", hexChecksum(info.checksum)); if (info.base_size) { @@ -267,8 +274,12 @@ void BackupImpl::writeBackupMetadata() config->setString(prefix + "base_checksum", hexChecksum(info.base_checksum)); } } + if (!info.data_file_name.empty() && (info.data_file_name != info.file_name)) + config->setString(prefix + "data_file", info.data_file_name); if (!info.archive_suffix.empty()) config->setString(prefix + "archive_suffix", info.archive_suffix); + if (info.pos_in_archive != static_cast(-1)) + config->setUInt64(prefix + "pos_in_archive", info.pos_in_archive); } ++index; } @@ -322,22 +333,33 @@ void BackupImpl::readBackupMetadata() FileInfo info; info.file_name = config->getString(prefix + "name"); info.size = config->getUInt(prefix + "size"); - info.checksum = info.size ? unhexChecksum(config->getString(prefix + "checksum")) : UInt128{0}; - - bool use_base = config->getBool(prefix + "use_base", false); - info.base_size = config->getUInt(prefix + "base_size", use_base ? info.size : 0); - if (info.base_size) - use_base = true; - - if (use_base) + if (info.size) { - if (info.base_size == info.size) - info.base_checksum = info.checksum; - else - info.base_checksum = unhexChecksum(config->getString(prefix + "base_checksum")); - } + info.checksum = unhexChecksum(config->getString(prefix + "checksum")); - info.archive_suffix = config->getString(prefix + "archive_suffix", ""); + bool use_base = config->getBool(prefix + "use_base", false); + info.base_size = config->getUInt(prefix + "base_size", use_base ? info.size : 0); + if (info.base_size) + use_base = true; + + if (info.base_size > info.size) + throw Exception(ErrorCodes::BACKUP_DAMAGED, "Backup {}: Base size must not be greater than the size of entry {}", getName(), quoteString(info.file_name)); + + if (use_base) + { + if (info.base_size == info.size) + info.base_checksum = info.checksum; + else + info.base_checksum = unhexChecksum(config->getString(prefix + "base_checksum")); + } + + if (info.size > info.base_size) + { + info.data_file_name = config->getString(prefix + "data_file", info.file_name); + info.archive_suffix = config->getString(prefix + "archive_suffix", ""); + info.pos_in_archive = config->getUInt64(prefix + "pos_in_archive", static_cast(-1)); + } + } coordination->addFileInfo(info); } @@ -355,60 +377,60 @@ Strings BackupImpl::listFiles(const String & prefix, const String & terminator) bool BackupImpl::fileExists(const String & file_name) const { std::lock_guard lock{mutex}; - return coordination->getChecksumByFileName(file_name).has_value(); + return coordination->getFileInfo(file_name).has_value(); } -bool BackupImpl::fileExistsByChecksum(const UInt128 & checksum) const +bool BackupImpl::fileExists(const SizeAndChecksum & size_and_checksum) const { std::lock_guard lock{mutex}; - return coordination->getFileInfoByChecksum(checksum).has_value(); + return coordination->getFileInfo(size_and_checksum).has_value(); } -size_t BackupImpl::getFileSize(const String & file_name) const +UInt64 BackupImpl::getFileSize(const String & file_name) const { std::lock_guard lock{mutex}; - auto info = coordination->getFileInfoByFileName(file_name); + auto info = coordination->getFileInfo(file_name); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), 
quoteString(file_name)); return info->size; } -size_t BackupImpl::getFileSizeByChecksum(const UInt128 & checksum) const -{ - std::lock_guard lock{mutex}; - auto info = coordination->getFileInfoByChecksum(checksum); - if (!info) - throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), hexChecksum(checksum)); - return info->size; -} - UInt128 BackupImpl::getFileChecksum(const String & file_name) const { std::lock_guard lock{mutex}; - auto info = coordination->getFileInfoByFileName(file_name); + auto info = coordination->getFileInfo(file_name); if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); return info->checksum; } -BackupEntryPtr BackupImpl::readFile(const String & file_name) const +SizeAndChecksum BackupImpl::getFileSizeAndChecksum(const String & file_name) const { - return readFileByChecksum(getFileChecksum(file_name)); + std::lock_guard lock{mutex}; + auto info = coordination->getFileInfo(file_name); + if (!info) + throw Exception( + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); + return std::pair(info->size, info->checksum); } -BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const +BackupEntryPtr BackupImpl::readFile(const String & file_name) const +{ + return readFile(getFileSizeAndChecksum(file_name)); +} + +BackupEntryPtr BackupImpl::readFile(const SizeAndChecksum & size_and_checksum) const { std::lock_guard lock{mutex}; if (open_mode != OpenMode::READ) throw Exception("Backup is not opened for reading", ErrorCodes::LOGICAL_ERROR); - auto info_opt = coordination->getFileInfoByChecksum(checksum); + auto info_opt = coordination->getFileInfo(size_and_checksum); if (!info_opt) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), hexChecksum(checksum)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), formatSizeAndChecksum(size_and_checksum)); const auto & info = *info_opt; if (!info.size) @@ -420,18 +442,8 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const if (!info.base_size) { /// Data goes completely from this backup, the base backup isn't used. - String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? 
hexChecksum(checksum) : info.file_name; return std::make_unique( - std::static_pointer_cast(shared_from_this()), info.archive_suffix, data_file_name, info.size, info.checksum); - } - - if (info.size < info.base_size) - { - throw Exception( - ErrorCodes::BACKUP_DAMAGED, - "Backup {}: Entry {} has its data size less than in the base backup {}: {} < {}", - getName(), - hexChecksum(checksum), base_backup->getName(), info.size, info.base_size); + std::static_pointer_cast(shared_from_this()), info.archive_suffix, info.data_file_name, info.size, info.checksum); } if (!base_backup) @@ -439,26 +451,18 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const throw Exception( ErrorCodes::NO_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but there is no base backup specified", - getName(), hexChecksum(checksum)); + getName(), formatSizeAndChecksum(size_and_checksum)); } - if (!base_backup->fileExistsByChecksum(info.base_checksum)) + if (!base_backup->fileExists(std::pair(info.base_size, info.base_checksum))) { throw Exception( ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but doesn't exist there", - getName(), hexChecksum(checksum)); + getName(), formatSizeAndChecksum(size_and_checksum)); } - auto base_entry = base_backup->readFileByChecksum(info.base_checksum); - auto base_size = base_entry->getSize(); - if (base_size != info.base_size) - { - throw Exception( - ErrorCodes::WRONG_BASE_BACKUP, - "Backup {}: Entry {} has unexpected size in the base backup {}: {} (expected size: {})", - getName(), hexChecksum(checksum), base_backup->getName(), base_size, info.base_size); - } + auto base_entry = base_backup->readFile(std::pair{info.base_size, info.base_checksum}); if (info.size == info.base_size) { @@ -469,9 +473,8 @@ BackupEntryPtr BackupImpl::readFileByChecksum(const UInt128 & checksum) const { /// The beginning of the data goes from the base backup, /// and the ending goes from this backup. - String data_file_name = (version >= BACKUP_VERSION_USE_CHECKSUMS_AS_DATA_FILE_NAMES) ? hexChecksum(checksum) : info.file_name; return std::make_unique( - static_pointer_cast(shared_from_this()), info.archive_suffix, data_file_name, info.size, info.checksum, std::move(base_entry)); + static_pointer_cast(shared_from_this()), info.archive_suffix, info.data_file_name, info.size, info.checksum, std::move(base_entry)); } } @@ -482,7 +485,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); - if (coordination->getChecksumByFileName(file_name)) + if (coordination->getFileInfo(file_name)) throw Exception( ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Backup {}: Entry {} already exists", getName(), quoteString(file_name)); @@ -500,7 +503,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) /// Maybe we have a copy of this file in the backup already. std::optional checksum = entry->getChecksum(); - if (checksum && coordination->getFileInfoByChecksum(*checksum)) + if (checksum && coordination->getFileInfo(std::pair{size, *checksum})) { info.checksum = *checksum; coordination->addFileInfo(info); @@ -561,14 +564,14 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) info.checksum = *checksum; /// Maybe we have a copy of this file in the backup already. 
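/// Illustrative sketch only (hypothetical snippet, not lines from this patch): the lookup key
/// below is now the {size, checksum} pair rather than the checksum alone, so two entries with
/// equal checksums but different sizes are no longer conflated. Spelled out:
///     IBackup::SizeAndChecksum key{size, *checksum};   /// std::pair<UInt64, UInt128>
///     bool have_copy_already = coordination->getFileInfo(key).has_value();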
- if (coordination->getFileInfoByChecksum(*checksum)) + if (coordination->getFileInfo(std::pair{size, *checksum})) { coordination->addFileInfo(info); return; } /// Check if a entry with the same checksum exists in the base backup. - if (base_backup && !use_base && base_backup->fileExistsByChecksum(*checksum)) + if (base_backup && !use_base && base_backup->fileExists(std::pair{size, *checksum})) { /// The entry's data has not changed since the base backup, /// but the entry itself has been moved or renamed. @@ -590,10 +593,11 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) return; } - bool is_new_checksum; - coordination->addFileInfo(info, is_new_checksum); - if (!is_new_checksum) - return; /// We copy data only if it's a new checksum. + bool is_data_file_required; + info.data_file_name = info.file_name; + coordination->addFileInfo(info, is_data_file_required); + if (!is_data_file_required) + return; /// We copy data only if it's a new combination of size & checksum. /// Either the entry wasn't exist in the base backup /// or the entry has data appended to the end of the data from the base backup. @@ -608,8 +612,6 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) read_buffer->seek(copy_pos, SEEK_SET); /// Copy the entry's data after `copy_pos`. - String data_file_name = hexChecksum(*checksum); - std::unique_ptr out; if (use_archives) { @@ -627,11 +629,11 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) info.archive_suffix = archive_suffix; coordination->updateFileInfo(info); } - out = getArchiveWriter(info.archive_suffix)->writeFile(data_file_name); + out = getArchiveWriter(info.archive_suffix)->writeFile(info.data_file_name); } else { - out = writer->writeFile(data_file_name); + out = writer->writeFile(info.data_file_name); } copyData(*read_buffer, *out); @@ -686,7 +688,6 @@ std::shared_ptr BackupImpl::getArchiveWriter(const String & suff return new_archive_writer; } - void BackupImpl::removeAllFilesAfterFailure() { Strings files_to_remove; @@ -703,10 +704,7 @@ void BackupImpl::removeAllFilesAfterFailure() { files_to_remove.push_back(".backup"); for (const auto & file_info : coordination->getAllFileInfos()) - { - String data_file_name = hexChecksum(file_info.checksum); - files_to_remove.push_back(std::move(data_file_name)); - } + files_to_remove.push_back(file_info.data_file_name); } writer->removeFilesAfterFailure(files_to_remove); diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 9544a25af46..be8a352e95e 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -57,16 +58,20 @@ public: UUID getUUID() const override { return uuid; } Strings listFiles(const String & prefix, const String & terminator) const override; bool fileExists(const String & file_name) const override; - bool fileExistsByChecksum(const UInt128 & checksum) const override; - size_t getFileSize(const String & file_name) const override; - size_t getFileSizeByChecksum(const UInt128 & checksum) const override; + bool fileExists(const SizeAndChecksum & size_and_checksum) const override; + UInt64 getFileSize(const String & file_name) const override; UInt128 getFileChecksum(const String & file_name) const override; + SizeAndChecksum getFileSizeAndChecksum(const String & file_name) const override; BackupEntryPtr readFile(const String & file_name) const override; - BackupEntryPtr readFileByChecksum(const UInt128 & checksum) const 
override; + BackupEntryPtr readFile(const SizeAndChecksum & size_and_checksum) const override; void writeFile(const String & file_name, BackupEntryPtr entry) override; void finalizeWriting() override; + bool supportsWritingInMultipleThreads() const override { return !use_archives; } private: + using FileInfo = IBackupCoordination::FileInfo; + class BackupEntryFromBackupImpl; + void open(); void close(); void writeBackupMetadata(); @@ -76,8 +81,6 @@ private: std::shared_ptr getArchiveWriter(const String & suffix); void removeAllFilesAfterFailure(); - class BackupEntryFromBackupImpl; - const String backup_name; const ArchiveParams archive_params; const bool use_archives; @@ -92,7 +95,7 @@ private: mutable std::mutex mutex; UUID uuid = {}; time_t timestamp = 0; - UInt64 version = 1; + UInt64 version; std::optional base_backup_info; std::shared_ptr base_backup; std::optional base_backup_uuid; diff --git a/src/Backups/DistributedBackupCoordination.cpp b/src/Backups/DistributedBackupCoordination.cpp index d49ea025f56..d1669cf292d 100644 --- a/src/Backups/DistributedBackupCoordination.cpp +++ b/src/Backups/DistributedBackupCoordination.cpp @@ -16,13 +16,14 @@ namespace ErrorCodes extern const int UNEXPECTED_NODE_IN_ZOOKEEPER; } -/// zookeeper_path/file_names/file_name->checksum -/// zookeeper_path/file_infos/checksum->info +/// zookeeper_path/file_names/file_name->checksum_and_size +/// zookeeper_path/file_infos/checksum_and_size->info /// zookeeper_path/archive_suffixes /// zookeeper_path/current_archive_suffix namespace { + using SizeAndChecksum = IBackupCoordination::SizeAndChecksum; using FileInfo = IBackupCoordination::FileInfo; String serializeFileInfo(const FileInfo & info) @@ -33,6 +34,7 @@ namespace writeBinary(info.checksum, out); writeBinary(info.base_size, out); writeBinary(info.base_checksum, out); + writeBinary(info.data_file_name, out); writeBinary(info.archive_suffix, out); writeBinary(info.pos_in_archive, out); return out.str(); @@ -47,27 +49,32 @@ namespace readBinary(info.checksum, in); readBinary(info.base_size, in); readBinary(info.base_checksum, in); + readBinary(info.data_file_name, in); readBinary(info.archive_suffix, in); readBinary(info.pos_in_archive, in); return info; } - String hexChecksum(UInt128 checksum) + String serializeSizeAndChecksum(const SizeAndChecksum & size_and_checksum) { - return getHexUIntLowercase(checksum); + return getHexUIntLowercase(size_and_checksum.second) + '_' + std::to_string(size_and_checksum.first); } - UInt128 unhexChecksum(const String & checksum) + SizeAndChecksum deserializeSizeAndChecksum(const String & str) { - if (checksum.size() != sizeof(UInt128) * 2) + constexpr size_t num_chars_in_checksum = sizeof(UInt128) * 2; + if (str.size() <= num_chars_in_checksum) throw Exception( ErrorCodes::UNEXPECTED_NODE_IN_ZOOKEEPER, "Unexpected size of checksum: {}, must be {}", - checksum.size(), - sizeof(UInt128) * 2); - return unhexUInt(checksum.data()); + str.size(), + num_chars_in_checksum); + UInt128 checksum = unhexUInt(str.data()); + UInt64 size = parseFromString(str.substr(num_chars_in_checksum + 1)); + return std::pair{size, checksum}; } + /// We try to store data to zookeeper several times due to possible version conflicts. 
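/// Illustrative sketch of the optimistic retry that NUM_ATTEMPTS below bounds (hypothetical
/// variable names, not lines from this patch): read the node together with its version,
/// write back conditionally, and retry only on a version conflict:
///     for (size_t attempt = 0; attempt < NUM_ATTEMPTS; ++attempt)
///     {
///         Coordination::Stat stat;
///         String old_data = zookeeper->get(full_path, &stat);
///         auto code = zookeeper->trySet(full_path, new_data, stat.version);
///         if (code != Coordination::Error::ZBADVERSION)
///             break;   /// success or a non-retriable error; only version conflicts loop
///     }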
constexpr size_t NUM_ATTEMPTS = 10; } @@ -96,27 +103,36 @@ void DistributedBackupCoordination::removeAllNodes() zookeeper->removeRecursive(zookeeper_path); } -void DistributedBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_new_checksum) +void DistributedBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) { auto zookeeper = get_zookeeper(); String full_path = zookeeper_path + "/file_names/" + escapeForFileName(file_info.file_name); - String checksum_str = hexChecksum(file_info.checksum); - zookeeper->create(full_path, checksum_str, zkutil::CreateMode::Persistent); + String size_and_checksum = serializeSizeAndChecksum(std::pair{file_info.size, file_info.checksum}); + zookeeper->create(full_path, size_and_checksum, zkutil::CreateMode::Persistent); - full_path = zookeeper_path + "/file_infos/" + checksum_str; + if (!file_info.size) + { + is_data_file_required = false; + return; + } + + full_path = zookeeper_path + "/file_infos/" + size_and_checksum; auto code = zookeeper->tryCreate(full_path, serializeFileInfo(file_info), zkutil::CreateMode::Persistent); if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) throw zkutil::KeeperException(code, full_path); - is_new_checksum = (code == Coordination::Error::ZOK); + is_data_file_required = (code == Coordination::Error::ZOK) && (file_info.size > file_info.base_size); } void DistributedBackupCoordination::updateFileInfo(const FileInfo & file_info) { + if (!file_info.size) + return; /// we don't keep FileInfos for empty files, nothing to update + auto zookeeper = get_zookeeper(); - String checksum_str = hexChecksum(file_info.checksum); - String full_path = zookeeper_path + "/file_infos/" + checksum_str; + String size_and_checksum = serializeSizeAndChecksum(std::pair{file_info.size, file_info.checksum}); + String full_path = zookeeper_path + "/file_infos/" + size_and_checksum; for (size_t attempt = 0; attempt < NUM_ATTEMPTS; ++attempt) { Coordination::Stat stat; @@ -138,8 +154,11 @@ std::vector DistributedBackupCoordination::getAllFileInfos() Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); for (const String & escaped_name : escaped_names) { - String checksum = zookeeper->get(zookeeper_path + "/file_names/" + escaped_name); - FileInfo file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + checksum)); + String size_and_checksum = zookeeper->get(zookeeper_path + "/file_names/" + escaped_name); + UInt64 size = deserializeSizeAndChecksum(size_and_checksum).first; + FileInfo file_info; + if (size) /// we don't keep FileInfos for empty files + file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + size_and_checksum)); file_info.file_name = unescapeForFileName(escaped_name); file_infos.emplace_back(std::move(file_info)); } @@ -171,33 +190,36 @@ Strings DistributedBackupCoordination::listFiles(const String & prefix, const St return elements; } -std::optional DistributedBackupCoordination::getChecksumByFileName(const String & file_name) +std::optional DistributedBackupCoordination::getFileInfo(const String & file_name) { auto zookeeper = get_zookeeper(); - String checksum; - if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), checksum)) + String size_and_checksum; + if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum)) return std::nullopt; - return unhexChecksum(checksum); + UInt64 size = 
deserializeSizeAndChecksum(size_and_checksum).first; + FileInfo file_info; + if (size) /// we don't keep FileInfos for empty files + file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + size_and_checksum)); + file_info.file_name = file_name; + return file_info; } -std::optional DistributedBackupCoordination::getFileInfoByChecksum(const UInt128 & checksum) +std::optional DistributedBackupCoordination::getFileInfo(const SizeAndChecksum & size_and_checksum) { auto zookeeper = get_zookeeper(); String file_info_str; - if (!zookeeper->tryGet(zookeeper_path + "/file_infos/" + hexChecksum(checksum), file_info_str)) + if (!zookeeper->tryGet(zookeeper_path + "/file_infos/" + serializeSizeAndChecksum(size_and_checksum), file_info_str)) return std::nullopt; return deserializeFileInfo(file_info_str); } -std::optional DistributedBackupCoordination::getFileInfoByFileName(const String & file_name) +std::optional DistributedBackupCoordination::getFileSizeAndChecksum(const String & file_name) { auto zookeeper = get_zookeeper(); - String checksum; - if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), checksum)) + String size_and_checksum; + if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum)) return std::nullopt; - FileInfo file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + checksum)); - file_info.file_name = file_name; - return file_info; + return deserializeSizeAndChecksum(size_and_checksum); } String DistributedBackupCoordination::getNextArchiveSuffix() diff --git a/src/Backups/DistributedBackupCoordination.h b/src/Backups/DistributedBackupCoordination.h index 7b1a2cb02ba..7e7de59d9f3 100644 --- a/src/Backups/DistributedBackupCoordination.h +++ b/src/Backups/DistributedBackupCoordination.h @@ -14,14 +14,14 @@ public: DistributedBackupCoordination(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_); ~DistributedBackupCoordination() override; - void addFileInfo(const FileInfo & file_info, bool & is_new_checksum) override; + void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override; void updateFileInfo(const FileInfo & file_info) override; std::vector getAllFileInfos() override; Strings listFiles(const String & prefix, const String & terminator) override; - std::optional getChecksumByFileName(const String & file_name) override; - std::optional getFileInfoByChecksum(const UInt128 & checksum) override; - std::optional getFileInfoByFileName(const String & file_name) override; + std::optional getFileInfo(const String & file_name) override; + std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) override; + std::optional getFileSizeAndChecksum(const String & file_name) override; String getNextArchiveSuffix() override; Strings getAllArchiveSuffixes() override; diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h index 58e537809db..2bb466b5911 100644 --- a/src/Backups/IBackup.h +++ b/src/Backups/IBackup.h @@ -47,20 +47,24 @@ public: /// Checks if an entry with a specified name exists. virtual bool fileExists(const String & file_name) const = 0; - virtual bool fileExistsByChecksum(const UInt128 & checksum) const = 0; + virtual bool fileExists(const std::pair & size_and_checksum) const = 0; /// Returns the size of the entry's data. /// This function does the same as `read(file_name)->getSize()` but faster. 
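/// Hypothetical usage (illustrative file name, not a line from this patch):
///     UInt64 total_size = backup->getFileSize("data.bin");   /// metadata lookup only, the entry is never opened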
- virtual size_t getFileSize(const String & file_name) const = 0; - virtual size_t getFileSizeByChecksum(const UInt128 & checksum) const = 0; + virtual UInt64 getFileSize(const String & file_name) const = 0; /// Returns the checksum of the entry's data. /// This function does the same as `read(file_name)->getCheckum()` but faster. virtual UInt128 getFileChecksum(const String & file_name) const = 0; + using SizeAndChecksum = std::pair; + + /// Returns both the size and checksum in one call. + virtual SizeAndChecksum getFileSizeAndChecksum(const String & file_name) const = 0; + /// Reads an entry from the backup. virtual BackupEntryPtr readFile(const String & file_name) const = 0; - virtual BackupEntryPtr readFileByChecksum(const UInt128 & checksum) const = 0; + virtual BackupEntryPtr readFile(const SizeAndChecksum & size_and_checksum) const = 0; /// Puts a new entry to the backup. virtual void writeFile(const String & file_name, BackupEntryPtr entry) = 0; @@ -69,7 +73,7 @@ public: virtual void finalizeWriting() = 0; /// Whether it's possible to add new entries to the backup in multiple threads. - virtual bool supportsWritingInMultipleThreads() const { return true; } + virtual bool supportsWritingInMultipleThreads() const = 0; }; using BackupPtr = std::shared_ptr; diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 2f869ef1470..51f075e72f1 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -21,6 +21,9 @@ public: UInt64 base_size = 0; UInt128 base_checksum{0}; + /// Name of the data file. + String data_file_name; + /// Suffix of an archive if the backup is stored as a series of archives. String archive_suffix; @@ -31,13 +34,13 @@ public: virtual ~IBackupCoordination() { } /// Adds file information. - /// If a specified checksum is new for this IBackupContentsInfo the function sets `is_new_checksum`. - virtual void addFileInfo(const FileInfo & file_info, bool & is_new_checksum) = 0; + /// If specified checksum+size are new for this IBackupContentsInfo the function sets `is_data_file_required`. + virtual void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) = 0; void addFileInfo(const FileInfo & file_info) { - bool is_new_checksum; - addFileInfo(file_info, is_new_checksum); + bool is_data_file_required; + addFileInfo(file_info, is_data_file_required); } /// Updates some fields (currently only `archive_suffix`) of a stored file's information. @@ -46,9 +49,11 @@ public: virtual std::vector getAllFileInfos() = 0; virtual Strings listFiles(const String & prefix, const String & terminator) = 0; - virtual std::optional getChecksumByFileName(const String & file_name) = 0; - virtual std::optional getFileInfoByChecksum(const UInt128 & checksum) = 0; - virtual std::optional getFileInfoByFileName(const String & file_name) = 0; + using SizeAndChecksum = std::pair; + + virtual std::optional getFileInfo(const String & file_name) = 0; + virtual std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) = 0; + virtual std::optional getFileSizeAndChecksum(const String & file_name) = 0; /// Generates a new archive suffix, e.g. "001", "002", "003", ... 
virtual String getNextArchiveSuffix() = 0; diff --git a/src/Backups/LocalBackupCoordination.cpp b/src/Backups/LocalBackupCoordination.cpp index 495f3ac483b..1ff00adb6b1 100644 --- a/src/Backups/LocalBackupCoordination.cpp +++ b/src/Backups/LocalBackupCoordination.cpp @@ -4,24 +4,32 @@ namespace DB { +using SizeAndChecksum = IBackupCoordination::SizeAndChecksum; using FileInfo = IBackupCoordination::FileInfo; LocalBackupCoordination::LocalBackupCoordination() = default; LocalBackupCoordination::~LocalBackupCoordination() = default; -void LocalBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_new_checksum) +void LocalBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) { std::lock_guard lock{mutex}; - file_names.emplace(file_info.file_name, file_info.checksum); - is_new_checksum = (file_info.checksum && !file_infos.contains(file_info.checksum)); - if (is_new_checksum) - file_infos.emplace(file_info.checksum, std::move(file_info)); + file_names.emplace(file_info.file_name, std::pair{file_info.size, file_info.checksum}); + if (!file_info.size) + { + is_data_file_required = false; + return; + } + bool inserted_file_info = file_infos.try_emplace(std::pair{file_info.size, file_info.checksum}, file_info).second; + is_data_file_required = inserted_file_info && (file_info.size > file_info.base_size); } void LocalBackupCoordination::updateFileInfo(const FileInfo & file_info) { + if (!file_info.size) + return; /// we don't keep FileInfos for empty files, nothing to update + std::lock_guard lock{mutex}; - auto & dest = file_infos.at(file_info.checksum); + auto & dest = file_infos.at(std::pair{file_info.size, file_info.checksum}); dest.archive_suffix = file_info.archive_suffix; } @@ -29,9 +37,12 @@ std::vector LocalBackupCoordination::getAllFileInfos() { std::lock_guard lock{mutex}; std::vector res; - for (const auto & [file_name, checksum] : file_names) + for (const auto & [file_name, size_and_checksum] : file_names) { - FileInfo info = file_infos.at(checksum); + FileInfo info; + UInt64 size = size_and_checksum.first; + if (size) /// we don't keep FileInfos for empty files + info = file_infos.at(size_and_checksum); info.file_name = file_name; res.push_back(std::move(info)); } @@ -59,37 +70,42 @@ Strings LocalBackupCoordination::listFiles(const String & prefix, const String & return elements; } -std::optional LocalBackupCoordination::getChecksumByFileName(const String & file_name) +std::optional LocalBackupCoordination::getFileInfo(const String & file_name) { std::lock_guard lock{mutex}; auto it = file_names.find(file_name); if (it == file_names.end()) return std::nullopt; - return it->second; + const auto & size_and_checksum = it->second; + UInt64 size = size_and_checksum.first; + FileInfo info; + if (size) /// we don't keep FileInfos for empty files + info = file_infos.at(size_and_checksum); + info.file_name = file_name; + return info; } -std::optional LocalBackupCoordination::getFileInfoByChecksum(const UInt128 & checksum) +std::optional LocalBackupCoordination::getFileInfo(const SizeAndChecksum & size_and_checksum) { std::lock_guard lock{mutex}; - auto it = file_infos.find(checksum); + auto it = file_infos.find(size_and_checksum); if (it == file_infos.end()) return std::nullopt; return it->second; } -std::optional LocalBackupCoordination::getFileInfoByFileName(const String & file_name) +std::optional LocalBackupCoordination::getFileSizeAndChecksum(const String & file_name) { std::lock_guard lock{mutex}; auto it = file_names.find(file_name); if 
(it == file_names.end()) return std::nullopt; - FileInfo info = file_infos.at(it->second); - info.file_name = file_name; - return info; + return it->second; } String LocalBackupCoordination::getNextArchiveSuffix() { + std::lock_guard lock{mutex}; String new_archive_suffix = fmt::format("{:03}", ++current_archive_suffix); /// Outputs 001, 002, 003, ... archive_suffixes.push_back(new_archive_suffix); return new_archive_suffix; diff --git a/src/Backups/LocalBackupCoordination.h b/src/Backups/LocalBackupCoordination.h index 3f299a688fc..6ee4b941293 100644 --- a/src/Backups/LocalBackupCoordination.h +++ b/src/Backups/LocalBackupCoordination.h @@ -1,6 +1,5 @@ #include #include -#include namespace DB @@ -13,22 +12,23 @@ public: LocalBackupCoordination(); ~LocalBackupCoordination() override; - void addFileInfo(const FileInfo & file_info, bool & is_new_checksum) override; + void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override; void updateFileInfo(const FileInfo & file_info) override; std::vector getAllFileInfos() override; Strings listFiles(const String & prefix, const String & terminator) override; - std::optional getChecksumByFileName(const String & file_name) override; - std::optional getFileInfoByChecksum(const UInt128 & checksum) override; - std::optional getFileInfoByFileName(const String & file_name) override; + + std::optional getFileInfo(const String & file_name) override; + std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) override; + std::optional getFileSizeAndChecksum(const String & file_name) override; String getNextArchiveSuffix() override; Strings getAllArchiveSuffixes() override; private: std::mutex mutex; - std::map file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0. - std::unordered_map file_infos; /// Information about files. Without empty files. + std::map file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0. + std::map file_infos; /// Information about files. Without empty files. Strings archive_suffixes; size_t current_archive_suffix = 0; }; diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 9937ce692bc..2bc72c30bdc 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -162,12 +162,16 @@ def test_incremental_backup_after_renaming_table(): # Files in a base backup can be searched by checksum, so an incremental backup with a renamed table actually # contains only its changed metadata. 
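# A sketch of the on-disk layout the rewritten assertions below rely on (illustrative
# paths, not taken from this patch):
#     <backup_dir>/.backup            metadata file describing the backup
#     <backup_dir>/metadata/...       DDL of the backed-up objects
#     <backup_dir>/data/...           present only when some entry actually carries data
# hence an incremental backup of a merely renamed table is expected to have no "data" directory.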
- contents = os.listdir(get_backup_dir(incremental_backup_name)) - assert '.backup' in contents - contents.remove('.backup') - assert len(contents) == 1 - with open(os.path.join(get_backup_dir(incremental_backup_name), contents[0])) as table_def_in_backup: - assert table_def_in_backup.read().startswith('CREATE TABLE test.table2') + assert os.path.isdir(os.path.join(get_backup_dir(backup_name), "metadata")) == True + assert os.path.isdir(os.path.join(get_backup_dir(backup_name), "data")) == True + assert ( + os.path.isdir(os.path.join(get_backup_dir(incremental_backup_name), "metadata")) + == True + ) + assert ( + os.path.isdir(os.path.join(get_backup_dir(incremental_backup_name), "data")) + == False + ) instance.query("DROP TABLE test.table2") instance.query(f"RESTORE TABLE test.table2 FROM {incremental_backup_name}") From 144d3aefebd0f85d3a8da2dd75d7092a83f034c4 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 20 Apr 2022 09:38:12 +0200 Subject: [PATCH 57/94] Add system table system.backups; implement async mode for commands BACKUP & RESTORE --- src/Backups/BackupSettings.cpp | 2 + src/Backups/BackupSettings.h | 2 + src/Backups/BackupStatus.cpp | 50 ++++++++ src/Backups/BackupStatus.h | 28 +++++ src/Backups/BackupsWorker.cpp | 75 ++++++++++++ src/Backups/BackupsWorker.h | 39 ++++++ src/Backups/RestoreSettings.cpp | 2 + src/Backups/RestoreSettings.h | 2 + src/Interpreters/InterpreterBackupQuery.cpp | 120 ++++++++++++++++--- src/Storages/System/StorageSystemBackups.cpp | 48 ++++++++ src/Storages/System/StorageSystemBackups.h | 23 ++++ src/Storages/System/attachSystemTables.cpp | 2 + 12 files changed, 379 insertions(+), 14 deletions(-) create mode 100644 src/Backups/BackupStatus.cpp create mode 100644 src/Backups/BackupStatus.h create mode 100644 src/Backups/BackupsWorker.cpp create mode 100644 src/Backups/BackupsWorker.h create mode 100644 src/Storages/System/StorageSystemBackups.cpp create mode 100644 src/Storages/System/StorageSystemBackups.h diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 817e0e7e1a7..059abc9a905 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -32,6 +32,8 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query) res.password = SettingFieldString{setting.value}; else if (setting.name == "structure_only") res.structure_only = SettingFieldBool{setting.value}; + else if (setting.name == "async") + res.async = SettingFieldBool{setting.value}; else throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); } diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index ca95a08da8f..93414379531 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -25,6 +25,8 @@ struct BackupSettings /// without the data of tables. 
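/// A hypothetical query exercising the settings in this block (sketch only; setting names
/// as parsed in BackupSettings::fromBackupQuery, destination syntax illustrative):
///     BACKUP TABLE test.table TO Disk('backups', '1.zip') SETTINGS structure_only = 1, async = 1;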
bool structure_only = false; + bool async = false; + static BackupSettings fromBackupQuery(const ASTBackupQuery & query); }; diff --git a/src/Backups/BackupStatus.cpp b/src/Backups/BackupStatus.cpp new file mode 100644 index 00000000000..40524f141b3 --- /dev/null +++ b/src/Backups/BackupStatus.cpp @@ -0,0 +1,50 @@ +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + + +std::string_view toString(BackupStatus backup_status) +{ + switch (backup_status) + { + case BackupStatus::PREPARING: + return "PREPARING"; + case BackupStatus::MAKING_BACKUP: + return "MAKING_BACKUP"; + case BackupStatus::BACKUP_COMPLETE: + return "BACKUP_COMPLETE"; + case BackupStatus::FAILED_TO_BACKUP: + return "FAILED_TO_BACKUP"; + case BackupStatus::RESTORING: + return "RESTORING"; + case BackupStatus::RESTORED: + return "RESTORED"; + case BackupStatus::FAILED_TO_RESTORE: + return "FAILED_TO_RESTORE"; + default: + break; + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected backup status: {}", static_cast(backup_status)); +} + +const std::vector> & getBackupStatusEnumValues() +{ + static const std::vector> values = [] + { + std::vector> res; + for (auto status : collections::range(BackupStatus::MAX)) + res.emplace_back(toString(status), static_cast(status)); + return res; + }(); + return values; +} + +} diff --git a/src/Backups/BackupStatus.h b/src/Backups/BackupStatus.h new file mode 100644 index 00000000000..60968369bbc --- /dev/null +++ b/src/Backups/BackupStatus.h @@ -0,0 +1,28 @@ +#pragma once + +#include + + +namespace DB +{ + +enum class BackupStatus +{ + /// Statuses of making backups + PREPARING, + MAKING_BACKUP, + BACKUP_COMPLETE, + FAILED_TO_BACKUP, + + /// Status of restoring + RESTORING, + RESTORED, + FAILED_TO_RESTORE, + + MAX, +}; + +std::string_view toString(BackupStatus backup_status); +const std::vector> & getBackupStatusEnumValues(); + +} diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp new file mode 100644 index 00000000000..6f68ba32242 --- /dev/null +++ b/src/Backups/BackupsWorker.cpp @@ -0,0 +1,75 @@ +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +BackupsWorker & BackupsWorker::instance() +{ + static BackupsWorker the_instance; + return the_instance; +} + +size_t BackupsWorker::add(const String & backup_name, BackupStatus status, const String & error) +{ + std::lock_guard lock{mutex}; + + UInt64 task_id = ++current_task_id; + size_t pos; + auto it = entries_by_name.find(backup_name); + if (it != entries_by_name.end()) + { + pos = it->second; + entries_by_task_id.erase(entries[pos].task_id); + } + else + { + pos = entries.size(); + entries.emplace_back().backup_name = backup_name; + entries_by_name.emplace(backup_name, pos); + } + + entries_by_task_id.emplace(task_id, pos); + + Entry & entry = entries[pos]; + entry.task_id = task_id; + entry.status = status; + entry.error = error; + entry.timestamp = std::time(nullptr); + + return task_id; +} + +void BackupsWorker::update(UInt64 task_id, BackupStatus status, const String & error) +{ + std::lock_guard lock{mutex}; + auto it = entries_by_task_id.find(task_id); + if ((it == entries_by_task_id.end()) || (it->second >= entries.size())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "BackupsWorker: entry_id is out of range"); + Entry & entry = entries[it->second]; + entry.status = status; + entry.error = error; + entry.timestamp = std::time(nullptr); +} + +BackupsWorker::Entry BackupsWorker::getEntry(UInt64 
task_id) const +{ + std::lock_guard lock{mutex}; + auto it = entries_by_task_id.find(task_id); + if ((it == entries_by_task_id.end()) || (it->second >= entries.size())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "BackupsWorker: entry_id is out of range"); + return entries[it->second]; +} + +std::vector BackupsWorker::getEntries() const +{ + std::lock_guard lock{mutex}; + return entries; +} + +} diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h new file mode 100644 index 00000000000..b207bc4c5e8 --- /dev/null +++ b/src/Backups/BackupsWorker.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class BackupsWorker +{ +public: + static BackupsWorker & instance(); + + UInt64 add(const String & backup_name, BackupStatus status, const String & error = {}); + void update(UInt64 task_id, BackupStatus status, const String & error = {}); + + struct Entry + { + String backup_name; + UInt64 task_id; + BackupStatus status; + String error; + time_t timestamp; + }; + + Entry getEntry(UInt64 task_id) const; + std::vector getEntries() const; + +private: + mutable std::mutex mutex; + std::vector entries; + std::unordered_map entries_by_name; + std::unordered_map entries_by_task_id; + UInt64 current_task_id = 0; +}; + +} diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index b1d3c157e13..8e5ba000da1 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -62,6 +62,8 @@ RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query) res.allow_different_table_def = SettingFieldBool{setting.value}; else if (setting.name == "allow_different_database_def") res.allow_different_database_def = SettingFieldBool{setting.value}; + else if (setting.name == "async") + res.async = SettingFieldBool{setting.value}; else throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); } diff --git a/src/Backups/RestoreSettings.h b/src/Backups/RestoreSettings.h index 9f4951862c7..8ae247e7ba6 100644 --- a/src/Backups/RestoreSettings.h +++ b/src/Backups/RestoreSettings.h @@ -58,6 +58,8 @@ struct RestoreSettings : public StorageRestoreSettings /// Set `allow_different_database_def` to true to skip this check. 
bool allow_different_database_def = false; + bool async = false; + static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query); }; diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index b630bac9515..f60ba4ad083 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -5,9 +5,16 @@ #include #include #include +#include #include #include #include +#include +#include +#include +#include +#include +#include namespace DB @@ -38,31 +45,116 @@ namespace return BackupFactory::instance().createBackup(params); } - void executeBackup(const ContextPtr & context, const ASTBackupQuery & query) + void executeBackupSync(UInt64 task_id, const ContextPtr & context, const BackupInfo & backup_info, const ASTBackupQuery::Elements & backup_elements, const BackupSettings & backup_settings, bool no_throw = false) { - auto backup_settings = BackupSettings::fromBackupQuery(query); - auto backup_entries = makeBackupEntries(context, query.elements, backup_settings); - BackupMutablePtr backup = createBackup(BackupInfo::fromAST(*query.backup_name), backup_settings, context); - writeBackupEntries(backup, std::move(backup_entries), context->getSettingsRef().max_backup_threads); + auto & worker = BackupsWorker::instance(); + try + { + BackupMutablePtr backup = createBackup(backup_info, backup_settings, context); + worker.update(task_id, BackupStatus::PREPARING); + auto backup_entries = makeBackupEntries(context, backup_elements, backup_settings); + worker.update(task_id, BackupStatus::MAKING_BACKUP); + writeBackupEntries(backup, std::move(backup_entries), context->getSettingsRef().max_backup_threads); + worker.update(task_id, BackupStatus::BACKUP_COMPLETE); + } + catch (...) + { + worker.update(task_id, BackupStatus::FAILED_TO_BACKUP, getCurrentExceptionMessage(false)); + if (!no_throw) + throw; + } } - void executeRestore(ContextMutablePtr context, const ASTBackupQuery & query) + void executeRestoreSync(UInt64 task_id, ContextMutablePtr context, const BackupInfo & backup_info, const ASTBackupQuery::Elements & restore_elements, const RestoreSettings & restore_settings, bool no_throw = false) { - auto restore_settings = RestoreSettings::fromRestoreQuery(query); - BackupPtr backup = openBackup(BackupInfo::fromAST(*query.backup_name), restore_settings, context); - auto restore_tasks = makeRestoreTasks(context, backup, query.elements, restore_settings); - executeRestoreTasks(std::move(restore_tasks), context->getSettingsRef().max_backup_threads); + auto & worker = BackupsWorker::instance(); + try + { + BackupPtr backup = openBackup(backup_info, restore_settings, context); + worker.update(task_id, BackupStatus::RESTORING); + auto restore_tasks = makeRestoreTasks(context, backup, restore_elements, restore_settings); + executeRestoreTasks(std::move(restore_tasks), context->getSettingsRef().max_backup_threads); + worker.update(task_id, BackupStatus::RESTORED); + } + catch (...) 
+ { + worker.update(task_id, BackupStatus::FAILED_TO_RESTORE, getCurrentExceptionMessage(false)); + if (!no_throw) + throw; + } + } + + UInt64 executeBackup(const ContextPtr & context, const ASTBackupQuery & query) + { + const auto backup_info = BackupInfo::fromAST(*query.backup_name); + auto task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::PREPARING); + const auto backup_settings = BackupSettings::fromBackupQuery(query); + + if (backup_settings.async) + { + ThreadFromGlobalPool thread{ + &executeBackupSync, task_id, context, backup_info, query.elements, backup_settings, /* no_throw = */ true}; + thread.detach(); /// TODO: Remove this !!! Move that thread to BackupsWorker instead + } + else + { + executeBackupSync(task_id, context, backup_info, query.elements, backup_settings, /* no_throw = */ false); + } + return task_id; + } + + UInt64 executeRestore(ContextMutablePtr context, const ASTBackupQuery & query) + { + const auto backup_info = BackupInfo::fromAST(*query.backup_name); + const auto restore_settings = RestoreSettings::fromRestoreQuery(query); + auto task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::RESTORING); + + if (restore_settings.async) + { + ThreadFromGlobalPool thread{&executeRestoreSync, task_id, context, backup_info, query.elements, restore_settings, /* no_throw = */ true}; + thread.detach(); /// TODO: Remove this !!! Move that thread to BackupsWorker instead + } + else + { + executeRestoreSync(task_id, context, backup_info, query.elements, restore_settings, /* no_throw = */ false); + } + return task_id; + } + + Block getResultRow(UInt64 task_id) + { + auto entry = BackupsWorker::instance().getEntry(task_id); + + Block res_columns; + + auto column_task_id = ColumnUInt64::create(); + column_task_id->insert(task_id); + res_columns.insert(0, {std::move(column_task_id), std::make_shared(), "task_id"}); + + auto column_backup_name = ColumnString::create(); + column_backup_name->insert(entry.backup_name); + res_columns.insert(1, {std::move(column_backup_name), std::make_shared(), "backup_name"}); + + auto column_status = ColumnInt8::create(); + column_status->insert(static_cast(entry.status)); + res_columns.insert(2, {std::move(column_status), std::make_shared(getBackupStatusEnumValues()), "status"}); + + return res_columns; } } BlockIO InterpreterBackupQuery::execute() { const auto & query = query_ptr->as(); + UInt64 task_id; if (query.kind == ASTBackupQuery::BACKUP) - executeBackup(context, query); - else if (query.kind == ASTBackupQuery::RESTORE) - executeRestore(context, query); - return {}; + task_id = executeBackup(context, query); + else + task_id = executeRestore(context, query); + + BlockIO res_io; + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(task_id))); + return res_io; } } diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp new file mode 100644 index 00000000000..8cbe80b1568 --- /dev/null +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +NamesAndTypesList StorageSystemBackups::getNamesAndTypes() +{ + NamesAndTypesList names_and_types{ + {"task_id", std::make_shared()}, + {"backup_name", std::make_shared()}, + {"status", std::make_shared(getBackupStatusEnumValues())}, + {"error", std::make_shared()}, + {"time", std::make_shared()}, + }; + return names_and_types; +} + + +void StorageSystemBackups::fillData(MutableColumns & 
res_columns, ContextPtr, const SelectQueryInfo &) const +{ + size_t column_index = 0; + auto & column_task_id = assert_cast(*res_columns[column_index++]); + auto & column_backup_name = assert_cast(*res_columns[column_index++]); + auto & column_status = assert_cast(*res_columns[column_index++]); + auto & column_error = assert_cast(*res_columns[column_index++]); + auto & column_timestamp = assert_cast(*res_columns[column_index++]); + + auto add_row = [&](const BackupsWorker::Entry & entry) + { + column_task_id.insertValue(entry.task_id); + column_backup_name.insertData(entry.backup_name.data(), entry.backup_name.size()); + column_status.insertValue(static_cast(entry.status)); + column_error.insertData(entry.error.data(), entry.error.size()); + column_timestamp.insertValue(entry.timestamp); + }; + + for (const auto & entry : BackupsWorker::instance().getEntries()) + add_row(entry); +} + +} diff --git a/src/Storages/System/StorageSystemBackups.h b/src/Storages/System/StorageSystemBackups.h new file mode 100644 index 00000000000..e31c7b0a994 --- /dev/null +++ b/src/Storages/System/StorageSystemBackups.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +/// Implements `grants` system table, which allows you to get information about grants. +class StorageSystemBackups final : public shared_ptr_helper, public IStorageSystemOneBlock +{ +public: + std::string getName() const override { return "SystemBackups"; } + static NamesAndTypesList getNamesAndTypes(); + +protected: + friend struct shared_ptr_helper; + using IStorageSystemOneBlock::IStorageSystemOneBlock; + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index 6558890b8c4..f8940de889a 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -128,6 +129,7 @@ void attachSystemTablesLocal(ContextPtr context, IDatabase & system_database) attach(context, system_database, "data_skipping_indices"); attach(context, system_database, "licenses"); attach(context, system_database, "time_zones"); + attach(context, system_database, "backups"); #ifdef OS_LINUX attach(context, system_database, "stack_trace"); #endif From 3477665659f5d2a1d86da16628a70f71fdd9c064 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Apr 2022 13:31:39 +0200 Subject: [PATCH 58/94] Add shard_index and replica_index to params of getRewrittenASTWithoutOnCluster(). 
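In short, callers now pass a parameter struct instead of a bare default-database string. An illustrative sketch of the new call shape (it mirrors the DDLTask.cpp hunk below; the values come from the executing host's position in the cluster):

    WithoutOnClusterASTRewriteParams params;
    params.default_database = address_in_cluster.default_database;
    params.shard_index = address_in_cluster.shard_index;       /// 1-based
    params.replica_index = address_in_cluster.replica_index;   /// 1-based
    query = query_on_cluster->getRewrittenASTWithoutOnCluster(params);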
--- src/Interpreters/DDLTask.cpp | 6 +++++- src/Parsers/ASTAlterQuery.h | 4 ++-- src/Parsers/ASTCreateFunctionQuery.h | 2 +- src/Parsers/ASTCreateQuery.h | 4 ++-- src/Parsers/ASTDropFunctionQuery.h | 2 +- src/Parsers/ASTDropQuery.h | 4 ++-- src/Parsers/ASTKillQueryQuery.h | 2 +- src/Parsers/ASTOptimizeQuery.h | 4 ++-- src/Parsers/ASTQueryWithOnCluster.cpp | 4 ++-- src/Parsers/ASTQueryWithOnCluster.h | 18 ++++++++++++++++-- src/Parsers/ASTRenameQuery.h | 6 +++--- src/Parsers/ASTSystemQuery.h | 4 ++-- src/Parsers/Access/ASTCreateQuotaQuery.h | 2 +- src/Parsers/Access/ASTCreateRoleQuery.h | 2 +- src/Parsers/Access/ASTCreateRowPolicyQuery.h | 2 +- .../Access/ASTCreateSettingsProfileQuery.h | 2 +- src/Parsers/Access/ASTCreateUserQuery.h | 2 +- src/Parsers/Access/ASTDropAccessEntityQuery.h | 2 +- src/Parsers/Access/ASTGrantQuery.h | 2 +- src/Parsers/Access/ASTRowPolicyName.h | 4 ++-- 20 files changed, 48 insertions(+), 30 deletions(-) diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 3f43c5eb412..71a94545dff 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -224,7 +224,11 @@ void DDLTask::setClusterInfo(ContextPtr context, Poco::Logger * log) host_id.readableString(), entry_name, address_in_cluster.readableString(), cluster_name); } - query = query_on_cluster->getRewrittenASTWithoutOnCluster(address_in_cluster.default_database); + WithoutOnClusterASTRewriteParams params; + params.default_database = address_in_cluster.default_database; + params.shard_index = address_in_cluster.shard_index; + params.replica_index = address_in_cluster.replica_index; + query = query_on_cluster->getRewrittenASTWithoutOnCluster(params); query_on_cluster = nullptr; } diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index 976ccd1e2bf..956f07811ae 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -243,9 +243,9 @@ public: ASTPtr clone() const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string & new_database) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const override { - return removeOnCluster(clone(), new_database); + return removeOnCluster(clone(), params.default_database); } virtual QueryKind getQueryKind() const override { return QueryKind::Alter; } diff --git a/src/Parsers/ASTCreateFunctionQuery.h b/src/Parsers/ASTCreateFunctionQuery.h index b61441da6df..ce0d874b15d 100644 --- a/src/Parsers/ASTCreateFunctionQuery.h +++ b/src/Parsers/ASTCreateFunctionQuery.h @@ -22,7 +22,7 @@ public: void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } String getFunctionName() const; }; diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index 04755a02399..28a4b2a2932 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -111,9 +111,9 @@ public: ASTPtr clone() const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string & new_database) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const override { - return removeOnCluster(clone(), new_database); + return removeOnCluster(clone(), params.default_database); } bool isView() const { return 
is_ordinary_view || is_materialized_view || is_live_view || is_window_view; } diff --git a/src/Parsers/ASTDropFunctionQuery.h b/src/Parsers/ASTDropFunctionQuery.h index c9f673578a9..edfa6a23994 100644 --- a/src/Parsers/ASTDropFunctionQuery.h +++ b/src/Parsers/ASTDropFunctionQuery.h @@ -20,7 +20,7 @@ public: void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } }; } diff --git a/src/Parsers/ASTDropQuery.h b/src/Parsers/ASTDropQuery.h index 2e67eaf3692..ef2b609fbac 100644 --- a/src/Parsers/ASTDropQuery.h +++ b/src/Parsers/ASTDropQuery.h @@ -40,9 +40,9 @@ public: String getID(char) const override; ASTPtr clone() const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string & new_database) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const override { - return removeOnCluster(clone(), new_database); + return removeOnCluster(clone(), params.default_database); } virtual QueryKind getQueryKind() const override { return QueryKind::Drop; } diff --git a/src/Parsers/ASTKillQueryQuery.h b/src/Parsers/ASTKillQueryQuery.h index 95be3ec6309..88a1be38766 100644 --- a/src/Parsers/ASTKillQueryQuery.h +++ b/src/Parsers/ASTKillQueryQuery.h @@ -38,7 +38,7 @@ public: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } diff --git a/src/Parsers/ASTOptimizeQuery.h b/src/Parsers/ASTOptimizeQuery.h index cac2ef1c6d0..c53a61a66dc 100644 --- a/src/Parsers/ASTOptimizeQuery.h +++ b/src/Parsers/ASTOptimizeQuery.h @@ -50,9 +50,9 @@ public: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &new_database) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const override { - return removeOnCluster(clone(), new_database); + return removeOnCluster(clone(), params.default_database); } }; diff --git a/src/Parsers/ASTQueryWithOnCluster.cpp b/src/Parsers/ASTQueryWithOnCluster.cpp index 60e96b1dbe1..d0b6d82ab13 100644 --- a/src/Parsers/ASTQueryWithOnCluster.cpp +++ b/src/Parsers/ASTQueryWithOnCluster.cpp @@ -11,9 +11,9 @@ namespace DB { -std::string ASTQueryWithOnCluster::getRewrittenQueryWithoutOnCluster(const std::string & new_database) const +std::string ASTQueryWithOnCluster::getRewrittenQueryWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const { - return queryToString(getRewrittenASTWithoutOnCluster(new_database)); + return queryToString(getRewrittenASTWithoutOnCluster(params)); } diff --git a/src/Parsers/ASTQueryWithOnCluster.h b/src/Parsers/ASTQueryWithOnCluster.h index c5daaa6ce37..6985bb01210 100644 --- a/src/Parsers/ASTQueryWithOnCluster.h +++ b/src/Parsers/ASTQueryWithOnCluster.h @@ -6,6 +6,20 @@ namespace DB { +/// Parameters for rewriting queries without ON CLUSTER, see getRewrittenASTWithoutOnCluster(). 
+struct WithoutOnClusterASTRewriteParams +{ + /// Default database from the cluster's configuration. + String default_database; + + /// 1-based index of the current shard in the cluster's configuration. + size_t shard_index; + + /// 1-based index of the current replica in the cluster's configuration. + size_t replica_index; +}; + + /// TODO: Quite messy. class ASTQueryWithOnCluster { @@ -17,10 +31,10 @@ public: /// new_database should be used by queries that refer to default db /// and default_database is specified for remote server - virtual ASTPtr getRewrittenASTWithoutOnCluster(const std::string & new_database = {}) const = 0; /// NOLINT + virtual ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params = {}) const = 0; /// NOLINT /// Returns a query prepared for execution on remote server - std::string getRewrittenQueryWithoutOnCluster(const std::string & new_database = {}) const; + std::string getRewrittenQueryWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params = {}) const; void formatOnCluster(const IAST::FormatSettings & settings) const; diff --git a/src/Parsers/ASTRenameQuery.h b/src/Parsers/ASTRenameQuery.h index 01ab0df9774..ee7cad2d38a 100644 --- a/src/Parsers/ASTRenameQuery.h +++ b/src/Parsers/ASTRenameQuery.h @@ -48,7 +48,7 @@ public: return res; } - ASTPtr getRewrittenASTWithoutOnCluster(const std::string & new_database) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const override { auto query_ptr = clone(); auto & query = query_ptr->as(); @@ -57,9 +57,9 @@ public: for (Element & elem : query.elements) { if (elem.from.database.empty()) - elem.from.database = new_database; + elem.from.database = params.default_database; if (elem.to.database.empty()) - elem.to.database = new_database; + elem.to.database = params.default_database; } return query_ptr; diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 600525f9abe..89fe3ef62e3 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -104,9 +104,9 @@ public: return res; } - ASTPtr getRewrittenASTWithoutOnCluster(const std::string & new_database) const override + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const override { - return removeOnCluster(clone(), new_database); + return removeOnCluster(clone(), params.default_database); } virtual QueryKind getQueryKind() const override { return QueryKind::System; } diff --git a/src/Parsers/Access/ASTCreateQuotaQuery.h b/src/Parsers/Access/ASTCreateQuotaQuery.h index 154245cbfe3..b6add6d8321 100644 --- a/src/Parsers/Access/ASTCreateQuotaQuery.h +++ b/src/Parsers/Access/ASTCreateQuotaQuery.h @@ -54,6 +54,6 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; void replaceCurrentUserTag(const String & current_user_name) const; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } }; } diff --git a/src/Parsers/Access/ASTCreateRoleQuery.h b/src/Parsers/Access/ASTCreateRoleQuery.h index 422bb0e681d..0e0773c8972 100644 --- a/src/Parsers/Access/ASTCreateRoleQuery.h +++ b/src/Parsers/Access/ASTCreateRoleQuery.h @@ -34,6 +34,6 @@ public: String getID(char) const override; ASTPtr clone() const override; void formatImpl(const 
FormatSettings & format, FormatState &, FormatStateStacked) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } }; } diff --git a/src/Parsers/Access/ASTCreateRowPolicyQuery.h b/src/Parsers/Access/ASTCreateRowPolicyQuery.h index dc698c25c6d..b96cc325524 100644 --- a/src/Parsers/Access/ASTCreateRowPolicyQuery.h +++ b/src/Parsers/Access/ASTCreateRowPolicyQuery.h @@ -47,7 +47,7 @@ public: String getID(char) const override; ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } void replaceCurrentUserTag(const String & current_user_name) const; void replaceEmptyDatabase(const String & current_database) const; diff --git a/src/Parsers/Access/ASTCreateSettingsProfileQuery.h b/src/Parsers/Access/ASTCreateSettingsProfileQuery.h index df0a11456bc..64546bc7230 100644 --- a/src/Parsers/Access/ASTCreateSettingsProfileQuery.h +++ b/src/Parsers/Access/ASTCreateSettingsProfileQuery.h @@ -40,6 +40,6 @@ public: ASTPtr clone() const override; void formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const override; void replaceCurrentUserTag(const String & current_user_name) const; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } }; } diff --git a/src/Parsers/Access/ASTCreateUserQuery.h b/src/Parsers/Access/ASTCreateUserQuery.h index 92db71e8581..364a127e281 100644 --- a/src/Parsers/Access/ASTCreateUserQuery.h +++ b/src/Parsers/Access/ASTCreateUserQuery.h @@ -60,6 +60,6 @@ public: String getID(char) const override; ASTPtr clone() const override; void formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } }; } diff --git a/src/Parsers/Access/ASTDropAccessEntityQuery.h b/src/Parsers/Access/ASTDropAccessEntityQuery.h index b1a6ca58a18..7ca672ad989 100644 --- a/src/Parsers/Access/ASTDropAccessEntityQuery.h +++ b/src/Parsers/Access/ASTDropAccessEntityQuery.h @@ -26,7 +26,7 @@ public: String getID(char) const override; ASTPtr clone() const override; void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } void replaceEmptyDatabase(const String & current_database) const; }; diff --git a/src/Parsers/Access/ASTGrantQuery.h b/src/Parsers/Access/ASTGrantQuery.h index 934d619fc36..d2df3f6cf6e 100644 --- a/src/Parsers/Access/ASTGrantQuery.h +++ b/src/Parsers/Access/ASTGrantQuery.h @@ -33,7 +33,7 @@ public: 
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; void replaceEmptyDatabase(const String & current_database); void replaceCurrentUserTag(const String & current_user_name) const; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } virtual QueryKind getQueryKind() const override { return is_revoke ? QueryKind::Revoke : QueryKind::Grant; } }; } diff --git a/src/Parsers/Access/ASTRowPolicyName.h b/src/Parsers/Access/ASTRowPolicyName.h index 43270b0185d..9f4848bd612 100644 --- a/src/Parsers/Access/ASTRowPolicyName.h +++ b/src/Parsers/Access/ASTRowPolicyName.h @@ -20,7 +20,7 @@ public: String getID(char) const override { return "RowPolicyName"; } ASTPtr clone() const override { return std::make_shared(*this); } void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } void replaceEmptyDatabase(const String & current_database); }; @@ -42,7 +42,7 @@ public: String getID(char) const override { return "RowPolicyNames"; } ASTPtr clone() const override { return std::make_shared(*this); } void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; - ASTPtr getRewrittenASTWithoutOnCluster(const std::string &) const override { return removeOnCluster(clone()); } + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override { return removeOnCluster(clone()); } void replaceEmptyDatabase(const String & current_database); }; From 030f3e488caf4353e2436284b8051911537f50ca Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Apr 2022 14:15:29 +0200 Subject: [PATCH 59/94] Add shard_index and replica_index to params of executeDDLQueryOnCluster(). 
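
This allows running an ON CLUSTER query on a chosen shard and/or replica
instead of on every host of the cluster. A minimal, hypothetical call site
(getRequiredAccess() stands in for whatever access-rights calculation the
calling interpreter already performs) could look like:

    // Run the DDL query only on replica #1 of shard #2 (both 1-based).
    // Leaving an index at 0 keeps the old behaviour: all shards/replicas.
    DDLQueryOnClusterParams params;
    params.shard_index = 2;
    params.replica_index = 1;
    params.access_to_check = getRequiredAccess();
    return executeDDLQueryOnCluster(query_ptr, getContext(), params);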
--- .../InterpreterCreateRowPolicyQuery.cpp | 4 +- .../Access/InterpreterGrantQuery.cpp | 4 +- src/Interpreters/InterpreterAlterQuery.cpp | 6 +- .../InterpreterCreateFunctionQuery.cpp | 6 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 +- .../InterpreterDropFunctionQuery.cpp | 6 +- src/Interpreters/InterpreterDropQuery.cpp | 6 +- .../InterpreterKillQueryQuery.cpp | 6 +- src/Interpreters/InterpreterOptimizeQuery.cpp | 6 +- src/Interpreters/InterpreterRenameQuery.cpp | 6 +- src/Interpreters/InterpreterSystemQuery.cpp | 6 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 64 ++++++++++++------- src/Interpreters/executeDDLQueryOnCluster.h | 20 +++++- 13 files changed, 107 insertions(+), 37 deletions(-) diff --git a/src/Interpreters/Access/InterpreterCreateRowPolicyQuery.cpp b/src/Interpreters/Access/InterpreterCreateRowPolicyQuery.cpp index 72b4b149bd7..87dc9039c17 100644 --- a/src/Interpreters/Access/InterpreterCreateRowPolicyQuery.cpp +++ b/src/Interpreters/Access/InterpreterCreateRowPolicyQuery.cpp @@ -51,7 +51,9 @@ BlockIO InterpreterCreateRowPolicyQuery::execute() if (!query.cluster.empty()) { query.replaceCurrentUserTag(getContext()->getUserName()); - return executeDDLQueryOnCluster(query_ptr, getContext(), required_access); + DDLQueryOnClusterParams params; + params.access_to_check = std::move(required_access); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); } assert(query.names->cluster.empty()); diff --git a/src/Interpreters/Access/InterpreterGrantQuery.cpp b/src/Interpreters/Access/InterpreterGrantQuery.cpp index f2b9cd58991..85790f043a2 100644 --- a/src/Interpreters/Access/InterpreterGrantQuery.cpp +++ b/src/Interpreters/Access/InterpreterGrantQuery.cpp @@ -402,7 +402,9 @@ BlockIO InterpreterGrantQuery::execute() auto required_access = getRequiredAccessForExecutingOnCluster(elements_to_grant, elements_to_revoke); checkAdminOptionForExecutingOnCluster(*current_user_access, roles_to_grant, roles_to_revoke); checkGranteesAreAllowed(access_control, *current_user_access, grantees); - return executeDDLQueryOnCluster(query_ptr, getContext(), std::move(required_access)); + DDLQueryOnClusterParams params; + params.access_to_check = std::move(required_access); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); } /// Check if the current user has corresponding access rights granted with grant option. 
diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 22390bc30c6..8b012f6f317 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -67,7 +67,11 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) BlockIO res; if (!alter.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccess()); + { + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccess(); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } getContext()->checkAccess(getRequiredAccess()); auto table_id = getContext()->resolveStorageID(alter, Context::ResolveOrdinary); diff --git a/src/Interpreters/InterpreterCreateFunctionQuery.cpp b/src/Interpreters/InterpreterCreateFunctionQuery.cpp index 615fbb03403..1db6c229ef4 100644 --- a/src/Interpreters/InterpreterCreateFunctionQuery.cpp +++ b/src/Interpreters/InterpreterCreateFunctionQuery.cpp @@ -34,7 +34,11 @@ BlockIO InterpreterCreateFunctionQuery::execute() access_rights_elements.emplace_back(AccessType::DROP_FUNCTION); if (!create_function_query.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), access_rights_elements); + { + DDLQueryOnClusterParams params; + params.access_to_check = std::move(access_rights_elements); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } auto current_context = getContext(); current_context->checkAccess(access_rights_elements); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index bea1d921cd8..7b0d956ff58 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1503,7 +1503,9 @@ BlockIO InterpreterCreateQuery::execute() if (!create.cluster.empty()) { prepareOnClusterQuery(create, getContext(), create.cluster); - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccess()); + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccess(); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); } getContext()->checkAccess(getRequiredAccess()); diff --git a/src/Interpreters/InterpreterDropFunctionQuery.cpp b/src/Interpreters/InterpreterDropFunctionQuery.cpp index be67ea68cdd..bb2032f355a 100644 --- a/src/Interpreters/InterpreterDropFunctionQuery.cpp +++ b/src/Interpreters/InterpreterDropFunctionQuery.cpp @@ -21,7 +21,11 @@ BlockIO InterpreterDropFunctionQuery::execute() access_rights_elements.emplace_back(AccessType::DROP_FUNCTION); if (!drop_function_query.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), access_rights_elements); + { + DDLQueryOnClusterParams params; + params.access_to_check = std::move(access_rights_elements); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } auto current_context = getContext(); current_context->checkAccess(access_rights_elements); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index b04342b37f7..41b65e73efa 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -55,7 +55,11 @@ BlockIO InterpreterDropQuery::execute() { auto & drop = query_ptr->as(); if (!drop.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccessForDDLOnCluster()); + { + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccessForDDLOnCluster(); + 
return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } if (getContext()->getSettingsRef().database_atomic_wait_for_drop_and_detach_synchronously) drop.no_delay = true; diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 481355878aa..990079442ef 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -199,7 +199,11 @@ BlockIO InterpreterKillQueryQuery::execute() const auto & query = query_ptr->as(); if (!query.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccessForDDLOnCluster()); + { + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccessForDDLOnCluster(); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } BlockIO res_io; switch (query.type) diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index 83bf23ab4ad..239e15f996e 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -25,7 +25,11 @@ BlockIO InterpreterOptimizeQuery::execute() const auto & ast = query_ptr->as(); if (!ast.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccess()); + { + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccess(); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } getContext()->checkAccess(getRequiredAccess()); diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index f4b3fff19b6..f4dbbaec16d 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -29,7 +29,11 @@ BlockIO InterpreterRenameQuery::execute() const auto & rename = query_ptr->as(); if (!rename.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccess()); + { + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccess(); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } getContext()->checkAccess(getRequiredAccess()); diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 28a2082d233..4414f910af5 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -208,7 +208,11 @@ BlockIO InterpreterSystemQuery::execute() auto & query = query_ptr->as(); if (!query.cluster.empty()) - return executeDDLQueryOnCluster(query_ptr, getContext(), getRequiredAccessForDDLOnCluster()); + { + DDLQueryOnClusterParams params; + params.access_to_check = getRequiredAccessForDDLOnCluster(); + return executeDDLQueryOnCluster(query_ptr, getContext(), params); + } using Type = ASTSystemQuery::Type; diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 271602c29f9..d967eb9c919 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -32,6 +32,8 @@ namespace ErrorCodes extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; extern const int QUERY_IS_PROHIBITED; + extern const int INVALID_SHARD_ID; + extern const int NO_SUCH_REPLICA; extern const int LOGICAL_ERROR; } @@ -51,17 +53,7 @@ bool isSupportedAlterType(int type) } -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context) -{ - return executeDDLQueryOnCluster(query_ptr_, context, 
{}); -} - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, ContextPtr context, const AccessRightsElements & query_requires_access) -{ - return executeDDLQueryOnCluster(query_ptr, context, AccessRightsElements{query_requires_access}); -} - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, AccessRightsElements && query_requires_access) +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, const DDLQueryOnClusterParams & params) { if (context->getCurrentTransaction() && context->getSettingsRef().throw_on_unsupported_query_inside_transaction) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ON CLUSTER queries inside transactions are not supported"); @@ -94,12 +86,37 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, DDLWorker & ddl_worker = context->getDDLWorker(); /// Enumerate hosts which will be used to send query. - Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); std::vector hosts; - for (const auto & shard : shards) + Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); + + auto collect_hosts_from_replicas = [&](size_t shard_index) { - for (const auto & addr : shard) - hosts.emplace_back(addr); + if (shard_index > shards.size()) + throw Exception(ErrorCodes::INVALID_SHARD_ID, "Cluster {} doesn't have shard #{}", query->cluster, shard_index); + const auto & replicas = shards[shard_index - 1]; + if (params.replica_index) + { + if (params.replica_index > replicas.size()) + throw Exception(ErrorCodes::NO_SUCH_REPLICA, "Cluster {} doesn't have replica #{} in shard #{}", query->cluster, params.replica_index, shard_index); + hosts.emplace_back(replicas[params.replica_index - 1]); + } + else + { + if ((replicas.size() > 1) && !params.allow_multiple_replicas) + throw Exception("This query cannot be executed on multiple replicas", ErrorCodes::QUERY_IS_PROHIBITED); + for (const auto & addr : replicas) + hosts.emplace_back(addr); + } + }; + + if (params.shard_index) + { + collect_hosts_from_replicas(params.shard_index); + } + else + { + for (size_t shard_index = 1; shard_index <= shards.size(); ++shard_index) + collect_hosts_from_replicas(shard_index); } if (hosts.empty()) @@ -107,9 +124,10 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, /// The current database in a distributed query need to be replaced with either /// the local current database or a shard's default database. 
+ AccessRightsElements access_to_check = params.access_to_check; bool need_replace_current_database = std::any_of( - query_requires_access.begin(), - query_requires_access.end(), + access_to_check.begin(), + access_to_check.end(), [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }); bool use_local_default_database = false; @@ -137,18 +155,18 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, if (use_local_default_database) { - query_requires_access.replaceEmptyDatabase(current_database); + access_to_check.replaceEmptyDatabase(current_database); } else { - for (size_t i = 0; i != query_requires_access.size();) + for (size_t i = 0; i != access_to_check.size();) { - auto & element = query_requires_access[i]; + auto & element = access_to_check[i]; if (element.isEmptyDatabase()) { - query_requires_access.insert(query_requires_access.begin() + i + 1, shard_default_databases.size() - 1, element); + access_to_check.insert(access_to_check.begin() + i + 1, shard_default_databases.size() - 1, element); for (size_t j = 0; j != shard_default_databases.size(); ++j) - query_requires_access[i + j].replaceEmptyDatabase(shard_default_databases[j]); + access_to_check[i + j].replaceEmptyDatabase(shard_default_databases[j]); i += shard_default_databases.size(); } else @@ -161,7 +179,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, visitor.visitDDL(query_ptr); /// Check access rights, assume that all servers have the same users config - context->checkAccess(query_requires_access); + context->checkAccess(access_to_check); DDLLogEntry entry; entry.hosts = std::move(hosts); diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index e7ec52d03cb..f0b43634573 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -21,11 +22,24 @@ struct DDLLogEntry; /// Returns true if provided ALTER type can be executed ON CLUSTER bool isSupportedAlterType(int type); +struct DDLQueryOnClusterParams +{ + /// 1-based index of a shard to execute a query on, 0 means all shards. + size_t shard_index = 0; + + /// 1-based index of a replica to execute a query on, 0 means all replicas (see also allow_multiple_replicas). + size_t replica_index = 0; + + /// Allows executing a query on multiple replicas. + bool allow_multiple_replicas = true; + + /// Privileges which the current user should have to execute a query. + AccessRightsElements access_to_check; +}; + /// Pushes distributed DDL query to the queue. /// Returns DDLQueryStatusSource, which reads results of query execution on each host in the cluster.
-BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, ContextPtr context); -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, ContextPtr context, const AccessRightsElements & query_requires_access); -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, ContextPtr context, AccessRightsElements && query_requires_access); +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, ContextPtr context, const DDLQueryOnClusterParams & params = {}); BlockIO getDistributedDDLStatus( const String & node_path, const DDLLogEntry & entry, ContextPtr context, const std::optional & hosts_to_wait = {}); From 68a020eceaebc168cbea617ed73f763bfeadfb13 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 19 Apr 2022 20:15:27 +0200 Subject: [PATCH 60/94] Implement BACKUP/RESTORE ON CLUSTER. --- ....cpp => BackupCoordinationDistributed.cpp} | 30 +- ...tion.h => BackupCoordinationDistributed.h} | 24 +- ...nation.cpp => BackupCoordinationLocal.cpp} | 24 +- ...ordination.h => BackupCoordinationLocal.h} | 20 +- src/Backups/BackupFactory.h | 3 + src/Backups/BackupIO_Disk.cpp | 17 +- src/Backups/BackupIO_File.cpp | 17 +- src/Backups/BackupImpl.cpp | 89 ++--- src/Backups/BackupImpl.h | 21 +- src/Backups/BackupInfo.cpp | 25 +- src/Backups/BackupInfo.h | 3 + src/Backups/BackupSettings.cpp | 45 --- src/Backups/BackupSettings.h | 33 -- src/Backups/BackupUtils.cpp | 115 ++++--- src/Backups/BackupUtils.h | 10 +- src/Backups/Common/BackupSettings.cpp | 72 ++++ src/Backups/Common/BackupSettings.h | 54 +++ src/Backups/Common/CMakeLists.txt | 5 + src/Backups/Common/RestoreSettings.cpp | 122 +++++++ src/Backups/{ => Common}/RestoreSettings.h | 27 ++ .../rewriteBackupQueryWithoutOnCluster.cpp | 59 ++++ .../rewriteBackupQueryWithoutOnCluster.h | 19 ++ src/Backups/IBackupCoordination.h | 14 +- src/Backups/IRestoreCoordination.h | 39 +++ .../RestoreCoordinationDistributed.cpp | 148 +++++++++ src/Backups/RestoreCoordinationDistributed.h | 36 ++ src/Backups/RestoreCoordinationLocal.cpp | 73 ++++ src/Backups/RestoreCoordinationLocal.h | 33 ++ src/Backups/RestoreSettings.cpp | 75 ----- src/Backups/RestoreUtils.cpp | 314 ++++++++++++++++-- src/Backups/RestoreUtils.h | 4 + .../registerBackupEnginesFileAndDisk.cpp | 3 +- ...TableUUIDWithMacroInReplicatedTableDef.cpp | 37 +++ ...ceTableUUIDWithMacroInReplicatedTableDef.h | 14 + src/CMakeLists.txt | 1 + src/Databases/DatabaseReplicated.h | 2 + src/Interpreters/InterpreterBackupQuery.cpp | 134 ++++++-- src/Interpreters/executeDDLQueryOnCluster.h | 2 +- src/Parsers/ASTBackupQuery.cpp | 12 +- src/Parsers/ASTBackupQuery.h | 6 +- src/Parsers/CMakeLists.txt | 1 + src/Parsers/ParserBackupQuery.cpp | 12 + src/Storages/IStorage.cpp | 2 +- src/Storages/IStorage.h | 3 +- src/Storages/StorageLog.cpp | 2 +- src/Storages/StorageLog.h | 2 +- src/Storages/StorageMaterializedView.cpp | 4 +- src/Storages/StorageMaterializedView.h | 2 +- src/Storages/StorageMemory.cpp | 2 +- src/Storages/StorageMemory.h | 2 +- src/Storages/StorageMergeTree.cpp | 2 +- src/Storages/StorageMergeTree.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 23 +- src/Storages/StorageReplicatedMergeTree.h | 4 +- src/Storages/StorageStripeLog.cpp | 2 +- src/Storages/StorageStripeLog.h | 2 +- tests/integration/helpers/cluster.py | 4 +- .../test_backup_restore_new/test.py | 8 +- .../__init__.py | 0 ...allow_experimental_database_replicated.xml | 7 + .../configs/backups_disk.xml | 0 .../configs/remote_servers.xml | 0 .../test_backup_restore_on_cluster/test.py | 158 +++++++++ 
.../test_backup_restore_replicated/test.py | 105 ------ 64 files changed, 1614 insertions(+), 516 deletions(-) rename src/Backups/{DistributedBackupCoordination.cpp => BackupCoordinationDistributed.cpp} (89%) rename src/Backups/{DistributedBackupCoordination.h => BackupCoordinationDistributed.h} (56%) rename src/Backups/{LocalBackupCoordination.cpp => BackupCoordinationLocal.cpp} (77%) rename src/Backups/{LocalBackupCoordination.h => BackupCoordinationLocal.h} (70%) delete mode 100644 src/Backups/BackupSettings.cpp delete mode 100644 src/Backups/BackupSettings.h create mode 100644 src/Backups/Common/BackupSettings.cpp create mode 100644 src/Backups/Common/BackupSettings.h create mode 100644 src/Backups/Common/CMakeLists.txt create mode 100644 src/Backups/Common/RestoreSettings.cpp rename src/Backups/{ => Common}/RestoreSettings.h (64%) create mode 100644 src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp create mode 100644 src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h create mode 100644 src/Backups/IRestoreCoordination.h create mode 100644 src/Backups/RestoreCoordinationDistributed.cpp create mode 100644 src/Backups/RestoreCoordinationDistributed.h create mode 100644 src/Backups/RestoreCoordinationLocal.cpp create mode 100644 src/Backups/RestoreCoordinationLocal.h delete mode 100644 src/Backups/RestoreSettings.cpp create mode 100644 src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp create mode 100644 src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h rename tests/integration/{test_backup_restore_replicated => test_backup_restore_on_cluster}/__init__.py (100%) create mode 100644 tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml rename tests/integration/{test_backup_restore_replicated => test_backup_restore_on_cluster}/configs/backups_disk.xml (100%) rename tests/integration/{test_backup_restore_replicated => test_backup_restore_on_cluster}/configs/remote_servers.xml (100%) create mode 100644 tests/integration/test_backup_restore_on_cluster/test.py delete mode 100644 tests/integration/test_backup_restore_replicated/test.py diff --git a/src/Backups/DistributedBackupCoordination.cpp b/src/Backups/BackupCoordinationDistributed.cpp similarity index 89% rename from src/Backups/DistributedBackupCoordination.cpp rename to src/Backups/BackupCoordinationDistributed.cpp index d1669cf292d..c7244538655 100644 --- a/src/Backups/DistributedBackupCoordination.cpp +++ b/src/Backups/BackupCoordinationDistributed.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -78,15 +78,15 @@ namespace constexpr size_t NUM_ATTEMPTS = 10; } -DistributedBackupCoordination::DistributedBackupCoordination(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) +BackupCoordinationDistributed::BackupCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) : zookeeper_path(zookeeper_path_), get_zookeeper(get_zookeeper_) { createRootNodes(); } -DistributedBackupCoordination::~DistributedBackupCoordination() = default; +BackupCoordinationDistributed::~BackupCoordinationDistributed() = default; -void DistributedBackupCoordination::createRootNodes() +void BackupCoordinationDistributed::createRootNodes() { auto zookeeper = get_zookeeper(); zookeeper->createAncestors(zookeeper_path); @@ -97,13 +97,13 @@ void DistributedBackupCoordination::createRootNodes() zookeeper->createIfNotExists(zookeeper_path + "/current_archive_suffix", "0"); } -void 
DistributedBackupCoordination::removeAllNodes() +void BackupCoordinationDistributed::removeAllNodes() { auto zookeeper = get_zookeeper(); zookeeper->removeRecursive(zookeeper_path); } -void DistributedBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) +void BackupCoordinationDistributed::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) { auto zookeeper = get_zookeeper(); @@ -125,7 +125,7 @@ void DistributedBackupCoordination::addFileInfo(const FileInfo & file_info, bool is_data_file_required = (code == Coordination::Error::ZOK) && (file_info.size > file_info.base_size); } -void DistributedBackupCoordination::updateFileInfo(const FileInfo & file_info) +void BackupCoordinationDistributed::updateFileInfo(const FileInfo & file_info) { if (!file_info.size) return; /// we don't keep FileInfos for empty files, nothing to update @@ -147,7 +147,7 @@ void DistributedBackupCoordination::updateFileInfo(const FileInfo & file_info) } } -std::vector DistributedBackupCoordination::getAllFileInfos() +std::vector BackupCoordinationDistributed::getAllFileInfos() const { auto zookeeper = get_zookeeper(); std::vector file_infos; @@ -165,7 +165,7 @@ std::vector DistributedBackupCoordination::getAllFileInfos() return file_infos; } -Strings DistributedBackupCoordination::listFiles(const String & prefix, const String & terminator) +Strings BackupCoordinationDistributed::listFiles(const String & prefix, const String & terminator) const { auto zookeeper = get_zookeeper(); Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); @@ -190,7 +190,7 @@ Strings DistributedBackupCoordination::listFiles(const String & prefix, const St return elements; } -std::optional DistributedBackupCoordination::getFileInfo(const String & file_name) +std::optional BackupCoordinationDistributed::getFileInfo(const String & file_name) const { auto zookeeper = get_zookeeper(); String size_and_checksum; @@ -204,7 +204,7 @@ std::optional DistributedBackupCoordination::getFileInfo(const String return file_info; } -std::optional DistributedBackupCoordination::getFileInfo(const SizeAndChecksum & size_and_checksum) +std::optional BackupCoordinationDistributed::getFileInfo(const SizeAndChecksum & size_and_checksum) const { auto zookeeper = get_zookeeper(); String file_info_str; @@ -213,7 +213,7 @@ std::optional DistributedBackupCoordination::getFileInfo(const SizeAnd return deserializeFileInfo(file_info_str); } -std::optional DistributedBackupCoordination::getFileSizeAndChecksum(const String & file_name) +std::optional BackupCoordinationDistributed::getFileSizeAndChecksum(const String & file_name) const { auto zookeeper = get_zookeeper(); String size_and_checksum; @@ -222,7 +222,7 @@ std::optional DistributedBackupCoordination::getFileSizeAndChec return deserializeSizeAndChecksum(size_and_checksum); } -String DistributedBackupCoordination::getNextArchiveSuffix() +String BackupCoordinationDistributed::getNextArchiveSuffix() { auto zookeeper = get_zookeeper(); for (size_t attempt = 0; attempt != NUM_ATTEMPTS; ++attempt) @@ -245,13 +245,13 @@ String DistributedBackupCoordination::getNextArchiveSuffix() __builtin_unreachable(); } -Strings DistributedBackupCoordination::getAllArchiveSuffixes() +Strings BackupCoordinationDistributed::getAllArchiveSuffixes() const { auto zookeeper = get_zookeeper(); return zookeeper->getChildren(zookeeper_path + "/archive_suffixes"); } -void DistributedBackupCoordination::drop() +void BackupCoordinationDistributed::drop() { 
removeAllNodes(); } diff --git a/src/Backups/DistributedBackupCoordination.h b/src/Backups/BackupCoordinationDistributed.h similarity index 56% rename from src/Backups/DistributedBackupCoordination.h rename to src/Backups/BackupCoordinationDistributed.h index 7e7de59d9f3..7a7caad7299 100644 --- a/src/Backups/DistributedBackupCoordination.h +++ b/src/Backups/BackupCoordinationDistributed.h @@ -7,24 +7,24 @@ namespace DB { -/// Stores backup contents information in Zookeeper, useful for distributed backups. -class DistributedBackupCoordination : public IBackupCoordination +/// Stores temporary backup information in Zookeeper, used to perform BACKUP ON CLUSTER. +class BackupCoordinationDistributed : public IBackupCoordination { public: - DistributedBackupCoordination(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_); - ~DistributedBackupCoordination() override; + BackupCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_); + ~BackupCoordinationDistributed() override; void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override; void updateFileInfo(const FileInfo & file_info) override; - std::vector getAllFileInfos() override; - Strings listFiles(const String & prefix, const String & terminator) override; - std::optional getFileInfo(const String & file_name) override; - std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) override; - std::optional getFileSizeAndChecksum(const String & file_name) override; + std::vector getAllFileInfos() const override; + Strings listFiles(const String & prefix, const String & terminator) const override; + std::optional getFileInfo(const String & file_name) const override; + std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) const override; + std::optional getFileSizeAndChecksum(const String & file_name) const override; String getNextArchiveSuffix() override; - Strings getAllArchiveSuffixes() override; + Strings getAllArchiveSuffixes() const override; void drop() override; @@ -32,8 +32,8 @@ private: void createRootNodes(); void removeAllNodes(); - String zookeeper_path; - zkutil::GetZooKeeper get_zookeeper; + const String zookeeper_path; + const zkutil::GetZooKeeper get_zookeeper; }; } diff --git a/src/Backups/LocalBackupCoordination.cpp b/src/Backups/BackupCoordinationLocal.cpp similarity index 77% rename from src/Backups/LocalBackupCoordination.cpp rename to src/Backups/BackupCoordinationLocal.cpp index 1ff00adb6b1..6691a1f601f 100644 --- a/src/Backups/LocalBackupCoordination.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -1,4 +1,4 @@ -#include +#include #include @@ -7,10 +7,10 @@ namespace DB using SizeAndChecksum = IBackupCoordination::SizeAndChecksum; using FileInfo = IBackupCoordination::FileInfo; -LocalBackupCoordination::LocalBackupCoordination() = default; -LocalBackupCoordination::~LocalBackupCoordination() = default; +BackupCoordinationLocal::BackupCoordinationLocal() = default; +BackupCoordinationLocal::~BackupCoordinationLocal() = default; -void LocalBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) +void BackupCoordinationLocal::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) { std::lock_guard lock{mutex}; file_names.emplace(file_info.file_name, std::pair{file_info.size, file_info.checksum}); @@ -23,7 +23,7 @@ void LocalBackupCoordination::addFileInfo(const FileInfo & file_info, bool & is_ is_data_file_required = inserted_file_info && (file_info.size >
file_info.base_size); } -void LocalBackupCoordination::updateFileInfo(const FileInfo & file_info) +void BackupCoordinationLocal::updateFileInfo(const FileInfo & file_info) { if (!file_info.size) return; /// we don't keep FileInfos for empty files, nothing to update @@ -33,7 +33,7 @@ void LocalBackupCoordination::updateFileInfo(const FileInfo & file_info) dest.archive_suffix = file_info.archive_suffix; } -std::vector LocalBackupCoordination::getAllFileInfos() +std::vector BackupCoordinationLocal::getAllFileInfos() const { std::lock_guard lock{mutex}; std::vector res; @@ -49,7 +49,7 @@ std::vector LocalBackupCoordination::getAllFileInfos() return res; } -Strings LocalBackupCoordination::listFiles(const String & prefix, const String & terminator) +Strings BackupCoordinationLocal::listFiles(const String & prefix, const String & terminator) const { std::lock_guard lock{mutex}; Strings elements; @@ -70,7 +70,7 @@ Strings LocalBackupCoordination::listFiles(const String & prefix, const String & return elements; } -std::optional LocalBackupCoordination::getFileInfo(const String & file_name) +std::optional BackupCoordinationLocal::getFileInfo(const String & file_name) const { std::lock_guard lock{mutex}; auto it = file_names.find(file_name); @@ -85,7 +85,7 @@ std::optional LocalBackupCoordination::getFileInfo(const String & file return info; } -std::optional LocalBackupCoordination::getFileInfo(const SizeAndChecksum & size_and_checksum) +std::optional BackupCoordinationLocal::getFileInfo(const SizeAndChecksum & size_and_checksum) const { std::lock_guard lock{mutex}; auto it = file_infos.find(size_and_checksum); @@ -94,7 +94,7 @@ std::optional LocalBackupCoordination::getFileInfo(const SizeAndChecks return it->second; } -std::optional LocalBackupCoordination::getFileSizeAndChecksum(const String & file_name) +std::optional BackupCoordinationLocal::getFileSizeAndChecksum(const String & file_name) const { std::lock_guard lock{mutex}; auto it = file_names.find(file_name); @@ -103,7 +103,7 @@ std::optional LocalBackupCoordination::getFileSizeAndChecksum(c return it->second; } -String LocalBackupCoordination::getNextArchiveSuffix() +String BackupCoordinationLocal::getNextArchiveSuffix() { std::lock_guard lock{mutex}; String new_archive_suffix = fmt::format("{:03}", ++current_archive_suffix); /// Outputs 001, 002, 003, ... @@ -111,7 +111,7 @@ String LocalBackupCoordination::getNextArchiveSuffix() return new_archive_suffix; } -Strings LocalBackupCoordination::getAllArchiveSuffixes() +Strings BackupCoordinationLocal::getAllArchiveSuffixes() const { std::lock_guard lock{mutex}; return archive_suffixes; diff --git a/src/Backups/LocalBackupCoordination.h b/src/Backups/BackupCoordinationLocal.h similarity index 70% rename from src/Backups/LocalBackupCoordination.h rename to src/Backups/BackupCoordinationLocal.h index 6ee4b941293..d47616e2ddf 100644 --- a/src/Backups/LocalBackupCoordination.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -6,27 +6,27 @@ namespace DB { /// Stores backup contents information in memory. 
-class LocalBackupCoordination : public IBackupCoordination +class BackupCoordinationLocal : public IBackupCoordination { public: - LocalBackupCoordination(); - ~LocalBackupCoordination() override; + BackupCoordinationLocal(); + ~BackupCoordinationLocal() override; void addFileInfo(const FileInfo & file_info, bool & is_data_file_required) override; void updateFileInfo(const FileInfo & file_info) override; - std::vector getAllFileInfos() override; - Strings listFiles(const String & prefix, const String & terminator) override; + std::vector getAllFileInfos() const override; + Strings listFiles(const String & prefix, const String & terminator) const override; - std::optional getFileInfo(const String & file_name) override; - std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) override; - std::optional getFileSizeAndChecksum(const String & file_name) override; + std::optional getFileInfo(const String & file_name) const override; + std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) const override; + std::optional getFileSizeAndChecksum(const String & file_name) const override; String getNextArchiveSuffix() override; - Strings getAllArchiveSuffixes() override; + Strings getAllArchiveSuffixes() const override; private: - std::mutex mutex; + mutable std::mutex mutex; std::map file_names; /// Should be ordered alphabetically, see listFiles(). For empty files we assume checksum = 0. std::map file_infos; /// Information about files. Without empty files. Strings archive_suffixes; diff --git a/src/Backups/BackupFactory.h b/src/Backups/BackupFactory.h index d3ebcfe2369..e78b7033d8f 100644 --- a/src/Backups/BackupFactory.h +++ b/src/Backups/BackupFactory.h @@ -24,12 +24,15 @@ public: struct CreateParams { OpenMode open_mode = OpenMode::WRITE; + std::optional backup_uuid; BackupInfo backup_info; std::optional base_backup_info; String compression_method; int compression_level = -1; String password; ContextPtr context; + bool is_internal_backup = false; + String coordination_zk_path; }; static BackupFactory & instance(); diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index ff5ad13897c..fc61370e951 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -1,7 +1,9 @@ #include +#include #include #include #include +#include namespace fs = std::filesystem; @@ -49,10 +51,17 @@ std::unique_ptr BackupWriterDisk::writeFile(const String & file_nam void BackupWriterDisk::removeFilesAfterFailure(const Strings & file_names) { - for (const auto & file_name : file_names) - disk->removeFile(path / file_name); - if (disk->isDirectory(path) && disk->isDirectoryEmpty(path)) - disk->removeDirectory(path); + try + { + for (const auto & file_name : file_names) + disk->removeFileIfExists(path / file_name); + if (disk->isDirectory(path) && disk->isDirectoryEmpty(path)) + disk->removeDirectory(path); + } + catch (...) 
+ { + LOG_WARNING(&Poco::Logger::get("BackupWriterDisk"), "RemoveFilesAfterFailure: {}", getCurrentExceptionMessage(false)); + } } } diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 8e7bfb5b83e..1ac56477e5b 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -1,6 +1,8 @@ #include +#include #include #include +#include namespace fs = std::filesystem; @@ -48,10 +50,17 @@ std::unique_ptr BackupWriterFile::writeFile(const String & file_nam void BackupWriterFile::removeFilesAfterFailure(const Strings & file_names) { - for (const auto & file_name : file_names) - fs::remove(path / file_name); - if (fs::is_directory(path) && fs::is_empty(path)) - fs::remove(path); + try + { + for (const auto & file_name : file_names) + fs::remove(path / file_name); + if (fs::is_directory(path) && fs::is_empty(path)) + fs::remove(path); + } + catch (...) + { + LOG_WARNING(&Poco::Logger::get("BackupWriterFile"), "RemoveFilesAfterFailure: {}", getCurrentExceptionMessage(false)); + } } } diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 060c2d9d3f1..233061a8da2 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -3,10 +3,12 @@ #include #include #include -#include +#include +#include #include #include #include +#include #include #include #include @@ -114,20 +116,18 @@ BackupImpl::BackupImpl( const String & backup_name_, const ArchiveParams & archive_params_, const std::optional & base_backup_info_, - std::shared_ptr writer_, - std::shared_ptr coordination_, - bool is_helper_backup_, + std::shared_ptr reader_, const ContextPtr & context_) : backup_name(backup_name_) , archive_params(archive_params_) , use_archives(!archive_params.archive_name.empty()) - , base_backup_info_initial(base_backup_info_) - , open_mode(OpenMode::WRITE) - , writer(std::move(writer_)) - , coordination(coordination_ ? 
coordination_ : std::make_shared()) - , is_helper_backup(is_helper_backup_) + , open_mode(OpenMode::READ) + , reader(std::move(reader_)) + , is_internal_backup(false) + , coordination(std::make_shared()) , context(context_) - , version(CURRENT_BACKUP_VERSION) + , version(INITIAL_BACKUP_VERSION) + , base_backup_info(base_backup_info_) { open(); } @@ -137,19 +137,27 @@ BackupImpl::BackupImpl( const String & backup_name_, const ArchiveParams & archive_params_, const std::optional & base_backup_info_, - std::shared_ptr reader_, - const ContextPtr & context_) + std::shared_ptr writer_, + const ContextPtr & context_, + const std::optional & backup_uuid_, + bool is_internal_backup_, + const String & coordination_zk_path_) : backup_name(backup_name_) , archive_params(archive_params_) , use_archives(!archive_params.archive_name.empty()) - , base_backup_info_initial(base_backup_info_) - , open_mode(OpenMode::READ) - , reader(std::move(reader_)) - , coordination(std::make_shared()) - , is_helper_backup(false) + , open_mode(OpenMode::WRITE) + , writer(std::move(writer_)) + , is_internal_backup(is_internal_backup_) , context(context_) - , version(INITIAL_BACKUP_VERSION) + , uuid(backup_uuid_) + , version(CURRENT_BACKUP_VERSION) + , base_backup_info(base_backup_info_) { + if (coordination_zk_path_.empty()) + coordination = std::make_shared(); + else + coordination = std::make_shared(coordination_zk_path_, [&] { return context->getZooKeeper(); }); + open(); } @@ -185,14 +193,16 @@ void BackupImpl::open() if (open_mode == OpenMode::WRITE) { timestamp = std::time(nullptr); - uuid = UUIDHelpers::generateV4(); + if (!uuid) + uuid = UUIDHelpers::generateV4(); writing_finalized = false; } - base_backup_info = base_backup_info_initial; if (open_mode == OpenMode::READ) readBackupMetadata(); + assert(uuid); /// Backup's UUID must be loaded or generated at this point. + if (base_backup_info) { BackupFactory::CreateParams params; @@ -213,17 +223,17 @@ void BackupImpl::close() { std::lock_guard lock{mutex}; - if (!is_helper_backup && writing_finalized) + if (!is_internal_backup && writing_finalized) writeBackupMetadata(); archive_readers.clear(); - archive_writer_with_empty_suffix.reset(); - current_archive_writer.reset(); + for (auto & archive_writer : archive_writers) + archive_writer = {"", nullptr}; - if (!is_helper_backup && writer && !writing_finalized) + if (!is_internal_backup && writer && !writing_finalized) removeAllFilesAfterFailure(); - if (!is_helper_backup) + if (!is_internal_backup) coordination->drop(); } @@ -238,7 +248,7 @@ void BackupImpl::writeBackupMetadata() Poco::AutoPtr config{new Poco::Util::XMLConfiguration()}; config->setUInt("version", CURRENT_BACKUP_VERSION); config->setString("timestamp", toString(LocalDateTime{timestamp})); - config->setString("uuid", toString(uuid)); + config->setString("uuid", toString(*uuid)); if (base_backup_info) { @@ -595,6 +605,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) bool is_data_file_required; info.data_file_name = info.file_name; + info.archive_suffix = current_archive_suffix; coordination->addFileInfo(info, is_data_file_required); if (!is_data_file_required) return; /// We copy data only if it's a new combination of size & checksum. 
@@ -617,19 +628,19 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) { String archive_suffix = current_archive_suffix; bool next_suffix = false; - if (info.archive_suffix.empty() && is_helper_backup) + if (current_archive_suffix.empty() && is_internal_backup) next_suffix = true; /*if (archive_params.max_volume_size && current_archive_writer && (current_archive_writer->getTotalSize() + size - base_size > archive_params.max_volume_size)) next_suffix = true;*/ if (next_suffix) - archive_suffix = coordination->getNextArchiveSuffix(); - if (info.archive_suffix != archive_suffix) + current_archive_suffix = coordination->getNextArchiveSuffix(); + if (info.archive_suffix != current_archive_suffix) { - info.archive_suffix = archive_suffix; + info.archive_suffix = current_archive_suffix; coordination->updateFileInfo(info); } - out = getArchiveWriter(info.archive_suffix)->writeFile(info.data_file_name); + out = getArchiveWriter(current_archive_suffix)->writeFile(info.data_file_name); } else { @@ -672,19 +683,19 @@ std::shared_ptr BackupImpl::getArchiveReader(const String & suff std::shared_ptr BackupImpl::getArchiveWriter(const String & suffix) { - if (suffix.empty() && archive_writer_with_empty_suffix) - return archive_writer_with_empty_suffix; - if ((current_archive_suffix == suffix) && current_archive_writer) - return current_archive_writer; + for (const auto & archive_writer : archive_writers) + { + if ((suffix == archive_writer.first) && archive_writer.second) + return archive_writer.second; + } String archive_name_with_suffix = getArchiveNameWithSuffix(suffix); auto new_archive_writer = createArchiveWriter(archive_params.archive_name, writer->writeFile(archive_name_with_suffix)); new_archive_writer->setPassword(archive_params.password); - current_archive_writer = new_archive_writer; - current_archive_suffix = suffix; - if (suffix.empty()) - archive_writer_with_empty_suffix = new_archive_writer; + size_t pos = suffix.empty() ? 
0 : 1; + archive_writers[pos] = {suffix, new_archive_writer}; + return new_archive_writer; } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index be8a352e95e..92cf3a12006 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -38,24 +38,25 @@ public: const String & backup_name_, const ArchiveParams & archive_params_, const std::optional & base_backup_info_, - std::shared_ptr writer_, - std::shared_ptr coordination_, - bool is_helper_backup_, + std::shared_ptr reader_, const ContextPtr & context_); BackupImpl( const String & backup_name_, const ArchiveParams & archive_params_, const std::optional & base_backup_info_, - std::shared_ptr reader_, - const ContextPtr & context_); + std::shared_ptr writer_, + const ContextPtr & context_, + const std::optional & backup_uuid_ = {}, + bool is_internal_backup_ = false, + const String & coordination_zk_path_ = {}); ~BackupImpl() override; const String & getName() const override { return backup_name; } OpenMode getOpenMode() const override { return open_mode; } time_t getTimestamp() const override; - UUID getUUID() const override { return uuid; } + UUID getUUID() const override { return *uuid; } Strings listFiles(const String & prefix, const String & terminator) const override; bool fileExists(const String & file_name) const override; bool fileExists(const SizeAndChecksum & size_and_checksum) const override; @@ -84,24 +85,22 @@ private: const String backup_name; const ArchiveParams archive_params; const bool use_archives; - const std::optional base_backup_info_initial; const OpenMode open_mode; std::shared_ptr writer; std::shared_ptr reader; + const bool is_internal_backup; std::shared_ptr coordination; - const bool is_helper_backup; ContextPtr context; mutable std::mutex mutex; - UUID uuid = {}; + std::optional uuid; time_t timestamp = 0; UInt64 version; std::optional base_backup_info; std::shared_ptr base_backup; std::optional base_backup_uuid; mutable std::unordered_map> archive_readers; - std::shared_ptr archive_writer_with_empty_suffix; - std::shared_ptr current_archive_writer; + std::pair> archive_writers[2]; String current_archive_suffix; bool writing_finalized = false; }; diff --git a/src/Backups/BackupInfo.cpp b/src/Backups/BackupInfo.cpp index cab08e306d6..13f86a075c0 100644 --- a/src/Backups/BackupInfo.cpp +++ b/src/Backups/BackupInfo.cpp @@ -16,6 +16,21 @@ namespace ErrorCodes } String BackupInfo::toString() const +{ + ASTPtr ast = toAST(); + return serializeAST(*ast); +} + + +BackupInfo BackupInfo::fromString(const String & str) +{ + ParserIdentifierWithOptionalParameters parser; + ASTPtr ast = parseQuery(parser, str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); + return fromAST(*ast); +} + + +ASTPtr BackupInfo::toAST() const { auto func = std::make_shared(); func->name = backup_engine_name; @@ -32,15 +47,7 @@ String BackupInfo::toString() const for (const auto & arg : args) list->children.push_back(std::make_shared(arg)); - return serializeAST(*func); -} - - -BackupInfo BackupInfo::fromString(const String & str) -{ - ParserIdentifierWithOptionalParameters parser; - ASTPtr ast = parseQuery(parser, str, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH); - return fromAST(*ast); + return func; } diff --git a/src/Backups/BackupInfo.h b/src/Backups/BackupInfo.h index 5b5c676ecf1..62365b63394 100644 --- a/src/Backups/BackupInfo.h +++ b/src/Backups/BackupInfo.h @@ -6,6 +6,7 @@ namespace DB { class IAST; +using ASTPtr = std::shared_ptr; /// Information about a backup. 
struct BackupInfo @@ -16,6 +17,8 @@ struct BackupInfo String toString() const; static BackupInfo fromString(const String & str); + + ASTPtr toAST() const; static BackupInfo fromAST(const IAST & ast); }; diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp deleted file mode 100644 index 059abc9a905..00000000000 --- a/src/Backups/BackupSettings.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int UNKNOWN_SETTING; -} - -BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query) -{ - BackupSettings res; - - if (query.base_backup_name) - res.base_backup_info = BackupInfo::fromAST(*query.base_backup_name); - - if (query.settings) - { - const auto & settings = query.settings->as().changes; - for (const auto & setting : settings) - { - if (setting.name == "compression_method") - res.compression_method = SettingFieldString{setting.value}; - else if (setting.name == "compression_level") - res.compression_level = SettingFieldInt64{setting.value}; - else if (setting.name == "password") - res.password = SettingFieldString{setting.value}; - else if (setting.name == "structure_only") - res.structure_only = SettingFieldBool{setting.value}; - else if (setting.name == "async") - res.async = SettingFieldBool{setting.value}; - else - throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); - } - } - - return res; -} - -} diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h deleted file mode 100644 index 93414379531..00000000000 --- a/src/Backups/BackupSettings.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include -#include - - -namespace DB -{ -class ASTBackupQuery; - -/// Settings specified in the "SETTINGS" clause of a BACKUP query. -struct BackupSettings -{ - /// Base backup, if it's set an incremental backup will be built. - std::optional base_backup_info; - - /// Compression method and level for writing the backup (when applicable). - String compression_method; /// "" means default method - int compression_level = -1; /// -1 means default level - - /// Password used to encrypt the backup. - String password; - - /// If this is set to true then only create queries will be written to backup, - /// without the data of tables. - bool structure_only = false; - - bool async = false; - - static BackupSettings fromBackupQuery(const ASTBackupQuery & query); -}; - -} diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp index 8d6c57181f0..d4a163f8752 100644 --- a/src/Backups/BackupUtils.cpp +++ b/src/Backups/BackupUtils.cpp @@ -1,15 +1,17 @@ #include #include -#include +#include #include #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -26,6 +28,67 @@ namespace ErrorCodes namespace { + /// Helper to calculate paths inside a backup. + class PathsInBackup + { + public: + /// Returns the path to metadata in backup. 
+ static String getMetadataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) + { + if (table_name.first.empty() || table_name.second.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name and table name must not be empty"); + return getPathForShardAndReplica(shard_index, replica_index) + String{"metadata/"} + escapeForFileName(table_name.first) + "/" + + escapeForFileName(table_name.second) + ".sql"; + } + + static String getMetadataPath(const String & database_name, size_t shard_index, size_t replica_index) + { + if (database_name.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name must not be empty"); + return getPathForShardAndReplica(shard_index, replica_index) + String{"metadata/"} + escapeForFileName(database_name) + ".sql"; + } + + static String getMetadataPath(const IAST & create_query, size_t shard_index, size_t replica_index) + { + const auto & create = create_query.as(); + if (!create.table) + return getMetadataPath(create.getDatabase(), shard_index, replica_index); + if (create.temporary) + return getMetadataPath({DatabaseCatalog::TEMPORARY_DATABASE, create.getTable()}, shard_index, replica_index); + return getMetadataPath({create.getDatabase(), create.getTable()}, shard_index, replica_index); + } + + /// Returns the path to table's data in backup. + static String getDataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) + { + if (table_name.first.empty() || table_name.second.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name and table name must not be empty"); + assert(!table_name.first.empty() && !table_name.second.empty()); + return getPathForShardAndReplica(shard_index, replica_index) + String{"data/"} + escapeForFileName(table_name.first) + "/" + + escapeForFileName(table_name.second) + "/"; + } + + static String getDataPath(const IAST & create_query, size_t shard_index, size_t replica_index) + { + const auto & create = create_query.as(); + if (!create.table) + return {}; + if (create.temporary) + return getDataPath({DatabaseCatalog::TEMPORARY_DATABASE, create.getTable()}, shard_index, replica_index); + return getDataPath({create.getDatabase(), create.getTable()}, shard_index, replica_index); + } + + private: + static String getPathForShardAndReplica(size_t shard_index, size_t replica_index) + { + if (shard_index || replica_index) + return fmt::format("shard{}/replica{}/", shard_index, replica_index); + else + return ""; + } + }; + + using Kind = ASTBackupQuery::Kind; using Element = ASTBackupQuery::Element; using Elements = ASTBackupQuery::Elements; @@ -92,7 +155,7 @@ namespace auto data_backup = info.storage->backupData(context, info.partitions); if (!data_backup.empty()) { - String data_path = getDataPathInBackup(*info.create_query); + String data_path = PathsInBackup::getDataPath(*info.create_query, backup_settings.shard, backup_settings.replica); for (auto & [path_in_backup, backup_entry] : data_backup) res.emplace_back(data_path + path_in_backup, std::move(backup_entry)); } @@ -209,6 +272,7 @@ namespace ASTPtr query = ast; ::DB::renameInCreateQuery(query, context, renaming_settings); auto create_query = typeid_cast>(query); + replaceTableUUIDWithMacroInReplicatedTableDef(*create_query, create_query->uuid); create_query->uuid = UUIDHelpers::Nil; create_query->to_inner_uuid = UUIDHelpers::Nil; return create_query; @@ -219,10 +283,10 @@ namespace return (database_name == DatabaseCatalog::SYSTEM_DATABASE) || (database_name == 
DatabaseCatalog::TEMPORARY_DATABASE); } - static std::pair makeBackupEntryForMetadata(const IAST & create_query) + std::pair makeBackupEntryForMetadata(const IAST & create_query) const { auto metadata_entry = std::make_unique(serializeAST(create_query)); - String metadata_path = getMetadataPathInBackup(create_query); + String metadata_path = PathsInBackup::getMetadataPath(create_query, backup_settings.shard, backup_settings.replica); return {metadata_path, std::move(metadata_entry)}; } @@ -319,47 +383,4 @@ void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries backup->finalizeWriting(); } - -String getDataPathInBackup(const DatabaseAndTableName & table_name) -{ - if (table_name.first.empty() || table_name.second.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name and table name must not be empty"); - assert(!table_name.first.empty() && !table_name.second.empty()); - return String{"data/"} + escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second) + "/"; -} - -String getDataPathInBackup(const IAST & create_query) -{ - const auto & create = create_query.as(); - if (!create.table) - return {}; - if (create.temporary) - return getDataPathInBackup({DatabaseCatalog::TEMPORARY_DATABASE, create.getTable()}); - return getDataPathInBackup({create.getDatabase(), create.getTable()}); -} - -String getMetadataPathInBackup(const DatabaseAndTableName & table_name) -{ - if (table_name.first.empty() || table_name.second.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name and table name must not be empty"); - return String{"metadata/"} + escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second) + ".sql"; -} - -String getMetadataPathInBackup(const String & database_name) -{ - if (database_name.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Database name must not be empty"); - return String{"metadata/"} + escapeForFileName(database_name) + ".sql"; -} - -String getMetadataPathInBackup(const IAST & create_query) -{ - const auto & create = create_query.as(); - if (!create.table) - return getMetadataPathInBackup(create.getDatabase()); - if (create.temporary) - return getMetadataPathInBackup({DatabaseCatalog::TEMPORARY_DATABASE, create.getTable()}); - return getMetadataPathInBackup({create.getDatabase(), create.getTable()}); -} - } diff --git a/src/Backups/BackupUtils.h b/src/Backups/BackupUtils.h index d001d5a4bec..8251a9df643 100644 --- a/src/Backups/BackupUtils.h +++ b/src/Backups/BackupUtils.h @@ -6,6 +6,7 @@ namespace DB { class IBackup; +using BackupPtr = std::shared_ptr; using BackupMutablePtr = std::shared_ptr; class IBackupEntry; using BackupEntryPtr = std::unique_ptr; @@ -20,13 +21,4 @@ BackupEntries makeBackupEntries(const ContextPtr & context, const ASTBackupQuery /// Write backup entries to an opened backup. void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, size_t num_threads); -/// Returns the path to metadata in backup. -String getMetadataPathInBackup(const DatabaseAndTableName & table_name); -String getMetadataPathInBackup(const String & database_name); -String getMetadataPathInBackup(const IAST & create_query); - -/// Returns the path to table's data in backup. 
-String getDataPathInBackup(const DatabaseAndTableName & table_name); -String getDataPathInBackup(const IAST & create_query); - } diff --git a/src/Backups/Common/BackupSettings.cpp b/src/Backups/Common/BackupSettings.cpp new file mode 100644 index 00000000000..0722ef14972 --- /dev/null +++ b/src/Backups/Common/BackupSettings.cpp @@ -0,0 +1,72 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int UNKNOWN_SETTING; +} + +/// List of backup settings except base_backup_name. +#define LIST_OF_BACKUP_SETTINGS(M) \ + M(String, compression_method) \ + M(Int64, compression_level) \ + M(String, password) \ + M(Bool, structure_only) \ + M(Bool, async) \ + M(UInt64, shard) \ + M(UInt64, replica) \ + M(Bool, allow_storing_multiple_replicas) \ + M(Bool, internal) \ + M(String, coordination_zk_path) + + +BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query) +{ + BackupSettings res; + + if (query.base_backup_name) + res.base_backup_info = BackupInfo::fromAST(*query.base_backup_name); + + if (query.settings) + { + const auto & settings = query.settings->as().changes; + for (const auto & setting : settings) + { +#define GET_SETTINGS_FROM_BACKUP_QUERY_HELPER(TYPE, NAME) \ + if (setting.name == #NAME) \ + res.NAME = SettingField##TYPE{setting.value}; \ + else + + LIST_OF_BACKUP_SETTINGS(GET_SETTINGS_FROM_BACKUP_QUERY_HELPER) + throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); + } + } + + return res; +} + +void BackupSettings::copySettingsToBackupQuery(ASTBackupQuery & query) const +{ + query.base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr; + + auto query_settings = std::make_shared(); + query_settings->is_standalone = false; + + static const BackupSettings default_settings; + +#define SET_SETTINGS_IN_BACKUP_QUERY_HELPER(TYPE, NAME) \ + if (NAME != default_settings.NAME) \ + query_settings->changes.emplace_back(#NAME, static_cast(SettingField##TYPE{NAME})); + + LIST_OF_BACKUP_SETTINGS(SET_SETTINGS_IN_BACKUP_QUERY_HELPER) + + query.settings = query_settings; +} + +} diff --git a/src/Backups/Common/BackupSettings.h b/src/Backups/Common/BackupSettings.h new file mode 100644 index 00000000000..fd495d45bd6 --- /dev/null +++ b/src/Backups/Common/BackupSettings.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + + +namespace DB +{ +class ASTBackupQuery; + +/// Settings specified in the "SETTINGS" clause of a BACKUP query. +struct BackupSettings +{ + /// Base backup, if it's set an incremental backup will be built. + std::optional base_backup_info; + + /// Compression method and level for writing the backup (when applicable). + String compression_method; /// "" means default method + int compression_level = -1; /// -1 means default level + + /// Password used to encrypt the backup. + String password; + + /// If this is set to true then only create queries will be written to backup, + /// without the data of tables. + bool structure_only = false; + + /// Whether BACKUP command must return immediately without waiting until the backup is completed. + bool async = false; + + /// 1-based shard index to store in the backup. 0 means all shards. + /// Can only be used with BACKUP ON CLUSTER. + size_t shard = 0; + + /// 1-based replica index to store in the backup. 0 means all replicas (see also allow_storing_multiple_replicas). + /// Can only be used with BACKUP ON CLUSTER. + size_t replica = 0; + + /// Allows storing in the backup of multiple replicas. 
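
// An aside: a self-contained sketch of the X-macro pattern used by
// LIST_OF_BACKUP_SETTINGS above. One list of (type, name) pairs expands both
// into the parser's if/else chain and into the serializer, so adding a setting
// is a one-line change. The types and names here are simplified stand-ins.
#include <cassert>
#include <string>

struct DemoSettings
{
    int compression_level = -1;
    bool internal = false;
};

#define LIST_OF_DEMO_SETTINGS(M) \
    M(int, compression_level) \
    M(bool, internal)

bool applyDemoSetting(DemoSettings & settings, const std::string & name, int value)
{
#define APPLY_DEMO_SETTING(TYPE, NAME) \
    if (name == #NAME) { settings.NAME = static_cast<TYPE>(value); return true; } else

    LIST_OF_DEMO_SETTINGS(APPLY_DEMO_SETTING)
        return false; /// the trailing `else` of the last expansion lands here (UNKNOWN_SETTING in the real code)
#undef APPLY_DEMO_SETTING
}

int main()
{
    DemoSettings settings;
    assert(applyDemoSetting(settings, "compression_level", 3) && settings.compression_level == 3);
    assert(!applyDemoSetting(settings, "no_such_setting", 1));
}
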
+ bool allow_storing_multiple_replicas = false; + + /// Internal, should not be specified by user. + /// Whether this backup is a part of a distributed backup created by BACKUP ON CLUSTER. + bool internal = false; + + /// Internal, should not be specified by user. + /// Path in Zookeeper used to coordinate a distributed backup created by BACKUP ON CLUSTER. + String coordination_zk_path; + + static BackupSettings fromBackupQuery(const ASTBackupQuery & query); + void copySettingsToBackupQuery(ASTBackupQuery & query) const; +}; + +} diff --git a/src/Backups/Common/CMakeLists.txt b/src/Backups/Common/CMakeLists.txt new file mode 100644 index 00000000000..1b65d57b926 --- /dev/null +++ b/src/Backups/Common/CMakeLists.txt @@ -0,0 +1,5 @@ +include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") + +add_headers_and_sources(clickhouse_common_backups .) +add_library(clickhouse_common_backups ${clickhouse_common_backups_headers} ${clickhouse_common_backups_sources}) +target_link_libraries(clickhouse_common_backups PUBLIC clickhouse_common_io) diff --git a/src/Backups/Common/RestoreSettings.cpp b/src/Backups/Common/RestoreSettings.cpp new file mode 100644 index 00000000000..0b046a41336 --- /dev/null +++ b/src/Backups/Common/RestoreSettings.cpp @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int UNKNOWN_SETTING; + extern const int CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE; +} + +namespace +{ + struct SettingFieldRestoreTableCreationMode + { + RestoreTableCreationMode value; + + explicit SettingFieldRestoreTableCreationMode(const Field & field) + { + if (field.getType() == Field::Types::String) + { + String str = field.get(); + if (str == "1" || boost::iequals(str, "true")) + value = RestoreTableCreationMode::kCreate; + else if (str == "0" || boost::iequals(str, "false")) + value = RestoreTableCreationMode::kMustExist; + else if (boost::iequals(str, "if not exists")) + value = RestoreTableCreationMode::kCreateIfNotExists; + else throw Exception("Cannot parse creation mode from string '" + str + "'", + ErrorCodes::CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE); + } + else + { + if (applyVisitor(FieldVisitorConvertToNumber(), field)) + value = RestoreTableCreationMode::kCreate; + else + value = RestoreTableCreationMode::kMustExist; + } + } + + explicit operator Field() const + { + switch (value) + { + case RestoreTableCreationMode::kCreate: return Field{true}; + case RestoreTableCreationMode::kMustExist: return Field{false}; + case RestoreTableCreationMode::kCreateIfNotExists: return Field{"if not exists"}; + } + } + + operator RestoreTableCreationMode() const { return value; } + }; + + using SettingFieldRestoreDatabaseCreationMode = SettingFieldRestoreTableCreationMode; +} + +/// List of restore settings except base_backup_name. 
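
// An aside: the tri-state parsing implemented by SettingFieldRestoreTableCreationMode
// above, reduced to a self-contained sketch. Unlike the real code it is
// case-sensitive and accepts only strings (the real field also converts numeric
// values via FieldVisitorConvertToNumber and compares with boost::iequals).
#include <cassert>
#include <stdexcept>
#include <string>

enum class DemoCreationMode { Create, MustExist, CreateIfNotExists };

DemoCreationMode parseDemoCreationMode(const std::string & str)
{
    if (str == "1" || str == "true")
        return DemoCreationMode::Create;
    if (str == "0" || str == "false")
        return DemoCreationMode::MustExist;
    if (str == "if not exists")
        return DemoCreationMode::CreateIfNotExists;
    throw std::invalid_argument("cannot parse creation mode: " + str); /// CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE in the real code
}

int main()
{
    assert(parseDemoCreationMode("if not exists") == DemoCreationMode::CreateIfNotExists);
    assert(parseDemoCreationMode("false") == DemoCreationMode::MustExist);
}
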
+#define LIST_OF_RESTORE_SETTINGS(M) \
+    M(String, password) \
+    M(Bool, structure_only) \
+    M(RestoreTableCreationMode, create_table) \
+    M(RestoreDatabaseCreationMode, create_database) \
+    M(Bool, allow_different_table_def) \
+    M(Bool, allow_different_database_def) \
+    M(Bool, async) \
+    M(UInt64, shard) \
+    M(UInt64, replica) \
+    M(UInt64, shard_in_backup) \
+    M(UInt64, replica_in_backup) \
+    M(Bool, internal) \
+    M(String, coordination_zk_path)
+
+RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query)
+{
+    RestoreSettings res;
+
+    if (query.base_backup_name)
+        res.base_backup_info = BackupInfo::fromAST(*query.base_backup_name);
+
+    if (query.settings)
+    {
+        const auto & settings = query.settings->as().changes;
+        for (const auto & setting : settings)
+        {
+#define GET_SETTINGS_FROM_RESTORE_QUERY_HELPER(TYPE, NAME) \
+    if (setting.name == #NAME) \
+        res.NAME = SettingField##TYPE{setting.value}; \
+    else
+
+            LIST_OF_RESTORE_SETTINGS(GET_SETTINGS_FROM_RESTORE_QUERY_HELPER)
+            throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name);
+        }
+    }
+
+    return res;
+}
+
+void RestoreSettings::copySettingsToRestoreQuery(ASTBackupQuery & query) const
+{
+    query.base_backup_name = base_backup_info ? base_backup_info->toAST() : nullptr;
+
+    auto query_settings = std::make_shared();
+    query_settings->is_standalone = false;
+
+    static const RestoreSettings default_settings;
+
+#define SET_SETTINGS_IN_RESTORE_QUERY_HELPER(TYPE, NAME) \
+    if (NAME != default_settings.NAME) \
+        query_settings->changes.emplace_back(#NAME, static_cast(SettingField##TYPE{NAME}));
+
+    LIST_OF_RESTORE_SETTINGS(SET_SETTINGS_IN_RESTORE_QUERY_HELPER)
+
+    query.settings = query_settings;
+}
+
+}
diff --git a/src/Backups/RestoreSettings.h b/src/Backups/Common/RestoreSettings.h
similarity index 64%
rename from src/Backups/RestoreSettings.h
rename to src/Backups/Common/RestoreSettings.h
index 8ae247e7ba6..474f5d75420 100644
--- a/src/Backups/RestoreSettings.h
+++ b/src/Backups/Common/RestoreSettings.h
@@ -58,9 +58,36 @@ struct RestoreSettings : public StorageRestoreSettings
     /// Set `allow_different_database_def` to true to skip this check.
     bool allow_different_database_def = false;
 
+    /// Whether the RESTORE command must return immediately without waiting until the restore is completed.
     bool async = false;
 
+    /// 1-based shard index to restore from the backup. 0 means all shards.
+    /// Can only be used with RESTORE ON CLUSTER.
+    size_t shard = 0;
+
+    /// 1-based replica index to restore from the backup. 0 means all replicas.
+    /// Can only be used with RESTORE ON CLUSTER.
+    size_t replica = 0;
+
+    /// 1-based index of a shard stored in the backup to get data from.
+    /// By default it's 0: if the backup contains only one shard, that shard is used;
+    /// otherwise it means the same as `shard`.
+    size_t shard_in_backup = 0;
+
+    /// 1-based index of a replica stored in the backup to get data from.
+    /// By default it's 0: if the backup contains only one replica for the current shard, that replica is used;
+    /// otherwise it means the same as `replica`.
+    size_t replica_in_backup = 0;
+
+    /// Internal, should not be specified by user.
+    bool internal = false;
+
+    /// Internal, should not be specified by user.
+    /// Path in ZooKeeper used to coordinate the restoring process while executing RESTORE ON CLUSTER.
+ String coordination_zk_path; + static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query); + void copySettingsToRestoreQuery(ASTBackupQuery & query) const; }; } diff --git a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp new file mode 100644 index 00000000000..64cfea8c6a6 --- /dev/null +++ b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + void setDatabaseInElements(ASTBackupQuery::Elements & elements, const String & new_database) + { + for (auto & element : elements) + { + if (element.type == ASTBackupQuery::TABLE) + { + if (element.name.first.empty() && !element.name.second.empty() && !element.name_is_in_temp_db) + element.name.first = new_database; + if (element.new_name.first.empty() && !element.name.second.empty() && !element.name_is_in_temp_db) + element.new_name.first = new_database; + } + } + } +} + +std::shared_ptr +rewriteBackupQueryWithoutOnCluster(const ASTBackupQuery & backup_query, const WithoutOnClusterASTRewriteParams & params) +{ + auto backup_settings = BackupSettings::fromBackupQuery(backup_query); + backup_settings.internal = true; + backup_settings.async = false; + backup_settings.shard = params.shard_index; + backup_settings.replica = params.replica_index; + auto new_query = std::static_pointer_cast(backup_query.clone()); + new_query->cluster.clear(); + backup_settings.copySettingsToBackupQuery(*new_query); + setDatabaseInElements(new_query->elements, params.default_database); + return new_query; +} + + +std::shared_ptr +rewriteRestoreQueryWithoutOnCluster(const ASTBackupQuery & restore_query, const WithoutOnClusterASTRewriteParams & params) +{ + auto restore_settings = RestoreSettings::fromRestoreQuery(restore_query); + restore_settings.internal = true; + restore_settings.async = false; + restore_settings.shard = params.shard_index; + restore_settings.replica = params.replica_index; + auto new_query = std::static_pointer_cast(restore_query.clone()); + new_query->cluster.clear(); + restore_settings.copySettingsToRestoreQuery(*new_query); + setDatabaseInElements(new_query->elements, params.default_database); + return new_query; +} + +} diff --git a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h new file mode 100644 index 00000000000..720a397cdc1 --- /dev/null +++ b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h @@ -0,0 +1,19 @@ +#pragma once + +#include + + +namespace DB +{ +class ASTBackupQuery; +struct WithoutOnClusterASTRewriteParams; + +/// Rewrites elements of BACKUP-ON-CLUSTER query after receiving it on shards or replica. +std::shared_ptr +rewriteBackupQueryWithoutOnCluster(const ASTBackupQuery & backup_query, const WithoutOnClusterASTRewriteParams & params); + +/// Rewrites elements of RESTORE-ON-CLUSTER query after receiving it on shards or replica. +std::shared_ptr +rewriteRestoreQueryWithoutOnCluster(const ASTBackupQuery & restore_query, const WithoutOnClusterASTRewriteParams & params); + +} diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 51f075e72f1..8a32ebde268 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -31,7 +31,7 @@ public: UInt64 pos_in_archive = static_cast(-1); }; - virtual ~IBackupCoordination() { } + virtual ~IBackupCoordination() = default; /// Adds file information. 
 /// If specified checksum+size are new for this IBackupContentsInfo the function sets `is_data_file_required`.
@@ -46,20 +46,20 @@ public:
     /// Updates some fields (currently only `archive_suffix`) of a stored file's information.
     virtual void updateFileInfo(const FileInfo & file_info) = 0;
 
-    virtual std::vector getAllFileInfos() = 0;
-    virtual Strings listFiles(const String & prefix, const String & terminator) = 0;
+    virtual std::vector getAllFileInfos() const = 0;
+    virtual Strings listFiles(const String & prefix, const String & terminator) const = 0;
 
     using SizeAndChecksum = std::pair;
 
-    virtual std::optional getFileInfo(const String & file_name) = 0;
-    virtual std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) = 0;
-    virtual std::optional getFileSizeAndChecksum(const String & file_name) = 0;
+    virtual std::optional getFileInfo(const String & file_name) const = 0;
+    virtual std::optional getFileInfo(const SizeAndChecksum & size_and_checksum) const = 0;
+    virtual std::optional getFileSizeAndChecksum(const String & file_name) const = 0;
 
     /// Generates a new archive suffix, e.g. "001", "002", "003", ...
     virtual String getNextArchiveSuffix() = 0;
 
     /// Returns the list of all the archive suffixes which were generated.
-    virtual Strings getAllArchiveSuffixes() = 0;
+    virtual Strings getAllArchiveSuffixes() const = 0;
 
     /// Removes remotely stored information.
     virtual void drop() {}
diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h
new file mode 100644
index 00000000000..1c796137e88
--- /dev/null
+++ b/src/Backups/IRestoreCoordination.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+
+
+namespace DB
+{
+
+/// Coordinates the process of restoring from a backup, possibly across several replicas (see RESTORE ON CLUSTER).
+class IRestoreCoordination
+{
+public:
+    virtual ~IRestoreCoordination() = default;
+
+    /// Sets or gets path in the backup for a specified path in ZooKeeper.
+    virtual void setOrGetPathInBackupForZkPath(const String & zk_path_, String & path_in_backup_) = 0;
+
+    /// Marks that this replica is going to restore a partition in a replicated table or a table in a replicated database.
+    /// Must be called first to prevent other replicas from doing the same in parallel.
+    virtual bool acquireZkPathAndName(const String & zk_path_, const String & name_) = 0;
+
+    enum Result
+    {
+        SUCCEEDED,
+        FAILED,
+    };
+
+    /// Sets the result for an acquired path and name.
+    virtual void setResultForZkPathAndName(const String & zk_path_, const String & name_, Result res_) = 0;
+
+    /// Waits for the result set by the replica which acquired the specified path and name.
+    /// Returns false if the timeout expires.
+    virtual bool getResultForZkPathAndName(const String & zk_path_, const String & name_, Result & res_, std::chrono::milliseconds timeout_) const = 0;
+
+    /// Removes remotely stored information.
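
// An aside: the intended call sequence for the interface above, as a sketch.
// `createOnOneReplicaSketch` is a hypothetical helper: exactly one replica wins
// acquireZkPathAndName() and creates the table; the others wait for the
// winner's result instead of racing to create it themselves.
#include <chrono>
#include <stdexcept>

void createOnOneReplicaSketch(IRestoreCoordination & coordination, const String & zk_path, const String & table_name)
{
    if (coordination.acquireZkPathAndName(zk_path, table_name))
    {
        /// ... this replica creates the table, then publishes the outcome ...
        coordination.setResultForZkPathAndName(zk_path, table_name, IRestoreCoordination::SUCCEEDED);
        return;
    }

    IRestoreCoordination::Result result;
    if (!coordination.getResultForZkPathAndName(zk_path, table_name, result, std::chrono::seconds{10}))
        throw std::runtime_error("timed out waiting for another replica");
    if (result == IRestoreCoordination::FAILED)
        throw std::runtime_error("another replica failed to create the table");
}
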
+ virtual void drop() {} +}; + +} diff --git a/src/Backups/RestoreCoordinationDistributed.cpp b/src/Backups/RestoreCoordinationDistributed.cpp new file mode 100644 index 00000000000..efa3e21e94a --- /dev/null +++ b/src/Backups/RestoreCoordinationDistributed.cpp @@ -0,0 +1,148 @@ +#include +#include +#include + + +namespace DB +{ + +RestoreCoordinationDistributed::RestoreCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) + : zookeeper_path(zookeeper_path_), get_zookeeper(get_zookeeper_) +{ + createRootNodes(); +} + +RestoreCoordinationDistributed::~RestoreCoordinationDistributed() = default; + +void RestoreCoordinationDistributed::createRootNodes() +{ + auto zookeeper = get_zookeeper(); + zookeeper->createAncestors(zookeeper_path); + zookeeper->createIfNotExists(zookeeper_path, ""); + zookeeper->createIfNotExists(zookeeper_path + "/paths_in_backup", ""); + zookeeper->createIfNotExists(zookeeper_path + "/acquired", ""); +} + +void RestoreCoordinationDistributed::removeAllNodes() +{ + auto zookeeper = get_zookeeper(); + zookeeper->removeRecursive(zookeeper_path); +} + +void RestoreCoordinationDistributed::setOrGetPathInBackupForZkPath(const String & zk_path_, String & path_in_backup_) +{ + { + std::lock_guard lock{mutex}; + auto it = paths_in_backup_by_zk_path.find(zk_path_); + if (it != paths_in_backup_by_zk_path.end()) + { + path_in_backup_ = it->second; + return; + } + } + + auto zookeeper = get_zookeeper(); + String combined_path = zookeeper_path + "/paths_in_backup/" + escapeForFileName(zk_path_); + auto code = zookeeper->tryCreate(combined_path, path_in_backup_, zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException(code, combined_path); + + if (code == Coordination::Error::ZNODEEXISTS) + path_in_backup_ = zookeeper->get(combined_path); + + { + std::lock_guard lock{mutex}; + paths_in_backup_by_zk_path[zk_path_] = path_in_backup_; + } +} + +bool RestoreCoordinationDistributed::acquireZkPathAndName(const String & zk_path_, const String & name_) +{ + std::pair key{zk_path_, name_}; + + { + std::lock_guard lock{mutex}; + if (acquired.contains(key)) + return true; + } + + auto zookeeper = get_zookeeper(); + String combined_path = zookeeper_path + "/acquired/" + escapeForFileName(zk_path_) + "|" + escapeForFileName(name_); + auto code = zookeeper->tryCreate(combined_path, "", zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException(code, combined_path); + + if (code == Coordination::Error::ZNODEEXISTS) + return false; + + { + std::lock_guard lock{mutex}; + acquired.emplace(key, std::nullopt); + return true; + } +} + +void RestoreCoordinationDistributed::setResultForZkPathAndName(const String & zk_path_, const String & name_, Result res_) +{ + auto zookeeper = get_zookeeper(); + String combined_path = zookeeper_path + "/acquired/" + escapeForFileName(zk_path_) + "|" + escapeForFileName(name_); + zookeeper->set(combined_path, (res_ == Result::SUCCEEDED) ? 
"1" : "0"); + + { + std::lock_guard lock{mutex}; + acquired[std::pair{zk_path_, name_}] = res_; + } +} + +bool RestoreCoordinationDistributed::getResultForZkPathAndName(const String & zk_path_, const String & name_, Result & res_, std::chrono::milliseconds timeout_) const +{ + { + std::lock_guard lock{mutex}; + auto value = acquired[std::pair{zk_path_, name_}]; + if (value) + { + res_ = *value; + return true; + } + } + + auto zookeeper = get_zookeeper(); + String combined_path = zookeeper_path + "/acquired/" + escapeForFileName(zk_path_) + "|" + escapeForFileName(name_); + + std::atomic changed = false; + std::condition_variable changed_condvar; + const auto watch = [&changed, &changed_condvar, zk_path_, name_](const Coordination::WatchResponse &) + { + changed = true; + changed_condvar.notify_one(); + }; + + String res_str = zookeeper->getWatch(combined_path, nullptr, watch); + if (res_str.empty()) + { + std::mutex dummy_mutex; + std::unique_lock lock{dummy_mutex}; + changed_condvar.wait_for(lock, timeout_, [&changed] { return changed.load(); }); + res_str = zookeeper->get(combined_path); + } + + if (res_str.empty()) + return false; + + res_ = (res_str == "1") ? Result::SUCCEEDED : Result::FAILED; + + { + std::lock_guard lock{mutex}; + acquired[std::pair{zk_path_, name_}] = res_; + } + + return true; + +} + +void RestoreCoordinationDistributed::drop() +{ + removeAllNodes(); +} + +} diff --git a/src/Backups/RestoreCoordinationDistributed.h b/src/Backups/RestoreCoordinationDistributed.h new file mode 100644 index 00000000000..4f1da10a8b5 --- /dev/null +++ b/src/Backups/RestoreCoordinationDistributed.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +/// Stores restore temporary information in Zookeeper, used to perform RESTORE ON CLUSTER. 
+class RestoreCoordinationDistributed : public IRestoreCoordination +{ +public: + RestoreCoordinationDistributed(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_); + ~RestoreCoordinationDistributed() override; + + void setOrGetPathInBackupForZkPath(const String & zk_path_, String & path_in_backup_) override; + + bool acquireZkPathAndName(const String & zk_path_, const String & name_) override; + void setResultForZkPathAndName(const String & zk_path_, const String & name_, Result res_) override; + bool getResultForZkPathAndName(const String & zk_path_, const String & name_, Result & res_, std::chrono::milliseconds timeout_) const override; + + void drop() override; + +private: + void createRootNodes(); + void removeAllNodes(); + + const String zookeeper_path; + const zkutil::GetZooKeeper get_zookeeper; + mutable std::mutex mutex; + mutable std::map, std::optional> acquired; + std::unordered_map paths_in_backup_by_zk_path; +}; + +} diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp new file mode 100644 index 00000000000..62b5133bf62 --- /dev/null +++ b/src/Backups/RestoreCoordinationLocal.cpp @@ -0,0 +1,73 @@ +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + + +RestoreCoordinationLocal::RestoreCoordinationLocal() = default; +RestoreCoordinationLocal::~RestoreCoordinationLocal() = default; + +void RestoreCoordinationLocal::setOrGetPathInBackupForZkPath(const String & zk_path_, String & path_in_backup_) +{ + std::lock_guard lock{mutex}; + auto [it, inserted] = paths_in_backup_by_zk_path.try_emplace(zk_path_, path_in_backup_); + if (!inserted) + path_in_backup_ = it->second; +} + +bool RestoreCoordinationLocal::acquireZkPathAndName(const String & path_, const String & name_) +{ + std::lock_guard lock{mutex}; + acquired.emplace(std::pair{path_, name_}, std::nullopt); + return true; +} + +void RestoreCoordinationLocal::setResultForZkPathAndName(const String & zk_path_, const String & name_, Result res_) +{ + std::lock_guard lock{mutex}; + getResultRef(zk_path_, name_) = res_; + result_changed.notify_all(); +} + +bool RestoreCoordinationLocal::getResultForZkPathAndName(const String & zk_path_, const String & name_, Result & res_, std::chrono::milliseconds timeout_) const +{ + std::unique_lock lock{mutex}; + auto value = getResultRef(zk_path_, name_); + if (value) + { + res_ = *value; + return true; + } + + bool waited = result_changed.wait_for(lock, timeout_, [this, zk_path_, name_] { return getResultRef(zk_path_, name_).has_value(); }); + if (!waited) + return false; + + res_ = *getResultRef(zk_path_, name_); + return true; +} + +std::optional & RestoreCoordinationLocal::getResultRef(const String & zk_path_, const String & name_) +{ + auto it = acquired.find(std::pair{zk_path_, name_}); + if (it == acquired.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path ({}, {}) is not acquired", zk_path_, name_); + return it->second; +} + +const std::optional & RestoreCoordinationLocal::getResultRef(const String & zk_path_, const String & name_) const +{ + auto it = acquired.find(std::pair{zk_path_, name_}); + if (it == acquired.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path ({}, {}) is not acquired", zk_path_, name_); + return it->second; +} + +} diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h new file mode 100644 index 00000000000..66ec53399f5 --- /dev/null +++ b/src/Backups/RestoreCoordinationLocal.h @@ -0,0 +1,33 @@ 
+#pragma once + +#include +#include +#include + + +namespace DB +{ + +class RestoreCoordinationLocal : public IRestoreCoordination +{ +public: + RestoreCoordinationLocal(); + ~RestoreCoordinationLocal() override; + + void setOrGetPathInBackupForZkPath(const String & zk_path_, String & path_in_backup_) override; + + bool acquireZkPathAndName(const String & zk_path_, const String & name_) override; + void setResultForZkPathAndName(const String & zk_path_, const String & name_, Result res_) override; + bool getResultForZkPathAndName(const String & zk_path_, const String & name_, Result & res_, std::chrono::milliseconds timeout_) const override; + +private: + std::optional & getResultRef(const String & zk_path_, const String & name_); + const std::optional & getResultRef(const String & zk_path_, const String & name_) const; + + mutable std::mutex mutex; + std::unordered_map paths_in_backup_by_zk_path; + std::map, std::optional> acquired; + mutable std::condition_variable result_changed; +}; + +} diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp deleted file mode 100644 index 8e5ba000da1..00000000000 --- a/src/Backups/RestoreSettings.cpp +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int UNKNOWN_SETTING; - extern const int CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE; -} - -namespace -{ - RestoreTableCreationMode parseRestoreTableCreationMode(const Field & field) - { - if (field.getType() == Field::Types::String) - { - String str = field.get(); - if (str == "1" || boost::iequals(str, "true")) - return RestoreTableCreationMode::kCreate; - if (str == "0" || boost::iequals(str, "false")) - return RestoreTableCreationMode::kMustExist; - if (boost::iequals(str, "if not exists")) - return RestoreTableCreationMode::kCreateIfNotExists; - throw Exception("Cannot parse creation mode from string '" + str + "'", - ErrorCodes::CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE); - } - if (applyVisitor(FieldVisitorConvertToNumber(), field)) - return RestoreTableCreationMode::kCreate; - else - return RestoreTableCreationMode::kMustExist; - } -} - -RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query) -{ - RestoreSettings res; - - if (query.base_backup_name) - res.base_backup_info = BackupInfo::fromAST(*query.base_backup_name); - - if (query.settings) - { - const auto & settings = query.settings->as().changes; - for (const auto & setting : settings) - { - if (setting.name == "password") - res.password = SettingFieldString{setting.value}; - else if (setting.name == "structure_only") - res.structure_only = SettingFieldBool{setting.value}; - else if (setting.name == "create_table") - res.create_table = parseRestoreTableCreationMode(setting.value); - else if (setting.name == "create_database") - res.create_database = parseRestoreTableCreationMode(setting.value); - else if (setting.name == "allow_different_table_def") - res.allow_different_table_def = SettingFieldBool{setting.value}; - else if (setting.name == "allow_different_database_def") - res.allow_different_database_def = SettingFieldBool{setting.value}; - else if (setting.name == "async") - res.async = SettingFieldBool{setting.value}; - else - throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); - } - } - - return res; -} - -} diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index 9a976c4753a..d8f8d177c2a 100644 --- a/src/Backups/RestoreUtils.cpp +++ 
b/src/Backups/RestoreUtils.cpp @@ -1,25 +1,33 @@ #include #include +#include #include #include #include #include #include -#include +#include #include #include #include +#include #include #include #include #include +#include #include #include #include #include +#include +#include #include +#include #include +#include + namespace fs = std::filesystem; @@ -29,10 +37,145 @@ namespace ErrorCodes { extern const int CANNOT_RESTORE_TABLE; extern const int CANNOT_RESTORE_DATABASE; + extern const int BACKUP_ENTRY_NOT_FOUND; } namespace { + class PathsInBackup + { + public: + PathsInBackup(const IBackup & backup_) : backup(backup_) {} + + std::vector getShards() const + { + std::vector res; + constexpr std::string_view shard_prefix = "shard"; + for (const String & shard_dir : backup.listFiles("")) + { + if (shard_dir.starts_with(shard_prefix)) + { + size_t shard_index = parse(shard_dir.substr(shard_prefix.size())); + res.push_back(shard_index); + } + } + if (res.empty()) + res.push_back(1); + return res; + } + + std::vector getReplicas(size_t shard_index) const + { + std::vector res; + constexpr std::string_view replica_prefix = "replica"; + for (const String & replica_dir : backup.listFiles(fmt::format("shard{}/", shard_index))) + { + if (replica_dir.starts_with(replica_prefix)) + { + size_t replica_index = parse(replica_dir.substr(replica_prefix.size())); + res.push_back(replica_index); + } + } + if (res.empty()) + res.push_back(1); + return res; + } + + std::vector getDatabases(size_t shard_index, size_t replica_index) const + { + std::vector res; + + insertAtEnd(res, backup.listFiles(fmt::format("shard{}/replica{}/metadata/", shard_index, replica_index))); + insertAtEnd(res, backup.listFiles(fmt::format("shard{}/metadata/", shard_index))); + insertAtEnd(res, backup.listFiles(fmt::format("metadata/"))); + + boost::range::remove_erase_if( + res, + [](String & str) + { + if (str.ends_with(".sql")) + { + str.resize(str.length() - strlen(".sql")); + str = unescapeForFileName(str); + return false; + } + return true; + }); + + std::sort(res.begin(), res.end()); + res.erase(std::unique(res.begin(), res.end()), res.end()); + return res; + } + + std::vector getTables(const String & database_name, size_t shard_index, size_t replica_index) const + { + std::vector res; + + String escaped_database_name = escapeForFileName(database_name); + insertAtEnd(res, backup.listFiles(fmt::format("shard{}/replica{}/metadata/{}/", shard_index, replica_index, escaped_database_name))); + insertAtEnd(res, backup.listFiles(fmt::format("shard{}/metadata/{}/", shard_index, escaped_database_name))); + insertAtEnd(res, backup.listFiles(fmt::format("metadata/{}/", escaped_database_name))); + + boost::range::remove_erase_if( + res, + [](String & str) + { + if (str.ends_with(".sql")) + { + str.resize(str.length() - strlen(".sql")); + str = unescapeForFileName(str); + return false; + } + return true; + }); + + std::sort(res.begin(), res.end()); + res.erase(std::unique(res.begin(), res.end()), res.end()); + return res; + } + + /// Returns the path to metadata in backup. 
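
// An aside: a self-contained sketch of the lookup order implemented by the
// getMetadataPath overloads below. Replica-specific metadata wins, then
// shard-wide, then the backup root; the root path is returned even when absent
// so that the caller can report a uniform "no such table in backup" error.
// `fileExists` stands in for IBackup::fileExists.
#include <cstddef>
#include <functional>
#include <string>
#include <vector>
#include <fmt/format.h>

std::string resolveMetadataPathSketch(
    const std::function<bool(const std::string &)> & fileExists,
    const std::string & escaped_name, std::size_t shard, std::size_t replica)
{
    const std::vector<std::string> candidates{
        fmt::format("shard{}/replica{}/metadata/{}.sql", shard, replica, escaped_name),
        fmt::format("shard{}/metadata/{}.sql", shard, escaped_name),
        fmt::format("metadata/{}.sql", escaped_name)};

    for (const auto & path : candidates)
        if (fileExists(path))
            return path;
    return candidates.back();
}
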
+ String getMetadataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) const + { + String escaped_table_name = escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second); + String path1 = fmt::format("shard{}/replica{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name); + if (backup.fileExists(path1)) + return path1; + String path2 = fmt::format("shard{}/metadata/{}.sql", shard_index, escaped_table_name); + if (backup.fileExists(path2)) + return path2; + String path3 = fmt::format("metadata/{}.sql", escaped_table_name); + return path3; + } + + String getMetadataPath(const String & database_name, size_t shard_index, size_t replica_index) const + { + String escaped_database_name = escapeForFileName(database_name); + String path1 = fmt::format("shard{}/replica{}/metadata/{}.sql", shard_index, replica_index, escaped_database_name); + if (backup.fileExists(path1)) + return path1; + String path2 = fmt::format("shard{}/metadata/{}.sql", shard_index, escaped_database_name); + if (backup.fileExists(path2)) + return path2; + String path3 = fmt::format("metadata/{}.sql", escaped_database_name); + return path3; + } + + String getDataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) const + { + String escaped_table_name = escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second); + if (backup.fileExists(fmt::format("shard{}/replica{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name))) + return fmt::format("shard{}/replica{}/data/{}/", shard_index, replica_index, escaped_table_name); + if (backup.fileExists(fmt::format("shard{}/metadata/{}.sql", shard_index, escaped_table_name))) + return fmt::format("shard{}/data/{}/", shard_index, escaped_table_name); + return fmt::format("data/{}/", escaped_table_name); + } + + private: + const IBackup & backup; + }; + + using Kind = ASTBackupQuery::Kind; using Element = ASTBackupQuery::Element; using Elements = ASTBackupQuery::Elements; @@ -127,10 +270,11 @@ namespace const ASTs & partitions_, const BackupPtr & backup_, const DatabaseAndTableName & table_name_in_backup_, - const RestoreSettingsPtr & restore_settings_) + const RestoreSettingsPtr & restore_settings_, + const std::shared_ptr & restore_coordination_) : context(context_), create_query(typeid_cast>(create_query_)), partitions(partitions_), backup(backup_), table_name_in_backup(table_name_in_backup_), - restore_settings(restore_settings_) + restore_settings(restore_settings_), restore_coordination(restore_coordination_) { table_name = DatabaseAndTableName{create_query->getDatabase(), create_query->getTable()}; if (create_query->temporary) @@ -139,9 +283,28 @@ namespace RestoreTasks run() override { - createStorage(); - getStorage(); - checkStorageCreateQuery(); + if (acquireTableCreation()) + { + try + { + createStorage(); + getStorage(); + checkStorageCreateQuery(); + setTableCreationResult(IRestoreCoordination::Result::SUCCEEDED); + } + catch (...) 
+            {
+                setTableCreationResult(IRestoreCoordination::Result::FAILED);
+                throw;
+            }
+        }
+        else
+        {
+            waitForTableCreation();
+            getStorage();
+            checkStorageCreateQuery();
+        }
+
         RestoreTasks tasks;
         if (auto task = insertData())
             tasks.push_back(std::move(task));
@@ -151,6 +314,67 @@ namespace
         bool isSequential() const override { return true; }
 
     private:
+        bool acquireTableCreation()
+        {
+            if (restore_settings->create_table == RestoreTableCreationMode::kMustExist)
+                return true;
+
+            auto replicated_db
+                = typeid_cast>(DatabaseCatalog::instance().getDatabase(table_name.first));
+            if (!replicated_db)
+                return true;
+
+            use_coordination_for_table_creation = true;
+            replicated_database_zookeeper_path = replicated_db->getZooKeeperPath();
+            if (restore_coordination->acquireZkPathAndName(replicated_database_zookeeper_path, table_name.second))
+                return true;
+
+            return false;
+        }
+
+        void setTableCreationResult(IRestoreCoordination::Result res)
+        {
+            if (use_coordination_for_table_creation)
+                restore_coordination->setResultForZkPathAndName(replicated_database_zookeeper_path, table_name.second, res);
+        }
+
+        void waitForTableCreation()
+        {
+            if (!use_coordination_for_table_creation)
+                return;
+
+            IRestoreCoordination::Result res;
+            const auto & config = context->getConfigRef();
+            auto timeout = std::chrono::seconds(config.getUInt("backups.create_table_in_replicated_db_timeout", 10));
+            auto start_time = std::chrono::steady_clock::now();
+
+            if (!restore_coordination->getResultForZkPathAndName(replicated_database_zookeeper_path, table_name.second, res, timeout))
+                throw Exception(
+                    ErrorCodes::CANNOT_RESTORE_TABLE,
+                    "Waited too long ({}) for the creation of {} on another replica",
+                    to_string(timeout),
+                    formatTableNameOrTemporaryTableName(table_name));
+
+            if (res == IRestoreCoordination::Result::FAILED)
+                throw Exception(
+                    ErrorCodes::CANNOT_RESTORE_TABLE,
+                    "Failed to create {} on another replica",
+                    formatTableNameOrTemporaryTableName(table_name));
+
+            while (std::chrono::steady_clock::now() - start_time < timeout)
+            {
+                if (DatabaseCatalog::instance().tryGetDatabaseAndTable({table_name.first, table_name.second}, context).second)
+                    return;
+                sleepForMilliseconds(50);
+            }
+
+            throw Exception(
+                ErrorCodes::CANNOT_RESTORE_TABLE,
+                "Waited too long ({}) for the creation of {} on another replica",
+                to_string(timeout),
+                formatTableNameOrTemporaryTableName(table_name));
+        }
+
         void createStorage()
         {
             if (restore_settings->create_table == RestoreTableCreationMode::kMustExist)
@@ -206,7 +430,7 @@ namespace
             if (restore_settings->structure_only)
                 return false;
 
-            data_path_in_backup = getDataPathInBackup(table_name_in_backup);
+            data_path_in_backup = PathsInBackup{*backup}.getDataPath(table_name_in_backup, restore_settings->shard_in_backup, restore_settings->replica_in_backup);
             if (backup->listFiles(data_path_in_backup).empty())
                 return false;
 
@@ -234,7 +458,7 @@
         {
             if (!hasData())
                 return {};
-            return storage->restoreData(context, partitions, backup, data_path_in_backup, *restore_settings);
+            return storage->restoreData(context, partitions, backup, data_path_in_backup, *restore_settings, restore_coordination);
         }
 
         ContextMutablePtr context;
@@ -244,6 +468,9 @@
         BackupPtr backup;
         DatabaseAndTableName table_name_in_backup;
         RestoreSettingsPtr restore_settings;
+        std::shared_ptr restore_coordination;
+        bool use_coordination_for_table_creation = false;
+        String replicated_database_zookeeper_path;
         DatabasePtr database;
         StoragePtr storage;
         ASTPtr
storage_create_query; @@ -258,11 +489,17 @@ namespace { public: RestoreTasksBuilder(ContextMutablePtr context_, const BackupPtr & backup_, const RestoreSettings & restore_settings_) - : context(context_), backup(backup_), restore_settings(restore_settings_) {} + : context(context_), backup(backup_), restore_settings(restore_settings_) + { + if (!restore_settings.coordination_zk_path.empty()) + restore_coordination = std::make_shared(restore_settings.coordination_zk_path, [context=context] { return context->getZooKeeper(); }); + } /// Prepares internal structures for making tasks for restoring. void prepare(const ASTBackupQuery::Elements & elements) { + adjustIndicesOfSourceShardAndReplicaInBackup(); + String current_database = context->getCurrentDatabase(); renaming_settings.setFromBackupQuery(elements, current_database); @@ -307,12 +544,39 @@ namespace /// TODO: We need to restore tables according to their dependencies. for (const auto & info : tables | boost::adaptors::map_values) - res.push_back(std::make_unique(context, info.create_query, info.partitions, backup, info.name_in_backup, restore_settings_ptr)); + res.push_back(std::make_unique(context, info.create_query, info.partitions, backup, info.name_in_backup, restore_settings_ptr, restore_coordination)); return res; } private: + void adjustIndicesOfSourceShardAndReplicaInBackup() + { + auto shards_in_backup = PathsInBackup{*backup}.getShards(); + if (!restore_settings.shard_in_backup) + { + if (shards_in_backup.size() == 1) + restore_settings.shard_in_backup = shards_in_backup[0]; + else + restore_settings.shard_in_backup = restore_settings.shard; + } + + if (std::find(shards_in_backup.begin(), shards_in_backup.end(), restore_settings.shard_in_backup) == shards_in_backup.end()) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No shard #{} in backup", restore_settings.shard_in_backup); + + auto replicas_in_backup = PathsInBackup{*backup}.getReplicas(restore_settings.shard_in_backup); + if (!restore_settings.replica_in_backup) + { + if (replicas_in_backup.size() == 1) + restore_settings.replica_in_backup = replicas_in_backup[0]; + else + restore_settings.replica_in_backup = restore_settings.replica; + } + + if (std::find(replicas_in_backup.begin(), replicas_in_backup.end(), restore_settings.replica_in_backup) == replicas_in_backup.end()) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No replica #{} in backup", restore_settings.replica_in_backup); + } + /// Prepares to restore a single table and probably its database's definition. void prepareToRestoreTable(const DatabaseAndTableName & table_name_, const ASTs & partitions_) { @@ -339,8 +603,8 @@ namespace if (databases.contains(new_database_name)) throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} twice", backQuoteIfNeed(new_database_name)); - Strings table_metadata_filenames = backup->listFiles("metadata/" + escapeForFileName(database_name_) + "/", "/"); - bool has_tables_in_backup = !table_metadata_filenames.empty(); + Strings table_names = PathsInBackup{*backup}.getTables(database_name_, restore_settings.shard_in_backup, restore_settings.replica_in_backup); + bool has_tables_in_backup = !table_names.empty(); bool has_create_query_in_backup = hasCreateQueryInBackup(database_name_); if (!has_create_query_in_backup && !has_tables_in_backup) @@ -367,9 +631,8 @@ namespace } /// Restore tables in this database. 
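
// An aside: the defaulting rule of adjustIndicesOfSourceShardAndReplicaInBackup
// above, reduced to a self-contained sketch. An unset (0) `shard_in_backup` or
// `replica_in_backup` means "the only one present in the backup, otherwise my
// own index".
#include <cassert>
#include <cstddef>
#include <vector>

std::size_t resolveIndexSketch(std::size_t requested, std::size_t own_index, const std::vector<std::size_t> & present_in_backup)
{
    if (requested)
        return requested; /// explicit setting wins; it is validated against the backup afterwards
    return (present_in_backup.size() == 1) ? present_in_backup.front() : own_index;
}

int main()
{
    assert(resolveIndexSketch(0, 3, {1}) == 1);       /// single-shard backup: every shard restores shard 1
    assert(resolveIndexSketch(0, 3, {1, 2, 3}) == 3); /// multi-shard backup: use own index
    assert(resolveIndexSketch(2, 3, {1, 2, 3}) == 2); /// explicit setting wins
}
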
- for (const String & table_metadata_filename : table_metadata_filenames) + for (const String & table_name : table_names) { - String table_name = unescapeForFileName(fs::path{table_metadata_filename}.stem()); if (except_list_.contains(table_name)) continue; prepareToRestoreTable(DatabaseAndTableName{database_name_, table_name}, ASTs{}); @@ -379,10 +642,8 @@ namespace /// Prepares to restore all the databases contained in the backup. void prepareToRestoreAllDatabases(const std::set & except_list_) { - Strings database_metadata_filenames = backup->listFiles("metadata/", "/"); - for (const String & database_metadata_filename : database_metadata_filenames) + for (const String & database_name : PathsInBackup{*backup}.getDatabases(restore_settings.shard_in_backup, restore_settings.replica_in_backup)) { - String database_name = unescapeForFileName(fs::path{database_metadata_filename}.stem()); if (except_list_.contains(database_name)) continue; prepareToRestoreDatabase(database_name, std::set{}); @@ -392,7 +653,7 @@ namespace /// Reads a create query for creating a specified table from the backup. std::shared_ptr readCreateQueryFromBackup(const DatabaseAndTableName & table_name) const { - String create_query_path = getMetadataPathInBackup(table_name); + String create_query_path = PathsInBackup{*backup}.getMetadataPath(table_name, restore_settings.shard_in_backup, restore_settings.replica_in_backup); if (!backup->fileExists(create_query_path)) throw Exception(ErrorCodes::CANNOT_RESTORE_TABLE, "Cannot restore the {} because there is no such table in the backup", formatTableNameOrTemporaryTableName(table_name)); @@ -407,7 +668,7 @@ namespace /// Reads a create query for creating a specified database from the backup. std::shared_ptr readCreateQueryFromBackup(const String & database_name) const { - String create_query_path = getMetadataPathInBackup(database_name); + String create_query_path = PathsInBackup{*backup}.getMetadataPath(database_name, restore_settings.shard_in_backup, restore_settings.replica_in_backup); if (!backup->fileExists(create_query_path)) throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} because there is no such database in the backup", backQuoteIfNeed(database_name)); auto read_buffer = backup->readFile(create_query_path)->getReadBuffer(); @@ -421,7 +682,7 @@ namespace /// Whether there is a create query for creating a specified database in the backup. 
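
// An aside: how database and table names are recovered from backup listings by
// PathsInBackup::getDatabases()/getTables() (defined earlier in this file),
// sketched with plain std::string. Only "*.sql" entries are kept, the extension
// is stripped, and duplicates from the three search roots are removed; the real
// code additionally unescapes the file names.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

std::vector<std::string> namesFromListingSketch(std::vector<std::string> files)
{
    std::erase_if(files, [](const std::string & f) { return !f.ends_with(".sql"); });
    for (auto & f : files)
        f.resize(f.size() - 4); /// drop ".sql"
    std::sort(files.begin(), files.end());
    files.erase(std::unique(files.begin(), files.end()), files.end());
    return files;
}

int main()
{
    const auto names = namesFromListingSketch({"db1.sql", "db2.sql", "db1.sql", "junk.txt"});
    assert((names == std::vector<std::string>{"db1", "db2"}));
}
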
bool hasCreateQueryInBackup(const String & database_name) const { - String create_query_path = getMetadataPathInBackup(database_name); + String create_query_path = PathsInBackup{*backup}.getMetadataPath(database_name, restore_settings.shard_in_backup, restore_settings.replica_in_backup); return backup->fileExists(create_query_path); } @@ -456,6 +717,7 @@ namespace ContextMutablePtr context; BackupPtr backup; RestoreSettings restore_settings; + std::shared_ptr restore_coordination; DDLRenamingSettings renaming_settings; std::map databases; std::map tables; @@ -612,4 +874,16 @@ void executeRestoreTasks(RestoreTasks && restore_tasks, size_t num_threads) need_rollback_completed_tasks = false; } + +size_t getMinCountOfReplicas(const IBackup & backup) +{ + size_t min_count_of_replicas = static_cast(-1); + for (size_t shard_index : PathsInBackup(backup).getShards()) + { + size_t count_of_replicas = PathsInBackup(backup).getReplicas(shard_index).size(); + min_count_of_replicas = std::min(min_count_of_replicas, count_of_replicas); + } + return min_count_of_replicas; +} + } diff --git a/src/Backups/RestoreUtils.h b/src/Backups/RestoreUtils.h index 33d2f7ff527..e58cb49ef84 100644 --- a/src/Backups/RestoreUtils.h +++ b/src/Backups/RestoreUtils.h @@ -13,6 +13,7 @@ using RestoreTaskPtr = std::unique_ptr; using RestoreTasks = std::vector; struct RestoreSettings; class Context; +using ContextPtr = std::shared_ptr; using ContextMutablePtr = std::shared_ptr; /// Prepares restore tasks. @@ -21,4 +22,7 @@ RestoreTasks makeRestoreTasks(ContextMutablePtr context, const BackupPtr & backu /// Executes restore tasks. void executeRestoreTasks(RestoreTasks && tasks, size_t num_threads); +/// Returns the minimal count of replicas stored in the backup. +size_t getMinCountOfReplicas(const IBackup & backup); + } diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp index 679209e643e..a3ba09bbafc 100644 --- a/src/Backups/registerBackupEnginesFileAndDisk.cpp +++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -177,7 +176,7 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory) writer = std::make_shared(path); else writer = std::make_shared(disk, path); - return std::make_unique(backup_name, archive_params, params.base_backup_info, writer, nullptr, false, params.context); + return std::make_unique(backup_name, archive_params, params.base_backup_info, writer, params.context, params.backup_uuid, params.is_internal_backup, params.coordination_zk_path); } }; diff --git a/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp b/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp new file mode 100644 index 00000000000..cf6190c4c27 --- /dev/null +++ b/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.cpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include + + +namespace DB +{ + +void replaceTableUUIDWithMacroInReplicatedTableDef(ASTCreateQuery & create_query, const UUID & table_uuid) +{ + if (create_query.getTable().empty() || !create_query.storage || !create_query.storage->engine || (table_uuid == UUIDHelpers::Nil)) + return; + + auto & engine = *(create_query.storage->engine); + if (!engine.name.starts_with("Replicated") || !engine.arguments) + return; + + auto * args = typeid_cast(engine.arguments.get()); + + size_t zookeeper_path_arg_pos = engine.name.starts_with("ReplicatedGraphite") ? 
1 : 0;
+
+    if (!args || (args->children.size() <= zookeeper_path_arg_pos))
+        return;
+
+    auto * zookeeper_path_arg = typeid_cast(args->children[zookeeper_path_arg_pos].get());
+    if (!zookeeper_path_arg || (zookeeper_path_arg->value.getType() != Field::Types::String))
+        return;
+
+    String & zookeeper_path = zookeeper_path_arg->value.get();
+
+    String table_uuid_str = toString(table_uuid);
+    if (size_t uuid_pos = zookeeper_path.find(table_uuid_str); uuid_pos != String::npos)
+        zookeeper_path.replace(uuid_pos, table_uuid_str.size(), "{uuid}");
+}
+
+}
diff --git a/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h b/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h
new file mode 100644
index 00000000000..e339b1c4536
--- /dev/null
+++ b/src/Backups/replaceTableUUIDWithMacroInReplicatedTableDef.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include
+
+namespace DB
+{
+
+class ASTCreateQuery;
+
+/// When a replicated table is created, "{uuid}" in its ZooKeeper path is replaced with the table's real UUID.
+/// This function reverts that replacement.
+void replaceTableUUIDWithMacroInReplicatedTableDef(ASTCreateQuery & create_query, const UUID & table_uuid);
+
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e8e1153e2b4..d31bfe114f8 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -215,6 +215,7 @@ if (TARGET ch_contrib::jemalloc)
 endif()
 
 add_subdirectory(Access/Common)
+add_subdirectory(Backups/Common)
 add_subdirectory(Common/ZooKeeper)
 add_subdirectory(Common/Config)
 
diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h
index 72d19a5340a..7eadb74b523 100644
--- a/src/Databases/DatabaseReplicated.h
+++ b/src/Databases/DatabaseReplicated.h
@@ -55,6 +55,8 @@ public:
     String getFullReplicaName() const;
     static std::pair parseFullReplicaName(const String & name);
 
+    const String & getZooKeeperPath() const { return zookeeper_path; }
+
     /// Returns cluster consisting of database replicas
     ClusterPtr getCluster() const;
 
diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp
index f60ba4ad083..7961dd28759 100644
--- a/src/Interpreters/InterpreterBackupQuery.cpp
+++ b/src/Interpreters/InterpreterBackupQuery.cpp
@@ -1,27 +1,30 @@
 #include
+#include
+#include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
-#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
+#include
 
 
 namespace DB
 {
 namespace
 {
-    BackupMutablePtr createBackup(const BackupInfo & backup_info, const BackupSettings & backup_settings, const ContextPtr & context)
+    BackupMutablePtr createBackup(const UUID & backup_uuid, const BackupInfo & backup_info, const BackupSettings & backup_settings, const ContextPtr & context)
     {
         BackupFactory::CreateParams params;
         params.open_mode = IBackup::OpenMode::WRITE;
@@ -31,6 +34,9 @@ namespace
         params.compression_method = backup_settings.compression_method;
         params.compression_level = backup_settings.compression_level;
         params.password = backup_settings.password;
+        params.backup_uuid = backup_uuid;
+        params.is_internal_backup = backup_settings.internal;
+        params.coordination_zk_path = backup_settings.coordination_zk_path;
         return BackupFactory::instance().createBackup(params);
     }
 
@@ -45,84 +51,157 @@ namespace
         return BackupFactory::instance().createBackup(params);
     }
 
-    void executeBackupSync(UInt64 task_id, const ContextPtr & context, const BackupInfo & backup_info, const ASTBackupQuery::Elements & backup_elements, const BackupSettings & backup_settings, bool
no_throw = false) + void executeBackupSync(const ASTBackupQuery & query, UInt64 task_id, const ContextPtr & context, const BackupInfo & backup_info, const BackupSettings & backup_settings, bool no_throw = false) { auto & worker = BackupsWorker::instance(); + bool is_internal_backup = backup_settings.internal; + try { - BackupMutablePtr backup = createBackup(backup_info, backup_settings, context); - worker.update(task_id, BackupStatus::PREPARING); - auto backup_entries = makeBackupEntries(context, backup_elements, backup_settings); - worker.update(task_id, BackupStatus::MAKING_BACKUP); - writeBackupEntries(backup, std::move(backup_entries), context->getSettingsRef().max_backup_threads); - worker.update(task_id, BackupStatus::BACKUP_COMPLETE); + UUID backup_uuid = UUIDHelpers::generateV4(); + + auto new_backup_settings = backup_settings; + if (!query.cluster.empty() && backup_settings.coordination_zk_path.empty()) + new_backup_settings.coordination_zk_path = query.cluster.empty() ? "" : ("/clickhouse/backups/backup-" + toString(backup_uuid)); + std::shared_ptr new_query = std::static_pointer_cast(query.clone()); + new_backup_settings.copySettingsToBackupQuery(*new_query); + + BackupMutablePtr backup = createBackup(backup_uuid, backup_info, new_backup_settings, context); + + if (!query.cluster.empty()) + { + if (!is_internal_backup) + worker.update(task_id, BackupStatus::MAKING_BACKUP); + + DDLQueryOnClusterParams params; + params.shard_index = new_backup_settings.shard; + params.replica_index = new_backup_settings.replica; + params.allow_multiple_replicas = new_backup_settings.allow_storing_multiple_replicas; + auto res = executeDDLQueryOnCluster(new_query, context, params); + + PullingPipelineExecutor executor(res.pipeline); + Block block; + while (executor.pull(block)); + + backup->finalizeWriting(); + } + else + { + auto backup_entries = makeBackupEntries(context, new_query->elements, new_backup_settings); + + if (!is_internal_backup) + worker.update(task_id, BackupStatus::MAKING_BACKUP); + + writeBackupEntries(backup, std::move(backup_entries), context->getSettingsRef().max_backup_threads); + } + + if (!is_internal_backup) + worker.update(task_id, BackupStatus::BACKUP_COMPLETE); } catch (...) 
{ - worker.update(task_id, BackupStatus::FAILED_TO_BACKUP, getCurrentExceptionMessage(false)); + if (!is_internal_backup) + worker.update(task_id, BackupStatus::FAILED_TO_BACKUP, getCurrentExceptionMessage(false)); if (!no_throw) throw; } } - void executeRestoreSync(UInt64 task_id, ContextMutablePtr context, const BackupInfo & backup_info, const ASTBackupQuery::Elements & restore_elements, const RestoreSettings & restore_settings, bool no_throw = false) + void executeRestoreSync(const ASTBackupQuery & query, UInt64 task_id, ContextMutablePtr context, const BackupInfo & backup_info, const RestoreSettings & restore_settings, bool no_throw = false) { auto & worker = BackupsWorker::instance(); + bool is_internal_restore = restore_settings.internal; + try { BackupPtr backup = openBackup(backup_info, restore_settings, context); - worker.update(task_id, BackupStatus::RESTORING); - auto restore_tasks = makeRestoreTasks(context, backup, restore_elements, restore_settings); - executeRestoreTasks(std::move(restore_tasks), context->getSettingsRef().max_backup_threads); - worker.update(task_id, BackupStatus::RESTORED); + + auto new_restore_settings = restore_settings; + if (!query.cluster.empty() && new_restore_settings.coordination_zk_path.empty()) + { + UUID restore_uuid = UUIDHelpers::generateV4(); + new_restore_settings.coordination_zk_path + = query.cluster.empty() ? "" : ("/clickhouse/backups/restore-" + toString(restore_uuid)); + } + std::shared_ptr new_query = std::static_pointer_cast(query.clone()); + new_restore_settings.copySettingsToRestoreQuery(*new_query); + + if (!query.cluster.empty()) + { + DDLQueryOnClusterParams params; + params.shard_index = new_restore_settings.shard; + params.replica_index = new_restore_settings.replica; + auto res = executeDDLQueryOnCluster(new_query, context, params); + + PullingPipelineExecutor executor(res.pipeline); + Block block; + while (executor.pull(block)); + } + else + { + auto restore_tasks = makeRestoreTasks(context, backup, new_query->elements, new_restore_settings); + executeRestoreTasks(std::move(restore_tasks), context->getSettingsRef().max_backup_threads); + } + + if (!is_internal_restore) + worker.update(task_id, BackupStatus::RESTORED); } catch (...) { - worker.update(task_id, BackupStatus::FAILED_TO_RESTORE, getCurrentExceptionMessage(false)); + if (!is_internal_restore) + worker.update(task_id, BackupStatus::FAILED_TO_RESTORE, getCurrentExceptionMessage(false)); if (!no_throw) throw; } } - UInt64 executeBackup(const ContextPtr & context, const ASTBackupQuery & query) + UInt64 executeBackup(const ASTBackupQuery & query, const ContextPtr & context) { const auto backup_info = BackupInfo::fromAST(*query.backup_name); - auto task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::PREPARING); const auto backup_settings = BackupSettings::fromBackupQuery(query); + size_t task_id = 0; + if (!backup_settings.internal) + task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::PREPARING); + if (backup_settings.async) { ThreadFromGlobalPool thread{ - &executeBackupSync, task_id, context, backup_info, query.elements, backup_settings, /* no_throw = */ true}; + &executeBackupSync, query, task_id, context, backup_info, backup_settings, /* no_throw = */ true}; thread.detach(); /// TODO: Remove this !!! 
Move that thread to BackupsWorker instead } else { - executeBackupSync(task_id, context, backup_info, query.elements, backup_settings, /* no_throw = */ false); + executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ false); } return task_id; } - UInt64 executeRestore(ContextMutablePtr context, const ASTBackupQuery & query) + UInt64 executeRestore(const ASTBackupQuery & query, ContextMutablePtr context) { const auto backup_info = BackupInfo::fromAST(*query.backup_name); const auto restore_settings = RestoreSettings::fromRestoreQuery(query); - auto task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::RESTORING); + + size_t task_id = 0; + if (!restore_settings.internal) + task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::RESTORING); if (restore_settings.async) { - ThreadFromGlobalPool thread{&executeRestoreSync, task_id, context, backup_info, query.elements, restore_settings, /* no_throw = */ true}; + ThreadFromGlobalPool thread{&executeRestoreSync, query, task_id, context, backup_info, restore_settings, /* no_throw = */ true}; thread.detach(); /// TODO: Remove this !!! Move that thread to BackupsWorker instead } else { - executeRestoreSync(task_id, context, backup_info, query.elements, restore_settings, /* no_throw = */ false); + executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ false); } return task_id; } Block getResultRow(UInt64 task_id) { + if (!task_id) + return {}; auto entry = BackupsWorker::instance().getEntry(task_id); Block res_columns; @@ -146,11 +225,12 @@ namespace BlockIO InterpreterBackupQuery::execute() { const auto & query = query_ptr->as<const ASTBackupQuery &>(); + UInt64 task_id; if (query.kind == ASTBackupQuery::BACKUP) - task_id = executeBackup(context, query); + task_id = executeBackup(query, context); else - task_id = executeRestore(context, query); + task_id = executeRestore(query, context); BlockIO res_io; res_io.pipeline = QueryPipeline(std::make_shared<SourceFromSingleChunk>(getResultRow(task_id))); diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index f0b43634573..51ab32d44c5 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -27,7 +27,7 @@ struct DDLQueryOnClusterParams /// 1-based index of a shard to execute a query on, 0 means all shards. size_t shard_index = 0; - /// 1-based index of a replica to execute a query on, 0 means all replicas (see also allow_multiple_replicas). + /// 1-based index of a replica to execute a query on, 0 means all replicas (see also allow_storing_multiple_replicas). size_t replica_index = 0; /// Allows executing a query on multiple replicas. diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index f8fcbd98872..50ca51bd019 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -135,7 +136,6 @@ namespace format.ostr << ", "; settings->format(format); } - } } @@ -157,6 +157,7 @@ void ASTBackupQuery::formatImpl(const FormatSettings & format, FormatState &, Fo << (format.hilite ? hilite_none : ""); formatElements(elements, kind, format); + formatOnCluster(format); format.ostr << (format.hilite ? hilite_keyword : "") << ((kind == Kind::BACKUP) ? " TO " : " FROM ") << (format.hilite ?
hilite_none : ""); backup_name->format(format); @@ -165,4 +166,13 @@ void ASTBackupQuery::formatImpl(const FormatSettings & format, FormatState &, Fo formatSettings(settings, base_backup_name, format); } +ASTPtr ASTBackupQuery::getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const +{ + if (kind == ASTBackupQuery::Kind::BACKUP) + return rewriteBackupQueryWithoutOnCluster(*this, params); + else + return rewriteRestoreQueryWithoutOnCluster(*this, params); +} + + } diff --git a/src/Parsers/ASTBackupQuery.h b/src/Parsers/ASTBackupQuery.h index 648bcf27bce..01f8d66fcdc 100644 --- a/src/Parsers/ASTBackupQuery.h +++ b/src/Parsers/ASTBackupQuery.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -15,6 +16,7 @@ using DatabaseAndTableName = std::pair; * ALL TEMPORARY TABLES [EXCEPT ...] | * DATABASE database_name [EXCEPT ...] [AS database_name_in_backup] | * ALL DATABASES [EXCEPT ...] } [,...] + * [ON CLUSTER 'cluster_name'] * TO { File('path/') | * Disk('disk_name', 'path/') * [SETTINGS base_backup = {File(...) | Disk(...)}] @@ -25,6 +27,7 @@ using DatabaseAndTableName = std::pair; * ALL TEMPORARY TABLES [EXCEPT ...] | * DATABASE database_name_in_backup [EXCEPT ...] [INTO database_name] | * ALL DATABASES [EXCEPT ...] } [,...] + * [ON CLUSTER 'cluster_name'] * FROM {File(...) | Disk(...)} * * Notes: @@ -42,7 +45,7 @@ using DatabaseAndTableName = std::pair; * The "WITH BASE" clause allows to set a base backup. Only differences made after the base backup will be * included in a newly created backup, so this option allows to make an incremental backup. */ -class ASTBackupQuery : public IAST +class ASTBackupQuery : public IAST, public ASTQueryWithOnCluster { public: enum Kind @@ -84,5 +87,6 @@ public: String getID(char) const override; ASTPtr clone() const override; void formatImpl(const FormatSettings & format, FormatState &, FormatStateStacked) const override; + ASTPtr getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams &) const override; }; } diff --git a/src/Parsers/CMakeLists.txt b/src/Parsers/CMakeLists.txt index b2c31366929..08722770773 100644 --- a/src/Parsers/CMakeLists.txt +++ b/src/Parsers/CMakeLists.txt @@ -5,6 +5,7 @@ add_headers_and_sources(clickhouse_parsers ./Access) add_headers_and_sources(clickhouse_parsers ./MySQL) add_library(clickhouse_parsers ${clickhouse_parsers_headers} ${clickhouse_parsers_sources}) target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io clickhouse_common_access) +target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io clickhouse_common_backups) if (USE_DEBUG_HELPERS) # CMake generator expression will do insane quoting when it encounters special character like quotes, spaces, etc. diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp index 844a91fa515..1b4840b548e 100644 --- a/src/Parsers/ParserBackupQuery.cpp +++ b/src/Parsers/ParserBackupQuery.cpp @@ -284,6 +284,14 @@ namespace return true; }); } + + bool parseOnCluster(IParserBase::Pos & pos, Expected & expected, String & cluster) + { + return IParserBase::wrapParseImpl(pos, [&] + { + return ParserKeyword{"ON"}.ignore(pos, expected) && ASTQueryWithOnCluster::parse(pos, cluster, expected); + }); + } } @@ -301,6 +309,9 @@ bool ParserBackupQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!parseElements(pos, expected, elements)) return false; + String cluster; + parseOnCluster(pos, expected, cluster); + if (!ParserKeyword{(kind == Kind::BACKUP) ? 
"TO" : "FROM"}.ignore(pos, expected)) return false; @@ -320,6 +331,7 @@ bool ParserBackupQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) query->backup_name = std::move(backup_name); query->base_backup_name = std::move(base_backup_name); query->settings = std::move(settings); + query->cluster = std::move(cluster); return true; } diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 88ddde32d83..f236cb5e98c 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -221,7 +221,7 @@ BackupEntries IStorage::backupData(ContextPtr, const ASTs &) throw Exception("Table engine " + getName() + " doesn't support backups", ErrorCodes::NOT_IMPLEMENTED); } -RestoreTaskPtr IStorage::restoreData(ContextMutablePtr, const ASTs &, const BackupPtr &, const String &, const StorageRestoreSettings &) +RestoreTaskPtr IStorage::restoreData(ContextMutablePtr, const ASTs &, const BackupPtr &, const String &, const StorageRestoreSettings &, const std::shared_ptr &) { throw Exception("Table engine " + getName() + " doesn't support backups", ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 5d1c283c1a0..c667f464a13 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -75,6 +75,7 @@ using BackupEntries = std::vector; struct StorageRestoreSettings; +class IRestoreCoordination; struct ColumnSize { @@ -233,7 +234,7 @@ public: virtual BackupEntries backupData(ContextPtr context, const ASTs & partitions); /// Extract data from the backup and put it to the storage. - virtual RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings); + virtual RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination); /// Returns whether the column is virtual - by default all columns are real. /// Initially reserved virtual column name may be shadowed by real column. 
diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index bb14ee4cdff..f84c356d66e 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -1063,7 +1063,7 @@ private: ContextMutablePtr context; }; -RestoreTaskPtr StorageLog::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &) +RestoreTaskPtr StorageLog::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) { if (!partitions.empty()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); diff --git a/src/Storages/StorageLog.h b/src/Storages/StorageLog.h index b9255c16f2b..6692fbe0524 100644 --- a/src/Storages/StorageLog.h +++ b/src/Storages/StorageLog.h @@ -54,7 +54,7 @@ public: bool hasDataToBackup() const override { return true; } BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings) override; + RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; protected: /** Attach the table with the appropriate name, along the appropriate path (with / at the end), diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 610d16c8ea8..063953161ea 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -440,11 +440,11 @@ BackupEntries StorageMaterializedView::backupData(ContextPtr context_, const AST return getTargetTable()->backupData(context_, partitions_); } -RestoreTaskPtr StorageMaterializedView::restoreData(ContextMutablePtr context_, const ASTs & partitions_, const BackupPtr & backup_, const String & data_path_in_backup_, const StorageRestoreSettings & restore_settings_) +RestoreTaskPtr StorageMaterializedView::restoreData(ContextMutablePtr context_, const ASTs & partitions_, const BackupPtr & backup_, const String & data_path_in_backup_, const StorageRestoreSettings & restore_settings_, const std::shared_ptr & restore_coordination_) { if (!hasInnerTable()) return {}; - return getTargetTable()->restoreData(context_, partitions_, backup_, data_path_in_backup_, restore_settings_); + return getTargetTable()->restoreData(context_, partitions_, backup_, data_path_in_backup_, restore_settings_, restore_coordination_); } ActionLock StorageMaterializedView::getActionLock(StorageActionBlockType type) diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 35fe38058de..c0038ed5190 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -100,7 +100,7 @@ public: bool hasDataToBackup() const override { return hasInnerTable(); } BackupEntries backupData(ContextPtr context_, const ASTs & partitions_) override; - RestoreTaskPtr restoreData(ContextMutablePtr context_, const ASTs & partitions_, const BackupPtr & backup, const String & data_path_in_backup_, const StorageRestoreSettings & restore_settings_) override; + RestoreTaskPtr restoreData(ContextMutablePtr 
context_, const ASTs & partitions_, const BackupPtr & backup, const String & data_path_in_backup_, const StorageRestoreSettings & restore_settings_, const std::shared_ptr & restore_coordination_) override; private: /// Will be initialized in constructor diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 26b435f98a0..c660195c368 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -553,7 +553,7 @@ private: }; -RestoreTaskPtr StorageMemory::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &) +RestoreTaskPtr StorageMemory::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) { if (!partitions.empty()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index d4e82ccb4fc..70da733668d 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -68,7 +68,7 @@ public: bool hasDataToBackup() const override { return true; } BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings) override; + RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; std::optional totalRows(const Settings &) const override; std::optional totalBytes(const Settings &) const override; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 8db6fc6c88e..f7db56acd75 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1786,7 +1786,7 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ } -RestoreTaskPtr StorageMergeTree::restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &) +RestoreTaskPtr StorageMergeTree::restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) { return restoreDataParts(getPartitionIDsFromQuery(partitions, local_context), backup, data_path_in_backup, &increment); } diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 74fb954bb6d..6609bae40fb 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -99,7 +99,7 @@ public: CheckResults checkData(const ASTPtr & query, ContextPtr context) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings) override; + RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; bool 
scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 17150dce78f..c8c22669aab 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -8208,19 +8209,25 @@ public: const std::shared_ptr & storage_, const std::unordered_set & partition_ids_, const BackupPtr & backup_, - const String & data_path_in_backup_) + const String & data_path_in_backup_, + const std::shared_ptr & restore_coordination_) : query_context(query_context_) , storage(storage_) , partition_ids(partition_ids_) , backup(backup_) , data_path_in_backup(data_path_in_backup_) + , restore_coordination(restore_coordination_) { } RestoreTasks run() override { + String full_zk_path = storage->getZooKeeperName() + storage->getZooKeeperPath(); + String adjusted_data_path_in_backup = data_path_in_backup; + restore_coordination->setOrGetPathInBackupForZkPath(full_zk_path, adjusted_data_path_in_backup); + RestoreTasks restore_part_tasks; - Strings part_names = backup->listFiles(data_path_in_backup); + Strings part_names = backup->listFiles(adjusted_data_path_in_backup); auto metadata_snapshot = storage->getInMemoryMetadataPtr(); auto sink = std::make_shared(*storage, metadata_snapshot, 0, 0, 0, false, false, query_context, /*is_attach*/true); @@ -8234,8 +8241,11 @@ public: if (!partition_ids.empty() && !partition_ids.contains(part_info->partition_id)) continue; + if (!restore_coordination->acquireZkPathAndName(full_zk_path, part_info->partition_id)) + continue; /// Other replica is already restoring this partition. + restore_part_tasks.push_back( - std::make_unique(storage, sink, part_name, *part_info, backup, data_path_in_backup)); + std::make_unique(storage, sink, part_name, *part_info, backup, adjusted_data_path_in_backup)); } return restore_part_tasks; } @@ -8246,6 +8256,7 @@ private: std::unordered_set partition_ids; BackupPtr backup; String data_path_in_backup; + std::shared_ptr restore_coordination; class RestorePartTask : public IRestoreTask { @@ -8346,14 +8357,16 @@ RestoreTaskPtr StorageReplicatedMergeTree::restoreData( const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, - const StorageRestoreSettings &) + const StorageRestoreSettings &, + const std::shared_ptr & restore_coordination) { return std::make_unique( local_context, std::static_pointer_cast(shared_from_this()), getPartitionIDsFromQuery(partitions, local_context), backup, - data_path_in_backup); + data_path_in_backup, + restore_coordination); } } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 99cdac36314..e589d40783c 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -226,7 +226,7 @@ public: const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); /// Extract data from the backup and put it to the storage. 
- RestoreTaskPtr restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings) override; + RestoreTaskPtr restoreData(ContextMutablePtr local_context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; /// Schedules job to execute in background pool (merge, mutate, drop range and so on) bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override; @@ -285,6 +285,8 @@ public: // Return default or custom zookeeper name for table String getZooKeeperName() const { return zookeeper_name; } + String getZooKeeperPath() const { return zookeeper_path; } + // Return table id, common for different replicas String getTableSharedID() const override; diff --git a/src/Storages/StorageStripeLog.cpp b/src/Storages/StorageStripeLog.cpp index 274789f012b..55da256a866 100644 --- a/src/Storages/StorageStripeLog.cpp +++ b/src/Storages/StorageStripeLog.cpp @@ -637,7 +637,7 @@ private: }; -RestoreTaskPtr StorageStripeLog::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &) +RestoreTaskPtr StorageStripeLog::restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings &, const std::shared_ptr &) { if (!partitions.empty()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine {} doesn't support partitions", getName()); diff --git a/src/Storages/StorageStripeLog.h b/src/Storages/StorageStripeLog.h index 223b662d13c..dbc69fa1553 100644 --- a/src/Storages/StorageStripeLog.h +++ b/src/Storages/StorageStripeLog.h @@ -54,7 +54,7 @@ public: bool hasDataToBackup() const override { return true; } BackupEntries backupData(ContextPtr context, const ASTs & partitions) override; - RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings) override; + RestoreTaskPtr restoreData(ContextMutablePtr context, const ASTs & partitions, const BackupPtr & backup, const String & data_path_in_backup, const StorageRestoreSettings & restore_settings, const std::shared_ptr & restore_coordination) override; protected: StorageStripeLog( diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 16691bef084..0a6f93f048e 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -3765,10 +3765,10 @@ class ClickHouseInstance: if self.external_dirs: for external_dir in self.external_dirs: external_dir_abs_path = p.abspath( - p.join(self.path, external_dir.lstrip("/")) + p.join(self.cluster.instances_dir, external_dir.lstrip("/")) ) logging.info(f"external_dir_abs_path={external_dir_abs_path}") - os.mkdir(external_dir_abs_path) + os.makedirs(external_dir_abs_path, exist_ok=True) external_dirs_volumes += ( "- " + external_dir_abs_path + ":" + external_dir + "\n" ) diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 2bc72c30bdc..56166046253 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -77,14 +77,10 @@ def 
test_restore_table_into_existing_table(engine): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - instance.query( - f"RESTORE TABLE test.table INTO test.table FROM {backup_name}" - ) + instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "200\t9900\n" - instance.query( - f"RESTORE TABLE test.table INTO test.table FROM {backup_name}" - ) + instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "300\t14850\n" diff --git a/tests/integration/test_backup_restore_replicated/__init__.py b/tests/integration/test_backup_restore_on_cluster/__init__.py similarity index 100% rename from tests/integration/test_backup_restore_replicated/__init__.py rename to tests/integration/test_backup_restore_on_cluster/__init__.py diff --git a/tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml b/tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml new file mode 100644 index 00000000000..0434df06457 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml @@ -0,0 +1,7 @@ + + + + 1 + + + diff --git a/tests/integration/test_backup_restore_replicated/configs/backups_disk.xml b/tests/integration/test_backup_restore_on_cluster/configs/backups_disk.xml similarity index 100% rename from tests/integration/test_backup_restore_replicated/configs/backups_disk.xml rename to tests/integration/test_backup_restore_on_cluster/configs/backups_disk.xml diff --git a/tests/integration/test_backup_restore_replicated/configs/remote_servers.xml b/tests/integration/test_backup_restore_on_cluster/configs/remote_servers.xml similarity index 100% rename from tests/integration/test_backup_restore_replicated/configs/remote_servers.xml rename to tests/integration/test_backup_restore_on_cluster/configs/remote_servers.xml diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py new file mode 100644 index 00000000000..7ddbe035558 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -0,0 +1,158 @@ +from time import sleep +import pytest +import os.path +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance( + "node1", + main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], + user_configs=["configs/allow_experimental_database_replicated.xml"], + external_dirs=["/backups/"], + macros={"replica": "node1", "shard": "shard1"}, + with_zookeeper=True, +) + +node2 = cluster.add_instance( + "node2", + main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], + user_configs=["configs/allow_experimental_database_replicated.xml"], + external_dirs=["/backups/"], + macros={"replica": "node2", "shard": "shard1"}, + with_zookeeper=True, +) + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(autouse=True) +def drop_after_test(): + try: + yield + finally: + node1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' NO DELAY") + node1.query("DROP DATABASE IF EXISTS mydb ON CLUSTER 'cluster' NO 
DELAY") + + +backup_id_counter = 0 + + +def new_backup_name(): + global backup_id_counter + backup_id_counter += 1 + return f"Disk('backups', '{backup_id_counter}.zip')" + + +def get_path_to_backup(instance, backup_name): + return os.path.join( + instance.path, + "backups", + backup_name.removeprefix("Disk('backups', '").removesuffix("')"), + ) + + +def test_replicated_table(): + node1.query( + "CREATE TABLE tbl ON CLUSTER 'cluster' (" + "x UInt8, y String" + ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" + "ORDER BY x" + ) + + node1.query("INSERT INTO tbl VALUES (1, 'Don''t')") + node2.query("INSERT INTO tbl VALUES (2, 'count')") + node1.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (3, 'your')") + node2.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (4, 'chickens')") + + backup_name = new_backup_name() + + # Make backup on node 1. + node1.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica=1" + ) + + # Drop table on both nodes. + node1.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + + # Restore from backup on node2. + node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + + assert node2.query("SELECT * FROM tbl ORDER BY x") == TSV( + [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] + ) + + assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV( + [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] + ) + + +def test_replicated_database(): + node1.query( + "CREATE DATABASE mydb ON CLUSTER 'cluster' ENGINE=Replicated('/clickhouse/path/','{shard}','{replica}')" + ) + + node1.query( + "CREATE TABLE mydb.tbl(x UInt8, y String) ENGINE=ReplicatedMergeTree ORDER BY x" + ) + assert node2.query("EXISTS mydb.tbl") == "1\n" + + node1.query("INSERT INTO mydb.tbl VALUES (1, 'Don''t')") + node2.query("INSERT INTO mydb.tbl VALUES (2, 'count')") + node1.query("INSERT INTO mydb.tbl VALUES (3, 'your')") + node2.query("INSERT INTO mydb.tbl VALUES (4, 'chickens')") + + # Make backup. + backup_name = new_backup_name() + node1.query( + f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica=2" + ) + + # Drop table on both nodes. + node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' NO DELAY") + + # Restore from backup on node2. 
+ node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}") + + assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV( + [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] + ) + + assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV( + [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] + ) + + +def test_different_tables_on_nodes(): + node1.query( + "CREATE TABLE tbl (`x` UInt8, `y` String) ENGINE = MergeTree ORDER BY x" + ) + node2.query("CREATE TABLE tbl (`w` Int64) ENGINE = MergeTree ORDER BY w") + + node1.query( + "INSERT INTO tbl VALUES (1, 'Don''t'), (2, 'count'), (3, 'your'), (4, 'chickens')" + ) + node2.query("INSERT INTO tbl VALUES (-333), (-222), (-111), (0), (111)") + + backup_name = new_backup_name() + node1.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS allow_storing_multiple_replicas = true" + ) + + node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + + node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + + assert node1.query("SELECT * FROM tbl") == TSV( + [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] + ) + assert node2.query("SELECT * FROM tbl") == TSV([-333, -222, -111, 0, 111]) diff --git a/tests/integration/test_backup_restore_replicated/test.py b/tests/integration/test_backup_restore_replicated/test.py deleted file mode 100644 index c0c7bd91b6e..00000000000 --- a/tests/integration/test_backup_restore_replicated/test.py +++ /dev/null @@ -1,105 +0,0 @@ -import pytest -import os.path -from helpers.cluster import ClickHouseCluster -from helpers.test_tools import TSV - - -cluster = ClickHouseCluster(__file__) - -node1 = cluster.add_instance( - "node1", - main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], - external_dirs=["/backups/"], - macros={"replica": "node1"}, - with_zookeeper=True, -) - -node2 = cluster.add_instance( - "node2", - main_configs=["configs/remote_servers.xml", "configs/backups_disk.xml"], - external_dirs=["/backups/"], - macros={"replica": "node2"}, - with_zookeeper=True, -) - - -@pytest.fixture(scope="module", autouse=True) -def start_cluster(): - try: - cluster.start() - yield cluster - node1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' NO DELAY") - finally: - cluster.shutdown() - - -def create_table(instance = None): - on_cluster_clause = "" if instance else "ON CLUSTER 'cluster'" - instance_to_execute = instance if instance else node1 - instance_to_execute.query( - "CREATE TABLE tbl " + on_cluster_clause + " (" - "x UInt8, y String" - ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" - "ORDER BY x" - ) - - -def drop_table(instance = None): - on_cluster_clause = "" if instance else "ON CLUSTER 'cluster'" - instance_to_execute = instance if instance else node1 - instance_to_execute.query(f"DROP TABLE tbl {on_cluster_clause} NO DELAY") - - -def insert_data(instance = None): - instance1_to_execute = instance if instance else node1 - instance2_to_execute = instance if instance else node2 - instance1_to_execute.query("INSERT INTO tbl VALUES (1, 'Don''t')") - instance2_to_execute.query("INSERT INTO tbl VALUES (2, 'count')") - instance1_to_execute.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (3, 'your')") - instance2_to_execute.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (4, 'chickens')") - - -backup_id_counter = 0 - - -def new_backup_name(): - global backup_id_counter - backup_id_counter += 1 - return f"Disk('backups', '{backup_id_counter}.zip')" - - -def 
get_path_to_backup(instance, backup_name): - return os.path.join( - instance.path, - "backups", - backup_name.removeprefix("Disk('backups', '").removesuffix("')"), - ) - - -def test_backup_and_restore(): - create_table() - insert_data() - - backup_name = new_backup_name() - - # Make backup on node 1. - node1.query(f"BACKUP TABLE tbl TO {backup_name}") - - # Drop table on both nodes. - drop_table() - - # Restore from backup on node2. - os.link( - get_path_to_backup(node1, backup_name), get_path_to_backup(node2, backup_name) - ) - node2.query(f"RESTORE TABLE tbl FROM {backup_name}") - - assert node2.query("SELECT * FROM tbl ORDER BY x") == TSV( - [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] - ) - - # Data should be replicated to node1. - create_table(node1) - assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV( - [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] - ) From 129eca7307d56e63d0dcc37e5262a8f54bd61e12 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 10:21:44 +0200 Subject: [PATCH 61/94] CREATE queries generated by the RESTORE command are now considered internal. --- src/Backups/RestoreUtils.cpp | 1 + src/Databases/DatabaseReplicated.cpp | 4 ++-- src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index d8f8d177c2a..1d13683c359 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -383,6 +383,7 @@ namespace auto cloned_create_query = typeid_cast<std::shared_ptr<ASTCreateQuery>>(create_query->clone()); cloned_create_query->if_not_exists = (restore_settings->create_table == RestoreTableCreationMode::kCreateIfNotExists); InterpreterCreateQuery create_interpreter{cloned_create_query, context}; + create_interpreter.setInternal(true); create_interpreter.execute(); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 7fea8699d59..7878ed4fa08 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -462,7 +462,7 @@ void DatabaseReplicated::checkQueryValid(const ASTPtr & query, ContextPtr query_ } } -BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context) +BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, bool internal) { if (query_context->getCurrentTransaction() && query_context->getSettingsRef().throw_on_unsupported_query_inside_transaction) @@ -471,7 +471,7 @@ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, Contex if (is_readonly) throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); - if (query_context->getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + if (!internal && (query_context->getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY)) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database."); checkQueryValid(query, query_context); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 7eadb74b523..e3e147fef11 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -44,7 +44,7 @@ public: /// Try to execute DDL query on current host as initial query. If the query succeeds, /// then it will be executed on all replicas.
- BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context); + BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context, bool internal = false); bool hasReplicationThread() const override { return true; } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7b0d956ff58..5ddd9a22707 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -985,7 +985,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { create.setDatabase(database_name); guard->releaseTableLock(); - return ptr->tryEnqueueReplicatedDDL(query_ptr, getContext()); + return ptr->tryEnqueueReplicatedDDL(query_ptr, getContext(), internal); } } @@ -1117,7 +1117,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { assertOrSetUUID(create, database); guard->releaseTableLock(); - return ptr->tryEnqueueReplicatedDDL(query_ptr, getContext()); + return ptr->tryEnqueueReplicatedDDL(query_ptr, getContext(), internal); } } From 042dc4abb2c5fc9a162ee348bf68b101443a85bc Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 12:54:13 +0200 Subject: [PATCH 62/94] Rename settings shard -> shard_num, replica -> replica_num. --- src/Backups/BackupUtils.cpp | 4 +-- src/Backups/Common/BackupSettings.cpp | 4 +-- src/Backups/Common/BackupSettings.h | 4 +-- src/Backups/Common/RestoreSettings.cpp | 8 ++--- src/Backups/Common/RestoreSettings.h | 8 ++--- .../rewriteBackupQueryWithoutOnCluster.cpp | 8 ++--- src/Backups/RestoreUtils.cpp | 36 +++++++++---------- src/Interpreters/InterpreterBackupQuery.cpp | 8 ++--- 8 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp index d4a163f8752..6f0cdd30385 100644 --- a/src/Backups/BackupUtils.cpp +++ b/src/Backups/BackupUtils.cpp @@ -155,7 +155,7 @@ namespace auto data_backup = info.storage->backupData(context, info.partitions); if (!data_backup.empty()) { - String data_path = PathsInBackup::getDataPath(*info.create_query, backup_settings.shard, backup_settings.replica); + String data_path = PathsInBackup::getDataPath(*info.create_query, backup_settings.shard_num, backup_settings.replica_num); for (auto & [path_in_backup, backup_entry] : data_backup) res.emplace_back(data_path + path_in_backup, std::move(backup_entry)); } @@ -286,7 +286,7 @@ namespace std::pair makeBackupEntryForMetadata(const IAST & create_query) const { auto metadata_entry = std::make_unique(serializeAST(create_query)); - String metadata_path = PathsInBackup::getMetadataPath(create_query, backup_settings.shard, backup_settings.replica); + String metadata_path = PathsInBackup::getMetadataPath(create_query, backup_settings.shard_num, backup_settings.replica_num); return {metadata_path, std::move(metadata_entry)}; } diff --git a/src/Backups/Common/BackupSettings.cpp b/src/Backups/Common/BackupSettings.cpp index 0722ef14972..bbe879fb0e8 100644 --- a/src/Backups/Common/BackupSettings.cpp +++ b/src/Backups/Common/BackupSettings.cpp @@ -19,8 +19,8 @@ namespace ErrorCodes M(String, password) \ M(Bool, structure_only) \ M(Bool, async) \ - M(UInt64, shard) \ - M(UInt64, replica) \ + M(UInt64, shard_num) \ + M(UInt64, replica_num) \ M(Bool, allow_storing_multiple_replicas) \ M(Bool, internal) \ M(String, coordination_zk_path) diff --git a/src/Backups/Common/BackupSettings.h b/src/Backups/Common/BackupSettings.h index fd495d45bd6..5fd29f510ab 100644 --- 
a/src/Backups/Common/BackupSettings.h +++ b/src/Backups/Common/BackupSettings.h @@ -30,11 +30,11 @@ struct BackupSettings /// 1-based shard index to store in the backup. 0 means all shards. /// Can only be used with BACKUP ON CLUSTER. - size_t shard = 0; + size_t shard_num = 0; /// 1-based replica index to store in the backup. 0 means all replicas (see also allow_storing_multiple_replicas). /// Can only be used with BACKUP ON CLUSTER. - size_t replica = 0; + size_t replica_num = 0; /// Allows storing in the backup of multiple replicas. bool allow_storing_multiple_replicas = false; diff --git a/src/Backups/Common/RestoreSettings.cpp b/src/Backups/Common/RestoreSettings.cpp index 0b046a41336..f9fc8e044d5 100644 --- a/src/Backups/Common/RestoreSettings.cpp +++ b/src/Backups/Common/RestoreSettings.cpp @@ -69,10 +69,10 @@ namespace M(Bool, allow_different_table_def) \ M(Bool, allow_different_database_def) \ M(Bool, async) \ - M(UInt64, shard) \ - M(UInt64, replica) \ - M(UInt64, shard_in_backup) \ - M(UInt64, replica_in_backup) \ + M(UInt64, shard_num) \ + M(UInt64, replica_num) \ + M(UInt64, shard_num_in_backup) \ + M(UInt64, replica_num_in_backup) \ M(Bool, internal) \ M(String, coordination_zk_path) diff --git a/src/Backups/Common/RestoreSettings.h b/src/Backups/Common/RestoreSettings.h index 474f5d75420..136cfcaf239 100644 --- a/src/Backups/Common/RestoreSettings.h +++ b/src/Backups/Common/RestoreSettings.h @@ -63,21 +63,21 @@ struct RestoreSettings : public StorageRestoreSettings /// 1-based shard index to restore from the backup. 0 means all shards. /// Can only be used with RESTORE ON CLUSTER. - size_t shard = 0; + size_t shard_num = 0; /// 1-based replica index to restore from the backup. 0 means all replicas. /// Can only be used with RESTORE ON CLUSTER. - size_t replica = 0; + size_t replica_num = 0; /// 1-based index of a shard stored in the backup to get data from. /// By default it's 0: if the backup contains only one shard it means the index of that shard /// else it means the same as `shard`. - size_t shard_in_backup = 0; + size_t shard_num_in_backup = 0; /// 1-based index of a replica stored in the backup to get data from. /// By default it's 0: if the backup contains only one replica for the current shard it means the index of that replica /// else it means the same as `replica`. - size_t replica_in_backup = 0; + size_t replica_num_in_backup = 0; /// Internal, should not be specified by user. 
bool internal = false; diff --git a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp index 64cfea8c6a6..4fb876d26b8 100644 --- a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp +++ b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp @@ -31,8 +31,8 @@ rewriteBackupQueryWithoutOnCluster(const ASTBackupQuery & backup_query, const Wi auto backup_settings = BackupSettings::fromBackupQuery(backup_query); backup_settings.internal = true; backup_settings.async = false; - backup_settings.shard = params.shard_index; - backup_settings.replica = params.replica_index; + backup_settings.shard_num = params.shard_index; + backup_settings.replica_num = params.replica_index; auto new_query = std::static_pointer_cast(backup_query.clone()); new_query->cluster.clear(); backup_settings.copySettingsToBackupQuery(*new_query); @@ -47,8 +47,8 @@ rewriteRestoreQueryWithoutOnCluster(const ASTBackupQuery & restore_query, const auto restore_settings = RestoreSettings::fromRestoreQuery(restore_query); restore_settings.internal = true; restore_settings.async = false; - restore_settings.shard = params.shard_index; - restore_settings.replica = params.replica_index; + restore_settings.shard_num = params.shard_index; + restore_settings.replica_num = params.replica_index; auto new_query = std::static_pointer_cast(restore_query.clone()); new_query->cluster.clear(); restore_settings.copySettingsToRestoreQuery(*new_query); diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index 1d13683c359..222d39fe739 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -431,7 +431,7 @@ namespace if (restore_settings->structure_only) return false; - data_path_in_backup = PathsInBackup{*backup}.getDataPath(table_name_in_backup, restore_settings->shard_in_backup, restore_settings->replica_in_backup); + data_path_in_backup = PathsInBackup{*backup}.getDataPath(table_name_in_backup, restore_settings->shard_num_in_backup, restore_settings->replica_num_in_backup); if (backup->listFiles(data_path_in_backup).empty()) return false; @@ -460,7 +460,7 @@ namespace if (!hasData()) return {}; - if (restore_settings->replica == 2) + if (restore_settings->replica_num == 2) sleepForSeconds(5); return storage->restoreData(context, partitions, backup, data_path_in_backup, *restore_settings, restore_coordination); @@ -554,28 +554,28 @@ namespace void adjustIndicesOfSourceShardAndReplicaInBackup() { auto shards_in_backup = PathsInBackup{*backup}.getShards(); - if (!restore_settings.shard_in_backup) + if (!restore_settings.shard_num_in_backup) { if (shards_in_backup.size() == 1) - restore_settings.shard_in_backup = shards_in_backup[0]; + restore_settings.shard_num_in_backup = shards_in_backup[0]; else - restore_settings.shard_in_backup = restore_settings.shard; + restore_settings.shard_num_in_backup = restore_settings.shard_num; } - if (std::find(shards_in_backup.begin(), shards_in_backup.end(), restore_settings.shard_in_backup) == shards_in_backup.end()) - throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No shard #{} in backup", restore_settings.shard_in_backup); + if (std::find(shards_in_backup.begin(), shards_in_backup.end(), restore_settings.shard_num_in_backup) == shards_in_backup.end()) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No shard #{} in backup", restore_settings.shard_num_in_backup); - auto replicas_in_backup = PathsInBackup{*backup}.getReplicas(restore_settings.shard_in_backup); - if 
(!restore_settings.replica_in_backup) + auto replicas_in_backup = PathsInBackup{*backup}.getReplicas(restore_settings.shard_num_in_backup); + if (!restore_settings.replica_num_in_backup) { if (replicas_in_backup.size() == 1) - restore_settings.replica_in_backup = replicas_in_backup[0]; + restore_settings.replica_num_in_backup = replicas_in_backup[0]; else - restore_settings.replica_in_backup = restore_settings.replica; + restore_settings.replica_num_in_backup = restore_settings.replica_num; } - if (std::find(replicas_in_backup.begin(), replicas_in_backup.end(), restore_settings.replica_in_backup) == replicas_in_backup.end()) - throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No replica #{} in backup", restore_settings.replica_in_backup); + if (std::find(replicas_in_backup.begin(), replicas_in_backup.end(), restore_settings.replica_num_in_backup) == replicas_in_backup.end()) + throw Exception(ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "No replica #{} in backup", restore_settings.replica_num_in_backup); } /// Prepares to restore a single table and probably its database's definition. @@ -604,7 +604,7 @@ namespace if (databases.contains(new_database_name)) throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} twice", backQuoteIfNeed(new_database_name)); - Strings table_names = PathsInBackup{*backup}.getTables(database_name_, restore_settings.shard_in_backup, restore_settings.replica_in_backup); + Strings table_names = PathsInBackup{*backup}.getTables(database_name_, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); bool has_tables_in_backup = !table_names.empty(); bool has_create_query_in_backup = hasCreateQueryInBackup(database_name_); @@ -643,7 +643,7 @@ namespace /// Prepares to restore all the databases contained in the backup. void prepareToRestoreAllDatabases(const std::set & except_list_) { - for (const String & database_name : PathsInBackup{*backup}.getDatabases(restore_settings.shard_in_backup, restore_settings.replica_in_backup)) + for (const String & database_name : PathsInBackup{*backup}.getDatabases(restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup)) { if (except_list_.contains(database_name)) continue; @@ -654,7 +654,7 @@ namespace /// Reads a create query for creating a specified table from the backup. std::shared_ptr readCreateQueryFromBackup(const DatabaseAndTableName & table_name) const { - String create_query_path = PathsInBackup{*backup}.getMetadataPath(table_name, restore_settings.shard_in_backup, restore_settings.replica_in_backup); + String create_query_path = PathsInBackup{*backup}.getMetadataPath(table_name, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); if (!backup->fileExists(create_query_path)) throw Exception(ErrorCodes::CANNOT_RESTORE_TABLE, "Cannot restore the {} because there is no such table in the backup", formatTableNameOrTemporaryTableName(table_name)); @@ -669,7 +669,7 @@ namespace /// Reads a create query for creating a specified database from the backup. 
std::shared_ptr readCreateQueryFromBackup(const String & database_name) const { - String create_query_path = PathsInBackup{*backup}.getMetadataPath(database_name, restore_settings.shard_in_backup, restore_settings.replica_in_backup); + String create_query_path = PathsInBackup{*backup}.getMetadataPath(database_name, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); if (!backup->fileExists(create_query_path)) throw Exception(ErrorCodes::CANNOT_RESTORE_DATABASE, "Cannot restore the database {} because there is no such database in the backup", backQuoteIfNeed(database_name)); auto read_buffer = backup->readFile(create_query_path)->getReadBuffer(); @@ -683,7 +683,7 @@ namespace /// Whether there is a create query for creating a specified database in the backup. bool hasCreateQueryInBackup(const String & database_name) const { - String create_query_path = PathsInBackup{*backup}.getMetadataPath(database_name, restore_settings.shard_in_backup, restore_settings.replica_in_backup); + String create_query_path = PathsInBackup{*backup}.getMetadataPath(database_name, restore_settings.shard_num_in_backup, restore_settings.replica_num_in_backup); return backup->fileExists(create_query_path); } diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index 7961dd28759..107039317d4 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -74,8 +74,8 @@ namespace worker.update(task_id, BackupStatus::MAKING_BACKUP); DDLQueryOnClusterParams params; - params.shard_index = new_backup_settings.shard; - params.replica_index = new_backup_settings.replica; + params.shard_index = new_backup_settings.shard_num; + params.replica_index = new_backup_settings.replica_num; params.allow_multiple_replicas = new_backup_settings.allow_storing_multiple_replicas; auto res = executeDDLQueryOnCluster(new_query, context, params); @@ -129,8 +129,8 @@ namespace if (!query.cluster.empty()) { DDLQueryOnClusterParams params; - params.shard_index = new_restore_settings.shard; - params.replica_index = new_restore_settings.replica; + params.shard_index = new_restore_settings.shard_num; + params.replica_index = new_restore_settings.replica_num; auto res = executeDDLQueryOnCluster(new_query, context, params); PullingPipelineExecutor executor(res.pipeline); From 4de4fff590a41873bbd8fe6f2bac07346892dc8d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 12:54:23 +0200 Subject: [PATCH 63/94] Fix tests. 
--- .../test_backup_restore_new/test.py | 26 ++++++++++--------- .../test_backup_restore_on_cluster/test.py | 15 +++++------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 56166046253..51daf6d37e8 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -45,9 +45,9 @@ def new_backup_name(): return f"Disk('backups', '{backup_id_counter}/')" -def get_backup_dir(backup_name): - counter = int(backup_name.split(",")[1].strip("')/ ")) - return os.path.join(instance.path, f"backups/{counter}") +def get_path_to_backup(backup_name): + name = backup_name.split(",")[1].strip("')/ ") + return os.path.join(instance.cluster.instances_dir, "backups", name) @pytest.mark.parametrize( @@ -158,14 +158,18 @@ def test_incremental_backup_after_renaming_table(): # Files in a base backup can be searched by checksum, so an incremental backup with a renamed table actually # contains only its changed metadata. - assert os.path.isdir(os.path.join(get_backup_dir(backup_name), "metadata")) == True - assert os.path.isdir(os.path.join(get_backup_dir(backup_name), "data")) == True assert ( - os.path.isdir(os.path.join(get_backup_dir(incremental_backup_name), "metadata")) + os.path.isdir(os.path.join(get_path_to_backup(backup_name), "metadata")) == True + ) + assert os.path.isdir(os.path.join(get_path_to_backup(backup_name), "data")) == True + assert ( + os.path.isdir( + os.path.join(get_path_to_backup(incremental_backup_name), "metadata") + ) == True ) assert ( - os.path.isdir(os.path.join(get_backup_dir(incremental_backup_name), "data")) + os.path.isdir(os.path.join(get_path_to_backup(incremental_backup_name), "data")) == False ) @@ -222,14 +226,12 @@ def test_database(): def test_zip_archive(): - backup_name = f"File('/backups/archive.zip')" + backup_name = f"Disk('backups', 'archive.zip')" create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query(f"BACKUP TABLE test.table TO {backup_name}") - assert os.path.isfile( - os.path.join(os.path.join(instance.path, "backups/archive.zip")) - ) + assert os.path.isfile(get_path_to_backup(backup_name)) instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" @@ -239,7 +241,7 @@ def test_zip_archive(): def test_zip_archive_with_settings(): - backup_name = f"File('/backups/archive_with_settings.zip')" + backup_name = f"Disk('backups', 'archive_with_settings.zip')" create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 7ddbe035558..36ec02bb770 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -50,15 +50,12 @@ backup_id_counter = 0 def new_backup_name(): global backup_id_counter backup_id_counter += 1 - return f"Disk('backups', '{backup_id_counter}.zip')" + return f"Disk('backups', '{backup_id_counter}')" -def get_path_to_backup(instance, backup_name): - return os.path.join( - instance.path, - "backups", - backup_name.removeprefix("Disk('backups', '").removesuffix("')"), - ) +def get_path_to_backup(backup_name): + name = backup_name.split(",")[1].strip("')/ ") + return os.path.join(instance.cluster.instances_dir, "backups", name) def 
test_replicated_table(): @@ -78,7 +75,7 @@ def test_replicated_table(): # Make backup on node 1. node1.query( - f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica=1" + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=1" ) # Drop table on both nodes. @@ -114,7 +111,7 @@ def test_replicated_database(): # Make backup. backup_name = new_backup_name() node1.query( - f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica=2" + f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2" ) # Drop table on both nodes. From 97cd68239f91bc696c7d8fa3e48fb14dc8004235 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 14:29:23 +0200 Subject: [PATCH 64/94] Change representation of shards inside backup. --- src/Backups/BackupUtils.cpp | 2 +- src/Backups/RestoreUtils.cpp | 48 +++++++++++++----------------- 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp index 6f0cdd30385..cd89175cb6a 100644 --- a/src/Backups/BackupUtils.cpp +++ b/src/Backups/BackupUtils.cpp @@ -82,7 +82,7 @@ namespace static String getPathForShardAndReplica(size_t shard_index, size_t replica_index) { if (shard_index || replica_index) - return fmt::format("shard{}/replica{}/", shard_index, replica_index); + return fmt::format("shards/{}/replicas/{}/", shard_index, replica_index); else return ""; } diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index 222d39fe739..d186497f3b1 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -50,15 +50,8 @@ namespace std::vector getShards() const { std::vector res; - constexpr std::string_view shard_prefix = "shard"; - for (const String & shard_dir : backup.listFiles("")) - { - if (shard_dir.starts_with(shard_prefix)) - { - size_t shard_index = parse(shard_dir.substr(shard_prefix.size())); - res.push_back(shard_index); - } - } + for (const String & shard_index : backup.listFiles("shards/")) + res.push_back(parse(shard_index)); if (res.empty()) res.push_back(1); return res; @@ -67,17 +60,10 @@ namespace std::vector getReplicas(size_t shard_index) const { std::vector res; - constexpr std::string_view replica_prefix = "replica"; - for (const String & replica_dir : backup.listFiles(fmt::format("shard{}/", shard_index))) - { - if (replica_dir.starts_with(replica_prefix)) - { - size_t replica_index = parse(replica_dir.substr(replica_prefix.size())); - res.push_back(replica_index); - } - } + for (const String & replica_index : backup.listFiles(fmt::format("shards/{}/replicas/", shard_index))) + res.push_back(parse(replica_index)); if (res.empty()) - res.push_back(1); + res.push_back(1); return res; } @@ -85,8 +71,8 @@ namespace { std::vector res; - insertAtEnd(res, backup.listFiles(fmt::format("shard{}/replica{}/metadata/", shard_index, replica_index))); - insertAtEnd(res, backup.listFiles(fmt::format("shard{}/metadata/", shard_index))); + insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/replicas/{}/metadata/", shard_index, replica_index))); + insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/metadata/", shard_index))); insertAtEnd(res, backup.listFiles(fmt::format("metadata/"))); boost::range::remove_erase_if( @@ -112,8 +98,8 @@ namespace std::vector res; String escaped_database_name = escapeForFileName(database_name);
- insertAtEnd(res, backup.listFiles(fmt::format("shard{}/metadata/{}/", shard_index, escaped_database_name))); + insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/replicas/{}/metadata/{}/", shard_index, replica_index, escaped_database_name))); + insertAtEnd(res, backup.listFiles(fmt::format("shards/{}/metadata/{}/", shard_index, escaped_database_name))); insertAtEnd(res, backup.listFiles(fmt::format("metadata/{}/", escaped_database_name))); boost::range::remove_erase_if( @@ -138,10 +124,10 @@ namespace String getMetadataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) const { String escaped_table_name = escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second); - String path1 = fmt::format("shard{}/replica{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name); + String path1 = fmt::format("shards/{}/replicas/{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name); if (backup.fileExists(path1)) return path1; - String path2 = fmt::format("shard{}/metadata/{}.sql", shard_index, escaped_table_name); + String path2 = fmt::format("shards/{}/metadata/{}.sql", shard_index, escaped_table_name); if (backup.fileExists(path2)) return path2; String path3 = fmt::format("metadata/{}.sql", escaped_table_name); @@ -151,10 +137,10 @@ namespace String getMetadataPath(const String & database_name, size_t shard_index, size_t replica_index) const { String escaped_database_name = escapeForFileName(database_name); - String path1 = fmt::format("shard{}/replica{}/metadata/{}.sql", shard_index, replica_index, escaped_database_name); + String path1 = fmt::format("shards/{}/replicas/{}/metadata/{}.sql", shard_index, replica_index, escaped_database_name); if (backup.fileExists(path1)) return path1; - String path2 = fmt::format("shard{}/metadata/{}.sql", shard_index, escaped_database_name); + String path2 = fmt::format("shards/{}/metadata/{}.sql", shard_index, escaped_database_name); if (backup.fileExists(path2)) return path2; String path3 = fmt::format("metadata/{}.sql", escaped_database_name); @@ -164,10 +150,10 @@ namespace String getDataPath(const DatabaseAndTableName & table_name, size_t shard_index, size_t replica_index) const { String escaped_table_name = escapeForFileName(table_name.first) + "/" + escapeForFileName(table_name.second); - if (backup.fileExists(fmt::format("shard{}/replica{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name))) - return fmt::format("shard{}/replica{}/data/{}/", shard_index, replica_index, escaped_table_name); - if (backup.fileExists(fmt::format("shard{}/metadata/{}.sql", shard_index, escaped_table_name))) - return fmt::format("shard{}/data/{}/", shard_index, escaped_table_name); + if (backup.fileExists(fmt::format("shards/{}/replicas/{}/metadata/{}.sql", shard_index, replica_index, escaped_table_name))) + return fmt::format("shards/{}/replicas/{}/data/{}/", shard_index, replica_index, escaped_table_name); + if (backup.fileExists(fmt::format("shards/{}/metadata/{}.sql", shard_index, escaped_table_name))) + return fmt::format("shards/{}/data/{}/", shard_index, escaped_table_name); return fmt::format("data/{}/", escaped_table_name); } From ed0af9d4a0b96144fc3f299432a972bc48109cd8 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 16:09:46 +0200 Subject: [PATCH 65/94] Improve shutdown with async backup/restore. 
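
This patch replaces the detached per-query threads with a shared pool owned by
BackupsWorker: run() schedules a task on the pool, and shutdown() blocks until
every scheduled task has finished, which is what lets the Context wait for
in-flight backups and restores before DatabaseCatalog::shutdown(). Below is a
minimal standalone sketch of that run()/shutdown() contract, using plain
std::thread primitives instead of ClickHouse's ThreadPool; the names here are
illustrative, not the real API.

    // Sketch only: schedule background work, then block in shutdown()
    // until all of it has completed.
    #include <condition_variable>
    #include <functional>
    #include <iostream>
    #include <mutex>
    #include <thread>
    #include <vector>

    class TinyWorker
    {
    public:
        void run(std::function<void()> && task)
        {
            {
                std::lock_guard<std::mutex> lock(mutex);
                ++active_tasks;
            }
            threads.emplace_back([this, task = std::move(task)]
            {
                task();
                std::lock_guard<std::mutex> lock(mutex);
                if (--active_tasks == 0)
                    all_done.notify_all();
            });
        }

        void shutdown()
        {
            {
                std::unique_lock<std::mutex> lock(mutex);
                if (active_tasks)
                    std::cout << "Waiting for " << active_tasks << " backup or restore tasks to be finished\n";
                all_done.wait(lock, [this] { return active_tasks == 0; });
            }
            for (auto & thread : threads)
                thread.join();
            std::cout << "All backup and restore tasks have finished\n";
        }

    private:
        std::mutex mutex;
        std::condition_variable all_done;
        size_t active_tasks = 0;
        std::vector<std::thread> threads;
    };

    int main()
    {
        TinyWorker worker;
        worker.run([] { /* an asynchronous BACKUP would run here */ });
        worker.run([] { /* an asynchronous RESTORE would run here */ });
        worker.shutdown(); // returns only after both tasks have completed
    }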
--- src/Backups/BackupsWorker.cpp | 21 +++++++++++++++++++++ src/Backups/BackupsWorker.h | 10 ++++++++++ src/Interpreters/Context.cpp | 4 ++++ src/Interpreters/InterpreterBackupQuery.cpp | 7 ++----- 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 6f68ba32242..b282aa5053f 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace DB @@ -15,6 +16,11 @@ BackupsWorker & BackupsWorker::instance() return the_instance; } +BackupsWorker::BackupsWorker() +{ +} + + size_t BackupsWorker::add(const String & backup_name, BackupStatus status, const String & error) { std::lock_guard lock{mutex}; @@ -72,4 +78,19 @@ std::vector BackupsWorker::getEntries() const return entries; } +void BackupsWorker::run(std::function && task) +{ + thread_pool.scheduleOrThrowOnError(std::move(task)); +} + +void BackupsWorker::shutdown() +{ + size_t num_active_tasks = thread_pool.active(); + if (!num_active_tasks) + return; + LOG_INFO(&Poco::Logger::get("BackupsWorker"), "Waiting for {} backup or restore tasks to be finished", num_active_tasks); + thread_pool.wait(); + LOG_INFO(&Poco::Logger::get("BackupsWorker"), "All backup and restore tasks have finished"); +} + } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index b207bc4c5e8..a3c5fe25352 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -28,12 +29,21 @@ public: Entry getEntry(UInt64 task_id) const; std::vector getEntries() const; + /// Schedules a new task and perfoms it in the background thread. + void run(std::function && task); + + /// Waits until all tasks have been completed. + void shutdown(); + private: + BackupsWorker(); + mutable std::mutex mutex; std::vector entries; std::unordered_map entries_by_name; std::unordered_map entries_by_task_id; UInt64 current_task_id = 0; + ThreadPool thread_pool; }; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 0ad9b72b963..0b5a6ec9391 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -352,6 +353,9 @@ struct ContextSharedPart Session::shutdownNamedSessions(); + /// Waiting for current backups/restores to be finished. This must be done before `DatabaseCatalog::shutdown()`. + BackupsWorker::instance().shutdown(); + /** After system_logs have been shut down it is guaranteed that no system table gets created or written to. * Note that part changes at shutdown won't be logged to part log. */ diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index 107039317d4..6d3be56fa87 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -166,9 +166,7 @@ namespace if (backup_settings.async) { - ThreadFromGlobalPool thread{ - &executeBackupSync, query, task_id, context, backup_info, backup_settings, /* no_throw = */ true}; - thread.detach(); /// TODO: Remove this !!! 
Move that thread to BackupsWorker instead + BackupsWorker::instance().run([query, task_id, context, backup_info, backup_settings]{ executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ true); }); } else { @@ -188,8 +186,7 @@ namespace if (restore_settings.async) { - ThreadFromGlobalPool thread{&executeRestoreSync, query, task_id, context, backup_info, restore_settings, /* no_throw = */ true}; - thread.detach(); /// TODO: Remove this !!! Move that thread to BackupsWorker instead + BackupsWorker::instance().run([query, task_id, context, backup_info, restore_settings]{ executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ true); }); } else { From 5127eeee7b118d2dba758110813053195a4bc3aa Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 17:10:29 +0200 Subject: [PATCH 66/94] Remove debug sleep. --- src/Backups/RestoreUtils.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index d186497f3b1..cd9510608d6 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -446,9 +446,6 @@ namespace if (!hasData()) return {}; - if (restore_settings->replica_num == 2) - sleepForSeconds(5); - return storage->restoreData(context, partitions, backup, data_path_in_backup, *restore_settings, restore_coordination); } From 7b2422ccef824c65bdbf1ce4b6109a4a4d698271 Mon Sep 17 00:00:00 2001 From: fenglv Date: Mon, 25 Apr 2022 16:31:37 +0000 Subject: [PATCH 67/94] remove unneeded if statement --- src/QueryPipeline/Pipe.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/QueryPipeline/Pipe.cpp b/src/QueryPipeline/Pipe.cpp index 39d2e49cbb7..67faaefbf66 100644 --- a/src/QueryPipeline/Pipe.cpp +++ b/src/QueryPipeline/Pipe.cpp @@ -170,8 +170,7 @@ Pipe::Pipe(ProcessorPtr source, OutputPort * output, OutputPort * totals, Output Pipe::Pipe(ProcessorPtr source) { - if (source->getOutputs().size() != 1) - checkSource(*source); + checkSource(*source); if (collected_processors) collected_processors->emplace_back(source); From 000b184691cdc2e2604a39e9d43eb99bffb18736 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 25 Apr 2022 16:33:25 +0200 Subject: [PATCH 68/94] Fix style & compilation. 
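
Apart from the file moves below, the interesting part of this patch is that
the ON CLUSTER rewrite (formerly rewriteBackupQueryWithoutOnCluster and
rewriteRestoreQueryWithoutOnCluster) is folded into ASTBackupQuery itself:
the reserved coordination settings are erased from whatever the user supplied
and then pinned for the local shard and replica. A simplified sketch of that
filter-then-pin step, with SettingChange reduced to a plain name/value pair
rather than the real AST types:

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    struct SettingChange { std::string name; std::string value; };

    // Drop any user-supplied values for the coordination settings,
    // then append the values fixed for this shard and replica.
    std::vector<SettingChange> rewriteForReplica(
        std::vector<SettingChange> changes, size_t shard_index, size_t replica_index)
    {
        auto reserved = [](const SettingChange & change)
        {
            const std::string & name = change.name;
            return name == "internal" || name == "async"
                || name == "shard_num" || name == "replica_num";
        };
        changes.erase(std::remove_if(changes.begin(), changes.end(), reserved), changes.end());

        changes.push_back({"internal", "true"});
        changes.push_back({"async", "false"});
        changes.push_back({"shard_num", std::to_string(shard_index)});
        changes.push_back({"replica_num", std::to_string(replica_index)});
        return changes;
    }

    int main()
    {
        // The user's "async" value is discarded; the pinned values win.
        for (const auto & change : rewriteForReplica({{"password", "qwerty"}, {"async", "true"}}, 1, 2))
            std::cout << change.name << " = " << change.value << '\n';
    }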
--- src/Backups/BackupCoordinationDistributed.h | 2 + src/Backups/BackupCoordinationLocal.h | 3 + src/Backups/{Common => }/BackupSettings.cpp | 2 +- src/Backups/{Common => }/BackupSettings.h | 0 src/Backups/BackupUtils.cpp | 2 +- src/Backups/BackupsWorker.cpp | 6 +- src/Backups/BackupsWorker.h | 16 ++--- src/Backups/Common/CMakeLists.txt | 5 -- .../rewriteBackupQueryWithoutOnCluster.cpp | 59 ------------------- .../rewriteBackupQueryWithoutOnCluster.h | 19 ------ src/Backups/IBackupCoordination.h | 1 + src/Backups/RestoreCoordinationDistributed.h | 2 + src/Backups/RestoreCoordinationLocal.h | 1 + src/Backups/{Common => }/RestoreSettings.cpp | 2 +- src/Backups/{Common => }/RestoreSettings.h | 0 src/Backups/RestoreUtils.cpp | 2 +- src/CMakeLists.txt | 1 - src/Interpreters/InterpreterBackupQuery.cpp | 16 ++--- src/Parsers/ASTBackupQuery.cpp | 54 +++++++++++++++-- src/Parsers/CMakeLists.txt | 1 - 20 files changed, 81 insertions(+), 113 deletions(-) rename src/Backups/{Common => }/BackupSettings.cpp (97%) rename src/Backups/{Common => }/BackupSettings.h (100%) delete mode 100644 src/Backups/Common/CMakeLists.txt delete mode 100644 src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp delete mode 100644 src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h rename src/Backups/{Common => }/RestoreSettings.cpp (98%) rename src/Backups/{Common => }/RestoreSettings.h (100%) diff --git a/src/Backups/BackupCoordinationDistributed.h b/src/Backups/BackupCoordinationDistributed.h index 7a7caad7299..ef23740761f 100644 --- a/src/Backups/BackupCoordinationDistributed.h +++ b/src/Backups/BackupCoordinationDistributed.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index d47616e2ddf..bb0072cf4d7 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -1,5 +1,8 @@ +#pragma once + #include #include +#include namespace DB diff --git a/src/Backups/Common/BackupSettings.cpp b/src/Backups/BackupSettings.cpp similarity index 97% rename from src/Backups/Common/BackupSettings.cpp rename to src/Backups/BackupSettings.cpp index bbe879fb0e8..818b93edcc6 100644 --- a/src/Backups/Common/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Backups/Common/BackupSettings.h b/src/Backups/BackupSettings.h similarity index 100% rename from src/Backups/Common/BackupSettings.h rename to src/Backups/BackupSettings.h diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp index cd89175cb6a..248ef58f3ce 100644 --- a/src/Backups/BackupUtils.cpp +++ b/src/Backups/BackupUtils.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index b282aa5053f..3a1bf6d7d0c 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -25,7 +25,7 @@ size_t BackupsWorker::add(const String & backup_name, BackupStatus status, const { std::lock_guard lock{mutex}; - UInt64 task_id = ++current_task_id; + size_t task_id = ++current_task_id; size_t pos; auto it = entries_by_name.find(backup_name); if (it != entries_by_name.end()) @@ -51,7 +51,7 @@ size_t BackupsWorker::add(const String & backup_name, BackupStatus status, const return task_id; } -void BackupsWorker::update(UInt64 task_id, BackupStatus status, const String & error) +void BackupsWorker::update(size_t task_id, 
BackupStatus status, const String & error) { std::lock_guard lock{mutex}; auto it = entries_by_task_id.find(task_id); @@ -63,7 +63,7 @@ void BackupsWorker::update(UInt64 task_id, BackupStatus status, const String & e entry.timestamp = std::time(nullptr); } -BackupsWorker::Entry BackupsWorker::getEntry(UInt64 task_id) const +BackupsWorker::Entry BackupsWorker::getEntry(size_t task_id) const { std::lock_guard lock{mutex}; auto it = entries_by_task_id.find(task_id); diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index a3c5fe25352..9c74f6c31c2 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -14,22 +14,22 @@ class BackupsWorker public: static BackupsWorker & instance(); - UInt64 add(const String & backup_name, BackupStatus status, const String & error = {}); - void update(UInt64 task_id, BackupStatus status, const String & error = {}); + size_t add(const String & backup_name, BackupStatus status, const String & error = {}); + void update(size_t task_id, BackupStatus status, const String & error = {}); struct Entry { String backup_name; - UInt64 task_id; + size_t task_id; BackupStatus status; String error; time_t timestamp; }; - Entry getEntry(UInt64 task_id) const; + Entry getEntry(size_t task_id) const; std::vector getEntries() const; - /// Schedules a new task and perfoms it in the background thread. + /// Schedules a new task and performs it in the background thread. void run(std::function && task); /// Waits until all tasks have been completed. @@ -40,9 +40,9 @@ private: mutable std::mutex mutex; std::vector entries; - std::unordered_map entries_by_name; - std::unordered_map entries_by_task_id; - UInt64 current_task_id = 0; + std::unordered_map entries_by_name; + std::unordered_map entries_by_task_id; + size_t current_task_id = 0; ThreadPool thread_pool; }; diff --git a/src/Backups/Common/CMakeLists.txt b/src/Backups/Common/CMakeLists.txt deleted file mode 100644 index 1b65d57b926..00000000000 --- a/src/Backups/Common/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") - -add_headers_and_sources(clickhouse_common_backups .) 
-add_library(clickhouse_common_backups ${clickhouse_common_backups_headers} ${clickhouse_common_backups_sources}) -target_link_libraries(clickhouse_common_backups PUBLIC clickhouse_common_io) diff --git a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp deleted file mode 100644 index 4fb876d26b8..00000000000 --- a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace -{ - void setDatabaseInElements(ASTBackupQuery::Elements & elements, const String & new_database) - { - for (auto & element : elements) - { - if (element.type == ASTBackupQuery::TABLE) - { - if (element.name.first.empty() && !element.name.second.empty() && !element.name_is_in_temp_db) - element.name.first = new_database; - if (element.new_name.first.empty() && !element.name.second.empty() && !element.name_is_in_temp_db) - element.new_name.first = new_database; - } - } - } -} - -std::shared_ptr -rewriteBackupQueryWithoutOnCluster(const ASTBackupQuery & backup_query, const WithoutOnClusterASTRewriteParams & params) -{ - auto backup_settings = BackupSettings::fromBackupQuery(backup_query); - backup_settings.internal = true; - backup_settings.async = false; - backup_settings.shard_num = params.shard_index; - backup_settings.replica_num = params.replica_index; - auto new_query = std::static_pointer_cast(backup_query.clone()); - new_query->cluster.clear(); - backup_settings.copySettingsToBackupQuery(*new_query); - setDatabaseInElements(new_query->elements, params.default_database); - return new_query; -} - - -std::shared_ptr -rewriteRestoreQueryWithoutOnCluster(const ASTBackupQuery & restore_query, const WithoutOnClusterASTRewriteParams & params) -{ - auto restore_settings = RestoreSettings::fromRestoreQuery(restore_query); - restore_settings.internal = true; - restore_settings.async = false; - restore_settings.shard_num = params.shard_index; - restore_settings.replica_num = params.replica_index; - auto new_query = std::static_pointer_cast(restore_query.clone()); - new_query->cluster.clear(); - restore_settings.copySettingsToRestoreQuery(*new_query); - setDatabaseInElements(new_query->elements, params.default_database); - return new_query; -} - -} diff --git a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h b/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h deleted file mode 100644 index 720a397cdc1..00000000000 --- a/src/Backups/Common/rewriteBackupQueryWithoutOnCluster.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ -class ASTBackupQuery; -struct WithoutOnClusterASTRewriteParams; - -/// Rewrites elements of BACKUP-ON-CLUSTER query after receiving it on shards or replica. -std::shared_ptr -rewriteBackupQueryWithoutOnCluster(const ASTBackupQuery & backup_query, const WithoutOnClusterASTRewriteParams & params); - -/// Rewrites elements of RESTORE-ON-CLUSTER query after receiving it on shards or replica. 
-std::shared_ptr -rewriteRestoreQueryWithoutOnCluster(const ASTBackupQuery & restore_query, const WithoutOnClusterASTRewriteParams & params); - -} diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 8a32ebde268..f88c22531c0 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB diff --git a/src/Backups/RestoreCoordinationDistributed.h b/src/Backups/RestoreCoordinationDistributed.h index 4f1da10a8b5..4cad3a52622 100644 --- a/src/Backups/RestoreCoordinationDistributed.h +++ b/src/Backups/RestoreCoordinationDistributed.h @@ -3,6 +3,8 @@ #include #include #include +#include + namespace DB { diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index 66ec53399f5..9b7cc2a707d 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -2,6 +2,7 @@ #include #include +#include #include diff --git a/src/Backups/Common/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp similarity index 98% rename from src/Backups/Common/RestoreSettings.cpp rename to src/Backups/RestoreSettings.cpp index f9fc8e044d5..b9480448ccf 100644 --- a/src/Backups/Common/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Backups/Common/RestoreSettings.h b/src/Backups/RestoreSettings.h similarity index 100% rename from src/Backups/Common/RestoreSettings.h rename to src/Backups/RestoreSettings.h diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index cd9510608d6..cf2962a442e 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d31bfe114f8..e8e1153e2b4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -215,7 +215,6 @@ if (TARGET ch_contrib::jemalloc) endif() add_subdirectory(Access/Common) -add_subdirectory(Backups/Common) add_subdirectory(Common/ZooKeeper) add_subdirectory(Common/Config) diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index 6d3be56fa87..d9080f54e95 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -1,6 +1,6 @@ #include -#include -#include +#include +#include #include #include #include @@ -51,7 +51,7 @@ namespace return BackupFactory::instance().createBackup(params); } - void executeBackupSync(const ASTBackupQuery & query, UInt64 task_id, const ContextPtr & context, const BackupInfo & backup_info, const BackupSettings & backup_settings, bool no_throw = false) + void executeBackupSync(const ASTBackupQuery & query, size_t task_id, const ContextPtr & context, const BackupInfo & backup_info, const BackupSettings & backup_settings, bool no_throw = false) { auto & worker = BackupsWorker::instance(); bool is_internal_backup = backup_settings.internal; @@ -107,7 +107,7 @@ namespace } } - void executeRestoreSync(const ASTBackupQuery & query, UInt64 task_id, ContextMutablePtr context, const BackupInfo & backup_info, const RestoreSettings & restore_settings, bool no_throw = false) + void executeRestoreSync(const ASTBackupQuery & query, size_t task_id, ContextMutablePtr context, const BackupInfo & backup_info, const RestoreSettings & restore_settings, bool no_throw = false) { auto & worker = BackupsWorker::instance(); 
bool is_internal_restore = restore_settings.internal; @@ -155,7 +155,7 @@ namespace } } - UInt64 executeBackup(const ASTBackupQuery & query, const ContextPtr & context) + size_t executeBackup(const ASTBackupQuery & query, const ContextPtr & context) { const auto backup_info = BackupInfo::fromAST(*query.backup_name); const auto backup_settings = BackupSettings::fromBackupQuery(query); @@ -175,7 +175,7 @@ namespace return task_id; } - UInt64 executeRestore(const ASTBackupQuery & query, ContextMutablePtr context) + size_t executeRestore(const ASTBackupQuery & query, ContextMutablePtr context) { const auto backup_info = BackupInfo::fromAST(*query.backup_name); const auto restore_settings = RestoreSettings::fromRestoreQuery(query); @@ -195,7 +195,7 @@ namespace return task_id; } - Block getResultRow(UInt64 task_id) + Block getResultRow(size_t task_id) { if (!task_id) return {}; @@ -223,7 +223,7 @@ BlockIO InterpreterBackupQuery::execute() { const auto & query = query_ptr->as(); - UInt64 task_id; + size_t task_id; if (query.kind == ASTBackupQuery::BACKUP) task_id = executeBackup(query, context); else diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index 50ca51bd019..d483f477e71 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -1,7 +1,9 @@ #include -#include +#include #include +#include #include +#include namespace DB @@ -137,8 +139,49 @@ namespace settings->format(format); } } + + + void setDatabaseInElements(ASTBackupQuery::Elements & elements, const String & new_database) + { + for (auto & element : elements) + { + if (element.type == ASTBackupQuery::TABLE) + { + if (element.name.first.empty() && !element.name.second.empty() && !element.name_is_in_temp_db) + element.name.first = new_database; + if (element.new_name.first.empty() && !element.name.second.empty() && !element.name_is_in_temp_db) + element.new_name.first = new_database; + } + } + } + + ASTPtr rewriteSettingsWithoutOnCluster(ASTPtr settings, const WithoutOnClusterASTRewriteParams & params) + { + SettingsChanges changes; + if (settings) + changes = assert_cast(settings.get())->changes; + + boost::remove_erase_if( + changes, + [](const SettingChange & change) + { + const String & name = change.name; + return (name == "internal") || (name == "async") || (name == "shard_num") || (name == "replica_num"); + }); + + changes.emplace_back("internal", true); + changes.emplace_back("async", false); + changes.emplace_back("shard_num", params.shard_index); + changes.emplace_back("replica_num", params.replica_index); + + auto out_settings = std::shared_ptr(); + out_settings->changes = std::move(changes); + out_settings->is_standalone = false; + return out_settings; + } } + String ASTBackupQuery::getID(char) const { return (kind == Kind::BACKUP) ? 
"BackupQuery" : "RestoreQuery"; @@ -168,10 +211,11 @@ void ASTBackupQuery::formatImpl(const FormatSettings & format, FormatState &, Fo ASTPtr ASTBackupQuery::getRewrittenASTWithoutOnCluster(const WithoutOnClusterASTRewriteParams & params) const { - if (kind == ASTBackupQuery::Kind::BACKUP) - return rewriteBackupQueryWithoutOnCluster(*this, params); - else - return rewriteRestoreQueryWithoutOnCluster(*this, params); + auto new_query = std::static_pointer_cast(clone()); + new_query->cluster.clear(); + new_query->settings = rewriteSettingsWithoutOnCluster(new_query->settings, params); + setDatabaseInElements(new_query->elements, params.default_database); + return new_query; } diff --git a/src/Parsers/CMakeLists.txt b/src/Parsers/CMakeLists.txt index 08722770773..b2c31366929 100644 --- a/src/Parsers/CMakeLists.txt +++ b/src/Parsers/CMakeLists.txt @@ -5,7 +5,6 @@ add_headers_and_sources(clickhouse_parsers ./Access) add_headers_and_sources(clickhouse_parsers ./MySQL) add_library(clickhouse_parsers ${clickhouse_parsers_headers} ${clickhouse_parsers_sources}) target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io clickhouse_common_access) -target_link_libraries(clickhouse_parsers PUBLIC clickhouse_common_io clickhouse_common_backups) if (USE_DEBUG_HELPERS) # CMake generator expression will do insane quoting when it encounters special character like quotes, spaces, etc. From 6b5d9201f6070d303a59b9a1bf2bf1e5ba51395e Mon Sep 17 00:00:00 2001 From: Memo Date: Tue, 26 Apr 2022 10:25:58 +0800 Subject: [PATCH 69/94] Update src/Core/Settings.h Co-authored-by: alesapin --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 51a5a6f3105..a755aacc044 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -248,7 +248,7 @@ class IColumn; M(UInt64, preferred_max_column_in_block_size_bytes, 0, "Limit on max column size in block while reading. Helps to decrease cache misses count. Should be close to L2 cache size.", 0) \ \ M(UInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table.", 0) \ - M(UInt64, parts_to_throw_insert, 300, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ + M(UInt64, parts_to_throw_insert, 300, "If more than this number active parts in a single partition of the destination table, throw 'Too many parts ...' exception.", 0) \ M(Bool, insert_distributed_sync, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster.", 0) \ M(UInt64, insert_distributed_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) \ M(Int64, distributed_ddl_task_timeout, 180, "Timeout for DDL query responses from all hosts in cluster. If a ddl request has not been performed on all hosts, a response will contain a timeout error and a request will be executed in an async mode. Negative value means infinite. 
Zero means async mode.", 0) \ From 420d343e833006b276c760cc524351da2160c714 Mon Sep 17 00:00:00 2001 From: Memo Date: Tue, 26 Apr 2022 10:26:09 +0800 Subject: [PATCH 70/94] Update src/Core/Settings.h Co-authored-by: alesapin --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a755aacc044..a68eec3f6a3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -247,7 +247,7 @@ class IColumn; M(Bool, fallback_to_stale_replicas_for_distributed_queries, true, "Suppose max_replica_delay_for_distributed_queries is set and all replicas for the queried table are stale. If this setting is enabled, the query will be performed anyway, otherwise the error will be reported.", 0) \ M(UInt64, preferred_max_column_in_block_size_bytes, 0, "Limit on max column size in block while reading. Helps to decrease cache misses count. Should be close to L2 cache size.", 0) \ \ - M(UInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table.", 0) \ + M(UInt64, parts_to_delay_insert, 150, "If the destination table contains at least that many active parts in a single partition, artificially slow down insert into table.", 0) \ M(UInt64, parts_to_throw_insert, 300, "If more than this number active parts in a single partition of the destination table, throw 'Too many parts ...' exception.", 0) \ M(Bool, insert_distributed_sync, false, "If setting is enabled, insert query into distributed waits until data will be sent to all nodes in cluster.", 0) \ M(UInt64, insert_distributed_timeout, 0, "Timeout for insert query into distributed. Setting is used only with insert_distributed_sync enabled. Zero value means no timeout.", 0) \ From c38a4b4255dc8a9824adc793dd79e4a65c79b798 Mon Sep 17 00:00:00 2001 From: Memo Date: Tue, 26 Apr 2022 10:26:18 +0800 Subject: [PATCH 71/94] Update src/Storages/MergeTree/MergeTreeData.h Co-authored-by: alesapin --- src/Storages/MergeTree/MergeTreeData.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 419fd2ed017..8515c8583b7 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -530,7 +530,7 @@ public: /// If the table contains too many active parts, sleep for a while to give them time to merge. /// If until is non-null, wake up from the sleep earlier if the event happened. - void delayInsertOrThrowIfNeeded(Poco::Event * until = nullptr, ContextPtr query_context = nullptr) const; + void delayInsertOrThrowIfNeeded(Poco::Event * until, ContextPtr query_context) const; /// Renames temporary part to a permanent part and adds it to the parts set. /// It is assumed that the part does not intersect with existing parts. 
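
A note on the one-line header change above: removing the default arguments
from delayInsertOrThrowIfNeeded() is deliberate hardening rather than style.
With defaults in place, a call site that forgot to pass the query context
would still compile and would silently fall back to the server-wide
thresholds, ignoring per-query values of parts_to_delay_insert and
parts_to_throw_insert. A hypothetical minimal illustration of the pitfall
(the names are made up, not the real interface):

    #include <iostream>

    struct Context { unsigned parts_to_delay_insert = 150; };

    // Before: void delayIfNeeded(const Context * query_context = nullptr);
    // After: the caller must state explicitly which settings apply.
    void delayIfNeeded(const Context * query_context)
    {
        unsigned threshold = query_context ? query_context->parts_to_delay_insert : 150;
        std::cout << "delay inserts once a partition exceeds " << threshold << " active parts\n";
    }

    int main()
    {
        Context query_context{50};
        delayIfNeeded(&query_context); // a per-query override can no longer be forgotten
        delayIfNeeded(nullptr);        // falling back to server defaults is now explicit
    }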
From 9646487c093a75dc31e3e818417cfad83580b40f Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 26 Apr 2022 07:32:02 +0000 Subject: [PATCH 72/94] Address PR review comments --- contrib/NuRaft | 2 +- docs/en/operations/clickhouse-keeper.md | 22 +++++++++++----------- src/Coordination/FourLetterCommand.cpp | 2 +- src/Coordination/KeeperDispatcher.cpp | 5 +++-- src/Coordination/KeeperDispatcher.h | 2 ++ src/Coordination/KeeperServer.cpp | 22 +++++++++++++--------- src/Coordination/KeeperServer.h | 8 ++++++-- 7 files changed, 37 insertions(+), 26 deletions(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index aed15187ef0..0e1ac401b7c 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit aed15187ef0f051f5b7ea5628176824e91f6ecb1 +Subproject commit 0e1ac401b7c41a5042be2385477332dbeb7104cd diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 58f4a0fc16e..a52e3a315f8 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -13,9 +13,9 @@ ZooKeeper is one of the first well-known open-source coordination systems. It's By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but `clickhouse-keeper-converter` tool allows to convert ZooKeeper data to ClickHouse Keeper snapshot. Interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so mixed ZooKeeper / ClickHouse Keeper cluster is impossible. -ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64. +ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64. -:::note +:::note External integrations are not supported. ::: @@ -117,7 +117,7 @@ clickhouse keeper --config /etc/your_path_to_config/config.xml ## Four Letter Word Commands {#four-letter-word-commands} -ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. +ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. 
The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". @@ -334,13 +334,13 @@ This should be done only as your last resort if you cannot start your nodes agai Important things to note before continuing: - Make sure that the failed nodes cannot connect to the cluster again. -- Do not start any of the new nodes until it's specified in the steps +- Do not start any of the new nodes until it's specified in the steps. After making sure that the above things are true, you need to do following: -1. Pick a single Keeper node to be your new leader. -2. Before doing anything else, make a backup of the `log_storage_path` folder of the picked node -3. Reconfigure the cluster on all of the nodes you want to use -4. Send the four letter command `rcvr` to the node you picked which will move the node to the recovery mode OR stop Keeper instance on the picked node and start it again with the `--force-recovery` argument -5. One by one, start Keeper instances on the new nodes making sure that `mntr` returns `follower` for the `zk_server_state` before starting the next one -6. While in the recovery mode, the leader node will return error message for `mntr` command until it achieves quorum with the new nodes and refuse any requests from the client and the followers -7. After quorum is achieved, the leader node will return to the normal mode of operation, accepting all the requests using Raft - verify with `mntr` which should return `leader` for the `zk_server_state` +1. Pick a single Keeper node to be your new leader. Be aware that the data of that node will be used for the entire cluster so we recommend to use a node with the most up to date state. +2. Before doing anything else, make a backup of the `log_storage_path` and `snapshot_storage_path` folders of the picked node. +3. Reconfigure the cluster on all of the nodes you want to use. +4. Send the four letter command `rcvr` to the node you picked which will move the node to the recovery mode OR stop Keeper instance on the picked node and start it again with the `--force-recovery` argument. +5. One by one, start Keeper instances on the new nodes making sure that `mntr` returns `follower` for the `zk_server_state` before starting the next one. +6. While in the recovery mode, the leader node will return error message for `mntr` command until it achieves quorum with the new nodes and refuse any requests from the client and the followers. +7. After quorum is achieved, the leader node will return to the normal mode of operation, accepting all the requests using Raft - verify with `mntr` which should return `leader` for the `zk_server_state`. 
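
The recovery procedure documented above is driven entirely through
four-letter commands (rcvr to enter recovery, mntr to watch zk_server_state):
each is a raw four-byte string sent over the Keeper client port and answered
with plain text, and the diff just below makes rcvr answer "ok" instead of an
empty string. For reference, a tiny hypothetical probe in C++ that does what
"echo mntr | nc localhost 9181" would do, assuming a local Keeper on the
default port 9181 and a POSIX system:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>
    #include <cstring>
    #include <iostream>

    int main(int argc, char ** argv)
    {
        const char * command = argc > 1 ? argv[1] : "mntr"; // e.g. "rcvr", "stat"
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        sockaddr_in addr{};
        addr.sin_family = AF_INET;
        addr.sin_port = htons(9181);
        inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr);
        if (fd < 0 || connect(fd, reinterpret_cast<sockaddr *>(&addr), sizeof(addr)) != 0)
        {
            std::cerr << "cannot connect to Keeper\n";
            return 1;
        }
        if (write(fd, command, strlen(command)) < 0)
            return 1;
        char buf[4096];
        for (ssize_t n; (n = read(fd, buf, sizeof(buf))) > 0;)
            std::cout.write(buf, n); // Keeper replies, then closes the connection
        close(fd);
    }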
diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 5ea3b1124f3..9a374c51b93 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -460,7 +460,7 @@ String IsReadOnlyCommand::run() String RecoveryCommand::run() { keeper_dispatcher.forceRecovery(); - return ""; + return "ok"; } } diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index bf160a428ec..f97c65e7056 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -540,17 +540,18 @@ void KeeperDispatcher::updateConfigurationThread() try { + using namespace std::chrono_literals; if (!server->checkInit()) { LOG_INFO(log, "Server still not initialized, will not apply configuration until initialization finished"); - std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + std::this_thread::sleep_for(5000ms); continue; } if (server->isRecovering()) { LOG_INFO(log, "Server is recovering, will not apply configuration until recovery is finished"); - std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + std::this_thread::sleep_for(5000ms); continue; } diff --git a/src/Coordination/KeeperDispatcher.h b/src/Coordination/KeeperDispatcher.h index af7b132ac3a..9837a1047de 100644 --- a/src/Coordination/KeeperDispatcher.h +++ b/src/Coordination/KeeperDispatcher.h @@ -116,6 +116,8 @@ public: return server && server->checkInit(); } + /// Is server accepting requests, i.e. connected to the cluster + /// and achieved quorum bool isServerActive() const; /// Registered in ConfigReloader callback. Add new configuration changes to diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index c4ef8d7e7c4..ca76f672cac 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -118,6 +118,10 @@ KeeperServer::KeeperServer( LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); } +/** + * Tiny wrapper around nuraft::raft_server which adds some functions + * necessary for recovery, mostly connected to config manipulation. 
+ */ struct KeeperServer::KeeperRaftServer : public nuraft::raft_server { bool isClusterHealthy() @@ -197,7 +201,7 @@ void KeeperServer::loadLatestConfig() } } -void KeeperServer::recoveryMode(nuraft::raft_params & params) +void KeeperServer::enterRecoveryMode(nuraft::raft_params & params) { LOG_WARNING( log, @@ -218,10 +222,11 @@ void KeeperServer::recoveryMode(nuraft::raft_params & params) void KeeperServer::forceRecovery() { + // notify threads containing the lock that we want to enter recovery mode is_recovering = true; - std::lock_guard lock{server_mutex}; + std::lock_guard lock{server_write_mutex}; auto params = raft_instance->get_current_params(); - recoveryMode(params); + enterRecoveryMode(params); raft_instance->setConfig(state_manager->load_config()); raft_instance->update_params(params); } @@ -263,7 +268,7 @@ void KeeperServer::launchRaftServer(bool enable_ipv6) } if (is_recovering) - recoveryMode(params); + enterRecoveryMode(params); nuraft::raft_server::init_options init_options; @@ -383,7 +388,7 @@ RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForS for (const auto & [session_id, time, request] : requests_for_sessions) entries.push_back(getZooKeeperLogEntry(session_id, time, request)); - std::lock_guard lock{server_mutex}; + std::lock_guard lock{server_write_mutex}; if (is_recovering) return nullptr; @@ -462,7 +467,7 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ // config is the same as the committed one // Because we manually set the config to commit // we need to call the reconfigure also - uint64_t log_idx = *static_cast(param->ctx); + uint64_t log_idx = *static_cast(param->ctx); if (log_idx == state_manager->load_config()->get_log_idx()) raft_instance->forceReconfigure(state_manager->load_config()); break; @@ -554,7 +559,7 @@ ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::Abstrac if (!diff.empty()) { - std::lock_guard lock{server_mutex}; + std::lock_guard lock{server_write_mutex}; last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config; } @@ -563,7 +568,7 @@ ConfigUpdateActions KeeperServer::getConfigurationDiff(const Poco::Util::Abstrac void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) { - std::lock_guard lock{server_mutex}; + std::lock_guard lock{server_write_mutex}; if (is_recovering) return; @@ -663,7 +668,6 @@ void KeeperServer::applyConfigurationUpdate(const ConfigUpdateAction & task) bool KeeperServer::waitConfigurationUpdate(const ConfigUpdateAction & task) { - std::lock_guard lock{server_mutex}; if (is_recovering) return false; diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 19d346de345..8c21cf47d94 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -29,7 +29,11 @@ private: nuraft::ptr raft_instance; nuraft::ptr asio_service; nuraft::ptr asio_listener; - mutable std::mutex server_mutex; + // because some actions can be applied + // when we are sure that there are no requests currently being + // processed (e.g. recovery) we do all write actions + // on raft_server under this mutex. 
+ mutable std::mutex server_write_mutex; std::mutex initialized_mutex; std::atomic initialized_flag = false; @@ -52,7 +56,7 @@ private: void loadLatestConfig(); - void recoveryMode(nuraft::raft_params & params); + void enterRecoveryMode(nuraft::raft_params & params); std::atomic_bool is_recovering = false; From 78bcb96098f507c732ca66f893d2ec4e8838278d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Apr 2022 09:48:40 +0200 Subject: [PATCH 73/94] Rename backup & restore setting 'async' -> 'sync', and make backup & restore async by default. --- src/Backups/BackupSettings.cpp | 2 +- src/Backups/BackupSettings.h | 4 +- src/Backups/RestoreSettings.cpp | 2 +- src/Backups/RestoreSettings.h | 4 +- src/Interpreters/InterpreterBackupQuery.cpp | 12 ++-- src/Parsers/ASTBackupQuery.cpp | 4 +- src/Parsers/ParserBackupQuery.cpp | 30 ++++++++++ .../test_backup_restore_new/test.py | 56 +++++++++---------- .../test_backup_restore_on_cluster/test.py | 12 ++-- 9 files changed, 78 insertions(+), 48 deletions(-) diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 818b93edcc6..61c5726bdbf 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -18,7 +18,7 @@ namespace ErrorCodes M(Int64, compression_level) \ M(String, password) \ M(Bool, structure_only) \ - M(Bool, async) \ + M(Bool, sync) \ M(UInt64, shard_num) \ M(UInt64, replica_num) \ M(Bool, allow_storing_multiple_replicas) \ diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index 5fd29f510ab..c1121970b49 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -25,8 +25,8 @@ struct BackupSettings /// without the data of tables. bool structure_only = false; - /// Whether BACKUP command must return immediately without waiting until the backup is completed. - bool async = false; + /// Whether the BACKUP command must wait until the backup has completed. + bool sync = false; /// 1-based shard index to store in the backup. 0 means all shards. /// Can only be used with BACKUP ON CLUSTER. diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index b9480448ccf..e664e1066de 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -68,7 +68,7 @@ namespace M(RestoreDatabaseCreationMode, create_database) \ M(Bool, allow_different_table_def) \ M(Bool, allow_different_database_def) \ - M(Bool, async) \ + M(Bool, sync) \ M(UInt64, shard_num) \ M(UInt64, replica_num) \ M(UInt64, shard_num_in_backup) \ diff --git a/src/Backups/RestoreSettings.h b/src/Backups/RestoreSettings.h index 136cfcaf239..e6fdc4e492f 100644 --- a/src/Backups/RestoreSettings.h +++ b/src/Backups/RestoreSettings.h @@ -58,8 +58,8 @@ struct RestoreSettings : public StorageRestoreSettings /// Set `allow_different_database_def` to true to skip this check. bool allow_different_database_def = false; - /// Whether RESTORE command must return immediately without waiting until the backup is completed. - bool async = false; + /// Whether the RESTORE command must wait until the restoring has completed. + bool sync = false; /// 1-based shard index to restore from the backup. 0 means all shards. /// Can only be used with RESTORE ON CLUSTER. 
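
With sync = false as the new default, the interpreter change below runs the
job inline only when SYNC was given; otherwise it hands the job to the shared
BackupsWorker and returns the task id immediately. A toy version of that
dispatch, with a detached std::thread standing in for BackupsWorker::run and
purely illustrative names:

    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <thread>

    size_t executeBackupToy(bool sync, std::function<void()> job)
    {
        static size_t next_task_id = 0;
        size_t task_id = ++next_task_id;
        if (sync)
            job();                                // exceptions reach the caller
        else
            std::thread(std::move(job)).detach(); // stand-in for BackupsWorker::run
        return task_id;
    }

    int main()
    {
        size_t id = executeBackupToy(false, [] { std::cout << "backup runs in the background\n"; });
        std::cout << "query returned task " << id << " at once\n";
        executeBackupToy(true, [] { std::cout << "SYNC restore finished before the query returned\n"; });
        std::this_thread::sleep_for(std::chrono::milliseconds(100)); // crude wait for the detached job
    }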
diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index d9080f54e95..90dceec3f3b 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -164,13 +164,13 @@ namespace if (!backup_settings.internal) task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::PREPARING); - if (backup_settings.async) + if (backup_settings.sync) { - BackupsWorker::instance().run([query, task_id, context, backup_info, backup_settings]{ executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ true); }); + executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ false); } else { - executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ false); + BackupsWorker::instance().run([query, task_id, context, backup_info, backup_settings]{ executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ true); }); } return task_id; } @@ -184,13 +184,13 @@ namespace if (!restore_settings.internal) task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::RESTORING); - if (restore_settings.async) + if (restore_settings.sync) { - BackupsWorker::instance().run([query, task_id, context, backup_info, restore_settings]{ executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ true); }); + executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ false); } else { - executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ false); + BackupsWorker::instance().run([query, task_id, context, backup_info, restore_settings]{ executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ true); }); } return task_id; } diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index d483f477e71..8fbdf332dc5 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -166,11 +166,11 @@ namespace [](const SettingChange & change) { const String & name = change.name; - return (name == "internal") || (name == "async") || (name == "shard_num") || (name == "replica_num"); + return (name == "internal") || (name == "sync") || (name == "shard_num") || (name == "replica_num"); }); changes.emplace_back("internal", true); - changes.emplace_back("async", false); + changes.emplace_back("sync", true); changes.emplace_back("shard_num", params.shard_index); changes.emplace_back("replica_num", params.replica_index); diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp index 1b4840b548e..c354f28db17 100644 --- a/src/Parsers/ParserBackupQuery.cpp +++ b/src/Parsers/ParserBackupQuery.cpp @@ -1,12 +1,15 @@ #include #include #include +#include #include #include #include #include #include #include +#include +#include namespace DB @@ -285,6 +288,32 @@ namespace }); } + bool parseSyncOrAsync(IParser::Pos & pos, Expected & expected, ASTPtr & settings) + { + bool sync; + if (ParserKeyword{"SYNC"}.ignore(pos, expected)) + sync = true; + else if (ParserKeyword{"ASYNC"}.ignore(pos, expected)) + sync = false; + else + return false; + + SettingsChanges changes; + if (settings) + { + changes = assert_cast(settings.get())->changes; + } + + boost::remove_erase_if(changes, [](const SettingChange & change) { return change.name == "sync"; }); + changes.emplace_back("sync", sync); + + auto new_settings = 
std::make_shared(); + new_settings->changes = std::move(changes); + new_settings->is_standalone = false; + settings = new_settings; + return true; + } + bool parseOnCluster(IParserBase::Pos & pos, Expected & expected, String & cluster) { return IParserBase::wrapParseImpl(pos, [&] @@ -322,6 +351,7 @@ bool ParserBackupQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ASTPtr settings; ASTPtr base_backup_name; parseSettings(pos, expected, settings, base_backup_name); + parseSyncOrAsync(pos, expected, settings); auto query = std::make_shared(); node = query; diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 51daf6d37e8..ebcc94841be 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -58,12 +58,12 @@ def test_restore_table(engine): create_and_fill_table(engine=engine) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" - instance.query(f"RESTORE TABLE test.table FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -75,12 +75,12 @@ def test_restore_table_into_existing_table(engine): create_and_fill_table(engine=engine) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") - instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table") == "200\t9900\n" - instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table") == "300\t14850\n" @@ -89,11 +89,11 @@ def test_restore_table_under_another_name(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") assert instance.query("EXISTS test.table2") == "0\n" - instance.query(f"RESTORE TABLE test.table INTO test.table2 FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table INTO test.table2 FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" @@ -102,11 +102,11 @@ def test_backup_table_under_another_name(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table AS test.table2 TO {backup_name}") + instance.query(f"BACKUP TABLE test.table AS test.table2 TO {backup_name} SYNC") assert instance.query("EXISTS test.table2") == "0\n" - instance.query(f"RESTORE TABLE test.table2 FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table2 FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" @@ -116,9 +116,9 @@ def test_materialized_view(): "CREATE MATERIALIZED VIEW mv_1(x 
UInt8) ENGINE=MergeTree ORDER BY tuple() POPULATE AS SELECT 1 AS x" ) - instance.query(f"BACKUP TABLE mv_1 TO {backup_name}") + instance.query(f"BACKUP TABLE mv_1 TO {backup_name} SYNC") instance.query("DROP TABLE mv_1") - instance.query(f"RESTORE TABLE mv_1 FROM {backup_name}") + instance.query(f"RESTORE TABLE mv_1 FROM {backup_name} SYNC") assert instance.query("SELECT * FROM mv_1") == "1\n" instance.query("DROP TABLE mv_1") @@ -130,17 +130,17 @@ def test_incremental_backup(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") instance.query("INSERT INTO test.table VALUES (65, 'a'), (66, 'b')") assert instance.query("SELECT count(), sum(x) FROM test.table") == "102\t5081\n" instance.query( - f"BACKUP TABLE test.table TO {incremental_backup_name} SETTINGS base_backup = {backup_name}" + f"BACKUP TABLE test.table TO {incremental_backup_name} SETTINGS base_backup = {backup_name} SYNC" ) instance.query( - f"RESTORE TABLE test.table AS test.table2 FROM {incremental_backup_name}" + f"RESTORE TABLE test.table AS test.table2 FROM {incremental_backup_name} SYNC" ) assert instance.query("SELECT count(), sum(x) FROM test.table2") == "102\t5081\n" @@ -150,10 +150,10 @@ def test_incremental_backup_after_renaming_table(): incremental_backup_name = new_backup_name() create_and_fill_table() - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") instance.query("RENAME TABLE test.table TO test.table2") instance.query( - f"BACKUP TABLE test.table2 TO {incremental_backup_name} SETTINGS base_backup = {backup_name}" + f"BACKUP TABLE test.table2 TO {incremental_backup_name} SETTINGS base_backup = {backup_name} SYNC" ) # Files in a base backup can be searched by checksum, so an incremental backup with a renamed table actually @@ -174,7 +174,7 @@ def test_incremental_backup_after_renaming_table(): ) instance.query("DROP TABLE test.table2") - instance.query(f"RESTORE TABLE test.table2 FROM {incremental_backup_name}") + instance.query(f"RESTORE TABLE test.table2 FROM {incremental_backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" @@ -185,17 +185,17 @@ def test_backup_not_found_or_already_exists(): assert re.search( expected_error, instance.query_and_get_error( - f"RESTORE TABLE test.table AS test.table2 FROM {backup_name}" + f"RESTORE TABLE test.table AS test.table2 FROM {backup_name} SYNC" ), ) create_and_fill_table() - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") expected_error = "Backup .* already exists" assert re.search( expected_error, - instance.query_and_get_error(f"BACKUP TABLE test.table TO {backup_name}"), + instance.query_and_get_error(f"BACKUP TABLE test.table TO {backup_name} SYNC"), ) @@ -204,12 +204,12 @@ def test_file_engine(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" - instance.query(f"RESTORE TABLE test.table FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table FROM {backup_name} SYNC") assert instance.query("SELECT 
count(), sum(x) FROM test.table") == "100\t4950\n" @@ -218,9 +218,9 @@ def test_database(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP DATABASE test TO {backup_name}") + instance.query(f"BACKUP DATABASE test TO {backup_name} SYNC") instance.query("DROP DATABASE test") - instance.query(f"RESTORE DATABASE test FROM {backup_name}") + instance.query(f"RESTORE DATABASE test FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -230,13 +230,13 @@ def test_zip_archive(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name}") + instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") assert os.path.isfile(get_path_to_backup(backup_name)) instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" - instance.query(f"RESTORE TABLE test.table FROM {backup_name}") + instance.query(f"RESTORE TABLE test.table FROM {backup_name} SYNC") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -246,13 +246,13 @@ def test_zip_archive_with_settings(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query( - f"BACKUP TABLE test.table TO {backup_name} SETTINGS compression_method='lzma', compression_level=3, password='qwerty'" + f"BACKUP TABLE test.table TO {backup_name} SETTINGS compression_method='lzma', compression_level=3, password='qwerty' SYNC" ) instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" instance.query( - f"RESTORE TABLE test.table FROM {backup_name} SETTINGS password='qwerty'" + f"RESTORE TABLE test.table FROM {backup_name} SETTINGS password='qwerty' SYNC" ) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 36ec02bb770..3dacafa5e30 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -75,14 +75,14 @@ def test_replicated_table(): # Make backup on node 1. node1.query( - f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=1" + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=1 SYNC" ) # Drop table on both nodes. node1.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") # Restore from backup on node2. - node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} SYNC") assert node2.query("SELECT * FROM tbl ORDER BY x") == TSV( [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] @@ -111,14 +111,14 @@ def test_replicated_database(): # Make backup. backup_name = new_backup_name() node1.query( - f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2" + f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2 SYNC" ) # Drop table on both nodes. node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' NO DELAY") # Restore from backup on node2. 
- node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}") + node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} SYNC") assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV( [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] @@ -142,12 +142,12 @@ def test_different_tables_on_nodes(): backup_name = new_backup_name() node1.query( - f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS allow_storing_multiple_replicas = true" + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS allow_storing_multiple_replicas = true SYNC" ) node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") - node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} SYNC") assert node1.query("SELECT * FROM tbl") == TSV( [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] From 1c0b731ea6b86ce3bf7f88bd3ec27df7b218454d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Apr 2022 10:04:16 +0200 Subject: [PATCH 74/94] Fix compilation. --- src/Backups/BackupIO_Disk.cpp | 2 -- src/Backups/BackupImpl.cpp | 24 ++++++++++++------------ src/Backups/BackupSettings.cpp | 6 +++--- src/Backups/BackupsWorker.cpp | 5 +---- src/Backups/RestoreSettings.cpp | 13 +++++++------ src/Backups/RestoreUtils.cpp | 12 +++--------- src/Common/ErrorCodes.cpp | 2 +- src/Parsers/ASTBackupQuery.cpp | 2 +- 8 files changed, 28 insertions(+), 38 deletions(-) diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index fc61370e951..a281f88a8b2 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -5,8 +5,6 @@ #include #include -namespace fs = std::filesystem; - namespace DB { diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 233061a8da2..c4ae22bc864 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -182,12 +182,12 @@ void BackupImpl::open() if (open_mode == OpenMode::WRITE) { if (backup_exists) - throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} already exists", getName()); + throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} already exists", backup_name); } else { if (!backup_exists) - throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", getName()); + throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", backup_name); } if (open_mode == OpenMode::WRITE) @@ -215,7 +215,7 @@ void BackupImpl::open() base_backup_uuid = base_backup->getUUID(); else if (base_backup_uuid != base_backup->getUUID()) throw Exception(ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: The base backup {} has different UUID ({} != {})", - getName(), base_backup->getName(), toString(base_backup->getUUID()), (base_backup_uuid ? toString(*base_backup_uuid) : "")); + backup_name, base_backup->getName(), toString(base_backup->getUUID()), (base_backup_uuid ? 
toString(*base_backup_uuid) : "")); } } @@ -322,7 +322,7 @@ void BackupImpl::readBackupMetadata() version = config->getUInt("version"); if ((version < INITIAL_BACKUP_VERSION) || (version > CURRENT_BACKUP_VERSION)) - throw Exception(ErrorCodes::BACKUP_VERSION_NOT_SUPPORTED, "Backup {}: Version {} is not supported", getName(), version); + throw Exception(ErrorCodes::BACKUP_VERSION_NOT_SUPPORTED, "Backup {}: Version {} is not supported", backup_name, version); timestamp = parse(config->getString("timestamp")).to_time_t(); uuid = parse(config->getString("uuid")); @@ -353,7 +353,7 @@ void BackupImpl::readBackupMetadata() use_base = true; if (info.base_size > info.size) - throw Exception(ErrorCodes::BACKUP_DAMAGED, "Backup {}: Base size must not be greater than the size of entry {}", getName(), quoteString(info.file_name)); + throw Exception(ErrorCodes::BACKUP_DAMAGED, "Backup {}: Base size must not be greater than the size of entry {}", backup_name, quoteString(info.file_name)); if (use_base) { @@ -402,7 +402,7 @@ UInt64 BackupImpl::getFileSize(const String & file_name) const auto info = coordination->getFileInfo(file_name); if (!info) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); return info->size; } @@ -412,7 +412,7 @@ UInt128 BackupImpl::getFileChecksum(const String & file_name) const auto info = coordination->getFileInfo(file_name); if (!info) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); return info->checksum; } @@ -422,7 +422,7 @@ SizeAndChecksum BackupImpl::getFileSizeAndChecksum(const String & file_name) con auto info = coordination->getFileInfo(file_name); if (!info) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), quoteString(file_name)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); return std::pair(info->size, info->checksum); } @@ -440,7 +440,7 @@ BackupEntryPtr BackupImpl::readFile(const SizeAndChecksum & size_and_checksum) c auto info_opt = coordination->getFileInfo(size_and_checksum); if (!info_opt) throw Exception( - ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", getName(), formatSizeAndChecksum(size_and_checksum)); + ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, formatSizeAndChecksum(size_and_checksum)); const auto & info = *info_opt; if (!info.size) @@ -461,7 +461,7 @@ BackupEntryPtr BackupImpl::readFile(const SizeAndChecksum & size_and_checksum) c throw Exception( ErrorCodes::NO_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but there is no base backup specified", - getName(), formatSizeAndChecksum(size_and_checksum)); + backup_name, formatSizeAndChecksum(size_and_checksum)); } if (!base_backup->fileExists(std::pair(info.base_size, info.base_checksum))) @@ -469,7 +469,7 @@ BackupEntryPtr BackupImpl::readFile(const SizeAndChecksum & size_and_checksum) c throw Exception( ErrorCodes::WRONG_BASE_BACKUP, "Backup {}: Entry {} is marked to be read from a base backup, but doesn't exist there", - getName(), 
formatSizeAndChecksum(size_and_checksum)); + backup_name, formatSizeAndChecksum(size_and_checksum)); } auto base_entry = base_backup->readFile(std::pair{info.base_size, info.base_checksum}); @@ -497,7 +497,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (coordination->getFileInfo(file_name)) throw Exception( - ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Backup {}: Entry {} already exists", getName(), quoteString(file_name)); + ErrorCodes::BACKUP_ENTRY_ALREADY_EXISTS, "Backup {}: Entry {} already exists", backup_name, quoteString(file_name)); FileInfo info; info.file_name = file_name; diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 61c5726bdbf..397a8bad344 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -9,7 +9,7 @@ namespace DB { namespace ErrorCodes { - extern const int UNKNOWN_SETTING; + extern const int CANNOT_PARSE_BACKUP_SETTINGS; } /// List of backup settings except base_backup_name. @@ -44,7 +44,7 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query) else LIST_OF_BACKUP_SETTINGS(GET_SETTINGS_FROM_BACKUP_QUERY_HELPER) - throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); + throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Unknown setting {}", setting.name); } } @@ -61,7 +61,7 @@ void BackupSettings::copySettingsToBackupQuery(ASTBackupQuery & query) const static const BackupSettings default_settings; #define SET_SETTINGS_IN_BACKUP_QUERY_HELPER(TYPE, NAME) \ - if (NAME != default_settings.NAME) \ + if ((NAME) != default_settings.NAME) \ query_settings->changes.emplace_back(#NAME, static_cast(SettingField##TYPE{NAME})); LIST_OF_BACKUP_SETTINGS(SET_SETTINGS_IN_BACKUP_QUERY_HELPER) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 3a1bf6d7d0c..3465c06d753 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -16,10 +16,7 @@ BackupsWorker & BackupsWorker::instance() return the_instance; } -BackupsWorker::BackupsWorker() -{ -} - +BackupsWorker::BackupsWorker() = default; size_t BackupsWorker::add(const String & backup_name, BackupStatus status, const String & error) { diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index e664e1066de..bbda1eaea5d 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -11,8 +11,8 @@ namespace DB { namespace ErrorCodes { - extern const int UNKNOWN_SETTING; - extern const int CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE; + extern const int CANNOT_PARSE_BACKUP_SETTINGS; + extern const int LOGICAL_ERROR; } namespace @@ -25,7 +25,7 @@ namespace { if (field.getType() == Field::Types::String) { - String str = field.get(); + const String & str = field.get(); if (str == "1" || boost::iequals(str, "true")) value = RestoreTableCreationMode::kCreate; else if (str == "0" || boost::iequals(str, "false")) @@ -33,7 +33,7 @@ namespace else if (boost::iequals(str, "if not exists")) value = RestoreTableCreationMode::kCreateIfNotExists; else throw Exception("Cannot parse creation mode from string '" + str + "'", - ErrorCodes::CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE); + ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS); } else { @@ -52,6 +52,7 @@ namespace case RestoreTableCreationMode::kMustExist: return Field{false}; case RestoreTableCreationMode::kCreateIfNotExists: return Field{"if not exists"}; } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected value of enum RestoreTableCreationMode: {}", 
static_cast(value)); } operator RestoreTableCreationMode() const { return value; } @@ -94,7 +95,7 @@ RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query) else LIST_OF_RESTORE_SETTINGS(GET_SETTINGS_FROM_RESTORE_QUERY_HELPER) - throw Exception(ErrorCodes::UNKNOWN_SETTING, "Unknown setting {}", setting.name); + throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Unknown setting {}", setting.name); } } @@ -111,7 +112,7 @@ void RestoreSettings::copySettingsToRestoreQuery(ASTBackupQuery & query) const static const RestoreSettings default_settings; #define SET_SETTINGS_IN_RESTORE_QUERY_HELPER(TYPE, NAME) \ - if (NAME != default_settings.NAME) \ + if ((NAME) != default_settings.NAME) \ query_settings->changes.emplace_back(#NAME, static_cast(SettingField##TYPE{NAME})); LIST_OF_RESTORE_SETTINGS(SET_SETTINGS_IN_RESTORE_QUERY_HELPER) diff --git a/src/Backups/RestoreUtils.cpp b/src/Backups/RestoreUtils.cpp index cf2962a442e..57b12dbce33 100644 --- a/src/Backups/RestoreUtils.cpp +++ b/src/Backups/RestoreUtils.cpp @@ -22,14 +22,11 @@ #include #include #include +#include #include #include #include -#include - -namespace fs = std::filesystem; - namespace DB { @@ -45,7 +42,7 @@ namespace class PathsInBackup { public: - PathsInBackup(const IBackup & backup_) : backup(backup_) {} + explicit PathsInBackup(const IBackup & backup_) : backup(backup_) {} std::vector getShards() const { @@ -312,10 +309,7 @@ namespace use_coordination_for_table_creation = true; replicated_database_zookeeper_path = replicated_db->getZooKeeperPath(); - if (restore_coordination->acquireZkPathAndName(replicated_database_zookeeper_path, table_name.second)) - return true; - - return false; + return restore_coordination->acquireZkPathAndName(replicated_database_zookeeper_path, table_name.second); } void setTableCreationResult(IRestoreCoordination::Result res) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index e2298e04b44..423f6945b2c 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -621,7 +621,7 @@ M(650, SERIALIZATION_ERROR) \ M(651, CAPN_PROTO_BAD_TYPE) \ M(652, ONLY_NULLS_WHILE_READING_SCHEMA) \ - M(653, CANNOT_PARSE_RESTORE_TABLE_CREATION_MODE) \ + M(653, CANNOT_PARSE_BACKUP_SETTINGS) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp index 8fbdf332dc5..5734599c3c9 100644 --- a/src/Parsers/ASTBackupQuery.cpp +++ b/src/Parsers/ASTBackupQuery.cpp @@ -174,7 +174,7 @@ namespace changes.emplace_back("shard_num", params.shard_index); changes.emplace_back("replica_num", params.replica_index); - auto out_settings = std::shared_ptr(); + auto out_settings = std::make_shared(); out_settings->changes = std::move(changes); out_settings->is_standalone = false; return out_settings; From 16f8c71eb42ed9e52d8c38bc090f79ff6cf73678 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Apr 2022 10:13:49 +0200 Subject: [PATCH 75/94] Disable usage of archives with backups on clusters. 
--- src/Backups/registerBackupEnginesFileAndDisk.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Backups/registerBackupEnginesFileAndDisk.cpp b/src/Backups/registerBackupEnginesFileAndDisk.cpp index a3ba09bbafc..c55220e8a36 100644 --- a/src/Backups/registerBackupEnginesFileAndDisk.cpp +++ b/src/Backups/registerBackupEnginesFileAndDisk.cpp @@ -17,6 +17,7 @@ namespace ErrorCodes extern const int INVALID_CONFIG_PARAMETER; extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int SUPPORT_IS_DISABLED; } @@ -148,6 +149,9 @@ void registerBackupEnginesFileAndDisk(BackupFactory & factory) BackupImpl::ArchiveParams archive_params; if (hasRegisteredArchiveFileExtension(path)) { + if (params.is_internal_backup) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Using archives with backups on clusters is disabled"); + archive_params.archive_name = path.filename(); path = path.parent_path(); archive_params.compression_method = params.compression_method; From 22ecd92ed38c61d54fb3d1ab813e69def8bbbd35 Mon Sep 17 00:00:00 2001 From: Memo Date: Tue, 26 Apr 2022 16:24:21 +0800 Subject: [PATCH 76/94] fix reference --- .../0_stateless/02280_add_query_level_settings.reference | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/02280_add_query_level_settings.reference b/tests/queries/0_stateless/02280_add_query_level_settings.reference index 139597f9cb0..e69de29bb2d 100644 --- a/tests/queries/0_stateless/02280_add_query_level_settings.reference +++ b/tests/queries/0_stateless/02280_add_query_level_settings.reference @@ -1,2 +0,0 @@ - - From c502a14404ec1aaf6815626901e5cf97f87e47b3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 26 Apr 2022 11:24:24 +0200 Subject: [PATCH 77/94] Fix --- src/Parsers/ASTSystemQuery.cpp | 2 +- src/Parsers/ParserExplainQuery.cpp | 5 ++++- .../0_stateless/02286_drop_filesystem_cache.reference | 2 ++ tests/queries/0_stateless/02286_drop_filesystem_cache.sql | 2 ++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 274d1639b4b..fd19e2166df 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -195,7 +195,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &, else if (type == Type::DROP_FILESYSTEM_CACHE) { if (!filesystem_cache_path.empty()) - settings.ostr << (settings.hilite ? hilite_none : "") << filesystem_cache_path; + settings.ostr << (settings.hilite ? hilite_none : "") << " " << filesystem_cache_path; if (force_removal) settings.ostr << (settings.hilite ? 
hilite_keyword : "") << " FORCE"; } diff --git a/src/Parsers/ParserExplainQuery.cpp b/src/Parsers/ParserExplainQuery.cpp index 71c49a020cc..4547cb6045f 100644 --- a/src/Parsers/ParserExplainQuery.cpp +++ b/src/Parsers/ParserExplainQuery.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -62,6 +63,7 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserCreateTableQuery create_p; ParserSelectWithUnionQuery select_p; ParserInsertQuery insert_p(end, allow_settings_after_format_in_insert); + ParserSystemQuery system_p; ASTPtr query; if (kind == ASTExplainQuery::ExplainKind::ParsedAST) { @@ -88,7 +90,8 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected } else if (select_p.parse(pos, query, expected) || create_p.parse(pos, query, expected) || - insert_p.parse(pos, query, expected)) + insert_p.parse(pos, query, expected) || + system_p.parse(pos, query, expected)) explain_query->setExplainedQuery(std::move(query)); else return false; diff --git a/tests/queries/0_stateless/02286_drop_filesystem_cache.reference b/tests/queries/0_stateless/02286_drop_filesystem_cache.reference index 0dabc778274..83e6ad8ec79 100644 --- a/tests/queries/0_stateless/02286_drop_filesystem_cache.reference +++ b/tests/queries/0_stateless/02286_drop_filesystem_cache.reference @@ -28,3 +28,5 @@ SELECT count() FROM system.filesystem_cache; SYSTEM DROP FILESYSTEM CACHE './s3_cache/'; SELECT count() FROM system.filesystem_cache; 2 +EXPLAIN SYNTAX SYSTEM DROP FILESYSTEM CACHE './s3_cache/' FORCE; +SYSTEM DROP FILESYSTEM CACHE ./s3_cache/ FORCE diff --git a/tests/queries/0_stateless/02286_drop_filesystem_cache.sql b/tests/queries/0_stateless/02286_drop_filesystem_cache.sql index 4c99c248dbc..811463c26d7 100644 --- a/tests/queries/0_stateless/02286_drop_filesystem_cache.sql +++ b/tests/queries/0_stateless/02286_drop_filesystem_cache.sql @@ -32,3 +32,5 @@ SELECT count() FROM system.filesystem_cache; SYSTEM DROP FILESYSTEM CACHE './s3_cache/'; SELECT count() FROM system.filesystem_cache; + +EXPLAIN SYNTAX SYSTEM DROP FILESYSTEM CACHE './s3_cache/' FORCE; From 2aa17630be3a2e6e6ac6d05a6831e7ac2c2837a8 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Tue, 26 Apr 2022 12:30:04 +0200 Subject: [PATCH 78/94] fix build --- src/Disks/S3/DiskS3.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 9941fae643b..f3f2493299e 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -356,6 +356,7 @@ int DiskS3::readSchemaVersion(const String & source_bucket, const String & sourc settings->client, source_bucket, source_path + SCHEMA_VERSION_OBJECT, + version_id, settings->s3_settings.max_single_read_retries, context->getReadSettings()); From a89ef54c69ce07ccdd69f08a38b04c47831c7afb Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Apr 2022 13:25:49 +0200 Subject: [PATCH 79/94] Fix tests and compilation. 
--- src/Backups/BackupSettings.cpp | 2 +- src/Backups/RestoreSettings.cpp | 4 +--- tests/integration/test_backup_restore_on_cluster/test.py | 9 +++++++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 397a8bad344..a78050ec428 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -40,7 +40,7 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query) { #define GET_SETTINGS_FROM_BACKUP_QUERY_HELPER(TYPE, NAME) \ if (setting.name == #NAME) \ - res.NAME = SettingField##TYPE{setting.value}; \ + res.NAME = SettingField##TYPE{setting.value}.value; \ else LIST_OF_BACKUP_SETTINGS(GET_SETTINGS_FROM_BACKUP_QUERY_HELPER) diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index bbda1eaea5d..18e04c0f81a 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -54,8 +54,6 @@ namespace } throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected value of enum RestoreTableCreationMode: {}", static_cast(value)); } - - operator RestoreTableCreationMode() const { return value; } }; using SettingFieldRestoreDatabaseCreationMode = SettingFieldRestoreTableCreationMode; @@ -91,7 +89,7 @@ RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query) { #define GET_SETTINGS_FROM_RESTORE_QUERY_HELPER(TYPE, NAME) \ if (setting.name == #NAME) \ - res.NAME = SettingField##TYPE{setting.value}; \ + res.NAME = SettingField##TYPE{setting.value}.value; \ else LIST_OF_RESTORE_SETTINGS(GET_SETTINGS_FROM_RESTORE_QUERY_HELPER) diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 3dacafa5e30..52a4918b5d0 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -2,7 +2,7 @@ from time import sleep import pytest import os.path from helpers.cluster import ClickHouseCluster -from helpers.test_tools import TSV +from helpers.test_tools import TSV, assert_eq_with_retry cluster = ClickHouseCluster(__file__) @@ -70,6 +70,7 @@ def test_replicated_table(): node2.query("INSERT INTO tbl VALUES (2, 'count')") node1.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (3, 'your')") node2.query("INSERT INTO tbl SETTINGS async_insert=true VALUES (4, 'chickens')") + node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") backup_name = new_backup_name() @@ -83,6 +84,7 @@ def test_replicated_table(): # Restore from backup on node2. node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} SYNC") + node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") assert node2.query("SELECT * FROM tbl ORDER BY x") == TSV( [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] @@ -101,12 +103,14 @@ def test_replicated_database(): node1.query( "CREATE TABLE mydb.tbl(x UInt8, y String) ENGINE=ReplicatedMergeTree ORDER BY x" ) - assert node2.query("EXISTS mydb.tbl") == "1\n" + + assert_eq_with_retry(node2, "EXISTS mydb.tbl", "1\n") node1.query("INSERT INTO mydb.tbl VALUES (1, 'Don''t')") node2.query("INSERT INTO mydb.tbl VALUES (2, 'count')") node1.query("INSERT INTO mydb.tbl VALUES (3, 'your')") node2.query("INSERT INTO mydb.tbl VALUES (4, 'chickens')") + node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl") # Make backup. backup_name = new_backup_name() @@ -119,6 +123,7 @@ def test_replicated_database(): # Restore from backup on node2. 
node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} SYNC") + node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl") assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV( [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] From f17aea145c8c57647e69d14a54d45c13382c2d8e Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Tue, 26 Apr 2022 14:07:11 +0200 Subject: [PATCH 80/94] documentation on how configuration is implemented --- docs/en/development/architecture.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index b5cb6c321ac..6007ee52703 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -159,6 +159,14 @@ We maintain full backward and forward compatibility for the server TCP protocol: For most external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data, and it uses custom framing for compressed data. We haven’t released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical. ::: +## Configuration {#configuration} + +ClickHouse Server is based on POCO C++ Libraries and uses `Poco::Util::AbstractConfiguration` to represent it's configuration. Configuration is held by `Poco::Util::ServerApplication` class inherited by `DaemonBase` class, which in turn is inherited by `DB::Server` class, implementing clickhouse-server itself. So config can be accessed by `ServerApplication::config()` method. + +Config is read from multiple files (in XML or YAML format) and merged into single `AbstractConfiguration` by `ConfigProcessor` class. Configuration is loaded at server startup and can be reloaded later if one of config files is updated, removed or added. `ConfigReloader` class is responsible for periodic monitoring of these changes and reload procedure as well. `SYSTEM RELOAD CONFIG` query also triggers config to be reloaded. + +For queries and subsystems other than `Server` config is accessible using `Context::getConfigRef()` method. Every subsystem that is capable of reloading it's config without server restart should register itself in reload callback in `Server::main()` method. Note that if newer config has an error, most subsystems will ignore new config, log warning messages and keep working with previously loaded config. Due to the nature of `AbstractConfiguration` it is not possible to pass reference to specific section, so `String config_prefix` is usually used instead. + ## Distributed Query Execution {#distributed-query-execution} Servers in a cluster setup are mostly independent. You can create a `Distributed` table on one or all servers in a cluster. The `Distributed` table does not store data itself – it only provides a “view” to all local tables on multiple nodes of a cluster. When you SELECT from a `Distributed` table, it rewrites that query, chooses remote nodes according to load balancing settings, and sends the query to them. The `Distributed` table requests remote servers to process a query just up to a stage where intermediate results from different servers can be merged. Then it receives the intermediate results and merges them. The distributed table tries to distribute as much work as possible to remote servers and does not send much intermediate data over the network. 
From 9d364cdce26b13aa8923fa03ef9f4a7fd70f6ea6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 26 Apr 2022 14:57:02 +0200 Subject: [PATCH 81/94] Refactor --- ...chronousReadIndirectBufferFromRemoteFS.cpp | 4 ++ ...ynchronousReadIndirectBufferFromRemoteFS.h | 4 +- src/IO/CompressedReadBufferWrapper.h | 1 + src/IO/ParallelReadBuffer.cpp | 6 +-- src/IO/ParallelReadBuffer.h | 9 ++-- src/IO/ReadBufferFromFileDecorator.h | 6 +++ src/IO/ReadBufferFromS3.cpp | 6 +-- src/IO/ReadBufferFromS3.h | 8 +-- src/IO/ReadWriteBufferFromHTTP.h | 8 +-- src/IO/SeekableReadBuffer.h | 17 +------ src/IO/WithFileSize.cpp | 49 +++++++++++++++++++ src/IO/WithFileSize.h | 21 ++++++++ .../Formats/Impl/ArrowBufferedStreams.cpp | 21 ++++---- .../Formats/Impl/ArrowBufferedStreams.h | 8 +-- .../Cache/ExternalDataSourceCache.cpp | 2 +- src/Storages/Cache/ExternalDataSourceCache.h | 4 +- src/Storages/HDFS/ReadBufferFromHDFS.cpp | 8 +-- src/Storages/HDFS/ReadBufferFromHDFS.h | 4 +- 18 files changed, 127 insertions(+), 59 deletions(-) create mode 100644 src/IO/WithFileSize.cpp create mode 100644 src/IO/WithFileSize.h diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp index e693a8e9ea8..60620dd4159 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp @@ -69,6 +69,10 @@ String AsynchronousReadIndirectBufferFromRemoteFS::getInfoForLog() return impl->getInfoForLog(); } +std::optional AsynchronousReadIndirectBufferFromRemoteFS::getFileSize() +{ + return impl->getFileSize(); +} bool AsynchronousReadIndirectBufferFromRemoteFS::hasPendingDataToRead() { diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h index 48c4ff3b4f0..a69ef7d37a8 100644 --- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h @@ -27,7 +27,7 @@ struct ReadSettings; * * We pass either `memory` or `prefetch_buffer` through all this chain and return it back. 
*/ -class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase +class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase, public WithFileSize { public: explicit AsynchronousReadIndirectBufferFromRemoteFS( @@ -51,6 +51,8 @@ public: String getInfoForLog() override; + std::optional getFileSize() override; + private: bool nextImpl() override; diff --git a/src/IO/CompressedReadBufferWrapper.h b/src/IO/CompressedReadBufferWrapper.h index 0e23357f625..e0a4537d9bc 100644 --- a/src/IO/CompressedReadBufferWrapper.h +++ b/src/IO/CompressedReadBufferWrapper.h @@ -17,6 +17,7 @@ public: , in(std::move(in_)) {} const ReadBuffer & getWrappedReadBuffer() const { return *in; } + ReadBuffer & getWrappedReadBuffer() { return *in; } protected: std::unique_ptr in; diff --git a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 3db4a45778e..fb1dc8dc704 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes } ParallelReadBuffer::ParallelReadBuffer(std::unique_ptr reader_factory_, CallbackRunner schedule_, size_t max_working_readers_) - : SeekableReadBufferWithSize(nullptr, 0) + : SeekableReadBuffer(nullptr, 0) , max_working_readers(max_working_readers_) , schedule(std::move(schedule_)) , reader_factory(std::move(reader_factory_)) @@ -116,10 +116,10 @@ off_t ParallelReadBuffer::seek(off_t offset, int whence) return offset; } -std::optional ParallelReadBuffer::getTotalSize() +std::optional ParallelReadBuffer::getFileSize() { std::lock_guard lock{mutex}; - return reader_factory->getTotalSize(); + return reader_factory->getFileSize(); } off_t ParallelReadBuffer::getPosition() diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 7b706b5111e..1682e217d89 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -22,7 +22,7 @@ namespace DB * * Number of working readers limited by max_working_readers. 
*/ -class ParallelReadBuffer : public SeekableReadBufferWithSize +class ParallelReadBuffer : public SeekableReadBuffer { private: /// Blocks until data occurred in the first reader or this reader indicate finishing @@ -68,13 +68,12 @@ private: }; public: - class ReadBufferFactory + class ReadBufferFactory : public WithFileSize { public: virtual SeekableReadBufferPtr getReader() = 0; - virtual ~ReadBufferFactory() = default; + virtual ~ReadBufferFactory() override = default; virtual off_t seek(off_t off, int whence) = 0; - virtual std::optional getTotalSize() = 0; }; explicit ParallelReadBuffer(std::unique_ptr reader_factory_, CallbackRunner schedule_, size_t max_working_readers); @@ -82,7 +81,7 @@ public: ~ParallelReadBuffer() override { finishAndWait(); } off_t seek(off_t off, int whence) override; - std::optional getTotalSize() override; + std::optional getFileSize(); off_t getPosition() override; const ReadBufferFactory & getReadBufferFactory() const { return *reader_factory; } diff --git a/src/IO/ReadBufferFromFileDecorator.h b/src/IO/ReadBufferFromFileDecorator.h index c83ec669203..4b12bf96c26 100644 --- a/src/IO/ReadBufferFromFileDecorator.h +++ b/src/IO/ReadBufferFromFileDecorator.h @@ -21,6 +21,12 @@ public: bool nextImpl() override; + bool isWithFileSize() const { return dynamic_cast(impl.get()) != nullptr; } + + const ReadBuffer & getWrappedReadBuffer() const { return *impl; } + + ReadBuffer & getWrappedReadBuffer() { return *impl; } + protected: std::unique_ptr impl; String file_name; diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index d19fbd28265..618ba4b0b88 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -46,7 +46,7 @@ ReadBufferFromS3::ReadBufferFromS3( size_t offset_, size_t read_until_position_, bool restricted_seek_) - : SeekableReadBufferWithSize(nullptr, 0) + : SeekableReadBuffer(nullptr, 0) , client_ptr(std::move(client_ptr_)) , bucket(bucket_) , key(key_) @@ -208,7 +208,7 @@ off_t ReadBufferFromS3::seek(off_t offset_, int whence) return offset; } -std::optional ReadBufferFromS3::getTotalSize() +std::optional ReadBufferFromS3::getFileSize() { if (file_size) return file_size; @@ -307,7 +307,7 @@ off_t ReadBufferS3Factory::seek(off_t off, [[maybe_unused]] int whence) return off; } -std::optional ReadBufferS3Factory::getTotalSize() +std::optional ReadBufferS3Factory::getFileSize() { return object_size; } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 2a94d286da6..6c2c5473bf2 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -26,7 +26,7 @@ namespace DB /** * Perform S3 HTTP GET request and provide response to read. */ -class ReadBufferFromS3 : public SeekableReadBufferWithSize, public WithFileName +class ReadBufferFromS3 : public SeekableReadBuffer, public WithFileName, public WithFileSize { private: std::shared_ptr client_ptr; @@ -63,7 +63,7 @@ public: off_t getPosition() override; - std::optional getTotalSize() override; + std::optional getFileSize() override; void setReadUntilPosition(size_t position) override; @@ -83,6 +83,8 @@ private: /// There is different seek policy for disk seek and for non-disk seek /// (non-disk seek is applied for seekable input formats: orc, arrow, parquet). 
bool restricted_seek; + + std::optional file_size; }; /// Creates separate ReadBufferFromS3 for sequence of ranges of particular object @@ -114,7 +116,7 @@ public: off_t seek(off_t off, [[maybe_unused]] int whence) override; - std::optional getTotalSize() override; + std::optional getFileSize() override; String getFileName() const override { return bucket + "/" + key; } diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index 02a214e87d7..9ff90afd3b7 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -86,7 +86,7 @@ public: namespace detail { template - class ReadWriteBufferFromHTTPBase : public SeekableReadBufferWithSize, public WithFileName + class ReadWriteBufferFromHTTPBase : public SeekableReadBuffer, public WithFileName, public WithFileSize { public: using HTTPHeaderEntry = std::tuple; @@ -199,7 +199,7 @@ namespace detail } } - std::optional getTotalSize() override + std::optional getFileSize() override { if (read_range.end) return *read_range.end - getRangeBegin(); @@ -270,7 +270,7 @@ namespace detail bool delay_initialization = false, bool use_external_buffer_ = false, bool http_skip_not_found_url_ = false) - : SeekableReadBufferWithSize(nullptr, 0) + : SeekableReadBuffer(nullptr, 0) , uri {uri_} , method {!method_.empty() ? method_ : out_stream_callback_ ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET} , session {session_} @@ -749,7 +749,7 @@ public: return off; } - std::optional getTotalSize() override { return total_object_size; } + std::optional getFileSize() override { return total_object_size; } String getFileName() const override { return uri.toString(); } diff --git a/src/IO/SeekableReadBuffer.h b/src/IO/SeekableReadBuffer.h index 3a46630350a..1187f7b92ba 100644 --- a/src/IO/SeekableReadBuffer.h +++ b/src/IO/SeekableReadBuffer.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace DB @@ -62,20 +63,4 @@ public: using SeekableReadBufferPtr = std::shared_ptr; - -class SeekableReadBufferWithSize : public SeekableReadBuffer -{ -public: - SeekableReadBufferWithSize(Position ptr, size_t size) - : SeekableReadBuffer(ptr, size) {} - SeekableReadBufferWithSize(Position ptr, size_t size, size_t offset) - : SeekableReadBuffer(ptr, size, offset) {} - - /// set std::nullopt in case it is impossible to find out total size. 
- virtual std::optional getTotalSize() = 0; - -protected: - std::optional file_size; -}; - } diff --git a/src/IO/WithFileSize.cpp b/src/IO/WithFileSize.cpp new file mode 100644 index 00000000000..c05a32291e3 --- /dev/null +++ b/src/IO/WithFileSize.cpp @@ -0,0 +1,49 @@ +#include "WithFileSize.h" +#include +#include +#include +#include + +namespace DB +{ + +template +static std::optional getFileSize(T & in) +{ + if (auto * with_file_size = dynamic_cast(&in)) + { + return with_file_size->getFileSize(); + } + + return std::nullopt; +} + +std::optional getFileSizeFromReadBuffer(ReadBuffer & in) +{ + if (auto * delegate = dynamic_cast(&in)) + { + return getFileSize(delegate->getWrappedReadBuffer()); + } + else if (auto * compressed = dynamic_cast(&in)) + { + return getFileSize(compressed->getWrappedReadBuffer()); + } + + return getFileSize(in); +} + +bool isBufferWithFileSize(const ReadBuffer & in) +{ + if (const auto * delegate = dynamic_cast(&in)) + { + return delegate->isWithFileSize(); + } + else if (const auto * compressed = dynamic_cast(&in)) + { + return isBufferWithFileSize(compressed->getWrappedReadBuffer()); + } + + return dynamic_cast(&in) != nullptr; +} + +} diff --git a/src/IO/WithFileSize.h b/src/IO/WithFileSize.h new file mode 100644 index 00000000000..b0d0517b23a --- /dev/null +++ b/src/IO/WithFileSize.h @@ -0,0 +1,21 @@ +#pragma once +#include +#include + +namespace DB +{ + +class ReadBuffer; + +class WithFileSize +{ +public: + virtual std::optional getFileSize() = 0; + virtual ~WithFileSize() = default; +}; + +bool isBufferWithFileSize(const ReadBuffer & in); + +std::optional getFileSizeFromReadBuffer(ReadBuffer & in); + +} diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp index 8573a560d02..c8e8cf900f4 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.cpp @@ -48,13 +48,13 @@ arrow::Status ArrowBufferedOutputStream::Write(const void * data, int64_t length return arrow::Status::OK(); } -RandomAccessFileFromSeekableReadBuffer::RandomAccessFileFromSeekableReadBuffer(SeekableReadBuffer & in_, off_t file_size_) - : in{in_}, file_size{file_size_}, is_open{true} +RandomAccessFileFromSeekableReadBuffer::RandomAccessFileFromSeekableReadBuffer(ReadBuffer & in_, off_t file_size_) + : in{in_}, seekable_in{dynamic_cast(in_)}, file_size{file_size_}, is_open{true} { } -RandomAccessFileFromSeekableReadBuffer::RandomAccessFileFromSeekableReadBuffer(SeekableReadBufferWithSize & in_) - : in{in_}, is_open{true} +RandomAccessFileFromSeekableReadBuffer::RandomAccessFileFromSeekableReadBuffer(ReadBuffer & in_) + : in{in_}, seekable_in{dynamic_cast(in_)}, is_open{true} { } @@ -62,9 +62,8 @@ arrow::Result RandomAccessFileFromSeekableReadBuffer::GetSize() { if (!file_size) { - auto * buf_with_size = dynamic_cast(&in); - if (buf_with_size) - file_size = buf_with_size->getTotalSize(); + if (isBufferWithFileSize(in)) + file_size = getFileSizeFromReadBuffer(in); if (!file_size) throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out size of file"); } @@ -79,7 +78,7 @@ arrow::Status RandomAccessFileFromSeekableReadBuffer::Close() arrow::Result RandomAccessFileFromSeekableReadBuffer::Tell() const { - return in.getPosition(); + return seekable_in.getPosition(); } arrow::Result RandomAccessFileFromSeekableReadBuffer::Read(int64_t nbytes, void * out) @@ -100,7 +99,7 @@ arrow::Result> RandomAccessFileFromSeekableReadBu arrow::Status 
RandomAccessFileFromSeekableReadBuffer::Seek(int64_t position) { - in.seek(position, SEEK_SET); + seekable_in.seek(position, SEEK_SET); return arrow::Status::OK(); } @@ -156,10 +155,10 @@ std::shared_ptr asArrowFile( if (res == 0 && S_ISREG(stat.st_mode)) return std::make_shared(*fd_in, stat.st_size); } - else if (auto * seekable_in = dynamic_cast(&in)) + else if (dynamic_cast(&in) && isBufferWithFileSize(in)) { if (settings.seekable_read) - return std::make_shared(*seekable_in); + return std::make_shared(in); } // fallback to loading the entire file in memory diff --git a/src/Processors/Formats/Impl/ArrowBufferedStreams.h b/src/Processors/Formats/Impl/ArrowBufferedStreams.h index 4ad0ecdf012..d61da2d5363 100644 --- a/src/Processors/Formats/Impl/ArrowBufferedStreams.h +++ b/src/Processors/Formats/Impl/ArrowBufferedStreams.h @@ -17,7 +17,6 @@ class ReadBuffer; class WriteBuffer; class SeekableReadBuffer; -class SeekableReadBufferWithSize; struct FormatSettings; class ArrowBufferedOutputStream : public arrow::io::OutputStream @@ -46,9 +45,9 @@ private: class RandomAccessFileFromSeekableReadBuffer : public arrow::io::RandomAccessFile { public: - RandomAccessFileFromSeekableReadBuffer(SeekableReadBuffer & in_, off_t file_size_); + RandomAccessFileFromSeekableReadBuffer(ReadBuffer & in_, off_t file_size_); - explicit RandomAccessFileFromSeekableReadBuffer(SeekableReadBufferWithSize & in_); + explicit RandomAccessFileFromSeekableReadBuffer(ReadBuffer & in_); arrow::Result GetSize() override; @@ -65,7 +64,8 @@ public: arrow::Status Seek(int64_t position) override; private: - SeekableReadBuffer & in; + ReadBuffer & in; + SeekableReadBuffer & seekable_in; std::optional file_size; bool is_open = false; diff --git a/src/Storages/Cache/ExternalDataSourceCache.cpp b/src/Storages/Cache/ExternalDataSourceCache.cpp index 17966d49c74..ad7a784e682 100644 --- a/src/Storages/Cache/ExternalDataSourceCache.cpp +++ b/src/Storages/Cache/ExternalDataSourceCache.cpp @@ -56,7 +56,7 @@ LocalFileHolder::~LocalFileHolder() } } -RemoteReadBuffer::RemoteReadBuffer(size_t buff_size) : BufferWithOwnMemory(buff_size) +RemoteReadBuffer::RemoteReadBuffer(size_t buff_size) : BufferWithOwnMemory(buff_size) { } diff --git a/src/Storages/Cache/ExternalDataSourceCache.h b/src/Storages/Cache/ExternalDataSourceCache.h index 5ffb2b20fc7..ec0aeea4985 100644 --- a/src/Storages/Cache/ExternalDataSourceCache.h +++ b/src/Storages/Cache/ExternalDataSourceCache.h @@ -43,7 +43,7 @@ public: BackgroundSchedulePool * thread_pool; }; -class RemoteReadBuffer : public BufferWithOwnMemory +class RemoteReadBuffer : public BufferWithOwnMemory, public WithFileSize { public: explicit RemoteReadBuffer(size_t buff_size); @@ -53,7 +53,7 @@ public: bool nextImpl() override; off_t seek(off_t off, int whence) override; off_t getPosition() override; - std::optional getTotalSize() override { return remote_file_size; } + std::optional getFileSize() override { return remote_file_size; } private: std::unique_ptr local_file_holder; diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index 42961061de4..45905fdcaf4 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -58,7 +58,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory getTotalSize() const + std::optional getFileSize() const { auto * file_info = hdfsGetPathInfo(fs.get(), hdfs_file_path.c_str()); if (!file_info) @@ -125,14 +125,14 @@ ReadBufferFromHDFS::ReadBufferFromHDFS( const 
String & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_, size_t buf_size_, size_t read_until_position_) - : SeekableReadBufferWithSize(nullptr, 0) + : SeekableReadBuffer(nullptr, 0) , impl(std::make_unique(hdfs_uri_, hdfs_file_path_, config_, buf_size_, read_until_position_)) { } -std::optional ReadBufferFromHDFS::getTotalSize() +std::optional ReadBufferFromHDFS::getFileSize() { - return impl->getTotalSize(); + return impl->getFileSize(); } bool ReadBufferFromHDFS::nextImpl() diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.h b/src/Storages/HDFS/ReadBufferFromHDFS.h index 0587f4d1d3d..4107ed70069 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.h +++ b/src/Storages/HDFS/ReadBufferFromHDFS.h @@ -20,7 +20,7 @@ namespace DB /** Accepts HDFS path to file and opens it. * Closes file by himself (thus "owns" a file descriptor). */ -class ReadBufferFromHDFS : public SeekableReadBufferWithSize, public WithFileName +class ReadBufferFromHDFS : public SeekableReadBuffer, public WithFileName, public WithFileSize { struct ReadBufferFromHDFSImpl; @@ -38,7 +38,7 @@ public: off_t getPosition() override; - std::optional getTotalSize() override; + std::optional getFileSize() override; size_t getFileOffsetOfBufferEnd() const override; From be0aa06958e9afcbcea6ed203cf30990e642ef17 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 13 Apr 2022 17:20:26 +0000 Subject: [PATCH 82/94] Add output format Prometheus --- src/Formats/registerFormats.cpp | 2 + .../Impl/PrometheusTextOutputFormat.cpp | 104 ++++++++++++++++++ .../Formats/Impl/PrometheusTextOutputFormat.h | 51 +++++++++ .../02267_output_format_prometheus.reference | 15 +++ .../02267_output_format_prometheus.sql | 7 ++ 5 files changed, 179 insertions(+) create mode 100644 src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp create mode 100644 src/Processors/Formats/Impl/PrometheusTextOutputFormat.h create mode 100644 tests/queries/0_stateless/02267_output_format_prometheus.reference create mode 100644 tests/queries/0_stateless/02267_output_format_prometheus.sql diff --git a/src/Formats/registerFormats.cpp b/src/Formats/registerFormats.cpp index 8c5955b2108..94f2e3c0bb5 100644 --- a/src/Formats/registerFormats.cpp +++ b/src/Formats/registerFormats.cpp @@ -72,6 +72,7 @@ void registerOutputFormatMySQLWire(FormatFactory & factory); void registerOutputFormatMarkdown(FormatFactory & factory); void registerOutputFormatPostgreSQLWire(FormatFactory & factory); void registerOutputFormatCapnProto(FormatFactory & factory); +void registerOutputFormatPrometheus(FormatFactory & factory); /// Input only formats. 
@@ -181,6 +182,7 @@ void registerFormats() registerOutputFormatMarkdown(factory); registerOutputFormatPostgreSQLWire(factory); registerOutputFormatCapnProto(factory); + registerOutputFormatPrometheus(factory); registerInputFormatRegexp(factory); registerInputFormatJSONAsString(factory); diff --git a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp new file mode 100644 index 00000000000..7f911997ab6 --- /dev/null +++ b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp @@ -0,0 +1,104 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +constexpr auto FORMAT_NAME = "Prometheus"; + +static bool isDataTypeString(const DataTypePtr & type) +{ + return WhichDataType(type).isStringOrFixedString(); +} + +template +static void getColumnPos(const Block & header, const String & col_name, Pred pred, ResType & res) +{ + static_assert(std::is_same_v || std::is_same_v>, "Illegal ResType"); + + constexpr bool is_optional = std::is_same_v>; + + if (header.has(col_name)) + { + res = header.getPositionByName(col_name); + const auto & col = header.getByName(col_name); + if (!pred(is_optional ? removeNullable(col.type) : col.type)) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal type '{}' of column '{}' for output format '{}'", + col.type->getName(), col_name, FORMAT_NAME); + } + } + else + { + if constexpr (is_optional) + res = std::nullopt; + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Column '{}' is required for output format '{}'", col_name, FORMAT_NAME); + } +} + +PrometheusTextOutputFormat::PrometheusTextOutputFormat( + WriteBuffer & out_, + const Block & header_, + const RowOutputFormatParams & params_, + const FormatSettings & format_settings_) + : IRowOutputFormat(header_, out_, params_), format_settings(format_settings_) +{ + const Block & header = getPort(PortKind::Main).getHeader(); + + getColumnPos(header, "name", isDataTypeString, pos.name); + getColumnPos(header, "value", isNumber, pos.value); + + getColumnPos(header, "help", isDataTypeString,pos.help); + getColumnPos(header, "type", isDataTypeString, pos.type); +} + +void PrometheusTextOutputFormat::write(const Columns & columns, size_t row_num) +{ + if (pos.help.has_value() && !columns[*pos.help]->isNullAt(row_num)) + { + writeCString("# HELP ", out); + serializations[pos.name]->serializeText(*columns[pos.name], row_num, out, format_settings); + writeChar(' ', out); + serializations[*pos.help]->serializeText(*columns[*pos.help], row_num, out, format_settings); + writeChar('\n', out); + } + + if (pos.type.has_value() && !columns[*pos.type]->isNullAt(row_num)) + { + writeCString("# TYPE ", out); + serializations[pos.name]->serializeText(*columns[pos.name], row_num, out, format_settings); + writeChar(' ', out); + serializations[*pos.type]->serializeText(*columns[*pos.type], row_num, out, format_settings); + /// TODO(vdimir): Check if type is 'counter', 'gauge', 'histogram', 'summary', or 'untyped' + writeChar('\n', out); + } + + serializations[pos.name]->serializeText(*columns[pos.name], row_num, out, format_settings); + writeChar(' ', out); + serializations[pos.value]->serializeText(*columns[pos.value], row_num, out, format_settings); + + writeChar('\n', out); + writeChar('\n', out); +} + +void registerOutputFormatPrometheus(FormatFactory & factory) +{ + factory.registerOutputFormat(FORMAT_NAME, []( + WriteBuffer & buf, + const Block & sample, + const RowOutputFormatParams & params, + const 
FormatSettings & settings) + { + return std::make_shared(buf, sample, params, settings); + }); +} + +} diff --git a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h new file mode 100644 index 00000000000..cf82d28da68 --- /dev/null +++ b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +class WriteBuffer; + +/** A stream for outputting data in tsv format. + */ +class PrometheusTextOutputFormat : public IRowOutputFormat +{ +public: + /** with_names - output in the first line a header with column names + * with_types - output the next line header with the names of the types + */ + PrometheusTextOutputFormat( + WriteBuffer & out_, + const Block & header_, + const RowOutputFormatParams & params_, + const FormatSettings & format_settings_); + + String getName() const override { return "PrometheusTextOutputFormat"; } + + /// https://github.com/prometheus/docs/blob/86386ed25bc8a5309492483ec7d18d0914043162/content/docs/instrumenting/exposition_formats.md + String getContentType() const override { return "text/plain; version=0.0.4; charset=UTF-8"; } + +protected: + void write(const Columns & columns, size_t row_num) override; + void writeField(const IColumn &, const ISerialization &, size_t) override {} + + + struct ColumnPositions + { + size_t name; + size_t value; + std::optional help; + std::optional type; + }; + + + ColumnPositions pos; + + const FormatSettings format_settings; +}; + +} diff --git a/tests/queries/0_stateless/02267_output_format_prometheus.reference b/tests/queries/0_stateless/02267_output_format_prometheus.reference new file mode 100644 index 00000000000..da3a51401ed --- /dev/null +++ b/tests/queries/0_stateless/02267_output_format_prometheus.reference @@ -0,0 +1,15 @@ +# HELP metric0 info 0 +# TYPE metric0 counter +metric0 0 + +metric1 1 + +# HELP metric2 info 2 +metric2 2 + +# TYPE metric3 counter +metric3 3 + +# HELP metric4 info 4 +metric4 4 + diff --git a/tests/queries/0_stateless/02267_output_format_prometheus.sql b/tests/queries/0_stateless/02267_output_format_prometheus.sql new file mode 100644 index 00000000000..c334f8f767b --- /dev/null +++ b/tests/queries/0_stateless/02267_output_format_prometheus.sql @@ -0,0 +1,7 @@ +SELECT + 'metric' || toString(number) as name, + number as value, + if(number % 2 == 0, 'info ' || toString(number), NULL) as help, + if(number % 3 == 0, 'counter', NULL) as type +FROM numbers(5) +FORMAT Prometheus From d5d98ed95185437cc18e7ce9b27fc6562ac372a1 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 19 Apr 2022 14:41:11 +0000 Subject: [PATCH 83/94] PrometheusTextOutputFormat: support lables, histograms and summaries --- .../Impl/PrometheusTextOutputFormat.cpp | 287 ++++++++++++++++-- .../Formats/Impl/PrometheusTextOutputFormat.h | 46 ++- .../02267_output_format_prometheus.reference | 34 ++- .../02267_output_format_prometheus.sql | 168 +++++++++- 4 files changed, 495 insertions(+), 40 deletions(-) diff --git a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp index 7f911997ab6..3de548b1833 100644 --- a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp @@ -1,16 +1,38 @@ +#include + #include #include -#include + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + #include #include + +#include +#include 
+#include #include + namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + constexpr auto FORMAT_NAME = "Prometheus"; static bool isDataTypeString(const DataTypePtr & type) @@ -18,6 +40,14 @@ static bool isDataTypeString(const DataTypePtr & type) return WhichDataType(type).isStringOrFixedString(); } +static bool isDataTypeMapString(const DataTypePtr & type) +{ + if (!isMap(type)) + return false; + const auto * type_map = assert_cast(type.get()); + return isDataTypeString(type_map->getKeyType()) && isDataTypeString(type_map->getValueType()); +} + template static void getColumnPos(const Block & header, const String & col_name, Pred pred, ResType & res) { @@ -44,49 +74,256 @@ static void getColumnPos(const Block & header, const String & col_name, Pred pre } } +static Float64 tryParseFloat(const String & s) +{ + Float64 t = 0; + ReadBufferFromString buf(s); + tryReadFloatText(t, buf); + return t; +} + PrometheusTextOutputFormat::PrometheusTextOutputFormat( WriteBuffer & out_, const Block & header_, const RowOutputFormatParams & params_, const FormatSettings & format_settings_) - : IRowOutputFormat(header_, out_, params_), format_settings(format_settings_) + : IRowOutputFormat(header_, out_, params_) + , string_serialization(DataTypeString().getDefaultSerialization()) + , format_settings(format_settings_) { const Block & header = getPort(PortKind::Main).getHeader(); getColumnPos(header, "name", isDataTypeString, pos.name); getColumnPos(header, "value", isNumber, pos.value); - getColumnPos(header, "help", isDataTypeString,pos.help); + getColumnPos(header, "help", isDataTypeString, pos.help); getColumnPos(header, "type", isDataTypeString, pos.type); + getColumnPos(header, "timestamp", isNumber, pos.timestamp); + getColumnPos(header, "labels", isDataTypeMapString, pos.labels); +} + +/* + * https://prometheus.io/docs/instrumenting/exposition_formats/#histograms-and-summaries + * + * > A histogram must have a bucket with {le="+Inf"}. Its value must be identical to the value of x_count. + * > The buckets of a histogram and the quantiles of a summary must appear in increasing numerical order of their label values (for the le or the quantile label, respectively). +*/ +void PrometheusTextOutputFormat::fixupBucketLables(CurrentMetric & metric) +{ + String bucket_label = metric.type == "histogram" ? 
"le" : "quantile"; + + std::sort(metric.values.begin(), metric.values.end(), + [&bucket_label](const auto & lhs, const auto & rhs) + { + /// rows with labels at the beginning, then `_sum` and `_count` + if (lhs.labels.contains("sum") && rhs.labels.contains("count")) + return true; + if (lhs.labels.contains("count") && rhs.labels.contains("sum")) + return false; + if (rhs.labels.contains("sum") || rhs.labels.contains("count")) + return true; + if (lhs.labels.contains("sum") || lhs.labels.contains("count")) + return false; + + auto lit = lhs.labels.find(bucket_label); + auto rit = rhs.labels.find(bucket_label); + if (lit != lhs.labels.end() && rit != rhs.labels.end()) + return tryParseFloat(lit->second) < tryParseFloat(rit->second); + return false; + }); + + if (metric.type == "histogram") + { + /// If we have only `_count` or only a bucket with `le="+Inf"`, create the missing counterpart (their values must be identical) + std::optional inf_bucket; + std::optional count_bucket; + for (const auto & val : metric.values) + { + if (auto it = val.labels.find("count"); it != val.labels.end()) + { + inf_bucket = val; + inf_bucket->labels = {{"le", "+Inf"}}; + } + if (auto it = val.labels.find("le"); it != val.labels.end() && it->second == "+Inf") + { + count_bucket = val; + count_bucket->labels = {{"count", ""}}; + } + } + if (inf_bucket.has_value() && !count_bucket.has_value()) + metric.values.emplace_back(*inf_bucket); + + if (!inf_bucket.has_value() && count_bucket.has_value()) + metric.values.emplace_back(*count_bucket); + } +} + +void PrometheusTextOutputFormat::flushCurrentMetric() +{ + if (current_metric.name.empty() || current_metric.values.empty()) + { + current_metric = {}; + return; + } + + auto write_attribute = [this] (const char * name, const auto & value) + { + if (value.empty()) + return; + + writeCString(name, out); + writeString(current_metric.name, out); + writeChar(' ', out); + writeString(value, out); + writeChar('\n', out); + }; + + write_attribute("# HELP ", current_metric.help); + write_attribute("# TYPE ", current_metric.type); + + bool use_buckets = current_metric.type == "histogram" || current_metric.type == "summary"; + if (use_buckets) + { + fixupBucketLables(current_metric); + } + + for (auto & val : current_metric.values) + { + /* https://prometheus.io/docs/instrumenting/exposition_formats/#comments-help-text-and-type-information + ``` + metric_name [ + "{" label_name "=" `"` label_value `"` { "," label_name "=" `"` label_value `"` } [ "," ] "}" + ] value [ timestamp ] + ``` + */ + writeString(current_metric.name, out); + + auto lable_to_suffix = [&val, this](const auto & key, const auto & suffix, bool erase) + { + if (auto it = val.labels.find(key); it != val.labels.end()) + { + writeChar('_', out); + writeString(suffix, out); + if (erase) + val.labels.erase(it); + } + }; + + if (use_buckets) + { + lable_to_suffix("sum", "sum", true); + lable_to_suffix("count", "count", true); + lable_to_suffix("le", "bucket", false); + } + + if (!val.labels.empty()) + { + writeChar('{', out); + bool is_first = true; + for (const auto & [name, value] : val.labels) + { + if (!is_first) + writeChar(',', out); + is_first = false; + + writeString(name, out); + writeChar('=', out); + writeDoubleQuotedString(value, out); + } + writeChar('}', out); + } + + writeChar(' ', out); + + if (val.value == "nan") + writeString("NaN", out); + else if (val.value == "inf") + writeString("+Inf", out); + else if (val.value == "-inf") + writeString("-Inf", out); + else + writeString(val.value, out); + + if (!val.timestamp.empty()) + { + writeChar(' ', 
out); + writeString(val.timestamp, out); + } + + writeChar('\n', out); + } + writeChar('\n', out); + + current_metric = {}; +} + +String PrometheusTextOutputFormat::getString(const Columns & columns, size_t row_num, size_t column_pos) +{ + WriteBufferFromOwnString tout; + serializations[column_pos]->serializeText(*columns[column_pos], row_num, tout, format_settings); + return tout.str(); +} + +String PrometheusTextOutputFormat::getString(const IColumn & column, size_t row_num, SerializationPtr serialization) +{ + WriteBufferFromOwnString tout; + serialization->serializeText(column, row_num, tout, format_settings); + return tout.str(); +} + +template +static void columnMapToContainer(const ColumnMap * col_map, size_t row_num, Container & result) +{ + Field field; + col_map->get(row_num, field); + const auto & map_field = field.get(); + for (size_t i = 0; i < map_field.size(); ++i) + { + const auto & map_entry = map_field[i].get(); + + String entry_key; + String entry_value; + if (map_entry.size() == 2 + && map_entry[0].tryGet(entry_key) + && map_entry[1].tryGet(entry_value)) + { + result.emplace(entry_key, entry_value); + } + } } void PrometheusTextOutputFormat::write(const Columns & columns, size_t row_num) { - if (pos.help.has_value() && !columns[*pos.help]->isNullAt(row_num)) + String name = getString(columns, row_num, pos.name); + if (current_metric.name != name) { - writeCString("# HELP ", out); - serializations[pos.name]->serializeText(*columns[pos.name], row_num, out, format_settings); - writeChar(' ', out); - serializations[*pos.help]->serializeText(*columns[*pos.help], row_num, out, format_settings); - writeChar('\n', out); + flushCurrentMetric(); + current_metric = CurrentMetric(name); } - if (pos.type.has_value() && !columns[*pos.type]->isNullAt(row_num)) + if (pos.help.has_value() && !columns[*pos.help]->isNullAt(row_num) && current_metric.help.empty()) + current_metric.help = getString(columns, row_num, *pos.help); + + if (pos.type.has_value() && !columns[*pos.type]->isNullAt(row_num) && current_metric.type.empty()) + current_metric.type = getString(columns, row_num, *pos.type); + + auto & row = current_metric.values.emplace_back(); + + row.value = getString(columns, row_num, pos.value); + + if (pos.timestamp.has_value() && !columns[*pos.timestamp]->isNullAt(row_num) && columns[*pos.timestamp]->get64(row_num) != 0) + row.timestamp = getString(columns, row_num, *pos.timestamp); + + if (pos.labels.has_value()) { - writeCString("# TYPE ", out); - serializations[pos.name]->serializeText(*columns[pos.name], row_num, out, format_settings); - writeChar(' ', out); - serializations[*pos.type]->serializeText(*columns[*pos.type], row_num, out, format_settings); - /// TODO(vdimir): Check if type is 'counter', 'gauge', 'histogram', 'summary', or 'untyped' - writeChar('\n', out); + if (const ColumnMap * col_map = checkAndGetColumn(columns[*pos.labels].get())) + columnMapToContainer(col_map, row_num, row.labels); } +} - serializations[pos.name]->serializeText(*columns[pos.name], row_num, out, format_settings); - writeChar(' ', out); - serializations[pos.value]->serializeText(*columns[pos.value], row_num, out, format_settings); - - writeChar('\n', out); - writeChar('\n', out); +void PrometheusTextOutputFormat::finalizeImpl() +{ + flushCurrentMetric(); } void registerOutputFormatPrometheus(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h index cf82d28da68..e51cdce0706 100644 --- 
a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h +++ b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h @@ -1,8 +1,12 @@ #pragma once +#include +#include + #include #include #include +#include namespace DB @@ -10,14 +14,9 @@ namespace DB class WriteBuffer; -/** A stream for outputting data in tsv format. - */ class PrometheusTextOutputFormat : public IRowOutputFormat { public: - /** with_names - output in the first line a header with column names - * with_types - output the next line header with the names of the types - */ PrometheusTextOutputFormat( WriteBuffer & out_, const Block & header_, @@ -30,9 +29,6 @@ public: String getContentType() const override { return "text/plain; version=0.0.4; charset=UTF-8"; } protected: - void write(const Columns & columns, size_t row_num) override; - void writeField(const IColumn &, const ISerialization &, size_t) override {} - struct ColumnPositions { @@ -40,11 +36,43 @@ protected: size_t value; std::optional help; std::optional type; + std::optional labels; + std::optional timestamp; }; + /// One metric can be represented by multiple rows (e.g. containing different labels). + struct CurrentMetric + { + struct RowValue + { + std::map labels; + String value; + String timestamp; + }; + + CurrentMetric() = default; + explicit CurrentMetric(const String & name_) : name(name_) {} + + String name; + String help; + String type; + std::vector values; + }; + + /// Input rows should be grouped by the same metric. + void write(const Columns & columns, size_t row_num) override; + void writeField(const IColumn &, const ISerialization &, size_t) override {} + void finalizeImpl() override; + + void flushCurrentMetric(); + String getString(const Columns & columns, size_t row_num, size_t column_pos); + String getString(const IColumn & column, size_t row_num, SerializationPtr serialization); + + void fixupBucketLables(CurrentMetric & metric); ColumnPositions pos; - + CurrentMetric current_metric; + SerializationPtr string_serialization; const FormatSettings format_settings; }; diff --git a/tests/queries/0_stateless/02267_output_format_prometheus.reference b/tests/queries/0_stateless/02267_output_format_prometheus.reference index da3a51401ed..3199a6ead89 100644 --- a/tests/queries/0_stateless/02267_output_format_prometheus.reference +++ b/tests/queries/0_stateless/02267_output_format_prometheus.reference @@ -1,3 +1,35 @@ +# HELP http_request_duration_seconds A histogram of the request duration. +# TYPE http_request_duration_seconds histogram +http_request_duration_seconds_bucket{le="0.05"} 24054 +http_request_duration_seconds_bucket{le="0.1"} 33444 +http_request_duration_seconds_bucket{le="0.2"} 100392 +http_request_duration_seconds_bucket{le="0.5"} 129389 +http_request_duration_seconds_bucket{le="1"} 133988 +http_request_duration_seconds_bucket{le="+Inf"} 144320 +http_request_duration_seconds_sum 53423 +http_request_duration_seconds_count 144320 + +# HELP http_requests_total Total number of HTTP requests +# TYPE http_requests_total counter +http_requests_total{code="200",method="post"} 1027 1395066363000 +http_requests_total{code="400",method="post"} 3 1395066363000 + +metric_without_timestamp_and_labels 12.47 + +msdos_file_access_time_seconds{error="Cannot find file:\n\"FILE.TXT\"",path="C:\\DIR\\FILE.TXT"} 1458255915 + +# HELP rpc_duration_seconds A summary of the RPC duration in seconds. 
+# TYPE rpc_duration_seconds summary +rpc_duration_seconds{quantile="0.01"} 3102 +rpc_duration_seconds{quantile="0.05"} 3272 +rpc_duration_seconds{quantile="0.5"} 4773 +rpc_duration_seconds{quantile="0.9"} 9001 +rpc_duration_seconds{quantile="0.99"} 76656 +rpc_duration_seconds_sum 17560473 +rpc_duration_seconds_count 2693 + +something_weird{problem="division by zero"} +Inf -3982045 + # HELP metric0 info 0 # TYPE metric0 counter metric0 0 @@ -5,7 +37,7 @@ metric0 0 metric1 1 # HELP metric2 info 2 -metric2 2 +metric2 2 1395066363000 # TYPE metric3 counter metric3 3 diff --git a/tests/queries/0_stateless/02267_output_format_prometheus.sql b/tests/queries/0_stateless/02267_output_format_prometheus.sql index c334f8f767b..85abeb996ec 100644 --- a/tests/queries/0_stateless/02267_output_format_prometheus.sql +++ b/tests/queries/0_stateless/02267_output_format_prometheus.sql @@ -1,7 +1,165 @@ +SELECT * FROM ( + SELECT - 'metric' || toString(number) as name, - number as value, - if(number % 2 == 0, 'info ' || toString(number), NULL) as help, - if(number % 3 == 0, 'counter', NULL) as type + 'http_requests_total' AS name, + 'counter' AS type, + 'Total number of HTTP requests' AS help, + map('method', 'post', 'code', '200') AS labels, + 1027 AS value, + 1395066363000 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_requests_total' AS name, + 'counter' AS type, + '' AS help, + map('method', 'post', 'code', '400') AS labels, + 3 AS value, + 1395066363000 :: Float64 AS timestamp +UNION ALL +SELECT + 'msdos_file_access_time_seconds' AS name, + '' AS type, + '' AS help, + map('path', 'C:\\DIR\\FILE.TXT', 'error', 'Cannot find file:\n"FILE.TXT"') AS labels, + 1458255915 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'metric_without_timestamp_and_labels' AS name, + '' AS type, + '' AS help, + map() AS labels, + 12.47 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'something_weird' AS name, + '' AS type, + '' AS help, + map('problem', 'division by zero') AS labels, + inf AS value, + -3982045 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + 'A histogram of the request duration.' AS help, + map('le', '0.05') AS labels, + 24054 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + '' AS help, + map('le', '0.1') AS labels, + 33444 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + '' AS help, + map('le', '0.2') AS labels, + 100392 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + '' AS help, + map('le', '0.5') AS labels, + 129389 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + '' AS help, + map('le', '1') AS labels, + 133988 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + '' AS help, + map('le', '+Inf') AS labels, + 144320 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'http_request_duration_seconds' AS name, + 'histogram' AS type, + '' AS help, + map('sum', '') AS labels, + 53423 AS value, + 0 :: Float64 AS timestamp +UNION ALL +SELECT + 'rpc_duration_seconds' AS name, + 'summary' AS type, + 'A summary of the RPC duration in seconds.' 
AS help,
+    map('quantile', '0.01') AS labels,
+    3102 AS value,
+    0 :: Float64 AS timestamp
+UNION ALL
+SELECT
+    'rpc_duration_seconds' AS name,
+    'summary' AS type,
+    '' AS help,
+    map('quantile', '0.05') AS labels,
+    3272 AS value,
+    0 :: Float64 AS timestamp
+UNION ALL
+SELECT
+    'rpc_duration_seconds' AS name,
+    'summary' AS type,
+    '' AS help,
+    map('quantile', '0.5') AS labels,
+    4773 AS value,
+    0 :: Float64 AS timestamp
+UNION ALL
+SELECT
+    'rpc_duration_seconds' AS name,
+    'summary' AS type,
+    '' AS help,
+    map('quantile', '0.9') AS labels,
+    9001 AS value,
+    0 :: Float64 AS timestamp
+UNION ALL
+SELECT
+    'rpc_duration_seconds' AS name,
+    'summary' AS type,
+    '' AS help,
+    map('quantile', '0.99') AS labels,
+    76656 AS value,
+    0 :: Float64 AS timestamp
+UNION ALL
+SELECT
+    'rpc_duration_seconds' AS name,
+    'summary' AS type,
+    '' AS help,
+    map('count', '') AS labels,
+    2693 AS value,
+    0 :: Float64 AS timestamp
+UNION ALL
+SELECT
+    'rpc_duration_seconds' AS name,
+    'summary' AS type,
+    '' AS help,
+    map('sum', '') AS labels,
+    1.7560473e+07 AS value,
+    0 :: Float64 AS timestamp
+
+) ORDER BY name
+FORMAT Prometheus;
+
+SELECT
+    'metric' || toString(number) AS name,
+    number AS value,
+    if(number % 2 == 0, 'info ' || toString(number), NULL) AS help,
+    if(number % 3 == 0, 'counter', NULL) AS type,
+    if(number == 2, 1395066363000, NULL) AS timestamp
 FROM numbers(5)
-FORMAT Prometheus
+FORMAT Prometheus;

From 560d668c3861c8bfd04138da46d496e09c4ff48d Mon Sep 17 00:00:00 2001
From: vdimir
Date: Wed, 20 Apr 2022 10:02:02 +0000
Subject: [PATCH 84/94] Add doc for output format Prometheus

---
 docs/en/interfaces/formats.md | 71 +++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index 801b7c1a14f..805e2ac301b 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -49,6 +49,7 @@ The supported formats are:
 | [PrettyCompactMonoBlock](#prettycompactmonoblock) | ✗ | ✔ |
 | [PrettyNoEscapes](#prettynoescapes) | ✗ | ✔ |
 | [PrettySpace](#prettyspace) | ✗ | ✔ |
+| [Prometheus](#prometheus) | ✗ | ✔ |
 | [Protobuf](#protobuf) | ✔ | ✔ |
 | [ProtobufSingle](#protobufsingle) | ✔ | ✔ |
 | [Avro](#data-format-avro) | ✔ | ✔ |
@@ -1162,6 +1163,76 @@ You can select data from a ClickHouse table and save them into some file in the
 ``` bash
 $ clickhouse-client --query = "SELECT * FROM test.hits FORMAT CapnProto SETTINGS format_schema = 'schema:Message'"
 ```
+## Prometheus {#prometheus}
+
+Expose metrics in [Prometheus text-based exposition format](https://prometheus.io/docs/instrumenting/exposition_formats/#text-based-format).
+
+The output table must follow a specific structure:
+Columns `name` ([String](../sql-reference/data-types/string.md)) and `value` (number) are required.
+Rows may optionally contain `help` ([String](../sql-reference/data-types/string.md)) and `timestamp` (number).
+Column `type` ([String](../sql-reference/data-types/string.md)) is either `counter`, `gauge`, `histogram`, `summary`, `untyped` or empty.
+Each metric value may also have some `labels` ([Map(String, String)](../sql-reference/data-types/map.md)).
+Several consecutive rows may refer to the same metric with different labels. The table should be sorted by metric name (e.g., with `ORDER BY name`).
+
+There are special requirements for labels of `histogram` and `summary` metrics, see the [Prometheus documentation](https://prometheus.io/docs/instrumenting/exposition_formats/#histograms-and-summaries) for the details.
+Special rules apply to rows with labels `{'count':''}` and `{'sum':''}`: they'll be converted to `_count` and `_sum` respectively.
+
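A minimal usage sketch of this format (the metric name, help text, and label below are illustrative, not taken from this patch):

``` sql
SELECT
    'room_temperature_celsius' AS name,
    'gauge' AS type,
    'Temperature by room' AS help,
    map('room', 'kitchen') AS labels,
    21.5 AS value
FORMAT Prometheus
```

This should print the `# HELP` and `# TYPE` comment lines followed by `room_temperature_celsius{room="kitchen"} 21.5`.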
+**Example:**
+
+```
+┌─name────────────────────────────────┬─type──────┬─help──────────────────────────────────────┬─labels─────────────────────────┬────value─┬─────timestamp─┐
+│ http_request_duration_seconds │ histogram │ A histogram of the request duration. │ {'le':'0.05'} │ 24054 │ 0 │
+│ http_request_duration_seconds │ histogram │ │ {'le':'0.1'} │ 33444 │ 0 │
+│ http_request_duration_seconds │ histogram │ │ {'le':'0.2'} │ 100392 │ 0 │
+│ http_request_duration_seconds │ histogram │ │ {'le':'0.5'} │ 129389 │ 0 │
+│ http_request_duration_seconds │ histogram │ │ {'le':'1'} │ 133988 │ 0 │
+│ http_request_duration_seconds │ histogram │ │ {'le':'+Inf'} │ 144320 │ 0 │
+│ http_request_duration_seconds │ histogram │ │ {'sum':''} │ 53423 │ 0 │
+│ http_requests_total │ counter │ Total number of HTTP requests │ {'method':'post','code':'200'} │ 1027 │ 1395066363000 │
+│ http_requests_total │ counter │ │ {'method':'post','code':'400'} │ 3 │ 1395066363000 │
+│ metric_without_timestamp_and_labels │ │ │ {} │ 12.47 │ 0 │
+│ rpc_duration_seconds │ summary │ A summary of the RPC duration in seconds. │ {'quantile':'0.01'} │ 3102 │ 0 │
+│ rpc_duration_seconds │ summary │ │ {'quantile':'0.05'} │ 3272 │ 0 │
+│ rpc_duration_seconds │ summary │ │ {'quantile':'0.5'} │ 4773 │ 0 │
+│ rpc_duration_seconds │ summary │ │ {'quantile':'0.9'} │ 9001 │ 0 │
+│ rpc_duration_seconds │ summary │ │ {'quantile':'0.99'} │ 76656 │ 0 │
+│ rpc_duration_seconds │ summary │ │ {'count':''} │ 2693 │ 0 │
+│ rpc_duration_seconds │ summary │ │ {'sum':''} │ 17560473 │ 0 │
+│ something_weird │ │ │ {'problem':'division by zero'} │ inf │ -3982045 │
+└─────────────────────────────────────┴───────────┴───────────────────────────────────────────┴────────────────────────────────┴──────────┴───────────────┘
+```
+
+Will be formatted as:
+
+```
+# HELP http_request_duration_seconds A histogram of the request duration.
+# TYPE http_request_duration_seconds histogram
+http_request_duration_seconds_bucket{le="0.05"} 24054
+http_request_duration_seconds_bucket{le="0.1"} 33444
+http_request_duration_seconds_bucket{le="0.2"} 100392
+http_request_duration_seconds_bucket{le="0.5"} 129389
+http_request_duration_seconds_bucket{le="1"} 133988
+http_request_duration_seconds_bucket{le="+Inf"} 144320
+http_request_duration_seconds_sum 53423
+http_request_duration_seconds_count 144320
+
+# HELP http_requests_total Total number of HTTP requests
+# TYPE http_requests_total counter
+http_requests_total{code="200",method="post"} 1027 1395066363000
+http_requests_total{code="400",method="post"} 3 1395066363000
+
+metric_without_timestamp_and_labels 12.47
+
+# HELP rpc_duration_seconds A summary of the RPC duration in seconds.
+# TYPE rpc_duration_seconds summary +rpc_duration_seconds{quantile="0.01"} 3102 +rpc_duration_seconds{quantile="0.05"} 3272 +rpc_duration_seconds{quantile="0.5"} 4773 +rpc_duration_seconds{quantile="0.9"} 9001 +rpc_duration_seconds{quantile="0.99"} 76656 +rpc_duration_seconds_sum 17560473 +rpc_duration_seconds_count 2693 + +something_weird{problem="division by zero"} +Inf -3982045 +``` ## Protobuf {#protobuf} From 81b86799e7bd45d9555160154bf512598ce2912b Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 22 Apr 2022 09:36:35 +0000 Subject: [PATCH 85/94] Fixup PrometheusTextOutputFormat --- .../Impl/PrometheusTextOutputFormat.cpp | 21 +++++++------------ .../Formats/Impl/PrometheusTextOutputFormat.h | 2 +- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp index 3de548b1833..3d0a02c99e8 100644 --- a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp +++ b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.cpp @@ -35,17 +35,12 @@ namespace ErrorCodes constexpr auto FORMAT_NAME = "Prometheus"; -static bool isDataTypeString(const DataTypePtr & type) -{ - return WhichDataType(type).isStringOrFixedString(); -} - static bool isDataTypeMapString(const DataTypePtr & type) { if (!isMap(type)) return false; const auto * type_map = assert_cast(type.get()); - return isDataTypeString(type_map->getKeyType()) && isDataTypeString(type_map->getValueType()); + return isStringOrFixedString(type_map->getKeyType()) && isStringOrFixedString(type_map->getValueType()); } template @@ -55,7 +50,7 @@ static void getColumnPos(const Block & header, const String & col_name, Pred pre constexpr bool is_optional = std::is_same_v>; - if (header.has(col_name)) + if (header.has(col_name, true)) { res = header.getPositionByName(col_name); const auto & col = header.getByName(col_name); @@ -93,11 +88,11 @@ PrometheusTextOutputFormat::PrometheusTextOutputFormat( { const Block & header = getPort(PortKind::Main).getHeader(); - getColumnPos(header, "name", isDataTypeString, pos.name); + getColumnPos(header, "name", isStringOrFixedString, pos.name); getColumnPos(header, "value", isNumber, pos.value); - getColumnPos(header, "help", isDataTypeString, pos.help); - getColumnPos(header, "type", isDataTypeString, pos.type); + getColumnPos(header, "help", isStringOrFixedString, pos.help); + getColumnPos(header, "type", isStringOrFixedString, pos.type); getColumnPos(header, "timestamp", isNumber, pos.timestamp); getColumnPos(header, "labels", isDataTypeMapString, pos.labels); } @@ -108,14 +103,14 @@ PrometheusTextOutputFormat::PrometheusTextOutputFormat( * > A histogram must have a bucket with {le="+Inf"}. Its value must be identical to the value of x_count. * > The buckets of a histogram and the quantiles of a summary must appear in increasing numerical order of their label values (for the le or the quantile label, respectively). */ -void PrometheusTextOutputFormat::fixupBucketLables(CurrentMetric & metric) +void PrometheusTextOutputFormat::fixupBucketLabels(CurrentMetric & metric) { String bucket_label = metric.type == "histogram" ? 
"le" : "quantile"; std::sort(metric.values.begin(), metric.values.end(), [&bucket_label](const auto & lhs, const auto & rhs) { - /// rows with lables at the begining and then `_sum` and `_count` + /// rows with labels at the beginning and then `_sum` and `_count` if (lhs.labels.contains("sum") && rhs.labels.contains("count")) return true; if (lhs.labels.contains("count") && rhs.labels.contains("sum")) @@ -184,7 +179,7 @@ void PrometheusTextOutputFormat::flushCurrentMetric() bool use_buckets = current_metric.type == "histogram" || current_metric.type == "summary"; if (use_buckets) { - fixupBucketLables(current_metric); + fixupBucketLabels(current_metric); } for (auto & val : current_metric.values) diff --git a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h index e51cdce0706..69b8d10e56b 100644 --- a/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h +++ b/src/Processors/Formats/Impl/PrometheusTextOutputFormat.h @@ -68,7 +68,7 @@ protected: String getString(const Columns & columns, size_t row_num, size_t column_pos); String getString(const IColumn & column, size_t row_num, SerializationPtr serialization); - void fixupBucketLables(CurrentMetric & metric); + static void fixupBucketLabels(CurrentMetric & metric); ColumnPositions pos; CurrentMetric current_metric; From eb1917f9dec90ebb94edddd543152491965a954f Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Apr 2022 18:01:07 +0200 Subject: [PATCH 86/94] Use sequential nodes for counters. Add comments. --- src/Backups/BackupCoordinationDistributed.cpp | 43 ++++++++++--------- src/Backups/BackupStatus.h | 3 ++ src/Backups/BackupUtils.cpp | 2 +- src/Backups/BackupsWorker.h | 2 + 4 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/Backups/BackupCoordinationDistributed.cpp b/src/Backups/BackupCoordinationDistributed.cpp index c7244538655..a5e1cf88832 100644 --- a/src/Backups/BackupCoordinationDistributed.cpp +++ b/src/Backups/BackupCoordinationDistributed.cpp @@ -74,6 +74,19 @@ namespace return std::pair{size, checksum}; } + size_t extractCounterFromSequentialNodeName(const String & node_name) + { + size_t pos_before_counter = node_name.find_last_not_of("0123456789"); + size_t counter_length = node_name.length() - 1 - pos_before_counter; + auto counter = std::string_view{node_name}.substr(node_name.length() - counter_length); + return parseFromString(counter); + } + + String formatArchiveSuffix(size_t counter) + { + return fmt::format("{:03}", counter); /// Outputs 001, 002, 003, ... + } + /// We try to store data to zookeeper several times due to possible version conflicts. 
constexpr size_t NUM_ATTEMPTS = 10; } @@ -94,7 +107,6 @@ void BackupCoordinationDistributed::createRootNodes() zookeeper->createIfNotExists(zookeeper_path + "/file_names", ""); zookeeper->createIfNotExists(zookeeper_path + "/file_infos", ""); zookeeper->createIfNotExists(zookeeper_path + "/archive_suffixes", ""); - zookeeper->createIfNotExists(zookeeper_path + "/current_archive_suffix", "0"); } void BackupCoordinationDistributed::removeAllNodes() @@ -225,30 +237,21 @@ std::optional BackupCoordinationDistributed::getFileSizeAndChec String BackupCoordinationDistributed::getNextArchiveSuffix() { auto zookeeper = get_zookeeper(); - for (size_t attempt = 0; attempt != NUM_ATTEMPTS; ++attempt) - { - Coordination::Stat stat; - String current_suffix_str = zookeeper->get(zookeeper_path + "/current_archive_suffix", &stat); - UInt64 current_suffix = parseFromString(current_suffix_str); - current_suffix_str = fmt::format("{:03}", ++current_suffix); /// Outputs 001, 002, 003, ... - Coordination::Requests ops; - ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/current_archive_suffix", current_suffix_str, stat.version)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/archive_suffixes/" + current_suffix_str, "", zkutil::CreateMode::Persistent)); - Coordination::Responses responses; - auto code = zookeeper->tryMulti(ops, responses); - if (code == Coordination::Error::ZOK) - return current_suffix_str; - bool is_last_attempt = (attempt == NUM_ATTEMPTS - 1); - if ((responses[0]->error != Coordination::Error::ZBADVERSION) || is_last_attempt) - throw zkutil::KeeperMultiException(code, ops, responses); - } - __builtin_unreachable(); + String path = zookeeper_path + "/archive_suffixes/a"; + String path_created; + auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::PersistentSequential, path_created); + if (code != Coordination::Error::ZOK) + throw zkutil::KeeperException(code, path); + return formatArchiveSuffix(extractCounterFromSequentialNodeName(path_created)); } Strings BackupCoordinationDistributed::getAllArchiveSuffixes() const { auto zookeeper = get_zookeeper(); - return zookeeper->getChildren(zookeeper_path + "/archive_suffixes"); + Strings node_names = zookeeper->getChildren(zookeeper_path + "/archive_suffixes"); + for (auto & node_name : node_names) + node_name = formatArchiveSuffix(extractCounterFromSequentialNodeName(node_name)); + return node_names; } void BackupCoordinationDistributed::drop() diff --git a/src/Backups/BackupStatus.h b/src/Backups/BackupStatus.h index 60968369bbc..13589ff6e4f 100644 --- a/src/Backups/BackupStatus.h +++ b/src/Backups/BackupStatus.h @@ -23,6 +23,9 @@ enum class BackupStatus }; std::string_view toString(BackupStatus backup_status); + +/// Returns vector containing all values of BackupStatus and their string representation, +/// which is used to create DataTypeEnum8 to store those statuses. 
const std::vector<std::pair<String, Int8>> & getBackupStatusEnumValues();
 
 }
diff --git a/src/Backups/BackupUtils.cpp b/src/Backups/BackupUtils.cpp
index 248ef58f3ce..a4f1c7cd196 100644
--- a/src/Backups/BackupUtils.cpp
+++ b/src/Backups/BackupUtils.cpp
@@ -308,7 +308,7 @@ namespace
         ContextPtr context;
         BackupSettings backup_settings;
         DDLRenamingSettings renaming_settings;
-        std::map databases;
+        std::unordered_map databases;
         std::map tables;
     };
 }
diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h
index 9c74f6c31c2..86c45ce5a04 100644
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@@ -9,6 +9,8 @@ namespace DB
 {
 
+/// Manager of backups and restores: runs backup and restore threads in the background.
+/// Keeps information about backups and restores started in this session.
 class BackupsWorker
 {
 public:

From a8e924caf6768cd2761b2b2a4f75acbde0de3a26 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Tue, 26 Apr 2022 18:33:19 +0200
Subject: [PATCH 87/94] Make BACKUP & RESTORE synchronous by default.

---
 src/Backups/BackupSettings.cpp              |  2 +-
 src/Backups/BackupSettings.h                |  4 +-
 src/Backups/RestoreSettings.cpp             |  2 +-
 src/Backups/RestoreSettings.h               |  4 +-
 src/Interpreters/InterpreterBackupQuery.cpp | 12 ++--
 src/Parsers/ASTBackupQuery.cpp              |  4 +-
 src/Parsers/ParserBackupQuery.cpp           | 14 ++---
 .../test_backup_restore_new/test.py         | 56 +++++++++----------
 .../test_backup_restore_on_cluster/test.py  | 12 ++--
 9 files changed, 55 insertions(+), 55 deletions(-)
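To make the changed default concrete, a usage sketch in SQL (the `Disk('backups', ...)` destination and table name are illustrative assumptions, not taken from this patch):

``` sql
-- Blocks until the backup is finished (the new default, equivalent to SYNC):
BACKUP TABLE test.table TO Disk('backups', 'backup_1.zip');

-- Returns immediately and continues in the background (the previous default behavior):
BACKUP TABLE test.table TO Disk('backups', 'backup_2.zip') ASYNC;
```

The same applies to `RESTORE`, as the settings below show.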
diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp
index a78050ec428..e707527c1e3 100644
--- a/src/Backups/BackupSettings.cpp
+++ b/src/Backups/BackupSettings.cpp
@@ -18,7 +18,7 @@ namespace ErrorCodes
     M(Int64, compression_level) \
     M(String, password) \
     M(Bool, structure_only) \
-    M(Bool, sync) \
+    M(Bool, async) \
     M(UInt64, shard_num) \
     M(UInt64, replica_num) \
     M(Bool, allow_storing_multiple_replicas) \
diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h
index c1121970b49..4e784be7af7 100644
--- a/src/Backups/BackupSettings.h
+++ b/src/Backups/BackupSettings.h
@@ -25,8 +25,8 @@ struct BackupSettings
     /// without the data of tables.
     bool structure_only = false;
 
-    /// Whether the BACKUP command must wait until the backup has completed.
-    bool sync = false;
+    /// Whether the BACKUP command must return immediately without waiting until the backup has completed.
+    bool async = false;
 
     /// 1-based shard index to store in the backup. 0 means all shards.
     /// Can only be used with BACKUP ON CLUSTER.
diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp
index 18e04c0f81a..6dfcde91ea3 100644
--- a/src/Backups/RestoreSettings.cpp
+++ b/src/Backups/RestoreSettings.cpp
@@ -67,7 +67,7 @@ namespace
     M(RestoreDatabaseCreationMode, create_database) \
     M(Bool, allow_different_table_def) \
     M(Bool, allow_different_database_def) \
-    M(Bool, sync) \
+    M(Bool, async) \
     M(UInt64, shard_num) \
     M(UInt64, replica_num) \
     M(UInt64, shard_num_in_backup) \
diff --git a/src/Backups/RestoreSettings.h b/src/Backups/RestoreSettings.h
index e6fdc4e492f..840fc6516b5 100644
--- a/src/Backups/RestoreSettings.h
+++ b/src/Backups/RestoreSettings.h
@@ -58,8 +58,8 @@ struct RestoreSettings : public StorageRestoreSettings
     /// Set `allow_different_database_def` to true to skip this check.
     bool allow_different_database_def = false;
 
-    /// Whether the RESTORE command must wait until the restoring has completed.
-    bool sync = false;
+    /// Whether the RESTORE command must return immediately without waiting until the restoring has completed.
+    bool async = false;
 
     /// 1-based shard index to restore from the backup. 0 means all shards.
     /// Can only be used with RESTORE ON CLUSTER.
diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp
index 90dceec3f3b..d9080f54e95 100644
--- a/src/Interpreters/InterpreterBackupQuery.cpp
+++ b/src/Interpreters/InterpreterBackupQuery.cpp
@@ -164,13 +164,13 @@ namespace
         if (!backup_settings.internal)
             task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::PREPARING);
 
-        if (backup_settings.sync)
+        if (backup_settings.async)
         {
-            executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ false);
+            BackupsWorker::instance().run([query, task_id, context, backup_info, backup_settings]{ executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ true); });
         }
         else
         {
-            BackupsWorker::instance().run([query, task_id, context, backup_info, backup_settings]{ executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ true); });
+            executeBackupSync(query, task_id, context, backup_info, backup_settings, /* no_throw = */ false);
         }
         return task_id;
     }
@@ -184,13 +184,13 @@ namespace
         if (!restore_settings.internal)
             task_id = BackupsWorker::instance().add(backup_info.toString(), BackupStatus::RESTORING);
 
-        if (restore_settings.sync)
+        if (restore_settings.async)
         {
-            executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ false);
+            BackupsWorker::instance().run([query, task_id, context, backup_info, restore_settings]{ executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ true); });
         }
         else
         {
-            BackupsWorker::instance().run([query, task_id, context, backup_info, restore_settings]{ executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ true); });
+            executeRestoreSync(query, task_id, context, backup_info, restore_settings, /* no_throw = */ false);
         }
         return task_id;
     }
diff --git a/src/Parsers/ASTBackupQuery.cpp b/src/Parsers/ASTBackupQuery.cpp
index 5734599c3c9..f19a082f648 100644
--- a/src/Parsers/ASTBackupQuery.cpp
+++ b/src/Parsers/ASTBackupQuery.cpp
@@ -166,11 +166,11 @@ namespace
             [](const SettingChange & change)
             {
                 const String & name = change.name;
-                return (name == "internal") || (name == "sync") || (name == "shard_num") || (name == "replica_num");
+                return (name == "internal") || (name == "async") || (name == "shard_num") || (name == "replica_num");
             });
 
         changes.emplace_back("internal", true);
-        changes.emplace_back("sync", true);
+        changes.emplace_back("async", false);
         changes.emplace_back("shard_num", params.shard_index);
         changes.emplace_back("replica_num", params.replica_index);
 
diff --git a/src/Parsers/ParserBackupQuery.cpp b/src/Parsers/ParserBackupQuery.cpp
index c354f28db17..17a8e7c5e0d 100644
--- a/src/Parsers/ParserBackupQuery.cpp
+++ b/src/Parsers/ParserBackupQuery.cpp
@@ -290,11 +290,11 @@ namespace
     bool parseSyncOrAsync(IParser::Pos & pos, Expected & expected, ASTPtr & settings)
     {
-        bool sync;
-        if (ParserKeyword{"SYNC"}.ignore(pos, expected))
-            sync = true;
-        else if (ParserKeyword{"ASYNC"}.ignore(pos, expected))
-            sync = false;
+        bool async;
+        if (ParserKeyword{"ASYNC"}.ignore(pos, expected))
+            async = true;
+        else if (ParserKeyword{"SYNC"}.ignore(pos, expected))
+            async = false;
         else
             return false;
 
@@ -304,8 
+304,8 @@ namespace changes = assert_cast(settings.get())->changes; } - boost::remove_erase_if(changes, [](const SettingChange & change) { return change.name == "sync"; }); - changes.emplace_back("sync", sync); + boost::remove_erase_if(changes, [](const SettingChange & change) { return change.name == "async"; }); + changes.emplace_back("async", async); auto new_settings = std::make_shared(); new_settings->changes = std::move(changes); diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index ebcc94841be..51daf6d37e8 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -58,12 +58,12 @@ def test_restore_table(engine): create_and_fill_table(engine=engine) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" - instance.query(f"RESTORE TABLE test.table FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -75,12 +75,12 @@ def test_restore_table_into_existing_table(engine): create_and_fill_table(engine=engine) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") - instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "200\t9900\n" - instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table INTO test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "300\t14850\n" @@ -89,11 +89,11 @@ def test_restore_table_under_another_name(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") assert instance.query("EXISTS test.table2") == "0\n" - instance.query(f"RESTORE TABLE test.table INTO test.table2 FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table INTO test.table2 FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" @@ -102,11 +102,11 @@ def test_backup_table_under_another_name(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table AS test.table2 TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table AS test.table2 TO {backup_name}") assert instance.query("EXISTS test.table2") == "0\n" - instance.query(f"RESTORE TABLE test.table2 FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table2 FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" @@ -116,9 +116,9 @@ def test_materialized_view(): "CREATE MATERIALIZED VIEW mv_1(x UInt8) ENGINE=MergeTree ORDER BY tuple() POPULATE AS SELECT 1 AS x" ) - instance.query(f"BACKUP TABLE mv_1 TO {backup_name} SYNC") + 
instance.query(f"BACKUP TABLE mv_1 TO {backup_name}") instance.query("DROP TABLE mv_1") - instance.query(f"RESTORE TABLE mv_1 FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE mv_1 FROM {backup_name}") assert instance.query("SELECT * FROM mv_1") == "1\n" instance.query("DROP TABLE mv_1") @@ -130,17 +130,17 @@ def test_incremental_backup(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") instance.query("INSERT INTO test.table VALUES (65, 'a'), (66, 'b')") assert instance.query("SELECT count(), sum(x) FROM test.table") == "102\t5081\n" instance.query( - f"BACKUP TABLE test.table TO {incremental_backup_name} SETTINGS base_backup = {backup_name} SYNC" + f"BACKUP TABLE test.table TO {incremental_backup_name} SETTINGS base_backup = {backup_name}" ) instance.query( - f"RESTORE TABLE test.table AS test.table2 FROM {incremental_backup_name} SYNC" + f"RESTORE TABLE test.table AS test.table2 FROM {incremental_backup_name}" ) assert instance.query("SELECT count(), sum(x) FROM test.table2") == "102\t5081\n" @@ -150,10 +150,10 @@ def test_incremental_backup_after_renaming_table(): incremental_backup_name = new_backup_name() create_and_fill_table() - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") instance.query("RENAME TABLE test.table TO test.table2") instance.query( - f"BACKUP TABLE test.table2 TO {incremental_backup_name} SETTINGS base_backup = {backup_name} SYNC" + f"BACKUP TABLE test.table2 TO {incremental_backup_name} SETTINGS base_backup = {backup_name}" ) # Files in a base backup can be searched by checksum, so an incremental backup with a renamed table actually @@ -174,7 +174,7 @@ def test_incremental_backup_after_renaming_table(): ) instance.query("DROP TABLE test.table2") - instance.query(f"RESTORE TABLE test.table2 FROM {incremental_backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table2 FROM {incremental_backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" @@ -185,17 +185,17 @@ def test_backup_not_found_or_already_exists(): assert re.search( expected_error, instance.query_and_get_error( - f"RESTORE TABLE test.table AS test.table2 FROM {backup_name} SYNC" + f"RESTORE TABLE test.table AS test.table2 FROM {backup_name}" ), ) create_and_fill_table() - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") expected_error = "Backup .* already exists" assert re.search( expected_error, - instance.query_and_get_error(f"BACKUP TABLE test.table TO {backup_name} SYNC"), + instance.query_and_get_error(f"BACKUP TABLE test.table TO {backup_name}"), ) @@ -204,12 +204,12 @@ def test_file_engine(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" - instance.query(f"RESTORE TABLE test.table FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -218,9 +218,9 @@ def test_database(): create_and_fill_table() assert 
instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP DATABASE test TO {backup_name} SYNC") + instance.query(f"BACKUP DATABASE test TO {backup_name}") instance.query("DROP DATABASE test") - instance.query(f"RESTORE DATABASE test FROM {backup_name} SYNC") + instance.query(f"RESTORE DATABASE test FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -230,13 +230,13 @@ def test_zip_archive(): create_and_fill_table() assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" - instance.query(f"BACKUP TABLE test.table TO {backup_name} SYNC") + instance.query(f"BACKUP TABLE test.table TO {backup_name}") assert os.path.isfile(get_path_to_backup(backup_name)) instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" - instance.query(f"RESTORE TABLE test.table FROM {backup_name} SYNC") + instance.query(f"RESTORE TABLE test.table FROM {backup_name}") assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -246,13 +246,13 @@ def test_zip_archive_with_settings(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" instance.query( - f"BACKUP TABLE test.table TO {backup_name} SETTINGS compression_method='lzma', compression_level=3, password='qwerty' SYNC" + f"BACKUP TABLE test.table TO {backup_name} SETTINGS compression_method='lzma', compression_level=3, password='qwerty'" ) instance.query("DROP TABLE test.table") assert instance.query("EXISTS test.table") == "0\n" instance.query( - f"RESTORE TABLE test.table FROM {backup_name} SETTINGS password='qwerty' SYNC" + f"RESTORE TABLE test.table FROM {backup_name} SETTINGS password='qwerty'" ) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 52a4918b5d0..09032dab56a 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -76,14 +76,14 @@ def test_replicated_table(): # Make backup on node 1. node1.query( - f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=1 SYNC" + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=1" ) # Drop table on both nodes. node1.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") # Restore from backup on node2. - node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} SYNC") + node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") assert node2.query("SELECT * FROM tbl ORDER BY x") == TSV( @@ -115,14 +115,14 @@ def test_replicated_database(): # Make backup. backup_name = new_backup_name() node1.query( - f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2 SYNC" + f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2" ) # Drop table on both nodes. node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' NO DELAY") # Restore from backup on node2. 
- node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} SYNC") + node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}") node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl") assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV( @@ -147,12 +147,12 @@ def test_different_tables_on_nodes(): backup_name = new_backup_name() node1.query( - f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS allow_storing_multiple_replicas = true SYNC" + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} SETTINGS allow_storing_multiple_replicas = true" ) node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") - node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name} SYNC") + node2.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") assert node1.query("SELECT * FROM tbl") == TSV( [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]] From 2697417a2b5cda2e96203d68dc97cf6c7b7d77fd Mon Sep 17 00:00:00 2001 From: terrylin Date: Wed, 27 Apr 2022 20:35:20 +0800 Subject: [PATCH 88/94] add test case --- ...lickhouse_dictionary_source_loop.reference | 4 ++ ...1780_clickhouse_dictionary_source_loop.sql | 38 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.reference b/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.reference index 0cfb83aa2f2..44a395e06b0 100644 --- a/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.reference +++ b/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.reference @@ -1,3 +1,7 @@ 1 1 2 2 3 3 +3 3 +0 +0 +4 4 diff --git a/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.sql b/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.sql index 2ea6119cef8..d4469963249 100644 --- a/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.sql +++ b/tests/queries/0_stateless/01780_clickhouse_dictionary_source_loop.sql @@ -52,4 +52,42 @@ SELECT * FROM 01780_db.dict3; DROP DICTIONARY 01780_db.dict3; +DROP TABLE IF EXISTS 01780_db.dict4_source; +CREATE TABLE 01780_db.dict4_source +( + id UInt64, + value String +) ENGINE = TinyLog; + +DROP TABLE IF EXISTS 01780_db.dict4_view; +CREATE VIEW 01780_db.dict4_view +( + id UInt64, + value String +) AS SELECT id, value FROM 01780_db.dict4_source WHERE id = (SELECT max(id) FROM 01780_db.dict4_source); + +INSERT INTO 01780_db.dict4_source VALUES (1, '1'), (2, '2'), (3, '3'); + +DROP DICTIONARY IF EXISTS 01780_db.dict4; +CREATE DICTIONARY 01780_db.dict4 +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 TABLE 'dict4_view' DATABASE '01780_db')) +LIFETIME(MIN 0 MAX 1) +LAYOUT(COMPLEX_KEY_HASHED()); + +SELECT * FROM 01780_db.dict4; + +INSERT INTO 01780_db.dict4_source VALUES (4, '4'); + +SELECT sleep(3); +SELECT sleep(3); + +SELECT * FROM 01780_db.dict4; + +DROP DICTIONARY 01780_db.dict4; + DROP DATABASE 01780_db; From 22189b0a5a1aa9c1365e0d036dd37ddb0569ed24 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 27 Apr 2022 16:53:35 +0300 Subject: [PATCH 89/94] Properly cancel INSERT queries in clickhouse-client This will also fix issues like queries left after the test, like in [1]. 
[1]: https://s3.amazonaws.com/clickhouse-test-reports/36686/042cf0c76444e8738eb2481ae21a135f05b4c990/stateless_tests__debug__actions__[2/3]/runlog.log Signed-off-by: Azat Khuzhin --- src/Client/ClientBase.cpp | 37 ++++++++++++------- src/Client/ClientBase.h | 1 + .../02290_client_insert_cancel.reference | 1 + .../0_stateless/02290_client_insert_cancel.sh | 18 +++++++++ 4 files changed, 43 insertions(+), 14 deletions(-) create mode 100644 tests/queries/0_stateless/02290_client_insert_cancel.reference create mode 100755 tests/queries/0_stateless/02290_client_insert_cancel.sh diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 905cd9fe6be..7bda1df1e62 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -765,21 +765,9 @@ void ClientBase::receiveResult(ASTPtr parsed_query) /// to avoid losing sync. if (!cancelled) { - auto cancel_query = [&] { - connection->sendCancel(); - if (is_interactive) - { - progress_indication.clearProgressOutput(); - std::cout << "Cancelling query." << std::endl; - - } - cancelled = true; - }; - - /// handler received sigint if (QueryInterruptHandler::cancelled()) { - cancel_query(); + cancelQuery(); } else { @@ -790,7 +778,7 @@ void ClientBase::receiveResult(ASTPtr parsed_query) << " Waited for " << static_cast(elapsed) << " seconds," << " timeout is " << receive_timeout.totalSeconds() << " seconds." << std::endl; - cancel_query(); + cancelQuery(); } } } @@ -1066,6 +1054,9 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars return; } + QueryInterruptHandler::start(); + SCOPE_EXIT({ QueryInterruptHandler::stop(); }); + connection->sendQuery( connection_parameters.timeouts, query, @@ -1242,6 +1233,13 @@ try Block block; while (executor.pull(block)) { + if (!cancelled && QueryInterruptHandler::cancelled()) + { + cancelQuery(); + executor.cancel(); + return; + } + /// Check if server send Log packet receiveLogs(parsed_query); @@ -1346,6 +1344,17 @@ bool ClientBase::receiveEndOfQuery() } } +void ClientBase::cancelQuery() +{ + connection->sendCancel(); + if (is_interactive) + { + progress_indication.clearProgressOutput(); + std::cout << "Cancelling query." << std::endl; + + } + cancelled = true; +} void ClientBase::processParsedSingleQuery(const String & full_query, const String & query_to_execute, ASTPtr parsed_query, std::optional echo_query_, bool report_error) diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 2684ae2481f..055a57e2f3c 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -112,6 +112,7 @@ private: void receiveLogs(ASTPtr parsed_query); bool receiveSampleBlock(Block & out, ColumnsDescription & columns_description, ASTPtr parsed_query); bool receiveEndOfQuery(); + void cancelQuery(); void onProgress(const Progress & value); void onData(Block & block, ASTPtr parsed_query); diff --git a/tests/queries/0_stateless/02290_client_insert_cancel.reference b/tests/queries/0_stateless/02290_client_insert_cancel.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/02290_client_insert_cancel.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/02290_client_insert_cancel.sh b/tests/queries/0_stateless/02290_client_insert_cancel.sh new file mode 100755 index 00000000000..9e0a2f3571b --- /dev/null +++ b/tests/queries/0_stateless/02290_client_insert_cancel.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +yes 1 | $CLICKHOUSE_CLIENT --query_id "$CLICKHOUSE_TEST_UNIQUE_NAME" -q "insert into function null('n Int') format TSV" & +client_pid=$! + +# wait for the query +while [ "$($CLICKHOUSE_CLIENT -q "select count() from system.processes where query_id = '$CLICKHOUSE_TEST_UNIQUE_NAME'")" = 0 ]; do + sleep 0.1 +done + +kill -INT $client_pid +wait $client_pid +# if client does not cancel it properly (i.e. cancel the query), then return code will be 2, otherwise 0 +echo $? From 326be13c056baee4df1d255ad0c2b8b2d1e043a2 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Thu, 28 Apr 2022 02:30:08 +0000 Subject: [PATCH 90/94] fix: utils/clickhouse-diagnostics/requirements.txt to reduce vulnerabilities The following vulnerabilities are fixed by pinning transitive dependencies: - https://snyk.io/vuln/SNYK-PYTHON-NUMPY-2321964 - https://snyk.io/vuln/SNYK-PYTHON-NUMPY-2321966 - https://snyk.io/vuln/SNYK-PYTHON-NUMPY-2321969 - https://snyk.io/vuln/SNYK-PYTHON-NUMPY-2321970 --- utils/clickhouse-diagnostics/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/clickhouse-diagnostics/requirements.txt b/utils/clickhouse-diagnostics/requirements.txt index d29acab2161..25a6070988d 100644 --- a/utils/clickhouse-diagnostics/requirements.txt +++ b/utils/clickhouse-diagnostics/requirements.txt @@ -5,3 +5,4 @@ sqlparse tenacity xmltodict pandas +numpy>=1.22.2 # not directly required, pinned by Snyk to avoid a vulnerability From 7d798b36553f0fa95f4ad431d34f0d3c76f65aa3 Mon Sep 17 00:00:00 2001 From: zhanglistar Date: Thu, 28 Apr 2022 20:55:37 +0800 Subject: [PATCH 91/94] Count distinct optimization by using subquery of group by (#35993) --- src/Core/Settings.h | 1 + src/Interpreters/InterpreterSelectQuery.cpp | 7 ++ .../RewriteCountDistinctVisitor.cpp | 76 +++++++++++++++++++ .../RewriteCountDistinctVisitor.h | 22 ++++++ ...2152_count_distinct_optimization.reference | 10 +++ .../02152_count_distinct_optimization.sql | 18 +++++ 6 files changed, 134 insertions(+) create mode 100644 src/Interpreters/RewriteCountDistinctVisitor.cpp create mode 100644 src/Interpreters/RewriteCountDistinctVisitor.h create mode 100644 tests/queries/0_stateless/02152_count_distinct_optimization.reference create mode 100644 tests/queries/0_stateless/02152_count_distinct_optimization.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a68eec3f6a3..af25feefc6c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -585,6 +585,7 @@ class IColumn; M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ + M(Bool, count_distinct_optimization, false, "Rewrite count distinct to subquery of group by", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \ // End of COMMON_SETTINGS diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index b7d2945c857..5650884e3e9 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -315,6 +316,12 @@ 
InterpreterSelectQuery::InterpreterSelectQuery(
 
     query_info.original_query = query_ptr->clone();
 
+    if (settings.count_distinct_optimization)
+    {
+        RewriteCountDistinctFunctionMatcher::Data data_rewrite_countdistinct;
+        RewriteCountDistinctFunctionVisitor(data_rewrite_countdistinct).visit(query_ptr);
+    }
+
     JoinedTables joined_tables(getSubqueryContext(context), getSelectQuery(), options.with_all_cols);
 
     bool got_storage_from_query = false;
diff --git a/src/Interpreters/RewriteCountDistinctVisitor.cpp b/src/Interpreters/RewriteCountDistinctVisitor.cpp
new file mode 100644
index 00000000000..28950b7a592
--- /dev/null
+++ b/src/Interpreters/RewriteCountDistinctVisitor.cpp
@@ -0,0 +1,76 @@
+#include
+#include
+#include
+#include
+#include
+#include "Coordination/KeeperStorage.h"
+#include "Parsers/ASTExpressionList.h"
+#include "Parsers/ASTIdentifier.h"
+#include "Parsers/ASTSelectQuery.h"
+#include "Parsers/ASTSubquery.h"
+#include "Parsers/ASTTablesInSelectQuery.h"
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+void RewriteCountDistinctFunctionMatcher::visit(ASTPtr & ast, Data & /*data*/)
+{
+    auto * selectq = ast->as<ASTSelectQuery>();
+    if (!selectq || !selectq->tables() || selectq->tables()->children.size() != 1)
+        return;
+    auto expr_list = selectq->select();
+    if (!expr_list || expr_list->children.size() != 1)
+        return;
+    auto * func = expr_list->children[0]->as<ASTFunction>();
+    if (!func || (Poco::toLower(func->name) != "countdistinct" && Poco::toLower(func->name) != "uniqexact"))
+        return;
+    auto arg = func->arguments->children;
+    if (arg.size() != 1)
+        return;
+    if (!arg[0]->as<ASTIdentifier>())
+        return;
+    if (selectq->tables()->as<ASTTablesInSelectQuery>()->children[0]->as<ASTTablesInSelectQueryElement>()->children.size() != 1)
+        return;
+    auto * table_expr = selectq->tables()->as<ASTTablesInSelectQuery>()->children[0]->as<ASTTablesInSelectQueryElement>()->children[0]->as<ASTTableExpression>();
+    if (!table_expr || table_expr->size() != 1 || !table_expr->database_and_table_name)
+        return;
+    // Check done, we now rewrite the AST
+    auto cloned_select_query = selectq->clone();
+    expr_list->children[0] = makeASTFunction("count");
+
+    auto table_name = table_expr->database_and_table_name->as<ASTIdentifier>()->name();
+    table_expr->children.clear();
+    table_expr->children.emplace_back(std::make_shared<ASTSubquery>());
+    table_expr->database_and_table_name = nullptr;
+    table_expr->table_function = nullptr;
+    table_expr->subquery = table_expr->children[0];
+
+    auto column_name = arg[0]->as<ASTIdentifier>()->name();
+    // Form AST for subquery
+    {
+        auto * select_ptr = cloned_select_query->as<ASTSelectQuery>();
+        select_ptr->refSelect()->children.clear();
+        select_ptr->refSelect()->children.emplace_back(std::make_shared<ASTIdentifier>(column_name));
+        auto exprlist = std::make_shared<ASTExpressionList>();
+        exprlist->children.emplace_back(std::make_shared<ASTIdentifier>(column_name));
+        cloned_select_query->as<ASTSelectQuery>()->setExpression(ASTSelectQuery::Expression::GROUP_BY, exprlist);
+
+        auto expr = std::make_shared<ASTExpressionList>();
+        expr->children.emplace_back(cloned_select_query);
+        auto select_with_union = std::make_shared<ASTSelectWithUnionQuery>();
+        select_with_union->union_mode = SelectUnionMode::Unspecified;
+        select_with_union->is_normalized = false;
+        select_with_union->list_of_modes.clear();
+        select_with_union->set_of_modes.clear();
+        select_with_union->children.emplace_back(expr);
+        select_with_union->list_of_selects = expr;
+        table_expr->children[0]->as<ASTSubquery>()->children.emplace_back(select_with_union);
+    }
+}
+
+}
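In SQL terms, the visitor above performs roughly the following rewrite (a sketch mirroring the comment in the header file below; the table and column names are illustrative):

``` sql
SET count_distinct_optimization = 1;

-- A query of this shape:
SELECT countDistinct(a) FROM t;

-- is executed approximately as:
SELECT count() FROM (SELECT a FROM t GROUP BY a);
```

The same rewrite applies to `uniqExact`, since the matcher accepts both function names.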
diff --git a/src/Interpreters/RewriteCountDistinctVisitor.h b/src/Interpreters/RewriteCountDistinctVisitor.h
new file mode 100644
index 00000000000..b15c03f9c24
--- /dev/null
+++ b/src/Interpreters/RewriteCountDistinctVisitor.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include
+#include
+#include "Interpreters/TreeRewriter.h"
+
+namespace DB
+{
+
+class ASTFunction;
+
+/// Really simple rewrite 'select countDistinct(a) from t' to 'select count(1) from (select a from t group by a)'
+class RewriteCountDistinctFunctionMatcher
+{
+public:
+    struct Data {};
+    static void visit(ASTPtr & ast, Data &);
+    static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; }
+};
+
+using RewriteCountDistinctFunctionVisitor = InDepthNodeVisitor<RewriteCountDistinctFunctionMatcher, true>;
+}
diff --git a/tests/queries/0_stateless/02152_count_distinct_optimization.reference b/tests/queries/0_stateless/02152_count_distinct_optimization.reference
new file mode 100644
index 00000000000..968a6f4740f
--- /dev/null
+++ b/tests/queries/0_stateless/02152_count_distinct_optimization.reference
@@ -0,0 +1,10 @@
+2
+3
+5
+1
+1
+2
+3
+5
+1
+1
diff --git a/tests/queries/0_stateless/02152_count_distinct_optimization.sql b/tests/queries/0_stateless/02152_count_distinct_optimization.sql
new file mode 100644
index 00000000000..abb47763a53
--- /dev/null
+++ b/tests/queries/0_stateless/02152_count_distinct_optimization.sql
@@ -0,0 +1,18 @@
+drop table if exists table_02152;
+
+create table table_02152 (a String, b LowCardinality(String)) engine = MergeTree order by a;
+insert into table_02152 values ('a_1', 'b_1') ('a_2', 'b_2') ('a_1', 'b_3') ('a_2', 'b_2');
+
+set count_distinct_optimization=true;
+select countDistinct(a) from table_02152;
+select countDistinct(b) from table_02152;
+select uniqExact(m) from (select number, (number / 2)::UInt64 as m from numbers(10));
+select uniqExact(x) from numbers(10) group by number % 2 as x;
+
+set count_distinct_optimization=false;
+select countDistinct(a) from table_02152;
+select countDistinct(b) from table_02152;
+select uniqExact(m) from (select number, (number / 2)::UInt64 as m from numbers(10));
+select uniqExact(x) from numbers(10) group by number % 2 as x;
+
+drop table if exists table_02152;

From ca519ca75e4f7cbb405804dbcd07934005a7d970 Mon Sep 17 00:00:00 2001
From: Anton Kozlov
Date: Thu, 28 Apr 2022 13:56:53 +0100
Subject: [PATCH 92/94] Improve performance of file descriptor cache by
 narrowing mutex scopes (#36682)

---
 src/IO/OpenedFile.cpp | 22 ++++++++++++++++------
 src/IO/OpenedFile.h   | 12 ++++++++----
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/IO/OpenedFile.cpp b/src/IO/OpenedFile.cpp
index 6df21e836b4..f8c460b0833 100644
--- a/src/IO/OpenedFile.cpp
+++ b/src/IO/OpenedFile.cpp
@@ -1,3 +1,4 @@
+#include <mutex>
 #include
 #include
 
@@ -22,7 +23,7 @@
 }
 
 
-void OpenedFile::open(int flags)
+void OpenedFile::open() const
 {
     ProfileEvents::increment(ProfileEvents::FileOpen);
 
@@ -33,6 +34,13 @@
         errno == ENOENT ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE);
 }
 
+int OpenedFile::getFD() const
+{
+    std::lock_guard l(mutex);
+    if (fd == -1)
+        open();
+    return fd;
+}
 
 std::string OpenedFile::getFileName() const
 {
@@ -40,22 +48,24 @@ std::string OpenedFile::getFileName() const
 }
 
 
-OpenedFile::OpenedFile(const std::string & file_name_, int flags)
-    : file_name(file_name_)
+OpenedFile::OpenedFile(const std::string & file_name_, int flags_)
+    : file_name(file_name_), flags(flags_)
 {
-    open(flags);
 }
 
 
 OpenedFile::~OpenedFile()
 {
-    if (fd != -1)
-        close();    /// Exceptions will lead to std::terminate and that's Ok.
+    close();    /// Exceptions will lead to std::terminate and that's Ok.
 }
 
 
 void OpenedFile::close()
 {
+    std::lock_guard l(mutex);
+    if (fd == -1)
+        return;
+
     if (0 != ::close(fd))
         throw Exception("Cannot close file", ErrorCodes::CANNOT_CLOSE_FILE);
 
diff --git a/src/IO/OpenedFile.h b/src/IO/OpenedFile.h
index 8af0c83c363..10c36d9e1d3 100644
--- a/src/IO/OpenedFile.h
+++ b/src/IO/OpenedFile.h
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include <mutex>
 
 
 namespace CurrentMetrics
@@ -17,22 +18,25 @@ namespace DB
 class OpenedFile
 {
 public:
-    OpenedFile(const std::string & file_name_, int flags);
+    OpenedFile(const std::string & file_name_, int flags_);
     ~OpenedFile();
 
     /// Close prematurely.
     void close();
 
-    int getFD() const { return fd; }
+    int getFD() const;
+
     std::string getFileName() const;
 
 private:
     std::string file_name;
-    int fd = -1;
+    int flags = 0;
+
+    mutable int fd = -1;
+    mutable std::mutex mutex;
 
     CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead};
 
-    void open(int flags);
+    void open() const;
 };
 
 }

From 7c3cf30163891ab9c68c01a0382f8441228f56bb Mon Sep 17 00:00:00 2001
From: vdimir
Date: Thu, 28 Apr 2022 14:40:51 +0200
Subject: [PATCH 93/94] Fix build SeekableReadBufferWithSize ->
 SeekableReadBuffer

---
 src/IO/ConcatSeekableReadBuffer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/IO/ConcatSeekableReadBuffer.h b/src/IO/ConcatSeekableReadBuffer.h
index 26314a218ea..fd9417cef8a 100644
--- a/src/IO/ConcatSeekableReadBuffer.h
+++ b/src/IO/ConcatSeekableReadBuffer.h
@@ -8,10 +8,10 @@ namespace DB
 {
 
 /// Reads from the concatenation of multiple SeekableReadBuffer's
-class ConcatSeekableReadBuffer : public SeekableReadBufferWithSize
+class ConcatSeekableReadBuffer : public SeekableReadBuffer, public WithFileSize
 {
 public:
-    ConcatSeekableReadBuffer() : SeekableReadBufferWithSize(nullptr, 0) { }
+    ConcatSeekableReadBuffer() : SeekableReadBuffer(nullptr, 0) { }
 
     ConcatSeekableReadBuffer(std::unique_ptr<SeekableReadBuffer> buf1, size_t size1, std::unique_ptr<SeekableReadBuffer> buf2, size_t size2);
     ConcatSeekableReadBuffer(SeekableReadBuffer & buf1, size_t size1, SeekableReadBuffer & buf2, size_t size2);
@@ -21,7 +21,7 @@ public:
     off_t seek(off_t off, int whence) override;
     off_t getPosition() override;
 
-    std::optional<size_t> getTotalSize() override { return total_size; }
+    std::optional<size_t> getFileSize() override { return total_size; }
 
 private:
     bool nextImpl() override;

From 7dc084419e16c52650088a3ae948850d94c4edf5 Mon Sep 17 00:00:00 2001
From: Nikita Mikhaylov
Date: Thu, 28 Apr 2022 15:12:25 +0200
Subject: [PATCH 94/94] Check socket is connected in HTTPSession (#36683)

---
 src/Server/HTTP/HTTPServerConnection.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp
index e365c9f31d0..6b2ef32f6a4 100644
--- a/src/Server/HTTP/HTTPServerConnection.cpp
+++ b/src/Server/HTTP/HTTPServerConnection.cpp
@@ -22,12 +22,12 @@ void HTTPServerConnection::run()
     std::string server = params->getSoftwareVersion();
     Poco::Net::HTTPServerSession session(socket(), params);
 
-    while (!stopped && tcp_server.isOpen() && session.hasMoreRequests())
+    while (!stopped && tcp_server.isOpen() && session.hasMoreRequests() && session.connected())
     {
         try
         {
             std::unique_lock lock(mutex);
-            if (!stopped && tcp_server.isOpen())
+            if (!stopped && tcp_server.isOpen() && session.connected())
             {
                 HTTPServerResponse response(session);
                 HTTPServerRequest request(context, response, session);