From d0a49628f421a63b8344b4c77927a2b17c5fcddf Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 22 Apr 2022 12:17:23 +0300 Subject: [PATCH 001/227] Added global_max_threads parameter. --- src/Core/Settings.h | 1 + src/Interpreters/ProcessList.cpp | 20 +++++++++++++- src/Interpreters/ProcessList.h | 3 +++ src/QueryPipeline/QueryPipelineBuilder.cpp | 31 ++++++++++++++++++++++ src/QueryPipeline/QueryPipelineBuilder.h | 16 +++++------ 5 files changed, 61 insertions(+), 10 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index aa78456702c..cbf41379564 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -47,6 +47,7 @@ class IColumn; M(UInt64, max_insert_delayed_streams_for_parallel_write, 0, "The maximum number of streams (columns) to delay final part flush. Default - auto (1000 in case of underlying storage supports parallel write, for example S3 and disabled otherwise)", 0) \ M(UInt64, max_final_threads, 16, "The maximum number of threads to read from table with FINAL.", 0) \ M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \ + M(MaxThreads, global_max_threads, 0, "The total maximum number of threads for all requests.", 0) \ M(MaxThreads, max_download_threads, 4, "The maximum number of threads to download data (e.g. for URL engine).", 0) \ M(UInt64, max_download_buffer_size, 10*1024*1024, "The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 26146781327..9963a3ba0a2 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -15,7 +15,8 @@ #include #include #include - +#include +#include namespace CurrentMetrics { @@ -71,6 +72,10 @@ static bool isUnlimitedQuery(const IAST * ast) ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * ast, ContextPtr query_context) { EntryPtr res; + auto adqm_log = &Poco::Logger::get("ADQM"); + LOG_DEBUG(adqm_log,"Inserting query into process list: {}", query_); + LOG_DEBUG(adqm_log,"Num of concurrent queries: {}", processes.size()); + LOG_DEBUG(adqm_log,"Global Num Threads: {}", getGlobalNumThreads()); const ClientInfo & client_info = query_context->getClientInfo(); const Settings & settings = query_context->getSettingsRef(); @@ -500,6 +505,19 @@ ProcessList::Info ProcessList::getInfo(bool get_thread_list, bool get_profile_ev return per_query_infos; } +size_t ProcessList::getGlobalNumThreads() const +{ + size_t global_num_threads = 0; + std::lock_guard lock(mutex); + + for (const auto & process : processes) + { + auto qsi = process.getInfo(true); + global_num_threads += qsi.thread_ids.size(); + } + + return global_num_threads; +} ProcessListForUser::ProcessListForUser(ProcessList * global_process_list) : user_overcommit_tracker(global_process_list, this) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 0994f34d003..09f783c4602 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -342,6 +342,9 @@ public: /// Get current state of process list. Info getInfo(bool get_thread_list = false, bool get_profile_events = false, bool get_settings = false) const; + /// Get total number of threads for all queries in process list. 
+ size_t getGlobalNumThreads() const; + /// Get current state of process list per user. UserInfo getUserInfo(bool get_profile_events = false) const; diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 9f392b51cf0..74d574dc7fa 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include namespace DB { @@ -458,6 +460,35 @@ void QueryPipelineBuilder::setProcessListElement(QueryStatus * elem) } } +size_t QueryPipelineBuilder::getNumThreads() const +{ + auto num_threads = pipe.maxParallelStreams(); + + auto adqm_log = &Poco::Logger::get("ADQM"); + LOG_DEBUG(adqm_log,"maxParallelStreams: {}", num_threads); + LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); + + if (max_threads) //-V1051 + num_threads = std::min(num_threads, max_threads); + + LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); + auto context = process_list_element->getContext(); + auto global_max_threads = context->getSettingsRef().global_max_threads; + if (process_list_element && global_max_threads) { + LOG_DEBUG(adqm_log,"Global number of threads from config: {}", global_max_threads); + LOG_DEBUG(adqm_log,"Current global num threads: {}", + context->getProcessList().getGlobalNumThreads()); + auto globally_available_threads = global_max_threads - context->getProcessList().getGlobalNumThreads(); + LOG_DEBUG(adqm_log,"Globally available threads: {}", globally_available_threads); + num_threads = std::min(num_threads, globally_available_threads); + LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); + } + + num_threads = std::max(1, num_threads); + LOG_DEBUG(adqm_log,"Final num threads: {}", num_threads); + return num_threads; +} + PipelineExecutorPtr QueryPipelineBuilder::execute() { if (!isCompleted()) diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index ac84191cf34..10ce1a52aa1 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -6,6 +6,10 @@ #include #include #include +#include +#include +#include + namespace DB { @@ -30,6 +34,8 @@ struct ExpressionActionsSettings; class IJoin; using JoinPtr = std::shared_ptr; +class Context; + class QueryPipelineBuilder { public: @@ -130,15 +136,7 @@ public: void setProcessListElement(QueryStatus * elem); /// Recommend number of threads for pipeline execution. 
- size_t getNumThreads() const - { - auto num_threads = pipe.maxParallelStreams(); - - if (max_threads) //-V1051 - num_threads = std::min(num_threads, max_threads); - - return std::max(1, num_threads); - } + size_t getNumThreads() const; /// Set upper limit for the recommend number of threads void setMaxThreads(size_t max_threads_) { max_threads = max_threads_; } From 7c02bd75e38cc94d5cb5ab223ea2399851a8fb29 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 22 Apr 2022 14:13:44 +0300 Subject: [PATCH 002/227] Corrected bug in number of threads calculation --- src/QueryPipeline/QueryPipelineBuilder.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 74d574dc7fa..b692152ce49 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -478,7 +478,12 @@ size_t QueryPipelineBuilder::getNumThreads() const LOG_DEBUG(adqm_log,"Global number of threads from config: {}", global_max_threads); LOG_DEBUG(adqm_log,"Current global num threads: {}", context->getProcessList().getGlobalNumThreads()); - auto globally_available_threads = global_max_threads - context->getProcessList().getGlobalNumThreads(); + size_t current_global_num_threads = context->getProcessList().getGlobalNumThreads(); + size_t globally_available_threads; + if (global_max_threads > current_global_num_threads) + globally_available_threads = global_max_threads - current_global_num_threads; + else + globally_available_threads = 0; LOG_DEBUG(adqm_log,"Globally available threads: {}", globally_available_threads); num_threads = std::min(num_threads, globally_available_threads); LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); From 0c467872d962ec28b01d75c11cd1f78006aa340c Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 22 Apr 2022 17:17:29 +0300 Subject: [PATCH 003/227] Moved global_max_threads parameter from users.xml file to config.xml file --- programs/server/Server.cpp | 3 +++ src/Core/Settings.h | 1 - src/Interpreters/ProcessList.h | 11 +++++++++++ src/QueryPipeline/QueryPipelineBuilder.cpp | 2 +- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index fc9187cb622..b7848c0cd93 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1107,6 +1107,9 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); + if (config->has("global_max_threads")) + global_context->getProcessList().setGlobalMaxThreads(config->getInt("global_max_threads", 0)); + if (config->has("max_concurrent_queries")) global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0)); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index cbf41379564..aa78456702c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -47,7 +47,6 @@ class IColumn; M(UInt64, max_insert_delayed_streams_for_parallel_write, 0, "The maximum number of streams (columns) to delay final part flush. Default - auto (1000 in case of underlying storage supports parallel write, for example S3 and disabled otherwise)", 0) \ M(UInt64, max_final_threads, 16, "The maximum number of threads to read from table with FINAL.", 0) \ M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. 
By default, it is determined automatically.", 0) \ - M(MaxThreads, global_max_threads, 0, "The total maximum number of threads for all requests.", 0) \ M(MaxThreads, max_download_threads, 4, "The maximum number of threads to download data (e.g. for URL engine).", 0) \ M(UInt64, max_download_buffer_size, 10*1024*1024, "The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 09f783c4602..75f80de2fc7 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -301,6 +301,9 @@ protected: Container processes; size_t max_size = 0; /// 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + /// The total maximum number of threads for all requests. + size_t global_max_threads = 0; /// 0 means no limit. + /// Stores per-user info: queries, statistics and limits UserToQueries user_to_queries; @@ -345,6 +348,14 @@ public: /// Get total number of threads for all queries in process list. size_t getGlobalNumThreads() const; + size_t getGlobalMaxThreads() const { return global_max_threads; } + + void setGlobalMaxThreads(size_t global_max_threads_) + { + std::lock_guard lock(mutex); + global_max_threads = global_max_threads_; + } + /// Get current state of process list per user. UserInfo getUserInfo(bool get_profile_events = false) const; diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index b692152ce49..3612eba2bf0 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -473,7 +473,7 @@ size_t QueryPipelineBuilder::getNumThreads() const LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); auto context = process_list_element->getContext(); - auto global_max_threads = context->getSettingsRef().global_max_threads; + auto global_max_threads = context->getProcessList().getGlobalMaxThreads(); if (process_list_element && global_max_threads) { LOG_DEBUG(adqm_log,"Global number of threads from config: {}", global_max_threads); LOG_DEBUG(adqm_log,"Current global num threads: {}", From d2efefa17fdd83cecd2021babc7035b8a14b88eb Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 28 Apr 2022 11:29:46 +0300 Subject: [PATCH 004/227] Added default value for globally_available_threads --- src/QueryPipeline/QueryPipelineBuilder.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 3612eba2bf0..1db3f5642a6 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -479,11 +479,9 @@ size_t QueryPipelineBuilder::getNumThreads() const LOG_DEBUG(adqm_log,"Current global num threads: {}", context->getProcessList().getGlobalNumThreads()); size_t current_global_num_threads = context->getProcessList().getGlobalNumThreads(); - size_t globally_available_threads; + size_t globally_available_threads = 0; if (global_max_threads > current_global_num_threads) globally_available_threads = global_max_threads - current_global_num_threads; - else - globally_available_threads = 0; LOG_DEBUG(adqm_log,"Globally available threads: {}", globally_available_threads); num_threads = std::min(num_threads, globally_available_threads); LOG_DEBUG(adqm_log,"Recommended num threads: {}", 
num_threads); From 5e22014b48e91b2e3a431bb998b940712521b160 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 29 Apr 2022 16:57:16 +0300 Subject: [PATCH 005/227] Added test_global_max_threads integration test --- .../test_global_max_threads/__init__.py | 0 .../configs/config_default.xml | 7 +++++ .../configs/config_defined.xml | 8 ++++++ .../test_global_max_threads/configs/users.xml | 23 ++++++++++++++++ .../test_global_max_threads/test.py | 27 +++++++++++++++++++ 5 files changed, 65 insertions(+) create mode 100644 tests/integration/test_global_max_threads/__init__.py create mode 100644 tests/integration/test_global_max_threads/configs/config_default.xml create mode 100644 tests/integration/test_global_max_threads/configs/config_defined.xml create mode 100644 tests/integration/test_global_max_threads/configs/users.xml create mode 100644 tests/integration/test_global_max_threads/test.py diff --git a/tests/integration/test_global_max_threads/__init__.py b/tests/integration/test_global_max_threads/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_global_max_threads/configs/config_default.xml b/tests/integration/test_global_max_threads/configs/config_default.xml new file mode 100644 index 00000000000..7a8ff1fb420 --- /dev/null +++ b/tests/integration/test_global_max_threads/configs/config_default.xml @@ -0,0 +1,7 @@ + + + + system + query_log
+
+
diff --git a/tests/integration/test_global_max_threads/configs/config_defined.xml b/tests/integration/test_global_max_threads/configs/config_defined.xml new file mode 100644 index 00000000000..a330452d2fb --- /dev/null +++ b/tests/integration/test_global_max_threads/configs/config_defined.xml @@ -0,0 +1,8 @@ + + + 50 + + system + query_log
+
+
diff --git a/tests/integration/test_global_max_threads/configs/users.xml b/tests/integration/test_global_max_threads/configs/users.xml new file mode 100644 index 00000000000..63fefbb803b --- /dev/null +++ b/tests/integration/test_global_max_threads/configs/users.xml @@ -0,0 +1,23 @@ + + + + + 1 + 100 + + + + + + + ::/0 + + default + default + + + + + + + diff --git a/tests/integration/test_global_max_threads/test.py b/tests/integration/test_global_max_threads/test.py new file mode 100644 index 00000000000..fe3d36c9dcf --- /dev/null +++ b/tests/integration/test_global_max_threads/test.py @@ -0,0 +1,27 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance("node1", main_configs=["configs/config_default.xml"], user_configs=["configs/users.xml"]) +node2 = cluster.add_instance("node2", main_configs=["configs/config_defined.xml"], user_configs=["configs/users.xml"]) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_global_max_threads_default(started_cluster): + node1.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_global_max_threads_1"); + node1.query("SYSTEM FLUSH LOGS"); + assert node1.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_global_max_threads_1'") == "102\n" + + +def test_global_max_threads_defined(started_cluster): + node2.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_global_max_threads_2"); + node2.query("SYSTEM FLUSH LOGS"); + assert node2.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_global_max_threads_2'") == "51\n" From 925a94a656fc9cb626a8d452b389a42e837d0f31 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 12 May 2022 16:56:24 +0300 Subject: [PATCH 006/227] Implemented -1 value handling for global_max_threads parameter --- programs/server/Server.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b7848c0cd93..94b6bc6728b 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1107,8 +1107,17 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); - if (config->has("global_max_threads")) - global_context->getProcessList().setGlobalMaxThreads(config->getInt("global_max_threads", 0)); + if (config->has("global_max_threads")) { + auto adqm_log = &Poco::Logger::get("ADQM"); + auto global_max_threads = config->getInt("global_max_threads", 0); + LOG_DEBUG(adqm_log,"From config.xml global_max_threads: {}", global_max_threads); + if (global_max_threads == -1) { + LOG_DEBUG(adqm_log,"PhysicalCPUCores: {}", getNumberOfPhysicalCPUCores()); + global_max_threads = getNumberOfPhysicalCPUCores()*2; + } + LOG_DEBUG(adqm_log,"Finally global_max_threads: {}", global_max_threads); + global_context->getProcessList().setGlobalMaxThreads(global_max_threads); + } if (config->has("max_concurrent_queries")) global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0)); From d94e9c4263dcfb0269ccf56b05dc17f0ea485831 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 13 May 2022 10:56:34 +0300 Subject: [PATCH 007/227] For 
value -1 of global_max_threads replaced physical cores count by logical cores count --- programs/server/Server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 94b6bc6728b..7ebe52acf6f 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1112,8 +1112,8 @@ int Server::main(const std::vector & /*args*/) auto global_max_threads = config->getInt("global_max_threads", 0); LOG_DEBUG(adqm_log,"From config.xml global_max_threads: {}", global_max_threads); if (global_max_threads == -1) { - LOG_DEBUG(adqm_log,"PhysicalCPUCores: {}", getNumberOfPhysicalCPUCores()); - global_max_threads = getNumberOfPhysicalCPUCores()*2; + LOG_DEBUG(adqm_log,"number of logical cores: {}", std::thread::hardware_concurrency()); + global_max_threads = std::thread::hardware_concurrency()*2; } LOG_DEBUG(adqm_log,"Finally global_max_threads: {}", global_max_threads); global_context->getProcessList().setGlobalMaxThreads(global_max_threads); From 87f25c6864c379d11185db06825538bd481e388c Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 13 May 2022 15:02:45 +0300 Subject: [PATCH 008/227] Added thread_factor = 2 constant for initial value calculation of global_max_threads --- programs/server/Server.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 7ebe52acf6f..22f01cb6869 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1112,8 +1112,10 @@ int Server::main(const std::vector & /*args*/) auto global_max_threads = config->getInt("global_max_threads", 0); LOG_DEBUG(adqm_log,"From config.xml global_max_threads: {}", global_max_threads); if (global_max_threads == -1) { + // Based on tests global_max_threads has an optimal value when it's about two times of logical CPU cores + constexpr size_t thread_factor = 2; LOG_DEBUG(adqm_log,"number of logical cores: {}", std::thread::hardware_concurrency()); - global_max_threads = std::thread::hardware_concurrency()*2; + global_max_threads = std::thread::hardware_concurrency()*thread_factor; } LOG_DEBUG(adqm_log,"Finally global_max_threads: {}", global_max_threads); global_context->getProcessList().setGlobalMaxThreads(global_max_threads); From b06d84b3e56270dcf4a4451531e9f55a20cd48f3 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 13 May 2022 16:56:03 +0300 Subject: [PATCH 009/227] renamed global_max_threads parameter to total_max_threads --- programs/server/Server.cpp | 16 ++++++------- src/Interpreters/ProcessList.cpp | 10 ++++---- src/Interpreters/ProcessList.h | 8 +++---- src/QueryPipeline/QueryPipelineBuilder.cpp | 24 +++++++++---------- .../__init__.py | 0 .../configs/config_default.xml | 0 .../configs/config_defined.xml | 2 +- .../configs/users.xml | 0 .../test.py | 12 +++++----- 9 files changed, 36 insertions(+), 36 deletions(-) rename tests/integration/{test_global_max_threads => test_total_max_threads}/__init__.py (100%) rename tests/integration/{test_global_max_threads => test_total_max_threads}/configs/config_default.xml (100%) rename tests/integration/{test_global_max_threads => test_total_max_threads}/configs/config_defined.xml (89%) rename tests/integration/{test_global_max_threads => test_total_max_threads}/configs/users.xml (100%) rename tests/integration/{test_global_max_threads => test_total_max_threads}/test.py (75%) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 22f01cb6869..fc35f0a0c53 100644 --- 
a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1107,18 +1107,18 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); - if (config->has("global_max_threads")) { + if (config->has("total_max_threads")) { auto adqm_log = &Poco::Logger::get("ADQM"); - auto global_max_threads = config->getInt("global_max_threads", 0); - LOG_DEBUG(adqm_log,"From config.xml global_max_threads: {}", global_max_threads); - if (global_max_threads == -1) { - // Based on tests global_max_threads has an optimal value when it's about two times of logical CPU cores + auto total_max_threads = config->getInt("total_max_threads", 0); + LOG_DEBUG(adqm_log,"From config.xml total_max_threads: {}", total_max_threads); + if (total_max_threads == -1) { + // Based on tests total_max_threads has an optimal value when it's about two times of logical CPU cores constexpr size_t thread_factor = 2; LOG_DEBUG(adqm_log,"number of logical cores: {}", std::thread::hardware_concurrency()); - global_max_threads = std::thread::hardware_concurrency()*thread_factor; + total_max_threads = std::thread::hardware_concurrency()*thread_factor; } - LOG_DEBUG(adqm_log,"Finally global_max_threads: {}", global_max_threads); - global_context->getProcessList().setGlobalMaxThreads(global_max_threads); + LOG_DEBUG(adqm_log,"Finally total_max_threads: {}", total_max_threads); + global_context->getProcessList().setGlobalMaxThreads(total_max_threads); } if (config->has("max_concurrent_queries")) diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 9963a3ba0a2..c3bbf9578ac 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -75,7 +75,7 @@ ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * as auto adqm_log = &Poco::Logger::get("ADQM"); LOG_DEBUG(adqm_log,"Inserting query into process list: {}", query_); LOG_DEBUG(adqm_log,"Num of concurrent queries: {}", processes.size()); - LOG_DEBUG(adqm_log,"Global Num Threads: {}", getGlobalNumThreads()); + LOG_DEBUG(adqm_log,"Global Num Threads: {}", getTotalNumThreads()); const ClientInfo & client_info = query_context->getClientInfo(); const Settings & settings = query_context->getSettingsRef(); @@ -505,18 +505,18 @@ ProcessList::Info ProcessList::getInfo(bool get_thread_list, bool get_profile_ev return per_query_infos; } -size_t ProcessList::getGlobalNumThreads() const +size_t ProcessList::getTotalNumThreads() const { - size_t global_num_threads = 0; + size_t total_num_threads = 0; std::lock_guard lock(mutex); for (const auto & process : processes) { auto qsi = process.getInfo(true); - global_num_threads += qsi.thread_ids.size(); + total_num_threads += qsi.thread_ids.size(); } - return global_num_threads; + return total_num_threads; } ProcessListForUser::ProcessListForUser(ProcessList * global_process_list) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 75f80de2fc7..0f2785be2a8 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -302,7 +302,7 @@ protected: size_t max_size = 0; /// 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. /// The total maximum number of threads for all requests. - size_t global_max_threads = 0; /// 0 means no limit. + size_t total_max_threads = 0; /// 0 means no limit. 
/// Stores per-user info: queries, statistics and limits UserToQueries user_to_queries; @@ -346,14 +346,14 @@ public: Info getInfo(bool get_thread_list = false, bool get_profile_events = false, bool get_settings = false) const; /// Get total number of threads for all queries in process list. - size_t getGlobalNumThreads() const; + size_t getTotalNumThreads() const; - size_t getGlobalMaxThreads() const { return global_max_threads; } + size_t getTotalMaxThreads() const { return total_max_threads; } void setGlobalMaxThreads(size_t global_max_threads_) { std::lock_guard lock(mutex); - global_max_threads = global_max_threads_; + total_max_threads = global_max_threads_; } /// Get current state of process list per user. diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 1db3f5642a6..8272bb770e2 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -473,22 +473,22 @@ size_t QueryPipelineBuilder::getNumThreads() const LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); auto context = process_list_element->getContext(); - auto global_max_threads = context->getProcessList().getGlobalMaxThreads(); - if (process_list_element && global_max_threads) { - LOG_DEBUG(adqm_log,"Global number of threads from config: {}", global_max_threads); - LOG_DEBUG(adqm_log,"Current global num threads: {}", - context->getProcessList().getGlobalNumThreads()); - size_t current_global_num_threads = context->getProcessList().getGlobalNumThreads(); - size_t globally_available_threads = 0; - if (global_max_threads > current_global_num_threads) - globally_available_threads = global_max_threads - current_global_num_threads; - LOG_DEBUG(adqm_log,"Globally available threads: {}", globally_available_threads); - num_threads = std::min(num_threads, globally_available_threads); + auto total_max_threads = context->getProcessList().getTotalMaxThreads(); + if (process_list_element && total_max_threads) { + LOG_DEBUG(adqm_log,"Total number of threads from config: {}", total_max_threads); + LOG_DEBUG(adqm_log,"Current total num threads: {}", + context->getProcessList().getTotalNumThreads()); + size_t current_total_num_threads = context->getProcessList().getTotalNumThreads(); + size_t total_available_threads = 0; + if (total_max_threads > current_total_num_threads) + total_available_threads = total_max_threads - current_total_num_threads; + LOG_DEBUG(adqm_log,"Total available threads: {}", total_available_threads); + num_threads = std::min(num_threads, total_available_threads); LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); } num_threads = std::max(1, num_threads); - LOG_DEBUG(adqm_log,"Final num threads: {}", num_threads); + LOG_DEBUG(adqm_log,"Finally num threads: {}", num_threads); return num_threads; } diff --git a/tests/integration/test_global_max_threads/__init__.py b/tests/integration/test_total_max_threads/__init__.py similarity index 100% rename from tests/integration/test_global_max_threads/__init__.py rename to tests/integration/test_total_max_threads/__init__.py diff --git a/tests/integration/test_global_max_threads/configs/config_default.xml b/tests/integration/test_total_max_threads/configs/config_default.xml similarity index 100% rename from tests/integration/test_global_max_threads/configs/config_default.xml rename to tests/integration/test_total_max_threads/configs/config_default.xml diff --git a/tests/integration/test_global_max_threads/configs/config_defined.xml 
b/tests/integration/test_total_max_threads/configs/config_defined.xml similarity index 89% rename from tests/integration/test_global_max_threads/configs/config_defined.xml rename to tests/integration/test_total_max_threads/configs/config_defined.xml index a330452d2fb..050f041c4fb 100644 --- a/tests/integration/test_global_max_threads/configs/config_defined.xml +++ b/tests/integration/test_total_max_threads/configs/config_defined.xml @@ -1,6 +1,6 @@ - 50 + 50 system query_log
diff --git a/tests/integration/test_global_max_threads/configs/users.xml b/tests/integration/test_total_max_threads/configs/users.xml similarity index 100% rename from tests/integration/test_global_max_threads/configs/users.xml rename to tests/integration/test_total_max_threads/configs/users.xml diff --git a/tests/integration/test_global_max_threads/test.py b/tests/integration/test_total_max_threads/test.py similarity index 75% rename from tests/integration/test_global_max_threads/test.py rename to tests/integration/test_total_max_threads/test.py index fe3d36c9dcf..fbd9f5b151e 100644 --- a/tests/integration/test_global_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -15,13 +15,13 @@ def started_cluster(): cluster.shutdown() -def test_global_max_threads_default(started_cluster): - node1.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_global_max_threads_1"); +def test_total_max_threads_default(started_cluster): + node1.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_1"); node1.query("SYSTEM FLUSH LOGS"); - assert node1.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_global_max_threads_1'") == "102\n" + assert node1.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_1'") == "102\n" -def test_global_max_threads_defined(started_cluster): - node2.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_global_max_threads_2"); +def test_total_max_threads_defined(started_cluster): + node2.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_2"); node2.query("SYSTEM FLUSH LOGS"); - assert node2.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_global_max_threads_2'") == "51\n" + assert node2.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_2'") == "51\n" From 159ea14739f0aedcb03d5b3b78341ebd6ac35b68 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 16 May 2022 19:48:17 +0300 Subject: [PATCH 010/227] Code cleanup. Corrected Loggers path. Refactoring Context code. 
--- programs/server/Server.cpp | 2 +- src/Interpreters/ProcessList.cpp | 2 +- src/QueryPipeline/QueryPipelineBuilder.cpp | 9 ++++----- src/QueryPipeline/QueryPipelineBuilder.h | 4 +--- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index dfab466a74c..cddde9b2736 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1126,7 +1126,7 @@ int Server::main(const std::vector & /*args*/) // Based on tests total_max_threads has an optimal value when it's about two times of logical CPU cores constexpr size_t thread_factor = 2; LOG_DEBUG(adqm_log,"number of logical cores: {}", std::thread::hardware_concurrency()); - total_max_threads = std::thread::hardware_concurrency()*thread_factor; + total_max_threads = std::thread::hardware_concurrency() * thread_factor; } LOG_DEBUG(adqm_log,"Finally total_max_threads: {}", total_max_threads); global_context->getProcessList().setGlobalMaxThreads(total_max_threads); diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index cf038ff66c1..a2d9d6e3469 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include namespace CurrentMetrics { diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 8614187018c..51e288545a4 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -510,13 +510,12 @@ size_t QueryPipelineBuilder::getNumThreads() const num_threads = std::min(num_threads, max_threads); LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); - auto context = process_list_element->getContext(); - auto total_max_threads = context->getProcessList().getTotalMaxThreads(); - if (process_list_element && total_max_threads) { + auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); + if (total_max_threads && process_list_element) { LOG_DEBUG(adqm_log,"Total number of threads from config: {}", total_max_threads); LOG_DEBUG(adqm_log,"Current total num threads: {}", - context->getProcessList().getTotalNumThreads()); - size_t current_total_num_threads = context->getProcessList().getTotalNumThreads(); + process_list_element->getContext()->getProcessList().getTotalNumThreads()); + size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); size_t total_available_threads = 0; if (total_max_threads > current_total_num_threads) total_available_threads = total_max_threads - current_total_num_threads; diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 564bbfff227..416d8ccc4d8 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB @@ -34,8 +34,6 @@ struct ExpressionActionsSettings; class IJoin; using JoinPtr = std::shared_ptr; -class Context; - class QueryPipelineBuilder { public: From 7cc63f3eed79445be388b6940f5774b9a3283ca0 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 17 May 2022 10:02:54 +0300 Subject: [PATCH 011/227] Removed logging code --- programs/server/Server.cpp | 4 ---- src/Interpreters/ProcessList.cpp | 6 ------ src/QueryPipeline/QueryPipelineBuilder.cpp | 11 ----------- src/QueryPipeline/QueryPipelineBuilder.h | 2 -- 4 files changed, 23 deletions(-) diff --git 
a/programs/server/Server.cpp b/programs/server/Server.cpp index cddde9b2736..004c52203c1 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1119,16 +1119,12 @@ int Server::main(const std::vector & /*args*/) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); if (config->has("total_max_threads")) { - auto adqm_log = &Poco::Logger::get("ADQM"); auto total_max_threads = config->getInt("total_max_threads", 0); - LOG_DEBUG(adqm_log,"From config.xml total_max_threads: {}", total_max_threads); if (total_max_threads == -1) { // Based on tests total_max_threads has an optimal value when it's about two times of logical CPU cores constexpr size_t thread_factor = 2; - LOG_DEBUG(adqm_log,"number of logical cores: {}", std::thread::hardware_concurrency()); total_max_threads = std::thread::hardware_concurrency() * thread_factor; } - LOG_DEBUG(adqm_log,"Finally total_max_threads: {}", total_max_threads); global_context->getProcessList().setGlobalMaxThreads(total_max_threads); } diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index a2d9d6e3469..bd751d6d618 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -15,8 +15,6 @@ #include #include #include -#include -#include namespace CurrentMetrics { @@ -72,10 +70,6 @@ static bool isUnlimitedQuery(const IAST * ast) ProcessList::EntryPtr ProcessList::insert(const String & query_, const IAST * ast, ContextPtr query_context) { EntryPtr res; - auto adqm_log = &Poco::Logger::get("ADQM"); - LOG_DEBUG(adqm_log,"Inserting query into process list: {}", query_); - LOG_DEBUG(adqm_log,"Num of concurrent queries: {}", processes.size()); - LOG_DEBUG(adqm_log,"Global Num Threads: {}", getTotalNumThreads()); const ClientInfo & client_info = query_context->getClientInfo(); const Settings & settings = query_context->getSettingsRef(); diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 51e288545a4..a468fe58ab1 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -502,30 +502,19 @@ size_t QueryPipelineBuilder::getNumThreads() const { auto num_threads = pipe.maxParallelStreams(); - auto adqm_log = &Poco::Logger::get("ADQM"); - LOG_DEBUG(adqm_log,"maxParallelStreams: {}", num_threads); - LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); - if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); - LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); if (total_max_threads && process_list_element) { - LOG_DEBUG(adqm_log,"Total number of threads from config: {}", total_max_threads); - LOG_DEBUG(adqm_log,"Current total num threads: {}", - process_list_element->getContext()->getProcessList().getTotalNumThreads()); size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); size_t total_available_threads = 0; if (total_max_threads > current_total_num_threads) total_available_threads = total_max_threads - current_total_num_threads; - LOG_DEBUG(adqm_log,"Total available threads: {}", total_available_threads); num_threads = std::min(num_threads, total_available_threads); - LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); } num_threads = std::max(1, num_threads); - LOG_DEBUG(adqm_log,"Finally num threads: {}", num_threads); return num_threads; } diff --git 
a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 416d8ccc4d8..80bf23407b2 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -7,8 +7,6 @@ #include #include #include -#include -#include namespace DB From 4a6f398fedcc60064232ad488777bfb408fe8743 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 17 May 2022 11:10:06 +0300 Subject: [PATCH 012/227] Moved process_list_element into if. Made more code cleanup. --- src/Interpreters/ProcessList.h | 4 ++-- src/QueryPipeline/QueryPipelineBuilder.cpp | 17 ++++++++++------- src/QueryPipeline/QueryPipelineBuilder.h | 1 - .../configs/config_default.xml | 8 ++++---- .../configs/config_defined.xml | 8 ++++---- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 20ae098e547..9eb86e1fd2c 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -314,8 +314,8 @@ protected: Container processes; size_t max_size = 0; /// 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. - /// The total maximum number of threads for all requests. - size_t total_max_threads = 0; /// 0 means no limit. + /// The total maximum number of threads for all queries. + size_t total_max_threads = 0; /// 0 means no limit. Otherwise, concurrency of query is determinated based on this parameter. /// Stores per-user info: queries, statistics and limits UserToQueries user_to_queries; diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index a468fe58ab1..dcb8afccfa8 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -505,13 +506,15 @@ size_t QueryPipelineBuilder::getNumThreads() const if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); - auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); - if (total_max_threads && process_list_element) { - size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); - size_t total_available_threads = 0; - if (total_max_threads > current_total_num_threads) - total_available_threads = total_max_threads - current_total_num_threads; - num_threads = std::min(num_threads, total_available_threads); + if (process_list_element) { + auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); + if (total_max_threads) { + size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); + size_t total_available_threads = 0; + if (total_max_threads > current_total_num_threads) + total_available_threads = total_max_threads - current_total_num_threads; + num_threads = std::min(num_threads, total_available_threads); + } } num_threads = std::max(1, num_threads); diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 80bf23407b2..08529aa8784 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB diff --git a/tests/integration/test_total_max_threads/configs/config_default.xml b/tests/integration/test_total_max_threads/configs/config_default.xml index 7a8ff1fb420..6c1f8f33de1 100644 --- 
a/tests/integration/test_total_max_threads/configs/config_default.xml +++ b/tests/integration/test_total_max_threads/configs/config_default.xml @@ -1,7 +1,7 @@ - - system - query_log
-
+ + system + query_log
+
diff --git a/tests/integration/test_total_max_threads/configs/config_defined.xml b/tests/integration/test_total_max_threads/configs/config_defined.xml index 050f041c4fb..09234c9924a 100644 --- a/tests/integration/test_total_max_threads/configs/config_defined.xml +++ b/tests/integration/test_total_max_threads/configs/config_defined.xml @@ -1,8 +1,8 @@ 50 - - system - query_log
-
+ + system + query_log
+
From 87b1fb4ceb19fe4f940f9566fc42109e6db7fecc Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 17 May 2022 11:45:06 +0300 Subject: [PATCH 013/227] Added comment about usage of total_max_threads. --- src/Interpreters/ProcessList.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 9eb86e1fd2c..6c25191ab86 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -315,7 +315,8 @@ protected: size_t max_size = 0; /// 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. /// The total maximum number of threads for all queries. - size_t total_max_threads = 0; /// 0 means no limit. Otherwise, concurrency of query is determinated based on this parameter. + /// Sometimes, real total number of threads may exceed total_max_threads parameter. + size_t total_max_threads = 0; /// 0 means no limit. Otherwise, concurrency of a query is determinated based on this parameter. /// Stores per-user info: queries, statistics and limits UserToQueries user_to_queries; From e966016c1a38a28973821c93acbf61a29089f301 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 17 May 2022 19:35:40 +0300 Subject: [PATCH 014/227] Corrected all issues found by Style Check (actions). Renamed setGlobalMaxThreads into setTotalMaxThreads. --- programs/server/Server.cpp | 8 ++-- src/Interpreters/ProcessList.h | 6 +-- src/QueryPipeline/QueryPipelineBuilder.cpp | 6 ++- .../test_total_max_threads/test.py | 38 +++++++++++++++---- 4 files changed, 42 insertions(+), 16 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 004c52203c1..1e7b5549546 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1118,14 +1118,16 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); - if (config->has("total_max_threads")) { + if (config->has("total_max_threads")) + { auto total_max_threads = config->getInt("total_max_threads", 0); - if (total_max_threads == -1) { + if (total_max_threads == -1) + { // Based on tests total_max_threads has an optimal value when it's about two times of logical CPU cores constexpr size_t thread_factor = 2; total_max_threads = std::thread::hardware_concurrency() * thread_factor; } - global_context->getProcessList().setGlobalMaxThreads(total_max_threads); + global_context->getProcessList().setTotalMaxThreads(total_max_threads); } if (config->has("max_concurrent_queries")) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 6c25191ab86..6624083fad5 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -316,7 +316,7 @@ protected: /// The total maximum number of threads for all queries. /// Sometimes, real total number of threads may exceed total_max_threads parameter. - size_t total_max_threads = 0; /// 0 means no limit. Otherwise, concurrency of a query is determinated based on this parameter. + size_t total_max_threads = 0; /// 0 means no limit. Otherwise, concurrency of a query is determined based on this parameter. 
/// Stores per-user info: queries, statistics and limits UserToQueries user_to_queries; @@ -364,10 +364,10 @@ public: size_t getTotalMaxThreads() const { return total_max_threads; } - void setGlobalMaxThreads(size_t global_max_threads_) + void setTotalMaxThreads(size_t total_max_threads_) { std::lock_guard lock(mutex); - total_max_threads = global_max_threads_; + total_max_threads = total_max_threads_; } /// Get current state of process list per user. diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index dcb8afccfa8..efc8db42ea7 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -506,9 +506,11 @@ size_t QueryPipelineBuilder::getNumThreads() const if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); - if (process_list_element) { + if (process_list_element) + { auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); - if (total_max_threads) { + if (total_max_threads) + { size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); size_t total_available_threads = 0; if (total_max_threads > current_total_num_threads) diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_total_max_threads/test.py index fbd9f5b151e..588ee383642 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -2,8 +2,16 @@ import pytest from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance("node1", main_configs=["configs/config_default.xml"], user_configs=["configs/users.xml"]) -node2 = cluster.add_instance("node2", main_configs=["configs/config_defined.xml"], user_configs=["configs/users.xml"]) +node1 = cluster.add_instance( + "node1", + main_configs=["configs/config_default.xml"], + user_configs=["configs/users.xml"], +) +node2 = cluster.add_instance( + "node2", + main_configs=["configs/config_defined.xml"], + user_configs=["configs/users.xml"], +) @pytest.fixture(scope="module") @@ -16,12 +24,26 @@ def started_cluster(): def test_total_max_threads_default(started_cluster): - node1.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_1"); - node1.query("SYSTEM FLUSH LOGS"); - assert node1.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_1'") == "102\n" + node1.query( + "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_1" + ) + node1.query("SYSTEM FLUSH LOGS") + assert ( + node1.query( + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_1'" + ) + == "102\n" + ) def test_total_max_threads_defined(started_cluster): - node2.query("SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_2"); - node2.query("SYSTEM FLUSH LOGS"); - assert node2.query("select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_2'") == "51\n" + node2.query( + "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_2" + ) + node2.query("SYSTEM FLUSH LOGS") + assert ( + node2.query( + "select length(thread_ids) from system.query_log where current_database = 
currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_2'" + ) + == "51\n" + ) From 83958d25492f62b2d42836159a9c14ca7a4a1165 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 20 May 2022 15:52:50 +0300 Subject: [PATCH 015/227] Add more intergration tests. Add documenation text. Change thread_factor to 3. Add logging. Set default value = -1 to see the results CI tests. --- .../settings.md | 10 +++ programs/server/Server.cpp | 4 +- programs/server/config.xml | 7 +++ src/QueryPipeline/QueryPipelineBuilder.cpp | 14 +++++ .../configs/config_defined_1.xml | 8 +++ ...nfig_defined.xml => config_defined_50.xml} | 0 .../configs/config_limit_reached.xml | 8 +++ .../test_total_max_threads/test.py | 63 ++++++++++++++++++- 8 files changed, 110 insertions(+), 4 deletions(-) create mode 100644 tests/integration/test_total_max_threads/configs/config_defined_1.xml rename tests/integration/test_total_max_threads/configs/{config_defined.xml => config_defined_50.xml} (100%) create mode 100644 tests/integration/test_total_max_threads/configs/config_limit_reached.xml diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index fd5c2a187b5..1c0523f2b61 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -702,6 +702,16 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa - [max_server_memory_usage](#max_server_memory_usage) +## total_max_threads {#total-max-threads} +The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get one thread to run. + +Possible values: +- Positive integer. +- 0 — No limit. +- -1 — The parameter is initialized by number of logical cores multiplies by 3. Which is a good heuristic for CPU-bound tasks. + +Default value: `0`. + ## max_concurrent_queries {#max-concurrent-queries} The maximum number of simultaneously processed queries. diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 1e7b5549546..d6c062bd619 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1123,8 +1123,8 @@ int Server::main(const std::vector & /*args*/) auto total_max_threads = config->getInt("total_max_threads", 0); if (total_max_threads == -1) { - // Based on tests total_max_threads has an optimal value when it's about two times of logical CPU cores - constexpr size_t thread_factor = 2; + // Based on tests total_max_threads has an optimal value when it's about 3 times of logical CPU cores + constexpr size_t thread_factor = 3; total_max_threads = std::thread::hardware_concurrency() * thread_factor; } global_context->getProcessList().setTotalMaxThreads(total_max_threads); diff --git a/programs/server/config.xml b/programs/server/config.xml index bd54051be19..6c7d25f15b0 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -269,6 +269,13 @@
]]>
--> + + -1 + 100 diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index efc8db42ea7..14f83616930 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -25,6 +25,8 @@ #include #include #include +#include +#include namespace DB { @@ -502,24 +504,36 @@ void QueryPipelineBuilder::setProcessListElement(QueryStatus * elem) size_t QueryPipelineBuilder::getNumThreads() const { auto num_threads = pipe.maxParallelStreams(); + + auto adqm_log = &Poco::Logger::get("ADQM"); + LOG_DEBUG(adqm_log,"maxParallelStreams: {}", num_threads); + LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); + + LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); if (process_list_element) { auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); if (total_max_threads) { + LOG_DEBUG(adqm_log,"Total number of threads from config: {}", total_max_threads); + LOG_DEBUG(adqm_log,"Current total num threads: {}", + process_list_element->getContext()->getProcessList().getTotalNumThreads()); size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); size_t total_available_threads = 0; if (total_max_threads > current_total_num_threads) total_available_threads = total_max_threads - current_total_num_threads; + LOG_DEBUG(adqm_log,"Total available threads: {}", total_available_threads); num_threads = std::min(num_threads, total_available_threads); + LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); } } num_threads = std::max(1, num_threads); + LOG_DEBUG(adqm_log,"Finally num threads: {}", num_threads); return num_threads; } diff --git a/tests/integration/test_total_max_threads/configs/config_defined_1.xml b/tests/integration/test_total_max_threads/configs/config_defined_1.xml new file mode 100644 index 00000000000..ff4aa98c3ab --- /dev/null +++ b/tests/integration/test_total_max_threads/configs/config_defined_1.xml @@ -0,0 +1,8 @@ + + + 1 + + system + query_log
+
+
diff --git a/tests/integration/test_total_max_threads/configs/config_defined.xml b/tests/integration/test_total_max_threads/configs/config_defined_50.xml similarity index 100% rename from tests/integration/test_total_max_threads/configs/config_defined.xml rename to tests/integration/test_total_max_threads/configs/config_defined_50.xml diff --git a/tests/integration/test_total_max_threads/configs/config_limit_reached.xml b/tests/integration/test_total_max_threads/configs/config_limit_reached.xml new file mode 100644 index 00000000000..94afef2d6fb --- /dev/null +++ b/tests/integration/test_total_max_threads/configs/config_limit_reached.xml @@ -0,0 +1,8 @@ + + + 10 + + system + query_log
+
+
diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_total_max_threads/test.py index 588ee383642..c7dfdd10f8f 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -1,6 +1,10 @@ import pytest from helpers.cluster import ClickHouseCluster +import threading +import time +import logging + cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( "node1", @@ -9,7 +13,17 @@ node1 = cluster.add_instance( ) node2 = cluster.add_instance( "node2", - main_configs=["configs/config_defined.xml"], + main_configs=["configs/config_defined_50.xml"], + user_configs=["configs/users.xml"], +) +node3 = cluster.add_instance( + "node3", + main_configs=["configs/config_defined_1.xml"], + user_configs=["configs/users.xml"], +) +node4 = cluster.add_instance( + "node4", + main_configs=["configs/config_limit_reached.xml"], user_configs=["configs/users.xml"], ) @@ -36,7 +50,7 @@ def test_total_max_threads_default(started_cluster): ) -def test_total_max_threads_defined(started_cluster): +def test_total_max_threads_defined_50(started_cluster): node2.query( "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_2" ) @@ -47,3 +61,48 @@ def test_total_max_threads_defined(started_cluster): ) == "51\n" ) + + +def test_total_max_threads_defined_1(started_cluster): + node3.query( + "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_3" + ) + node3.query("SYSTEM FLUSH LOGS") + assert ( + node3.query( + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_3'" + ) + == "2\n" + ) + + +def test_total_max_threads_limit_reached(started_cluster): + logging.debug("ADQM: test begin") + + def thread_select(): + logging.debug("ADQM: started another thread") + node4.query( + "SELECT sleep(3) FROM numbers_mt(10000000) settings max_threads=100" + ) + logging.debug("ADQM: finished another thread") + + another_thread = threading.Thread(target=thread_select) + another_thread.start() + + time.sleep(0.5) + logging.debug("ADQM: started main query") + node4.query( + "SELECT count(*) FROM numbers_mt(10000000) settings max_threads=5", + query_id="test_total_max_threads_4", + ) + logging.debug("ADQM: finished main query") + another_thread.join() + logging.debug("ADQM: logs: %s", node4.grep_in_log("ADQM")) + node4.query("SYSTEM FLUSH LOGS") + assert ( + node4.query( + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_4'" + ) + == "2\n" + ) + logging.debug("ADQM: test end") From dc737cf6e8b9e72de5bd04b073518a880be42507 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 23 May 2022 15:15:09 +0300 Subject: [PATCH 016/227] Improve integration test test_total_max_threads_limit_reached by adding sync white loop. 
--- src/QueryPipeline/QueryPipelineBuilder.cpp | 2 +- tests/integration/test_total_max_threads/test.py | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 14f83616930..7c24d16ca62 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -507,7 +507,7 @@ size_t QueryPipelineBuilder::getNumThreads() const auto adqm_log = &Poco::Logger::get("ADQM"); LOG_DEBUG(adqm_log,"maxParallelStreams: {}", num_threads); - LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); + LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_total_max_threads/test.py index c7dfdd10f8f..4c7ebcd6e47 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -82,14 +82,22 @@ def test_total_max_threads_limit_reached(started_cluster): def thread_select(): logging.debug("ADQM: started another thread") node4.query( - "SELECT sleep(3) FROM numbers_mt(10000000) settings max_threads=100" + "SELECT count(*) FROM numbers_mt(1e11) settings max_threads=100", + query_id="background_query", ) logging.debug("ADQM: finished another thread") another_thread = threading.Thread(target=thread_select) another_thread.start() - time.sleep(0.5) + while ( + node4.query( + "SELECT count(*) FROM system.processes where query_id = 'background_query'" + ) + == "0\n" + ): + time.sleep(0.1) + logging.debug("ADQM: started main query") node4.query( "SELECT count(*) FROM numbers_mt(10000000) settings max_threads=5", @@ -105,4 +113,5 @@ def test_total_max_threads_limit_reached(started_cluster): ) == "2\n" ) + node4.query("KILL QUERY WHERE user = 'default' SYNC") logging.debug("ADQM: test end") From c0437f0b83483c1e7956203b22c2fd37e4ab4c51 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Mon, 23 May 2022 18:14:37 +0300 Subject: [PATCH 017/227] Style corrections --- src/QueryPipeline/QueryPipelineBuilder.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 7c24d16ca62..aa8d037339b 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -504,14 +504,14 @@ void QueryPipelineBuilder::setProcessListElement(QueryStatus * elem) size_t QueryPipelineBuilder::getNumThreads() const { auto num_threads = pipe.maxParallelStreams(); - + auto adqm_log = &Poco::Logger::get("ADQM"); LOG_DEBUG(adqm_log,"maxParallelStreams: {}", num_threads); LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); - + LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); if (process_list_element) From fd117b13c1ef54f31634e905e9f10e2a43412a48 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 24 May 2022 10:22:20 +0300 Subject: [PATCH 018/227] Change max_threads=6 in 02015_global_in_threads test --- tests/queries/0_stateless/02015_global_in_threads.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02015_global_in_threads.sh b/tests/queries/0_stateless/02015_global_in_threads.sh index 9437187d462..1934b38c47b 100755 --- a/tests/queries/0_stateless/02015_global_in_threads.sh +++ 
b/tests/queries/0_stateless/02015_global_in_threads.sh @@ -4,6 +4,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --log_queries=1 --max_threads=32 --query_id "2015_${CLICKHOUSE_DATABASE}_query" -q "select count() from remote('127.0.0.{2,3}', numbers(10)) where number global in (select number % 5 from numbers_mt(1000000))" +${CLICKHOUSE_CLIENT} --log_queries=1 --max_threads=6 --query_id "2015_${CLICKHOUSE_DATABASE}_query" -q "select count() from remote('127.0.0.{2,3}', numbers(10)) where number global in (select number % 5 from numbers_mt(1000000))" ${CLICKHOUSE_CLIENT} -q "system flush logs" -${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 32 from system.query_log where event_date >= yesterday() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" +${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 6 from system.query_log where event_date >= yesterday() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" From e4a58fcff688dbf9d8c3c6ed8ce9ceebeed79c0d Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 24 May 2022 14:11:20 +0300 Subject: [PATCH 019/227] Remove logging code --- src/QueryPipeline/QueryPipelineBuilder.cpp | 14 -------------- tests/integration/test_total_max_threads/test.py | 9 --------- 2 files changed, 23 deletions(-) diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index aa8d037339b..efc8db42ea7 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -25,8 +25,6 @@ #include #include #include -#include -#include namespace DB { @@ -505,35 +503,23 @@ size_t QueryPipelineBuilder::getNumThreads() const { auto num_threads = pipe.maxParallelStreams(); - auto adqm_log = &Poco::Logger::get("ADQM"); - LOG_DEBUG(adqm_log,"maxParallelStreams: {}", num_threads); - LOG_DEBUG(adqm_log,"max_threads: {}", max_threads); - if (max_threads) //-V1051 num_threads = std::min(num_threads, max_threads); - LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); - if (process_list_element) { auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); if (total_max_threads) { - LOG_DEBUG(adqm_log,"Total number of threads from config: {}", total_max_threads); - LOG_DEBUG(adqm_log,"Current total num threads: {}", - process_list_element->getContext()->getProcessList().getTotalNumThreads()); size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); size_t total_available_threads = 0; if (total_max_threads > current_total_num_threads) total_available_threads = total_max_threads - current_total_num_threads; - LOG_DEBUG(adqm_log,"Total available threads: {}", total_available_threads); num_threads = std::min(num_threads, total_available_threads); - LOG_DEBUG(adqm_log,"Recommended num threads: {}", num_threads); } } num_threads = std::max(1, num_threads); - LOG_DEBUG(adqm_log,"Finally num threads: {}", num_threads); return num_threads; } diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_total_max_threads/test.py index 4c7ebcd6e47..30a20b11f5d 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -3,7 +3,6 @@ from helpers.cluster import ClickHouseCluster import threading 
import time -import logging cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( @@ -77,15 +76,11 @@ def test_total_max_threads_defined_1(started_cluster): def test_total_max_threads_limit_reached(started_cluster): - logging.debug("ADQM: test begin") - def thread_select(): - logging.debug("ADQM: started another thread") node4.query( "SELECT count(*) FROM numbers_mt(1e11) settings max_threads=100", query_id="background_query", ) - logging.debug("ADQM: finished another thread") another_thread = threading.Thread(target=thread_select) another_thread.start() @@ -98,14 +93,11 @@ def test_total_max_threads_limit_reached(started_cluster): ): time.sleep(0.1) - logging.debug("ADQM: started main query") node4.query( "SELECT count(*) FROM numbers_mt(10000000) settings max_threads=5", query_id="test_total_max_threads_4", ) - logging.debug("ADQM: finished main query") another_thread.join() - logging.debug("ADQM: logs: %s", node4.grep_in_log("ADQM")) node4.query("SYSTEM FLUSH LOGS") assert ( node4.query( @@ -114,4 +106,3 @@ def test_total_max_threads_limit_reached(started_cluster): == "2\n" ) node4.query("KILL QUERY WHERE user = 'default' SYNC") - logging.debug("ADQM: test end") From 24912131661231adde5c69f3a6b07f4bec2a16a4 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 24 May 2022 18:40:45 +0300 Subject: [PATCH 020/227] Set default value to 0 for total_max_threads parameter. Code style cleanup. --- .../operations/server-configuration-parameters/settings.md | 6 +++--- programs/server/config.xml | 4 ++-- tests/queries/0_stateless/02015_global_in_threads.sh | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 1c0523f2b61..d9d72feaeb1 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -703,12 +703,12 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa - [max_server_memory_usage](#max_server_memory_usage) ## total_max_threads {#total-max-threads} -The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get one thread to run. +The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get one thread to run. Possible values: -- Positive integer. +- Positive integer. - 0 — No limit. -- -1 — The parameter is initialized by number of logical cores multiplies by 3. Which is a good heuristic for CPU-bound tasks. +- -1 — The parameter is initialized by number of logical cores multiplies by 3. Which is a good heuristic for CPU-bound tasks. Default value: `0`. diff --git a/programs/server/config.xml b/programs/server/config.xml index 6c7d25f15b0..b4ef0cb2fc4 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -274,8 +274,8 @@ For value equals to -1 this parameter is initialized by number of logical cores multiplies by 3. Which is a good heuristic for CPU-bound tasks. 
--> - -1 - + 0 + 100 diff --git a/tests/queries/0_stateless/02015_global_in_threads.sh b/tests/queries/0_stateless/02015_global_in_threads.sh index 1934b38c47b..9437187d462 100755 --- a/tests/queries/0_stateless/02015_global_in_threads.sh +++ b/tests/queries/0_stateless/02015_global_in_threads.sh @@ -4,6 +4,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --log_queries=1 --max_threads=6 --query_id "2015_${CLICKHOUSE_DATABASE}_query" -q "select count() from remote('127.0.0.{2,3}', numbers(10)) where number global in (select number % 5 from numbers_mt(1000000))" +${CLICKHOUSE_CLIENT} --log_queries=1 --max_threads=32 --query_id "2015_${CLICKHOUSE_DATABASE}_query" -q "select count() from remote('127.0.0.{2,3}', numbers(10)) where number global in (select number % 5 from numbers_mt(1000000))" ${CLICKHOUSE_CLIENT} -q "system flush logs" -${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 6 from system.query_log where event_date >= yesterday() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" +${CLICKHOUSE_CLIENT} -q "select length(thread_ids) >= 32 from system.query_log where event_date >= yesterday() and query_id = '2015_${CLICKHOUSE_DATABASE}_query' and type = 'QueryFinish' and current_database = currentDatabase()" From a88862c65383fcee629f990770b4722af2952892 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 25 May 2022 19:26:56 +0300 Subject: [PATCH 021/227] Improve integration test test_total_max_threads_limit_reached --- .../test_total_max_threads/test.py | 57 ++++++++++++------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_total_max_threads/test.py index 30a20b11f5d..c5e96939f4f 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -3,6 +3,7 @@ from helpers.cluster import ClickHouseCluster import threading import time +from helpers.client import QueryRuntimeException cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( @@ -75,34 +76,48 @@ def test_total_max_threads_defined_1(started_cluster): ) +# In config_limit_reached.xml there is total_max_threads=10 +# Background query starts in a separate thread to reach this limit. 
+# When this limit is reached the foreground query gets less than 5 queries despite the fact that it has settings max_threads=5 def test_total_max_threads_limit_reached(started_cluster): - def thread_select(): - node4.query( - "SELECT count(*) FROM numbers_mt(1e11) settings max_threads=100", - query_id="background_query", - ) + def background_query(): + try: + node4.query( + "SELECT count(*) FROM numbers_mt(1e11) settings max_threads=100", + query_id="background_query", + ) + except QueryRuntimeException: + pass - another_thread = threading.Thread(target=thread_select) - another_thread.start() + background_thread = threading.Thread(target=background_query) + background_thread.start() - while ( - node4.query( - "SELECT count(*) FROM system.processes where query_id = 'background_query'" - ) - == "0\n" - ): + def limit_reached(): + s_count = node4.query( + "SELECT sum(length(thread_ids)) FROM system.processes" + ).strip() + if s_count: + count = int(s_count) + else: + count = 0 + return count >= 10 + + while not limit_reached(): time.sleep(0.1) node4.query( "SELECT count(*) FROM numbers_mt(10000000) settings max_threads=5", query_id="test_total_max_threads_4", ) - another_thread.join() + node4.query("SYSTEM FLUSH LOGS") - assert ( - node4.query( - "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_4'" - ) - == "2\n" - ) - node4.query("KILL QUERY WHERE user = 'default' SYNC") + s_count = node4.query( + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_4'" + ).strip() + if s_count: + count = int(s_count) + else: + count = 0 + assert count < 5 + node4.query("KILL QUERY WHERE query_id = 'background_query' SYNC") + background_thread.join() From 27fca1742123bdcab90f148361514b49dbf6ace3 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 26 May 2022 11:05:34 +0200 Subject: [PATCH 022/227] add concurrency control in pipeline executor --- src/Common/ConcurrencyControl.h | 207 ++++++++++++++++++ src/Processors/Executors/PipelineExecutor.cpp | 98 +++++---- src/Processors/Executors/PipelineExecutor.h | 11 +- 3 files changed, 275 insertions(+), 41 deletions(-) create mode 100644 src/Common/ConcurrencyControl.h diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h new file mode 100644 index 00000000000..b26a4b95bac --- /dev/null +++ b/src/Common/ConcurrencyControl.h @@ -0,0 +1,207 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +/* + * Controls how many threads can be allocated for a query (or another activity). + * There is a limited amount of slots for threads. It can be set with `setMaxConcurrency(limit)`. + * + * Lifecycle of a slot: free -> granted -> acquired -> free. + * free: slot is available to be allocated by any query. + * granted: slot is allocated by specific query, but not yet acquired by any thread. + * acquired: slot is allocated by specific query and acquired by a thread. + * + * USAGE: + * 1. Create an allocation for a query: + * `auto slots = ConcurrencyControl::instance().allocate(min, max);` + * It will allocate at least `min` and at most `max` slots. + * Note that `min` slots are granted immediately, but other `max - min` may be granted later. + * 2. For every thread a slot has to be acquired from that allocation: + * `while (auto slot = slots->tryAcquire()) createYourThread([slot = std::move(slot)] { ... 
});` + * This snippet can be used at query startup and for upscaling later. + * (both functions are non-blocking) + * + * Released slots are distributed between waiting allocations in a round-robin manner to provide fairness. + * Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)` + * because `min` amount of slots is allocated for each query unconditionally. + */ +class ConcurrencyControl : boost::noncopyable +{ +public: + struct Allocation; + using AllocationPtr = std::shared_ptr; + using Slots = UInt64; + using Waiters = std::list; + + // Scoped guard for acquired slot, see Allocation::tryAcquire() + struct Slot : boost::noncopyable + { + ~Slot() + { + allocation->release(); + } + + private: + friend struct Allocation; // for ctor + + explicit Slot(AllocationPtr && allocation_) + : allocation(std::move(allocation_)) + {} + + AllocationPtr allocation; + }; + + // FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet + using SlotPtr = std::shared_ptr; + + // Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max) + struct Allocation : std::enable_shared_from_this, boost::noncopyable + { + ~Allocation() + { + if (released == limit) // also equal to `allocated`: everything is already released + return; + + std::unique_lock lock{parent.mutex}; + parent.cur_concurrency -= allocated - released; + + // Cancel waiting + if (allocated < limit && waiter != parent.waiters.end()) + parent.waiters.erase(waiter); + + parent.schedule(lock); + } + + // Take one already granted slot if available + [[nodiscard]] SlotPtr tryAcquire() + { + std::unique_lock lock{mutex}; + if (!granted) + return {}; + granted--; + return SlotPtr(new Slot(shared_from_this())); + } + + private: + friend struct Slot; // for release() + friend class ConcurrencyControl; // for grant() and ctor + + Allocation(ConcurrencyControl & concurrency_control, Slots min, Slots max) + : parent(concurrency_control) + , limit(std::max(max, min)) + { + std::unique_lock lock{parent.mutex}; + + // Acquire as much slots as we can, but not lower than `min` + granted = allocated = std::max(min, std::min(limit, parent.available(lock))); + parent.cur_concurrency += allocated; + + // Start waiting if more slots are required + if (allocated < limit) + waiter = parent.waiters.insert(parent.cur_waiter, this); + else + waiter = parent.waiters.end(); + } + + // Release one slot and grant it to other allocation if required + void release() + { + std::unique_lock lock{parent.mutex}; + parent.cur_concurrency--; + parent.schedule(lock); + + std::unique_lock lock2{mutex}; + released++; + assert(released <= allocated); + } + + // Grant single slot to allocation, returns true iff more slot(s) are required + bool grant() + { + std::unique_lock lock{mutex}; + granted++; + allocated++; + return allocated < limit; + // WARNING: `waiter` iterator is invalided after returning false + } + + ConcurrencyControl & parent; + Waiters::iterator waiter; // iterator to itself in Waiters list + + const Slots limit; + + std::mutex mutex; // the following values must be accessed under this mutex + Slots allocated = 0; + Slots granted = 0; // allocated, but not yet acquired + Slots released = 0; + }; + +public: + ConcurrencyControl() + : cur_waiter(waiters.end()) + {} + + // WARNING: all Allocation objects MUST be destructed before ConcurrencyControl + // NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries + 
~ConcurrencyControl() + { + assert(waiters.empty()); + } + + // Allocate at least `min` and at most `max` slots. + // If not all `max` slots were successfully allocated, a subscription for later allocation is created + // Use Allocation::tryAcquire() to acquire allocated slot, before running a thread. + [[nodiscard]] AllocationPtr allocate(Slots min, Slots max) + { + return AllocationPtr(new Allocation(*this, min, max)); + } + + void setMaxConcurrency(Slots value) + { + std::unique_lock lock{mutex}; + max_concurrency = std::max(1, value); // never allow max_concurrency to be zero + schedule(lock); + } + + static ConcurrencyControl & instance() + { + static ConcurrencyControl result; + return result; + } + +private: + Slots available(std::unique_lock &) + { + if (cur_concurrency < max_concurrency) + return max_concurrency - cur_concurrency; + else + return 0; + } + + // Round-robin scheduling of available slots among waiting allocations + void schedule(std::unique_lock &) + { + while (cur_concurrency < max_concurrency && !waiters.empty()) + { + cur_concurrency++; + if (cur_waiter == waiters.end()) + cur_waiter = waiters.begin(); + Allocation * allocation = *cur_waiter; + if (allocation->grant()) + ++cur_waiter; + else + waiters.erase(cur_waiter++); // last required slot has just been granted -- stop waiting + } + } + + std::mutex mutex; + Waiters waiters; + Waiters::iterator cur_waiter; // round-robin pointer + Slots max_concurrency = Slots(-1); + Slots cur_concurrency = 0; +}; diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 039c2148232..6ca788f19d3 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -240,6 +239,9 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie tasks.pushTasks(queue, async_queue, context); } + // Upscale if possible + spawnThreads(); + #ifndef NDEBUG context.processing_time_ns += processing_time_watch.elapsed(); #endif @@ -265,6 +267,56 @@ void PipelineExecutor::initializeExecution(size_t num_threads) tasks.init(num_threads, profile_processors); tasks.fill(queue); + + slots = ConcurrencyControl::instance().allocate(1, num_threads); + std::unique_lock lock{threads_mutex}; + threads.reserve(num_threads); +} + +void PipelineExecutor::spawnThreads() +{ + while (auto slot = slots->tryAcquire()) + { + std::unique_lock lock{threads_mutex}; + size_t thread_num = threads.size(); + threads.emplace_back([this, thread_num, thread_group = CurrentThread::getGroup(), slot = std::move(slot)] + { + /// ThreadStatus thread_status; + + setThreadName("QueryPipelineEx"); + + if (thread_group) + CurrentThread::attachTo(thread_group); + + try + { + executeSingleThread(thread_num); + } + catch (...) + { + /// In case of exception from executor itself, stop other threads. + finish(); + tasks.getThreadContext(thread_num).setException(std::current_exception()); + } + }); + } +} + +void PipelineExecutor::joinThreads() +{ + for (size_t thread_num = 0; ; thread_num++) + { + std::unique_lock lock{threads_mutex}; + if (thread_num >= threads.size()) + break; + if (threads[thread_num].joinable()) + { + auto & thread = threads[thread_num]; + lock.unlock(); // to avoid deadlock if thread we are going to join starts spawning threads + thread.join(); + } + } + // NOTE: No races: all concurrent spawnThreads() calls are done from `threads`, but they're already joined. 
} void PipelineExecutor::executeImpl(size_t num_threads) @@ -273,59 +325,27 @@ void PipelineExecutor::executeImpl(size_t num_threads) initializeExecution(num_threads); - using ThreadsData = std::vector; - ThreadsData threads; - threads.reserve(num_threads); - bool finished_flag = false; SCOPE_EXIT_SAFE( if (!finished_flag) { finish(); - - for (auto & thread : threads) - if (thread.joinable()) - thread.join(); + joinThreads(); } ); if (num_threads > 1) { - auto thread_group = CurrentThread::getGroup(); - - for (size_t i = 0; i < num_threads; ++i) - { - threads.emplace_back([this, thread_group, thread_num = i] - { - /// ThreadStatus thread_status; - - setThreadName("QueryPipelineEx"); - - if (thread_group) - CurrentThread::attachTo(thread_group); - - try - { - executeSingleThread(thread_num); - } - catch (...) - { - /// In case of exception from executor itself, stop other threads. - finish(); - tasks.getThreadContext(thread_num).setException(std::current_exception()); - } - }); - } - + spawnThreads(); // start at least one thread tasks.processAsyncTasks(); - - for (auto & thread : threads) - if (thread.joinable()) - thread.join(); + joinThreads(); } else + { + auto slot = slots->tryAcquire(); executeSingleThread(0); + } finished_flag = true; } diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index 0a9f8bdbeee..c6dafaf8ce5 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -4,9 +4,10 @@ #include #include #include +#include +#include #include -#include #include namespace DB @@ -52,7 +53,11 @@ private: ExecutingGraphPtr graph; ExecutorTasks tasks; - using Stack = std::stack; + + // Concurrency control related + ConcurrencyControl::AllocationPtr slots; + std::mutex threads_mutex; + std::vector threads; /// Flag that checks that initializeExecution was called. bool is_execution_initialized = false; @@ -70,6 +75,8 @@ private: void initializeExecution(size_t num_threads); /// Initialize executor contexts and task_queue. void finalizeExecution(); /// Check all processors are finished. + void spawnThreads(); + void joinThreads(); /// Methods connected to execution. 
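
To make the new interface concrete, here is a minimal usage sketch of the allocate()/tryAcquire() pattern that the ConcurrencyControl.h header comment describes and that spawnThreads()/joinThreads() implement inside the executor. It is only a sketch: the helper name is invented, std::thread stands in for the executor's own thread bookkeeping, the include path simply mirrors the file's location in this patch, and error handling is omitted.

    #include <Common/ConcurrencyControl.h>
    #include <cstddef>
    #include <thread>
    #include <utility>
    #include <vector>

    // Illustrative helper, not part of the patch: run work on a dynamically
    // granted number of threads, between 1 and max_threads.
    void run_with_dynamic_concurrency(size_t max_threads)
    {
        auto slots = ConcurrencyControl::instance().allocate(1, max_threads); // at least one slot is granted immediately
        std::vector<std::thread> threads;

        // Upscale: take every slot granted so far and start one thread per slot.
        // Re-running this loop later picks up slots granted in the meantime.
        while (auto slot = slots->tryAcquire())
            threads.emplace_back([slot = std::move(slot)]
            {
                // ... do a share of the work ...
                // destroying `slot` at thread exit returns it to the pool,
                // where it can be granted to another waiting allocation
            });

        for (auto & thread : threads)
            thread.join();
    }

Compared with a fixed-size pool, the point of the design is that a query never blocks waiting for threads: it starts with whatever was granted and grows only when other queries release slots.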
void executeImpl(size_t num_threads); From 4c021b8d8099c41156f24d29969e77164a1629cf Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 26 May 2022 13:25:50 +0200 Subject: [PATCH 023/227] fix comments --- src/Common/ConcurrencyControl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index b26a4b95bac..0c30ca3a068 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -127,7 +127,7 @@ public: granted++; allocated++; return allocated < limit; - // WARNING: `waiter` iterator is invalided after returning false + // WARNING: `waiter` iterator is invalidated after returning false } ConcurrencyControl & parent; @@ -136,7 +136,7 @@ public: const Slots limit; std::mutex mutex; // the following values must be accessed under this mutex - Slots allocated = 0; + Slots allocated = 0; // allocated total (including already released) Slots granted = 0; // allocated, but not yet acquired Slots released = 0; }; From 6bd874df48e7b1b147badef26904977871e5b992 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Mon, 30 May 2022 17:43:25 +0200 Subject: [PATCH 024/227] add tests; fix bugs; make Allocation and ConcurrencyControl classes less tighly bounded --- src/Common/ConcurrencyControl.h | 143 +++++---- .../tests/gtest_concurrency_control.cpp | 286 ++++++++++++++++++ 2 files changed, 371 insertions(+), 58 deletions(-) create mode 100644 src/Common/tests/gtest_concurrency_control.cpp diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index 0c30ca3a068..53e04730e2e 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -35,9 +35,11 @@ class ConcurrencyControl : boost::noncopyable public: struct Allocation; using AllocationPtr = std::shared_ptr; - using Slots = UInt64; + using SlotCount = UInt64; using Waiters = std::list; + static constexpr SlotCount Unlimited = std::numeric_limits::max(); + // Scoped guard for acquired slot, see Allocation::tryAcquire() struct Slot : boost::noncopyable { @@ -64,17 +66,7 @@ public: { ~Allocation() { - if (released == limit) // also equal to `allocated`: everything is already released - return; - - std::unique_lock lock{parent.mutex}; - parent.cur_concurrency -= allocated - released; - - // Cancel waiting - if (allocated < limit && waiter != parent.waiters.end()) - parent.waiters.erase(waiter); - - parent.schedule(lock); + parent.free(this); // We have to lock parent's mutex to avoid race with grant() } // Take one already granted slot if available @@ -89,35 +81,27 @@ public: private: friend struct Slot; // for release() - friend class ConcurrencyControl; // for grant() and ctor + friend class ConcurrencyControl; // for grant(), free() and ctor - Allocation(ConcurrencyControl & concurrency_control, Slots min, Slots max) - : parent(concurrency_control) - , limit(std::max(max, min)) + Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_) + : parent(parent_) + , limit(limit_) + , allocated(granted_) + , granted(granted_) + {} + + auto free() { - std::unique_lock lock{parent.mutex}; - - // Acquire as much slots as we can, but not lower than `min` - granted = allocated = std::max(min, std::min(limit, parent.available(lock))); - parent.cur_concurrency += allocated; - - // Start waiting if more slots are required - if (allocated < limit) - waiter = parent.waiters.insert(parent.cur_waiter, this); - else - waiter = parent.waiters.end(); + std::unique_lock lock{mutex}; + return 
std::pair{allocated - released, + allocated < limit ? + std::optional(waiter) : + std::optional()}; } - // Release one slot and grant it to other allocation if required - void release() + void wait(Waiters::iterator waiter_) { - std::unique_lock lock{parent.mutex}; - parent.cur_concurrency--; - parent.schedule(lock); - - std::unique_lock lock2{mutex}; - released++; - assert(released <= allocated); + waiter = waiter_; } // Grant single slot to allocation, returns true iff more slot(s) are required @@ -127,18 +111,26 @@ public: granted++; allocated++; return allocated < limit; - // WARNING: `waiter` iterator is invalidated after returning false + } + + // Release one slot and grant it to other allocation if required + void release() + { + parent.release(1); + std::unique_lock lock{mutex}; + released++; + assert(released <= allocated); } ConcurrencyControl & parent; - Waiters::iterator waiter; // iterator to itself in Waiters list - - const Slots limit; + const SlotCount limit; std::mutex mutex; // the following values must be accessed under this mutex - Slots allocated = 0; // allocated total (including already released) - Slots granted = 0; // allocated, but not yet acquired - Slots released = 0; + SlotCount allocated = 0; // allocated total (including already released) + SlotCount granted = 0; // allocated, but not yet acquired + SlotCount released = 0; + + Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit }; public: @@ -156,15 +148,26 @@ public: // Allocate at least `min` and at most `max` slots. // If not all `max` slots were successfully allocated, a subscription for later allocation is created // Use Allocation::tryAcquire() to acquire allocated slot, before running a thread. - [[nodiscard]] AllocationPtr allocate(Slots min, Slots max) - { - return AllocationPtr(new Allocation(*this, min, max)); - } - - void setMaxConcurrency(Slots value) + [[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max) { std::unique_lock lock{mutex}; - max_concurrency = std::max(1, value); // never allow max_concurrency to be zero + + // Acquire as much slots as we can, but not lower than `min` + SlotCount limit = std::max(min, max); + SlotCount granted = std::max(min, std::min(limit, available(lock))); + cur_concurrency += granted; + + // Create allocation and start waiting if more slots are required + auto allocation = new Allocation(*this, limit, granted); + if (granted < limit) + allocation->wait(waiters.insert(cur_waiter, allocation)); + return AllocationPtr(allocation); + } + + void setMaxConcurrency(SlotCount value) + { + std::unique_lock lock{mutex}; + max_concurrency = std::max(1, value); // never allow max_concurrency to be zero schedule(lock); } @@ -175,12 +178,28 @@ public: } private: - Slots available(std::unique_lock &) + friend struct Allocation; // for free() and release() + + void free(Allocation * allocation) { - if (cur_concurrency < max_concurrency) - return max_concurrency - cur_concurrency; - else - return 0; + std::unique_lock lock{mutex}; + auto [amount, waiter] = allocation->free(); + cur_concurrency -= amount; + if (waiter) + { + if (cur_waiter == *waiter) + cur_waiter = waiters.erase(*waiter); + else + waiters.erase(*waiter); + } + schedule(lock); + } + + void release(SlotCount amount) + { + std::unique_lock lock{mutex}; + cur_concurrency -= amount; + schedule(lock); } // Round-robin scheduling of available slots among waiting allocations @@ -195,13 +214,21 @@ private: if (allocation->grant()) ++cur_waiter; else - 
waiters.erase(cur_waiter++); // last required slot has just been granted -- stop waiting + cur_waiter = waiters.erase(cur_waiter); // last required slot has just been granted -- stop waiting } } + SlotCount available(std::unique_lock &) + { + if (cur_concurrency < max_concurrency) + return max_concurrency - cur_concurrency; + else + return 0; + } + std::mutex mutex; Waiters waiters; Waiters::iterator cur_waiter; // round-robin pointer - Slots max_concurrency = Slots(-1); - Slots cur_concurrency = 0; + SlotCount max_concurrency = Unlimited; + SlotCount cur_concurrency = 0; }; diff --git a/src/Common/tests/gtest_concurrency_control.cpp b/src/Common/tests/gtest_concurrency_control.cpp new file mode 100644 index 00000000000..d22516f93f8 --- /dev/null +++ b/src/Common/tests/gtest_concurrency_control.cpp @@ -0,0 +1,286 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include + +struct ConcurrencyControlTest +{ + ConcurrencyControl cc; + + explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited) + { + cc.setMaxConcurrency(limit); + } +}; + +TEST(ConcurrencyControl, Unlimited) +{ + ConcurrencyControlTest t; // unlimited number of slots + auto slots = t.cc.allocate(0, 100500); + std::vector acquired; + while (auto slot = slots->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 100500); +} + +TEST(ConcurrencyControl, Fifo) +{ + ConcurrencyControlTest t(1); // use single slot + std::vector allocations; + constexpr int N = 42; + for (int i = 0; i < N; i++) + allocations.emplace_back(t.cc.allocate(0, 1)); + for (int i = 0; i < N; i++) + { + ConcurrencyControl::SlotPtr holder; + for (int j = 0; j < N; j++) + { + auto slot = allocations[j]->tryAcquire(); + if (i == j) // check fifo order of allocations + { + ASSERT_TRUE(slot); + holder = std::move(slot); + } + else + ASSERT_TRUE(!slot); + } + holder.reset(); // release slot -- leads to the next allocation + } +} + +TEST(ConcurrencyControl, Oversubscription) +{ + ConcurrencyControlTest t(10); + std::vector allocations; + for (int i = 0; i < 10; i++) + allocations.emplace_back(t.cc.allocate(1, 2)); + std::vector slots; + // Normal allocation using maximum amount of slots + for (int i = 0; i < 5; i++) + { + auto slot1 = allocations[i]->tryAcquire(); + ASSERT_TRUE(slot1); + slots.emplace_back(std::move(slot1)); + auto slot2 = allocations[i]->tryAcquire(); + ASSERT_TRUE(slot2); + slots.emplace_back(std::move(slot2)); + ASSERT_TRUE(!allocations[i]->tryAcquire()); + } + // Oversubscription: only minimum amount of slots are allocated + for (int i = 5; i < 10; i++) + { + auto slot1 = allocations[i]->tryAcquire(); + ASSERT_TRUE(slot1); + slots.emplace_back(std::move(slot1)); + ASSERT_TRUE(!allocations[i]->tryAcquire()); + } +} + +TEST(ConcurrencyControl, ReleaseUnacquiredSlots) +{ + ConcurrencyControlTest t(10); + { + std::vector allocations; + for (int i = 0; i < 10; i++) + allocations.emplace_back(t.cc.allocate(1, 2)); + // Do not acquire - just destory allocations with granted slots + } + // Check that slots were actually released + auto allocation = t.cc.allocate(0, 20); + std::vector acquired; + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 10); +} + +TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation) +{ + ConcurrencyControlTest t(10); + for (int i = 0; i < 3; i++) + { + auto allocation = t.cc.allocate(5, 20); + std::vector acquired; + while (auto slot = 
allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 10); + } +} + +TEST(ConcurrencyControl, DestroyAllocationBeforeSlots) +{ + ConcurrencyControlTest t(10); + for (int i = 0; i < 3; i++) + { + std::vector acquired; + auto allocation = t.cc.allocate(5, 20); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 10); + allocation.reset(); // slots are stil acquired (they should actually hold allocation) + } +} + +TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation) +{ + ConcurrencyControlTest t(3); + auto allocation = t.cc.allocate(0, 10); + std::list acquired; + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 3); // 0 1 2 + acquired.clear(); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 3); // 3 4 5 + acquired.pop_back(); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 3); // 3 4 6 + acquired.pop_front(); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 3); // 4 6 7 + acquired.clear(); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 2); // 8 9 +} + +TEST(ConcurrencyControl, FairGranting) +{ + ConcurrencyControlTest t(3); + auto startBusyPeriod = t.cc.allocate(3, 3); + auto a1 = t.cc.allocate(0, 10); + auto a2 = t.cc.allocate(0, 10); + auto a3 = t.cc.allocate(0, 10); + startBusyPeriod.reset(); + for (int i = 0; i < 10; i++) + { + auto s1 = a1->tryAcquire(); + ASSERT_TRUE(s1); + ASSERT_TRUE(!a1->tryAcquire()); + auto s2 = a2->tryAcquire(); + ASSERT_TRUE(s2); + ASSERT_TRUE(!a2->tryAcquire()); + auto s3 = a3->tryAcquire(); + ASSERT_TRUE(s3); + ASSERT_TRUE(!a3->tryAcquire()); + } +} + +TEST(ConcurrencyControl, SetSlotCount) +{ + ConcurrencyControlTest t(10); + auto allocation = t.cc.allocate(5, 30); + std::vector acquired; + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 10); + + t.cc.setMaxConcurrency(15); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 15); + + t.cc.setMaxConcurrency(5); + acquired.clear(); + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 5); + + // Check that newly added slots are equally distributed over waiting allocations + std::vector acquired2; + auto allocation2 = t.cc.allocate(0, 30); + ASSERT_TRUE(!allocation->tryAcquire()); + t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one + while (auto slot = allocation->tryAcquire()) + acquired.emplace_back(std::move(slot)); + while (auto slot = allocation2->tryAcquire()) + acquired2.emplace_back(std::move(slot)); + ASSERT_TRUE(acquired.size() == 10); + ASSERT_TRUE(acquired2.size() == 5); +} + +TEST(ConcurrencyControl, MultipleThreads) +{ + constexpr int cfg_total_queries = 1000; // total amount of queries to run + constexpr int cfg_work_us = 49; // max microseconds per single work + constexpr int cfg_concurrent_queries = 8; // do not run more than specified number of concurrent queries + constexpr int cfg_max_threads = 4; // max amount of threads a query is allowed to have + constexpr 
int cfg_max_concurrency = 16; // concurrency control limit (must be >3) + + ConcurrencyControlTest t(cfg_max_concurrency); + + auto run_query = [&] (size_t max_threads) + { + ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads); + std::mutex threads_mutex; + std::vector threads; + threads.reserve(max_threads); + + std::function spawn_threads = [&] () + { + while (auto slot = slots->tryAcquire()) + { + std::unique_lock lock{threads_mutex}; + threads.emplace_back([&, slot = std::move(slot)] + { + pcg64 rng(randomSeed()); + std::uniform_int_distribution distribution(1, cfg_work_us); + size_t steps = distribution(rng); + for (size_t step = 0; step < steps; step++) + { + sleepForMicroseconds(distribution(rng)); // emulate work + spawn_threads(); // upscale + } + }); + } + }; + + spawn_threads(); + + // graceful shutdown of a query + for (size_t thread_num = 0; ; thread_num++) + { + std::unique_lock lock{threads_mutex}; + if (thread_num >= threads.size()) + break; + if (threads[thread_num].joinable()) + { + auto & thread = threads[thread_num]; + lock.unlock(); // to avoid deadlock if thread we are going to join starts spawning threads + thread.join(); + } + } + // NOTE: No races: all concurrent spawn_threads() calls are done from `threads`, but they're already joined. + }; + + pcg64 rng(randomSeed()); + std::uniform_int_distribution max_threads_distribution(1, cfg_max_threads); + std::vector queries; + std::atomic started = 0; // queries started in total + std::atomic finished = 0; // queries finished in total + while (started < cfg_total_queries) + { + while (started < finished + cfg_concurrent_queries) + { + queries.emplace_back([&, max_threads = max_threads_distribution(rng)] + { + run_query(max_threads); + finished++; + }); + started++; + } + sleepForMicroseconds(5); // wait some queries to finish + t.cc.setMaxConcurrency(cfg_max_concurrency - started % 3); // emulate configuration updates + } + + for (auto & query : queries) + query.join(); +} From 1914f9fed7520c9281a9d49e473c1b543f41c3b1 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Tue, 31 May 2022 08:17:09 +0200 Subject: [PATCH 025/227] fix typos --- src/Common/tests/gtest_concurrency_control.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/tests/gtest_concurrency_control.cpp b/src/Common/tests/gtest_concurrency_control.cpp index d22516f93f8..2c952d09203 100644 --- a/src/Common/tests/gtest_concurrency_control.cpp +++ b/src/Common/tests/gtest_concurrency_control.cpp @@ -89,7 +89,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots) std::vector allocations; for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); - // Do not acquire - just destory allocations with granted slots + // Do not acquire - just destroy allocations with granted slots } // Check that slots were actually released auto allocation = t.cc.allocate(0, 20); @@ -122,7 +122,7 @@ TEST(ConcurrencyControl, DestroyAllocationBeforeSlots) while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); - allocation.reset(); // slots are stil acquired (they should actually hold allocation) + allocation.reset(); // slots are still acquired (they should actually hold allocation) } } From b7cde2c5ada440a8ecc86b4b3267c2d7076a2500 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 31 May 2022 19:43:15 +0300 Subject: [PATCH 026/227] Remove total_max_threads logic from QueryPipelineBuilder; Add setMaxConcurrency(total_max_threads) in Server.cpp --- 
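
In short, the per-query bookkeeping from the earlier patches is replaced by one global knob: at startup the server hands the configured value to the concurrency control singleton, roughly as in the sketch below. The helper name is illustrative; the real change sits in Server::main, as the diff that follows shows, and the special value -1 is resolved to hardware_concurrency() times 3 before this point.

    #include <Common/ConcurrencyControl.h>
    #include <cstddef>

    // Illustrative wrapper around the wiring added in Server.cpp.
    void apply_total_max_threads(size_t total_max_threads)
    {
        // 0 keeps ConcurrencyControl unlimited; any positive value caps the
        // number of slots all queries together may occupy.
        if (total_max_threads)
            ConcurrencyControl::instance().setMaxConcurrency(total_max_threads);
    }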
programs/server/Server.cpp | 4 +++- src/Interpreters/ProcessList.cpp | 14 +---------- src/Interpreters/ProcessList.h | 15 ------------ src/QueryPipeline/QueryPipelineBuilder.cpp | 27 ---------------------- src/QueryPipeline/QueryPipelineBuilder.h | 11 +++++++-- 5 files changed, 13 insertions(+), 58 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c7ecca45841..a1870e8ff4b 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -1129,7 +1130,8 @@ int Server::main(const std::vector & /*args*/) constexpr size_t thread_factor = 3; total_max_threads = std::thread::hardware_concurrency() * thread_factor; } - global_context->getProcessList().setTotalMaxThreads(total_max_threads); + if (total_max_threads) + ConcurrencyControl::instance().setMaxConcurrency(total_max_threads); } if (config->has("max_concurrent_queries")) diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index bd751d6d618..6c101143234 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -16,6 +16,7 @@ #include #include + namespace CurrentMetrics { extern const Metric Query; @@ -500,19 +501,6 @@ ProcessList::Info ProcessList::getInfo(bool get_thread_list, bool get_profile_ev return per_query_infos; } -size_t ProcessList::getTotalNumThreads() const -{ - size_t total_num_threads = 0; - std::lock_guard lock(mutex); - - for (const auto & process : processes) - { - auto qsi = process.getInfo(true); - total_num_threads += qsi.thread_ids.size(); - } - - return total_num_threads; -} ProcessListForUser::ProcessListForUser(ProcessList * global_process_list) : user_overcommit_tracker(global_process_list, this) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 6624083fad5..51cd7eb98d9 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -314,10 +314,6 @@ protected: Container processes; size_t max_size = 0; /// 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. - /// The total maximum number of threads for all queries. - /// Sometimes, real total number of threads may exceed total_max_threads parameter. - size_t total_max_threads = 0; /// 0 means no limit. Otherwise, concurrency of a query is determined based on this parameter. - /// Stores per-user info: queries, statistics and limits UserToQueries user_to_queries; @@ -359,17 +355,6 @@ public: /// Get current state of process list. Info getInfo(bool get_thread_list = false, bool get_profile_events = false, bool get_settings = false) const; - /// Get total number of threads for all queries in process list. - size_t getTotalNumThreads() const; - - size_t getTotalMaxThreads() const { return total_max_threads; } - - void setTotalMaxThreads(size_t total_max_threads_) - { - std::lock_guard lock(mutex); - total_max_threads = total_max_threads_; - } - /// Get current state of process list per user. 
UserInfo getUserInfo(bool get_profile_events = false) const; diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index efc8db42ea7..012a825a9d5 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -16,15 +16,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include -#include namespace DB { @@ -499,30 +496,6 @@ void QueryPipelineBuilder::setProcessListElement(QueryStatus * elem) } } -size_t QueryPipelineBuilder::getNumThreads() const -{ - auto num_threads = pipe.maxParallelStreams(); - - if (max_threads) //-V1051 - num_threads = std::min(num_threads, max_threads); - - if (process_list_element) - { - auto total_max_threads = process_list_element->getContext()->getProcessList().getTotalMaxThreads(); - if (total_max_threads) - { - size_t current_total_num_threads = process_list_element->getContext()->getProcessList().getTotalNumThreads(); - size_t total_available_threads = 0; - if (total_max_threads > current_total_num_threads) - total_available_threads = total_max_threads - current_total_num_threads; - num_threads = std::min(num_threads, total_available_threads); - } - } - - num_threads = std::max(1, num_threads); - return num_threads; -} - PipelineExecutorPtr QueryPipelineBuilder::execute() { if (!isCompleted()) diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 08529aa8784..ad25985ab48 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -7,7 +7,6 @@ #include #include - namespace DB { @@ -133,7 +132,15 @@ public: void setProcessListElement(QueryStatus * elem); /// Recommend number of threads for pipeline execution. - size_t getNumThreads() const; + size_t getNumThreads() const + { + auto num_threads = pipe.maxParallelStreams(); + + if (max_threads) //-V1051 + num_threads = std::min(num_threads, max_threads); + + return std::max(1, num_threads); + } /// Set upper limit for the recommend number of threads void setMaxThreads(size_t max_threads_) { max_threads = max_threads_; } From db2cf73b52680005b1b57a62034ab2b32202f80b Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 2 Jun 2022 11:17:13 +0200 Subject: [PATCH 027/227] perf/safety/docs improvements --- src/Common/ConcurrencyControl.h | 83 ++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 28 deletions(-) diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index 53e04730e2e..9ea5efd53d0 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -7,6 +7,16 @@ #include #include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} + /* * Controls how many threads can be allocated for a query (or another activity). * There is a limited amount of slots for threads. It can be set with `setMaxConcurrency(limit)`. @@ -66,31 +76,42 @@ public: { ~Allocation() { - parent.free(this); // We have to lock parent's mutex to avoid race with grant() + // We have to lock parent's mutex to avoid race with grant() + // NOTE: shortcut can be added, but it requires Allocation::mutex lock even to check if shortcut is possible + parent.free(this); } - // Take one already granted slot if available + // Take one already granted slot if available. Lock-free iff there is no granted slot. 
[[nodiscard]] SlotPtr tryAcquire() { - std::unique_lock lock{mutex}; - if (!granted) - return {}; - granted--; - return SlotPtr(new Slot(shared_from_this())); + SlotCount value = granted.load(); + while (value) + { + if (granted.compare_exchange_strong(value, value - 1)) + { + std::unique_lock lock{mutex}; + return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor + } + } + return {}; // avoid unnecessary locking } private: friend struct Slot; // for release() friend class ConcurrencyControl; // for grant(), free() and ctor - Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_) + Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_ = {}) : parent(parent_) , limit(limit_) , allocated(granted_) , granted(granted_) - {} + , waiter(waiter_) + { + if (allocated < limit) + *waiter = this; + } - auto free() + auto cancel() { std::unique_lock lock{mutex}; return std::pair{allocated - released, @@ -99,11 +120,6 @@ public: std::optional()}; } - void wait(Waiters::iterator waiter_) - { - waiter = waiter_; - } - // Grant single slot to allocation, returns true iff more slot(s) are required bool grant() { @@ -119,18 +135,20 @@ public: parent.release(1); std::unique_lock lock{mutex}; released++; - assert(released <= allocated); + if (released > allocated) + abort(); } ConcurrencyControl & parent; const SlotCount limit; std::mutex mutex; // the following values must be accessed under this mutex - SlotCount allocated = 0; // allocated total (including already released) - SlotCount granted = 0; // allocated, but not yet acquired + SlotCount allocated; // allocated total (including already `released`) SlotCount released = 0; - Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit + std::atomic granted; // allocated, but not yet acquired + + const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit }; public: @@ -142,26 +160,30 @@ public: // NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries ~ConcurrencyControl() { - assert(waiters.empty()); + if (!waiters.empty()) + abort(); } // Allocate at least `min` and at most `max` slots. // If not all `max` slots were successfully allocated, a subscription for later allocation is created - // Use Allocation::tryAcquire() to acquire allocated slot, before running a thread. + // Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread. 
[[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max) { + if (min > max) + throw DB::Exception("ConcurrencyControl: invalid allocation requirements", DB::ErrorCodes::LOGICAL_ERROR); + std::unique_lock lock{mutex}; // Acquire as much slots as we can, but not lower than `min` - SlotCount limit = std::max(min, max); - SlotCount granted = std::max(min, std::min(limit, available(lock))); + SlotCount granted = std::max(min, std::min(max, available(lock))); cur_concurrency += granted; // Create allocation and start waiting if more slots are required - auto allocation = new Allocation(*this, limit, granted); - if (granted < limit) - allocation->wait(waiters.insert(cur_waiter, allocation)); - return AllocationPtr(allocation); + if (granted < max) + return AllocationPtr(new Allocation(*this, max, granted, + waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */))); + else + return AllocationPtr(new Allocation(*this, max, granted)); } void setMaxConcurrency(SlotCount value) @@ -182,8 +204,13 @@ private: void free(Allocation * allocation) { + // Allocation is allowed to be canceled even if there are: + // - `amount`: granted slots (acquired slots are not possible, because Slot holds AllocationPtr) + // - `waiter`: active waiting for more slots to be allocated + // Thus Allocation destruction may require the following lock, to avoid race conditions std::unique_lock lock{mutex}; - auto [amount, waiter] = allocation->free(); + auto [amount, waiter] = allocation->cancel(); + cur_concurrency -= amount; if (waiter) { From 0a063820c7980005920bcbf713d1fe170d5ffaf5 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 4 Jun 2022 01:35:41 -0400 Subject: [PATCH 028/227] WITH FILL of date/time type must be equal to type of ORDER BY column --- src/Interpreters/InterpreterSelectQuery.cpp | 38 ++++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index fbafb98e0d8..5d9baadbfdf 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -802,10 +803,14 @@ Block InterpreterSelectQuery::getSampleBlockImpl() return analysis_result.final_projection->getResultColumns(); } -static Field getWithFillFieldValue(const ASTPtr & node, ContextPtr context) +static Field getWithFillFieldValue(DataTypePtr col_type, const ASTPtr & node, ContextPtr context) { auto [field, type] = evaluateConstantExpression(node, context); + WhichDataType which(col_type); + if ((which.isDateOrDate32() || which.isDateTime() || which.isDateTime64()) && !col_type->equals(*type)) + throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be same as column " + col_type->getName(), ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + if (!isColumnedAsNumber(type)) throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be numeric type", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); @@ -825,14 +830,14 @@ static std::pair> getWithFillStep(const ASTPt throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be numeric type", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); } -static FillColumnDescription getWithFillDescription(const ASTOrderByElement & order_by_elem, ContextPtr context) +static FillColumnDescription getWithFillDescription(DataTypePtr type, const ASTOrderByElement & order_by_elem, ContextPtr context) { 
FillColumnDescription descr; if (order_by_elem.fill_from) - descr.fill_from = getWithFillFieldValue(order_by_elem.fill_from, context); + descr.fill_from = getWithFillFieldValue(type, order_by_elem.fill_from, context); if (order_by_elem.fill_to) - descr.fill_to = getWithFillFieldValue(order_by_elem.fill_to, context); + descr.fill_to = getWithFillFieldValue(type, order_by_elem.fill_to, context); if (order_by_elem.fill_step) std::tie(descr.fill_step, descr.step_kind) = getWithFillStep(order_by_elem.fill_step, context); @@ -872,7 +877,7 @@ static FillColumnDescription getWithFillDescription(const ASTOrderByElement & or return descr; } -static SortDescription getSortDescription(const ASTSelectQuery & query, ContextPtr context) +static SortDescription getSortDescription(const ASTSelectQuery & query, const Block & result_block, const Aliases & aliases, ContextPtr context) { SortDescription order_descr; order_descr.reserve(query.orderBy()->children.size()); @@ -886,7 +891,14 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, ContextP collator = std::make_shared(order_by_elem.collation->as().value.get()); if (order_by_elem.with_fill) { - FillColumnDescription fill_desc = getWithFillDescription(order_by_elem, context); + auto column = result_block.findByName(name); + if (!column) + for (auto &[alias, ast] : aliases) + if (name == ast->getColumnName()) + if ((column = result_block.findByName(alias))) + break; + + FillColumnDescription fill_desc = getWithFillDescription(column->type, order_by_elem, context); order_descr.emplace_back(name, order_by_elem.direction, order_by_elem.nulls_direction, collator, true, fill_desc); } else @@ -1492,7 +1504,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

aliases, context); for (auto & desc : order_descr) if (desc.with_fill) { @@ -2105,7 +2117,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc // TODO Do we need a projection variant for this field? query, analysis_result.order_by_elements_actions, - getSortDescription(query, context), + getSortDescription(query, result_header, syntax_analyzer_result->aliases, context), query_info.syntax_analyzer_result); } else @@ -2113,7 +2125,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc query_info.order_optimizer = std::make_shared( query, analysis_result.order_by_elements_actions, - getSortDescription(query, context), + getSortDescription(query, result_header, syntax_analyzer_result->aliases, context), query_info.syntax_analyzer_result); } } @@ -2542,7 +2554,7 @@ void InterpreterSelectQuery::executeOrderOptimized(QueryPlan & query_plan, Input void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfoPtr input_sorting_info) { auto & query = getSelectQuery(); - SortDescription output_order_descr = getSortDescription(query, context); + SortDescription output_order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); UInt64 limit = getLimitForSorting(query, context); if (input_sorting_info) @@ -2580,7 +2592,7 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo void InterpreterSelectQuery::executeMergeSorted(QueryPlan & query_plan, const std::string & description) { auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query, context); + SortDescription order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); UInt64 limit = getLimitForSorting(query, context); executeMergeSorted(query_plan, order_descr, limit, description); @@ -2683,7 +2695,7 @@ void InterpreterSelectQuery::executeWithFill(QueryPlan & query_plan) auto & query = getSelectQuery(); if (query.orderBy()) { - SortDescription order_descr = getSortDescription(query, context); + SortDescription order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); SortDescription fill_descr; for (auto & desc : order_descr) { @@ -2734,7 +2746,7 @@ void InterpreterSelectQuery::executeLimit(QueryPlan & query_plan) { if (!query.orderBy()) throw Exception("LIMIT WITH TIES without ORDER BY", ErrorCodes::LOGICAL_ERROR); - order_descr = getSortDescription(query, context); + order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); } auto limit = std::make_unique( From adf305e1dd08ab2f453e223c9e00b3b93e385da6 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 4 Jun 2022 11:44:20 -0400 Subject: [PATCH 029/227] search source header for columns --- src/Interpreters/InterpreterSelectQuery.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 5d9baadbfdf..038d52d63c2 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -877,7 +877,7 @@ static FillColumnDescription getWithFillDescription(DataTypePtr type, const ASTO return descr; } -static SortDescription getSortDescription(const ASTSelectQuery & query, const Block & result_block, const Aliases & aliases, ContextPtr context) +static SortDescription getSortDescription(const ASTSelectQuery & query, const Block & 
source_block, const Block & result_block, const Aliases & aliases, ContextPtr context) { SortDescription order_descr; order_descr.reserve(query.orderBy()->children.size()); @@ -892,6 +892,8 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, const Bl if (order_by_elem.with_fill) { auto column = result_block.findByName(name); + if (!column) + column = source_block.findByName(name); if (!column) for (auto &[alias, ast] : aliases) if (name == ast->getColumnName()) @@ -1504,7 +1506,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

aliases, context); + SortDescription order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); for (auto & desc : order_descr) if (desc.with_fill) { @@ -2117,7 +2119,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc // TODO Do we need a projection variant for this field? query, analysis_result.order_by_elements_actions, - getSortDescription(query, result_header, syntax_analyzer_result->aliases, context), + getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context), query_info.syntax_analyzer_result); } else @@ -2125,7 +2127,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc query_info.order_optimizer = std::make_shared( query, analysis_result.order_by_elements_actions, - getSortDescription(query, result_header, syntax_analyzer_result->aliases, context), + getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context), query_info.syntax_analyzer_result); } } @@ -2554,7 +2556,7 @@ void InterpreterSelectQuery::executeOrderOptimized(QueryPlan & query_plan, Input void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfoPtr input_sorting_info) { auto & query = getSelectQuery(); - SortDescription output_order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); + SortDescription output_order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); UInt64 limit = getLimitForSorting(query, context); if (input_sorting_info) @@ -2592,7 +2594,7 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo void InterpreterSelectQuery::executeMergeSorted(QueryPlan & query_plan, const std::string & description) { auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); + SortDescription order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); UInt64 limit = getLimitForSorting(query, context); executeMergeSorted(query_plan, order_descr, limit, description); @@ -2695,7 +2697,7 @@ void InterpreterSelectQuery::executeWithFill(QueryPlan & query_plan) auto & query = getSelectQuery(); if (query.orderBy()) { - SortDescription order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); + SortDescription order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); SortDescription fill_descr; for (auto & desc : order_descr) { @@ -2746,7 +2748,7 @@ void InterpreterSelectQuery::executeLimit(QueryPlan & query_plan) { if (!query.orderBy()) throw Exception("LIMIT WITH TIES without ORDER BY", ErrorCodes::LOGICAL_ERROR); - order_descr = getSortDescription(query, result_header, syntax_analyzer_result->aliases, context); + order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); } auto limit = std::make_unique( From 07009c27c234e0061f2015f6dfb6d887c9963690 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 4 Jun 2022 17:24:48 -0400 Subject: [PATCH 030/227] tidy build suggestions --- src/Interpreters/InterpreterSelectQuery.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 
038d52d63c2..27d866c976a 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -891,11 +891,11 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, const Bl collator = std::make_shared(order_by_elem.collation->as().value.get()); if (order_by_elem.with_fill) { - auto column = result_block.findByName(name); + const auto *column = result_block.findByName(name); if (!column) column = source_block.findByName(name); if (!column) - for (auto &[alias, ast] : aliases) + for (const auto &[alias, ast] : aliases) if (name == ast->getColumnName()) if ((column = result_block.findByName(alias))) break; From 52ae3f0e9262700e5a057bd1f5226de2f15022a0 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sun, 5 Jun 2022 22:39:21 -0400 Subject: [PATCH 031/227] full refactoring - move type check to transform --- src/Core/SortDescription.h | 2 + src/Interpreters/InterpreterSelectQuery.cpp | 48 +++++++------------ .../Transforms/FillingTransform.cpp | 9 ++++ 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/Core/SortDescription.h b/src/Core/SortDescription.h index 3d4e3b665ee..75a4afe4ef0 100644 --- a/src/Core/SortDescription.h +++ b/src/Core/SortDescription.h @@ -28,7 +28,9 @@ struct FillColumnDescription /// All missed values in range [FROM, TO) will be filled /// Range [FROM, TO) respects sorting direction Field fill_from; /// Fill value >= FILL_FROM + DataTypePtr fill_from_type; Field fill_to; /// Fill value + STEP < FILL_TO + DataTypePtr fill_to_type; Field fill_step; /// Default = +1 or -1 according to direction std::optional step_kind; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 27d866c976a..791bcd5562f 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -803,18 +802,14 @@ Block InterpreterSelectQuery::getSampleBlockImpl() return analysis_result.final_projection->getResultColumns(); } -static Field getWithFillFieldValue(DataTypePtr col_type, const ASTPtr & node, ContextPtr context) +static std::pair getWithFillFieldValue(const ASTPtr & node, ContextPtr context) { - auto [field, type] = evaluateConstantExpression(node, context); + auto field_type = evaluateConstantExpression(node, context); - WhichDataType which(col_type); - if ((which.isDateOrDate32() || which.isDateTime() || which.isDateTime64()) && !col_type->equals(*type)) - throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be same as column " + col_type->getName(), ErrorCodes::INVALID_WITH_FILL_EXPRESSION); + if (!isColumnedAsNumber(field_type.second)) + throw Exception("Illegal type " + field_type.second->getName() + " of WITH FILL expression, must be numeric type", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); - if (!isColumnedAsNumber(type)) - throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be numeric type", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); - - return field; + return field_type; } static std::pair> getWithFillStep(const ASTPtr & node, ContextPtr context) @@ -830,14 +825,14 @@ static std::pair> getWithFillStep(const ASTPt throw Exception("Illegal type " + type->getName() + " of WITH FILL expression, must be numeric type", ErrorCodes::INVALID_WITH_FILL_EXPRESSION); } -static FillColumnDescription getWithFillDescription(DataTypePtr type, const ASTOrderByElement & order_by_elem, ContextPtr context) 
+static FillColumnDescription getWithFillDescription(const ASTOrderByElement & order_by_elem, ContextPtr context) { FillColumnDescription descr; if (order_by_elem.fill_from) - descr.fill_from = getWithFillFieldValue(type, order_by_elem.fill_from, context); + std::tie(descr.fill_from, descr.fill_from_type) = getWithFillFieldValue(order_by_elem.fill_from, context); if (order_by_elem.fill_to) - descr.fill_to = getWithFillFieldValue(type, order_by_elem.fill_to, context); + std::tie(descr.fill_to, descr.fill_to_type) = getWithFillFieldValue(order_by_elem.fill_to, context); if (order_by_elem.fill_step) std::tie(descr.fill_step, descr.step_kind) = getWithFillStep(order_by_elem.fill_step, context); @@ -877,7 +872,7 @@ static FillColumnDescription getWithFillDescription(DataTypePtr type, const ASTO return descr; } -static SortDescription getSortDescription(const ASTSelectQuery & query, const Block & source_block, const Block & result_block, const Aliases & aliases, ContextPtr context) +static SortDescription getSortDescription(const ASTSelectQuery & query, ContextPtr context) { SortDescription order_descr; order_descr.reserve(query.orderBy()->children.size()); @@ -891,16 +886,7 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, const Bl collator = std::make_shared(order_by_elem.collation->as().value.get()); if (order_by_elem.with_fill) { - const auto *column = result_block.findByName(name); - if (!column) - column = source_block.findByName(name); - if (!column) - for (const auto &[alias, ast] : aliases) - if (name == ast->getColumnName()) - if ((column = result_block.findByName(alias))) - break; - - FillColumnDescription fill_desc = getWithFillDescription(column->type, order_by_elem, context); + FillColumnDescription fill_desc = getWithFillDescription(order_by_elem, context); order_descr.emplace_back(name, order_by_elem.direction, order_by_elem.nulls_direction, collator, true, fill_desc); } else @@ -1506,7 +1492,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

aliases, context); + SortDescription order_descr = getSortDescription(query, context); for (auto & desc : order_descr) if (desc.with_fill) { @@ -2119,7 +2105,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc // TODO Do we need a projection variant for this field? query, analysis_result.order_by_elements_actions, - getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context), + getSortDescription(query, context), query_info.syntax_analyzer_result); } else @@ -2127,7 +2113,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc query_info.order_optimizer = std::make_shared( query, analysis_result.order_by_elements_actions, - getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context), + getSortDescription(query, context), query_info.syntax_analyzer_result); } } @@ -2556,7 +2542,7 @@ void InterpreterSelectQuery::executeOrderOptimized(QueryPlan & query_plan, Input void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfoPtr input_sorting_info) { auto & query = getSelectQuery(); - SortDescription output_order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); + SortDescription output_order_descr = getSortDescription(query, context); UInt64 limit = getLimitForSorting(query, context); if (input_sorting_info) @@ -2594,7 +2580,7 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo void InterpreterSelectQuery::executeMergeSorted(QueryPlan & query_plan, const std::string & description) { auto & query = getSelectQuery(); - SortDescription order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); + SortDescription order_descr = getSortDescription(query, context); UInt64 limit = getLimitForSorting(query, context); executeMergeSorted(query_plan, order_descr, limit, description); @@ -2697,7 +2683,7 @@ void InterpreterSelectQuery::executeWithFill(QueryPlan & query_plan) auto & query = getSelectQuery(); if (query.orderBy()) { - SortDescription order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); + SortDescription order_descr = getSortDescription(query, context); SortDescription fill_descr; for (auto & desc : order_descr) { @@ -2748,7 +2734,7 @@ void InterpreterSelectQuery::executeLimit(QueryPlan & query_plan) { if (!query.orderBy()) throw Exception("LIMIT WITH TIES without ORDER BY", ErrorCodes::LOGICAL_ERROR); - order_descr = getSortDescription(query, source_header, result_header, syntax_analyzer_result->aliases, context); + order_descr = getSortDescription(query, context); } auto limit = std::make_unique( diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index a41b5660e0d..153f962a8f4 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -54,6 +54,15 @@ static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr & WhichDataType which(type); DataTypePtr to_type; + WhichDataType which_from(descr.fill_from_type); + bool is_from_date = which_from.isDateOrDate32() || which_from.isDateTime() || which_from.isDateTime64(); + WhichDataType which_to(descr.fill_to_type); + bool is_to_date = which_to.isDateOrDate32() || which_to.isDateTime() || which_to.isDateTime64(); + + if ((is_from_date || is_to_date) && + 
(!descr.fill_from_type->equals(*descr.fill_to_type) || !descr.fill_from_type->equals(*type))) + return false; + /// TODO Wrong results for big integers. if (isInteger(type) || which.isDate() || which.isDate32() || which.isDateTime()) { From addefa202037c847c9a91f55fd278395f5f5ce47 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Mon, 6 Jun 2022 16:36:03 -0400 Subject: [PATCH 032/227] from or to statement can be absent --- .../Transforms/FillingTransform.cpp | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index 153f962a8f4..1303b9affff 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -54,14 +54,21 @@ static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr & WhichDataType which(type); DataTypePtr to_type; - WhichDataType which_from(descr.fill_from_type); - bool is_from_date = which_from.isDateOrDate32() || which_from.isDateTime() || which_from.isDateTime64(); - WhichDataType which_to(descr.fill_to_type); - bool is_to_date = which_to.isDateOrDate32() || which_to.isDateTime() || which_to.isDateTime64(); + if (descr.fill_from_type) + { + WhichDataType which_from(descr.fill_from_type); + if ((which_from.isDateOrDate32() || which_from.isDateTime() || which_from.isDateTime64()) && + !descr.fill_from_type->equals(*type)) + return false; + } - if ((is_from_date || is_to_date) && - (!descr.fill_from_type->equals(*descr.fill_to_type) || !descr.fill_from_type->equals(*type))) - return false; + if (descr.fill_to_type) + { + WhichDataType which_to(descr.fill_to_type); + if ((which_to.isDateOrDate32() || which_to.isDateTime() || which_to.isDateTime64()) && + !descr.fill_to_type->equals(*type)) + return false; + } /// TODO Wrong results for big integers. 
if (isInteger(type) || which.isDate() || which.isDate32() || which.isDateTime()) From 216cde6195bf9100e569fd767fd43f60eec62a0d Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 17 Jun 2022 18:22:22 +0300 Subject: [PATCH 033/227] Fix Fatal errors caused by using spawnThreads() --- src/Processors/Executors/PipelineExecutor.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 6ca788f19d3..a4ca4ed944b 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -239,8 +239,11 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie tasks.pushTasks(queue, async_queue, context); } - // Upscale if possible - spawnThreads(); + if (!tasks.isFinished() && !checkTimeLimitSoft()) + { + // Upscale if possible + spawnThreads(); + } #ifndef NDEBUG context.processing_time_ns += processing_time_watch.elapsed(); From be565959fa56331a5c5d5a3ca0b1a7e1698995ef Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Fri, 17 Jun 2022 18:22:22 +0300 Subject: [PATCH 034/227] Fix Fatal errors caused by using spawnThreads() --- src/Processors/Executors/PipelineExecutor.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 6ca788f19d3..a4ca4ed944b 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -239,8 +239,11 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie tasks.pushTasks(queue, async_queue, context); } - // Upscale if possible - spawnThreads(); + if (!tasks.isFinished() && !checkTimeLimitSoft()) + { + // Upscale if possible + spawnThreads(); + } #ifndef NDEBUG context.processing_time_ns += processing_time_watch.elapsed(); From d6be2be5aad472142097cc37f0ca3832eee2d497 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Mon, 20 Jun 2022 19:04:24 +0200 Subject: [PATCH 035/227] fix single-thread mode for pulling and pushing executors to work properly --- src/Processors/Executors/PipelineExecutor.cpp | 15 +++++++++------ src/Processors/Executors/PipelineExecutor.h | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index a4ca4ed944b..673bf1f79b8 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -111,6 +111,11 @@ bool PipelineExecutor::executeStep(std::atomic_bool * yield_flag) { initializeExecution(1); + // Acquire slot until we are done + single_thread_slot = slots->tryAcquire(); + if (!single_thread_slot) + abort(); // Unable to allocate slot for the first thread, but we just allocated at least one slot + if (yield_flag && *yield_flag) return true; } @@ -125,6 +130,7 @@ bool PipelineExecutor::executeStep(std::atomic_bool * yield_flag) if (node->exception) std::rethrow_exception(node->exception); + single_thread_slot.reset(); finalizeExecution(); return false; @@ -239,16 +245,13 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie tasks.pushTasks(queue, async_queue, context); } - if (!tasks.isFinished() && !checkTimeLimitSoft()) - { - // Upscale if possible - spawnThreads(); - } - #ifndef NDEBUG context.processing_time_ns += processing_time_watch.elapsed(); #endif + // Upscale if possible + 
spawnThreads(); + /// We have executed single processor. Check if we need to yield execution. if (yield_flag && *yield_flag) yield = true; diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index c6dafaf8ce5..2d151810696 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -56,6 +56,7 @@ private: // Concurrency control related ConcurrencyControl::AllocationPtr slots; + ConcurrencyControl::SlotPtr single_thread_slot; // slot for single-thread mode to work using executeStep() std::mutex threads_mutex; std::vector threads; From 3de3ae1b644c07ed3be2ad95498d54692bc69f4a Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Tue, 21 Jun 2022 16:39:53 +0300 Subject: [PATCH 036/227] Set ConcurrencyControl::Unlimited when total_max_threads 0 or undefined --- programs/server/Server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index a1870e8ff4b..375850f51d8 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1132,7 +1132,11 @@ int Server::main(const std::vector & /*args*/) } if (total_max_threads) ConcurrencyControl::instance().setMaxConcurrency(total_max_threads); + else + ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited); } + else + ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited); if (config->has("max_concurrent_queries")) global_context->getProcessList().setMaxSize(config->getInt("max_concurrent_queries", 0)); From dc59d208decafd366bc283729c9f09f7433a3f0b Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Tue, 21 Jun 2022 18:38:51 +0200 Subject: [PATCH 037/227] unittest style/perf minor fixes --- src/Common/tests/gtest_concurrency_control.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Common/tests/gtest_concurrency_control.cpp b/src/Common/tests/gtest_concurrency_control.cpp index 2c952d09203..2ffb16511f3 100644 --- a/src/Common/tests/gtest_concurrency_control.cpp +++ b/src/Common/tests/gtest_concurrency_control.cpp @@ -33,13 +33,14 @@ TEST(ConcurrencyControl, Fifo) { ConcurrencyControlTest t(1); // use single slot std::vector allocations; - constexpr int N = 42; - for (int i = 0; i < N; i++) + constexpr int count = 42; + allocations.reserve(count); + for (int i = 0; i < count; i++) allocations.emplace_back(t.cc.allocate(0, 1)); - for (int i = 0; i < N; i++) + for (int i = 0; i < count; i++) { ConcurrencyControl::SlotPtr holder; - for (int j = 0; j < N; j++) + for (int j = 0; j < count; j++) { auto slot = allocations[j]->tryAcquire(); if (i == j) // check fifo order of allocations @@ -58,6 +59,7 @@ TEST(ConcurrencyControl, Oversubscription) { ConcurrencyControlTest t(10); std::vector allocations; + allocations.reserve(10); for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); std::vector slots; @@ -87,6 +89,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots) ConcurrencyControlTest t(10); { std::vector allocations; + allocations.reserve(10); for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); // Do not acquire - just destroy allocations with granted slots @@ -155,11 +158,11 @@ TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation) TEST(ConcurrencyControl, FairGranting) { ConcurrencyControlTest t(3); - auto startBusyPeriod = t.cc.allocate(3, 3); + auto start_busy_period = t.cc.allocate(3, 3); auto a1 = t.cc.allocate(0, 10); auto a2 = t.cc.allocate(0, 
10); auto a3 = t.cc.allocate(0, 10); - startBusyPeriod.reset(); + start_busy_period.reset(); for (int i = 0; i < 10; i++) { auto s1 = a1->tryAcquire(); From e96af48e28e0be7363ec97a29757bf49e38f8421 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 7 Jul 2022 11:35:02 +0000 Subject: [PATCH 038/227] Polish --- src/Common/ZooKeeper/IKeeper.h | 11 +++++ src/Common/ZooKeeper/TestKeeper.h | 5 ++ src/Common/ZooKeeper/ZooKeeper.cpp | 17 +++++-- src/Common/ZooKeeper/ZooKeeper.h | 8 ++- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 1 + src/Common/ZooKeeper/ZooKeeperConstants.cpp | 1 + src/Common/ZooKeeper/ZooKeeperImpl.cpp | 55 +++++++++++++++++++-- src/Common/ZooKeeper/ZooKeeperImpl.h | 6 +++ src/Coordination/KeeperServer.cpp | 1 + src/Coordination/KeeperStateMachine.cpp | 2 + src/Coordination/KeeperStorage.cpp | 12 +++++ 11 files changed, 108 insertions(+), 11 deletions(-) diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 79f9943cb57..9592256b7e0 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -110,6 +110,15 @@ bool isUserError(Error code); const char * errorMessage(Error code); +enum KeeperApiVersion : uint8_t +{ + V0 = 0, // ZooKeeper compatible version + V1 // added FilteredList request +}; + +inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1; +inline constexpr auto * keeper_api_version_path = "/keeper-api-version"; + struct Request; using RequestPtr = std::shared_ptr; using Requests = std::vector; @@ -516,6 +525,8 @@ public: const Requests & requests, MultiCallback callback) = 0; + virtual Coordination::KeeperApiVersion getApiVersion() = 0; + /// Expire session and finish all pending requests virtual void finalize(const String & reason) = 0; }; diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index 6e77a5d38c1..9ea2eef8f41 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -90,6 +90,11 @@ public: void finalize(const String & reason) override; + Coordination::KeeperApiVersion getApiVersion() override + { + return KeeperApiVersion::V0; + } + struct Node { String data; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 5a0be0f76ff..ce34f9136dd 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -3,6 +3,8 @@ #include "KeeperException.h" #include "TestKeeper.h" +#include +#include #include #include @@ -337,17 +339,17 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings } } -Strings ZooKeeper::getChildren(const std::string & path, Coordination::Stat * stat, const EventPtr & watch) +Strings ZooKeeper::getChildren(const std::string & path, Coordination::Stat * stat, const EventPtr & watch, Coordination::ListRequestType list_request_type) { Strings res; - check(tryGetChildren(path, res, stat, watch), path); + check(tryGetChildren(path, res, stat, watch, list_request_type), path); return res; } -Strings ZooKeeper::getChildrenWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback) +Strings ZooKeeper::getChildrenWatch(const std::string & path, Coordination::Stat * stat, Coordination::WatchCallback watch_callback, Coordination::ListRequestType list_request_type) { Strings res; - check(tryGetChildrenWatch(path, res, stat, watch_callback), path); + check(tryGetChildrenWatch(path, res, stat, watch_callback, list_request_type), path); return res; } @@ -540,7 +542,6 @@ Coordination::Error 
ZooKeeper::getImpl(const std::string & path, std::string & r } } - std::string ZooKeeper::get(const std::string & path, Coordination::Stat * stat, const EventPtr & watch) { Coordination::Error code = Coordination::Error::ZOK; @@ -904,6 +905,11 @@ bool ZooKeeper::expired() return impl->isExpired(); } +Coordination::KeeperApiVersion ZooKeeper::getApiVersion() +{ + return impl->getApiVersion(); +} + Int64 ZooKeeper::getClientID() { return impl->getSessionID(); @@ -1080,6 +1086,7 @@ std::future ZooKeeper::asyncTryGetChildrenNoThrow( auto callback = [promise](const Coordination::ListResponse & response) mutable { + LOG_INFO(&Poco::Logger::get("LOGGER"), "Got response {}", response.names.size()); promise->set_value(response); }; diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index d2f92b6b4c3..c246f8d94ed 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -127,6 +127,8 @@ public: /// Returns true, if the session has expired. bool expired(); + Coordination::KeeperApiVersion getApiVersion(); + /// Create a znode. /// Throw an exception if something went wrong. std::string create(const std::string & path, const std::string & data, int32_t mode); @@ -184,11 +186,13 @@ public: Strings getChildren(const std::string & path, Coordination::Stat * stat = nullptr, - const EventPtr & watch = nullptr); + const EventPtr & watch = nullptr, + Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL); Strings getChildrenWatch(const std::string & path, Coordination::Stat * stat, - Coordination::WatchCallback watch_callback); + Coordination::WatchCallback watch_callback, + Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL); /// Doesn't not throw in the following cases: /// * The node doesn't exist. 
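The `getApiVersion()` call and the optional `list_request_type` argument added to `getChildren()` above are meant to be used together: a filtered listing can only be requested from servers that already implement the FilteredList op (KeeperApiVersion::V1). A minimal caller-side sketch, not part of the patch — the helper name and the `EPHEMERAL_ONLY` enum value are assumptions for illustration, only `ListRequestType::ALL` is visible in the hunks above:

```cpp
#include <Common/ZooKeeper/ZooKeeper.h>

// Caller-side sketch: request server-side filtering only when the server reports
// an API version that understands the FilteredList request. Helper name and the
// EPHEMERAL_ONLY value are illustrative assumptions.
zkutil::Strings listEphemeralChildren(const zkutil::ZooKeeperPtr & zookeeper, const std::string & path)
{
    if (zookeeper->getApiVersion() >= Coordination::KeeperApiVersion::V1)
        return zookeeper->getChildren(path, /* stat */ nullptr, /* watch */ nullptr,
                                      Coordination::ListRequestType::EPHEMERAL_ONLY);

    // Pre-V1 servers reject filtered list types (the client throws ZBADARGUMENTS),
    // so fall back to a plain listing.
    return zookeeper->getChildren(path);
}
```

This mirrors the fallback logic in `ZooKeeperImpl::list()` shown below, just seen from the calling side.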
diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 837ea5bbad8..b2b3c8f6f13 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -892,6 +892,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); + registerZooKeeperRequest(*this); } } diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.cpp b/src/Common/ZooKeeper/ZooKeeperConstants.cpp index 5b121ed6138..ba7a9b9f0c5 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp +++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp @@ -24,6 +24,7 @@ static const std::unordered_set VALID_OPERATIONS = static_cast(OpNum::SessionID), static_cast(OpNum::SetACL), static_cast(OpNum::GetACL), + static_cast(OpNum::FilteredList), }; std::string toString(OpNum op_num) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 8fa6f28c29c..76ec4e28613 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1,3 +1,4 @@ +#include "Common/ZooKeeper/IKeeper.h" #include #include #include @@ -6,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -352,6 +354,8 @@ ZooKeeper::ZooKeeper( send_thread = ThreadFromGlobalPool([this] { sendThread(); }); receive_thread = ThreadFromGlobalPool([this] { receiveThread(); }); + initApiVersion(); + ProfileEvents::increment(ProfileEvents::ZooKeeperInit); } @@ -1057,6 +1061,37 @@ void ZooKeeper::pushRequest(RequestInfo && info) ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions); } +Coordination::KeeperApiVersion ZooKeeper::getApiVersion() +{ + return keeper_api_version; +} + +void ZooKeeper::initApiVersion() +{ + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + auto callback = [promise](const Coordination::GetResponse & response) mutable + { + promise->set_value(response); + }; + + get(Coordination::keeper_api_version_path, std::move(callback), {}); + if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) + return; + + auto response = future.get(); + + if (response.error != Coordination::Error::ZOK) + return; + + uint8_t keeper_version{0}; + DB::ReadBufferFromOwnString buf(response.data); + DB::readIntText(keeper_version, buf); + keeper_api_version = static_cast(keeper_version); +} + + void ZooKeeper::executeGenericRequest( const ZooKeeperRequestPtr & request, ResponseCallback callback) @@ -1172,12 +1207,24 @@ void ZooKeeper::list( ListCallback callback, WatchCallback watch) { - ZooKeeperFilteredListRequest request; - request.path = path; - request.list_request_type = list_request_type; + std::shared_ptr request{nullptr}; + if (keeper_api_version < Coordination::KeeperApiVersion::V1) + { + if (list_request_type != ListRequestType::ALL) + throw Exception("Filtered list request type cannot be used because it's not support by the server", Error::ZBADARGUMENTS); + + request = std::make_shared(); + } + else + { + auto filtered_list_request = std::make_shared(); + filtered_list_request->list_request_type = list_request_type; + request = std::move(filtered_list_request); + } + + request->path = path; RequestInfo request_info; - request_info.request = std::make_shared(std::move(request)); request_info.callback = [callback](const Response & response) { callback(dynamic_cast(response)); }; request_info.watch = watch; diff --git 
a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index aa27b0eefe9..023e46f5017 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -181,6 +181,8 @@ public: const Requests & requests, MultiCallback callback) override; + Coordination::KeeperApiVersion getApiVersion() override; + /// Without forcefully invalidating (finalizing) ZooKeeper session before /// establishing a new one, there was a possibility that server is using /// two ZooKeeper sessions simultaneously in different parts of code. @@ -275,8 +277,12 @@ private: void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false); + void initApiVersion(); + CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; + + Coordination::KeeperApiVersion keeper_api_version{Coordination::KeeperApiVersion::V0}; }; } diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 2a3d17af403..db125c45547 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -328,6 +328,7 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo auto idx = state_machine->last_commit_index() + 1; for (const auto & entry : *log_entries) { + LOG_INFO(&Poco::Logger::get("LOGGER"), "Term of log {}", entry->get_term()); if (entry && entry->get_val_type() == nuraft::log_val_type::app_log) state_machine->pre_commit(idx, entry->get_buf()); diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 1b399e8cc92..4ff4217530b 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -133,6 +133,8 @@ nuraft::ptr KeeperStateMachine::pre_commit(uint64_t log_idx, nur if (!request_for_session.zxid) request_for_session.zxid = log_idx; + LOG_INFO(&Poco::Logger::get("LOGGER"), "Preprocessing {} for {} of type {} xid {}", request_for_session.zxid, log_idx, request_for_session.request->getOpNum(), request_for_session.time); + preprocess(request_for_session); return nullptr; } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index fd1fab5b6b0..02ea04e270d 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -232,6 +232,11 @@ KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, Node root_node; container.insert("/", root_node); nodes_digest += root_node.getDigest("/"); + + Node version_node; + version_node.setData(std::to_string(static_cast(Coordination::current_keeper_api_version))); + container.insert(Coordination::keeper_api_version_path, version_node); + nodes_digest += version_node.getDigest(Coordination::keeper_api_version_path); } template @@ -1187,7 +1192,9 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc auto list_request_type = ALL; if (auto * filtered_list = dynamic_cast(&request)) + { list_request_type = filtered_list->list_request_type; + } if (list_request_type == ALL) return true; @@ -1794,6 +1801,9 @@ void KeeperStorage::preprocessRequest( { int64_t last_zxid = getNextZXID() - 1; + if (new_last_zxid == 108367) + std::abort(); + if (uncommitted_transactions.empty()) { // if we have no uncommitted transactions it means the last zxid is possibly loaded from snapshot @@ -1879,6 +1889,8 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest( { if 
(new_last_zxid) { + if (*new_last_zxid == 108366) + LOG_INFO(&Poco::Logger::get("LOGGER"), "Processing {}", *new_last_zxid); if (uncommitted_transactions.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to commit a ZXID ({}) which was not preprocessed", *new_last_zxid); From 8d8084d9f3717163505fc41c92a69bbe75460ccd Mon Sep 17 00:00:00 2001 From: Kerry Clendinning Date: Thu, 7 Jul 2022 10:48:15 -0500 Subject: [PATCH 039/227] Update any.md --- docs/en/sql-reference/aggregate-functions/reference/any.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/any.md b/docs/en/sql-reference/aggregate-functions/reference/any.md index 096f1415d11..0707a2a4f4d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/any.md +++ b/docs/en/sql-reference/aggregate-functions/reference/any.md @@ -4,7 +4,7 @@ sidebar_position: 6 # any -Selects the first encountered value. +Selects the first encountered (non-NULL) value. The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’. From c7967fb721b145958d0948c7596c9105bcc8ee2c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 5 Jul 2022 20:03:51 +0200 Subject: [PATCH 040/227] Add an option to INTO OUTFILE to also print details to standard output. Implementation: - Added a bool to ASTQueryWithOutput & patched the usage in ClientBase. - Added a new buffer TeeWriteBuffer which extends from WriteBufferFromFile (used to write data to the file) and has WriteBufferFromFileDescriptor (used to write data to stdout). The WriteBufferFromFileDescriptor uses the same buffer as TeeWriteBuffer. - Added a new bool select_into_outfile_and_stdout in ClientBase to enable/disable progress rendering. Testing: - Added a test tests/queries/0_stateless/02346_into_outfile_and_stdout.sh Documentation: - Updated the english documentation for the new option in SELECT. --- .../statements/select/into-outfile.md | 3 +- src/Client/ClientBase.cpp | 33 ++++++++----- src/Client/ClientBase.h | 1 + src/IO/TeeWriteBuffer.cpp | 46 +++++++++++++++++++ src/IO/TeeWriteBuffer.h | 34 ++++++++++++++ src/Parsers/ASTQueryWithOutput.h | 1 + src/Parsers/ParserQueryWithOutput.cpp | 7 +++ .../02346_into_outfile_and_stdout.reference | 9 ++++ .../02346_into_outfile_and_stdout.sh | 45 ++++++++++++++++++ 9 files changed, 167 insertions(+), 12 deletions(-) create mode 100644 src/IO/TeeWriteBuffer.cpp create mode 100644 src/IO/TeeWriteBuffer.h create mode 100644 tests/queries/0_stateless/02346_into_outfile_and_stdout.reference create mode 100644 tests/queries/0_stateless/02346_into_outfile_and_stdout.sh diff --git a/docs/en/sql-reference/statements/select/into-outfile.md b/docs/en/sql-reference/statements/select/into-outfile.md index f101c10ff60..eb0cf68eca1 100644 --- a/docs/en/sql-reference/statements/select/into-outfile.md +++ b/docs/en/sql-reference/statements/select/into-outfile.md @@ -11,7 +11,7 @@ Compressed files are supported. Compression type is detected by the extension of **Syntax** ```sql -SELECT INTO OUTFILE file_name [COMPRESSION type [LEVEL level]] +SELECT INTO OUTFILE file_name [AND STDOUT] [COMPRESSION type [LEVEL level]] ``` `file_name` and `type` are string literals. Supported compression types are: `'none'`, `'gzip'`, `'deflate'`, `'br'`, `'xz'`, `'zstd'`, `'lz4'`, `'bz2'`. 
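A usage sketch of the new clause, modelled on the query in the accompanying 02346_into_outfile_and_stdout test (the output file name is illustrative, and the file must not already exist):

```sql
SELECT 1, 2, 3 INTO OUTFILE 'test_into_outfile_and_stdout_select.out' AND STDOUT
```

With `AND STDOUT`, the three values are written to the file and also echoed to standard output, which is what the test's reference output asserts.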
@@ -23,6 +23,7 @@ SELECT INTO OUTFILE file_name [COMPRESSION type [LEVEL level]] - This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail. - The query will fail if a file with the same file name already exists. - The default [output format](../../../interfaces/formats.md) is `TabSeparated` (like in the command-line client batch mode). Use [FORMAT](format.md) clause to change it. +- If `AND STDOUT` is mentioned in the query then the output that is written to the file is also displayed on standard output. **Example** diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 0e243f97aaf..d97f01617a6 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -69,6 +69,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -404,7 +405,6 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query) return; processed_rows += block.rows(); - /// Even if all blocks are empty, we still need to initialize the output stream to write empty resultset. initOutputFormat(block, parsed_query); @@ -415,7 +415,7 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query) return; /// If results are written INTO OUTFILE, we can avoid clearing progress to avoid flicker. - if (need_render_progress && (stdout_is_a_tty || is_interactive) && !select_into_file) + if (need_render_progress && (stdout_is_a_tty || is_interactive) && (!select_into_file || select_into_file_and_stdout)) progress_indication.clearProgressOutput(); try @@ -435,7 +435,7 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query) /// Restore progress bar after data block. if (need_render_progress && (stdout_is_a_tty || is_interactive)) { - if (select_into_file) + if (select_into_file && !select_into_file_and_stdout) std::cerr << "\r"; progress_indication.writeProgress(); } @@ -512,7 +512,7 @@ try String current_format = format; select_into_file = false; - + select_into_file_and_stdout = false; /// The query can specify output format or output file. if (const auto * query_with_output = dynamic_cast(parsed_query.get())) { @@ -548,12 +548,23 @@ try range.first, range.second); } - - out_file_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), - compression_method, - compression_level - ); + if (query_with_output->is_stdout_enabled) + { + select_into_file_and_stdout = true; + out_file_buf = wrapWriteBufferWithCompressionMethod( + std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), + compression_method, + compression_level + ); + } + else + { + out_file_buf = wrapWriteBufferWithCompressionMethod( + std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), + compression_method, + compression_level + ); + } // We are writing to file, so default format is the same as in non-interactive mode. if (is_interactive && is_default_format) @@ -579,7 +590,7 @@ try /// It is not clear how to write progress intermixed with data with parallel formatting. /// It may increase code complexity significantly. - if (!need_render_progress || select_into_file) + if (!need_render_progress || (select_into_file && !select_into_file_and_stdout)) output_format = global_context->getOutputFormatParallelIfPossible( current_format, out_file_buf ? 
*out_file_buf : *out_buf, block); else diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index ec2267a3be6..45c37617647 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -180,6 +180,7 @@ protected: String format; /// Query results output format. bool select_into_file = false; /// If writing result INTO OUTFILE. It affects progress rendering. + bool select_into_file_and_stdout = false; /// If writing result INTO OUTFILE AND STDOUT. It affects progress rendering. bool is_default_format = true; /// false, if format is set in the config or command line. size_t format_max_block_size = 0; /// Max block size for console output. String insert_format; /// Format of INSERT data that is read from stdin in batch mode. diff --git a/src/IO/TeeWriteBuffer.cpp b/src/IO/TeeWriteBuffer.cpp new file mode 100644 index 00000000000..5131fbdeacf --- /dev/null +++ b/src/IO/TeeWriteBuffer.cpp @@ -0,0 +1,46 @@ +#include + +namespace DB +{ + +TeeWriteBuffer::TeeWriteBuffer( + const std::string & file_name_, + size_t buf_size, + int flags, + mode_t mode, + char * existing_memory, + size_t alignment) + : WriteBufferFromFile(file_name_,buf_size,flags,mode,existing_memory,alignment), + stdout_buffer(STDOUT_FILENO,buf_size,working_buffer.begin()) +{ +} + +void TeeWriteBuffer::nextImpl() +{ + try + { + stdout_buffer.position() = position(); + stdout_buffer.next(); + WriteBufferFromFile::nextImpl(); + } + catch (Exception &exception) + { + exception.addMessage("While writing to TeeWriteBuffer "); + throw; + } +} + +void TeeWriteBuffer::finalizeImpl() +{ + if (fd < 0 || stdout_buffer.getFD() < 0) + return; + + next(); +} + +TeeWriteBuffer::~TeeWriteBuffer() +{ + finalize(); +} + +} diff --git a/src/IO/TeeWriteBuffer.h b/src/IO/TeeWriteBuffer.h new file mode 100644 index 00000000000..264ba42b0a8 --- /dev/null +++ b/src/IO/TeeWriteBuffer.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace DB +{ + +/** TeeWriteBuffer extends from WriteBufferFromFile and has + * WriteBufferFromFileDescriptor inside the class which is created + * by using the same buffer as TeeWriteBuffer. 
So both the data are written + * using same buffer + **/ +class TeeWriteBuffer : public WriteBufferFromFile +{ + +public: + explicit TeeWriteBuffer( + const std::string & file_name_, + size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, + int flags = -1, + mode_t mode = 0666, + char * existing_memory = nullptr, + size_t alignment = 0); + + ~TeeWriteBuffer() override; + +protected: + void nextImpl() override; + void finalizeImpl() override; + + WriteBufferFromFileDescriptor stdout_buffer; +}; + +} diff --git a/src/Parsers/ASTQueryWithOutput.h b/src/Parsers/ASTQueryWithOutput.h index a34826d128c..d2a24f1ebe2 100644 --- a/src/Parsers/ASTQueryWithOutput.h +++ b/src/Parsers/ASTQueryWithOutput.h @@ -15,6 +15,7 @@ class ASTQueryWithOutput : public IAST { public: ASTPtr out_file; + bool is_stdout_enabled; ASTPtr format; ASTPtr settings_ast; ASTPtr compression; diff --git a/src/Parsers/ParserQueryWithOutput.cpp b/src/Parsers/ParserQueryWithOutput.cpp index 79203c6d3d1..0f900ed8856 100644 --- a/src/Parsers/ParserQueryWithOutput.cpp +++ b/src/Parsers/ParserQueryWithOutput.cpp @@ -107,6 +107,13 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec } query_with_output.children.push_back(query_with_output.out_file); + + ParserKeyword s_stdout("AND STDOUT"); + if (s_stdout.ignore(pos, expected)) + { + query_with_output.is_stdout_enabled = true; + } + } ParserKeyword s_format("FORMAT"); diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference b/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference new file mode 100644 index 00000000000..09984d86ca3 --- /dev/null +++ b/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference @@ -0,0 +1,9 @@ +performing test: select +1 2 3 +1 2 3 +performing test: bad_query_incorrect_usage +query failed +performing test: bad_query_no_into_outfile +query failed +performing test: bad_query_file_exists +query failed diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh new file mode 100644 index 00000000000..76b308d003c --- /dev/null +++ b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +function perform() +{ + local test_id=$1 + local query=$2 + + echo "performing test: $test_id" + ${CLICKHOUSE_CLIENT} --query "$query" 2>/dev/null + if [ "$?" -eq 0 ]; then + cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + else + echo "query failed" + fi + rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" +} + +function performFileExists() +{ + local test_id=$1 + local query=$2 + + touch "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + + echo "performing test: $test_id" + ${CLICKHOUSE_CLIENT} --query "$query" 2>/dev/null + if [ "$?" 
-eq 0 ]; then + cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + else + echo "query failed" + fi + rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" +} + +perform "select" "SELECT 1, 2, 3 INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_select.out' AND STDOUT" + +perform "bad_query_incorrect_usage" "SELECT 1, 2, 3 INTO OUTFILE AND STDOUT'${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_bad_query_incorrect_usage.out'" + +perform "bad_query_no_into_outfile" "SELECT 1, 2, 3 AND STDOUT'" + +performFileExists "bad_query_file_exists" "SELECT 1, 2, 3 INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_bad_query_file_exists.out' AND STDOUT" \ No newline at end of file From d55d190b9929065c8758a8934c059c4cab0c67d1 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 9 Jul 2022 14:53:53 +0200 Subject: [PATCH 041/227] added execute permission on the 02346_into_outfile_and_stdout.sh test file --- tests/queries/0_stateless/02346_into_outfile_and_stdout.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02346_into_outfile_and_stdout.sh diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh old mode 100644 new mode 100755 From 197388e29cfddef074a6aa106eecd465ff163bcc Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 11 Jul 2022 12:13:50 +0000 Subject: [PATCH 042/227] Fix list request --- src/Common/ZooKeeper/ZooKeeper.cpp | 1 - src/Common/ZooKeeper/ZooKeeperImpl.cpp | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index fd14802c6fa..dc4e309cdfa 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -1086,7 +1086,6 @@ std::future ZooKeeper::asyncTryGetChildrenNoThrow( auto callback = [promise](const Coordination::ListResponse & response) mutable { - LOG_INFO(&Poco::Logger::get("LOGGER"), "Got response {}", response.names.size()); promise->set_value(response); }; diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 76ec4e28613..f4fbbea82f1 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1211,7 +1211,7 @@ void ZooKeeper::list( if (keeper_api_version < Coordination::KeeperApiVersion::V1) { if (list_request_type != ListRequestType::ALL) - throw Exception("Filtered list request type cannot be used because it's not support by the server", Error::ZBADARGUMENTS); + throw Exception("Filtered list request type cannot be used because it's not supported by the server", Error::ZBADARGUMENTS); request = std::make_shared(); } @@ -1227,6 +1227,7 @@ void ZooKeeper::list( RequestInfo request_info; request_info.callback = [callback](const Response & response) { callback(dynamic_cast(response)); }; request_info.watch = watch; + request_info.request = std::move(request); pushRequest(std::move(request_info)); ProfileEvents::increment(ProfileEvents::ZooKeeperList); From 1f5b3b9922210ac94d85c385fc361810da8f211a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 11 Jul 2022 13:06:24 +0000 Subject: [PATCH 043/227] Add restrictions for modification of internal paths --- src/Coordination/KeeperStateMachine.cpp | 2 -- src/Coordination/KeeperStorage.cpp | 20 ++++++++++++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/Coordination/KeeperStateMachine.cpp 
b/src/Coordination/KeeperStateMachine.cpp index 458b5da4909..3c899a268d8 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -146,8 +146,6 @@ nuraft::ptr KeeperStateMachine::pre_commit(uint64_t log_idx, nur if (!request_for_session.zxid) request_for_session.zxid = log_idx; - LOG_INFO(&Poco::Logger::get("LOGGER"), "Preprocessing {} for {} of type {} xid {}", request_for_session.zxid, log_idx, request_for_session.request->getOpNum(), request_for_session.time); - preprocess(request_for_session); return nullptr; } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 02ea04e270d..0a23cf225e7 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -916,6 +916,12 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr std::vector new_deltas; + if (request.path == Coordination::keeper_api_version_path) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to delete an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } + const auto update_parent_pzxid = [&]() { auto parent_path = parentPath(request.path); @@ -1062,6 +1068,12 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce std::vector new_deltas; + if (request.path == Coordination::keeper_api_version_path) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } + if (!storage.uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -1323,6 +1335,12 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr { Coordination::ZooKeeperSetACLRequest & request = dynamic_cast(*zk_request); + if (request.path == Coordination::keeper_api_version_path) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } + auto & uncommitted_state = storage.uncommitted_state; if (!uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -1889,8 +1907,6 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest( { if (new_last_zxid) { - if (*new_last_zxid == 108366) - LOG_INFO(&Poco::Logger::get("LOGGER"), "Processing {}", *new_last_zxid); if (uncommitted_transactions.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to commit a ZXID ({}) which was not preprocessed", *new_last_zxid); From 0555477ec14b7b4abea79216c2ca81fcc1dd7c04 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 12 Jul 2022 07:05:29 +0000 Subject: [PATCH 044/227] Remove debug logs --- src/Coordination/KeeperServer.cpp | 1 - src/Coordination/KeeperStorage.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index db125c45547..2a3d17af403 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -328,7 +328,6 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo auto idx = state_machine->last_commit_index() + 1; for (const auto & entry : 
*log_entries) { - LOG_INFO(&Poco::Logger::get("LOGGER"), "Term of log {}", entry->get_term()); if (entry && entry->get_val_type() == nuraft::log_val_type::app_log) state_machine->pre_commit(idx, entry->get_buf()); diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 0a23cf225e7..845e97b63ab 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -1819,9 +1819,6 @@ void KeeperStorage::preprocessRequest( { int64_t last_zxid = getNextZXID() - 1; - if (new_last_zxid == 108367) - std::abort(); - if (uncommitted_transactions.empty()) { // if we have no uncommitted transactions it means the last zxid is possibly loaded from snapshot From ee4828b08417f4a7cc1dc36b2790e80e34ffcc5b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 13 Jul 2022 09:23:05 +0000 Subject: [PATCH 045/227] Fix list watches --- src/Coordination/KeeperStorage.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 845e97b63ab..f11b3473805 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -1993,8 +1993,10 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest( { if (response->error == Coordination::Error::ZOK) { - auto & watches_type - = zk_request->getOpNum() == Coordination::OpNum::List || zk_request->getOpNum() == Coordination::OpNum::SimpleList + static constexpr std::array list_requests{ + Coordination::OpNum::List, Coordination::OpNum::SimpleList, Coordination::OpNum::FilteredList}; + + auto & watches_type = std::find(list_requests.begin(), list_requests.end(), zk_request->getOpNum()) != list_requests.end() ? list_watches : watches; From c296e84cabb8fbf47dfcf1e5f1194f4ab7d72cfa Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 12 Jul 2022 07:52:50 +0000 Subject: [PATCH 046/227] fix unit tests in debug mode --- src/Coordination/tests/gtest_coordination.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index bd0d329ef8d..669c19438e2 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1064,6 +1064,13 @@ void addNode(DB::KeeperStorage & storage, const std::string & path, const std::s node.setData(data); node.stat.ephemeralOwner = ephemeral_owner; storage.container.insertOrReplace(path, node); + auto child_it = storage.container.find(path); + auto child_path = DB::getBaseName(child_it->key); + storage.container.updateValue(DB::parentPath(StringRef{path}), [&](auto & parent) + { + parent.addChild(child_path); + parent.stat.numChildren++; + }); } TEST_P(CoordinationTest, TestStorageSnapshotSimple) @@ -1221,7 +1228,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) storage.container.erase("/hello_" + std::to_string(i)); } EXPECT_EQ(storage.container.size(), 26); - EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 101); + EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 102); EXPECT_EQ(storage.container.snapshotSizeWithVersion().second, 1); auto buf = manager.serializeSnapshotToBuffer(snapshot); manager.serializeSnapshotBufferToDisk(*buf, 50); @@ -1776,6 +1783,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotEqual) DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); DB::KeeperStorage storage(500, "", true); + addNode(storage, "/hello", ""); for (size_t j 
= 0; j < 5000; ++j) { addNode(storage, "/hello_" + std::to_string(j), "world", 1); From 3b5bdd1e2ae255ec7d89ca56ef33ba126be6492e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 13 Jul 2022 12:41:16 +0000 Subject: [PATCH 047/227] Add test for current API --- src/Coordination/KeeperStorage.cpp | 18 +++++++++++++----- src/Coordination/tests/gtest_coordination.cpp | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index f11b3473805..d07caeaf496 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -232,11 +232,6 @@ KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, Node root_node; container.insert("/", root_node); nodes_digest += root_node.getDigest("/"); - - Node version_node; - version_node.setData(std::to_string(static_cast(Coordination::current_keeper_api_version))); - container.insert(Coordination::keeper_api_version_path, version_node); - nodes_digest += version_node.getDigest(Coordination::keeper_api_version_path); } template @@ -848,6 +843,9 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce { Coordination::ZooKeeperGetRequest & request = dynamic_cast(*zk_request); + if (request.path == Coordination::keeper_api_version_path) + return {}; + if (!storage.uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -870,6 +868,16 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce } } + // We cannot store the node because the result should be connected to the binary itself + // this way we avoid incorrect results when we read a snapshot from older Keeper that can have + // lower API version + if (request.path == Coordination::keeper_api_version_path) + { + response.data = std::to_string(static_cast(Coordination::current_keeper_api_version)); + response.error = Coordination::Error::ZOK; + return response_ptr; + } + auto & container = storage.container; auto node_it = container.find(request.path); if (node_it == container.end()) diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 669c19438e2..9d1435eca1f 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2043,6 +2043,20 @@ TEST_P(CoordinationTest, TestListRequestTypes) } } +TEST_P(CoordinationTest, TestCurrentApiVersion) +{ + using namespace Coordination; + KeeperStorage storage{500, "", true}; + auto request = std::make_shared(); + request->path = Coordination::keeper_api_version_path; + auto responses = storage.processRequest(request, 0, std::nullopt, true, true); + const auto & get_response = getSingleResponse(responses); + uint8_t keeper_version{0}; + DB::ReadBufferFromOwnString buf(get_response.data); + DB::readIntText(keeper_version, buf); + EXPECT_EQ(keeper_version, current_keeper_api_version); +} + INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ From 93d8b8bb61318059696559b101c10cbde2e0a52b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 13 Jul 2022 13:01:13 +0000 Subject: [PATCH 048/227] small fixes --- src/Common/ZooKeeper/TestKeeper.h | 2 +- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 2 +- src/Interpreters/ZooKeeperLog.cpp | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Common/ZooKeeper/TestKeeper.h 
b/src/Common/ZooKeeper/TestKeeper.h index 9ea2eef8f41..cf2126fe18e 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -90,7 +90,7 @@ public: void finalize(const String & reason) override; - Coordination::KeeperApiVersion getApiVersion() override + Coordination::KeeperApiVersion getApiVersion() override { return KeeperApiVersion::V0; } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index f4fbbea82f1..a0544935e25 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1085,7 +1085,7 @@ void ZooKeeper::initApiVersion() if (response.error != Coordination::Error::ZOK) return; - uint8_t keeper_version{0}; + uint8_t keeper_version{0}; DB::ReadBufferFromOwnString buf(response.data); DB::readIntText(keeper_version, buf); keeper_api_version = static_cast(keeper_version); diff --git a/src/Interpreters/ZooKeeperLog.cpp b/src/Interpreters/ZooKeeperLog.cpp index 6394b1d5429..4f01ad8b5fb 100644 --- a/src/Interpreters/ZooKeeperLog.cpp +++ b/src/Interpreters/ZooKeeperLog.cpp @@ -85,6 +85,7 @@ NamesAndTypesList ZooKeeperLogElement::getNamesAndTypes() {"Multi", static_cast(Coordination::OpNum::Multi)}, {"Auth", static_cast(Coordination::OpNum::Auth)}, {"SessionID", static_cast(Coordination::OpNum::SessionID)}, + {"FilteredList", static_cast(Coordination::OpNum::FilteredList)}, }); auto error_enum = getCoordinationErrorCodesEnumType(); From ad6b50b087086fef8aa6f0f72b3a42f014266763 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 13 Jul 2022 17:34:20 +0300 Subject: [PATCH 049/227] Forbid defining non-default disk with default path from Suppose you have the following configuration: /var/lib/clickhouse/ /var/lib/clickhouse/

    <clickhouse>
        <path>/var/lib/clickhouse/</path>
        <storage_configuration>
            <disks>
                <data>
                    <path>/var/lib/clickhouse/</path>
                </data>
            </disks>
        </storage_configuration>
    </clickhouse>

In this case disks will have two disks: - 'data' disk with path '/var/lib/clickhouse/' - 'default' disk with path '/var/lib/clickhouse/' And in this case MergeTree engine will complain on ATTACH for table that uses 'default' policy: 2022.06.20 07:49:15.165393 [ 242 ] {e8f50978-218a-426f-babc-637a8d03b1c6} TCPHandler: Code: 479. DB::Exception: Part `0_0_0_1` was found on disk `default` which is not defined in the storage policy. (UNKNOWN_DISK), Stack trace (when copying this message, always include the lines below): Signed-off-by: Azat Khuzhin --- src/Disks/DiskLocal.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 31deac88a19..e793f4dfb5a 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -68,6 +68,8 @@ static void loadDiskLocalConfig(const String & name, throw Exception("Disk path can not be empty. Disk " + name, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); if (path.back() != '/') throw Exception("Disk path must end with /. Disk " + name, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); + if (path == context->getPath()) + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "Disk path ('{}') cannot be equal to . Use disk instead.", path); } bool has_space_ratio = config.has(config_prefix + ".keep_free_space_ratio"); From d81758898a2b3f2235a79cdd51b155e0611a0351 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 13 Jul 2022 15:26:25 +0000 Subject: [PATCH 050/227] Remove extra newline --- src/Coordination/tests/gtest_coordination.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index f91ef943901..63edcf15508 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2134,7 +2134,6 @@ TEST_P(CoordinationTest, TestCurrentApiVersion) EXPECT_EQ(keeper_version, current_keeper_api_version); } - INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ From 0e25dbbbebb79404b289358727b0bdf9e69ec20a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 14 Jul 2022 08:01:20 +0000 Subject: [PATCH 051/227] Fix stateless test --- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 5 ++++- tests/queries/0_stateless/01158_zookeeper_log_long.sql | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index b2b3c8f6f13..b15126f5701 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -724,7 +724,10 @@ void ZooKeeperResponse::fillLogElements(LogElements & elems, size_t idx) const assert(!elem.xid || elem.xid == xid); elem.xid = xid; int32_t response_op = tryGetOpNum(); - assert(!elem.op_num || elem.op_num == response_op || response_op < 0); + + [[maybe_unused]] const bool is_filtered_list = elem.op_num == static_cast(Coordination::OpNum::FilteredList) + && response_op == static_cast(Coordination::OpNum::List); + assert(!elem.op_num || elem.op_num == response_op || is_filtered_list || response_op < 0); elem.op_num = response_op; elem.zxid = zxid; diff --git a/tests/queries/0_stateless/01158_zookeeper_log_long.sql b/tests/queries/0_stateless/01158_zookeeper_log_long.sql index 61a36df68d8..6048169a3d8 100644 --- a/tests/queries/0_stateless/01158_zookeeper_log_long.sql +++ b/tests/queries/0_stateless/01158_zookeeper_log_long.sql @@ -13,7 +13,7 @@ system flush logs; select 'log'; select 
address, type, has_watch, op_num, path, is_ephemeral, is_sequential, version, requests_size, request_idx, error, watch_type, watch_state, path_created, stat_version, stat_cversion, stat_dataLength, stat_numChildren -from system.zookeeper_log where path like '/test/01158/' || currentDatabase() || '/rmt/log%' and op_num not in (3, 4, 12) +from system.zookeeper_log where path like '/test/01158/' || currentDatabase() || '/rmt/log%' and op_num not in (3, 4, 12, 500) order by xid, type, request_idx; select 'parts'; @@ -27,7 +27,7 @@ select 'blocks'; select type, has_watch, op_num, path, is_ephemeral, is_sequential, version, requests_size, request_idx, error, watch_type, watch_state, path_created, stat_version, stat_cversion, stat_dataLength, stat_numChildren from system.zookeeper_log -where (session_id, xid) in (select session_id, xid from system.zookeeper_log where path like '/test/01158/' || currentDatabase() || '/rmt/blocks%' and op_num not in (1, 12)) +where (session_id, xid) in (select session_id, xid from system.zookeeper_log where path like '/test/01158/' || currentDatabase() || '/rmt/blocks%' and op_num not in (1, 12, 500)) order by xid, type, request_idx; drop table rmt; From 96bb6e0cd2d6195a2bddd684195aecab899b0d13 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 14 Jul 2022 21:00:11 +0800 Subject: [PATCH 052/227] Rename log when rename merge tree tables --- src/Common/logger_useful.h | 15 +++--- src/Storages/MergeTree/MergeTreeData.cpp | 46 ++++++++++++++----- src/Storages/MergeTree/MergeTreeData.h | 11 +++-- ...rename_table_along_with_log_name.reference | 0 .../02360_rename_table_along_with_log_name.sh | 18 ++++++++ 5 files changed, 69 insertions(+), 21 deletions(-) create mode 100644 tests/queries/0_stateless/02360_rename_table_along_with_log_name.reference create mode 100755 tests/queries/0_stateless/02360_rename_table_along_with_log_name.sh diff --git a/src/Common/logger_useful.h b/src/Common/logger_useful.h index ad7d6583f5e..1e84efd8085 100644 --- a/src/Common/logger_useful.h +++ b/src/Common/logger_useful.h @@ -14,8 +14,10 @@ namespace template constexpr auto firstArg(T && x, Ts &&...) { return std::forward(x); } /// For implicit conversion of fmt::basic_runtime<> to char* for std::string ctor template constexpr auto firstArg(fmt::basic_runtime && data, Ts &&...) { return data.str.data(); } -} + [[maybe_unused]] const ::Poco::Logger * getLogger(const ::Poco::Logger * logger) { return logger; }; + [[maybe_unused]] const ::Poco::Logger * getLogger(const std::atomic<::Poco::Logger *> & logger) { return logger.load(); }; +} /// Logs a message to a specified logger with that level. /// If more than one argument is provided, @@ -25,20 +27,21 @@ namespace #define LOG_IMPL(logger, priority, PRIORITY, ...) do \ { \ - const bool is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \ + auto _logger = ::getLogger(logger); \ + const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \ (DB::CurrentThread::getGroup()->client_logs_level >= (priority)); \ - if ((logger)->is((PRIORITY)) || is_clients_log) \ + if (_logger->is((PRIORITY)) || _is_clients_log) \ { \ std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? 
fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \ - if (auto channel = (logger)->getChannel()) \ + if (auto _channel = _logger->getChannel()) \ { \ std::string file_function; \ file_function += __FILE__; \ file_function += "; "; \ file_function += __PRETTY_FUNCTION__; \ - Poco::Message poco_message((logger)->name(), formatted_message, \ + Poco::Message poco_message(_logger->name(), formatted_message, \ (PRIORITY), file_function.c_str(), __LINE__); \ - channel->log(poco_message); \ + _channel->log(poco_message); \ } \ } \ } while (false) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 5900ea0fdb7..ef315e865ec 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -216,8 +216,8 @@ MergeTreeData::MergeTreeData( , require_part_metadata(require_part_metadata_) , relative_data_path(relative_data_path_) , broken_part_callback(broken_part_callback_) - , log_name(table_id_.getNameForLogs()) - , log(&Poco::Logger::get(log_name)) + , log_name(std::make_shared(table_id_.getNameForLogs())) + , log(&Poco::Logger::get(*log_name)) , storage_settings(std::move(storage_settings_)) , pinned_part_uuids(std::make_shared()) , data_parts_by_info(data_parts_indexes.get()) @@ -2033,8 +2033,13 @@ void MergeTreeData::rename(const String & new_table_path, const StorageID & new_ relative_data_path = new_table_path; renameInMemory(new_table_id); +} - +void MergeTreeData::renameInMemory(const StorageID & new_table_id) +{ + IStorage::renameInMemory(new_table_id); + std::atomic_store(&log_name, std::make_shared(new_table_id.getNameForLogs())); + log = &Poco::Logger::get(*log_name); } void MergeTreeData::dropAllData() @@ -4734,11 +4739,19 @@ ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules( if (!destination_ptr) { if (move_ttl_entry->destination_type == DataDestinationType::VOLUME && !move_ttl_entry->if_exists) - LOG_WARNING(log, "Would like to reserve space on volume '{}' by TTL rule of table '{}' but volume was not found or rule is not applicable at the moment", - move_ttl_entry->destination_name, log_name); + LOG_WARNING( + log, + "Would like to reserve space on volume '{}' by TTL rule of table '{}' but volume was not found or rule is not " + "applicable at the moment", + move_ttl_entry->destination_name, + *std::atomic_load(&log_name)); else if (move_ttl_entry->destination_type == DataDestinationType::DISK && !move_ttl_entry->if_exists) - LOG_WARNING(log, "Would like to reserve space on disk '{}' by TTL rule of table '{}' but disk was not found or rule is not applicable at the moment", - move_ttl_entry->destination_name, log_name); + LOG_WARNING( + log, + "Would like to reserve space on disk '{}' by TTL rule of table '{}' but disk was not found or rule is not applicable " + "at the moment", + move_ttl_entry->destination_name, + *std::atomic_load(&log_name)); } else { @@ -4747,11 +4760,17 @@ ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules( return reservation; else if (move_ttl_entry->destination_type == DataDestinationType::VOLUME) - LOG_WARNING(log, "Would like to reserve space on volume '{}' by TTL rule of table '{}' but there is not enough space", - move_ttl_entry->destination_name, log_name); + LOG_WARNING( + log, + "Would like to reserve space on volume '{}' by TTL rule of table '{}' but there is not enough space", + move_ttl_entry->destination_name, + *std::atomic_load(&log_name)); else if (move_ttl_entry->destination_type == DataDestinationType::DISK) - LOG_WARNING(log, "Would 
like to reserve space on disk '{}' by TTL rule of table '{}' but there is not enough space", - move_ttl_entry->destination_name, log_name); + LOG_WARNING( + log, + "Would like to reserve space on disk '{}' by TTL rule of table '{}' but there is not enough space", + move_ttl_entry->destination_name, + *std::atomic_load(&log_name)); } } @@ -6590,7 +6609,10 @@ bool MergeTreeData::insertQueryIdOrThrowNoLock(const String & query_id, size_t m return false; if (query_id_set.size() >= max_queries) throw Exception( - ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, "Too many simultaneous queries for table {}. Maximum is: {}", log_name, max_queries); + ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous queries for table {}. Maximum is: {}", + *std::atomic_load(&log_name), + max_queries); query_id_set.insert(query_id); return true; } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 0b6e757ab49..be338d52978 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -456,7 +456,7 @@ public: /// Load the set of data parts from disk. Call once - immediately after the object is created. void loadDataParts(bool skip_sanity_checks); - String getLogName() const { return log_name; } + String getLogName() const { return *std::atomic_load(&log_name); } Int64 getMaxBlockNumber() const; @@ -649,6 +649,9 @@ public: /// because changes relative_data_path. void rename(const String & new_table_path, const StorageID & new_table_id) override; + /// Also rename log names. + void renameInMemory(const StorageID & new_table_id) override; + /// Check if the ALTER can be performed: /// - all needed columns are present. /// - all type conversions can be done. @@ -1021,8 +1024,10 @@ protected: /// Engine-specific methods BrokenPartCallback broken_part_callback; - String log_name; - Poco::Logger * log; + /// log_name will change during table RENAME. Use atomic_shared_ptr to allow concurrent RW. + /// NOTE clang-14 doesn't have atomic_shared_ptr yet. Use std::atomic* operations for now. + std::shared_ptr log_name; + std::atomic log; /// Storage settings. /// Use get and set to receive readonly versions. diff --git a/tests/queries/0_stateless/02360_rename_table_along_with_log_name.reference b/tests/queries/0_stateless/02360_rename_table_along_with_log_name.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02360_rename_table_along_with_log_name.sh b/tests/queries/0_stateless/02360_rename_table_along_with_log_name.sh new file mode 100755 index 00000000000..e8c7f844b5c --- /dev/null +++ b/tests/queries/0_stateless/02360_rename_table_along_with_log_name.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +[ ! 
-z "$CLICKHOUSE_CLIENT_REDEFINED" ] && CLICKHOUSE_CLIENT=$CLICKHOUSE_CLIENT_REDEFINED + +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS x;" +$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS y;" +$CLICKHOUSE_CLIENT -q "CREATE TABLE x(i int) ENGINE MergeTree ORDER BY i;" +$CLICKHOUSE_CLIENT -q "RENAME TABLE x TO y;" + +CLICKHOUSE_CLIENT_WITH_LOG=$(echo ${CLICKHOUSE_CLIENT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=trace/g') +regexp="${CLICKHOUSE_DATABASE}\\.x" # Check if there are still log entries with old table name +$CLICKHOUSE_CLIENT_WITH_LOG --send_logs_source_regexp "$regexp" -q "INSERT INTO y VALUES(1);" + +$CLICKHOUSE_CLIENT -q "DROP TABLE y;" From 03897589b90edf05a2eda9af1a9d8cae5a37bfca Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 14 Jul 2022 21:31:17 +0200 Subject: [PATCH 053/227] Fix --- src/IO/ReadWriteBufferFromHTTP.h | 89 +++++++++++++++++++++++++------- 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index c450ffe1747..ab358c8253a 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -44,6 +44,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int CANNOT_SEEK_THROUGH_FILE; extern const int SEEK_POSITION_OUT_OF_BOUND; + extern const int UNKNOWN_FILE_SIZE; } template @@ -119,6 +120,7 @@ namespace detail size_t offset_from_begin_pos = 0; Range read_range; + std::optional file_size; /// Delayed exception in case retries with partial content are not satisfiable. std::exception_ptr exception; @@ -201,11 +203,11 @@ namespace detail size_t getFileSize() override { - if (read_range.end) - return *read_range.end - getRangeBegin(); + if (file_size) + return *file_size; Poco::Net::HTTPResponse response; - for (size_t i = 0; i < 10; ++i) + for (size_t i = 0; i < settings.http_max_tries; ++i) { try { @@ -214,20 +216,30 @@ namespace detail } catch (const Poco::Exception & e) { + if (i == settings.http_max_tries - 1) + throw; + LOG_ERROR(log, "Failed to make HTTP_HEAD request to {}. Error: {}", uri.toString(), e.displayText()); } } if (response.hasContentLength()) - read_range.end = getRangeBegin() + response.getContentLength(); + { + if (!read_range.end) + read_range.end = getRangeBegin() + response.getContentLength(); - return *read_range.end; + file_size = response.getContentLength(); + return *file_size; + } + + throw Exception(ErrorCodes::UNKNOWN_FILE_SIZE, "Cannot find out file size for: {}", uri.toString()); } String getFileName() const override { return uri.toString(); } enum class InitializeError { + RETRIABLE_ERROR, /// If error is not retriable, `exception` variable must be set. NON_RETRIABLE_ERROR, /// Allows to skip not found urls for globs @@ -401,19 +413,30 @@ namespace detail saved_uri_redirect = uri_redirect; } + if (response.hasContentLength()) + LOG_DEBUG(log, "Received response with content length: {}", response.getContentLength()); + if (withPartialContent() && response.getStatus() != Poco::Net::HTTPResponse::HTTPStatus::HTTP_PARTIAL_CONTENT) { /// Having `200 OK` instead of `206 Partial Content` is acceptable in case we retried with range.begin == 0. if (read_range.begin && *read_range.begin != 0) { if (!exception) + { exception = std::make_exception_ptr(Exception( ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE, - "Cannot read with range: [{}, {}]", + "Cannot read with range: [{}, {}] (response status: {}, reason: {})", *read_range.begin, - read_range.end ? *read_range.end : '-')); + read_range.end ? 
toString(*read_range.end) : "-", + toString(response.getStatus()), response.getReason())); + } + + /// Retry 200OK + if (response.getStatus() == Poco::Net::HTTPResponse::HTTPStatus::HTTP_OK) + initialization_error = InitializeError::RETRIABLE_ERROR; + else + initialization_error = InitializeError::NON_RETRIABLE_ERROR; - initialization_error = InitializeError::NON_RETRIABLE_ERROR; return; } else if (read_range.end) @@ -481,6 +504,15 @@ namespace detail bool result = false; size_t milliseconds_to_wait = settings.http_retry_initial_backoff_ms; + auto on_retriable_error = [&]() + { + retry_with_range_header = true; + impl.reset(); + auto http_session = session->getSession(); + http_session->reset(); + sleepForMilliseconds(milliseconds_to_wait); + }; + for (size_t i = 0; i < settings.http_max_tries; ++i) { try @@ -488,14 +520,35 @@ namespace detail if (!impl) { initialize(); - if (initialization_error == InitializeError::NON_RETRIABLE_ERROR) + switch (initialization_error) { - assert(exception); - break; - } - else if (initialization_error == InitializeError::SKIP_NOT_FOUND_URL) - { - return false; + case InitializeError::NON_RETRIABLE_ERROR: + { + assert(exception); + break; + } + case InitializeError::SKIP_NOT_FOUND_URL: + { + return false; + } + case InitializeError::RETRIABLE_ERROR: + { + LOG_ERROR( + log, + "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. " + "(Current backoff wait is {}/{} ms)", + uri.toString(), i + 1, settings.http_max_tries, getOffset(), + read_range.end ? toString(*read_range.end) : "unknown", + milliseconds_to_wait, settings.http_retry_max_backoff_ms); + + assert(exception); + on_retriable_error(); + continue; + } + case InitializeError::NONE: + { + break; + } } if (use_external_buffer) @@ -531,12 +584,8 @@ namespace detail milliseconds_to_wait, settings.http_retry_max_backoff_ms); - retry_with_range_header = true; + on_retriable_error(); exception = std::current_exception(); - impl.reset(); - auto http_session = session->getSession(); - http_session->reset(); - sleepForMilliseconds(milliseconds_to_wait); } milliseconds_to_wait = std::min(milliseconds_to_wait * 2, settings.http_retry_max_backoff_ms); From 1dc2187f9cfbfbdb7aca0c015b03ea4150796ade Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 14 Jul 2022 22:14:46 +0200 Subject: [PATCH 054/227] Addressed review comments Implementation: - Added a new buffer ForkWriteBuffer takes a vector of WriteBuffer and writes data to all of them. It uses the buffer of the first element as its buffer and copies data from first buffer to all the other buffers Testing: - Updated tests/queries/0_stateless/02346_into_outfile_and_stdout.sh Documentation: - Updated the english documentation for SELECT.. INTO OUTFILE with AND STDOUT. 
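For illustration, a minimal sketch of how the new buffer is wired up on the client side for `INTO OUTFILE ... AND STDOUT`. It only restates the ClientBase.cpp change below in isolation: the helper name `makeOutFileAndStdoutBuffer` is hypothetical, and the header paths and the `wrapWriteBufferWithCompressionMethod` call are assumed to match the existing IO helpers used in the diff.

```cpp
// Sketch only: fan query output out to a (possibly compressed) file and to stdout.
// ForkWriteBuffer uses the first buffer as its working buffer and copies everything
// written into it to the remaining buffers, so the file receives the compressed
// stream while stdout receives the plain text produced by the output format.
#include <IO/ForkWriteBuffer.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/CompressionMethod.h>
#include <fcntl.h>

using namespace DB;

/// Hypothetical helper; in the patch this logic sits inline in ClientBase.
std::unique_ptr<WriteBuffer> makeOutFileAndStdoutBuffer(
    const String & out_file, CompressionMethod compression_method, int compression_level)
{
    /// File sink, optionally wrapped with a compressing buffer (only the file is compressed).
    WriteBufferPtr file_buf = wrapWriteBufferWithCompressionMethod(
        std::make_unique<WriteBufferFromFile>(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT),
        compression_method,
        compression_level);

    /// The first element supplies the working buffer; stdout is simply another sink.
    return std::make_unique<ForkWriteBuffer>(std::vector<WriteBufferPtr>{
        file_buf, std::make_shared<WriteBufferFromFileDescriptor>(STDOUT_FILENO)});
}
```

Because `nextImpl()` iterates over the whole vector of sinks, the same buffer could in principle fan output out to more than two destinations; the client only ever needs the file-plus-stdout pair.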
--- .../statements/select/into-outfile.md | 2 +- src/Client/ClientBase.cpp | 12 ++- src/IO/ForkWriteBuffer.cpp | 83 +++++++++++++++++++ src/IO/ForkWriteBuffer.h | 35 ++++++++ src/IO/TeeWriteBuffer.cpp | 46 ---------- src/IO/TeeWriteBuffer.h | 34 -------- src/Parsers/ASTQueryWithOutput.h | 2 +- src/Parsers/ParserQueryWithOutput.cpp | 12 +-- .../02346_into_outfile_and_stdout.reference | 13 ++- .../02346_into_outfile_and_stdout.sh | 54 +++++++++--- 10 files changed, 186 insertions(+), 107 deletions(-) create mode 100644 src/IO/ForkWriteBuffer.cpp create mode 100644 src/IO/ForkWriteBuffer.h delete mode 100644 src/IO/TeeWriteBuffer.cpp delete mode 100644 src/IO/TeeWriteBuffer.h diff --git a/docs/en/sql-reference/statements/select/into-outfile.md b/docs/en/sql-reference/statements/select/into-outfile.md index eb0cf68eca1..6e33673a3c0 100644 --- a/docs/en/sql-reference/statements/select/into-outfile.md +++ b/docs/en/sql-reference/statements/select/into-outfile.md @@ -23,7 +23,7 @@ SELECT INTO OUTFILE file_name [AND STDOUT] [COMPRESSION type [LEVEL - This functionality is available in the [command-line client](../../../interfaces/cli.md) and [clickhouse-local](../../../operations/utilities/clickhouse-local.md). Thus a query sent via [HTTP interface](../../../interfaces/http.md) will fail. - The query will fail if a file with the same file name already exists. - The default [output format](../../../interfaces/formats.md) is `TabSeparated` (like in the command-line client batch mode). Use [FORMAT](format.md) clause to change it. -- If `AND STDOUT` is mentioned in the query then the output that is written to the file is also displayed on standard output. +- If `AND STDOUT` is mentioned in the query then the output that is written to the file is also displayed on standard output. If used with compression, the plaintext is displayed on standard output. 
**Example** diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index d97f01617a6..cbb5ec9f004 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -69,7 +69,7 @@ #include #include #include -#include +#include namespace fs = std::filesystem; @@ -548,14 +548,18 @@ try range.first, range.second); } - if (query_with_output->is_stdout_enabled) + + if (query_with_output->is_into_outfile_with_stdout) { select_into_file_and_stdout = true; - out_file_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), + WriteBufferPtr file_buf = wrapWriteBufferWithCompressionMethod( + std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), compression_method, compression_level ); + + out_file_buf = std::make_unique(std::vector{file_buf, + std::make_shared(STDOUT_FILENO)}); } else { diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp new file mode 100644 index 00000000000..ef39c9241db --- /dev/null +++ b/src/IO/ForkWriteBuffer.cpp @@ -0,0 +1,83 @@ +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int CANNOT_CREATE_IO_BUFFER; +} + +ForkWriteBuffer::ForkWriteBuffer(WriteBufferPtrs && sources_) + : WriteBuffer(nullptr, 0), sources(std::move(sources_)) +{ + if (sources.empty()) + { + first_buffer = nullptr; + throw Exception("ForkWriteBuffer required WriteBuffer is not provided", ErrorCodes::CANNOT_CREATE_IO_BUFFER); + } + else + { + first_buffer = sources.begin()->get(); + set(first_buffer->buffer().begin(), first_buffer->buffer().size()); + } +} + + +void ForkWriteBuffer::nextImpl() +{ + if (!first_buffer) + return; + + first_buffer->position() = position(); + + try + { + for (auto write_buffer :sources | std::views::reverse) + { + if (write_buffer.get() != first_buffer) + { + //if buffer size if not enough to write, then split the message with buffer length + if (write_buffer->available() < first_buffer->offset()) + { + size_t bytes_written = 0; + auto to_be_written = first_buffer->offset(); + while (to_be_written != 0) + { + int bytes_to_copy = std::min(to_be_written, write_buffer->available()); + write_buffer->write(first_buffer->buffer().begin()+bytes_written, bytes_to_copy); + write_buffer->next(); + bytes_written += bytes_to_copy; + to_be_written -= bytes_to_copy; + } + } + else + write_buffer->write(first_buffer->buffer().begin(), first_buffer->offset()); + } + write_buffer->next(); + } + } + catch (Exception & exception) + { + exception.addMessage("While writing to ForkWriteBuffer"); + throw; + } + +} + +void ForkWriteBuffer::finalizeImpl() +{ + next(); +} + + +ForkWriteBuffer::~ForkWriteBuffer() +{ + finalize(); +} + + +} diff --git a/src/IO/ForkWriteBuffer.h b/src/IO/ForkWriteBuffer.h new file mode 100644 index 00000000000..63267fcd8d7 --- /dev/null +++ b/src/IO/ForkWriteBuffer.h @@ -0,0 +1,35 @@ +#pragma once +#include + + +namespace DB +{ + +namespace ErrorCodes +{ +} + +/** ForkWriteBuffer takes a vector of WriteBuffer and writes data to all of them + * If the vector of WriteBufferPts is empty, then it throws an error + * It uses the buffer of the first element as its buffer and copies data from + * first buffer to all the other buffers + **/ +class ForkWriteBuffer : public WriteBuffer +{ +public: + + using WriteBufferPtrs = std::vector; + + explicit ForkWriteBuffer(WriteBufferPtrs && sources_); + ~ForkWriteBuffer() override; + +protected: + void nextImpl() override; + void finalizeImpl() override; + 
+private: + WriteBufferPtrs sources; + WriteBuffer *first_buffer; +}; + +} diff --git a/src/IO/TeeWriteBuffer.cpp b/src/IO/TeeWriteBuffer.cpp deleted file mode 100644 index 5131fbdeacf..00000000000 --- a/src/IO/TeeWriteBuffer.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include - -namespace DB -{ - -TeeWriteBuffer::TeeWriteBuffer( - const std::string & file_name_, - size_t buf_size, - int flags, - mode_t mode, - char * existing_memory, - size_t alignment) - : WriteBufferFromFile(file_name_,buf_size,flags,mode,existing_memory,alignment), - stdout_buffer(STDOUT_FILENO,buf_size,working_buffer.begin()) -{ -} - -void TeeWriteBuffer::nextImpl() -{ - try - { - stdout_buffer.position() = position(); - stdout_buffer.next(); - WriteBufferFromFile::nextImpl(); - } - catch (Exception &exception) - { - exception.addMessage("While writing to TeeWriteBuffer "); - throw; - } -} - -void TeeWriteBuffer::finalizeImpl() -{ - if (fd < 0 || stdout_buffer.getFD() < 0) - return; - - next(); -} - -TeeWriteBuffer::~TeeWriteBuffer() -{ - finalize(); -} - -} diff --git a/src/IO/TeeWriteBuffer.h b/src/IO/TeeWriteBuffer.h deleted file mode 100644 index 264ba42b0a8..00000000000 --- a/src/IO/TeeWriteBuffer.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -/** TeeWriteBuffer extends from WriteBufferFromFile and has - * WriteBufferFromFileDescriptor inside the class which is created - * by using the same buffer as TeeWriteBuffer. So both the data are written - * using same buffer - **/ -class TeeWriteBuffer : public WriteBufferFromFile -{ - -public: - explicit TeeWriteBuffer( - const std::string & file_name_, - size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, - int flags = -1, - mode_t mode = 0666, - char * existing_memory = nullptr, - size_t alignment = 0); - - ~TeeWriteBuffer() override; - -protected: - void nextImpl() override; - void finalizeImpl() override; - - WriteBufferFromFileDescriptor stdout_buffer; -}; - -} diff --git a/src/Parsers/ASTQueryWithOutput.h b/src/Parsers/ASTQueryWithOutput.h index d2a24f1ebe2..892d911e2e2 100644 --- a/src/Parsers/ASTQueryWithOutput.h +++ b/src/Parsers/ASTQueryWithOutput.h @@ -15,7 +15,7 @@ class ASTQueryWithOutput : public IAST { public: ASTPtr out_file; - bool is_stdout_enabled; + bool is_into_outfile_with_stdout; ASTPtr format; ASTPtr settings_ast; ASTPtr compression; diff --git a/src/Parsers/ParserQueryWithOutput.cpp b/src/Parsers/ParserQueryWithOutput.cpp index 0f900ed8856..6107bd2a5eb 100644 --- a/src/Parsers/ParserQueryWithOutput.cpp +++ b/src/Parsers/ParserQueryWithOutput.cpp @@ -90,6 +90,12 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec if (!out_file_p.parse(pos, query_with_output.out_file, expected)) return false; + ParserKeyword s_stdout("AND STDOUT"); + if (s_stdout.ignore(pos, expected)) + { + query_with_output.is_into_outfile_with_stdout = true; + } + ParserKeyword s_compression_method("COMPRESSION"); if (s_compression_method.ignore(pos, expected)) { @@ -108,12 +114,6 @@ bool ParserQueryWithOutput::parseImpl(Pos & pos, ASTPtr & node, Expected & expec query_with_output.children.push_back(query_with_output.out_file); - ParserKeyword s_stdout("AND STDOUT"); - if (s_stdout.ignore(pos, expected)) - { - query_with_output.is_stdout_enabled = true; - } - } ParserKeyword s_format("FORMAT"); diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference b/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference index 09984d86ca3..d14e7634f24 100644 --- 
a/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference +++ b/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference @@ -2,8 +2,15 @@ performing test: select 1 2 3 1 2 3 performing test: bad_query_incorrect_usage -query failed +1 performing test: bad_query_no_into_outfile -query failed +1 performing test: bad_query_file_exists -query failed +1 +performing test: compression +Hello, World! From clickhouse. +Hello, World! From clickhouse. +performing test: bad_query_misplaced_compression +1 +performing test: bad_query_misplaced_format +1 diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh index 76b308d003c..3879249699f 100755 --- a/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh +++ b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh @@ -9,37 +9,67 @@ function perform() local test_id=$1 local query=$2 - echo "performing test: $test_id" - ${CLICKHOUSE_CLIENT} --query "$query" 2>/dev/null + echo "performing test: ${test_id}" + ${CLICKHOUSE_CLIENT} --query "${query}" if [ "$?" -eq 0 ]; then - cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.out" else echo "query failed" fi - rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.out" +} + +function performBadQuery() +{ + local test_id=$1 + local query=$2 + local error_message=$3 + + echo "performing test: ${test_id}" + ${CLICKHOUSE_CLIENT} --query "${query}" 2>&1 | grep -Fc "${error_message}" } function performFileExists() { local test_id=$1 local query=$2 + local error_message=$3 - touch "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + touch "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.out" - echo "performing test: $test_id" - ${CLICKHOUSE_CLIENT} --query "$query" 2>/dev/null + echo "performing test: ${test_id}" + ${CLICKHOUSE_CLIENT} --query "${query}" 2>&1 | grep -Fc "${error_message}" + rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.out" +} + +function performCompression() +{ + local test_id=$1 + local query=$2 + + echo "performing test: ${test_id}" + ${CLICKHOUSE_CLIENT} --query "${query}" if [ "$?" 
-eq 0 ]; then - cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + gunzip "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.gz" + cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}" else echo "query failed" fi - rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_$test_id.out" + rm -f "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}" } + + perform "select" "SELECT 1, 2, 3 INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_select.out' AND STDOUT" -perform "bad_query_incorrect_usage" "SELECT 1, 2, 3 INTO OUTFILE AND STDOUT'${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_bad_query_incorrect_usage.out'" +performBadQuery "bad_query_incorrect_usage" "SELECT 1, 2, 3 INTO OUTFILE AND STDOUT'${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_bad_query_incorrect_usage.out'" "SYNTAX_ERROR" -perform "bad_query_no_into_outfile" "SELECT 1, 2, 3 AND STDOUT'" +performBadQuery "bad_query_no_into_outfile" "SELECT 1, 2, 3 AND STDOUT'" "SYNTAX_ERROR" -performFileExists "bad_query_file_exists" "SELECT 1, 2, 3 INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_bad_query_file_exists.out' AND STDOUT" \ No newline at end of file +performFileExists "bad_query_file_exists" "SELECT 1, 2, 3 INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_bad_query_file_exists.out' AND STDOUT" "File exists. (CANNOT_OPEN_FILE)" + +performCompression "compression" "SELECT * FROM (SELECT 'Hello, World! From clickhouse.') INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_compression.gz' AND STDOUT COMPRESSION 'GZ' FORMAT TabSeparated" + +performBadQuery "bad_query_misplaced_compression" "SELECT 1, 2, 3 INTO OUTFILE 'test.gz' COMPRESSION 'GZ' AND STDOUT'" "SYNTAX_ERROR" + +performBadQuery "bad_query_misplaced_format" "SELECT 1, 2, 3 INTO OUTFILE 'test.gz' FORMAT TabSeparated AND STDOUT'" "SYNTAX_ERROR" \ No newline at end of file From a46188216f48d894995f3a0a1d6f68e93c82dca6 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 7 Jun 2022 13:20:46 +0800 Subject: [PATCH 055/227] Support delete from ... 
where syntax on mergetree tables --- src/Interpreters/InterpreterDeleteQuery.cpp | 89 +++++++++++++++++++ src/Interpreters/InterpreterDeleteQuery.h | 24 +++++ src/Interpreters/InterpreterFactory.cpp | 6 ++ src/Parsers/ASTDeleteQuery.cpp | 42 +++++++++ src/Parsers/ASTDeleteQuery.h | 21 +++++ src/Parsers/ParserDeleteQuery.cpp | 49 ++++++++++ src/Parsers/ParserDeleteQuery.h | 19 ++++ src/Parsers/ParserQuery.cpp | 3 + ...19_standard_delete_on_merge_tree.reference | 3 + .../02319_standard_delete_on_merge_tree.sql | 21 +++++ 10 files changed, 277 insertions(+) create mode 100644 src/Interpreters/InterpreterDeleteQuery.cpp create mode 100644 src/Interpreters/InterpreterDeleteQuery.h create mode 100644 src/Parsers/ASTDeleteQuery.cpp create mode 100644 src/Parsers/ASTDeleteQuery.h create mode 100644 src/Parsers/ParserDeleteQuery.cpp create mode 100644 src/Parsers/ParserDeleteQuery.h create mode 100644 tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference create mode 100644 tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp new file mode 100644 index 00000000000..dff286fc92a --- /dev/null +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -0,0 +1,89 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int TABLE_IS_READ_ONLY; +} + + +InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, ContextPtr context_) : WithContext(context_), query_ptr(query_ptr_) +{ +} + + +BlockIO InterpreterDeleteQuery::execute() +{ + FunctionNameNormalizer().visit(query_ptr.get()); + const ASTDeleteQuery & delete_query = query_ptr->as(); + auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary); + + getContext()->checkAccess(AccessType::ALTER_DELETE, table_id); + + query_ptr->as().setDatabase(table_id.database_name); + + /// First check table storage for validations. + StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); + auto storage_merge_tree = std::dynamic_pointer_cast(table); + if (!storage_merge_tree) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree tables are supported"); + + checkStorageSupportsTransactionsIfNeeded(table, getContext()); + if (table->isStaticStorage()) + throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only"); + + DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); + if (typeid_cast(database.get()) + && !getContext()->getClientInfo().is_replicated_database_internal) + { + auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); + guard->releaseTableLock(); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, getContext()); + } + + auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); + auto metadata_snapshot = table->getInMemoryMetadataPtr(); + + /// Currently do similar as alter table delete. + /// TODO: Mark this delete as lightweight. 
+ MutationCommands mutation_commands; + MutationCommand mut_command; + + mut_command.type = MutationCommand::Type::DELETE; + mut_command.predicate = delete_query.predicate; + + auto command = std::make_shared(); + command->type = ASTAlterCommand::DELETE; + command->predicate = delete_query.predicate; + command->children.push_back(command->predicate); + mut_command.ast = command->ptr(); + + mutation_commands.emplace_back(mut_command); + + if (!mutation_commands.empty()) + { + table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); + table->mutate(mutation_commands, getContext()); + } + + return {}; +} + +} diff --git a/src/Interpreters/InterpreterDeleteQuery.h b/src/Interpreters/InterpreterDeleteQuery.h new file mode 100644 index 00000000000..7f17aa31e37 --- /dev/null +++ b/src/Interpreters/InterpreterDeleteQuery.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/** Allows you do lightweight deletion on a MergeTree family table. + */ +class InterpreterDeleteQuery : public IInterpreter, WithContext +{ +public: + InterpreterDeleteQuery(const ASTPtr & query_ptr_, ContextPtr context_); + + BlockIO execute() override; + + bool supportsTransactions() const override { return true; } + +private: + ASTPtr query_ptr; +}; + +} diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index 6b081467ae7..00183086bf6 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -314,6 +316,10 @@ std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMut { return std::make_unique(query, context); } + else if (query->as()) + { + return std::make_unique(query, context); + } else { throw Exception("Unknown type of query: " + query->getID(), ErrorCodes::UNKNOWN_TYPE_OF_QUERY); diff --git a/src/Parsers/ASTDeleteQuery.cpp b/src/Parsers/ASTDeleteQuery.cpp new file mode 100644 index 00000000000..ee7dba8dbf7 --- /dev/null +++ b/src/Parsers/ASTDeleteQuery.cpp @@ -0,0 +1,42 @@ +#include +#include + +namespace DB +{ + +String ASTDeleteQuery::getID(char delim) const +{ + return "DeleteQuery" + (delim + getDatabase()) + delim + getTable(); +} + +ASTPtr ASTDeleteQuery::clone() const +{ + auto res = std::make_shared(*this); + res->children.clear(); + + if (predicate) + { + res->predicate = predicate->clone(); + res->children.push_back(res->predicate); + } + + cloneTableOptions(*res); + return res; +} + +void ASTDeleteQuery::formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const +{ + settings.ostr << (settings.hilite ? hilite_keyword : "") << "DELETE FROM " << (settings.hilite ? hilite_none : ""); + + if (database) + { + settings.ostr << backQuoteIfNeed(getDatabase()); + settings.ostr << "."; + } + settings.ostr << backQuoteIfNeed(getTable()); + + settings.ostr << (settings.hilite ? hilite_keyword : "") << " WHERE " << (settings.hilite ? 
hilite_none : ""); + predicate->formatImpl(settings, state, frame); +} + +} diff --git a/src/Parsers/ASTDeleteQuery.h b/src/Parsers/ASTDeleteQuery.h new file mode 100644 index 00000000000..bcb97639b64 --- /dev/null +++ b/src/Parsers/ASTDeleteQuery.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +namespace DB +{ +/// DELETE FROM [db.]name WHERE ... +class ASTDeleteQuery : public ASTQueryWithTableAndOutput +{ +public: + String getID(char delim) const final; + ASTPtr clone() const final; + + ASTPtr predicate; + +protected: + void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; +}; + +} diff --git a/src/Parsers/ParserDeleteQuery.cpp b/src/Parsers/ParserDeleteQuery.cpp new file mode 100644 index 00000000000..bd06c060ac4 --- /dev/null +++ b/src/Parsers/ParserDeleteQuery.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include + + +namespace DB +{ + +bool ParserDeleteQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + auto query = std::make_shared(); + node = query; + + ParserKeyword s_delete("DELETE"); + ParserKeyword s_from("FROM"); + ParserKeyword s_where("WHERE"); + ParserExpression parser_exp_elem; + + if (s_delete.ignore(pos, expected)) + { + if (!s_from.ignore(pos, expected)) + return false; + + if (!parseDatabaseAndTableAsAST(pos, expected, query->database, query->table)) + return false; + + if (!s_where.ignore(pos, expected)) + return false; + + if (!parser_exp_elem.parse(pos, query->predicate, expected)) + return false; + } + else + return false; + + if (query->predicate) + query->children.push_back(query->predicate); + + if (query->database) + query->children.push_back(query->database); + + if (query->table) + query->children.push_back(query->table); + + return true; +} + +} diff --git a/src/Parsers/ParserDeleteQuery.h b/src/Parsers/ParserDeleteQuery.h new file mode 100644 index 00000000000..fb7c644f48b --- /dev/null +++ b/src/Parsers/ParserDeleteQuery.h @@ -0,0 +1,19 @@ +#pragma once + +#include + +namespace DB +{ + +/** Query like this: + * DELETE FROM [db.]name WHERE ... 
+ */ + +class ParserDeleteQuery : public IParserBase +{ +protected: + const char * getName() const override{ return "Delete query"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + +} diff --git a/src/Parsers/ParserQuery.cpp b/src/Parsers/ParserQuery.cpp index a3cafee65d7..ca837e7dcc5 100644 --- a/src/Parsers/ParserQuery.cpp +++ b/src/Parsers/ParserQuery.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,7 @@ bool ParserQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserSetRoleQuery set_role_p; ParserExternalDDLQuery external_ddl_p; ParserTransactionControl transaction_control_p; + ParserDeleteQuery delete_p; ParserBackupQuery backup_p; bool res = query_with_output_p.parse(pos, node, expected) @@ -73,6 +75,7 @@ bool ParserQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) || grant_p.parse(pos, node, expected) || external_ddl_p.parse(pos, node, expected) || transaction_control_p.parse(pos, node, expected) + || delete_p.parse(pos, node, expected) || backup_p.parse(pos, node, expected); return res; diff --git a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference new file mode 100644 index 00000000000..b343623df61 --- /dev/null +++ b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference @@ -0,0 +1,3 @@ +99 +95 +0 diff --git a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql new file mode 100644 index 00000000000..419278df291 --- /dev/null +++ b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS merge_table_standard_delete; + +CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTree order by id; + +INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); + +SET mutations_sync = 1; + +DELETE FROM merge_table_standard_delete WHERE id = 10; + +SELECT COUNT() FROM merge_table_standard_delete; + +DELETE FROM merge_table_standard_delete WHERE name IN ('1','2','3','4'); + +SELECT COUNT() FROM merge_table_standard_delete; + +DELETE FROM merge_table_standard_delete WHERE 1; + +SELECT COUNT() FROM merge_table_standard_delete; + +DROP TABLE merge_table_standard_delete; \ No newline at end of file From 192ca25e877893e260faab0c00692095c43e03f5 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 7 Jun 2022 14:32:23 +0800 Subject: [PATCH 056/227] Fix style check errors --- src/Interpreters/InterpreterDeleteQuery.cpp | 4 ++-- .../0_stateless/02319_standard_delete_on_merge_tree.sql | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index dff286fc92a..567b91d7577 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -18,7 +18,7 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; extern const int TABLE_IS_READ_ONLY; } @@ -64,7 +64,7 @@ BlockIO InterpreterDeleteQuery::execute() /// TODO: Mark this delete as lightweight. 
MutationCommands mutation_commands; MutationCommand mut_command; - + mut_command.type = MutationCommand::Type::DELETE; mut_command.predicate = delete_query.predicate; diff --git a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql index 419278df291..13b3a3e2701 100644 --- a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql @@ -18,4 +18,4 @@ DELETE FROM merge_table_standard_delete WHERE 1; SELECT COUNT() FROM merge_table_standard_delete; -DROP TABLE merge_table_standard_delete; \ No newline at end of file +DROP TABLE merge_table_standard_delete; From 8df7b7a030761f4fb3fe4e6192a31297574af01a Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Wed, 8 Jun 2022 10:31:11 +0800 Subject: [PATCH 057/227] Support new mutation type for lightweight --- src/Interpreters/InterpreterDeleteQuery.cpp | 2 +- .../MergeTree/MergeTreeMutationEntry.cpp | 24 +++++++++++++++---- .../MergeTree/MergeTreeMutationEntry.h | 7 +++++- .../MergeTree/MergeTreeMutationStatus.h | 2 ++ .../MergeTree/ReplicatedMergeTreeQueue.cpp | 1 + src/Storages/StorageMergeTree.cpp | 12 +++++++--- src/Storages/StorageMergeTree.h | 5 +++- .../System/StorageSystemMutations.cpp | 2 ++ 8 files changed, 45 insertions(+), 10 deletions(-) diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 567b91d7577..fff66402cff 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -80,7 +80,7 @@ BlockIO InterpreterDeleteQuery::execute() { table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); - table->mutate(mutation_commands, getContext()); + storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Lightweight); } return {}; diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index a222f2a8ad8..5b103cbe8d8 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -46,8 +46,9 @@ UInt64 MergeTreeMutationEntry::parseFileName(const String & file_name_) } MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk_, const String & path_prefix_, UInt64 tmp_number, - const TransactionID & tid_, const WriteSettings & settings) - : create_time(time(nullptr)) + const TransactionID & tid_, const WriteSettings & settings, MutationType type_) + : type(type_) + , create_time(time(nullptr)) , commands(std::move(commands_)) , disk(std::move(disk_)) , path_prefix(path_prefix_) @@ -58,7 +59,8 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskP try { auto out = disk->writeFile(std::filesystem::path(path_prefix) / file_name, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, settings); - *out << "format version: 1\n" + *out << "format version: 2\n" + << "type: " << type << "\n" << "create time: " << LocalDateTime(create_time) << "\n"; *out << "commands: "; commands.writeText(*out); @@ -121,7 +123,21 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & pat block_number = parseFileName(file_name); auto buf = disk->readFile(path_prefix + file_name); - *buf >> "format version: 1\n"; + int format_version; + *buf >> "format version: " >> format_version >> "\n"; 
+ + assert(format_version <= 2); + + type = MutationType::Ordinary; + if (format_version == 2) + { + String type_str; + *buf >> "type: " >> type_str >> "\n"; + + auto type_value = magic_enum::enum_cast(type_str); + if (type_value.has_value()) + type = type_value.value(); + } LocalDateTime create_time_dt; *buf >> "create time: " >> create_time_dt >> "\n"; diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.h b/src/Storages/MergeTree/MergeTreeMutationEntry.h index 04297f2852a..06cbd44ed49 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.h +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.h @@ -11,10 +11,15 @@ namespace DB { class IBackupEntry; +enum class MutationType { Ordinary, Lightweight }; + /// A mutation entry for non-replicated MergeTree storage engines. /// Stores information about mutation in file mutation_*.txt. struct MergeTreeMutationEntry { + /// Type of mutation, used for lightweight delete. + MutationType type; + time_t create_time = 0; MutationCommands commands; @@ -38,7 +43,7 @@ struct MergeTreeMutationEntry /// Create a new entry and write it to a temporary file. MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk, const String & path_prefix_, UInt64 tmp_number, - const TransactionID & tid_, const WriteSettings & settings); + const TransactionID & tid_, const WriteSettings & settings, MutationType type_); MergeTreeMutationEntry(const MergeTreeMutationEntry &) = delete; MergeTreeMutationEntry(MergeTreeMutationEntry &&) = default; diff --git a/src/Storages/MergeTree/MergeTreeMutationStatus.h b/src/Storages/MergeTree/MergeTreeMutationStatus.h index acda43b9254..f0949047f6e 100644 --- a/src/Storages/MergeTree/MergeTreeMutationStatus.h +++ b/src/Storages/MergeTree/MergeTreeMutationStatus.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -13,6 +14,7 @@ namespace DB struct MergeTreeMutationStatus { + MutationType type; String id; String command; time_t create_time = 0; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index f6c80baba05..312c4146cb9 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1920,6 +1920,7 @@ std::vector ReplicatedMergeTreeQueue::getMutationsStatu formatAST(*command.ast, buf, false, true); result.push_back(MergeTreeMutationStatus { + MutationType::Ordinary, entry.znode_name, buf.str(), entry.create_time, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 6825698f006..495218d4ef5 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -429,7 +429,7 @@ CurrentlyMergingPartsTagger::~CurrentlyMergingPartsTagger() storage.currently_processing_in_background_condition.notify_all(); } -Int64 StorageMergeTree::startMutation(const MutationCommands & commands, ContextPtr query_context) +Int64 StorageMergeTree::startMutation(const MutationCommands & commands, ContextPtr query_context, MutationType type) { /// Choose any disk, because when we load mutations we search them at each disk /// where storage can be placed. See loadMutations(). 
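With the new type field, a mutation entry file (mutation_<N>.txt) written at format version 2 starts roughly like this; the values are illustrative and the fields that follow the commands line are not shown in this hunk:

    format version: 2
    type: Lightweight
    create time: 2022-06-08 10:31:11
    commands: DELETE WHERE id = 10

When the reader finds format version 1 (no type line), the type falls back to MutationType::Ordinary, so entries written before this change keep loading.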
@@ -447,7 +447,7 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, Context { std::lock_guard lock(currently_processing_in_background_mutex); - MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings()); + MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings(), type); version = increment.get(); entry.commit(version); String mutation_id = entry.file_name; @@ -554,11 +554,16 @@ void StorageMergeTree::setMutationCSN(const String & mutation_id, CSN csn) } void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context) +{ + mutate(commands, query_context, MutationType::Ordinary); +} + +void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context, MutationType type) { /// Validate partition IDs (if any) before starting mutation getPartitionIdsAffectedByCommands(commands, query_context); - Int64 version = startMutation(commands, query_context); + Int64 version = startMutation(commands, query_context, type); if (query_context->getSettingsRef().mutations_sync > 0 || query_context->getCurrentTransaction()) waitForMutation(version); @@ -652,6 +657,7 @@ std::vector StorageMergeTree::getMutationsStatus() cons formatAST(*command.ast, buf, false, true); result.push_back(MergeTreeMutationStatus { + entry.type, entry.file_name, buf.str(), entry.create_time, diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 8ca8ab5d11e..a27925994c9 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -87,6 +87,9 @@ public: void mutate(const MutationCommands & commands, ContextPtr context) override; + /// Support lightweight delete. + void mutate(const MutationCommands & commands, ContextPtr context, MutationType type); + /// Return introspection information about currently processing or recently processed mutations. std::vector getMutationsStatus() const override; @@ -180,7 +183,7 @@ private: /// Allocate block number for new mutation, write mutation to disk /// and into in-memory structures. Wake up merge-mutation task. 
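    /// The mutation type is persisted inside the on-disk mutation entry (see
    /// MergeTreeMutationEntry above), so a lightweight mutation is still recognized as
    /// such when the entries are reloaded after a restart; callers that pass no type get
    /// the Ordinary behaviour through the default argument.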
- Int64 startMutation(const MutationCommands & commands, ContextPtr query_context); + Int64 startMutation(const MutationCommands & commands, ContextPtr query_context, MutationType type = MutationType::Ordinary); /// Wait until mutation with version will finish mutation for all parts void waitForMutation(Int64 version); void waitForMutation(const String & mutation_id) override; diff --git a/src/Storages/System/StorageSystemMutations.cpp b/src/Storages/System/StorageSystemMutations.cpp index fa521c632b8..907376a4936 100644 --- a/src/Storages/System/StorageSystemMutations.cpp +++ b/src/Storages/System/StorageSystemMutations.cpp @@ -20,6 +20,7 @@ NamesAndTypesList StorageSystemMutations::getNamesAndTypes() return { { "database", std::make_shared() }, { "table", std::make_shared() }, + { "is_lightweight", std::make_shared() }, { "mutation_id", std::make_shared() }, { "command", std::make_shared() }, { "create_time", std::make_shared() }, @@ -130,6 +131,7 @@ void StorageSystemMutations::fillData(MutableColumns & res_columns, ContextPtr c res_columns[col_num++]->insert(database); res_columns[col_num++]->insert(table); + res_columns[col_num++]->insert(status.type == MutationType::Lightweight); res_columns[col_num++]->insert(status.id); res_columns[col_num++]->insert(status.command); res_columns[col_num++]->insert(UInt64(status.create_time)); From 8696319d62a83ce393c7c64fb5ae01ebe0e20882 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Mon, 20 Jun 2022 17:18:00 +0800 Subject: [PATCH 058/227] Support lightweight delete execution using string as deleted rows mask,also part of select can handle LWD --- src/Interpreters/MutationsInterpreter.cpp | 73 ++++++- src/Interpreters/MutationsInterpreter.h | 9 +- .../MergeTree/FutureMergedMutatedPart.h | 2 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 29 +++ src/Storages/MergeTree/IMergeTreeDataPart.h | 12 ++ .../MergeTree/MergeTreeRangeReader.cpp | 76 +++++++ src/Storages/MergeTree/MergeTreeRangeReader.h | 7 + src/Storages/MergeTree/MutateTask.cpp | 203 +++++++++++++++++- src/Storages/StorageMergeTree.cpp | 6 + 9 files changed, 411 insertions(+), 6 deletions(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 8c1d929e409..8753905521d 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -286,15 +286,20 @@ MutationsInterpreter::MutationsInterpreter( const StorageMetadataPtr & metadata_snapshot_, MutationCommands commands_, ContextPtr context_, - bool can_execute_) + bool can_execute_, + bool is_lightweight_) : storage(std::move(storage_)) , metadata_snapshot(metadata_snapshot_) , commands(std::move(commands_)) , context(Context::createCopy(context_)) , can_execute(can_execute_) , select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections()) + , is_lightweight(is_lightweight_) { - mutation_ast = prepare(!can_execute); + if (is_lightweight) + mutation_ast = prepareLightweightDelete(!can_execute); + else + mutation_ast = prepare(!can_execute); } static NameSet getKeyColumns(const StoragePtr & storage, const StorageMetadataPtr & metadata_snapshot) @@ -890,6 +895,70 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & return select; } +/// Prepare for lightweight delete +ASTPtr MutationsInterpreter::prepareLightweightDelete(bool dry_run) +{ + if (is_prepared) + throw Exception("MutationsInterpreter is already prepared. 
It is a bug.", ErrorCodes::LOGICAL_ERROR); + + if (commands.empty()) + throw Exception("Empty mutation commands list", ErrorCodes::LOGICAL_ERROR); + + /// For lightweight DELETE, we use predicate expression to get deleted rows. + /// Collect predicates in the commands + for (auto & command : commands) + { + if (command.type == MutationCommand::DELETE) + { + mutation_kind.set(MutationKind::MUTATE_OTHER); + if (stages.empty()) + stages.emplace_back(context); + + auto mask_predicate = getPartitionAndPredicateExpressionForMutationCommand(command); + stages.back().filters.push_back(mask_predicate); + } + else + throw Exception("Unsupported lightweight mutation command type: " + DB::toString(command.type), ErrorCodes::UNKNOWN_MUTATION_COMMAND); + } + + /// The updated_header is empty for lightweight delete. + updated_header = std::make_unique(); + + is_prepared = true; + + return prepareInterpreterSelectQueryLightweight(stages, dry_run); +} + +ASTPtr MutationsInterpreter::prepareInterpreterSelectQueryLightweight(std::vector & prepared_stages, bool) +{ + /// Construct a SELECT statement for lightweight delete is like "select _part_offset from db.table where " + auto select = std::make_shared(); + + /// DELETEs only query just need the _part_offset virtual column without real columns + select->setExpression(ASTSelectQuery::Expression::SELECT, std::make_shared()); + select->select()->children.push_back(std::make_shared("_part_offset")); + + ASTPtr where_expression; + if (!prepared_stages[0].filters.empty()) + { + if (prepared_stages[0].filters.size() == 1) + where_expression = prepared_stages[0].filters[0]; + else + { + auto coalesced_predicates = std::make_shared(); + coalesced_predicates->name = "or"; + coalesced_predicates->arguments = std::make_shared(); + coalesced_predicates->children.push_back(coalesced_predicates->arguments); + coalesced_predicates->arguments->children = prepared_stages[0].filters; + where_expression = std::move(coalesced_predicates); + } + + select->setExpression(ASTSelectQuery::Expression::WHERE, std::move(where_expression)); + } + + return select; +} + QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::vector & prepared_stages, QueryPlan & plan) const { for (size_t i_stage = 1; i_stage < prepared_stages.size(); ++i_stage) diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 6ce132f300c..360e5aaf17c 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -43,7 +43,8 @@ public: const StorageMetadataPtr & metadata_snapshot_, MutationCommands commands_, ContextPtr context_, - bool can_execute_); + bool can_execute_, + bool is_lightweight_ = false); void validate(); @@ -80,10 +81,13 @@ public: private: ASTPtr prepare(bool dry_run); + ASTPtr prepareLightweightDelete(bool dry_run); struct Stage; ASTPtr prepareInterpreterSelectQuery(std::vector &prepared_stages, bool dry_run); + ASTPtr prepareInterpreterSelectQueryLightweight(std::vector &prepared_stages, bool dry_run); + QueryPipelineBuilder addStreamsForLaterStages(const std::vector & prepared_stages, QueryPlan & plan) const; std::optional getStorageSortDescriptionIfPossible(const Block & header) const; @@ -97,6 +101,9 @@ private: bool can_execute; SelectQueryOptions select_limits; + /// True for lightweight delete. 
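To make the rewrite concrete: for a hypothetical table t, the statement DELETE FROM t WHERE c = 4 is turned by prepareInterpreterSelectQueryLightweight() into roughly

    SELECT _part_offset FROM t WHERE c = 4

and when several DELETE commands end up in one mutation, their predicates are combined with or(), so a row is marked deleted as soon as any of them matches.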
+ bool is_lightweight = false; + ASTPtr mutation_ast; /// We have to store interpreter because it use own copy of context diff --git a/src/Storages/MergeTree/FutureMergedMutatedPart.h b/src/Storages/MergeTree/FutureMergedMutatedPart.h index 4447687c3d9..06659249cae 100644 --- a/src/Storages/MergeTree/FutureMergedMutatedPart.h +++ b/src/Storages/MergeTree/FutureMergedMutatedPart.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -23,6 +24,7 @@ struct FutureMergedMutatedPart MergeTreePartInfo part_info; MergeTreeData::DataPartsVector parts; MergeType merge_type = MergeType::Regular; + MutationType mutation_type = MutationType::Ordinary; const MergeTreePartition & getPartition() const { return parts.front()->partition; } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 60941108f00..cea8a91e15a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -648,6 +648,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); + loadDeletedRowMask(); } catch (...) { @@ -1208,6 +1209,34 @@ void IMergeTreeDataPart::loadColumns(bool require) setSerializationInfos(infos); } +void IMergeTreeDataPart::loadDeletedRowMask() +{ + if (part_type == Type::Compact) + return; + + auto path = fs::path(getFullRelativePath()) / DELETED_ROW_MARK_FILE_NAME; + if (volume->getDisk()->exists(path)) + { + has_lightweight_delete = true; + + auto in = openForReading(volume->getDisk(), path); + readString(deleted_rows_mask, *in); + } +} + +void IMergeTreeDataPart::writeLightWeightDeletedMask(String bitmap) const +{ + if (bitmap.empty()) + return; + + auto disk = volume->getDisk(); + String file_name = fs::path(getFullRelativePath()) / DELETED_ROW_MARK_FILE_NAME; + + /// write Non-Empty merged bitmap + auto out = disk->writeFile(file_name); + DB::writeText(bitmap, *out); +} + void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) const { TransactionID expected_tid = txn ? txn->tid : Tx::PrehistoricTID; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 7f3c41ce4c2..f90649b388c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -327,6 +327,10 @@ public: mutable VersionMetadata version; + /// True if the part has deleted_row_mask.bin file used for lightweight delete. + bool has_lightweight_delete = false; + String deleted_rows_mask; + /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; @@ -404,6 +408,9 @@ public: static inline constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt"; + /// File name for lightweight delete rows mask bitmap file. + static inline constexpr auto DELETED_ROW_MARK_FILE_NAME = "deleted_row_mask.bin"; + /// One of part files which is used to check how many references (I'd like /// to say hardlinks, but it will confuse even more) we have for the part /// for zero copy replication. Sadly it's very complex. @@ -456,6 +463,11 @@ public: /// Check metadata in cache is consistent with actual metadata on disk(if use_metadata_cache is true) std::unordered_map checkMetadata() const; + /// Reads deleted row mask from deleted_row_mask.bin if exists and set has_lightweight_delete. + void loadDeletedRowMask(); + + /// Write lightweight deleted mask to a file. 
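    /// The mask is a plain String with one character per row of the part: '0' means the
    /// row is still visible, '1' means it was deleted by a lightweight DELETE. For
    /// example, a 10-row part whose rows at _part_offset 4 and 9 were deleted would
    /// carry the mask "0000100001" (encoding inferred from the reader and writer code
    /// elsewhere in this patch series).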
+ void writeLightWeightDeletedMask(String bitmap) const; protected: diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 6e14e9c7aa9..1f5589bb75d 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -673,6 +673,9 @@ MergeTreeRangeReader::MergeTreeRangeReader( sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); } + need_read_deleted_mask = merge_tree_reader->data_part->has_lightweight_delete; + deleted_rows_mask = merge_tree_reader->data_part->deleted_rows_mask; + if (prewhere_info) { const auto & step = *prewhere_info; @@ -852,6 +855,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar read_result = startReadingChain(max_rows, ranges); read_result.num_rows = read_result.numReadRows(); + executeDeletedRowMaskFilterColumns(read_result); + if (read_result.num_rows) { /// Physical columns go first and then some virtual columns follow @@ -951,6 +956,10 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t fillPartOffsetColumn(result, leading_begin_part_offset, leading_end_part_offset); } + /// Do similar as part_offset for deleted mask. + if (need_read_deleted_mask) + fillDeletedRowMaskColumn(result, leading_begin_part_offset, leading_end_part_offset); + return result; } @@ -980,6 +989,43 @@ void MergeTreeRangeReader::fillPartOffsetColumn(ReadResult & result, UInt64 lead result.columns.emplace_back(std::move(column)); } +/// Fill deleted_row_mask column, referenced from fillPartOffsetColumn(). +void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset) +{ + size_t num_rows = result.numReadRows(); + + auto mask_column = ColumnUInt8::create(num_rows); + ColumnUInt8::Container & vec = mask_column->getData(); + + UInt8 * pos = vec.data(); + UInt8 * end = &vec[num_rows]; + + while (pos < end && leading_begin_part_offset < leading_end_part_offset) + { + if (deleted_rows_mask[leading_begin_part_offset++] == '0') + *pos++ = 1; + else + *pos++ = 0; + } + + const auto start_ranges = result.startedRanges(); + + for (const auto & start_range : start_ranges) + { + UInt64 start_part_offset = index_granularity->getMarkStartingRow(start_range.range.begin); + UInt64 end_part_offset = index_granularity->getMarkStartingRow(start_range.range.end); + + while (pos < end && start_part_offset < end_part_offset) + { + if (deleted_rows_mask[start_part_offset++] == '0') + *pos++ = 1; + else + *pos++ = 0; + } + } + + result.deleted_mask_filter_holder = std::move(mask_column); +} Columns MergeTreeRangeReader::continueReadingChain(const ReadResult & result, size_t & num_rows) { @@ -1095,6 +1141,36 @@ static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second) return mut_first; } + +/// Implicitly apply deleted mask filter to columns. +/// If there is no prewhere_info, apply directly the deleted mask filter. +/// If prewhere_info exists, works like row_level_filter and prewhere filter. 
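/// The deleted_mask_filter_holder consumed here is built by fillDeletedRowMaskColumn()
/// above with the polarity already inverted, so it can be passed straight to
/// filterColumns(). A minimal sketch of that conversion (illustrative only, mark-range
/// bookkeeping omitted; first_row stands for the first part offset of the read range):
///
///     auto filter = ColumnUInt8::create(num_rows);
///     for (size_t i = 0; i < num_rows; ++i)
///         filter->getData()[i] = (deleted_rows_mask[first_row + i] == '0');  /// '0' = keep the row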
+void MergeTreeRangeReader::executeDeletedRowMaskFilterColumns(ReadResult & result) +{ + if (prewhere_info || !need_read_deleted_mask || !result.deleted_mask_filter_holder) + return; + + const ColumnUInt8 * mask_filter = typeid_cast(result.deleted_mask_filter_holder.get()); + filterColumns(result.columns, mask_filter->getData()); + + bool has_column = false; + for (auto & column : result.columns) + { + if (column) + { + has_column = true; + result.num_rows = column->size(); + break; + } + } + + /// There is only one filter column. Record the actual number. + if (!has_column) + result.num_rows = result.countBytesInResultFilter(mask_filter->getData()); + + result.need_filter = true; +} + void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result) { if (!prewhere_info) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index b4fb8913122..bd38be47ed9 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -160,6 +160,9 @@ public: /// The number of bytes read from disk. size_t numBytesRead() const { return num_bytes_read; } + /// Similar as filter that you need to apply to newly-read columns + ColumnPtr deleted_mask_filter_holder; + private: /// Only MergeTreeRangeReader is supposed to access ReadResult internals. friend class MergeTreeRangeReader; @@ -251,6 +254,8 @@ private: Columns continueReadingChain(const ReadResult & result, size_t & num_rows); void executePrewhereActionsAndFilterColumns(ReadResult & result); void fillPartOffsetColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset); + void fillDeletedRowMaskColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset); + void executeDeletedRowMaskFilterColumns(ReadResult & result); IMergeTreeReader * merge_tree_reader = nullptr; const MergeTreeIndexGranularity * index_granularity = nullptr; @@ -264,6 +269,8 @@ private: bool last_reader_in_chain = false; bool is_initialized = false; Names non_const_virtual_column_names; + bool need_read_deleted_mask = false; + String deleted_rows_mask; }; } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 3a5aa2f8860..25f6fb580a2 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,8 @@ struct MutationContext MergeTreeData::DataPartPtr source_part; StoragePtr storage_from_source_part; + bool is_lightweight_mutation{0}; + StorageMetadataPtr metadata_snapshot; MutationCommandsConstPtr commands; @@ -1351,6 +1354,193 @@ private: std::unique_ptr part_merger_writer_task{nullptr}; }; +class LightweightDeleteTask : public IExecutableTask +{ +public: + + explicit LightweightDeleteTask(MutationContextPtr ctx_) : ctx(ctx_) {} + + void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } + StorageID getStorageID() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } + UInt64 getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } + + bool executeStep() override + { + switch (state) + { + case State::NEED_PREPARE: + { + prepare(); + + state = State::NEED_EXECUTE; + return true; + } + case State::NEED_EXECUTE: + { + execute(); + + state = State::NEED_FINALIZE; + return true; + } + case State::NEED_FINALIZE: + { + finalize(); + + state = 
State::SUCCESS; + return true; + } + case State::SUCCESS: + { + return false; + } + } + return false; + } + +private: + + void prepare() + { + if (ctx->execute_ttl_type != ExecuteTTLType::NONE) + ctx->files_to_skip.insert("ttl.txt"); + + ctx->disk->createDirectories(ctx->new_part_tmp_path); + + /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. + TransactionID tid = ctx->txn ? ctx->txn->tid : Tx::PrehistoricTID; + /// NOTE do not pass context for writing to system.transactions_info_log, + /// because part may have temporary name (with temporary block numbers). Will write it later. + ctx->new_data_part->version.setCreationTID(tid, nullptr); + ctx->new_data_part->storeVersionMetadata(); + + NameSet hardlinked_files; + /// Create hardlinks for unchanged files + for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) + { + if (ctx->files_to_skip.contains(it->name())) + continue; + + String destination = ctx->new_part_tmp_path; + String file_name = it->name(); + + destination += file_name; + + if (!ctx->disk->isDirectory(it->path())) + { + ctx->disk->createHardLink(it->path(), destination); + hardlinked_files.insert(file_name); + } + else if (!endsWith(".tmp_proj", file_name)) // ignore projection tmp merge dir + { + // it's a projection part directory + ctx->disk->createDirectories(destination); + for (auto p_it = ctx->disk->iterateDirectory(it->path()); p_it->isValid(); p_it->next()) + { + String p_destination = fs::path(destination) / p_it->name(); + ctx->disk->createHardLink(p_it->path(), p_destination); + hardlinked_files.insert(p_it->name()); + } + } + } + + /// Tracking of hardlinked files required for zero-copy replication. + /// We don't remove them when we delete last copy of source part because + /// new part can use them. + ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID(); + ctx->hardlinked_files.source_part_name = ctx->source_part->name; + ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files; + + /// Only the _delete mask column will be written. + (*ctx->mutate_entry)->columns_written = 1; + + ctx->new_data_part->checksums = ctx->source_part->checksums; + + ctx->compression_codec = ctx->source_part->default_codec; + + if (ctx->mutating_pipeline_builder.initialized()) + { + QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + + if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) + builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + + if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) + builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + + ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); + /// Is calculated inside MergeProgressCallback. 
+ ctx->mutating_pipeline.disableProfileEventUpdate(); + ctx->mutating_executor = std::make_unique(ctx->mutating_pipeline); + } + } + + void execute() + { + Block block; + bool has_deleted_rows = false; + + /// If this part has already applied lightweight mutation, load the past latest bitmap to merge with current bitmap + if (ctx->source_part->has_lightweight_delete) + { + new_bitmap = ctx->source_part->deleted_rows_mask; + has_deleted_rows = true; + } + else + new_bitmap.resize(ctx->source_part->rows_count, '0'); + + /// Mark the data corresponding to the offset in the as deleted. + while (MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry) && ctx->mutating_executor->pull(block)) + { + size_t block_rows = block.rows(); + + if (block_rows && !has_deleted_rows) + has_deleted_rows = true; + + const auto & cols = block.getColumns(); + const auto * offset_col = typeid_cast(cols[0].get()); + const UInt64 * offset = offset_col->getData().data(); + + /// Fill 1 for rows in offset + for (size_t current_row = 0; current_row < block_rows; current_row++) + new_bitmap[offset[current_row]] = '1'; + } + + if (has_deleted_rows) + { + ctx->new_data_part->writeLightWeightDeletedMask(new_bitmap); + ctx->new_data_part->has_lightweight_delete = true; + ctx->new_data_part->deleted_rows_mask = new_bitmap; + } + } + + void finalize() + { + if (ctx->mutating_executor) + { + ctx->mutating_executor.reset(); + ctx->mutating_pipeline.reset(); + } + + MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec, ctx->context); + } + + enum class State + { + NEED_PREPARE, + NEED_EXECUTE, + NEED_FINALIZE, + + SUCCESS + }; + + State state{State::NEED_PREPARE}; + + MutationContextPtr ctx; + + String new_bitmap; +}; + MutateTask::MutateTask( FutureMergedMutatedPartPtr future_part_, @@ -1437,8 +1627,10 @@ bool MutateTask::prepare() command.partition, context_for_reading)) ctx->commands_for_part.emplace_back(command); } - - if (ctx->source_part->isStoredOnDisk() && !isStorageTouchedByMutations( + /// Enable lightweight delete for wide part only. + if (isWidePart(ctx->source_part) && (ctx->future_part->mutation_type == MutationType::Lightweight)) + ctx->is_lightweight_mutation = true; + if (ctx->source_part->isStoredOnDisk() && !ctx->is_lightweight_mutation && !isStorageTouchedByMutations( ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading))) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation); @@ -1457,7 +1649,7 @@ bool MutateTask::prepare() if (!ctx->for_interpreter.empty()) { ctx->interpreter = std::make_unique( - ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->for_interpreter, context_for_reading, true); + ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->for_interpreter, context_for_reading, true, ctx->is_lightweight_mutation); ctx->materialized_indices = ctx->interpreter->grabMaterializedIndices(); ctx->materialized_projections = ctx->interpreter->grabMaterializedProjections(); ctx->mutation_kind = ctx->interpreter->getMutationKind(); @@ -1516,6 +1708,11 @@ bool MutateTask::prepare() { task = std::make_unique(ctx); } + else if (ctx->is_lightweight_mutation) + { + /// We will modify or create only deleted_row_mask for lightweight delete. Other columns and key values are copied as-is. 
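        /// (All of the source part's data files were hardlinked by LightweightDeleteTask::prepare()
        /// and only the small deleted_row_mask.bin is written anew, which is what is expected to make
        /// a lightweight DELETE far cheaper than an ordinary DELETE mutation that rewrites columns.)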
+ task = std::make_unique(ctx); + } else /// TODO: check that we modify only non-key columns in this case. { /// We will modify only some of the columns. Other columns and key values can be copied as-is. diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 495218d4ef5..83ca5c794e8 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1024,12 +1024,17 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( auto commands = std::make_shared(); size_t current_ast_elements = 0; auto last_mutation_to_apply = mutations_end_it; + MutationType first_mutation_type = mutations_begin_it->second.type; for (auto it = mutations_begin_it; it != mutations_end_it; ++it) { /// Do not squash mutations from different transactions to be able to commit/rollback them independently. if (first_mutation_tid != it->second.tid) break; + /// Do not combine mutations with different types. + if (it->second.type != first_mutation_type) + break; + size_t commands_size = 0; MutationCommands commands_for_size_validation; for (const auto & command : it->second.commands) @@ -1114,6 +1119,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( future_part->part_info = new_part_info; future_part->name = part->getNewName(new_part_info); future_part->type = part->getType(); + future_part->mutation_type = first_mutation_type; tagger = std::make_unique(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true); return std::make_shared(future_part, std::move(tagger), commands, txn); From b4a37e1e22ae8218eeb48bb300d5db6a1b41bad2 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 21 Jun 2022 11:07:25 +0800 Subject: [PATCH 059/227] Disable optimizations for count() when lightweight delete exists, add hasLightweightDelete() function in IMergeTreeDataPart --- src/Interpreters/InterpreterSelectQuery.cpp | 1 + src/Storages/IStorage.h | 3 +++ src/Storages/MergeTree/IMergeTreeDataPart.h | 3 +++ src/Storages/MergeTree/MergeTreeData.cpp | 13 ++++++++++++- src/Storages/MergeTree/MergeTreeData.h | 3 +++ src/Storages/MergeTree/MergeTreeRangeReader.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 5 +++++ src/Storages/StorageMergeTree.h | 1 + 9 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index ac31588d210..5852200da2c 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1936,6 +1936,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc && !settings.empty_result_for_aggregation_by_empty_set && storage && storage->getName() != "MaterializedMySQL" + && !storage->hasLightweightDelete() && !row_policy_filter && processing_stage == QueryProcessingStage::FetchColumns && query_analyzer->hasAggregation() diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index e265c94eb11..899e7e365ce 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -235,6 +235,9 @@ public: /// Returns true if the storage supports backup/restore for specific partitions. virtual bool supportsBackupPartition() const { return false; } + /// Return true if there are lightweight parts. 
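    /// Used, for instance, to switch off the trivial count() optimization and the
    /// minmax_count projection: the row counts kept in part metadata still include rows
    /// that are only masked as deleted, so answering SELECT count() from metadata alone
    /// would over-count (rationale inferred from the InterpreterSelectQuery and
    /// MergeTreeData changes in this commit).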
+ virtual bool hasLightweightDelete() const { return false; } + private: StorageID storage_id; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index f90649b388c..ff1a8269cb9 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -463,6 +463,9 @@ public: /// Check metadata in cache is consistent with actual metadata on disk(if use_metadata_cache is true) std::unordered_map checkMetadata() const; + /// True if here is light weight bitmap file in part. + bool hasLightweightDelete() const { return has_lightweight_delete; } + /// Reads deleted row mask from deleted_row_mask.bin if exists and set has_lightweight_delete. void loadDeletedRowMask(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 64aaa40bd4c..6b3ddbf8d49 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1002,6 +1002,7 @@ void MergeTreeData::loadDataPartsFromDisk( size_t suspicious_broken_parts_bytes = 0; std::atomic has_adaptive_parts = false; std::atomic has_non_adaptive_parts = false; + std::atomic has_lightweight_in_parts = false; std::mutex mutex; auto load_part = [&](const String & part_name, const DiskPtr & part_disk_ptr) @@ -1075,6 +1076,10 @@ void MergeTreeData::loadDataPartsFromDisk( else has_adaptive_parts.store(true, std::memory_order_relaxed); + /// Check if there is lightweight delete in part + if (part->hasLightweightDelete()) + has_lightweight_in_parts.store(true, std::memory_order_relaxed); + part->modification_time = part_disk_ptr->getLastModified(fs::path(relative_data_path) / part_name).epochTime(); /// Assume that all parts are Active, covered parts will be detected and marked as Outdated later part->setState(DataPartState::Active); @@ -1149,6 +1154,9 @@ void MergeTreeData::loadDataPartsFromDisk( has_non_adaptive_index_granularity_parts = has_non_adaptive_parts; + if (has_lightweight_in_parts) + has_lightweight_delete_parts.store(true); + if (suspicious_broken_parts > settings->max_suspicious_broken_parts && !skip_sanity_checks) throw Exception(ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS, "Suspiciously many ({} parts, {} in total) broken parts to remove while maximum allowed broken parts count is {}. You can change the maximum value " @@ -2864,6 +2872,9 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.", ErrorCodes::LOGICAL_ERROR); + if (part.hasLightweightDelete()) + has_lightweight_delete_parts.store(true); + checkPartCanBeAddedToTable(part, lock); DataPartPtr covering_part; @@ -5672,7 +5683,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg ProjectionCandidate * selected_candidate = nullptr; size_t min_sum_marks = std::numeric_limits::max(); - if (metadata_snapshot->minmax_count_projection) + if (metadata_snapshot->minmax_count_projection && !has_lightweight_delete_parts.load(std::memory_order_relaxed)) /// Disable ReadFromStorage for parts with lightweight. 
add_projection_candidate(*metadata_snapshot->minmax_count_projection, true); std::optional minmax_count_projection_candidate; if (!candidates.empty()) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 0b6e757ab49..e18a3b20b74 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -942,6 +942,9 @@ public: bool has_non_adaptive_index_granularity_parts = false; + /// True if at least one part contains lightweight delete. + mutable std::atomic_bool has_lightweight_delete_parts = false; + /// Parts that currently moving from disk/volume to another. /// This set have to be used with `currently_processing_in_background_mutex`. /// Moving may conflict with merges and mutations, but this is OK, because diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 1f5589bb75d..b37c5d6489f 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -673,7 +673,7 @@ MergeTreeRangeReader::MergeTreeRangeReader( sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); } - need_read_deleted_mask = merge_tree_reader->data_part->has_lightweight_delete; + need_read_deleted_mask = merge_tree_reader->data_part->hasLightweightDelete(); deleted_rows_mask = merge_tree_reader->data_part->deleted_rows_mask; if (prewhere_info) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 25f6fb580a2..119a216bc88 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1481,7 +1481,7 @@ private: bool has_deleted_rows = false; /// If this part has already applied lightweight mutation, load the past latest bitmap to merge with current bitmap - if (ctx->source_part->has_lightweight_delete) + if (ctx->source_part->hasLightweightDelete()) { new_bitmap = ctx->source_part->deleted_rows_mask; has_deleted_rows = true; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 83ca5c794e8..5480114359e 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -569,6 +569,11 @@ void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr quer waitForMutation(version); } +bool StorageMergeTree::hasLightweightDelete() const +{ + return has_lightweight_delete_parts.load(std::memory_order_relaxed); +} + std::optional StorageMergeTree::getIncompleteMutationsStatus(Int64 mutation_version, std::set * mutation_ids) const { std::unique_lock lock(currently_processing_in_background_mutex); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index a27925994c9..6251d0d75e0 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -89,6 +89,7 @@ public: /// Support lightweight delete. void mutate(const MutationCommands & commands, ContextPtr context, MutationType type); + bool hasLightweightDelete() const override; /// Return introspection information about currently processing or recently processed mutations. 
std::vector getMutationsStatus() const override; From 2c74e9b86667a7945e84cf80e0d1f137bc6cfeb7 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 21 Jun 2022 13:36:21 +0800 Subject: [PATCH 060/227] Merge support to apply lightweight delete mask --- .../MergeTree/MergeTreeSequentialSource.cpp | 118 ++++++++++++------ 1 file changed, 79 insertions(+), 39 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 7c4cc04ba52..926fde522d0 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -75,53 +75,93 @@ try { const auto & header = getPort().getHeader(); - if (!isCancelled() && current_row < data_part->rows_count) + /// The chunk after deleted mask applied maybe empty. But the empty chunk means done of read rows. + String deleted_rows_mask; + bool need_read_deleted_mask = data_part->hasLightweightDelete(); + if (need_read_deleted_mask) + deleted_rows_mask = data_part->deleted_rows_mask; + + do { - size_t rows_to_read = data_part->index_granularity.getMarkRows(current_mark); - bool continue_reading = (current_mark != 0); - - const auto & sample = reader->getColumns(); - Columns columns(sample.size()); - size_t rows_read = reader->readRows(current_mark, data_part->getMarksCount(), continue_reading, rows_to_read, columns); - - if (rows_read) + if (!isCancelled() && current_row < data_part->rows_count) { - current_row += rows_read; - current_mark += (rows_to_read == rows_read); + size_t rows_to_read = data_part->index_granularity.getMarkRows(current_mark); + bool continue_reading = (current_mark != 0); - bool should_evaluate_missing_defaults = false; - reader->fillMissingColumns(columns, should_evaluate_missing_defaults, rows_read); + const auto & sample = reader->getColumns(); + Columns columns(sample.size()); + size_t rows_read = reader->readRows(current_mark, data_part->getMarksCount(), continue_reading, rows_to_read, columns); - if (should_evaluate_missing_defaults) + if (rows_read) { - reader->evaluateMissingDefaults({}, columns); + current_row += rows_read; + current_mark += (rows_to_read == rows_read); + + if (need_read_deleted_mask) + { + size_t pos = current_row - rows_read; + + /// Get deleted mask for rows_read + IColumn::Filter deleted_rows_filter(rows_read, true); + for (size_t i = 0; i < rows_read; i++) + { + if (deleted_rows_mask[pos++] == '1') + deleted_rows_filter[i] = 0; + } + + // Filter only if some items were deleted + if (auto num_deleted_rows = std::count(deleted_rows_filter.begin(), deleted_rows_filter.end(), 0)) + { + const auto remaining_rows = deleted_rows_filter.size() - num_deleted_rows; + + /// If we return {} here, it means finished, no reading of the following rows. + /// Continue to read until remaining rows are not zero or reach the end (REAL finish). + if (!remaining_rows) + continue; + + for (auto & col : columns) + col = col->filter(deleted_rows_filter, remaining_rows); + + /// Update rows_read with actual rows in columns + rows_read = remaining_rows; + } + } + + bool should_evaluate_missing_defaults = false; + reader->fillMissingColumns(columns, should_evaluate_missing_defaults, rows_read); + + if (should_evaluate_missing_defaults) + { + reader->evaluateMissingDefaults({}, columns); + } + + reader->performRequiredConversions(columns); + + /// Reorder columns and fill result block. 
+ size_t num_columns = sample.size(); + Columns res_columns; + res_columns.reserve(num_columns); + + auto it = sample.begin(); + for (size_t i = 0; i < num_columns; ++i) + { + if (header.has(it->name)) + res_columns.emplace_back(std::move(columns[i])); + + ++it; + } + + return Chunk(std::move(res_columns), rows_read); } - - reader->performRequiredConversions(columns); - - /// Reorder columns and fill result block. - size_t num_columns = sample.size(); - Columns res_columns; - res_columns.reserve(num_columns); - - auto it = sample.begin(); - for (size_t i = 0; i < num_columns; ++i) - { - if (header.has(it->name)) - res_columns.emplace_back(std::move(columns[i])); - - ++it; - } - - return Chunk(std::move(res_columns), rows_read); } - } - else - { - finish(); - } + else + { + finish(); + } + + return {}; + } while (true); - return {}; } catch (...) { From 9d27af7ee223335fb9712ada3e944b7fc13663b6 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Thu, 23 Jun 2022 21:02:22 +0800 Subject: [PATCH 061/227] For some columns mutations, skip to apply deleted mask when read some columns. Also add unit test case --- src/Interpreters/Context.h | 6 ++ src/Interpreters/MutationsInterpreter.cpp | 13 +++- src/Interpreters/MutationsInterpreter.h | 4 ++ .../QueryPlan/ReadFromMergeTree.cpp | 1 + src/Storages/MergeTree/IMergeTreeReader.h | 2 + src/Storages/MergeTree/MergeTreeIOSettings.h | 2 + .../MergeTree/MergeTreeRangeReader.cpp | 5 +- src/Storages/MergeTree/MutateTask.cpp | 13 ++-- ...lightweight_delete_on_merge_tree.reference | 31 ++++++++++ ...02319_lightweight_delete_on_merge_tree.sql | 59 +++++++++++++++++++ ...19_standard_delete_on_merge_tree.reference | 3 - .../02319_standard_delete_on_merge_tree.sql | 21 ------- 12 files changed, 129 insertions(+), 31 deletions(-) create mode 100644 tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference create mode 100644 tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql delete mode 100644 tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference delete mode 100644 tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 4da5326aca4..c83e38a0ed1 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -360,6 +360,9 @@ private: inline static ContextPtr global_context_instance; + /// A flag, used to mark if reader needs to apply deleted rows mask. + bool skip_deleted_mask = false; + public: // Top-level OpenTelemetry trace context for the query. Makes sense only for a query context. 
OpenTelemetryTraceContext query_trace_context; @@ -912,6 +915,9 @@ public: bool isInternalQuery() const { return is_internal_query; } void setInternalQuery(bool internal) { is_internal_query = internal; } + bool skipDeletedMask() const { return skip_deleted_mask; } + void setSkipDeletedMask(bool skip) { skip_deleted_mask = skip; } + ActionLocksManagerPtr getActionLocksManager(); enum class ApplicationType diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 8753905521d..cc658bfa764 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -1041,7 +1041,18 @@ QueryPipelineBuilder MutationsInterpreter::execute() throw Exception("Cannot execute mutations interpreter because can_execute flag set to false", ErrorCodes::LOGICAL_ERROR); if (!select_interpreter) - select_interpreter = std::make_unique(mutation_ast, context, storage, metadata_snapshot, select_limits); + { + /// Skip to apply deleted mask for MutateSomePartColumn cases when part has lightweight delete. + if (!is_lightweight && skip_deleted_mask) + { + auto context_for_reading = Context::createCopy(context); + context_for_reading->setSkipDeletedMask(skip_deleted_mask); + select_interpreter = std::make_unique(mutation_ast, context_for_reading, storage, metadata_snapshot, select_limits); + } + else + select_interpreter = std::make_unique(mutation_ast, context, storage, metadata_snapshot, select_limits); + } + QueryPlan plan; select_interpreter->buildQueryPlan(plan); diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 360e5aaf17c..fb94c1f5f33 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -79,6 +79,8 @@ public: MutationKind::MutationKindEnum getMutationKind() const { return mutation_kind.mutation_kind; } + void SetSkipDeletedMask(bool skip) { skip_deleted_mask = skip; } + private: ASTPtr prepare(bool dry_run); ASTPtr prepareLightweightDelete(bool dry_run); @@ -103,6 +105,8 @@ private: /// True for lightweight delete. bool is_lightweight = false; + /// True for MutateSomePartColumns on part with lightweight. + bool skip_deleted_mask = false; ASTPtr mutation_ast; diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index b4e143cc002..59f6ec558e7 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -62,6 +62,7 @@ static MergeTreeReaderSettings getMergeTreeReaderSettings( .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, .read_in_order = query_info.input_order_info != nullptr, + .skip_deleted_mask = context->skipDeletedMask(), }; } diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index b13db9c3255..229f62da293 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -62,6 +62,8 @@ public: MergeTreeData::DataPartPtr data_part; + bool needReadDeletedMask() { return !settings.skip_deleted_mask && data_part->hasLightweightDelete(); } + protected: /// Returns actual column type in part, which can differ from table metadata. 
NameAndTypePair getColumnFromPart(const NameAndTypePair & required_column) const; diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index 85cf3e9eda6..02372011876 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -23,6 +23,8 @@ struct MergeTreeReaderSettings bool checksum_on_read = true; /// True if we read in order of sorting key. bool read_in_order = false; + /// Do not apply deleted mask for internal select from mutate some part columns. + bool skip_deleted_mask = false; }; struct MergeTreeWriterSettings diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index b37c5d6489f..5625ea06b7d 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -673,8 +673,9 @@ MergeTreeRangeReader::MergeTreeRangeReader( sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); } - need_read_deleted_mask = merge_tree_reader->data_part->hasLightweightDelete(); - deleted_rows_mask = merge_tree_reader->data_part->deleted_rows_mask; + need_read_deleted_mask = merge_tree_reader->needReadDeletedMask(); + if (need_read_deleted_mask) + deleted_rows_mask = merge_tree_reader->data_part->deleted_rows_mask; if (prewhere_info) { diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 119a216bc88..21122a42833 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -604,6 +604,7 @@ void finalizeMutatedPart( new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); new_data_part->loadProjections(false, false); + new_data_part->loadDeletedRowMask(); new_data_part->setBytesOnDisk(new_data_part->data_part_storage->calculateTotalSizeOnDisk()); new_data_part->default_codec = codec; new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk(); @@ -1509,8 +1510,6 @@ private: if (has_deleted_rows) { ctx->new_data_part->writeLightWeightDeletedMask(new_bitmap); - ctx->new_data_part->has_lightweight_delete = true; - ctx->new_data_part->deleted_rows_mask = new_bitmap; } } @@ -1646,6 +1645,8 @@ bool MutateTask::prepare() ctx->stage_progress = std::make_unique(1.0); + bool need_mutate_all_columns = !isWidePart(ctx->source_part); + if (!ctx->for_interpreter.empty()) { ctx->interpreter = std::make_unique( @@ -1653,6 +1654,11 @@ bool MutateTask::prepare() ctx->materialized_indices = ctx->interpreter->grabMaterializedIndices(); ctx->materialized_projections = ctx->interpreter->grabMaterializedProjections(); ctx->mutation_kind = ctx->interpreter->getMutationKind(); + + /// Skip to apply deleted mask when reading for MutateSomePartColumns. 
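        /// Rationale (inferred from the surrounding code): when only some columns are rewritten,
        /// the untouched column files are hardlinked with their original row count, so the rewritten
        /// columns must keep the masked rows as well or the columns would get out of sync; the mask
        /// therefore stays on the new part and keeps being applied at read time.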
+ need_mutate_all_columns = need_mutate_all_columns || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter->isAffectingAllColumns()); + if(!need_mutate_all_columns && ctx->source_part->hasLightweightDelete() && !ctx->is_lightweight_mutation) + ctx->interpreter->SetSkipDeletedMask(true); ctx->mutating_pipeline_builder = ctx->interpreter->execute(); ctx->updated_header = ctx->interpreter->getUpdatedHeader(); ctx->progress_callback = MergeProgressCallback((*ctx->mutate_entry)->ptr(), ctx->watch_prev_elapsed, *ctx->stage_progress); @@ -1703,8 +1709,7 @@ bool MutateTask::prepare() /// All columns from part are changed and may be some more that were missing before in part /// TODO We can materialize compact part without copying data - if (!isWidePart(ctx->source_part) - || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter && ctx->interpreter->isAffectingAllColumns())) + if (need_mutate_all_columns) { task = std::make_unique(ctx); } diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference new file mode 100644 index 00000000000..aefc0c88762 --- /dev/null +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference @@ -0,0 +1,31 @@ +99 +95 +0 +-----lightweight mutation type----- +1 DELETE WHERE (c % 5) = 1 1 +1 DELETE WHERE c = 4 1 +0 MATERIALIZE INDEX i_c 1 +0 UPDATE b = -1 WHERE a < 3 1 +0 DROP INDEX i_c 1 +-----Check that select and merge with lightweight delete.----- +7 +0 -1 0 +2 -1 2 +3 3 3 +5 5 5 +7 7 7 +8 8 8 +9 9 9 +t_light 0 0_1_1_0_10 2 +t_light 1 1_2_2_0_10 2 +t_light 2 2_3_3_0_10 2 +t_light 3 3_4_4_0_10 2 +t_light 4 4_5_5_0_10 2 +7 +t_light 0 0_1_1_1_10 2 +t_light 2 2_3_3_1_10 2 +t_light 3 3_4_4_1_10 2 +t_light 4 4_5_5_1_10 1 +-----Test lightweight delete in multi blocks----- +1000 -2 +1005 -2 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql new file mode 100644 index 00000000000..da302f0f4ff --- /dev/null +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -0,0 +1,59 @@ +DROP TABLE IF EXISTS merge_table_standard_delete; + +CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTree order by id settings min_bytes_for_wide_part=0; + +INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); + +SET mutations_sync = 1; + +DELETE FROM merge_table_standard_delete WHERE id = 10; + +SELECT COUNT() FROM merge_table_standard_delete; + +DELETE FROM merge_table_standard_delete WHERE name IN ('1','2','3','4'); + +SELECT COUNT() FROM merge_table_standard_delete; + +DELETE FROM merge_table_standard_delete WHERE 1; + +SELECT COUNT() FROM merge_table_standard_delete; + +DROP TABLE merge_table_standard_delete; + +drop table if exists t_light; +create table t_light(a int, b int, c int, index i_c(b) type minmax granularity 4) engine = MergeTree order by a partition by c % 5 settings min_bytes_for_wide_part=0; +INSERT INTO t_light SELECT number, number, number FROM numbers(10); + +SELECT '-----lightweight mutation type-----'; + +DELETE FROM t_light WHERE c%5=1; +DELETE FROM t_light WHERE c=4; +alter table t_light MATERIALIZE INDEX i_c; +alter table t_light update b=-1 where a<3; +alter table t_light drop index i_c; + +SELECT is_lightweight, command, is_done FROM system.mutations WHERE database = currentDatabase() AND 
table = 't_light'; + +SELECT '-----Check that select and merge with lightweight delete.-----'; +select count(*) from t_light; +select * from t_light order by a; + +select table, partition, name, rows from system.parts where database = currentDatabase() AND active and table ='t_light' order by name; + +optimize table t_light final; +select count(*) from t_light; + +select table, partition, name, rows from system.parts where database = currentDatabase() AND active and table ='t_light' and rows > 0 order by name; + +drop table t_light; + +SELECT '-----Test lightweight delete in multi blocks-----'; +CREATE TABLE t_large(a UInt32, b int) ENGINE=MergeTree order BY a settings min_bytes_for_wide_part=0; +INSERT INTO t_large SELECT number + 1, number + 1 FROM numbers(100000); + +DELETE FROM t_large WHERE a = 50000; +ALTER TABLE t_large UPDATE b = -2 WHERE a between 1000 and 1005; +ALTER TABLE t_large DELETE WHERE a=1; +SELECT * FROM t_large WHERE a in (1,1000,1005,50000) order by a; + +DROP TABLE t_large; diff --git a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference deleted file mode 100644 index b343623df61..00000000000 --- a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.reference +++ /dev/null @@ -1,3 +0,0 @@ -99 -95 -0 diff --git a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql deleted file mode 100644 index 13b3a3e2701..00000000000 --- a/tests/queries/0_stateless/02319_standard_delete_on_merge_tree.sql +++ /dev/null @@ -1,21 +0,0 @@ -DROP TABLE IF EXISTS merge_table_standard_delete; - -CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTree order by id; - -INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); - -SET mutations_sync = 1; - -DELETE FROM merge_table_standard_delete WHERE id = 10; - -SELECT COUNT() FROM merge_table_standard_delete; - -DELETE FROM merge_table_standard_delete WHERE name IN ('1','2','3','4'); - -SELECT COUNT() FROM merge_table_standard_delete; - -DELETE FROM merge_table_standard_delete WHERE 1; - -SELECT COUNT() FROM merge_table_standard_delete; - -DROP TABLE merge_table_standard_delete; From 11fdea6e4bff0ec55cae329c6f526274575c7b56 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Sun, 26 Jun 2022 20:01:39 +0800 Subject: [PATCH 062/227] Add missing code for deleted_mask_filter_holder --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 5625ea06b7d..857b70ede61 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -321,6 +321,7 @@ void MergeTreeRangeReader::ReadResult::clear() total_rows_per_granule = 0; filter_holder = nullptr; filter = nullptr; + deleted_mask_filter_holder = nullptr; } void MergeTreeRangeReader::ReadResult::shrink(Columns & old_columns) From 8ad2bb7c330a161878fc3a5d768bec0301dc2641 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 28 Jun 2022 10:07:49 +0800 Subject: [PATCH 063/227] Code changes due to master new fixes, and update reference for mutations table --- .../MergeTree/DataPartStorageOnDisk.cpp | 26 +++++++++++++ .../MergeTree/DataPartStorageOnDisk.h | 3 ++ src/Storages/MergeTree/IDataPartStorage.h | 3 ++ src/Storages/MergeTree/IMergeTreeDataPart.cpp | 15 ++------ 
src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- .../MergeTree/MergeTreeRangeReader.cpp | 15 +++++++- src/Storages/MergeTree/MutateTask.cpp | 37 +++++++++++-------- .../02117_show_create_table_system.reference | 1 + 9 files changed, 75 insertions(+), 29 deletions(-) diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp index f3b228a0748..816d31f44e5 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -368,7 +368,10 @@ void DataPartStorageOnDisk::clearDirectory( request.emplace_back(fs::path(dir) / "delete-on-destroy.txt", true); if (!is_projection) + { request.emplace_back(fs::path(dir) / "txn_version.txt", true); + request.emplace_back(fs::path(dir) / "deleted_row_mask.bin", true); + } disk->removeSharedFiles(request, !can_remove_shared_data, names_not_to_remove); disk->removeDirectory(dir); @@ -649,6 +652,29 @@ bool DataPartStorageOnDisk::shallParticipateInMerges(const IStoragePolicy & stor return !volume_ptr->areMergesAvoided(); } +void DataPartStorageOnDisk::loadDeletedRowMask(String & bitmap) const +{ + String deleted_mask_path = fs::path(getRelativePath()) / "deleted_row_mask.bin"; + auto disk = volume->getDisk(); + auto in = openForReading(disk, deleted_mask_path); + readString(bitmap, *in); +} + +void DataPartStorageOnDisk::writeLightweightDeletedMask(String & bitmap, Poco::Logger * log) const +{ + String deleted_mask_path = fs::path(getRelativePath()) / "deleted_row_mask.bin"; + auto disk = volume->getDisk(); + try + { + auto out = volume->getDisk()->writeFile(deleted_mask_path); + DB::writeText(bitmap, *out); + } + catch (Poco::Exception & e) + { + LOG_ERROR(log, "{} (while writing deleted rows mask file for lightweight delete: {})", e.what(), backQuote(fullPath(disk, deleted_mask_path))); + } +} + void DataPartStorageOnDisk::backup( TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h index 2426b5eee80..75f11617aef 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.h +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -86,6 +86,9 @@ public: bool shallParticipateInMerges(const IStoragePolicy &) const override; + void loadDeletedRowMask(String & bitmap) const override; + void writeLightweightDeletedMask(String & bitmap, Poco::Logger * log) const override; + void backup( TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index f0173baecb7..16ff2485a6c 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -169,6 +169,9 @@ public: /// A leak of abstraction virtual bool shallParticipateInMerges(const IStoragePolicy &) const { return true; } + virtual void loadDeletedRowMask(String & bitmap) const = 0; + virtual void writeLightweightDeletedMask(String & bitmap, Poco::Logger * log) const = 0; + /// Create a backup of a data part. /// This method adds a new entry to backup_entries. /// Also creates a new tmp_dir for internal disk (if disk is mentioned the first time). 
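For reference, a minimal standalone sketch (not part of the patch) of the mask format that the loadDeletedRowMask()/writeLightweightDeletedMask() hooks above read and write at this point in the series: a plain string with one character per row, '0' for a kept row and '1' for a deleted row, as the range reader and mutate task interpret it. The helper names below are invented for illustration only.

    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>

    /// Build a mask with one character per row: '0' = row kept, '1' = row deleted.
    std::string buildDeletedRowsMask(std::size_t rows, const std::vector<std::size_t> & deleted_offsets)
    {
        std::string mask(rows, '0');
        for (std::size_t offset : deleted_offsets)
            mask[offset] = '1';
        return mask;
    }

    /// Turn the mask into the kind of row filter the reader builds: 1 = keep, 0 = drop.
    std::vector<std::uint8_t> maskToFilter(const std::string & mask)
    {
        std::vector<std::uint8_t> filter(mask.size(), 1);
        for (std::size_t i = 0; i < mask.size(); ++i)
            if (mask[i] == '1')
                filter[i] = 0;
        return filter;
    }

A later patch in this series replaces the string with a compressed Native UInt8 column, so this sketch only reflects the on-disk format as of this commit.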
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index cea8a91e15a..3ee96459185 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1214,27 +1214,20 @@ void IMergeTreeDataPart::loadDeletedRowMask() if (part_type == Type::Compact) return; - auto path = fs::path(getFullRelativePath()) / DELETED_ROW_MARK_FILE_NAME; - if (volume->getDisk()->exists(path)) + if (data_part_storage->exists(DELETED_ROW_MARK_FILE_NAME)) { has_lightweight_delete = true; - auto in = openForReading(volume->getDisk(), path); - readString(deleted_rows_mask, *in); + data_part_storage->loadDeletedRowMask(deleted_rows_mask); } } -void IMergeTreeDataPart::writeLightWeightDeletedMask(String bitmap) const +void IMergeTreeDataPart::writeLightweightDeletedMask(String bitmap) const { if (bitmap.empty()) return; - auto disk = volume->getDisk(); - String file_name = fs::path(getFullRelativePath()) / DELETED_ROW_MARK_FILE_NAME; - - /// write Non-Empty merged bitmap - auto out = disk->writeFile(file_name); - DB::writeText(bitmap, *out); + data_part_storage->writeLightweightDeletedMask(bitmap, storage.log); } void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) const diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index ff1a8269cb9..975e7d6272d 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -470,7 +470,7 @@ public: void loadDeletedRowMask(); /// Write lightweight deleted mask to a file. - void writeLightWeightDeletedMask(String bitmap) const; + void writeLightweightDeletedMask(String bitmap) const; protected: diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6b3ddbf8d49..c887552b35b 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2872,7 +2872,7 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.", ErrorCodes::LOGICAL_ERROR); - if (part.hasLightweightDelete()) + if (part->hasLightweightDelete()) has_lightweight_delete_parts.store(true); checkPartCanBeAddedToTable(part, lock); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 857b70ede61..d3ac822094a 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -1146,7 +1146,7 @@ static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second) /// Implicitly apply deleted mask filter to columns. /// If there is no prewhere_info, apply directly the deleted mask filter. -/// If prewhere_info exists, works like row_level_filter and prewhere filter. +/// If prewhere_info exists, only apply to the first prewhere filter. void MergeTreeRangeReader::executeDeletedRowMaskFilterColumns(ReadResult & result) { if (prewhere_info || !need_read_deleted_mask || !result.deleted_mask_filter_holder) @@ -1233,6 +1233,19 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. 
result.block_before_prewhere = block; + /// Apply deleted mask filter for the first prewhere step + if (!result.getFilter() && result.deleted_mask_filter_holder) + { + auto columns = block.getColumns(); + filterColumns(columns, result.deleted_mask_filter_holder); + if (columns.empty()) + block = block.cloneEmpty(); + else + block.setColumns(columns); + + result.setFilter(result.deleted_mask_filter_holder); + } + if (prewhere_info->actions) prewhere_info->actions->execute(block); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 21122a42833..e22e45da35f 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1405,7 +1405,7 @@ private: if (ctx->execute_ttl_type != ExecuteTTLType::NONE) ctx->files_to_skip.insert("ttl.txt"); - ctx->disk->createDirectories(ctx->new_part_tmp_path); + ctx->data_part_storage_builder->createDirectories(); /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. TransactionID tid = ctx->txn ? ctx->txn->tid : Tx::PrehistoricTID; @@ -1416,29 +1416,36 @@ private: NameSet hardlinked_files; /// Create hardlinks for unchanged files - for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) + for (auto it = ctx->source_part->data_part_storage->iterate(); it->isValid(); it->next()) { if (ctx->files_to_skip.contains(it->name())) continue; - String destination = ctx->new_part_tmp_path; - String file_name = it->name(); + String destination; + destination = it->name(); - destination += file_name; + /// Skip to create hardlink for deleted_row_mask.bin + if (ctx->source_part->hasLightweightDelete() && destination == "deleted_row_mask.bin") + continue; - if (!ctx->disk->isDirectory(it->path())) + if (it->isFile()) { - ctx->disk->createHardLink(it->path(), destination); - hardlinked_files.insert(file_name); + ctx->data_part_storage_builder->createHardLinkFrom( + *ctx->source_part->data_part_storage, it->name(), destination); + hardlinked_files.insert(it->name()); } - else if (!endsWith(".tmp_proj", file_name)) // ignore projection tmp merge dir + else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir { // it's a projection part directory - ctx->disk->createDirectories(destination); - for (auto p_it = ctx->disk->iterateDirectory(it->path()); p_it->isValid(); p_it->next()) + ctx->data_part_storage_builder->createProjection(destination); + + auto projection_data_part_storage = ctx->source_part->data_part_storage->getProjection(destination); + auto projection_data_part_storage_builder = ctx->data_part_storage_builder->getProjection(destination); + + for (auto p_it = projection_data_part_storage->iterate(); p_it->isValid(); p_it->next()) { - String p_destination = fs::path(destination) / p_it->name(); - ctx->disk->createHardLink(p_it->path(), p_destination); + projection_data_part_storage_builder->createHardLinkFrom( + *projection_data_part_storage, p_it->name(), p_it->name()); hardlinked_files.insert(p_it->name()); } } @@ -1509,7 +1516,7 @@ private: if (has_deleted_rows) { - ctx->new_data_part->writeLightWeightDeletedMask(new_bitmap); + ctx->new_data_part->writeLightweightDeletedMask(new_bitmap); } } @@ -1521,7 +1528,7 @@ private: ctx->mutating_pipeline.reset(); } - MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec, ctx->context); + 
MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->data_part_storage_builder, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec, ctx->context); } enum class State diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 6e9d9188962..acc9b08da29 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -378,6 +378,7 @@ CREATE TABLE system.mutations ( `database` String, `table` String, + `is_lightweight` UInt8, `mutation_id` String, `command` String, `create_time` DateTime, From 624848679060cc5ae250864d200335906089e0a0 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 28 Jun 2022 17:34:03 +0800 Subject: [PATCH 064/227] Fix lightweight delete bugs: add skip files and use source part's columns to avoid metadata updated cases --- src/Storages/MergeTree/MutateTask.cpp | 29 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index e22e45da35f..e82becd3166 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1424,10 +1424,6 @@ private: String destination; destination = it->name(); - /// Skip to create hardlink for deleted_row_mask.bin - if (ctx->source_part->hasLightweightDelete() && destination == "deleted_row_mask.bin") - continue; - if (it->isFile()) { ctx->data_part_storage_builder->createHardLinkFrom( @@ -1695,12 +1691,22 @@ bool MutateTask::prepare() /// It shouldn't be changed by mutation. ctx->new_data_part->index_granularity_info = ctx->source_part->index_granularity_info; - auto [new_columns, new_infos] = MutationHelpers::getColumnsForNewDataPart( - ctx->source_part, ctx->updated_header, ctx->storage_columns, - ctx->source_part->getSerializationInfos(), ctx->commands_for_part); + if (ctx->is_lightweight_mutation) + { + /// The metadata alter will update the metadata snapshot, we should use same as source part. + ctx->new_data_part->setColumns(ctx->source_part->getColumns()); + ctx->new_data_part->setSerializationInfos(ctx->source_part->getSerializationInfos()); + } + else + { + auto [new_columns, new_infos] = MutationHelpers::getColumnsForNewDataPart( + ctx->source_part, ctx->updated_header, ctx->storage_columns, + ctx->source_part->getSerializationInfos(), ctx->commands_for_part); + + ctx->new_data_part->setColumns(new_columns); + ctx->new_data_part->setSerializationInfos(new_infos); + } - ctx->new_data_part->setColumns(new_columns); - ctx->new_data_part->setSerializationInfos(new_infos); ctx->new_data_part->partition.assign(ctx->source_part->partition); /// Don't change granularity type while mutating subset of columns @@ -1722,6 +1728,11 @@ bool MutateTask::prepare() } else if (ctx->is_lightweight_mutation) { + ctx->files_to_skip = ctx->source_part->getFileNamesWithoutChecksums(); + /// Skip to create hardlink for deleted_row_mask.bin + if (ctx->source_part->hasLightweightDelete()) + ctx->files_to_skip.insert("deleted_row_mask.bin"); + /// We will modify or create only deleted_row_mask for lightweight delete. Other columns and key values are copied as-is. 
task = std::make_unique(ctx); } From dcc7367ac4703905c9df59e310c5bb0d587b3e51 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 28 Jun 2022 18:27:56 +0800 Subject: [PATCH 065/227] Fix code style error --- src/Storages/MergeTree/MutateTask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index e82becd3166..08d4e4b47c5 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1660,7 +1660,7 @@ bool MutateTask::prepare() /// Skip to apply deleted mask when reading for MutateSomePartColumns. need_mutate_all_columns = need_mutate_all_columns || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter->isAffectingAllColumns()); - if(!need_mutate_all_columns && ctx->source_part->hasLightweightDelete() && !ctx->is_lightweight_mutation) + if (!need_mutate_all_columns && ctx->source_part->hasLightweightDelete() && !ctx->is_lightweight_mutation) ctx->interpreter->SetSkipDeletedMask(true); ctx->mutating_pipeline_builder = ctx->interpreter->execute(); ctx->updated_header = ctx->interpreter->getUpdatedHeader(); From 7e433859eaa96e97f66cc653560b2b159d5c5c60 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Thu, 30 Jun 2022 13:51:48 +0800 Subject: [PATCH 066/227] Change deleted rows mask from String to Native UInt8 format --- src/Compression/CompressionFactory.cpp | 4 +- src/Compression/CompressionFactory.h | 2 +- src/Formats/NativeWriter.cpp | 2 +- src/Formats/NativeWriter.h | 2 + .../MergeTree/DataPartStorageOnDisk.cpp | 33 ++-- .../MergeTree/DataPartStorageOnDisk.h | 4 +- src/Storages/MergeTree/IDataPartStorage.h | 5 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 24 ++- src/Storages/MergeTree/IMergeTreeDataPart.h | 17 +- .../MergeTreeDataPartDeletedMask.cpp | 162 ++++++++++++++++++ .../MergeTree/MergeTreeDataPartDeletedMask.h | 33 ++++ .../MergeTree/MergeTreeRangeReader.cpp | 17 +- src/Storages/MergeTree/MergeTreeRangeReader.h | 1 - .../MergeTree/MergeTreeSequentialSource.cpp | 8 +- src/Storages/MergeTree/MutateTask.cpp | 22 ++- 15 files changed, 272 insertions(+), 64 deletions(-) create mode 100644 src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp create mode 100644 src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index b8a1c5877a4..94fb30af1bc 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -98,14 +98,14 @@ CompressionCodecPtr CompressionCodecFactory::get( } -CompressionCodecPtr CompressionCodecFactory::get(uint8_t byte_code) const +CompressionCodecPtr CompressionCodecFactory::get(uint8_t byte_code, const IDataType * column_type) const { const auto family_code_and_creator = family_code_with_codec.find(byte_code); if (family_code_and_creator == family_code_with_codec.end()) throw Exception("Unknown codec family code: " + toString(byte_code), ErrorCodes::UNKNOWN_CODEC); - return family_code_and_creator->second({}, nullptr); + return family_code_and_creator->second({}, column_type); } diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index a4451f9ed2e..c386784686e 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -63,7 +63,7 @@ public: } /// Get codec by method byte (no params available) - CompressionCodecPtr get(uint8_t byte_code) const; + CompressionCodecPtr get(uint8_t byte_code, 
const IDataType * column_type = nullptr) const; /// For backward compatibility with config settings CompressionCodecPtr get(const String & family_name, std::optional level) const; diff --git a/src/Formats/NativeWriter.cpp b/src/Formats/NativeWriter.cpp index 77692eec6b6..004c75182a7 100644 --- a/src/Formats/NativeWriter.cpp +++ b/src/Formats/NativeWriter.cpp @@ -46,7 +46,7 @@ void NativeWriter::flush() } -static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) +void NativeWriter::writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) { /** If there are columns-constants - then we materialize them. * (Since the data type does not know how to serialize / deserialize constants.) diff --git a/src/Formats/NativeWriter.h b/src/Formats/NativeWriter.h index 010a03ec722..02fc53b60fe 100644 --- a/src/Formats/NativeWriter.h +++ b/src/Formats/NativeWriter.h @@ -32,6 +32,8 @@ public: static String getContentType() { return "application/octet-stream"; } + static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit); + private: WriteBuffer & ostr; UInt64 client_revision; diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp index 816d31f44e5..45e0132ddf7 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -370,7 +370,7 @@ void DataPartStorageOnDisk::clearDirectory( if (!is_projection) { request.emplace_back(fs::path(dir) / "txn_version.txt", true); - request.emplace_back(fs::path(dir) / "deleted_row_mask.bin", true); + request.emplace_back(fs::path(dir) / "deleted_rows_mask.bin", true); } disk->removeSharedFiles(request, !can_remove_shared_data, names_not_to_remove); @@ -652,27 +652,30 @@ bool DataPartStorageOnDisk::shallParticipateInMerges(const IStoragePolicy & stor return !volume_ptr->areMergesAvoided(); } -void DataPartStorageOnDisk::loadDeletedRowMask(String & bitmap) const +void DataPartStorageOnDisk::loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const { - String deleted_mask_path = fs::path(getRelativePath()) / "deleted_row_mask.bin"; + String deleted_mask_path = fs::path(getRelativePath()) / deleted_mask.name; auto disk = volume->getDisk(); - auto in = openForReading(disk, deleted_mask_path); - readString(bitmap, *in); + + if (disk->isFile(deleted_mask_path)) + { + auto read_buf = openForReading(disk, deleted_mask_path); + deleted_mask.read(*read_buf); + assertEOF(*read_buf); + } } -void DataPartStorageOnDisk::writeLightweightDeletedMask(String & bitmap, Poco::Logger * log) const +void DataPartStorageOnDisk::writeDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const { - String deleted_mask_path = fs::path(getRelativePath()) / "deleted_row_mask.bin"; - auto disk = volume->getDisk(); - try + const String final_path = fs::path(getRelativePath()) / deleted_mask.name; + const String tmp_path = final_path + ".tmp"; + { - auto out = volume->getDisk()->writeFile(deleted_mask_path); - DB::writeText(bitmap, *out); - } - catch (Poco::Exception & e) - { - LOG_ERROR(log, "{} (while writing deleted rows mask file for lightweight delete: {})", e.what(), backQuote(fullPath(disk, deleted_mask_path))); + auto out = volume->getDisk()->writeFile(tmp_path, 4096); + deleted_mask.write(*out); } + + volume->getDisk()->moveFile(tmp_path, 
final_path); } void DataPartStorageOnDisk::backup( diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h index 75f11617aef..61a91714a3b 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.h +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -86,8 +86,8 @@ public: bool shallParticipateInMerges(const IStoragePolicy &) const override; - void loadDeletedRowMask(String & bitmap) const override; - void writeLightweightDeletedMask(String & bitmap, Poco::Logger * log) const override; + void loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const override; + void writeDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const override; void backup( TemporaryFilesOnDisks & temp_dirs, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 16ff2485a6c..3af9a0ce499 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace DB @@ -169,8 +170,8 @@ public: /// A leak of abstraction virtual bool shallParticipateInMerges(const IStoragePolicy &) const { return true; } - virtual void loadDeletedRowMask(String & bitmap) const = 0; - virtual void writeLightweightDeletedMask(String & bitmap, Poco::Logger * log) const = 0; + virtual void loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const = 0; + virtual void writeDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const = 0; /// Create a backup of a data part. /// This method adds a new entry to backup_entries. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 3ee96459185..507d815644c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -648,7 +648,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); - loadDeletedRowMask(); + loadDeletedMask(); } catch (...) 
{ @@ -1209,25 +1209,31 @@ void IMergeTreeDataPart::loadColumns(bool require) setSerializationInfos(infos); } -void IMergeTreeDataPart::loadDeletedRowMask() +void IMergeTreeDataPart::loadDeletedMask() { if (part_type == Type::Compact) return; - if (data_part_storage->exists(DELETED_ROW_MARK_FILE_NAME)) + if (data_part_storage->exists(deleted_mask.name)) { has_lightweight_delete = true; - data_part_storage->loadDeletedRowMask(deleted_rows_mask); + data_part_storage->loadDeletedRowsMask(deleted_mask); + + if (deleted_mask.getDeletedRows().size() != rows_count) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Size of deleted mask loaded from '{}':'{}' doesn't match expected " + "for part {}" + "(loaded {} rows, expected {} rows).", + data_part_storage->getDiskPath(), deleted_mask.name, name, deleted_mask.getDeletedRows().size(), rows_count); } } -void IMergeTreeDataPart::writeLightweightDeletedMask(String bitmap) const +void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) { - if (bitmap.empty()) - return; - - data_part_storage->writeLightweightDeletedMask(bitmap, storage.log); + deleted_mask.setDeletedRows(new_mask); + has_lightweight_delete = true; + data_part_storage->writeDeletedRowsMask(deleted_mask); } void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) const diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 975e7d6272d..d5f6322beab 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -327,9 +328,8 @@ public: mutable VersionMetadata version; - /// True if the part has deleted_row_mask.bin file used for lightweight delete. + /// True if the part has deleted_rows_mask.bin file used for lightweight delete. bool has_lightweight_delete = false; - String deleted_rows_mask; /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; @@ -408,9 +408,6 @@ public: static inline constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt"; - /// File name for lightweight delete rows mask bitmap file. - static inline constexpr auto DELETED_ROW_MARK_FILE_NAME = "deleted_row_mask.bin"; - /// One of part files which is used to check how many references (I'd like /// to say hardlinks, but it will confuse even more) we have for the part /// for zero copy replication. Sadly it's very complex. @@ -466,14 +463,14 @@ public: /// True if here is light weight bitmap file in part. bool hasLightweightDelete() const { return has_lightweight_delete; } - /// Reads deleted row mask from deleted_row_mask.bin if exists and set has_lightweight_delete. - void loadDeletedRowMask(); - - /// Write lightweight deleted mask to a file. 
- void writeLightweightDeletedMask(String bitmap) const; + const MergeTreeDataPartDeletedMask& getDeletedMask() const { return deleted_mask; } + void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask); + void loadDeletedMask(); protected: + MergeTreeDataPartDeletedMask deleted_mask {}; + /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk ColumnSize total_columns_size; diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp new file mode 100644 index 00000000000..fe715f236a5 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +namespace DB::ErrorCodes +{ + extern const int UNKNOWN_FORMAT_VERSION; + extern const int CORRUPTED_DATA; +} + +namespace DB +{ + +namespace +{ +struct HashValue +{ +private: + char value[16]; + +public: + HashValue() = default; + HashValue(SipHash & hasher) + { + hasher.get128(value); + + static_assert(std::is_pod_v, "Expected to be a POD-type"); + static_assert(sizeof(HashValue) * 8 == 128); + } + + bool operator==(const HashValue & other) const + { + return memcmp(value, other.value, sizeof(value)) == 0; + } +}; + +constexpr UInt8 FORMAT_VERSION = 1; +constexpr UInt8 DEFAULT_CODEC = static_cast(CompressionMethodByte::T64); +constexpr UInt8 PADDING_SIZE = 7; // just in case +constexpr UInt8 HEADER_SIZE = 0 + + sizeof(FORMAT_VERSION) + + sizeof(UInt64) // number of rows in mask + + sizeof(HashValue) // column data hash + + PADDING_SIZE // padding: zero-bytes + + sizeof(HashValue); // header hash +} + +MergeTreeDataPartDeletedMask::MergeTreeDataPartDeletedMask() + : deleted_rows(ColumnUInt8::create()) +{} + +const ColumnUInt8 & MergeTreeDataPartDeletedMask::getDeletedRows() const +{ + return *deleted_rows; +} + +void MergeTreeDataPartDeletedMask::setDeletedRows(DeletedRows new_rows) +{ + deleted_rows.swap(new_rows); +} + +void MergeTreeDataPartDeletedMask::setDeletedRows(size_t rows, bool value) +{ + setDeletedRows(ColumnUInt8::create(rows, value)); +} + +void MergeTreeDataPartDeletedMask::read(ReadBuffer & in) +{ + std::array header_buffer_data; + in.readStrict(header_buffer_data.data(), header_buffer_data.size()); + {// validate hash of the header first + SipHash hash; + hash.update(header_buffer_data.data(), header_buffer_data.size()); + const HashValue computed_hash(hash); + + HashValue read_hash; + readPODBinary(read_hash, in); + if (read_hash != computed_hash) + throw Exception(ErrorCodes::CORRUPTED_DATA, + "Invalid deleted masks file header hash"); + } + + UInt8 format_version = FORMAT_VERSION; + UInt64 stored_rows = 0; + HashValue column_hash; + {// Read header values + ReadBuffer header(header_buffer_data.data(), header_buffer_data.size(), 0); + readBinary(format_version, header); + if (format_version != FORMAT_VERSION) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, + "Unknown deleted mask file format version {}", + static_cast(format_version)); + + readBinary(stored_rows, header); + readPODBinary(column_hash, header); + header.ignore(PADDING_SIZE); + assertEOF(header); + } + + auto data_read_buffer = std::make_unique(in); + + auto res_column = DeletedRows(ColumnUInt8::create()); + ColumnPtr res_col_ptr = res_column; + SerializationPtr serialization = DataTypeUInt8().getDefaultSerialization(); + NativeReader::readData(*serialization, res_col_ptr, *data_read_buffer, 
stored_rows, 0); + assertEOF(*data_read_buffer); + + // we probably don't want to check column hash here, since codec verifies data integrity. + deleted_rows = res_column; +} + +void MergeTreeDataPartDeletedMask::write(WriteBuffer & out) const +{ + {// Header + std::array header_buffer_data; + WriteBuffer header(header_buffer_data.data(), header_buffer_data.size()); + + writeBinary(FORMAT_VERSION, header); + writeBinary(static_cast(deleted_rows->size()), header); + + { + SipHash hash; + deleted_rows->updateHashFast(hash); + writePODBinary(HashValue(hash), header); + } + + { + const char padding[PADDING_SIZE] = {'\0'}; + writePODBinary(padding, header); + } + assert(header_buffer_data.max_size() == header.count()); + + writePODBinary(header_buffer_data, out); + {// header hash + SipHash hash; + hash.update(header_buffer_data.data(), header_buffer_data.size()); + writePODBinary(HashValue(hash), out); + } + } + assert(HEADER_SIZE == out.count()); + + const DataTypeUInt8 col_datatype; + auto codec = CompressionCodecFactory::instance().get(static_cast(DEFAULT_CODEC), &col_datatype); + auto data_write_buffer = std::make_unique(out, codec); + SerializationPtr serialization = col_datatype.getDefaultSerialization(); + + NativeWriter::writeData(*serialization, deleted_rows, *data_write_buffer, 0, deleted_rows->size()); + data_write_buffer->finalize(); +} + +} diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h new file mode 100644 index 00000000000..b75c9d37c55 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include + +#include + +namespace DB +{ +class ReadBuffer; +class WriteBuffer; + +/// Per-part info about rows deleted by lightweight mutations. 
+struct MergeTreeDataPartDeletedMask +{ + explicit MergeTreeDataPartDeletedMask(); + using DeletedRows = ColumnUInt8::Ptr; + + std::string name = "deleted_rows_mask.bin"; + + const ColumnUInt8 & getDeletedRows() const; + void setDeletedRows(DeletedRows new_rows); + void setDeletedRows(size_t rows, bool value); + + void read(ReadBuffer & in); + void write(WriteBuffer & out) const; + +private: + ColumnUInt8::Ptr deleted_rows; +}; + +}; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index d3ac822094a..47433fddadb 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -675,8 +675,6 @@ MergeTreeRangeReader::MergeTreeRangeReader( } need_read_deleted_mask = merge_tree_reader->needReadDeletedMask(); - if (need_read_deleted_mask) - deleted_rows_mask = merge_tree_reader->data_part->deleted_rows_mask; if (prewhere_info) { @@ -1002,12 +1000,15 @@ void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 UInt8 * pos = vec.data(); UInt8 * end = &vec[num_rows]; + const auto & deleted_rows_col = merge_tree_reader->data_part->getDeletedMask().getDeletedRows(); + const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); + while (pos < end && leading_begin_part_offset < leading_end_part_offset) { - if (deleted_rows_mask[leading_begin_part_offset++] == '0') - *pos++ = 1; - else + if (deleted_rows_mask[leading_begin_part_offset++]) *pos++ = 0; + else + *pos++ = 1; } const auto start_ranges = result.startedRanges(); @@ -1019,10 +1020,10 @@ void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 while (pos < end && start_part_offset < end_part_offset) { - if (deleted_rows_mask[start_part_offset++] == '0') - *pos++ = 1; - else + if (deleted_rows_mask[start_part_offset++]) *pos++ = 0; + else + *pos++ = 1; } } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index bd38be47ed9..2515455bd83 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -270,7 +270,6 @@ private: bool is_initialized = false; Names non_const_virtual_column_names; bool need_read_deleted_mask = false; - String deleted_rows_mask; }; } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 926fde522d0..af31560cb86 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -76,10 +76,7 @@ try const auto & header = getPort().getHeader(); /// The chunk after deleted mask applied maybe empty. But the empty chunk means done of read rows. 
- String deleted_rows_mask; bool need_read_deleted_mask = data_part->hasLightweightDelete(); - if (need_read_deleted_mask) - deleted_rows_mask = data_part->deleted_rows_mask; do { @@ -99,13 +96,16 @@ try if (need_read_deleted_mask) { + const auto & deleted_rows_col = data_part->getDeletedMask().getDeletedRows(); + const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); + size_t pos = current_row - rows_read; /// Get deleted mask for rows_read IColumn::Filter deleted_rows_filter(rows_read, true); for (size_t i = 0; i < rows_read; i++) { - if (deleted_rows_mask[pos++] == '1') + if (deleted_rows_mask[pos++]) deleted_rows_filter[i] = 0; } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 08d4e4b47c5..a6c18147c88 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -604,7 +604,7 @@ void finalizeMutatedPart( new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); new_data_part->loadProjections(false, false); - new_data_part->loadDeletedRowMask(); + new_data_part->loadDeletedMask(); new_data_part->setBytesOnDisk(new_data_part->data_part_storage->calculateTotalSizeOnDisk()); new_data_part->default_codec = codec; new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk(); @@ -1484,14 +1484,20 @@ private: Block block; bool has_deleted_rows = false; + auto new_deleted_rows = ColumnUInt8::create(); + auto & data = new_deleted_rows->getData(); + /// If this part has already applied lightweight mutation, load the past latest bitmap to merge with current bitmap if (ctx->source_part->hasLightweightDelete()) { - new_bitmap = ctx->source_part->deleted_rows_mask; + const auto & deleted_rows_col = ctx->source_part->getDeletedMask().getDeletedRows(); + const auto & source_data = deleted_rows_col.getData(); + data.insert(source_data.begin(), source_data.begin() + ctx->source_part->rows_count); + has_deleted_rows = true; } else - new_bitmap.resize(ctx->source_part->rows_count, '0'); + new_deleted_rows->insertManyDefaults(ctx->source_part->rows_count); /// Mark the data corresponding to the offset in the as deleted. while (MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry) && ctx->mutating_executor->pull(block)) @@ -1507,12 +1513,12 @@ private: /// Fill 1 for rows in offset for (size_t current_row = 0; current_row < block_rows; current_row++) - new_bitmap[offset[current_row]] = '1'; + data[offset[current_row]] = 1; } if (has_deleted_rows) { - ctx->new_data_part->writeLightweightDeletedMask(new_bitmap); + ctx->new_data_part->writeDeletedMask(ColumnUInt8::Ptr(std::move(new_deleted_rows))); } } @@ -1539,8 +1545,6 @@ private: State state{State::NEED_PREPARE}; MutationContextPtr ctx; - - String new_bitmap; }; @@ -1729,9 +1733,9 @@ bool MutateTask::prepare() else if (ctx->is_lightweight_mutation) { ctx->files_to_skip = ctx->source_part->getFileNamesWithoutChecksums(); - /// Skip to create hardlink for deleted_row_mask.bin + /// Skip to create hardlink for deleted_rows_mask.bin if (ctx->source_part->hasLightweightDelete()) - ctx->files_to_skip.insert("deleted_row_mask.bin"); + ctx->files_to_skip.insert("deleted_rows_mask.bin"); /// We will modify or create only deleted_row_mask for lightweight delete. Other columns and key values are copied as-is. 
task = std::make_unique(ctx); From 6e6f77ef8ac51684850251e734d4f99768ec974b Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Mon, 4 Jul 2022 08:59:53 +0800 Subject: [PATCH 067/227] Fix compile error in clang tidy build --- src/Interpreters/MutationsInterpreter.h | 2 +- src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index fb94c1f5f33..162039e3fd0 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -88,7 +88,7 @@ private: struct Stage; ASTPtr prepareInterpreterSelectQuery(std::vector &prepared_stages, bool dry_run); - ASTPtr prepareInterpreterSelectQueryLightweight(std::vector &prepared_stages, bool dry_run); + static ASTPtr prepareInterpreterSelectQueryLightweight(std::vector &prepared_stages, bool dry_run); QueryPipelineBuilder addStreamsForLaterStages(const std::vector & prepared_stages, QueryPlan & plan) const; diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp index fe715f236a5..e93f3dd65ca 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp @@ -31,7 +31,7 @@ private: public: HashValue() = default; - HashValue(SipHash & hasher) + explicit HashValue(SipHash & hasher) { hasher.get128(value); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index a6c18147c88..92a096a7890 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -626,7 +626,7 @@ struct MutationContext MergeTreeData::DataPartPtr source_part; StoragePtr storage_from_source_part; - bool is_lightweight_mutation{0}; + bool is_lightweight_mutation{false}; StorageMetadataPtr metadata_snapshot; From 780cdfb8f0dbdadd6b485267834aa6fd127dca24 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Wed, 6 Jul 2022 18:29:29 +0800 Subject: [PATCH 068/227] Code changes based on review opinions. 
Make ordinary single alter delete lightwight by default --- src/Core/Settings.h | 2 + src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterDeleteQuery.cpp | 16 ++++---- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- src/Interpreters/MutationsInterpreter.h | 4 +- src/Storages/IStorage.h | 4 +- .../MergeTree/DataPartStorageOnDisk.cpp | 2 +- .../MergeTree/DataPartStorageOnDisk.h | 2 +- src/Storages/MergeTree/IDataPartStorage.h | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 +-- src/Storages/MergeTree/IMergeTreeDataPart.h | 9 ++--- .../MergeTree/MergeTreeDataPartDeletedMask.h | 2 +- .../MergeTree/MergeTreeMutationEntry.cpp | 4 +- .../MergeTree/MergeTreeMutationEntry.h | 9 +++-- src/Storages/MergeTree/MutateTask.cpp | 12 ++++-- .../MergeTree/ReplicatedMergeTreeQueue.cpp | 2 +- src/Storages/StorageJoin.cpp | 2 +- src/Storages/StorageMemory.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 13 +++++-- src/Storages/StorageMergeTree.h | 4 +- ...lightweight_delete_on_merge_tree.reference | 8 ++++ ...02319_lightweight_delete_on_merge_tree.sql | 37 +++++++++++++++++++ 22 files changed, 102 insertions(+), 44 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index bda72f089eb..7c559330e38 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -458,6 +458,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ + M(Bool, allow_experimental_lightweight_delete, false, "Enable lightweight DELETE mutations for mergetree tables. 
Work in progress", 0) \ + M(Bool, lightweight_delete_mutation, true, "Enable to make ordinary ALTER DELETE queries lightweight for mergetree tables", 0) \ M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \ M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 056a3d9f7b4..03b0e1d5894 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -144,7 +144,7 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) if (!mutation_commands.empty()) { table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); - MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); table->mutate(mutation_commands, getContext()); } diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index fff66402cff..3a786997ae3 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -20,6 +20,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int TABLE_IS_READ_ONLY; + extern const int SUPPORT_IS_DISABLED; } @@ -30,6 +31,9 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex BlockIO InterpreterDeleteQuery::execute() { + if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it"); + FunctionNameNormalizer().visit(query_ptr.get()); const ASTDeleteQuery & delete_query = query_ptr->as(); auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary); @@ -60,8 +64,7 @@ BlockIO InterpreterDeleteQuery::execute() auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - /// Currently do similar as alter table delete. - /// TODO: Mark this delete as lightweight. 
+ /// Convert to MutationCommand MutationCommands mutation_commands; MutationCommand mut_command; @@ -76,12 +79,9 @@ BlockIO InterpreterDeleteQuery::execute() mutation_commands.emplace_back(mut_command); - if (!mutation_commands.empty()) - { - table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); - MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); - storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Lightweight); - } + table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); + storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Lightweight); return {}; } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 5852200da2c..ddef35e88cf 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1936,7 +1936,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc && !settings.empty_result_for_aggregation_by_empty_set && storage && storage->getName() != "MaterializedMySQL" - && !storage->hasLightweightDelete() + && !storage->hasLightweightDeletedMask() && !row_policy_filter && processing_stage == QueryProcessingStage::FetchColumns && query_analyzer->hasAggregation() diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 162039e3fd0..49165c6f9ad 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -44,7 +44,7 @@ public: MutationCommands commands_, ContextPtr context_, bool can_execute_, - bool is_lightweight_ = false); + bool is_lightweight_); void validate(); @@ -79,7 +79,7 @@ public: MutationKind::MutationKindEnum getMutationKind() const { return mutation_kind.mutation_kind; } - void SetSkipDeletedMask(bool skip) { skip_deleted_mask = skip; } + void setSkipDeletedMask(bool skip) { skip_deleted_mask = skip; } private: ASTPtr prepare(bool dry_run); diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 899e7e365ce..3647941cc57 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -235,8 +235,8 @@ public: /// Returns true if the storage supports backup/restore for specific partitions. virtual bool supportsBackupPartition() const { return false; } - /// Return true if there are lightweight parts. - virtual bool hasLightweightDelete() const { return false; } + /// Return true if there are at least one part containing lightweight deleted mask. 
+ virtual bool hasLightweightDeletedMask() const { return false; } private: diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp index 45e0132ddf7..1c4d8d9186a 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -665,7 +665,7 @@ void DataPartStorageOnDisk::loadDeletedRowsMask(MergeTreeDataPartDeletedMask & d } } -void DataPartStorageOnDisk::writeDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const +void DataPartStorageOnDisk::writeDeletedRowsMask(const MergeTreeDataPartDeletedMask & deleted_mask) const { const String final_path = fs::path(getRelativePath()) / deleted_mask.name; const String tmp_path = final_path + ".tmp"; diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h index 61a91714a3b..2362d30a92d 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.h +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -87,7 +87,7 @@ public: bool shallParticipateInMerges(const IStoragePolicy &) const override; void loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const override; - void writeDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const override; + void writeDeletedRowsMask(const MergeTreeDataPartDeletedMask & deleted_mask) const override; void backup( TemporaryFilesOnDisks & temp_dirs, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 3af9a0ce499..cfd29a550e1 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -171,7 +171,7 @@ public: virtual bool shallParticipateInMerges(const IStoragePolicy &) const { return true; } virtual void loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const = 0; - virtual void writeDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const = 0; + virtual void writeDeletedRowsMask(const MergeTreeDataPartDeletedMask & deleted_mask) const = 0; /// Create a backup of a data part. /// This method adds a new entry to backup_entries. 
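With the storage hooks above now taking a MergeTreeDataPartDeletedMask, the per-part mask round-trips through the versioned binary format introduced earlier in the series (a small header with format version, row count and hashes, followed by a T64-compressed Native UInt8 column where 1 marks a deleted row). The following is a hedged usage sketch, not code from the patch: it assumes only the mask members visible in these diffs plus ClickHouse's standard file read/write buffers; the function name and file path are illustrative.

    #include <cstddef>
    #include <string>

    #include <Storages/MergeTree/MergeTreeDataPartDeletedMask.h>
    #include <IO/ReadBufferFromFile.h>
    #include <IO/WriteBufferFromFile.h>

    void deletedMaskRoundTrip(const std::string & path, std::size_t rows_in_part)
    {
        DB::MergeTreeDataPartDeletedMask mask;
        mask.setDeletedRows(rows_in_part, /*value=*/ false);   /// start with "no rows deleted"

        {
            DB::WriteBufferFromFile out(path);   /// e.g. <part_dir>/deleted_rows_mask.bin
            mask.write(out);                     /// header + compressed Native UInt8 column
            out.finalize();
        }

        DB::MergeTreeDataPartDeletedMask loaded;
        DB::ReadBufferFromFile in(path);
        loaded.read(in);                         /// checks format version and header hash

        const auto & deleted = loaded.getDeletedRows();   /// ColumnUInt8, 1 = row is deleted
        (void)deleted;                                    /// e.g. deleted.getData()[row] != 0
    }

Writing to a ".tmp" file and renaming, as DataPartStorageOnDisk::writeDeletedRowsMask does above, keeps a crashed write from leaving a truncated mask behind.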
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 507d815644c..b7066ec0cd5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -791,6 +791,9 @@ NameSet IMergeTreeDataPart::getFileNamesWithoutChecksums() const if (data_part_storage->exists(TXN_VERSION_METADATA_FILE_NAME)) result.emplace(TXN_VERSION_METADATA_FILE_NAME); + if (data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME)) + result.emplace(DELETED_ROWS_MARK_FILE_NAME); + return result; } @@ -1216,8 +1219,6 @@ void IMergeTreeDataPart::loadDeletedMask() if (data_part_storage->exists(deleted_mask.name)) { - has_lightweight_delete = true; - data_part_storage->loadDeletedRowsMask(deleted_mask); if (deleted_mask.getDeletedRows().size() != rows_count) @@ -1232,7 +1233,6 @@ void IMergeTreeDataPart::loadDeletedMask() void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) { deleted_mask.setDeletedRows(new_mask); - has_lightweight_delete = true; data_part_storage->writeDeletedRowsMask(deleted_mask); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index d5f6322beab..54f8af6bb5d 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -328,9 +328,6 @@ public: mutable VersionMetadata version; - /// True if the part has deleted_rows_mask.bin file used for lightweight delete. - bool has_lightweight_delete = false; - /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; @@ -408,6 +405,8 @@ public: static inline constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt"; + static inline constexpr auto DELETED_ROWS_MARK_FILE_NAME = "deleted_rows_mask.bin"; + /// One of part files which is used to check how many references (I'd like /// to say hardlinks, but it will confuse even more) we have for the part /// for zero copy replication. Sadly it's very complex. @@ -460,8 +459,8 @@ public: /// Check metadata in cache is consistent with actual metadata on disk(if use_metadata_cache is true) std::unordered_map checkMetadata() const; - /// True if here is light weight bitmap file in part. - bool hasLightweightDelete() const { return has_lightweight_delete; } + /// True if here is lightweight deleted mask file in part. 
+ bool hasLightweightDelete() const { return deleted_mask.getDeletedRows().size() > 0; } const MergeTreeDataPartDeletedMask& getDeletedMask() const { return deleted_mask; } void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask); diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h index b75c9d37c55..2ecdd8e7b28 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h @@ -17,7 +17,7 @@ struct MergeTreeDataPartDeletedMask explicit MergeTreeDataPartDeletedMask(); using DeletedRows = ColumnUInt8::Ptr; - std::string name = "deleted_rows_mask.bin"; + const std::string name = "deleted_rows_mask.bin"; const ColumnUInt8 & getDeletedRows() const; void setDeletedRows(DeletedRows new_rows); diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index 5b103cbe8d8..180d78f6ee3 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -47,14 +47,14 @@ UInt64 MergeTreeMutationEntry::parseFileName(const String & file_name_) MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk_, const String & path_prefix_, UInt64 tmp_number, const TransactionID & tid_, const WriteSettings & settings, MutationType type_) - : type(type_) - , create_time(time(nullptr)) + : create_time(time(nullptr)) , commands(std::move(commands_)) , disk(std::move(disk_)) , path_prefix(path_prefix_) , file_name("tmp_mutation_" + toString(tmp_number) + ".txt") , is_temp(true) , tid(tid_) + , type(type_) { try { diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.h b/src/Storages/MergeTree/MergeTreeMutationEntry.h index 06cbd44ed49..37dbca9de7b 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.h +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.h @@ -11,15 +11,15 @@ namespace DB { class IBackupEntry; +/// Type of Mutate. Used to control different mutates during mutates +/// assignment. Also allows to apply special logic during mutate process +/// Stored in FutureMergedMutatedPart and MergeTreeMutationEntry. enum class MutationType { Ordinary, Lightweight }; /// A mutation entry for non-replicated MergeTree storage engines. /// Stores information about mutation in file mutation_*.txt. struct MergeTreeMutationEntry { - /// Type of mutation, used for lightweight delete. - MutationType type; - time_t create_time = 0; MutationCommands commands; @@ -41,6 +41,9 @@ struct MergeTreeMutationEntry /// or UnknownCSN if it's not committed (yet) or RolledBackCSN if it's rolled back or PrehistoricCSN if there is no transaction. CSN csn = Tx::UnknownCSN; + /// Type of mutation, used for lightweight delete. + MutationType type; + /// Create a new entry and write it to a temporary file. MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk, const String & path_prefix_, UInt64 tmp_number, const TransactionID & tid_, const WriteSettings & settings, MutationType type_); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 92a096a7890..a3a6c88f7d6 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -432,6 +432,10 @@ NameSet collectFilesToSkip( { NameSet files_to_skip = source_part->getFileNamesWithoutChecksums(); + /// Remove deleted rows mask file name to create hard link for it when mutate some columns. 
+ if (files_to_skip.contains(IMergeTreeDataPart::DELETED_ROWS_MARK_FILE_NAME)) + files_to_skip.erase(IMergeTreeDataPart::DELETED_ROWS_MARK_FILE_NAME); + /// Skip updated files for (const auto & entry : updated_header) { @@ -1355,6 +1359,9 @@ private: std::unique_ptr part_merger_writer_task{nullptr}; }; +/// LightweightDeleteTask works for lightweight delete mutate. +/// The MutationsInterpreter returns a simple select like "select _part_offset where predicates". +/// The prepare() and execute() has special logics for LWD mutate. class LightweightDeleteTask : public IExecutableTask { public: @@ -1665,7 +1672,7 @@ bool MutateTask::prepare() /// Skip to apply deleted mask when reading for MutateSomePartColumns. need_mutate_all_columns = need_mutate_all_columns || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter->isAffectingAllColumns()); if (!need_mutate_all_columns && ctx->source_part->hasLightweightDelete() && !ctx->is_lightweight_mutation) - ctx->interpreter->SetSkipDeletedMask(true); + ctx->interpreter->setSkipDeletedMask(true); ctx->mutating_pipeline_builder = ctx->interpreter->execute(); ctx->updated_header = ctx->interpreter->getUpdatedHeader(); ctx->progress_callback = MergeProgressCallback((*ctx->mutate_entry)->ptr(), ctx->watch_prev_elapsed, *ctx->stage_progress); @@ -1733,9 +1740,6 @@ bool MutateTask::prepare() else if (ctx->is_lightweight_mutation) { ctx->files_to_skip = ctx->source_part->getFileNamesWithoutChecksums(); - /// Skip to create hardlink for deleted_rows_mask.bin - if (ctx->source_part->hasLightweightDelete()) - ctx->files_to_skip.insert("deleted_rows_mask.bin"); /// We will modify or create only deleted_row_mask for lightweight delete. Other columns and key values are copied as-is. task = std::make_unique(ctx); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 312c4146cb9..cd31d356b4b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1920,7 +1920,7 @@ std::vector ReplicatedMergeTreeQueue::getMutationsStatu formatAST(*command.ast, buf, false, true); result.push_back(MergeTreeMutationStatus { - MutationType::Ordinary, + MutationType::Ordinary, /// TODO: ReplicatedMergeTree supports lightweight delete. entry.znode_name, buf.str(), entry.create_time, diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 5e161fc2e6a..0604bb304d0 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -123,7 +123,7 @@ void StorageJoin::mutate(const MutationCommands & commands, ContextPtr context) // New scope controls lifetime of pipeline. 
{ auto storage_ptr = DatabaseCatalog::instance().getTable(getStorageID(), context); - auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true); + auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true, false); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index f3f1162287f..fc4a671c071 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -318,7 +318,7 @@ void StorageMemory::mutate(const MutationCommands & commands, ContextPtr context new_context->setSetting("max_streams_to_max_threads_ratio", 1); new_context->setSetting("max_threads", 1); - auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, new_context, true); + auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, new_context, true, false); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 5480114359e..5a178340382 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -332,7 +332,7 @@ void StorageMergeTree::alter( DatabaseCatalog::instance().getDatabase(table_id.database_name)->alterTable(local_context, table_id, new_metadata); if (!maybe_mutation_commands.empty()) - mutation_version = startMutation(maybe_mutation_commands, local_context); + mutation_version = startMutation(maybe_mutation_commands, local_context, MutationType::Ordinary); } /// Always execute required mutations synchronously, because alters @@ -555,7 +555,12 @@ void StorageMergeTree::setMutationCSN(const String & mutation_id, CSN csn) void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context) { - mutate(commands, query_context, MutationType::Ordinary); + /// Make ordinary ALTER DELETE queries lightweight to check all tests. + if (query_context->getSettingsRef().lightweight_delete_mutation + && commands.size() == 1 && commands.begin()->type == MutationCommand::DELETE) + mutate(commands, query_context, MutationType::Lightweight); + else + mutate(commands, query_context, MutationType::Ordinary); } void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context, MutationType type) @@ -569,7 +574,7 @@ void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr quer waitForMutation(version); } -bool StorageMergeTree::hasLightweightDelete() const +bool StorageMergeTree::hasLightweightDeletedMask() const { return has_lightweight_delete_parts.load(std::memory_order_relaxed); } @@ -1065,7 +1070,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( fake_query_context->makeQueryContext(); fake_query_context->setCurrentQueryId(""); MutationsInterpreter interpreter( - shared_from_this(), metadata_snapshot, commands_for_size_validation, fake_query_context, false); + shared_from_this(), metadata_snapshot, commands_for_size_validation, fake_query_context, false, false); commands_size += interpreter.evaluateCommandsSize(); } catch (...) diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 6251d0d75e0..379e15d5bdf 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -89,7 +89,7 @@ public: /// Support lightweight delete. 
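     /// (Callers pass MutationType::Lightweight only for a single DELETE command with the
     /// lightweight_delete_mutation setting enabled; everything else uses MutationType::Ordinary.)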
void mutate(const MutationCommands & commands, ContextPtr context, MutationType type); - bool hasLightweightDelete() const override; + bool hasLightweightDeletedMask() const override; /// Return introspection information about currently processing or recently processed mutations. std::vector getMutationsStatus() const override; @@ -184,7 +184,7 @@ private: /// Allocate block number for new mutation, write mutation to disk /// and into in-memory structures. Wake up merge-mutation task. - Int64 startMutation(const MutationCommands & commands, ContextPtr query_context, MutationType type = MutationType::Ordinary); + Int64 startMutation(const MutationCommands & commands, ContextPtr query_context, MutationType type); /// Wait until mutation with version will finish mutation for all parts void waitForMutation(Int64 version); void waitForMutation(const String & mutation_id) override; diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference index aefc0c88762..bc30d677348 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference @@ -1,7 +1,13 @@ 99 +1 95 +1 0 +1 -----lightweight mutation type----- +1 +1 +1 1 DELETE WHERE (c % 5) = 1 1 1 DELETE WHERE c = 4 1 0 MATERIALIZE INDEX i_c 1 @@ -27,5 +33,7 @@ t_light 2 2_3_3_1_10 2 t_light 3 3_4_4_1_10 2 t_light 4 4_5_5_1_10 1 -----Test lightweight delete in multi blocks----- +1 +1 1000 -2 1005 -2 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index da302f0f4ff..3c3df06915f 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -5,19 +5,32 @@ CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTr INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); SET mutations_sync = 1; +SET allow_experimental_lightweight_delete = 1; DELETE FROM merge_table_standard_delete WHERE id = 10; SELECT COUNT() FROM merge_table_standard_delete; +DETACH TABLE merge_table_standard_delete; +ATTACH TABLE merge_table_standard_delete; +CHECK TABLE merge_table_standard_delete; + DELETE FROM merge_table_standard_delete WHERE name IN ('1','2','3','4'); SELECT COUNT() FROM merge_table_standard_delete; +DETACH TABLE merge_table_standard_delete; +ATTACH TABLE merge_table_standard_delete; +CHECK TABLE merge_table_standard_delete; + DELETE FROM merge_table_standard_delete WHERE 1; SELECT COUNT() FROM merge_table_standard_delete; +DETACH TABLE merge_table_standard_delete; +ATTACH TABLE merge_table_standard_delete; +CHECK TABLE merge_table_standard_delete; + DROP TABLE merge_table_standard_delete; drop table if exists t_light; @@ -27,11 +40,25 @@ INSERT INTO t_light SELECT number, number, number FROM numbers(10); SELECT '-----lightweight mutation type-----'; DELETE FROM t_light WHERE c%5=1; + +DETACH TABLE t_light; +ATTACH TABLE t_light; +CHECK TABLE t_light; + DELETE FROM t_light WHERE c=4; + +DETACH TABLE t_light; +ATTACH TABLE t_light; +CHECK TABLE t_light; + alter table t_light MATERIALIZE INDEX i_c; alter table t_light update b=-1 where a<3; alter table t_light drop index i_c; +DETACH TABLE t_light; +ATTACH TABLE t_light; +CHECK TABLE t_light; + SELECT is_lightweight, command, is_done FROM system.mutations 
WHERE database = currentDatabase() AND table = 't_light'; SELECT '-----Check that select and merge with lightweight delete.-----'; @@ -52,8 +79,18 @@ CREATE TABLE t_large(a UInt32, b int) ENGINE=MergeTree order BY a settings min_b INSERT INTO t_large SELECT number + 1, number + 1 FROM numbers(100000); DELETE FROM t_large WHERE a = 50000; + +DETACH TABLE t_large; +ATTACH TABLE t_large; +CHECK TABLE t_large; + ALTER TABLE t_large UPDATE b = -2 WHERE a between 1000 and 1005; ALTER TABLE t_large DELETE WHERE a=1; + +DETACH TABLE t_large; +ATTACH TABLE t_large; +CHECK TABLE t_large; + SELECT * FROM t_large WHERE a in (1,1000,1005,50000) order by a; DROP TABLE t_large; From 198674e99483639be3f88c78503a4e3808478d74 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Thu, 7 Jul 2022 11:08:31 +0800 Subject: [PATCH 069/227] Consider lightweight delete as ordinary for compact part in selectPartsToMutate() --- src/Storages/StorageMergeTree.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 5a178340382..ba192f39e2b 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1042,7 +1042,8 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( break; /// Do not combine mutations with different types. - if (it->second.type != first_mutation_type) + /// TODO: compact part support lightweight delete. + if (isWidePart(part) && it->second.type != first_mutation_type) break; size_t commands_size = 0; @@ -1129,7 +1130,11 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( future_part->part_info = new_part_info; future_part->name = part->getNewName(new_part_info); future_part->type = part->getType(); - future_part->mutation_type = first_mutation_type; + + if (isWidePart(part)) + future_part->mutation_type = first_mutation_type; + else + future_part->mutation_type = MutationType::Ordinary; tagger = std::make_unique(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true); return std::make_shared(future_part, std::move(tagger), commands, txn); From 9d1dff515cd2016953d3c1f564c78d4b39086ffe Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Fri, 8 Jul 2022 11:22:25 +0800 Subject: [PATCH 070/227] code changes for strange name and adding exception for mutation entry --- .../MergeTreeDataPartDeletedMask.cpp | 30 +++++++++---------- .../MergeTree/MergeTreeMutationEntry.cpp | 6 +++- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp index e93f3dd65ca..47e284fced9 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp @@ -24,22 +24,22 @@ namespace DB namespace { -struct HashValue +struct DeletedRowsHash { private: char value[16]; public: - HashValue() = default; - explicit HashValue(SipHash & hasher) + DeletedRowsHash() = default; + explicit DeletedRowsHash(SipHash & hasher) { hasher.get128(value); - static_assert(std::is_pod_v, "Expected to be a POD-type"); - static_assert(sizeof(HashValue) * 8 == 128); + static_assert(std::is_pod_v, "Expected to be a POD-type"); + static_assert(sizeof(DeletedRowsHash) * 8 == 128); } - bool operator==(const HashValue & other) const + bool operator==(const DeletedRowsHash & other) const { return memcmp(value, other.value, sizeof(value)) == 0; } @@ -51,9 +51,9 @@ constexpr UInt8 PADDING_SIZE = 7; // 
just in case constexpr UInt8 HEADER_SIZE = 0 + sizeof(FORMAT_VERSION) + sizeof(UInt64) // number of rows in mask - + sizeof(HashValue) // column data hash + + sizeof(DeletedRowsHash) // column data hash + PADDING_SIZE // padding: zero-bytes - + sizeof(HashValue); // header hash + + sizeof(DeletedRowsHash); // header hash } MergeTreeDataPartDeletedMask::MergeTreeDataPartDeletedMask() @@ -77,14 +77,14 @@ void MergeTreeDataPartDeletedMask::setDeletedRows(size_t rows, bool value) void MergeTreeDataPartDeletedMask::read(ReadBuffer & in) { - std::array header_buffer_data; + std::array header_buffer_data; in.readStrict(header_buffer_data.data(), header_buffer_data.size()); {// validate hash of the header first SipHash hash; hash.update(header_buffer_data.data(), header_buffer_data.size()); - const HashValue computed_hash(hash); + const DeletedRowsHash computed_hash(hash); - HashValue read_hash; + DeletedRowsHash read_hash; readPODBinary(read_hash, in); if (read_hash != computed_hash) throw Exception(ErrorCodes::CORRUPTED_DATA, @@ -93,7 +93,7 @@ void MergeTreeDataPartDeletedMask::read(ReadBuffer & in) UInt8 format_version = FORMAT_VERSION; UInt64 stored_rows = 0; - HashValue column_hash; + DeletedRowsHash column_hash; {// Read header values ReadBuffer header(header_buffer_data.data(), header_buffer_data.size(), 0); readBinary(format_version, header); @@ -123,7 +123,7 @@ void MergeTreeDataPartDeletedMask::read(ReadBuffer & in) void MergeTreeDataPartDeletedMask::write(WriteBuffer & out) const { {// Header - std::array header_buffer_data; + std::array header_buffer_data; WriteBuffer header(header_buffer_data.data(), header_buffer_data.size()); writeBinary(FORMAT_VERSION, header); @@ -132,7 +132,7 @@ void MergeTreeDataPartDeletedMask::write(WriteBuffer & out) const { SipHash hash; deleted_rows->updateHashFast(hash); - writePODBinary(HashValue(hash), header); + writePODBinary(DeletedRowsHash(hash), header); } { @@ -145,7 +145,7 @@ void MergeTreeDataPartDeletedMask::write(WriteBuffer & out) const {// header hash SipHash hash; hash.update(header_buffer_data.data(), header_buffer_data.size()); - writePODBinary(HashValue(hash), out); + writePODBinary(DeletedRowsHash(hash), out); } } assert(HEADER_SIZE == out.count()); diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index 180d78f6ee3..08f45e85d23 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -126,7 +126,9 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & pat int format_version; *buf >> "format version: " >> format_version >> "\n"; - assert(format_version <= 2); + /// Allow format_version = 1 for backward compatibility. 
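+    /// For reference, a version-2 mutation_*.txt is expected to begin roughly with
+    ///     format version: 2
+    ///     type: Lightweight
+    /// whereas version-1 files written by older servers carry no type line, so `type`
+    /// defaults to Ordinary below.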
+ if (format_version != 1 && format_version != 2) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported format version in mutation.txt, expected '1' or '2', got '{}'", format_version); type = MutationType::Ordinary; if (format_version == 2) @@ -137,6 +139,8 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & pat auto type_value = magic_enum::enum_cast(type_str); if (type_value.has_value()) type = type_value.value(); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported mutation type in mutation.txt, expected 'Lightweight' or 'Ordinary', got '{}'", type_str); } LocalDateTime create_time_dt; From 4268ac8b0e4a8ba6362bc258e7ef1715b0fb53e5 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Fri, 8 Jul 2022 16:12:59 +0800 Subject: [PATCH 071/227] Add supportLightweightDeleteMutate() in IMergeTreeDataPart to disable LWD for part with projections --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 ++++++ src/Storages/MergeTree/IMergeTreeDataPart.h | 3 +++ src/Storages/MergeTree/MutateTask.cpp | 7 ++++--- src/Storages/StorageMergeTree.cpp | 12 +++++------- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index b7066ec0cd5..2c613122224 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1212,6 +1212,12 @@ void IMergeTreeDataPart::loadColumns(bool require) setSerializationInfos(infos); } +/// Project part / part with project parts / compact part doesn't support LWD. +bool IMergeTreeDataPart::supportLightweightDeleteMutate() const +{ + return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.size() == 0; +} + void IMergeTreeDataPart::loadDeletedMask() { if (part_type == Type::Compact) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 54f8af6bb5d..07eb910d2ba 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -459,6 +459,9 @@ public: /// Check metadata in cache is consistent with actual metadata on disk(if use_metadata_cache is true) std::unordered_map checkMetadata() const; + /// True if the part supports lightweight delete mutate. + bool supportLightweightDeleteMutate() const; + /// True if here is lightweight deleted mask file in part. bool hasLightweightDelete() const { return deleted_mask.getDeletedRows().size() > 0; } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index a3a6c88f7d6..b3cbe9e9a29 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1586,6 +1586,9 @@ MutateTask::MutateTask( ctx->source_part = ctx->future_part->parts[0]; ctx->storage_from_source_part = std::make_shared(ctx->source_part); + /// part is checked for lightweight delete in selectPartsToMutate(). + ctx->is_lightweight_mutation = ctx->future_part->mutation_type == MutationType::Lightweight; + auto storage_snapshot = ctx->storage_from_source_part->getStorageSnapshot(ctx->metadata_snapshot, context_); extendObjectColumns(ctx->storage_columns, storage_snapshot->object_columns, /*with_subcolumns=*/ false); } @@ -1640,9 +1643,7 @@ bool MutateTask::prepare() command.partition, context_for_reading)) ctx->commands_for_part.emplace_back(command); } - /// Enable lightweight delete for wide part only. 
- if (isWidePart(ctx->source_part) && (ctx->future_part->mutation_type == MutationType::Lightweight)) - ctx->is_lightweight_mutation = true; + if (ctx->source_part->isStoredOnDisk() && !ctx->is_lightweight_mutation && !isStorageTouchedByMutations( ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading))) { diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index ba192f39e2b..bf731f09428 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1034,7 +1034,9 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( auto commands = std::make_shared(); size_t current_ast_elements = 0; auto last_mutation_to_apply = mutations_end_it; - MutationType first_mutation_type = mutations_begin_it->second.type; + + bool support_lightweight_mutate = part->supportLightweightDeleteMutate(); + MutationType first_mutation_type = support_lightweight_mutate ? mutations_begin_it->second.type : MutationType::Ordinary; for (auto it = mutations_begin_it; it != mutations_end_it; ++it) { /// Do not squash mutations from different transactions to be able to commit/rollback them independently. @@ -1043,7 +1045,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( /// Do not combine mutations with different types. /// TODO: compact part support lightweight delete. - if (isWidePart(part) && it->second.type != first_mutation_type) + if (support_lightweight_mutate && it->second.type != first_mutation_type) break; size_t commands_size = 0; @@ -1130,11 +1132,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( future_part->part_info = new_part_info; future_part->name = part->getNewName(new_part_info); future_part->type = part->getType(); - - if (isWidePart(part)) - future_part->mutation_type = first_mutation_type; - else - future_part->mutation_type = MutationType::Ordinary; + future_part->mutation_type = first_mutation_type; tagger = std::make_unique(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true); return std::make_shared(future_part, std::move(tagger), commands, txn); From d37152a5d622f09f2027f01c657d7509d0575966 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Mon, 11 Jul 2022 16:22:00 +0800 Subject: [PATCH 072/227] Remove loadDeletedMask() and get deleted mask when needed --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 12 +++++++----- src/Storages/MergeTree/IMergeTreeDataPart.h | 8 +++----- .../MergeTree/MergeTreeDataPartDeletedMask.h | 1 + src/Storages/MergeTree/MergeTreeRangeReader.cpp | 5 +++-- src/Storages/MergeTree/MergeTreeRangeReader.h | 1 + src/Storages/MergeTree/MergeTreeSequentialSource.cpp | 6 ++++-- src/Storages/MergeTree/MutateTask.cpp | 5 ++--- 7 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 2c613122224..c3919d29ee9 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -648,7 +648,6 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); - loadDeletedMask(); } catch (...) { @@ -1215,14 +1214,14 @@ void IMergeTreeDataPart::loadColumns(bool require) /// Project part / part with project parts / compact part doesn't support LWD. 
bool IMergeTreeDataPart::supportLightweightDeleteMutate() const { - return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.size() == 0; + return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.empty(); } -void IMergeTreeDataPart::loadDeletedMask() +const MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMask() const { - if (part_type == Type::Compact) - return; + MergeTreeDataPartDeletedMask deleted_mask {}; + /// Check if deleted mask file exists. if (data_part_storage->exists(deleted_mask.name)) { data_part_storage->loadDeletedRowsMask(deleted_mask); @@ -1234,10 +1233,13 @@ void IMergeTreeDataPart::loadDeletedMask() "(loaded {} rows, expected {} rows).", data_part_storage->getDiskPath(), deleted_mask.name, name, deleted_mask.getDeletedRows().size(), rows_count); } + + return std::move(deleted_mask.getDeletedRowsPtr()); } void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) { + MergeTreeDataPartDeletedMask deleted_mask {}; deleted_mask.setDeletedRows(new_mask); data_part_storage->writeDeletedRowsMask(deleted_mask); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 07eb910d2ba..1f08d42f016 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -463,16 +463,14 @@ public: bool supportLightweightDeleteMutate() const; /// True if here is lightweight deleted mask file in part. - bool hasLightweightDelete() const { return deleted_mask.getDeletedRows().size() > 0; } + bool hasLightweightDelete() const { return data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME); } - const MergeTreeDataPartDeletedMask& getDeletedMask() const { return deleted_mask; } + /// Read lightweight deleted mask when needed. 
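+    /// (The mask is no longer cached as a data part member: every call re-reads
+    /// deleted_rows_mask.bin from the part storage.)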
+ const MergeTreeDataPartDeletedMask::DeletedRows getDeletedMask() const; void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask); - void loadDeletedMask(); protected: - MergeTreeDataPartDeletedMask deleted_mask {}; - /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk ColumnSize total_columns_size; diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h index 2ecdd8e7b28..4bd2cf77bab 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h @@ -20,6 +20,7 @@ struct MergeTreeDataPartDeletedMask const std::string name = "deleted_rows_mask.bin"; const ColumnUInt8 & getDeletedRows() const; + const DeletedRows & getDeletedRowsPtr() const { return deleted_rows; } void setDeletedRows(DeletedRows new_rows); void setDeletedRows(size_t rows, bool value); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 47433fddadb..a2ae9978d15 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -675,6 +675,8 @@ MergeTreeRangeReader::MergeTreeRangeReader( } need_read_deleted_mask = merge_tree_reader->needReadDeletedMask(); + if (need_read_deleted_mask) + deleted_rows = merge_tree_reader->data_part->getDeletedMask(); if (prewhere_info) { @@ -1000,8 +1002,7 @@ void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 UInt8 * pos = vec.data(); UInt8 * end = &vec[num_rows]; - const auto & deleted_rows_col = merge_tree_reader->data_part->getDeletedMask().getDeletedRows(); - const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); + const ColumnUInt8::Container & deleted_rows_mask = deleted_rows->getData(); while (pos < end && leading_begin_part_offset < leading_end_part_offset) { diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 2515455bd83..d57a94820c4 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -270,6 +270,7 @@ private: bool is_initialized = false; Names non_const_virtual_column_names; bool need_read_deleted_mask = false; + ColumnUInt8::Ptr deleted_rows; }; } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index af31560cb86..0deded95850 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -77,6 +77,9 @@ try /// The chunk after deleted mask applied maybe empty. But the empty chunk means done of read rows. 
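     /// (I.e. a chunk may legitimately become empty after deleted rows are filtered out, so an
     /// empty chunk cannot be taken as the end-of-data signal; hence the read loop below.)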
bool need_read_deleted_mask = data_part->hasLightweightDelete(); + ColumnUInt8::Ptr deleted_rows_col; + if (need_read_deleted_mask) + deleted_rows_col = data_part->getDeletedMask(); do { @@ -96,8 +99,7 @@ try if (need_read_deleted_mask) { - const auto & deleted_rows_col = data_part->getDeletedMask().getDeletedRows(); - const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); + const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col->getData(); size_t pos = current_row - rows_read; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index b3cbe9e9a29..9e254361293 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -608,7 +608,6 @@ void finalizeMutatedPart( new_data_part->minmax_idx = source_part->minmax_idx; new_data_part->modification_time = time(nullptr); new_data_part->loadProjections(false, false); - new_data_part->loadDeletedMask(); new_data_part->setBytesOnDisk(new_data_part->data_part_storage->calculateTotalSizeOnDisk()); new_data_part->default_codec = codec; new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk(); @@ -1497,8 +1496,8 @@ private: /// If this part has already applied lightweight mutation, load the past latest bitmap to merge with current bitmap if (ctx->source_part->hasLightweightDelete()) { - const auto & deleted_rows_col = ctx->source_part->getDeletedMask().getDeletedRows(); - const auto & source_data = deleted_rows_col.getData(); + const auto & deleted_rows_col = ctx->source_part->getDeletedMask(); + const auto & source_data = deleted_rows_col->getData(); data.insert(source_data.begin(), source_data.begin() + ctx->source_part->rows_count); has_deleted_rows = true; From 8b1114fa609e648895a43c05a28f79c28668bdba Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 12 Jul 2022 09:38:07 +0800 Subject: [PATCH 073/227] fix clang tidy compile error --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 ++-- src/Storages/MergeTree/IMergeTreeDataPart.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index c3919d29ee9..8f8ea1af025 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1217,7 +1217,7 @@ bool IMergeTreeDataPart::supportLightweightDeleteMutate() const return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.empty(); } -const MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMask() const +MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMask() const { MergeTreeDataPartDeletedMask deleted_mask {}; @@ -1237,7 +1237,7 @@ const MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMa return std::move(deleted_mask.getDeletedRowsPtr()); } -void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) +void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const { MergeTreeDataPartDeletedMask deleted_mask {}; deleted_mask.setDeletedRows(new_mask); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 1f08d42f016..cafdb658216 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -466,8 +466,8 @@ public: bool hasLightweightDelete() const { return 
data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME); } /// Read lightweight deleted mask when needed. - const MergeTreeDataPartDeletedMask::DeletedRows getDeletedMask() const; - void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask); + MergeTreeDataPartDeletedMask::DeletedRows getDeletedMask() const; + void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const; protected: From 0793add9b5604f39fb15904d198759af2ec209f2 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 12 Jul 2022 15:04:00 +0800 Subject: [PATCH 074/227] Fix clang tidy compile error, remove std::move for const getDeletedRowsPtr() --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 8f8ea1af025..4c69bbb5c8c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1234,7 +1234,7 @@ MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMask() c data_part_storage->getDiskPath(), deleted_mask.name, name, deleted_mask.getDeletedRows().size(), rows_count); } - return std::move(deleted_mask.getDeletedRowsPtr()); + return deleted_mask.getDeletedRowsPtr(); } void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const From 5a8d6ba0fc954deedb0d25e98749a928b8608f8c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 15 Jul 2022 09:06:25 +0200 Subject: [PATCH 075/227] clang tidy feedback addressed: updated src/IO/ForkWriteBuffer.cpp to use const reference --- src/IO/ForkWriteBuffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp index ef39c9241db..b055ae588a9 100644 --- a/src/IO/ForkWriteBuffer.cpp +++ b/src/IO/ForkWriteBuffer.cpp @@ -36,7 +36,7 @@ void ForkWriteBuffer::nextImpl() try { - for (auto write_buffer :sources | std::views::reverse) + for (const WriteBufferPtr & write_buffer : sources | std::views::reverse) { if (write_buffer.get() != first_buffer) { From ca42f649da7bf5ce2fa57523d23994a7220677c6 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Fri, 15 Jul 2022 15:31:10 +0800 Subject: [PATCH 076/227] Rewrite logic for loading deleted mask related to getDeletedMask() --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 10 ++++++---- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- .../MergeTree/MergeTreeDataPartDeletedMask.cpp | 2 +- .../MergeTree/MergeTreeDataPartDeletedMask.h | 2 +- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 12 ++++++------ src/Storages/MergeTree/MergeTreeRangeReader.h | 5 +++-- .../MergeTree/MergeTreeSequentialSource.cpp | 13 ++++++------- .../MergeTree/MergeTreeSequentialSource.h | 4 ++++ src/Storages/MergeTree/MutateTask.cpp | 15 ++++++++++----- 9 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 4c69bbb5c8c..b8fd70193f1 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1217,12 +1217,12 @@ bool IMergeTreeDataPart::supportLightweightDeleteMutate() const return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.empty(); } -MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMask() const +bool IMergeTreeDataPart::getDeletedMask(MergeTreeDataPartDeletedMask & deleted_mask) 
const { - MergeTreeDataPartDeletedMask deleted_mask {}; + bool found = false; /// Check if deleted mask file exists. - if (data_part_storage->exists(deleted_mask.name)) + if (data_part_storage->exists(String(deleted_mask.name))) { data_part_storage->loadDeletedRowsMask(deleted_mask); @@ -1232,9 +1232,11 @@ MergeTreeDataPartDeletedMask::DeletedRows IMergeTreeDataPart::getDeletedMask() c "for part {}" "(loaded {} rows, expected {} rows).", data_part_storage->getDiskPath(), deleted_mask.name, name, deleted_mask.getDeletedRows().size(), rows_count); + + found = true; } - return deleted_mask.getDeletedRowsPtr(); + return found; } void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index cafdb658216..7869ca52969 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -466,7 +466,7 @@ public: bool hasLightweightDelete() const { return data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME); } /// Read lightweight deleted mask when needed. - MergeTreeDataPartDeletedMask::DeletedRows getDeletedMask() const; + bool getDeletedMask(MergeTreeDataPartDeletedMask & deleted_mask) const; void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const; protected: diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp index 47e284fced9..d1a78623278 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp @@ -117,7 +117,7 @@ void MergeTreeDataPartDeletedMask::read(ReadBuffer & in) assertEOF(*data_read_buffer); // we probably don't want to check column hash here, since codec verifies data integrity. - deleted_rows = res_column; + deleted_rows = std::move(res_column); } void MergeTreeDataPartDeletedMask::write(WriteBuffer & out) const diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h index 4bd2cf77bab..c8652746d98 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h +++ b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h @@ -17,7 +17,7 @@ struct MergeTreeDataPartDeletedMask explicit MergeTreeDataPartDeletedMask(); using DeletedRows = ColumnUInt8::Ptr; - const std::string name = "deleted_rows_mask.bin"; + static constexpr std::string_view name = "deleted_rows_mask.bin"; const ColumnUInt8 & getDeletedRows() const; const DeletedRows & getDeletedRowsPtr() const { return deleted_rows; } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index a2ae9978d15..2f34d75b2c4 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -674,9 +674,8 @@ MergeTreeRangeReader::MergeTreeRangeReader( sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); } - need_read_deleted_mask = merge_tree_reader->needReadDeletedMask(); - if (need_read_deleted_mask) - deleted_rows = merge_tree_reader->data_part->getDeletedMask(); + if (merge_tree_reader->needReadDeletedMask()) + need_apply_deleted_mask = merge_tree_reader->data_part->getDeletedMask(deleted_mask); if (prewhere_info) { @@ -959,7 +958,7 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t } /// Do similar as part_offset for deleted mask. 
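     /// (I.e. materialize the deleted-rows filter for the rows just read, analogous to how the
     /// _part_offset column is filled above.)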
- if (need_read_deleted_mask) + if (need_apply_deleted_mask) fillDeletedRowMaskColumn(result, leading_begin_part_offset, leading_end_part_offset); return result; @@ -1002,7 +1001,8 @@ void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 UInt8 * pos = vec.data(); UInt8 * end = &vec[num_rows]; - const ColumnUInt8::Container & deleted_rows_mask = deleted_rows->getData(); + const auto & deleted_rows_col = deleted_mask.getDeletedRows(); + const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); while (pos < end && leading_begin_part_offset < leading_end_part_offset) { @@ -1151,7 +1151,7 @@ static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second) /// If prewhere_info exists, only apply to the first prewhere filter. void MergeTreeRangeReader::executeDeletedRowMaskFilterColumns(ReadResult & result) { - if (prewhere_info || !need_read_deleted_mask || !result.deleted_mask_filter_holder) + if (prewhere_info || !need_apply_deleted_mask || !result.deleted_mask_filter_holder) return; const ColumnUInt8 * mask_filter = typeid_cast(result.deleted_mask_filter_holder.get()); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index d57a94820c4..0e6ace18be9 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -2,6 +2,7 @@ #include #include #include +#include namespace DB { @@ -269,8 +270,8 @@ private: bool last_reader_in_chain = false; bool is_initialized = false; Names non_const_virtual_column_names; - bool need_read_deleted_mask = false; - ColumnUInt8::Ptr deleted_rows; + bool need_apply_deleted_mask = false; + MergeTreeDataPartDeletedMask deleted_mask; }; } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 0deded95850..47c45058088 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -68,6 +68,9 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata, MarkRanges{MarkRange(0, data_part->getMarksCount())}, /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings, {}, {}); + + if (data_part->hasLightweightDelete()) + need_apply_deleted_mask = data_part->getDeletedMask(deleted_mask); } Chunk MergeTreeSequentialSource::generate() @@ -76,11 +79,6 @@ try const auto & header = getPort().getHeader(); /// The chunk after deleted mask applied maybe empty. But the empty chunk means done of read rows. 
- bool need_read_deleted_mask = data_part->hasLightweightDelete(); - ColumnUInt8::Ptr deleted_rows_col; - if (need_read_deleted_mask) - deleted_rows_col = data_part->getDeletedMask(); - do { if (!isCancelled() && current_row < data_part->rows_count) @@ -97,9 +95,10 @@ try current_row += rows_read; current_mark += (rows_to_read == rows_read); - if (need_read_deleted_mask) + if (need_apply_deleted_mask) { - const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col->getData(); + const auto & deleted_rows_col = deleted_mask.getDeletedRows(); + const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); size_t pos = current_row - rows_read; diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.h b/src/Storages/MergeTree/MergeTreeSequentialSource.h index a3e4f5fa856..5a571825db5 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.h +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.h @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace DB @@ -58,6 +59,9 @@ private: /// current row at which we stop reading size_t current_row = 0; + bool need_apply_deleted_mask = false; + MergeTreeDataPartDeletedMask deleted_mask {}; + /// Closes readers and unlock part locks void finish(); }; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 9e254361293..30abd546c49 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1496,13 +1496,18 @@ private: /// If this part has already applied lightweight mutation, load the past latest bitmap to merge with current bitmap if (ctx->source_part->hasLightweightDelete()) { - const auto & deleted_rows_col = ctx->source_part->getDeletedMask(); - const auto & source_data = deleted_rows_col->getData(); - data.insert(source_data.begin(), source_data.begin() + ctx->source_part->rows_count); + MergeTreeDataPartDeletedMask deleted_mask {}; + if (ctx->source_part->getDeletedMask(deleted_mask)) + { + const auto & deleted_rows_col = deleted_mask.getDeletedRows(); + const auto & source_data = deleted_rows_col.getData(); + data.insert(source_data.begin(), source_data.begin() + ctx->source_part->rows_count); - has_deleted_rows = true; + has_deleted_rows = true; + } } - else + + if (!has_deleted_rows) new_deleted_rows->insertManyDefaults(ctx->source_part->rows_count); /// Mark the data corresponding to the offset in the as deleted. From 197d3796eca37a734cc82007967d974a4dd10e99 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Fri, 15 Jul 2022 15:37:36 +0800 Subject: [PATCH 077/227] Killed mutation has empty commands may cause SegV --- src/Storages/MergeTree/MutateTask.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 30abd546c49..50c37ba5b08 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -644,7 +644,7 @@ struct MutationContext QueryPipelineBuilder mutating_pipeline_builder; QueryPipeline mutating_pipeline; // in - std::unique_ptr mutating_executor; + std::unique_ptr mutating_executor{nullptr}; ProgressCallback progress_callback; Block updated_header; @@ -1511,7 +1511,7 @@ private: new_deleted_rows->insertManyDefaults(ctx->source_part->rows_count); /// Mark the data corresponding to the offset in the as deleted. 
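         /// (The added `ctx->mutating_executor` check guards a killed mutation: its command list
         /// is empty, so no executor is created, and dereferencing it used to cause a segfault.)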
- while (MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry) && ctx->mutating_executor->pull(block)) + while (MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry) && ctx->mutating_executor && ctx->mutating_executor->pull(block)) { size_t block_rows = block.rows(); @@ -1593,6 +1593,10 @@ MutateTask::MutateTask( /// part is checked for lightweight delete in selectPartsToMutate(). ctx->is_lightweight_mutation = ctx->future_part->mutation_type == MutationType::Lightweight; + /// Empty mutation commands mean that the mutation is killed. Just work as ordinary, clone the part. + if (ctx->commands->empty()) + ctx->is_lightweight_mutation = false; + auto storage_snapshot = ctx->storage_from_source_part->getStorageSnapshot(ctx->metadata_snapshot, context_); extendObjectColumns(ctx->storage_columns, storage_snapshot->object_columns, /*with_subcolumns=*/ false); } From 73e0c35ab070905c325dbac1a3530efa67d36dbc Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 15 Jul 2022 10:13:39 +0000 Subject: [PATCH 078/227] Use 4LW for api version --- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 7 ++ src/Common/ZooKeeper/ZooKeeperCommon.h | 2 + src/Common/ZooKeeper/ZooKeeperIO.cpp | 6 ++ src/Common/ZooKeeper/ZooKeeperIO.h | 1 + src/Common/ZooKeeper/ZooKeeperImpl.cpp | 88 +++++++++++++---------- src/Common/ZooKeeper/ZooKeeperImpl.h | 4 +- src/Coordination/CoordinationSettings.cpp | 2 +- src/Coordination/FourLetterCommand.cpp | 15 +++- src/Coordination/FourLetterCommand.h | 12 ++++ src/Coordination/KeeperStorage.cpp | 31 -------- 10 files changed, 93 insertions(+), 75 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index b15126f5701..2cce22daae4 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -898,4 +898,11 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); } +int32_t fourLetterCommandNameToCode(std::string_view name) +{ + int32_t res = *reinterpret_cast(name.data()); + /// keep consistent with Coordination::read method by changing big endian to little endian. 
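+    /// For illustration: on a little-endian host "apiv" reinterprets to 0x76697061 and the byte
+    /// swap yields 0x61706976 ('a','p','i','v' from most to least significant byte) -- the same
+    /// value IFourLetterCommand::toCode computes on the server side.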
+ return __builtin_bswap32(res); +} + } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 53fabf651fa..36f33b0bb01 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -554,4 +554,6 @@ private: ZooKeeperRequestFactory(); }; +int32_t fourLetterCommandNameToCode(std::string_view name); + } diff --git a/src/Common/ZooKeeper/ZooKeeperIO.cpp b/src/Common/ZooKeeper/ZooKeeperIO.cpp index c84a8624d78..f796212ef0b 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.cpp +++ b/src/Common/ZooKeeper/ZooKeeperIO.cpp @@ -49,6 +49,12 @@ void write(const std::string & s, WriteBuffer & out) out.write(s.data(), s.size()); } +void write(std::string_view s, WriteBuffer & out) +{ + write(static_cast(s.size()), out); + out.write(s.data(), s.size()); +} + void write(const ACL & acl, WriteBuffer & out) { write(acl.permissions, out); diff --git a/src/Common/ZooKeeper/ZooKeeperIO.h b/src/Common/ZooKeeper/ZooKeeperIO.h index ec77b46f3d9..5e5503c504e 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.h +++ b/src/Common/ZooKeeper/ZooKeeperIO.h @@ -26,6 +26,7 @@ void write(uint8_t x, WriteBuffer & out); void write(OpNum x, WriteBuffer & out); void write(bool x, WriteBuffer & out); void write(const std::string & s, WriteBuffer & out); +void write(std::string_view s, WriteBuffer & out); void write(const ACL & acl, WriteBuffer & out); void write(const Stat & stat, WriteBuffer & out); void write(const Error & x, WriteBuffer & out); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index a0544935e25..4f9436947e9 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -354,12 +355,37 @@ ZooKeeper::ZooKeeper( send_thread = ThreadFromGlobalPool([this] { sendThread(); }); receive_thread = ThreadFromGlobalPool([this] { receiveThread(); }); - initApiVersion(); - ProfileEvents::increment(ProfileEvents::ZooKeeperInit); } +Poco::Net::StreamSocket ZooKeeper::connectToNode(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure) +{ + Poco::Net::StreamSocket result; + /// Reset the state of previous attempt. + if (is_secure) + { +#if USE_SSL + result = Poco::Net::SecureStreamSocket(); +#else + throw Poco::Exception( + "Communication with ZooKeeper over SSL is disabled because poco library was built without NetSSL support."); +#endif + } + else + { + result = Poco::Net::StreamSocket(); + } + + result.connect(node_address, connection_timeout); + + result.setReceiveTimeout(operation_timeout); + result.setSendTimeout(operation_timeout); + result.setNoDelay(true); + + return result; +} + void ZooKeeper::connect( const Nodes & nodes, Poco::Timespan connection_timeout) @@ -377,28 +403,9 @@ void ZooKeeper::connect( { try { - /// Reset the state of previous attempt. 
- if (node.secure) - { -#if USE_SSL - socket = Poco::Net::SecureStreamSocket(); -#else - throw Poco::Exception( - "Communication with ZooKeeper over SSL is disabled because poco library was built without NetSSL support."); -#endif - } - else - { - socket = Poco::Net::StreamSocket(); - } - - socket.connect(node.address, connection_timeout); + socket = connectToNode(node.address, connection_timeout, node.secure); socket_address = socket.peerAddress(); - socket.setReceiveTimeout(operation_timeout); - socket.setSendTimeout(operation_timeout); - socket.setNoDelay(true); - in.emplace(socket); out.emplace(socket); @@ -423,6 +430,9 @@ void ZooKeeper::connect( } connected = true; + + initApiVersion(node.address, connection_timeout, node.secure); + break; } catch (...) @@ -1066,29 +1076,29 @@ Coordination::KeeperApiVersion ZooKeeper::getApiVersion() return keeper_api_version; } -void ZooKeeper::initApiVersion() +void ZooKeeper::initApiVersion(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure) { - auto promise = std::make_shared>(); - auto future = promise->get_future(); - - auto callback = [promise](const Coordination::GetResponse & response) mutable + try { - promise->set_value(response); - }; + auto command_socket = connectToNode(node_address, connection_timeout, is_secure); - get(Coordination::keeper_api_version_path, std::move(callback), {}); - if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) - return; + auto apiv_code = Coordination::fourLetterCommandNameToCode("apiv"); - auto response = future.get(); + WriteBufferFromPocoSocket command_out(command_socket); + Coordination::write(apiv_code, command_out); + command_out.next(); - if (response.error != Coordination::Error::ZOK) - return; + ReadBufferFromPocoSocket command_in(command_socket); + std::string result; + readStringUntilEOF(result, command_in); - uint8_t keeper_version{0}; - DB::ReadBufferFromOwnString buf(response.data); - DB::readIntText(keeper_version, buf); - keeper_api_version = static_cast(keeper_version); + auto read_version = parseFromString(result); + keeper_api_version = static_cast(read_version); + } + catch (const DB::Exception & e) + { + LOG_ERROR(&Poco::Logger::get("ZooKeeper"), "Failed to get version: {}", e.message()); + } } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index 023e46f5017..ec7b43ec38f 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -254,6 +254,8 @@ private: const Nodes & node, Poco::Timespan connection_timeout); + Poco::Net::StreamSocket connectToNode(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure); + void sendHandshake(); void receiveHandshake(); @@ -277,7 +279,7 @@ private: void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false); - void initApiVersion(); + void initApiVersion(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure); CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 34d69967828..046659af01e 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -37,7 +37,7 @@ void 
CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index cec107806b7..ab86d1a3a0c 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -39,9 +40,7 @@ String IFourLetterCommand::toName(int32_t code) int32_t IFourLetterCommand::toCode(const String & name) { - int32_t res = *reinterpret_cast(name.data()); - /// keep consistent with Coordination::read method by changing big endian to little endian. - return __builtin_bswap32(res); + return Coordination::fourLetterCommandNameToCode(name); } IFourLetterCommand::~IFourLetterCommand() = default; @@ -132,6 +131,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr recovery_command = std::make_shared(keeper_dispatcher); factory.registerCommand(recovery_command); + FourLetterCommandPtr api_version_command = std::make_shared(keeper_dispatcher); + factory.registerCommand(api_version_command); + factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -247,6 +249,8 @@ String MonitorCommand::run() print(ret, "synced_followers", keeper_info.synced_follower_count); } + print(ret, "api_version", static_cast(Coordination::current_keeper_api_version)); + return ret.str(); } @@ -463,4 +467,9 @@ String RecoveryCommand::run() return "ok"; } +String ApiVersionCommand::run() +{ + return toString(static_cast(Coordination::current_keeper_api_version)); +} + } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index b5d08f4c250..8a98b94b33a 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -315,4 +315,16 @@ struct RecoveryCommand : public IFourLetterCommand String run() override; ~RecoveryCommand() override = default; }; + +struct ApiVersionCommand : public IFourLetterCommand +{ + explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_) + : IFourLetterCommand(keeper_dispatcher_) + { + } + + String name() override { return "apiv"; } + String run() override; + ~ApiVersionCommand() override = default; +}; } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index d07caeaf496..2eec5756b35 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -843,9 +843,6 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce { Coordination::ZooKeeperGetRequest & request = dynamic_cast(*zk_request); - if (request.path == Coordination::keeper_api_version_path) - return {}; - if (!storage.uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -868,16 +865,6 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce } } - // We cannot store the node because the result should be connected to the binary itself - // this way we avoid incorrect results when we read a snapshot from older Keeper that can have - // lower API version - if 
(request.path == Coordination::keeper_api_version_path) - { - response.data = std::to_string(static_cast(Coordination::current_keeper_api_version)); - response.error = Coordination::Error::ZOK; - return response_ptr; - } - auto & container = storage.container; auto node_it = container.find(request.path); if (node_it == container.end()) @@ -924,12 +911,6 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr std::vector new_deltas; - if (request.path == Coordination::keeper_api_version_path) - { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to delete an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); - return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; - } - const auto update_parent_pzxid = [&]() { auto parent_path = parentPath(request.path); @@ -1076,12 +1057,6 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce std::vector new_deltas; - if (request.path == Coordination::keeper_api_version_path) - { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); - return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; - } - if (!storage.uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -1343,12 +1318,6 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr { Coordination::ZooKeeperSetACLRequest & request = dynamic_cast(*zk_request); - if (request.path == Coordination::keeper_api_version_path) - { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); - return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; - } - auto & uncommitted_state = storage.uncommitted_state; if (!uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; From a06fe4e21b60bf586e017717665e7be3190e16da Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 15 Jul 2022 13:53:20 +0200 Subject: [PATCH 079/227] Fix --- src/IO/ReadWriteBufferFromHTTP.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index ab358c8253a..bbd1f92f0ad 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -551,6 +551,9 @@ namespace detail } } + if (exception) + break; + if (use_external_buffer) { setupExternalBuffer(); From d7fc7a429028b7ebf2910100834d87de91efc0e7 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 15 Jul 2022 16:08:53 +0200 Subject: [PATCH 080/227] Addressed review comments - Removed first_buffer, finalizeImpl and logic to different buffer size from ForkWriteBuffer --- src/IO/ForkWriteBuffer.cpp | 37 ++++--------------------------------- src/IO/ForkWriteBuffer.h | 2 -- 2 files changed, 4 insertions(+), 35 deletions(-) diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp index b055ae588a9..e91f6ca433b 100644 --- a/src/IO/ForkWriteBuffer.cpp +++ b/src/IO/ForkWriteBuffer.cpp @@ -16,46 +16,23 @@ ForkWriteBuffer::ForkWriteBuffer(WriteBufferPtrs && sources_) { if (sources.empty()) { - first_buffer = nullptr; throw Exception("ForkWriteBuffer required WriteBuffer is not provided", ErrorCodes::CANNOT_CREATE_IO_BUFFER); } - else - { - first_buffer = sources.begin()->get(); - set(first_buffer->buffer().begin(), 
first_buffer->buffer().size()); - } + set(sources.front()->buffer().begin(), sources.front()->buffer().size()); } void ForkWriteBuffer::nextImpl() { - if (!first_buffer) - return; - - first_buffer->position() = position(); + sources.front()->position() = position(); try { for (const WriteBufferPtr & write_buffer : sources | std::views::reverse) { - if (write_buffer.get() != first_buffer) + if (write_buffer != sources.front()) { - //if buffer size if not enough to write, then split the message with buffer length - if (write_buffer->available() < first_buffer->offset()) - { - size_t bytes_written = 0; - auto to_be_written = first_buffer->offset(); - while (to_be_written != 0) - { - int bytes_to_copy = std::min(to_be_written, write_buffer->available()); - write_buffer->write(first_buffer->buffer().begin()+bytes_written, bytes_to_copy); - write_buffer->next(); - bytes_written += bytes_to_copy; - to_be_written -= bytes_to_copy; - } - } - else - write_buffer->write(first_buffer->buffer().begin(), first_buffer->offset()); + write_buffer->write(sources.front()->buffer().begin(), sources.front()->offset()); } write_buffer->next(); } @@ -68,12 +45,6 @@ void ForkWriteBuffer::nextImpl() } -void ForkWriteBuffer::finalizeImpl() -{ - next(); -} - - ForkWriteBuffer::~ForkWriteBuffer() { finalize(); diff --git a/src/IO/ForkWriteBuffer.h b/src/IO/ForkWriteBuffer.h index 63267fcd8d7..56e9c445842 100644 --- a/src/IO/ForkWriteBuffer.h +++ b/src/IO/ForkWriteBuffer.h @@ -25,11 +25,9 @@ public: protected: void nextImpl() override; - void finalizeImpl() override; private: WriteBufferPtrs sources; - WriteBuffer *first_buffer; }; } From 6318927fba01ab4e9a914586e83f899a7f2b2eea Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 15 Jul 2022 18:37:50 +0200 Subject: [PATCH 081/227] addressed review comment: ForkWriteBuffer for loop --- src/IO/ForkWriteBuffer.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp index e91f6ca433b..dd910f7fdf4 100644 --- a/src/IO/ForkWriteBuffer.cpp +++ b/src/IO/ForkWriteBuffer.cpp @@ -1,7 +1,5 @@ #include #include -#include - namespace DB { @@ -28,14 +26,14 @@ void ForkWriteBuffer::nextImpl() try { - for (const WriteBufferPtr & write_buffer : sources | std::views::reverse) + auto & source_buffer = sources.front(); + for (auto it = sources.begin() + 1; it != sources.end(); ++it) { - if (write_buffer != sources.front()) - { - write_buffer->write(sources.front()->buffer().begin(), sources.front()->offset()); - } - write_buffer->next(); + auto & buffer = *it; + buffer->write(source_buffer->buffer().begin(), source_buffer->offset()); + buffer->next(); } + source_buffer->next(); } catch (Exception & exception) { From 228b9e7ec4af15fa92f2301a8d4f7e67d34452d1 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 17 Jul 2022 20:09:18 +0200 Subject: [PATCH 082/227] Addressed review comments - Updated exception message in ForkWriteBuffer - Added test case to tests/queries/0_stateless/02346_into_outfile_and_stdout.sh for calling nextImpl more than once --- src/IO/ForkWriteBuffer.cpp | 2 +- .../0_stateless/02346_into_outfile_and_stdout.reference | 5 +++++ .../queries/0_stateless/02346_into_outfile_and_stdout.sh | 8 +++++--- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp index dd910f7fdf4..876a924a72b 100644 --- a/src/IO/ForkWriteBuffer.cpp +++ b/src/IO/ForkWriteBuffer.cpp @@ -14,7 +14,7 @@ 
ForkWriteBuffer::ForkWriteBuffer(WriteBufferPtrs && sources_) { if (sources.empty()) { - throw Exception("ForkWriteBuffer required WriteBuffer is not provided", ErrorCodes::CANNOT_CREATE_IO_BUFFER); + throw Exception("Expected non-zero number of buffers for `ForkWriteBuffer`", ErrorCodes::CANNOT_CREATE_IO_BUFFER); } set(sources.front()->buffer().begin(), sources.front()->buffer().size()); } diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference b/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference index d14e7634f24..6032d8f4b79 100644 --- a/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference +++ b/tests/queries/0_stateless/02346_into_outfile_and_stdout.reference @@ -14,3 +14,8 @@ performing test: bad_query_misplaced_compression 1 performing test: bad_query_misplaced_format 1 +performing test: union_all +1 2 +3 4 +1 2 +3 4 diff --git a/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh index 3879249699f..021dc9125d4 100755 --- a/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh +++ b/tests/queries/0_stateless/02346_into_outfile_and_stdout.sh @@ -10,9 +10,9 @@ function perform() local query=$2 echo "performing test: ${test_id}" - ${CLICKHOUSE_CLIENT} --query "${query}" + ${CLICKHOUSE_CLIENT} --query "${query}" | sort --numeric-sort if [ "$?" -eq 0 ]; then - cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.out" + cat "${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_${test_id}.out" | sort --numeric-sort else echo "query failed" fi @@ -72,4 +72,6 @@ performCompression "compression" "SELECT * FROM (SELECT 'Hello, World! From clic performBadQuery "bad_query_misplaced_compression" "SELECT 1, 2, 3 INTO OUTFILE 'test.gz' COMPRESSION 'GZ' AND STDOUT'" "SYNTAX_ERROR" -performBadQuery "bad_query_misplaced_format" "SELECT 1, 2, 3 INTO OUTFILE 'test.gz' FORMAT TabSeparated AND STDOUT'" "SYNTAX_ERROR" \ No newline at end of file +performBadQuery "bad_query_misplaced_format" "SELECT 1, 2, 3 INTO OUTFILE 'test.gz' FORMAT TabSeparated AND STDOUT'" "SYNTAX_ERROR" + +perform "union_all" "SELECT 3, 4 UNION ALL SELECT 1, 2 INTO OUTFILE '${CLICKHOUSE_TMP}/test_into_outfile_and_stdout_union_all.out' AND STDOUT" \ No newline at end of file From 7d7c4ce4cdbc7f4e42d1d87a09f95df49d2b9f18 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 18 Jul 2022 08:44:38 +0000 Subject: [PATCH 083/227] Add ApiVersion request --- src/Common/ZooKeeper/IKeeper.h | 1 - src/Common/ZooKeeper/ZooKeeperCommon.cpp | 15 ++++ src/Common/ZooKeeper/ZooKeeperCommon.h | 24 ++++++ src/Common/ZooKeeper/ZooKeeperConstants.cpp | 3 + src/Common/ZooKeeper/ZooKeeperConstants.h | 1 + src/Common/ZooKeeper/ZooKeeperImpl.cpp | 92 +++++++++++---------- src/Common/ZooKeeper/ZooKeeperImpl.h | 4 +- src/Coordination/CoordinationSettings.cpp | 2 +- src/Coordination/FourLetterCommand.cpp | 8 -- src/Coordination/FourLetterCommand.h | 12 --- src/Coordination/KeeperDispatcher.cpp | 14 ++++ 11 files changed, 109 insertions(+), 67 deletions(-) diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 9592256b7e0..7992547bd11 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -117,7 +117,6 @@ enum KeeperApiVersion : uint8_t }; inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1; -inline constexpr auto * keeper_api_version_path = "/keeper-api-version"; struct Request; using RequestPtr = std::shared_ptr; diff --git 
a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 2cce22daae4..161cd76ec79 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -661,6 +661,20 @@ void ZooKeeperSessionIDResponse::writeImpl(WriteBuffer & out) const Coordination::write(server_id, out); } +Coordination::ZooKeeperResponsePtr ZooKeeperApiVersionRequest::makeResponse() const +{ + return std::make_shared(); +} + +void ZooKeeperApiVersionResponse::readImpl(ReadBuffer & in) +{ + Coordination::read(api_version, in); +} + +void ZooKeeperApiVersionResponse::writeImpl(WriteBuffer & out) const +{ + Coordination::write(api_version, out); +} void ZooKeeperRequest::createLogElements(LogElements & elems) const { @@ -896,6 +910,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); + registerZooKeeperRequest(*this); } int32_t fourLetterCommandNameToCode(std::string_view name) diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 36f33b0bb01..41048e3c84a 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -535,6 +535,30 @@ struct ZooKeeperSessionIDResponse final : ZooKeeperResponse Coordination::OpNum getOpNum() const override { return OpNum::SessionID; } }; +struct ZooKeeperApiVersionRequest final : ZooKeeperRequest +{ + Coordination::OpNum getOpNum() const override { return OpNum::ApiVersion; } + String getPath() const override { return {}; } + void writeImpl(WriteBuffer &) const override {} + void readImpl(ReadBuffer &) override {} + + Coordination::ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } +}; + +/// Fake internal coordination (keeper) response. Never received from client +/// and never send to client. 
+struct ZooKeeperApiVersionResponse final : ZooKeeperResponse +{ + int64_t api_version; + + void readImpl(ReadBuffer & in) override; + + void writeImpl(WriteBuffer & out) const override; + + Coordination::OpNum getOpNum() const override { return OpNum::ApiVersion; } +}; + class ZooKeeperRequestFactory final : private boost::noncopyable { diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.cpp b/src/Common/ZooKeeper/ZooKeeperConstants.cpp index ba7a9b9f0c5..01f0f49e966 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp +++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp @@ -25,6 +25,7 @@ static const std::unordered_set VALID_OPERATIONS = static_cast(OpNum::SetACL), static_cast(OpNum::GetACL), static_cast(OpNum::FilteredList), + static_cast(OpNum::ApiVersion), }; std::string toString(OpNum op_num) @@ -67,6 +68,8 @@ std::string toString(OpNum op_num) return "GetACL"; case OpNum::FilteredList: return "FilteredList"; + case OpNum::ApiVersion: + return "ApiVersion"; } int32_t raw_op = static_cast(op_num); throw Exception("Operation " + std::to_string(raw_op) + " is unknown", Error::ZUNIMPLEMENTED); diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index 44f8437f12c..711e7c4527c 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -35,6 +35,7 @@ enum class OpNum : int32_t // CH Keeper specific operations FilteredList = 500, + ApiVersion = 501, SessionID = 997, /// Special internal request }; diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 4f9436947e9..51ddaa984eb 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -349,6 +349,13 @@ ZooKeeper::ZooKeeper( connect(nodes, connection_timeout); + if (!initApiVersion()) + { + // We failed to get the version, let's reconnect in case + // the connection became faulty + connect(nodes, connection_timeout); + } + if (!auth_scheme.empty()) sendAuth(auth_scheme, auth_data); @@ -358,34 +365,6 @@ ZooKeeper::ZooKeeper( ProfileEvents::increment(ProfileEvents::ZooKeeperInit); } - -Poco::Net::StreamSocket ZooKeeper::connectToNode(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure) -{ - Poco::Net::StreamSocket result; - /// Reset the state of previous attempt. - if (is_secure) - { -#if USE_SSL - result = Poco::Net::SecureStreamSocket(); -#else - throw Poco::Exception( - "Communication with ZooKeeper over SSL is disabled because poco library was built without NetSSL support."); -#endif - } - else - { - result = Poco::Net::StreamSocket(); - } - - result.connect(node_address, connection_timeout); - - result.setReceiveTimeout(operation_timeout); - result.setSendTimeout(operation_timeout); - result.setNoDelay(true); - - return result; -} - void ZooKeeper::connect( const Nodes & nodes, Poco::Timespan connection_timeout) @@ -403,9 +382,29 @@ void ZooKeeper::connect( { try { - socket = connectToNode(node.address, connection_timeout, node.secure); + /// Reset the state of previous attempt. 
+ if (node.secure) + { +#if USE_SSL + socket = Poco::Net::SecureStreamSocket(); +#else + throw Poco::Exception( + "Communication with ZooKeeper over SSL is disabled because poco library was built without NetSSL support."); +#endif + } + else + { + socket = Poco::Net::StreamSocket(); + } + + socket.connect(node.address, connection_timeout); + socket_address = socket.peerAddress(); + socket.setReceiveTimeout(operation_timeout); + socket.setSendTimeout(operation_timeout); + socket.setNoDelay(true); + in.emplace(socket); out.emplace(socket); @@ -431,8 +430,6 @@ void ZooKeeper::connect( connected = true; - initApiVersion(node.address, connection_timeout, node.secure); - break; } catch (...) @@ -1076,28 +1073,39 @@ Coordination::KeeperApiVersion ZooKeeper::getApiVersion() return keeper_api_version; } -void ZooKeeper::initApiVersion(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure) +bool ZooKeeper::initApiVersion() { try { - auto command_socket = connectToNode(node_address, connection_timeout, is_secure); + ZooKeeperApiVersionRequest request; + request.write(*out); - auto apiv_code = Coordination::fourLetterCommandNameToCode("apiv"); + if (!in->poll(operation_timeout.totalMilliseconds())) + { + LOG_ERROR(&Poco::Logger::get("ZooKeeper"), "Failed to get version: timeout"); + return false; + } - WriteBufferFromPocoSocket command_out(command_socket); - Coordination::write(apiv_code, command_out); - command_out.next(); + ZooKeeperApiVersionResponse response; - ReadBufferFromPocoSocket command_in(command_socket); - std::string result; - readStringUntilEOF(result, command_in); + int32_t length; + XID xid; + int64_t zxid; + Error err; + read(length); + read(xid); + read(zxid); + read(err); - auto read_version = parseFromString(result); - keeper_api_version = static_cast(read_version); + response.readImpl(*in); + + keeper_api_version = static_cast(response.api_version); + return true; } catch (const DB::Exception & e) { LOG_ERROR(&Poco::Logger::get("ZooKeeper"), "Failed to get version: {}", e.message()); + return false; } } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index ec7b43ec38f..df34a44235e 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -254,8 +254,6 @@ private: const Nodes & node, Poco::Timespan connection_timeout); - Poco::Net::StreamSocket connectToNode(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure); - void sendHandshake(); void receiveHandshake(); @@ -279,7 +277,7 @@ private: void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false); - void initApiVersion(const Poco::Net::SocketAddress & node_address, Poco::Timespan connection_timeout, bool is_secure); + bool initApiVersion(); CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 046659af01e..34d69967828 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -37,7 +37,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv"; +const String 
KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index ab86d1a3a0c..2e941003bc4 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -131,9 +131,6 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr recovery_command = std::make_shared(keeper_dispatcher); factory.registerCommand(recovery_command); - FourLetterCommandPtr api_version_command = std::make_shared(keeper_dispatcher); - factory.registerCommand(api_version_command); - factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -467,9 +464,4 @@ String RecoveryCommand::run() return "ok"; } -String ApiVersionCommand::run() -{ - return toString(static_cast(Coordination::current_keeper_api_version)); -} - } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index 8a98b94b33a..b5d08f4c250 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -315,16 +315,4 @@ struct RecoveryCommand : public IFourLetterCommand String run() override; ~RecoveryCommand() override = default; }; - -struct ApiVersionCommand : public IFourLetterCommand -{ - explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_) - : IFourLetterCommand(keeper_dispatcher_) - { - } - - String name() override { return "apiv"; } - String run() override; - ~ApiVersionCommand() override = default; -}; } diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 5b376a03b02..fd6135fcbe6 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -266,6 +266,20 @@ bool KeeperDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & requ if (!requests_queue->push(std::move(request_info))) throw Exception("Cannot push request to queue", ErrorCodes::SYSTEM_ERROR); } + else if (request->getOpNum() == Coordination::OpNum::ApiVersion) + { + auto response = std::make_shared(); + response->api_version = Coordination::current_keeper_api_version; + LOG_DEBUG(log, "Returning api version {}", response->api_version); + + KeeperStorage::ResponseForSession response_for_session; + response_for_session.session_id = session_id; + response_for_session.response = std::move(response); + { + if (!responses_queue.tryPush(std::move(response_for_session), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Could not push response with API version into responses queue"); + } + } else if (!requests_queue->tryPush(std::move(request_info), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) { throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); From b8ffb151dcec59f92c2aa2600572f93e886f1461 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 18 Jul 2022 08:52:52 +0000 Subject: [PATCH 084/227] Some small polishing --- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 7 ------- src/Common/ZooKeeper/ZooKeeperCommon.h | 4 ---- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 5 ----- src/Coordination/FourLetterCommand.cpp | 6 +++--- src/Coordination/tests/gtest_coordination.cpp | 14 -------------- 5 
files changed, 3 insertions(+), 33 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 161cd76ec79..05d1af10b4d 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -913,11 +913,4 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); } -int32_t fourLetterCommandNameToCode(std::string_view name) -{ - int32_t res = *reinterpret_cast(name.data()); - /// keep consistent with Coordination::read method by changing big endian to little endian. - return __builtin_bswap32(res); -} - } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 41048e3c84a..a5de08363f7 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -546,8 +546,6 @@ struct ZooKeeperApiVersionRequest final : ZooKeeperRequest bool isReadRequest() const override { return false; } }; -/// Fake internal coordination (keeper) response. Never received from client -/// and never send to client. struct ZooKeeperApiVersionResponse final : ZooKeeperResponse { int64_t api_version; @@ -578,6 +576,4 @@ private: ZooKeeperRequestFactory(); }; -int32_t fourLetterCommandNameToCode(std::string_view name); - } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 51ddaa984eb..342dea9bdb7 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1,4 +1,3 @@ -#include "Common/ZooKeeper/IKeeper.h" #include #include #include @@ -7,12 +6,10 @@ #include #include #include -#include #include #include #include #include -#include #include @@ -398,7 +395,6 @@ void ZooKeeper::connect( } socket.connect(node.address, connection_timeout); - socket_address = socket.peerAddress(); socket.setReceiveTimeout(operation_timeout); @@ -429,7 +425,6 @@ void ZooKeeper::connect( } connected = true; - break; } catch (...) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 2e941003bc4..0eea4ad01c6 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -40,7 +40,9 @@ String IFourLetterCommand::toName(int32_t code) int32_t IFourLetterCommand::toCode(const String & name) { - return Coordination::fourLetterCommandNameToCode(name); + int32_t res = *reinterpret_cast(name.data()); + /// keep consistent with Coordination::read method by changing big endian to little endian. 
+ return __builtin_bswap32(res); } IFourLetterCommand::~IFourLetterCommand() = default; @@ -246,8 +248,6 @@ String MonitorCommand::run() print(ret, "synced_followers", keeper_info.synced_follower_count); } - print(ret, "api_version", static_cast(Coordination::current_keeper_api_version)); - return ret.str(); } diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 63edcf15508..20c33880139 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2120,20 +2120,6 @@ TEST_P(CoordinationTest, TestDurableState) } } -TEST_P(CoordinationTest, TestCurrentApiVersion) -{ - using namespace Coordination; - KeeperStorage storage{500, "", true}; - auto request = std::make_shared(); - request->path = Coordination::keeper_api_version_path; - auto responses = storage.processRequest(request, 0, std::nullopt, true, true); - const auto & get_response = getSingleResponse(responses); - uint8_t keeper_version{0}; - DB::ReadBufferFromOwnString buf(get_response.data); - DB::readIntText(keeper_version, buf); - EXPECT_EQ(keeper_version, current_keeper_api_version); -} - INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ From 758a61a36f770baf7edc69a6ba1dc170fd732a78 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 18 Jul 2022 09:43:21 +0000 Subject: [PATCH 085/227] Close socket if ApiVersion wasn't processed --- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 342dea9bdb7..7350c7f6c1a 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -350,6 +350,7 @@ ZooKeeper::ZooKeeper( { // We failed to get the version, let's reconnect in case // the connection became faulty + socket.close(); connect(nodes, connection_timeout); } From cc1046f18aef8196a3d25d467230e9e48a26cae7 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 18 Jul 2022 12:12:24 +0000 Subject: [PATCH 086/227] Possible fix for flaky test_keeper_force_recovery --- tests/integration/test_keeper_force_recovery/test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py index 5f1b7d1e4e4..8eb759fae47 100644 --- a/tests/integration/test_keeper_force_recovery/test.py +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -5,7 +5,7 @@ from helpers.cluster import ClickHouseCluster import time -from kazoo.client import KazooClient +from kazoo.client import KazooClient, KazooRetry CLUSTER_SIZE = 5 QUORUM_SIZE = CLUSTER_SIZE // 2 + 1 @@ -52,8 +52,10 @@ def started_cluster(): def get_fake_zk(nodename, timeout=30.0): _fake_zk_instance = KazooClient( - hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout + hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout, + command_retry=KazooRetry(max_tries=10) ) + _fake_zk_instance.start() return _fake_zk_instance @@ -117,7 +119,7 @@ def test_cluster_recovery(started_cluster): data_in_cluster = [] def add_data(zk, path, data): - zk.create(path, data.encode()) + zk.retry(zk.create, path, data.encode()) data_in_cluster.append((path, data)) def assert_all_data(zk): From 48db20d5a1d950d72c221a6e23ba6ae660341c7e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 18 Jul 2022 12:19:16 +0000 Subject: [PATCH 087/227] Add 
4LW for api version --- src/Coordination/CoordinationSettings.cpp | 2 +- src/Coordination/FourLetterCommand.cpp | 10 ++++++++++ src/Coordination/FourLetterCommand.h | 12 ++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 34d69967828..046659af01e 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -37,7 +37,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 0eea4ad01c6..378b46bddb6 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -133,6 +133,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr recovery_command = std::make_shared(keeper_dispatcher); factory.registerCommand(recovery_command); + FourLetterCommandPtr api_version_command = std::make_shared(keeper_dispatcher); + factory.registerCommand(api_version_command); + factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -248,6 +251,8 @@ String MonitorCommand::run() print(ret, "synced_followers", keeper_info.synced_follower_count); } + print(ret, "api_version", static_cast(Coordination::current_keeper_api_version)); + return ret.str(); } @@ -464,4 +469,9 @@ String RecoveryCommand::run() return "ok"; } +String ApiVersionCommand::run() +{ + return toString(static_cast(Coordination::current_keeper_api_version)); +} + } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index b5d08f4c250..8a98b94b33a 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -315,4 +315,16 @@ struct RecoveryCommand : public IFourLetterCommand String run() override; ~RecoveryCommand() override = default; }; + +struct ApiVersionCommand : public IFourLetterCommand +{ + explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_) + : IFourLetterCommand(keeper_dispatcher_) + { + } + + String name() override { return "apiv"; } + String run() override; + ~ApiVersionCommand() override = default; +}; } From cfc741030fb740913035b79f0c91a7ecff0c90b5 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 18 Jul 2022 14:30:15 +0000 Subject: [PATCH 088/227] Revert to the version with path --- src/Common/ZooKeeper/IKeeper.h | 1 + src/Common/ZooKeeper/ZooKeeperCommon.cpp | 15 ----- src/Common/ZooKeeper/ZooKeeperCommon.h | 22 ------- src/Common/ZooKeeper/ZooKeeperConstants.cpp | 3 - src/Common/ZooKeeper/ZooKeeperConstants.h | 1 - src/Common/ZooKeeper/ZooKeeperIO.cpp | 6 -- src/Common/ZooKeeper/ZooKeeperIO.h | 1 - src/Common/ZooKeeper/ZooKeeperImpl.cpp | 58 +++++++------------ src/Common/ZooKeeper/ZooKeeperImpl.h | 2 +- src/Coordination/CoordinationSettings.cpp | 2 +- src/Coordination/FourLetterCommand.cpp | 11 ---- src/Coordination/FourLetterCommand.h | 12 ---- src/Coordination/KeeperDispatcher.cpp | 14 ----- src/Coordination/KeeperStorage.cpp | 31 ++++++++++ 
src/Coordination/tests/gtest_coordination.cpp | 14 +++++ 15 files changed, 70 insertions(+), 123 deletions(-) diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 7992547bd11..9592256b7e0 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -117,6 +117,7 @@ enum KeeperApiVersion : uint8_t }; inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1; +inline constexpr auto * keeper_api_version_path = "/keeper-api-version"; struct Request; using RequestPtr = std::shared_ptr; diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 05d1af10b4d..b15126f5701 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -661,20 +661,6 @@ void ZooKeeperSessionIDResponse::writeImpl(WriteBuffer & out) const Coordination::write(server_id, out); } -Coordination::ZooKeeperResponsePtr ZooKeeperApiVersionRequest::makeResponse() const -{ - return std::make_shared(); -} - -void ZooKeeperApiVersionResponse::readImpl(ReadBuffer & in) -{ - Coordination::read(api_version, in); -} - -void ZooKeeperApiVersionResponse::writeImpl(WriteBuffer & out) const -{ - Coordination::write(api_version, out); -} void ZooKeeperRequest::createLogElements(LogElements & elems) const { @@ -910,7 +896,6 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); - registerZooKeeperRequest(*this); } } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index a5de08363f7..53fabf651fa 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -535,28 +535,6 @@ struct ZooKeeperSessionIDResponse final : ZooKeeperResponse Coordination::OpNum getOpNum() const override { return OpNum::SessionID; } }; -struct ZooKeeperApiVersionRequest final : ZooKeeperRequest -{ - Coordination::OpNum getOpNum() const override { return OpNum::ApiVersion; } - String getPath() const override { return {}; } - void writeImpl(WriteBuffer &) const override {} - void readImpl(ReadBuffer &) override {} - - Coordination::ZooKeeperResponsePtr makeResponse() const override; - bool isReadRequest() const override { return false; } -}; - -struct ZooKeeperApiVersionResponse final : ZooKeeperResponse -{ - int64_t api_version; - - void readImpl(ReadBuffer & in) override; - - void writeImpl(WriteBuffer & out) const override; - - Coordination::OpNum getOpNum() const override { return OpNum::ApiVersion; } -}; - class ZooKeeperRequestFactory final : private boost::noncopyable { diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.cpp b/src/Common/ZooKeeper/ZooKeeperConstants.cpp index 01f0f49e966..ba7a9b9f0c5 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp +++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp @@ -25,7 +25,6 @@ static const std::unordered_set VALID_OPERATIONS = static_cast(OpNum::SetACL), static_cast(OpNum::GetACL), static_cast(OpNum::FilteredList), - static_cast(OpNum::ApiVersion), }; std::string toString(OpNum op_num) @@ -68,8 +67,6 @@ std::string toString(OpNum op_num) return "GetACL"; case OpNum::FilteredList: return "FilteredList"; - case OpNum::ApiVersion: - return "ApiVersion"; } int32_t raw_op = static_cast(op_num); throw Exception("Operation " + std::to_string(raw_op) + " is unknown", Error::ZUNIMPLEMENTED); diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index 
711e7c4527c..44f8437f12c 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -35,7 +35,6 @@ enum class OpNum : int32_t // CH Keeper specific operations FilteredList = 500, - ApiVersion = 501, SessionID = 997, /// Special internal request }; diff --git a/src/Common/ZooKeeper/ZooKeeperIO.cpp b/src/Common/ZooKeeper/ZooKeeperIO.cpp index f796212ef0b..c84a8624d78 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.cpp +++ b/src/Common/ZooKeeper/ZooKeeperIO.cpp @@ -49,12 +49,6 @@ void write(const std::string & s, WriteBuffer & out) out.write(s.data(), s.size()); } -void write(std::string_view s, WriteBuffer & out) -{ - write(static_cast(s.size()), out); - out.write(s.data(), s.size()); -} - void write(const ACL & acl, WriteBuffer & out) { write(acl.permissions, out); diff --git a/src/Common/ZooKeeper/ZooKeeperIO.h b/src/Common/ZooKeeper/ZooKeeperIO.h index 5e5503c504e..ec77b46f3d9 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.h +++ b/src/Common/ZooKeeper/ZooKeeperIO.h @@ -26,7 +26,6 @@ void write(uint8_t x, WriteBuffer & out); void write(OpNum x, WriteBuffer & out); void write(bool x, WriteBuffer & out); void write(const std::string & s, WriteBuffer & out); -void write(std::string_view s, WriteBuffer & out); void write(const ACL & acl, WriteBuffer & out); void write(const Stat & stat, WriteBuffer & out); void write(const Error & x, WriteBuffer & out); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 7350c7f6c1a..a0544935e25 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1,3 +1,4 @@ +#include "Common/ZooKeeper/IKeeper.h" #include #include #include @@ -6,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -346,23 +348,18 @@ ZooKeeper::ZooKeeper( connect(nodes, connection_timeout); - if (!initApiVersion()) - { - // We failed to get the version, let's reconnect in case - // the connection became faulty - socket.close(); - connect(nodes, connection_timeout); - } - if (!auth_scheme.empty()) sendAuth(auth_scheme, auth_data); send_thread = ThreadFromGlobalPool([this] { sendThread(); }); receive_thread = ThreadFromGlobalPool([this] { receiveThread(); }); + initApiVersion(); + ProfileEvents::increment(ProfileEvents::ZooKeeperInit); } + void ZooKeeper::connect( const Nodes & nodes, Poco::Timespan connection_timeout) @@ -1069,40 +1066,29 @@ Coordination::KeeperApiVersion ZooKeeper::getApiVersion() return keeper_api_version; } -bool ZooKeeper::initApiVersion() +void ZooKeeper::initApiVersion() { - try + auto promise = std::make_shared>(); + auto future = promise->get_future(); + + auto callback = [promise](const Coordination::GetResponse & response) mutable { - ZooKeeperApiVersionRequest request; - request.write(*out); + promise->set_value(response); + }; - if (!in->poll(operation_timeout.totalMilliseconds())) - { - LOG_ERROR(&Poco::Logger::get("ZooKeeper"), "Failed to get version: timeout"); - return false; - } + get(Coordination::keeper_api_version_path, std::move(callback), {}); + if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) + return; - ZooKeeperApiVersionResponse response; + auto response = future.get(); - int32_t length; - XID xid; - int64_t zxid; - Error err; - read(length); - read(xid); - read(zxid); - read(err); + if (response.error != Coordination::Error::ZOK) + return; - response.readImpl(*in); - - keeper_api_version = static_cast(response.api_version); - 
return true; - } - catch (const DB::Exception & e) - { - LOG_ERROR(&Poco::Logger::get("ZooKeeper"), "Failed to get version: {}", e.message()); - return false; - } + uint8_t keeper_version{0}; + DB::ReadBufferFromOwnString buf(response.data); + DB::readIntText(keeper_version, buf); + keeper_api_version = static_cast(keeper_version); } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index df34a44235e..023e46f5017 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -277,7 +277,7 @@ private: void logOperationIfNeeded(const ZooKeeperRequestPtr & request, const ZooKeeperResponsePtr & response = nullptr, bool finalize = false); - bool initApiVersion(); + void initApiVersion(); CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 046659af01e..34d69967828 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -37,7 +37,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index 378b46bddb6..cec107806b7 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -133,9 +132,6 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr recovery_command = std::make_shared(keeper_dispatcher); factory.registerCommand(recovery_command); - FourLetterCommandPtr api_version_command = std::make_shared(keeper_dispatcher); - factory.registerCommand(api_version_command); - factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -251,8 +247,6 @@ String MonitorCommand::run() print(ret, "synced_followers", keeper_info.synced_follower_count); } - print(ret, "api_version", static_cast(Coordination::current_keeper_api_version)); - return ret.str(); } @@ -469,9 +463,4 @@ String RecoveryCommand::run() return "ok"; } -String ApiVersionCommand::run() -{ - return toString(static_cast(Coordination::current_keeper_api_version)); -} - } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index 8a98b94b33a..b5d08f4c250 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -315,16 +315,4 @@ struct RecoveryCommand : public IFourLetterCommand String run() override; ~RecoveryCommand() override = default; }; - -struct ApiVersionCommand : public IFourLetterCommand -{ - explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_) - : IFourLetterCommand(keeper_dispatcher_) - { - } - - String name() override { return "apiv"; } - String run() override; - ~ApiVersionCommand() override = default; -}; } diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index fd6135fcbe6..5b376a03b02 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ 
b/src/Coordination/KeeperDispatcher.cpp @@ -266,20 +266,6 @@ bool KeeperDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & requ if (!requests_queue->push(std::move(request_info))) throw Exception("Cannot push request to queue", ErrorCodes::SYSTEM_ERROR); } - else if (request->getOpNum() == Coordination::OpNum::ApiVersion) - { - auto response = std::make_shared(); - response->api_version = Coordination::current_keeper_api_version; - LOG_DEBUG(log, "Returning api version {}", response->api_version); - - KeeperStorage::ResponseForSession response_for_session; - response_for_session.session_id = session_id; - response_for_session.response = std::move(response); - { - if (!responses_queue.tryPush(std::move(response_for_session), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Could not push response with API version into responses queue"); - } - } else if (!requests_queue->tryPush(std::move(request_info), configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds())) { throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 2eec5756b35..d07caeaf496 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -843,6 +843,9 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce { Coordination::ZooKeeperGetRequest & request = dynamic_cast(*zk_request); + if (request.path == Coordination::keeper_api_version_path) + return {}; + if (!storage.uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -865,6 +868,16 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce } } + // We cannot store the node because the result should be connected to the binary itself + // this way we avoid incorrect results when we read a snapshot from older Keeper that can have + // lower API version + if (request.path == Coordination::keeper_api_version_path) + { + response.data = std::to_string(static_cast(Coordination::current_keeper_api_version)); + response.error = Coordination::Error::ZOK; + return response_ptr; + } + auto & container = storage.container; auto node_it = container.find(request.path); if (node_it == container.end()) @@ -911,6 +924,12 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr std::vector new_deltas; + if (request.path == Coordination::keeper_api_version_path) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to delete an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } + const auto update_parent_pzxid = [&]() { auto parent_path = parentPath(request.path); @@ -1057,6 +1076,12 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce std::vector new_deltas; + if (request.path == Coordination::keeper_api_version_path) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } + if (!storage.uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; @@ -1318,6 +1343,12 @@ 
struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr { Coordination::ZooKeeperSetACLRequest & request = dynamic_cast(*zk_request); + if (request.path == Coordination::keeper_api_version_path) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } + auto & uncommitted_state = storage.uncommitted_state; if (!uncommitted_state.getNode(request.path)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 20c33880139..63edcf15508 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2120,6 +2120,20 @@ TEST_P(CoordinationTest, TestDurableState) } } +TEST_P(CoordinationTest, TestCurrentApiVersion) +{ + using namespace Coordination; + KeeperStorage storage{500, "", true}; + auto request = std::make_shared(); + request->path = Coordination::keeper_api_version_path; + auto responses = storage.processRequest(request, 0, std::nullopt, true, true); + const auto & get_response = getSingleResponse(responses); + uint8_t keeper_version{0}; + DB::ReadBufferFromOwnString buf(get_response.data); + DB::readIntText(keeper_version, buf); + EXPECT_EQ(keeper_version, current_keeper_api_version); +} + INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ From e165e68386daaef105a4875dda1477d3d4237cca Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 18 Jul 2022 18:12:13 +0200 Subject: [PATCH 089/227] Addressed review comment - updated creating ForkWriteBuffer in ClientBase to avoid duplication --- src/Client/ClientBase.cpp | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index cbb5ec9f004..c8a1dd3dcb0 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -549,26 +549,18 @@ try range.second); } + out_file_buf = wrapWriteBufferWithCompressionMethod( + std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), + compression_method, + compression_level + ); + if (query_with_output->is_into_outfile_with_stdout) { select_into_file_and_stdout = true; - WriteBufferPtr file_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), - compression_method, - compression_level - ); - - out_file_buf = std::make_unique(std::vector{file_buf, + out_file_buf = std::make_unique(std::vector{std::move(out_file_buf), std::make_shared(STDOUT_FILENO)}); } - else - { - out_file_buf = wrapWriteBufferWithCompressionMethod( - std::make_unique(out_file, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_EXCL | O_CREAT), - compression_method, - compression_level - ); - } // We are writing to file, so default format is the same as in non-interactive mode. 
if (is_interactive && is_default_format) From 7f6b175c3eecd3e37d1199ffc44185d46e3eb8e3 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 18 Jul 2022 19:39:10 +0200 Subject: [PATCH 090/227] impl --- .../external-dictionaries/external-dicts-dict-sources.md | 3 ++- .../external-dictionaries/external-dicts-dict-sources.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index ec62205a36d..1a5308b5569 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -553,6 +553,7 @@ Setting fields: :::note The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +There is no explicit parameter `secure`. Both cases: when establishing SSL-connection is mandatory and when it's not are handled automatically. ::: MySQL can be connected on a local host via sockets. To do this, set `host` and `socket`. @@ -815,4 +816,4 @@ Setting fields: :::note The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. -::: \ No newline at end of file +::: diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 36475449cf8..ac03dd39047 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -542,6 +542,7 @@ SOURCE(MYSQL( :::info "Примечание" Поля `table` или `where` не могут быть использованы вместе с полем `query`. Также обязательно должен быть один из источников данных: `table` или `query`. + Явный параметр `secure` отсутствует. Автоматически поддержана работа в обоих случаях: когда установка SSL-соединения необходима и когда нет. MySQL можно подключить на локальном хосте через сокеты, для этого необходимо задать `host` и `socket`. 
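The documentation change in the patch above notes that a MySQL dictionary source has no explicit `secure` parameter: whether SSL is used is decided automatically when the connection is established, so the source is declared the same way in both cases. A minimal DDL sketch of such a source follows; it is illustrative only and is not part of these patches — the dictionary name, host, credentials, database and table are placeholders.

-- Hypothetical example: a MySQL-backed dictionary declared without any `secure` setting.
-- SSL vs. plain connection is negotiated automatically by the server.
CREATE DICTIONARY hypothetical_mysql_dict
(
    id UInt64,
    value String
)
PRIMARY KEY id
SOURCE(MYSQL(
    host 'mysql-host.example.com'
    port 3306
    user 'default'
    password ''
    db 'db_name'
    table 'table_name'
))
LAYOUT(FLAT())
LIFETIME(300);

The equivalent XML configuration of the source likewise omits any `secure` tag, as stated in the updated English and Russian documentation above.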
From 3f81aadb6051a31c6bf0c78bf19e342c0d630790 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 18 Jul 2022 17:53:18 +0000 Subject: [PATCH 091/227] Fix schema inference in case of empty messages in Protobuf/CapnProto formats --- src/Formats/CapnProtoUtils.cpp | 8 ++++++- src/Formats/ProtobufSerializer.cpp | 6 ++++- ...apnproto_protobuf_empty_messages.reference | 4 ++++ ...02327_capnproto_protobuf_empty_messages.sh | 24 +++++++++++++++++++ .../format_schemas/02327_schema.capnp | 10 ++++++++ .../format_schemas/02327_schema.proto | 8 +++++++ 6 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference create mode 100755 tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh create mode 100644 tests/queries/0_stateless/format_schemas/02327_schema.capnp create mode 100644 tests/queries/0_stateless/format_schemas/02327_schema.proto diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index add5220414f..4ae2590ee60 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -16,6 +16,8 @@ #include #include +#include + namespace DB { @@ -490,6 +492,10 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) { auto struct_schema = capnp_type.asStruct(); + + if (struct_schema.getFields().size() == 0) + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); + /// Check if it can be Nullable. if (checkIfStructIsNamedUnion(struct_schema)) { @@ -525,7 +531,7 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) { if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported 2"); NamesAndTypesList names_and_types; for (auto field : schema.getNonUnionFields()) diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index b9af9d61da0..02054d0c1ed 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -3489,7 +3489,11 @@ namespace case FieldTypeId::TYPE_MESSAGE: { const auto * message_descriptor = field_descriptor->message_type(); - if (message_descriptor->field_count() == 1) + if (message_descriptor->field_count() == 0) + { + throw Exception("Empty messages are not supported", ErrorCodes::BAD_ARGUMENTS); + } + else if (message_descriptor->field_count() == 1) { const auto * nested_field_descriptor = message_descriptor->field(0); auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor); diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference new file mode 100644 index 00000000000..b462a5a7baa --- /dev/null +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference @@ -0,0 +1,4 @@ +OK +OK +OK +OK diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh new file mode 100755 index 00000000000..3890f013b3b --- /dev/null +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel, no-replicated-database + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) 
+# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +touch $USER_FILES_PATH/data.capnp + +SCHEMADIR=$(clickhouse-client --query "select * from file('data.capnp', 'CapnProto', 'val1 char') settings format_schema='nonexist:Message'" 2>&1 | grep Exception | grep -oP "file \K.*(?=/nonexist.capnp)") +CLIENT_SCHEMADIR=$CURDIR/format_schemas +SERVER_SCHEMADIR=test_02327 +mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR +cp -r $CLIENT_SCHEMADIR/02327_* $SCHEMADIR/$SERVER_SCHEMADIR/ + + +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; + +$CLICKHOUSE_CLIENT --query="create table t engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table t engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; + +rm -rf ${SCHEMADIR:?}/${SERVER_SCHEMADIR:?} diff --git a/tests/queries/0_stateless/format_schemas/02327_schema.capnp b/tests/queries/0_stateless/format_schemas/02327_schema.capnp new file mode 100644 index 00000000000..c882dcab8d4 --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02327_schema.capnp @@ -0,0 +1,10 @@ +@0x9ef128e10a8010b8; + +struct Nested1 +{ +} + +struct Message +{ + tuple1 @0 : Nested1; +} diff --git a/tests/queries/0_stateless/format_schemas/02327_schema.proto b/tests/queries/0_stateless/format_schemas/02327_schema.proto new file mode 100644 index 00000000000..ae1f440d279 --- /dev/null +++ b/tests/queries/0_stateless/format_schemas/02327_schema.proto @@ -0,0 +1,8 @@ +syntax = "proto3"; + +message Nested { +} + +message Message { + Nested nested = 1; +}; From 24c94676413b38fa65e3568db524b2b72d62e435 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 18 Jul 2022 19:55:14 +0200 Subject: [PATCH 092/227] Fix --- src/Formats/CapnProtoUtils.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index 4ae2590ee60..65954315c0d 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -16,8 +16,6 @@ #include #include -#include - namespace DB { @@ -531,7 +529,7 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) { if (checkIfStructContainsUnnamedUnion(schema)) - throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported 2"); + throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); NamesAndTypesList names_and_types; for (auto field : schema.getNonUnionFields()) From 9de72d995a14d18b49d5dd9b29898e42352ffbff Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Tue, 12 Jul 2022 13:25:14 +0200 Subject: [PATCH 093/227] POC lightweight delete using 
__row_exists virtual column and prewhere-like filtering --- src/Core/Settings.h | 1 + src/Interpreters/MutationsInterpreter.cpp | 6 +- src/Processors/QueryPlan/FilterStep.cpp | 4 +- .../QueryPlan/ReadFromMergeTree.cpp | 1 + .../Transforms/FilterSortedStreamByRange.h | 2 +- src/Processors/Transforms/FilterTransform.cpp | 15 +++-- src/Processors/Transforms/FilterTransform.h | 2 +- src/Storages/MergeTree/MergeTask.cpp | 20 +++++- .../MergeTreeBaseSelectProcessor.cpp | 64 ++++++++++++++++++- .../MergeTree/MergeTreeBaseSelectProcessor.h | 1 + .../MergeTree/MergeTreeBlockReadUtils.cpp | 14 +++- .../MergeTree/MergeTreeBlockReadUtils.h | 1 + src/Storages/MergeTree/MergeTreeData.cpp | 1 + .../MergeTree/MergeTreeDataSelectExecutor.cpp | 4 ++ .../MergeTree/MergeTreeRangeReader.cpp | 27 ++++++-- src/Storages/MergeTree/MergeTreeRangeReader.h | 2 + src/Storages/MergeTree/MergeTreeReadPool.cpp | 4 +- src/Storages/MergeTree/MergeTreeReadPool.h | 3 +- .../MergeTree/MergeTreeSelectProcessor.cpp | 8 ++- .../MergeTree/MergeTreeSequentialSource.cpp | 2 +- .../MergeTreeThreadSelectProcessor.cpp | 20 ++++++ src/Storages/MergeTree/MutateTask.cpp | 14 ++++ src/Storages/StorageDistributed.cpp | 1 + .../02352_ligthweight_delete.reference | 11 ++++ .../0_stateless/02352_ligthweight_delete.sql | 37 +++++++++++ 25 files changed, 238 insertions(+), 27 deletions(-) create mode 100644 tests/queries/0_stateless/02352_ligthweight_delete.reference create mode 100644 tests/queries/0_stateless/02352_ligthweight_delete.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7c559330e38..4e1024a07df 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -459,6 +459,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ M(Bool, allow_experimental_lightweight_delete, false, "Enable lightweight DELETE mutations for mergetree tables. Work in progress", 0) \ + M(Bool, allow_experimental_lwd2, false, "Enable lightweight DELETE mutations using __rows_exists column for mergetree tables. 
Work in progress", 0) \ M(Bool, lightweight_delete_mutation, true, "Enable to make ordinary ALTER DELETE queries lightweight for mergetree tables", 0) \ M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \ M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index cc658bfa764..21160ac5dd8 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -297,7 +297,9 @@ MutationsInterpreter::MutationsInterpreter( , is_lightweight(is_lightweight_) { if (is_lightweight) + { mutation_ast = prepareLightweightDelete(!can_execute); + } else mutation_ast = prepare(!can_execute); } @@ -354,7 +356,7 @@ static void validateUpdateColumns( } } - if (!found) + if (!found && column_name != "__row_exists") /// TODO: properly handle updating __row_exists column for LWD { for (const auto & col : metadata_snapshot->getColumns().getMaterialized()) { @@ -507,7 +509,7 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) /// /// Outer CAST is added just in case if we don't trust the returning type of 'if'. - const auto & type = columns_desc.getPhysical(column).type; + const auto type = (column == "__row_exists" ? std::make_shared() : columns_desc.getPhysical(column).type); auto type_literal = std::make_shared(type->getName()); const auto & update_expr = kv.second; diff --git a/src/Processors/QueryPlan/FilterStep.cpp b/src/Processors/QueryPlan/FilterStep.cpp index ff58abf8874..becfe1062fb 100644 --- a/src/Processors/QueryPlan/FilterStep.cpp +++ b/src/Processors/QueryPlan/FilterStep.cpp @@ -34,7 +34,7 @@ FilterStep::FilterStep( input_stream_, FilterTransform::transformHeader( input_stream_.header, - *actions_dag_, + actions_dag_.get(), filter_column_name_, remove_filter_column_), getTraits(actions_dag_)) @@ -109,7 +109,7 @@ void FilterStep::updateOutputStream() { output_stream = createOutputStream( input_streams.front(), - FilterTransform::transformHeader(input_streams.front().header, *actions_dag, filter_column_name, remove_filter_column), + FilterTransform::transformHeader(input_streams.front().header, actions_dag.get(), filter_column_name, remove_filter_column), getDataStreamTraits()); } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 59f6ec558e7..9a65cd4f17e 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -185,6 +185,7 @@ Pipe ReadFromMergeTree::readFromPool( storage_snapshot, prewhere_info, required_columns, + virt_column_names, backoff_settings, settings.preferred_block_size_bytes, false); diff --git a/src/Processors/Transforms/FilterSortedStreamByRange.h b/src/Processors/Transforms/FilterSortedStreamByRange.h index 04ef3afd69c..e1141ebd299 100644 --- a/src/Processors/Transforms/FilterSortedStreamByRange.h +++ b/src/Processors/Transforms/FilterSortedStreamByRange.h @@ -23,7 +23,7 @@ public: bool on_totals_ = false) : ISimpleTransform( header_, - FilterTransform::transformHeader(header_, expression_->getActionsDAG(), filter_column_name_, remove_filter_column_), + FilterTransform::transformHeader(header_, &expression_->getActionsDAG(), filter_column_name_, remove_filter_column_), true) , filter_transform(header_, expression_, filter_column_name_, remove_filter_column_, on_totals_) { diff --git 
a/src/Processors/Transforms/FilterTransform.cpp b/src/Processors/Transforms/FilterTransform.cpp index 9164599f3b1..5d75bdc483c 100644 --- a/src/Processors/Transforms/FilterTransform.cpp +++ b/src/Processors/Transforms/FilterTransform.cpp @@ -29,11 +29,12 @@ static void replaceFilterToConstant(Block & block, const String & filter_column_ Block FilterTransform::transformHeader( Block header, - const ActionsDAG & expression, + const ActionsDAG * expression, const String & filter_column_name, bool remove_filter_column) { - header = expression.updateHeader(std::move(header)); + if (expression) + header = expression->updateHeader(std::move(header)); if (remove_filter_column) header.erase(filter_column_name); @@ -51,7 +52,7 @@ FilterTransform::FilterTransform( bool on_totals_) : ISimpleTransform( header_, - transformHeader(header_, expression_->getActionsDAG(), filter_column_name_, remove_filter_column_), + transformHeader(header_, expression_ ? &expression_->getActionsDAG() : nullptr, filter_column_name_, remove_filter_column_), true) , expression(std::move(expression_)) , filter_column_name(std::move(filter_column_name_)) @@ -59,7 +60,8 @@ FilterTransform::FilterTransform( , on_totals(on_totals_) { transformed_header = getInputPort().getHeader(); - expression->execute(transformed_header); + if (expression) + expression->execute(transformed_header); filter_column_position = transformed_header.getPositionByName(filter_column_name); auto & column = transformed_header.getByPosition(filter_column_position).column; @@ -74,7 +76,7 @@ IProcessor::Status FilterTransform::prepare() /// Optimization for `WHERE column in (empty set)`. /// The result will not change after set was created, so we can skip this check. /// It is implemented in prepare() stop pipeline before reading from input port. 
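/// Sketch of the intended null-expression usage (simplified; it mirrors how MergeTask uses this
/// transform elsewhere in this patch): with expression == nullptr the transform acts as a pure
/// filter over a column that is already present in the input header, e.g. the __row_exists mask
/// written by a lightweight DELETE:
///
///     auto filter_deleted_rows = std::make_shared<FilterTransform>(
///         header,
///         /* expression_ */ nullptr,          /// nothing to execute, the column was read from the part
///         /* filter_column_name_ */ "__row_exists",
///         /* remove_filter_column_ */ true);  /// drop the mask from the output stream
///
/// This is why checks for a null ExpressionActions pointer are added throughout this file.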
- || (!are_prepared_sets_initialized && expression->checkColumnIsAlwaysFalse(filter_column_name)))) + || (!are_prepared_sets_initialized && expression && expression->checkColumnIsAlwaysFalse(filter_column_name)))) { input.close(); output.finish(); @@ -106,7 +108,8 @@ void FilterTransform::transform(Chunk & chunk) Block block = getInputPort().getHeader().cloneWithColumns(columns); columns.clear(); - expression->execute(block, num_rows_before_filtration); + if (expression) + expression->execute(block, num_rows_before_filtration); columns = block.getColumns(); } diff --git a/src/Processors/Transforms/FilterTransform.h b/src/Processors/Transforms/FilterTransform.h index 3340fe230b7..be892414c96 100644 --- a/src/Processors/Transforms/FilterTransform.h +++ b/src/Processors/Transforms/FilterTransform.h @@ -24,7 +24,7 @@ public: static Block transformHeader( Block header, - const ActionsDAG & expression, + const ActionsDAG * expression, const String & filter_column_name, bool remove_filter_column); diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 7426b384394..cc93e17bda3 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -810,11 +811,28 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() for (const auto & part : global_ctx->future_part->parts) { + auto columns = global_ctx->merging_column_names; + + if (part->getColumns().contains("__row_exists")) + columns.emplace_back("__row_exists"); + + auto input = std::make_unique( - *global_ctx->data, global_ctx->storage_snapshot, part, global_ctx->merging_column_names, ctx->read_with_direct_io, true); + *global_ctx->data, global_ctx->storage_snapshot, part, columns, ctx->read_with_direct_io, true); Pipe pipe(std::move(input)); + +///////////// + if (part->getColumns().contains("__row_exists")) + { + pipe.addSimpleTransform([](const Block & header) + { + return std::make_shared(header, nullptr, "__row_exists", "__row_exists"); + }); + } +///////////// + if (global_ctx->metadata_snapshot->hasSortingKey()) { pipe.addSimpleTransform([this](const Block & header) diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 31ca533a9ec..933f9144c6a 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -61,6 +61,10 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( { non_const_virtual_column_names.emplace_back(*it); } + else if (*it == "__row_exists") + { + non_const_virtual_column_names.emplace_back(*it); + } else { /// Remove virtual columns that are going to be filled with const values @@ -219,10 +223,20 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu { MergeTreeRangeReader* prev_reader = nullptr; bool last_reader = false; + size_t pre_readers_shift = 0; + + if (!reader_settings.skip_deleted_mask && current_task.data_part->getColumns().contains("__row_exists")) + { +// last_reader = !prewhere_actions || prewhere_actions->steps.empty(); + current_task.pre_range_readers.push_back( + MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lwd_filter_step, last_reader, non_const_virtual_column_names)); + prev_reader = ¤t_task.pre_range_readers.back(); + pre_readers_shift++; + } if (prewhere_info) { - if (prewhere_actions->steps.size() != pre_reader_for_step.size()) + 
if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "PREWHERE steps count mismatch, actions: {}, readers: {}", prewhere_actions->steps.size(), pre_reader_for_step.size()); @@ -232,7 +246,7 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu { last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size()); current_task.pre_range_readers.push_back( - MergeTreeRangeReader(pre_reader_for_step[i].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names)); + MergeTreeRangeReader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names)); prev_reader = ¤t_task.pre_range_readers.back(); } @@ -339,7 +353,10 @@ Chunk MergeTreeBaseSelectProcessor::readFromPartImpl() /// Reorder columns. TODO: maybe skip for default case. for (size_t ps = 0; ps < header_without_virtual_columns.columns(); ++ps) { - auto pos_in_sample_block = sample_block.getPositionByName(header_without_virtual_columns.getByPosition(ps).name); + const auto & name = header_without_virtual_columns.getByPosition(ps).name; + if (name == "__row_exists" && !sample_block.has(name)) + continue; /// TODO: properly deal with cases when __row_exists is not read and is filled later + auto pos_in_sample_block = sample_block.getPositionByName(name); ordered_columns.emplace_back(std::move(read_result.columns[pos_in_sample_block])); } @@ -365,6 +382,7 @@ namespace virtual void insertArrayOfStringsColumn(const ColumnPtr & column, const String & name) = 0; virtual void insertStringColumn(const ColumnPtr & column, const String & name) = 0; + virtual void insertUInt8Column(const ColumnPtr & column, const String & name) = 0; virtual void insertUInt64Column(const ColumnPtr & column, const String & name) = 0; virtual void insertUUIDColumn(const ColumnPtr & column, const String & name) = 0; @@ -390,6 +408,8 @@ static void injectNonConstVirtualColumns( { if (virtual_column_name == "_part_offset") inserter.insertUInt64Column(DataTypeUInt64().createColumn(), virtual_column_name); + if (virtual_column_name == "__row_exists") + inserter.insertUInt8Column(DataTypeUInt8().createColumn(), virtual_column_name); } } @@ -485,6 +505,11 @@ namespace block.insert({column, std::make_shared(), name}); } + void insertUInt8Column(const ColumnPtr & column, const String & name) final + { + block.insert({column, std::make_shared(), name}); + } + void insertUInt64Column(const ColumnPtr & column, const String & name) final { block.insert({column, std::make_shared(), name}); @@ -525,6 +550,11 @@ namespace columns.push_back(column); } + void insertUInt8Column(const ColumnPtr & column, const String &) final + { + columns.push_back(column); + } + void insertUInt64Column(const ColumnPtr & column, const String &) final { columns.push_back(column); @@ -569,6 +599,34 @@ void MergeTreeBaseSelectProcessor::injectVirtualColumns( auto columns = chunk.detachColumns(); VirtualColumnsInserterIntoColumns inserter{columns}; + +///////////////////////// +// TODO: implement properly + for (const auto & virtual_column_name : virtual_columns) + { + + if (virtual_column_name == "__row_exists") + { + if (task->data_part->getColumns().contains(virtual_column_name)) + { + /// If this column is present in the part it must be read from the data + assert(task->task_columns.columns.contains(virtual_column_name)); + } + else + { + /// If __row_exists column isn't 
present in the part then + ColumnPtr column; + if (num_rows) + column = DataTypeUInt8().createColumnConst(num_rows, 1)->convertToFullColumnIfConst(); + else + column = DataTypeUInt8().createColumn(); + + inserter.insertUInt8Column(column, virtual_column_name); + } + } + } +/////////////////////////// + /// Only add const virtual columns because non-const ones have already been added injectPartConstVirtualColumns(num_rows, inserter, task, partition_value_type, virtual_columns); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 368dce7deaa..15a088d115c 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -89,6 +89,7 @@ protected: const MergeTreeData & storage; StorageSnapshotPtr storage_snapshot; + PrewhereExprStep lwd_filter_step { nullptr, "__row_exists", true, true }; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 50f4c34f004..ad208f6b041 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -108,7 +108,9 @@ NameSet injectRequiredColumns( for (size_t i = 0; i < columns.size(); ++i) { /// We are going to fetch only physical columns - if (!storage_snapshot->tryGetColumn(options, columns[i])) + const bool is_real_column = storage_snapshot->tryGetColumn(options, columns[i]).has_value(); + const bool is_virtual_column = storage.isVirtualColumn(columns[i], storage_snapshot->getMetadataForQuery()); + if (!is_real_column && !is_virtual_column) throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no physical column or subcolumn {} in table", columns[i]); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( @@ -272,18 +274,26 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, + const Names & non_const_virtual_columns, const PrewhereInfoPtr & prewhere_info, bool with_subcolumns) { Names column_names = required_columns; Names pre_column_names; + /// read non-const virtual column from data if it exists + for (const auto & name : non_const_virtual_columns) + { + if (data_part->getColumns().contains(name)) + column_names.push_back(name); + } + /// inject columns required for defaults evaluation injectRequiredColumns( storage, storage_snapshot, data_part, with_subcolumns, column_names); MergeTreeReadTaskColumns result; - auto options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects(); + auto options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); if (with_subcolumns) options.withSubcolumns(); diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index f9f82dbd1f2..d6277167555 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -79,6 +79,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, + const Names & non_const_virtual_columns, const PrewhereInfoPtr & prewhere_info, bool with_subcolumns); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 
c887552b35b..4df47eb7765 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6562,6 +6562,7 @@ NamesAndTypesList MergeTreeData::getVirtuals() const NameAndTypePair("_partition_value", getPartitionValueType()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_part_offset", std::make_shared()), + NameAndTypePair("__row_exists", std::make_shared()), }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 85231aca253..36d45430cff 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1214,6 +1214,10 @@ static void selectColumnNames( { virt_column_names.push_back(name); } + else if (name == "__row_exists") + { + virt_column_names.push_back(name); + } else if (name == "_part_uuid") { virt_column_names.push_back(name); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 2f34d75b2c4..cf90da36ace 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -657,21 +657,28 @@ MergeTreeRangeReader::MergeTreeRangeReader( , prewhere_info(prewhere_info_) , last_reader_in_chain(last_reader_in_chain_) , is_initialized(true) - , non_const_virtual_column_names(non_const_virtual_column_names_) +// , non_const_virtual_column_names() { + + if (prev_reader) sample_block = prev_reader->getSampleBlock(); for (const auto & name_and_type : merge_tree_reader->getColumns()) sample_block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); - - for (const auto & column_name : non_const_virtual_column_names) + + for (const auto & column_name : non_const_virtual_column_names_) { if (sample_block.has(column_name)) continue; + non_const_virtual_column_names.push_back(column_name); + if (column_name == "_part_offset") sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); + +// if (column_name == "__row_exists") +// sample_block.insert(ColumnWithTypeAndName(ColumnUInt8::create(), std::make_shared(), column_name)); } if (merge_tree_reader->needReadDeletedMask()) @@ -861,7 +868,11 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar if (read_result.num_rows) { /// Physical columns go first and then some virtual columns follow - const size_t physical_columns_count = read_result.columns.size() - non_const_virtual_column_names.size(); + size_t physical_columns_count = read_result.columns.size() - read_result.extra_columns_filled.size(); +/////////// +// TODO: properly account for "virtual columns" that are overridden with real data in the part + +///////////// Columns physical_columns(read_result.columns.begin(), read_result.columns.begin() + physical_columns_count); bool should_evaluate_missing_defaults; @@ -989,6 +1000,7 @@ void MergeTreeRangeReader::fillPartOffsetColumn(ReadResult & result, UInt64 lead } result.columns.emplace_back(std::move(column)); + result.extra_columns_filled.push_back("_part_offset"); } /// Fill deleted_row_mask column, referenced from fillPartOffsetColumn(). 
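/// (Note on the bookkeeping introduced above: every virtual column that the range reader fills
/// by itself is recorded in ReadResult::extra_columns_filled, so the number of physical columns
/// can later be recovered as, roughly,
///
///     size_t physical_columns_count = read_result.columns.size() - read_result.extra_columns_filled.size();
///
/// rather than assuming that every requested non-const virtual column was actually appended.)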
void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset) @@ -1184,7 +1196,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r size_t num_columns = header.size(); /// Check that we have columns from previous steps and newly read required columns - if (result.columns.size() < num_columns + non_const_virtual_column_names.size()) + if (result.columns.size() < num_columns + result.extra_columns_filled.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid number of columns passed to MergeTreeRangeReader. Expected {}, got {}", num_columns, result.columns.size()); @@ -1227,6 +1239,11 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r block.insert({result.columns[pos], std::make_shared(), column_name}); } + else if (column_name == "__row_exists") + { + /// do nothing, it will be added later + /// TODO: properly implement reading non-const virtual columns or filling them with default values + } else throw Exception("Unexpected non-const virtual column: " + column_name, ErrorCodes::LOGICAL_ERROR); ++pos; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 0e6ace18be9..8f063786cbc 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -244,6 +244,8 @@ public: static size_t numZerosInTail(const UInt8 * begin, const UInt8 * end); std::map filter_bytes_map; + + Names extra_columns_filled; }; ReadResult read(size_t max_rows, MarkRanges & ranges); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index d44d250149e..9bcc6535abb 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -26,6 +26,7 @@ MergeTreeReadPool::MergeTreeReadPool( const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const Names & column_names_, + const Names & virtual_column_names_, const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, bool do_not_steal_tasks_) @@ -34,6 +35,7 @@ MergeTreeReadPool::MergeTreeReadPool( , data{data_} , storage_snapshot{storage_snapshot_} , column_names{column_names_} + , virtual_column_names{virtual_column_names_} , do_not_steal_tasks{do_not_steal_tasks_} , predict_block_size_bytes{preferred_block_size_bytes_ > 0} , prewhere_info{prewhere_info_} @@ -213,7 +215,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & auto task_columns = getReadTaskColumns( data, storage_snapshot, part.data_part, - column_names, prewhere_info, /*with_subcolumns=*/ true); + column_names, virtual_column_names /*TODO: fill non-const virtual columns*/, prewhere_info, /*with_subcolumns=*/ true); auto size_predictor = !predict_block_size_bytes ? 
nullptr : MergeTreeBaseSelectProcessor::getSizePredictor(part.data_part, task_columns, sample_block); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index d882c0d761f..01a1280b6cb 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -73,7 +73,7 @@ public: size_t threads_, size_t sum_marks_, size_t min_marks_for_concurrent_read_, RangesInDataParts && parts_, const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, - const Names & column_names_, + const Names & column_names_, const Names & virtual_column_names_, const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, bool do_not_steal_tasks_ = false); @@ -97,6 +97,7 @@ private: const MergeTreeData & data; StorageSnapshotPtr storage_snapshot; const Names column_names; + const Names virtual_column_names; bool do_not_steal_tasks; bool predict_block_size_bytes; diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index aa6c457f412..47bcf72d611 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -52,7 +52,7 @@ void MergeTreeSelectProcessor::initializeReaders() { task_columns = getReadTaskColumns( storage, storage_snapshot, data_part, - required_columns, prewhere_info, /*with_subcolumns=*/ true); + required_columns, non_const_virtual_column_names, prewhere_info, /*with_subcolumns=*/ true); /// Will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames(); @@ -68,6 +68,12 @@ void MergeTreeSelectProcessor::initializeReaders() pre_reader_for_step.clear(); + if (!reader_settings.skip_deleted_mask && data_part->getColumns().contains("__row_exists")) + { + pre_reader_for_step.push_back(data_part->getReader({{"__row_exists", std::make_shared()}}, storage_snapshot->getMetadataForQuery(), + all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {})); + } + if (prewhere_info) { for (const auto & pre_columns_for_step : task_columns.pre_columns) diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 47c45058088..53a56bad97e 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -46,7 +46,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { - auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withVirtuals(); columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read); } else diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp index 574ce4479f2..4c0eac95593 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp @@ -116,6 +116,16 @@ void MergeTreeThreadSelectProcessor::finalizeNewTask() IMergeTreeReader::ValueSizeMap{}, profile_callback); pre_reader_for_step.clear(); + + + if (!reader_settings.skip_deleted_mask && task->data_part->getColumns().contains("__row_exists")) + { + 
pre_reader_for_step.push_back(task->data_part->getReader({{"__row_exists", std::make_shared()}}, metadata_snapshot, task->mark_ranges, + owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, + IMergeTreeReader::ValueSizeMap{}, profile_callback)); + } + + if (prewhere_info) { for (const auto & pre_columns_per_step : task->task_columns.pre_columns) @@ -137,6 +147,16 @@ void MergeTreeThreadSelectProcessor::finalizeNewTask() reader->getAvgValueSizeHints(), profile_callback); pre_reader_for_step.clear(); + + if (!reader_settings.skip_deleted_mask && task->data_part->getColumns().contains("__row_exists")) + { + pre_reader_for_step.push_back(task->data_part->getReader({{"__row_exists", std::make_shared()}}, metadata_snapshot, task->mark_ranges, + owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, + reader->getAvgValueSizeHints(), profile_callback)); + } + + + if (prewhere_info) { for (const auto & pre_columns_per_step : task->task_columns.pre_columns) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 50c37ba5b08..b47f0cab6ab 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -174,6 +174,15 @@ getColumnsForNewDataPart( /// All commands are validated in AlterCommand so we don't care about order for (const auto & command : commands_for_removes) { + if (command.type == MutationCommand::UPDATE) + { + for (const auto & [column_name, _] : command.column_to_update_expression) + { + if (column_name == "__row_exists" && !storage_columns.contains(column_name)) + storage_columns.emplace_back("__row_exists", std::make_shared()); + } + } + /// If we don't have this column in source part, than we don't need to materialize it if (!part_columns.has(command.column_name)) continue; @@ -1682,6 +1691,11 @@ bool MutateTask::prepare() need_mutate_all_columns = need_mutate_all_columns || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter->isAffectingAllColumns()); if (!need_mutate_all_columns && ctx->source_part->hasLightweightDelete() && !ctx->is_lightweight_mutation) ctx->interpreter->setSkipDeletedMask(true); + +///// + ctx->interpreter->setSkipDeletedMask(true); +///// + ctx->mutating_pipeline_builder = ctx->interpreter->execute(); ctx->updated_header = ctx->interpreter->getUpdatedHeader(); ctx->progress_callback = MergeProgressCallback((*ctx->mutate_entry)->ptr(), ctx->watch_prev_elapsed, *ctx->stage_progress); diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 03eb400a8ad..32e0fcffca6 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -312,6 +312,7 @@ NamesAndTypesList StorageDistributed::getVirtuals() const NameAndTypePair("_partition_id", std::make_shared()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_part_offset", std::make_shared()), + NameAndTypePair("__row_exists", std::make_shared()), NameAndTypePair("_shard_num", std::make_shared()), /// deprecated }; } diff --git a/tests/queries/0_stateless/02352_ligthweight_delete.reference b/tests/queries/0_stateless/02352_ligthweight_delete.reference new file mode 100644 index 00000000000..26cce75896c --- /dev/null +++ b/tests/queries/0_stateless/02352_ligthweight_delete.reference @@ -0,0 +1,11 @@ +Rows in parts 10000000 +Count 10000000 +First row 0 10 +Delete 3M rows using light weight delete +Rows in parts 10000000 +Count 7000000 +First row 3000000 10 +Force merge to cleanup 
deleted rows +Rows in parts 7000000 +Count 7000000 +First row 3000000 10 diff --git a/tests/queries/0_stateless/02352_ligthweight_delete.sql b/tests/queries/0_stateless/02352_ligthweight_delete.sql new file mode 100644 index 00000000000..a472a927424 --- /dev/null +++ b/tests/queries/0_stateless/02352_ligthweight_delete.sql @@ -0,0 +1,37 @@ +DROP TABLE IF EXISTS lwd_test; + +CREATE TABLE lwd_test (id UInt64 , value String) ENGINE MergeTree() ORDER BY id; + +INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 10000000; + +SET mutations_sync = 1; +--SET allow_experimental_lightweight_delete = 1; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; + +SELECT 'Count', count() FROM lwd_test WHERE id >= 0; + +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + +SELECT 'Delete 3M rows using light weight delete'; +ALTER TABLE lwd_test UPDATE __row_exists = 0 WHERE id < 3000000; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; + +SELECT 'Count', count() FROM lwd_test WHERE id >= 0; + +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + +SELECT 'Force merge to cleanup deleted rows'; +OPTIMIZE TABLE lwd_test FINAL; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; + +SELECT 'Count', count() FROM lwd_test WHERE id >= 0; + +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + +DROP TABLE lwd_test; \ No newline at end of file From f956810fdd791a9deab34abfbba5dcdaf05ec672 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Tue, 12 Jul 2022 22:56:15 +0200 Subject: [PATCH 094/227] Rewrite DELETE FROM into UPDATE __row_exist=0 --- src/Interpreters/InterpreterDeleteQuery.cpp | 56 +++++++++++++++---- ...nce => 02352_lightweight_delete.reference} | 6 +- ...elete.sql => 02352_lightweight_delete.sql} | 16 +++++- 3 files changed, 62 insertions(+), 16 deletions(-) rename tests/queries/0_stateless/{02352_ligthweight_delete.reference => 02352_lightweight_delete.reference} (58%) rename tests/queries/0_stateless/{02352_ligthweight_delete.sql => 02352_lightweight_delete.sql} (71%) diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 3a786997ae3..33662b94614 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -31,7 +33,7 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex BlockIO InterpreterDeleteQuery::execute() { - if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete) + if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete && !getContext()->getSettingsRef().allow_experimental_lwd2) throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. 
Set `allow_experimental_lightweight_delete` setting to enable it"); FunctionNameNormalizer().visit(query_ptr.get()); @@ -68,20 +70,50 @@ BlockIO InterpreterDeleteQuery::execute() MutationCommands mutation_commands; MutationCommand mut_command; - mut_command.type = MutationCommand::Type::DELETE; - mut_command.predicate = delete_query.predicate; + if (getContext()->getSettingsRef().allow_experimental_lwd2) + { + /// UPDATE __row_exists = 0 WHERE predicate + mut_command.type = MutationCommand::Type::UPDATE; + mut_command.predicate = delete_query.predicate; - auto command = std::make_shared(); - command->type = ASTAlterCommand::DELETE; - command->predicate = delete_query.predicate; - command->children.push_back(command->predicate); - mut_command.ast = command->ptr(); + auto command = std::make_shared(); + command->type = ASTAlterCommand::UPDATE; + command->predicate = delete_query.predicate; + command->update_assignments = std::make_shared(); + auto set_row_exists = std::make_shared(); + set_row_exists->column_name = "__row_exists"; + auto zero_value = std::make_shared(DB::Field(UInt8(0))); + set_row_exists->children.push_back(zero_value); + command->update_assignments->children.push_back(set_row_exists); + command->children.push_back(command->predicate); + command->children.push_back(command->update_assignments); + mut_command.column_to_update_expression[set_row_exists->column_name] = zero_value; + mut_command.ast = command->ptr(); - mutation_commands.emplace_back(mut_command); + mutation_commands.emplace_back(mut_command); + + table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); + storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Ordinary); + } + else + { + mut_command.type = MutationCommand::Type::DELETE; + mut_command.predicate = delete_query.predicate; + + auto command = std::make_shared(); + command->type = ASTAlterCommand::DELETE; + command->predicate = delete_query.predicate; + command->children.push_back(command->predicate); + mut_command.ast = command->ptr(); + + mutation_commands.emplace_back(mut_command); + + table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); + storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Lightweight); + } - table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); - MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); - storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Lightweight); return {}; } diff --git a/tests/queries/0_stateless/02352_ligthweight_delete.reference b/tests/queries/0_stateless/02352_lightweight_delete.reference similarity index 58% rename from tests/queries/0_stateless/02352_ligthweight_delete.reference rename to tests/queries/0_stateless/02352_lightweight_delete.reference index 26cce75896c..8ae6b1d3195 100644 --- a/tests/queries/0_stateless/02352_ligthweight_delete.reference +++ b/tests/queries/0_stateless/02352_lightweight_delete.reference @@ -1,7 +1,7 @@ Rows in parts 10000000 Count 10000000 First row 0 10 -Delete 3M rows using light weight delete +Delete 3M rows using UPDATE __row_exists Rows in parts 10000000 Count 7000000 First row 3000000 10 @@ -9,3 +9,7 @@ Force merge to cleanup deleted rows Rows in parts 7000000 
Count 7000000 First row 3000000 10 +Delete 3M more rows using light weight DELETE +Rows in parts 7000000 +Count 4000000 +First row 6000000 10 diff --git a/tests/queries/0_stateless/02352_ligthweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql similarity index 71% rename from tests/queries/0_stateless/02352_ligthweight_delete.sql rename to tests/queries/0_stateless/02352_lightweight_delete.sql index a472a927424..c5f636181cd 100644 --- a/tests/queries/0_stateless/02352_ligthweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -5,7 +5,6 @@ CREATE TABLE lwd_test (id UInt64 , value String) ENGINE MergeTree() ORDER BY id; INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 10000000; SET mutations_sync = 1; ---SET allow_experimental_lightweight_delete = 1; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; @@ -14,7 +13,7 @@ SELECT 'Count', count() FROM lwd_test WHERE id >= 0; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 3M rows using light weight delete'; +SELECT 'Delete 3M rows using UPDATE __row_exists'; ALTER TABLE lwd_test UPDATE __row_exists = 0 WHERE id < 3000000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; @@ -34,4 +33,15 @@ SELECT 'Count', count() FROM lwd_test WHERE id >= 0; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -DROP TABLE lwd_test; \ No newline at end of file +SET allow_experimental_lwd2 = 1; +SELECT 'Delete 3M more rows using light weight DELETE'; +DELETE FROM lwd_test WHERE id < 6000000; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; + +SELECT 'Count', count() FROM lwd_test WHERE id >= 0; + +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + +DROP TABLE lwd_test; From f324ca992155b85dd3b4ef5e901f094fa90473b2 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Sun, 17 Jul 2022 20:41:17 +0200 Subject: [PATCH 095/227] Cleanups --- src/Core/Settings.h | 2 +- src/Interpreters/InterpreterDeleteQuery.cpp | 19 +- src/Interpreters/MutationsInterpreter.cpp | 18 +- src/Storages/AlterCommands.cpp | 7 +- src/Storages/MergeTree/MergeTask.cpp | 18 +- .../MergeTreeBaseSelectProcessor.cpp | 294 +++++++----------- .../MergeTree/MergeTreeBaseSelectProcessor.h | 28 +- .../MergeTree/MergeTreeRangeReader.cpp | 1 + src/Storages/MergeTree/MergeTreeRangeReader.h | 6 +- .../MergeTreeReverseSelectProcessor.cpp | 6 +- .../MergeTreeReverseSelectProcessor.h | 4 +- .../MergeTree/MergeTreeSelectProcessor.cpp | 23 +- .../MergeTreeThreadSelectProcessor.cpp | 64 +--- src/Storages/MergeTree/MutateTask.cpp | 16 +- .../MergeTree/registerStorageMergeTree.cpp | 3 + src/Storages/StorageInMemoryMetadata.cpp | 2 + src/Storages/StorageInMemoryMetadata.h | 2 + src/Storages/TTLDescription.h | 7 + .../02352_lightweight_delete.reference | 8 + .../0_stateless/02352_lightweight_delete.sql | 31 +- 20 files changed, 256 insertions(+), 303 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4e1024a07df..cb932843fc2 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -459,7 +459,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP 
BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ M(Bool, allow_experimental_lightweight_delete, false, "Enable lightweight DELETE mutations for mergetree tables. Work in progress", 0) \ - M(Bool, allow_experimental_lwd2, false, "Enable lightweight DELETE mutations using __rows_exists column for mergetree tables. Work in progress", 0) \ + M(Bool, allow_experimental_lightweight_delete_with_row_exists, false, "Enable lightweight DELETE mutations using __rows_exists column for mergetree tables. Work in progress", 0) \ M(Bool, lightweight_delete_mutation, true, "Enable to make ordinary ALTER DELETE queries lightweight for mergetree tables", 0) \ M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \ M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 33662b94614..0b7fdbd264c 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -33,8 +33,11 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex BlockIO InterpreterDeleteQuery::execute() { - if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete && !getContext()->getSettingsRef().allow_experimental_lwd2) + if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete && + !getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) + { throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. 
Set `allow_experimental_lightweight_delete` setting to enable it"); + } FunctionNameNormalizer().visit(query_ptr.get()); const ASTDeleteQuery & delete_query = query_ptr->as(); @@ -70,9 +73,9 @@ BlockIO InterpreterDeleteQuery::execute() MutationCommands mutation_commands; MutationCommand mut_command; - if (getContext()->getSettingsRef().allow_experimental_lwd2) + if (getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) { - /// UPDATE __row_exists = 0 WHERE predicate + /// Build "UPDATE __row_exists = 0 WHERE predicate" query mut_command.type = MutationCommand::Type::UPDATE; mut_command.predicate = delete_query.predicate; @@ -80,14 +83,14 @@ BlockIO InterpreterDeleteQuery::execute() command->type = ASTAlterCommand::UPDATE; command->predicate = delete_query.predicate; command->update_assignments = std::make_shared(); - auto set_row_exists = std::make_shared(); - set_row_exists->column_name = "__row_exists"; + auto set_row_does_not_exist = std::make_shared(); + set_row_does_not_exist->column_name = metadata_snapshot->lightweight_delete_description.filter_column.name; auto zero_value = std::make_shared(DB::Field(UInt8(0))); - set_row_exists->children.push_back(zero_value); - command->update_assignments->children.push_back(set_row_exists); + set_row_does_not_exist->children.push_back(zero_value); + command->update_assignments->children.push_back(set_row_does_not_exist); command->children.push_back(command->predicate); command->children.push_back(command->update_assignments); - mut_command.column_to_update_expression[set_row_exists->column_name] = zero_value; + mut_command.column_to_update_expression[set_row_does_not_exist->column_name] = zero_value; mut_command.ast = command->ptr(); mutation_commands.emplace_back(mut_command); diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 21160ac5dd8..ffe80a9502a 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -29,7 +29,6 @@ #include #include - namespace DB { @@ -297,9 +296,7 @@ MutationsInterpreter::MutationsInterpreter( , is_lightweight(is_lightweight_) { if (is_lightweight) - { mutation_ast = prepareLightweightDelete(!can_execute); - } else mutation_ast = prepare(!can_execute); } @@ -356,7 +353,11 @@ static void validateUpdateColumns( } } - if (!found && column_name != "__row_exists") /// TODO: properly handle updating __row_exists column for LWD + /// Allow to override values of virtual columns + if (!found && column_name == metadata_snapshot->lightweight_delete_description.filter_column.name) + found = true; + + if (!found) { for (const auto & col : metadata_snapshot->getColumns().getMaterialized()) { @@ -509,7 +510,14 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) /// /// Outer CAST is added just in case if we don't trust the returning type of 'if'. - const auto type = (column == "__row_exists" ? 
std::make_shared() : columns_desc.getPhysical(column).type); + DataTypePtr type; + if (auto physical_column = columns_desc.tryGetPhysical(column)) + type = physical_column->type; + else if (column == metadata_snapshot->lightweight_delete_description.filter_column.name) + type = metadata_snapshot->lightweight_delete_description.filter_column.type; + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown column {}", column); + auto type_literal = std::make_shared(type->getName()); const auto & update_expr = kv.second; diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 845aae52582..03053eb9b18 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -785,7 +785,8 @@ bool AlterCommand::isRequireMutationStage(const StorageInMemoryMetadata & metada /// Drop alias is metadata alter, in other case mutation is required. if (type == DROP_COLUMN) - return metadata.columns.hasColumnOrNested(GetColumnsOptions::AllPhysical, column_name); + return metadata.columns.hasColumnOrNested(GetColumnsOptions::AllPhysical, column_name) || + column_name == metadata.lightweight_delete_description.filter_column.name; if (type != MODIFY_COLUMN || data_type == nullptr) return false; @@ -1149,7 +1150,9 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const } else if (command.type == AlterCommand::DROP_COLUMN) { - if (all_columns.has(command.column_name) || all_columns.hasNested(command.column_name)) + if (all_columns.has(command.column_name) || + all_columns.hasNested(command.column_name) || + (command.clear && column_name == metadata.lightweight_delete_description.filter_column.name)) { if (!command.clear) /// CLEAR column is Ok even if there are dependencies. { diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index cc93e17bda3..65c9523f861 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -813,25 +813,25 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() { auto columns = global_ctx->merging_column_names; - if (part->getColumns().contains("__row_exists")) - columns.emplace_back("__row_exists"); - + /// The part might have some rows masked by lightweight deletes + const auto lwd_filter_column = global_ctx->metadata_snapshot->lightweight_delete_description.filter_column.name; + const bool need_to_filter_deleted_rows = !lwd_filter_column.empty() && part->getColumns().contains(lwd_filter_column); + if (need_to_filter_deleted_rows) + columns.emplace_back(lwd_filter_column); auto input = std::make_unique( *global_ctx->data, global_ctx->storage_snapshot, part, columns, ctx->read_with_direct_io, true); Pipe pipe(std::move(input)); - -///////////// - if (part->getColumns().contains("__row_exists")) + /// Add filtering step that discards deleted rows + if (need_to_filter_deleted_rows) { - pipe.addSimpleTransform([](const Block & header) + pipe.addSimpleTransform([lwd_filter_column](const Block & header) { - return std::make_shared(header, nullptr, "__row_exists", "__row_exists"); + return std::make_shared(header, nullptr, lwd_filter_column, true); }); } -///////////// if (global_ctx->metadata_snapshot->hasSortingKey()) { diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 933f9144c6a..ce48a03ce8b 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -208,16 +208,58 @@ Chunk 
MergeTreeBaseSelectProcessor::generate() auto res = readFromPart(); - if (res.hasRows()) + if (res.row_count) { - injectVirtualColumns(res, task.get(), partition_value_type, virt_column_names); - return res; + injectVirtualColumns(res.block, res.row_count, task.get(), partition_value_type, virt_column_names); + + /// Reorder the columns according to output header + const auto & output_header = output.getHeader(); + Columns ordered_columns; + ordered_columns.reserve(output_header.columns()); + for (size_t i = 0; i < output_header.columns(); ++i) + { + auto name = output_header.getByPosition(i).name; + ordered_columns.push_back(res.block.getByName(name).column); + } + + return Chunk(ordered_columns, res.row_count); } } return {}; } +void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart( + MergeTreeData::DataPartPtr & data_part, + const MergeTreeReadTaskColumns & task_columns, const StorageMetadataPtr & metadata_snapshot, + const MarkRanges & mark_ranges, const IMergeTreeReader::ValueSizeMap & value_size_map, + const ReadBufferFromFileBase::ProfileCallback & profile_callback) +{ + reader = data_part->getReader(task_columns.columns, metadata_snapshot, mark_ranges, + owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, + value_size_map, profile_callback); + + pre_reader_for_step.clear(); + + /// Add lightweight delete filtering step + const auto & lightweigth_delete_info = metadata_snapshot->lightweight_delete_description; + if (!reader_settings.skip_deleted_mask && data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) + { + pre_reader_for_step.push_back(data_part->getReader({lightweigth_delete_info.filter_column}, metadata_snapshot, mark_ranges, + owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, + value_size_map, profile_callback)); + } + + if (prewhere_info) + { + for (const auto & pre_columns_per_step : task_columns.pre_columns) + { + pre_reader_for_step.push_back(data_part->getReader(pre_columns_per_step, metadata_snapshot, mark_ranges, + owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, + value_size_map, profile_callback)); + } + } +} void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task) { @@ -225,9 +267,10 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu bool last_reader = false; size_t pre_readers_shift = 0; - if (!reader_settings.skip_deleted_mask && current_task.data_part->getColumns().contains("__row_exists")) + /// Add filtering step with lightweight delete mask + const auto & lightweigth_delete_info = storage_snapshot->metadata->lightweight_delete_description; + if (!reader_settings.skip_deleted_mask && current_task.data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) { -// last_reader = !prewhere_actions || prewhere_actions->steps.empty(); current_task.pre_range_readers.push_back( MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lwd_filter_step, last_reader, non_const_virtual_column_names)); prev_reader = ¤t_task.pre_range_readers.back(); @@ -241,7 +284,6 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu "PREWHERE steps count mismatch, actions: {}, readers: {}", prewhere_actions->steps.size(), pre_reader_for_step.size()); - for (size_t i = 0; i < prewhere_actions->steps.size(); ++i) { last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size()); @@ -304,7 +346,7 @@ static UInt64 estimateNumRows(const 
MergeTreeReadTask & current_task, UInt64 cur } -Chunk MergeTreeBaseSelectProcessor::readFromPartImpl() +MergeTreeBaseSelectProcessor::BlockAndRowCount MergeTreeBaseSelectProcessor::readFromPartImpl() { if (task->size_predictor) task->size_predictor->startBlock(); @@ -347,24 +389,13 @@ Chunk MergeTreeBaseSelectProcessor::readFromPartImpl() if (read_result.num_rows == 0) return {}; - Columns ordered_columns; - ordered_columns.reserve(header_without_virtual_columns.columns()); + BlockAndRowCount res = { sample_block.cloneWithColumns(read_result.columns), read_result.num_rows }; - /// Reorder columns. TODO: maybe skip for default case. - for (size_t ps = 0; ps < header_without_virtual_columns.columns(); ++ps) - { - const auto & name = header_without_virtual_columns.getByPosition(ps).name; - if (name == "__row_exists" && !sample_block.has(name)) - continue; /// TODO: properly deal with cases when __row_exists is not read and is filled later - auto pos_in_sample_block = sample_block.getPositionByName(name); - ordered_columns.emplace_back(std::move(read_result.columns[pos_in_sample_block])); - } - - return Chunk(std::move(ordered_columns), read_result.num_rows); + return res; } -Chunk MergeTreeBaseSelectProcessor::readFromPart() +MergeTreeBaseSelectProcessor::BlockAndRowCount MergeTreeBaseSelectProcessor::readFromPart() { if (!task->range_reader.isInitialized()) initializeRangeReaders(*task); @@ -375,22 +406,46 @@ Chunk MergeTreeBaseSelectProcessor::readFromPart() namespace { - /// Simple interfaces to insert virtual columns. struct VirtualColumnsInserter { - virtual ~VirtualColumnsInserter() = default; + explicit VirtualColumnsInserter(Block & block_) : block(block_) {} - virtual void insertArrayOfStringsColumn(const ColumnPtr & column, const String & name) = 0; - virtual void insertStringColumn(const ColumnPtr & column, const String & name) = 0; - virtual void insertUInt8Column(const ColumnPtr & column, const String & name) = 0; - virtual void insertUInt64Column(const ColumnPtr & column, const String & name) = 0; - virtual void insertUUIDColumn(const ColumnPtr & column, const String & name) = 0; + bool columnExists(const String & name) const { return block.has(name); } - virtual void insertPartitionValueColumn( - size_t rows, - const Row & partition_value, - const DataTypePtr & partition_value_type, - const String & name) = 0; + void insertStringColumn(const ColumnPtr & column, const String & name) + { + block.insert({column, std::make_shared(), name}); + } + + void insertUInt8Column(const ColumnPtr & column, const String & name) + { + block.insert({column, std::make_shared(), name}); + } + + void insertUInt64Column(const ColumnPtr & column, const String & name) + { + block.insert({column, std::make_shared(), name}); + } + + void insertUUIDColumn(const ColumnPtr & column, const String & name) + { + block.insert({column, std::make_shared(), name}); + } + + void insertPartitionValueColumn( + size_t rows, const Row & partition_value, const DataTypePtr & partition_value_type, const String & name) + { + ColumnPtr column; + if (rows) + column = partition_value_type->createColumnConst(rows, Tuple(partition_value.begin(), partition_value.end())) + ->convertToFullColumnIfConst(); + else + column = partition_value_type->createColumn(); + + block.insert({column, partition_value_type, name}); + } + + Block & block; }; } @@ -400,16 +455,34 @@ static void injectNonConstVirtualColumns( VirtualColumnsInserter & inserter, const Names & virtual_columns) { - if (unlikely(rows)) - throw Exception("Cannot insert 
non-constant virtual column to non-empty chunk.", - ErrorCodes::LOGICAL_ERROR); - for (const auto & virtual_column_name : virtual_columns) { if (virtual_column_name == "_part_offset") - inserter.insertUInt64Column(DataTypeUInt64().createColumn(), virtual_column_name); + { + if (!rows) + { + inserter.insertUInt64Column(DataTypeUInt64().createColumn(), virtual_column_name); + } + else + { + if (!inserter.columnExists(virtual_column_name)) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Column {} must have been filled part reader", + virtual_column_name); + } + } + if (virtual_column_name == "__row_exists") - inserter.insertUInt8Column(DataTypeUInt8().createColumn(), virtual_column_name); + { + /// If __row_exists column isn't present in the part then fill it here with 1s + ColumnPtr column; + if (rows) + column = DataTypeUInt8().createColumnConst(rows, 1)->convertToFullColumnIfConst(); + else + column = DataTypeUInt8().createColumn(); + + inserter.insertUInt8Column(column, virtual_column_name); + } } } @@ -489,148 +562,15 @@ static void injectPartConstVirtualColumns( } } -namespace -{ - struct VirtualColumnsInserterIntoBlock : public VirtualColumnsInserter - { - explicit VirtualColumnsInserterIntoBlock(Block & block_) : block(block_) {} - - void insertArrayOfStringsColumn(const ColumnPtr & column, const String & name) final - { - block.insert({column, std::make_shared(std::make_shared()), name}); - } - - void insertStringColumn(const ColumnPtr & column, const String & name) final - { - block.insert({column, std::make_shared(), name}); - } - - void insertUInt8Column(const ColumnPtr & column, const String & name) final - { - block.insert({column, std::make_shared(), name}); - } - - void insertUInt64Column(const ColumnPtr & column, const String & name) final - { - block.insert({column, std::make_shared(), name}); - } - - void insertUUIDColumn(const ColumnPtr & column, const String & name) final - { - block.insert({column, std::make_shared(), name}); - } - - void insertPartitionValueColumn( - size_t rows, const Row & partition_value, const DataTypePtr & partition_value_type, const String & name) final - { - ColumnPtr column; - if (rows) - column = partition_value_type->createColumnConst(rows, Tuple(partition_value.begin(), partition_value.end())) - ->convertToFullColumnIfConst(); - else - column = partition_value_type->createColumn(); - - block.insert({column, partition_value_type, name}); - } - - Block & block; - }; - - struct VirtualColumnsInserterIntoColumns : public VirtualColumnsInserter - { - explicit VirtualColumnsInserterIntoColumns(Columns & columns_) : columns(columns_) {} - - void insertArrayOfStringsColumn(const ColumnPtr & column, const String &) final - { - columns.push_back(column); - } - - void insertStringColumn(const ColumnPtr & column, const String &) final - { - columns.push_back(column); - } - - void insertUInt8Column(const ColumnPtr & column, const String &) final - { - columns.push_back(column); - } - - void insertUInt64Column(const ColumnPtr & column, const String &) final - { - columns.push_back(column); - } - - void insertUUIDColumn(const ColumnPtr & column, const String &) final - { - columns.push_back(column); - } - - void insertPartitionValueColumn( - size_t rows, const Row & partition_value, const DataTypePtr & partition_value_type, const String &) final - { - ColumnPtr column; - if (rows) - column = partition_value_type->createColumnConst(rows, Tuple(partition_value.begin(), partition_value.end())) - ->convertToFullColumnIfConst(); - else - column = 
partition_value_type->createColumn(); - columns.push_back(column); - } - - Columns & columns; - }; -} - void MergeTreeBaseSelectProcessor::injectVirtualColumns( - Block & block, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns) + Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns) { - VirtualColumnsInserterIntoBlock inserter{block}; + VirtualColumnsInserter inserter{block}; /// First add non-const columns that are filled by the range reader and then const columns that we will fill ourselves. /// Note that the order is important: virtual columns filled by the range reader must go first - injectNonConstVirtualColumns(block.rows(), inserter, virtual_columns); - injectPartConstVirtualColumns(block.rows(), inserter, task, partition_value_type, virtual_columns); -} - -void MergeTreeBaseSelectProcessor::injectVirtualColumns( - Chunk & chunk, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns) -{ - UInt64 num_rows = chunk.getNumRows(); - auto columns = chunk.detachColumns(); - - VirtualColumnsInserterIntoColumns inserter{columns}; - -///////////////////////// -// TODO: implement properly - for (const auto & virtual_column_name : virtual_columns) - { - - if (virtual_column_name == "__row_exists") - { - if (task->data_part->getColumns().contains(virtual_column_name)) - { - /// If this column is present in the part it must be read from the data - assert(task->task_columns.columns.contains(virtual_column_name)); - } - else - { - /// If __row_exists column isn't present in the part then - ColumnPtr column; - if (num_rows) - column = DataTypeUInt8().createColumnConst(num_rows, 1)->convertToFullColumnIfConst(); - else - column = DataTypeUInt8().createColumn(); - - inserter.insertUInt8Column(column, virtual_column_name); - } - } - } -/////////////////////////// - - /// Only add const virtual columns because non-const ones have already been added - injectPartConstVirtualColumns(num_rows, inserter, task, partition_value_type, virtual_columns); - - chunk.setColumns(columns, num_rows); + injectNonConstVirtualColumns(row_count, inserter, virtual_columns); + injectPartConstVirtualColumns(row_count, inserter, task, partition_value_type, virtual_columns); } Block MergeTreeBaseSelectProcessor::transformHeader( @@ -676,7 +616,7 @@ Block MergeTreeBaseSelectProcessor::transformHeader( } } - injectVirtualColumns(block, nullptr, partition_value_type, virtual_columns); + injectVirtualColumns(block, 0, nullptr, partition_value_type, virtual_columns); return block; } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 15a088d115c..299feed5a49 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,12 @@ public: const Block & sample_block); protected: + /// This struct allow to return block with no columns but with non-zero number of rows similar to Chunk + struct BlockAndRowCount + { + Block block; + size_t row_count = 0; + }; Chunk generate() final; @@ -74,22 +81,29 @@ protected: /// Closes readers and unlock part locks virtual void finish() = 0; - virtual Chunk readFromPart(); + virtual BlockAndRowCount readFromPart(); - Chunk readFromPartImpl(); + BlockAndRowCount readFromPartImpl(); - /// Two versions for header and 
chunk. + /// Used for filling header with no rows as well as block with data static void - injectVirtualColumns(Block & block, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns); - static void - injectVirtualColumns(Chunk & chunk, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns); + injectVirtualColumns(Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns); + /// Sets up data readers for each step of prewhere and where + void initializeMergeTreeReadersForPart( + MergeTreeData::DataPartPtr & data_part, + const MergeTreeReadTaskColumns & task_columns, const StorageMetadataPtr & metadata_snapshot, + const MarkRanges & mark_ranges, const IMergeTreeReader::ValueSizeMap & value_size_map, + const ReadBufferFromFileBase::ProfileCallback & profile_callback); + + /// Sets up range readers corresponding to data readers void initializeRangeReaders(MergeTreeReadTask & task); const MergeTreeData & storage; StorageSnapshotPtr storage_snapshot; - PrewhereExprStep lwd_filter_step { nullptr, "__row_exists", true, true }; + /// This step is added when the part has lightweight delete mask + const PrewhereExprStep lwd_filter_step { nullptr, "__row_exists", true, true }; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index cf90da36ace..867d43e20b3 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -1002,6 +1002,7 @@ void MergeTreeRangeReader::fillPartOffsetColumn(ReadResult & result, UInt64 lead result.columns.emplace_back(std::move(column)); result.extra_columns_filled.push_back("_part_offset"); } + /// Fill deleted_row_mask column, referenced from fillPartOffsetColumn(). void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset) { diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 8f063786cbc..ba71f1898f6 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -161,9 +161,6 @@ public: /// The number of bytes read from disk. size_t numBytesRead() const { return num_bytes_read; } - /// Similar as filter that you need to apply to newly-read columns - ColumnPtr deleted_mask_filter_holder; - private: /// Only MergeTreeRangeReader is supposed to access ReadResult internals. friend class MergeTreeRangeReader; @@ -245,6 +242,9 @@ public: std::map filter_bytes_map; + /// Similar as filter that you need to apply to newly-read columns + ColumnPtr deleted_mask_filter_holder; + Names extra_columns_filled; }; diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp index 84548d357b7..c009e6f1165 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp @@ -44,9 +44,9 @@ catch (...) 
throw; } -Chunk MergeTreeReverseSelectProcessor::readFromPart() +MergeTreeBaseSelectProcessor::BlockAndRowCount MergeTreeReverseSelectProcessor::readFromPart() { - Chunk res; + BlockAndRowCount res; if (!chunks.empty()) { @@ -60,7 +60,7 @@ Chunk MergeTreeReverseSelectProcessor::readFromPart() while (!task->isFinished()) { - Chunk chunk = readFromPartImpl(); + auto chunk = readFromPartImpl(); chunks.push_back(std::move(chunk)); } diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h index 38dcc1a2352..06a218abafa 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h @@ -27,9 +27,9 @@ private: bool getNewTaskImpl() override; void finalizeNewTask() override {} - Chunk readFromPart() override; + BlockAndRowCount readFromPart() override; - Chunks chunks; + std::vector chunks; Poco::Logger * log = &Poco::Logger::get("MergeTreeReverseSelectProcessor"); }; diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 47bcf72d611..3e346df6662 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -52,7 +52,7 @@ void MergeTreeSelectProcessor::initializeReaders() { task_columns = getReadTaskColumns( storage, storage_snapshot, data_part, - required_columns, non_const_virtual_column_names, prewhere_info, /*with_subcolumns=*/ true); + required_columns, virt_column_names, prewhere_info, /*with_subcolumns=*/ true); /// Will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames(); @@ -63,25 +63,8 @@ void MergeTreeSelectProcessor::initializeReaders() owned_mark_cache = storage.getContext()->getMarkCache(); - reader = data_part->getReader(task_columns.columns, storage_snapshot->getMetadataForQuery(), - all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {}); - - pre_reader_for_step.clear(); - - if (!reader_settings.skip_deleted_mask && data_part->getColumns().contains("__row_exists")) - { - pre_reader_for_step.push_back(data_part->getReader({{"__row_exists", std::make_shared()}}, storage_snapshot->getMetadataForQuery(), - all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {})); - } - - if (prewhere_info) - { - for (const auto & pre_columns_for_step : task_columns.pre_columns) - { - pre_reader_for_step.push_back(data_part->getReader(pre_columns_for_step, storage_snapshot->getMetadataForQuery(), - all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, {}, {})); - } - } + initializeMergeTreeReadersForPart(data_part, task_columns, storage_snapshot->getMetadataForQuery(), + all_mark_ranges, {}, {}); } diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp index 4c0eac95593..04b7f6094e4 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp @@ -105,68 +105,24 @@ void MergeTreeThreadSelectProcessor::finalizeNewTask() auto profile_callback = [this](ReadBufferFromFileBase::ProfileInfo info_) { pool->profileFeedback(info_); }; const auto & metadata_snapshot = storage_snapshot->metadata; + IMergeTreeReader::ValueSizeMap value_size_map; + if (!reader) { if (use_uncompressed_cache) 
owned_uncompressed_cache = storage.getContext()->getUncompressedCache(); owned_mark_cache = storage.getContext()->getMarkCache(); - - reader = task->data_part->getReader(task->task_columns.columns, metadata_snapshot, task->mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, - IMergeTreeReader::ValueSizeMap{}, profile_callback); - - pre_reader_for_step.clear(); - - - if (!reader_settings.skip_deleted_mask && task->data_part->getColumns().contains("__row_exists")) - { - pre_reader_for_step.push_back(task->data_part->getReader({{"__row_exists", std::make_shared()}}, metadata_snapshot, task->mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, - IMergeTreeReader::ValueSizeMap{}, profile_callback)); - } - - - if (prewhere_info) - { - for (const auto & pre_columns_per_step : task->task_columns.pre_columns) - { - pre_reader_for_step.push_back(task->data_part->getReader(pre_columns_per_step, metadata_snapshot, task->mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, - IMergeTreeReader::ValueSizeMap{}, profile_callback)); - } - } } - else + else if (part_name != last_readed_part_name) { - /// in other case we can reuse readers, anyway they will be "seeked" to required mark - if (part_name != last_readed_part_name) - { - /// retain avg_value_size_hints - reader = task->data_part->getReader(task->task_columns.columns, metadata_snapshot, task->mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, - reader->getAvgValueSizeHints(), profile_callback); + value_size_map = reader->getAvgValueSizeHints(); + } - pre_reader_for_step.clear(); - - if (!reader_settings.skip_deleted_mask && task->data_part->getColumns().contains("__row_exists")) - { - pre_reader_for_step.push_back(task->data_part->getReader({{"__row_exists", std::make_shared()}}, metadata_snapshot, task->mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, - reader->getAvgValueSizeHints(), profile_callback)); - } - - - - if (prewhere_info) - { - for (const auto & pre_columns_per_step : task->task_columns.pre_columns) - { - pre_reader_for_step.push_back(task->data_part->getReader(pre_columns_per_step, metadata_snapshot, task->mark_ranges, - owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, - reader->getAvgValueSizeHints(), profile_callback)); - } - } - } + const bool init_new_readers = !reader || part_name != last_readed_part_name; + if (init_new_readers) + { + initializeMergeTreeReadersForPart(task->data_part, task->task_columns, metadata_snapshot, + task->mark_ranges, value_size_map, profile_callback); } last_readed_part_name = part_name; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index b47f0cab6ab..ae64b08e351 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -170,6 +170,15 @@ getColumnsForNewDataPart( NameToNameMap renamed_columns_to_from; NameToNameMap renamed_columns_from_to; ColumnsDescription part_columns(source_part->getColumns()); + const auto all_virtual_columns = source_part->storage.getVirtuals(); + + /// Preserve virtual columns that have persisted values in the source_part +/// TODO: only allow LWD mask to be overriden!!!!! 
+    for (const auto & virtual_column : all_virtual_columns)
+    {
+        if (part_columns.has(virtual_column.name) && !storage_columns.contains(virtual_column.name))
+            storage_columns.emplace_back(virtual_column);
+    }
 
     /// All commands are validated in AlterCommand so we don't care about order
     for (const auto & command : commands_for_removes)
@@ -178,8 +187,11 @@ getColumnsForNewDataPart(
         {
             for (const auto & [column_name, _] : command.column_to_update_expression)
             {
-                if (column_name == "__row_exists" && !storage_columns.contains(column_name))
-                    storage_columns.emplace_back("__row_exists", std::make_shared<DataTypeUInt8>());
+                /// Allow updating and persisting values of the virtual column
+                /// TODO: only allow the LWD mask to be overridden
+                auto virtual_column = all_virtual_columns.tryGetByName(column_name);
+                if (virtual_column && !storage_columns.contains(column_name))
+                    storage_columns.emplace_back(column_name, virtual_column->type);
             }
         }
 
diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp
index e52a0fed674..9390514d299 100644
--- a/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include "DataTypes/DataTypesNumber.h"
 #include
 #include
 
@@ -677,6 +678,8 @@ static StoragePtr create(const StorageFactory::Arguments & args)
         if (arg_num != arg_cnt)
             throw Exception("Wrong number of engine arguments.", ErrorCodes::BAD_ARGUMENTS);
 
+    metadata.lightweight_delete_description.filter_column = { "__row_exists", std::make_shared<DataTypeUInt8>() };
+
     if (replicated)
     {
         auto storage_policy = args.getContext()->getStoragePolicy(storage_settings->storage_policy);
diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp
index 66dcc938aef..de12467bdec 100644
--- a/src/Storages/StorageInMemoryMetadata.cpp
+++ b/src/Storages/StorageInMemoryMetadata.cpp
@@ -38,6 +38,7 @@ StorageInMemoryMetadata::StorageInMemoryMetadata(const StorageInMemoryMetadata &
     , sampling_key(other.sampling_key)
     , column_ttls_by_name(other.column_ttls_by_name)
     , table_ttl(other.table_ttl)
+    , lightweight_delete_description(other.lightweight_delete_description)
     , settings_changes(other.settings_changes ? other.settings_changes->clone() : nullptr)
     , select(other.select)
     , comment(other.comment)
@@ -63,6 +64,7 @@ StorageInMemoryMetadata & StorageInMemoryMetadata::operator=(const StorageInMemo
     sampling_key = other.sampling_key;
     column_ttls_by_name = other.column_ttls_by_name;
     table_ttl = other.table_ttl;
+    lightweight_delete_description = other.lightweight_delete_description;
     if (other.settings_changes)
         settings_changes = other.settings_changes->clone();
     else
diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h
index a9ab96909f4..84a3bcb3046 100644
--- a/src/Storages/StorageInMemoryMetadata.h
+++ b/src/Storages/StorageInMemoryMetadata.h
@@ -43,6 +43,8 @@ struct StorageInMemoryMetadata
     TTLColumnsDescription column_ttls_by_name;
     /// TTL expressions for table (Move and Rows)
     TTLTableDescription table_ttl;
+    /// Lightweight delete filter column if the storage supports it.
+    LightweightDeleteDescription lightweight_delete_description;
     /// SETTINGS expression. Supported for MergeTree, Buffer, Kafka, RabbitMQ.
     ASTPtr settings_changes;
     /// SELECT QUERY. Supported for MaterializedView and View (have to support LiveView).
diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index 8f60eb604b5..5170b7d326c 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -6,6 +6,8 @@ #include #include #include +#include "Core/NamesAndTypes.h" +#include "DataTypes/Serializations/ISerialization.h" namespace DB { @@ -127,4 +129,9 @@ struct TTLTableDescription static TTLTableDescription parse(const String & str, const ColumnsDescription & columns, ContextPtr context, const KeyDescription & primary_key); }; +struct LightweightDeleteDescription +{ + NameAndTypePair filter_column; +}; + } diff --git a/tests/queries/0_stateless/02352_lightweight_delete.reference b/tests/queries/0_stateless/02352_lightweight_delete.reference index 8ae6b1d3195..02a34d7b82d 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.reference +++ b/tests/queries/0_stateless/02352_lightweight_delete.reference @@ -13,3 +13,11 @@ Delete 3M more rows using light weight DELETE Rows in parts 7000000 Count 4000000 First row 6000000 10 +Do UPDATE mutation +Rows in parts 7000000 +Count 4000000 +First row 6000000 1 +Force merge to cleanup deleted rows +Rows in parts 4000000 +Count 4000000 +First row 6000000 1 diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index c5f636181cd..b9daf5e124d 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -5,21 +5,20 @@ CREATE TABLE lwd_test (id UInt64 , value String) ENGINE MergeTree() ORDER BY id; INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 10000000; SET mutations_sync = 1; +SET allow_experimental_lightweight_delete_with_row_exists = 1; +SET allow_experimental_lightweight_delete = 0; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; - SELECT 'Count', count() FROM lwd_test WHERE id >= 0; - SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; SELECT 'Delete 3M rows using UPDATE __row_exists'; -ALTER TABLE lwd_test UPDATE __row_exists = 0 WHERE id < 3000000; +--ALTER TABLE lwd_test UPDATE __row_exists = 0 WHERE id < 3000000; +DELETE FROM lwd_test WHERE id < 3000000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; - SELECT 'Count', count() FROM lwd_test WHERE id >= 0; - SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; @@ -27,21 +26,33 @@ SELECT 'Force merge to cleanup deleted rows'; OPTIMIZE TABLE lwd_test FINAL; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; - SELECT 'Count', count() FROM lwd_test WHERE id >= 0; - SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SET allow_experimental_lwd2 = 1; SELECT 'Delete 3M more rows using light weight DELETE'; DELETE FROM lwd_test WHERE id < 6000000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; - SELECT 'Count', count() FROM lwd_test WHERE id >= 0; - SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; +SELECT 'Do UPDATE mutation'; +ALTER TABLE lwd_test UPDATE value = 'v' WHERE id % 2 == 0; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; +SELECT 'Count', count() FROM lwd_test WHERE id >= 0; 
+SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + +SELECT 'Force merge to cleanup deleted rows'; +OPTIMIZE TABLE lwd_test FINAL; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; +SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + + DROP TABLE lwd_test; From ae0d00083c4870e5e735cc22775d7a552a92a529 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 18 Jul 2022 09:36:28 +0200 Subject: [PATCH 096/227] Renamed __row_exists to _row_exists --- src/Interpreters/InterpreterDeleteQuery.cpp | 2 +- src/Interpreters/MutationsInterpreter.cpp | 2 +- src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp | 6 +++--- src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 4 ++-- src/Storages/MergeTree/registerStorageMergeTree.cpp | 2 +- src/Storages/StorageDistributed.cpp | 2 +- .../queries/0_stateless/02352_lightweight_delete.reference | 4 ++-- tests/queries/0_stateless/02352_lightweight_delete.sql | 6 +++--- 11 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 0b7fdbd264c..aeeb72ad06c 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -75,7 +75,7 @@ BlockIO InterpreterDeleteQuery::execute() if (getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) { - /// Build "UPDATE __row_exists = 0 WHERE predicate" query + /// Build "UPDATE _row_exists = 0 WHERE predicate" query mut_command.type = MutationCommand::Type::UPDATE; mut_command.predicate = delete_query.predicate; diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index ffe80a9502a..f896c2269e8 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -353,7 +353,7 @@ static void validateUpdateColumns( } } - /// Allow to override values of virtual columns + /// Allow to override value of lightweight delete filter virtual column if (!found && column_name == metadata_snapshot->lightweight_delete_description.filter_column.name) found = true; diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index ce48a03ce8b..a5649c0b41b 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -61,7 +61,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( { non_const_virtual_column_names.emplace_back(*it); } - else if (*it == "__row_exists") + else if (*it == "_row_exists") { non_const_virtual_column_names.emplace_back(*it); } @@ -472,9 +472,9 @@ static void injectNonConstVirtualColumns( } } - if (virtual_column_name == "__row_exists") + if (virtual_column_name == "_row_exists") { - /// If __row_exists column isn't present in the part then fill it here with 1s + /// If _row_exists column isn't present in the part then fill it here with 1s ColumnPtr column; if (rows) column = DataTypeUInt8().createColumnConst(rows, 1)->convertToFullColumnIfConst(); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h 
b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 299feed5a49..51805fa83a2 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -103,7 +103,7 @@ protected: StorageSnapshotPtr storage_snapshot; /// This step is added when the part has lightweight delete mask - const PrewhereExprStep lwd_filter_step { nullptr, "__row_exists", true, true }; + const PrewhereExprStep lwd_filter_step { nullptr, "_row_exists", true, true }; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 4df47eb7765..c9106b2f7b6 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6562,7 +6562,7 @@ NamesAndTypesList MergeTreeData::getVirtuals() const NameAndTypePair("_partition_value", getPartitionValueType()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_part_offset", std::make_shared()), - NameAndTypePair("__row_exists", std::make_shared()), + NameAndTypePair("_row_exists", std::make_shared()), }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 36d45430cff..c78c187db8f 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1214,7 +1214,7 @@ static void selectColumnNames( { virt_column_names.push_back(name); } - else if (name == "__row_exists") + else if (name == "_row_exists") { virt_column_names.push_back(name); } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 867d43e20b3..cbc409af4e8 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -677,7 +677,7 @@ MergeTreeRangeReader::MergeTreeRangeReader( if (column_name == "_part_offset") sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); -// if (column_name == "__row_exists") +// if (column_name == "_row_exists") // sample_block.insert(ColumnWithTypeAndName(ColumnUInt8::create(), std::make_shared(), column_name)); } @@ -1240,7 +1240,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r block.insert({result.columns[pos], std::make_shared(), column_name}); } - else if (column_name == "__row_exists") + else if (column_name == "_row_exists") { /// do nothing, it will be added later /// TODO: properly implement reading non-const virtual columns or filling them with default values diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 9390514d299..beeb980a786 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -678,7 +678,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) if (arg_num != arg_cnt) throw Exception("Wrong number of engine arguments.", ErrorCodes::BAD_ARGUMENTS); - metadata.lightweight_delete_description.filter_column = { "__row_exists", std::make_shared() }; + metadata.lightweight_delete_description.filter_column = { "_row_exists", std::make_shared() }; if (replicated) { diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 32e0fcffca6..b3ea2cb9f5b 100644 --- a/src/Storages/StorageDistributed.cpp +++ 
b/src/Storages/StorageDistributed.cpp @@ -312,7 +312,7 @@ NamesAndTypesList StorageDistributed::getVirtuals() const NameAndTypePair("_partition_id", std::make_shared()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_part_offset", std::make_shared()), - NameAndTypePair("__row_exists", std::make_shared()), + NameAndTypePair("_row_exists", std::make_shared()), NameAndTypePair("_shard_num", std::make_shared()), /// deprecated }; } diff --git a/tests/queries/0_stateless/02352_lightweight_delete.reference b/tests/queries/0_stateless/02352_lightweight_delete.reference index 02a34d7b82d..8d7be361ba1 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.reference +++ b/tests/queries/0_stateless/02352_lightweight_delete.reference @@ -1,7 +1,7 @@ Rows in parts 10000000 Count 10000000 First row 0 10 -Delete 3M rows using UPDATE __row_exists +Delete 3M rows using lightweight DELETE Rows in parts 10000000 Count 7000000 First row 3000000 10 @@ -9,7 +9,7 @@ Force merge to cleanup deleted rows Rows in parts 7000000 Count 7000000 First row 3000000 10 -Delete 3M more rows using light weight DELETE +Delete 3M more rows using lightweight DELETE Rows in parts 7000000 Count 4000000 First row 6000000 10 diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index b9daf5e124d..0eb30f260dd 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -13,8 +13,8 @@ SELECT 'Count', count() FROM lwd_test WHERE id >= 0; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 3M rows using UPDATE __row_exists'; ---ALTER TABLE lwd_test UPDATE __row_exists = 0 WHERE id < 3000000; +SELECT 'Delete 3M rows using lightweight DELETE'; +--ALTER TABLE lwd_test UPDATE _row_exists = 0 WHERE id < 3000000; DELETE FROM lwd_test WHERE id < 3000000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; @@ -30,7 +30,7 @@ SELECT 'Count', count() FROM lwd_test WHERE id >= 0; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 3M more rows using light weight DELETE'; +SELECT 'Delete 3M more rows using lightweight DELETE'; DELETE FROM lwd_test WHERE id < 6000000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; From ab29e18b26fcf44874fc5f014fcd89970b8193f8 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 18 Jul 2022 12:11:31 +0200 Subject: [PATCH 097/227] Speedup 02352_lightweight_delete test --- .../02352_lightweight_delete.reference | 38 +++++++++---------- .../0_stateless/02352_lightweight_delete.sql | 10 ++--- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/queries/0_stateless/02352_lightweight_delete.reference b/tests/queries/0_stateless/02352_lightweight_delete.reference index 8d7be361ba1..2c62a8cf5ea 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.reference +++ b/tests/queries/0_stateless/02352_lightweight_delete.reference @@ -1,23 +1,23 @@ -Rows in parts 10000000 -Count 10000000 +Rows in parts 1000000 +Count 1000000 First row 0 10 -Delete 3M rows using lightweight DELETE -Rows in parts 10000000 -Count 7000000 -First row 3000000 10 +Delete 300K rows using lightweight DELETE +Rows in parts 1000000 +Count 700000 +First row 300000 10 Force merge to cleanup 
deleted rows -Rows in parts 7000000 -Count 7000000 -First row 3000000 10 -Delete 3M more rows using lightweight DELETE -Rows in parts 7000000 -Count 4000000 -First row 6000000 10 +Rows in parts 700000 +Count 700000 +First row 300000 10 +Delete 300K more rows using lightweight DELETE +Rows in parts 700000 +Count 400000 +First row 600000 10 Do UPDATE mutation -Rows in parts 7000000 -Count 4000000 -First row 6000000 1 +Rows in parts 700000 +Count 400000 +First row 600000 1 Force merge to cleanup deleted rows -Rows in parts 4000000 -Count 4000000 -First row 6000000 1 +Rows in parts 400000 +Count 400000 +First row 600000 1 diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index 0eb30f260dd..ff0d21136d3 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -2,7 +2,7 @@ DROP TABLE IF EXISTS lwd_test; CREATE TABLE lwd_test (id UInt64 , value String) ENGINE MergeTree() ORDER BY id; -INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 10000000; +INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 1000000; SET mutations_sync = 1; SET allow_experimental_lightweight_delete_with_row_exists = 1; @@ -13,9 +13,9 @@ SELECT 'Count', count() FROM lwd_test WHERE id >= 0; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 3M rows using lightweight DELETE'; +SELECT 'Delete 300K rows using lightweight DELETE'; --ALTER TABLE lwd_test UPDATE _row_exists = 0 WHERE id < 3000000; -DELETE FROM lwd_test WHERE id < 3000000; +DELETE FROM lwd_test WHERE id < 300000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; SELECT 'Count', count() FROM lwd_test WHERE id >= 0; @@ -30,8 +30,8 @@ SELECT 'Count', count() FROM lwd_test WHERE id >= 0; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 3M more rows using lightweight DELETE'; -DELETE FROM lwd_test WHERE id < 6000000; +SELECT 'Delete 300K more rows using lightweight DELETE'; +DELETE FROM lwd_test WHERE id < 600000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; SELECT 'Count', count() FROM lwd_test WHERE id >= 0; From 614cb9a87f3f99e0a970da65cc10ad67fe5f387e Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 18 Jul 2022 13:50:23 +0200 Subject: [PATCH 098/227] Disable trivial count() optimization if _row_exists column is present --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++-- .../queries/0_stateless/02352_lightweight_delete.sql | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c9106b2f7b6..0223561cdb6 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1077,7 +1077,7 @@ void MergeTreeData::loadDataPartsFromDisk( has_adaptive_parts.store(true, std::memory_order_relaxed); /// Check if there is lightweight delete in part - if (part->hasLightweightDelete()) + if (part->hasLightweightDelete() || part->getColumns().contains("_row_exists")) // TODO: implement properly has_lightweight_in_parts.store(true, std::memory_order_relaxed); part->modification_time = part_disk_ptr->getLastModified(fs::path(relative_data_path) / 
part_name).epochTime(); @@ -2872,7 +2872,7 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.", ErrorCodes::LOGICAL_ERROR); - if (part->hasLightweightDelete()) + if (part->hasLightweightDelete() || part->getColumns().contains("_row_exists")) // TODO: implement properly has_lightweight_delete_parts.store(true); checkPartCanBeAddedToTable(part, lock); diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index ff0d21136d3..cc66898d749 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -9,7 +9,7 @@ SET allow_experimental_lightweight_delete_with_row_exists = 1; SET allow_experimental_lightweight_delete = 0; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; -SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; @@ -18,7 +18,7 @@ SELECT 'Delete 300K rows using lightweight DELETE'; DELETE FROM lwd_test WHERE id < 300000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; -SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; @@ -26,7 +26,7 @@ SELECT 'Force merge to cleanup deleted rows'; OPTIMIZE TABLE lwd_test FINAL; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; -SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; @@ -34,7 +34,7 @@ SELECT 'Delete 300K more rows using lightweight DELETE'; DELETE FROM lwd_test WHERE id < 600000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; -SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; @@ -42,7 +42,7 @@ SELECT 'Do UPDATE mutation'; ALTER TABLE lwd_test UPDATE value = 'v' WHERE id % 2 == 0; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; -SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; @@ -50,7 +50,7 @@ SELECT 'Force merge to cleanup deleted rows'; OPTIMIZE TABLE lwd_test FINAL; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; -SELECT 'Count', count() FROM lwd_test WHERE id >= 0; +SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; From f31788ed2a769c994e6675d20c35cf5774e8c4a3 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:04:07 +0200 Subject: [PATCH 099/227] Perf test for read after deleting many rows --- tests/performance/lightweight_delete.xml | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/performance/lightweight_delete.xml diff --git 
a/tests/performance/lightweight_delete.xml b/tests/performance/lightweight_delete.xml new file mode 100644 index 00000000000..af7103f02d0 --- /dev/null +++ b/tests/performance/lightweight_delete.xml @@ -0,0 +1,36 @@ + + + + CREATE TABLE lwd_test ( + id UInt64, + value String, + ) ENGINE=MergeTree() ORDER BY id; + + + + + INSERT INTO lwd_test SELECT number, randomString(100) FROM system.numbers LIMIT 10000000; + + + + OPTIMIZE TABLE lwd_test FINAL; + + + + 1 + 1 + 0 + 1 + + + + + DELETE FROM lwd_test WHERE id < 9999999; + + + + SELECT id, length(value) FROM lwd_test ORDER BY id LIMIT 1 + + DROP TABLE lwd_test + + From 8b523bec167289e182f6cbb58a5ff94335241d22 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 18 Jul 2022 22:12:26 +0200 Subject: [PATCH 100/227] Addressed review comments - Updated destructor of ForkWriteBuffer to clear buffer --- src/IO/ForkWriteBuffer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp index 876a924a72b..e50c8e2409c 100644 --- a/src/IO/ForkWriteBuffer.cpp +++ b/src/IO/ForkWriteBuffer.cpp @@ -46,6 +46,7 @@ void ForkWriteBuffer::nextImpl() ForkWriteBuffer::~ForkWriteBuffer() { finalize(); + set(nullptr, 0); } From 5d3028741cc34b631801e73c3455bfb728818a7c Mon Sep 17 00:00:00 2001 From: Vladimir Chebotaryov <108669454+quickhouse@users.noreply.github.com> Date: Mon, 11 Jul 2022 07:36:37 +0300 Subject: [PATCH 101/227] Fixed regexp in `test_quota`. --- tests/integration/test_quota/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_quota/test.py b/tests/integration/test_quota/test.py index fd5a6599a59..4f98b9a0d0d 100644 --- a/tests/integration/test_quota/test.py +++ b/tests/integration/test_quota/test.py @@ -486,7 +486,7 @@ def test_exceed_quota(): ) assert re.search( - "Quota.*has\ been\ exceeded", + "Quota.*has been exceeded", instance.query_and_get_error("SELECT * from test_table"), ) system_quota_usage( From e6ded88ea3471e55504c308ad44b52f455e70a32 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Jul 2022 08:51:12 +0000 Subject: [PATCH 102/227] Small refactoring --- src/Common/ZooKeeper/IKeeper.h | 13 ++----- src/Common/ZooKeeper/TestKeeper.h | 2 +- src/Common/ZooKeeper/ZooKeeper.cpp | 2 +- src/Common/ZooKeeper/ZooKeeper.h | 2 +- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 6 ++-- src/Common/ZooKeeper/ZooKeeperImpl.h | 5 +-- src/Coordination/KeeperConstants.h | 17 +++++++++ src/Coordination/KeeperStorage.cpp | 36 +++++++++++++++---- src/Coordination/tests/gtest_coordination.cpp | 4 +-- 9 files changed, 59 insertions(+), 28 deletions(-) create mode 100644 src/Coordination/KeeperConstants.h diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 9592256b7e0..a6ed21bc1d3 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -109,16 +110,6 @@ bool isUserError(Error code); const char * errorMessage(Error code); - -enum KeeperApiVersion : uint8_t -{ - V0 = 0, // ZooKeeper compatible version - V1 // added FilteredList request -}; - -inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1; -inline constexpr auto * keeper_api_version_path = "/keeper-api-version"; - struct Request; using RequestPtr = std::shared_ptr; using Requests = std::vector; @@ -525,7 +516,7 @@ public: const Requests & requests, MultiCallback callback) = 0; - virtual Coordination::KeeperApiVersion getApiVersion() = 0; + virtual DB::KeeperApiVersion 
getApiVersion() = 0; /// Expire session and finish all pending requests virtual void finalize(const String & reason) = 0; diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index cf2126fe18e..2492d2d6ff9 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -90,7 +90,7 @@ public: void finalize(const String & reason) override; - Coordination::KeeperApiVersion getApiVersion() override + DB::KeeperApiVersion getApiVersion() override { return KeeperApiVersion::V0; } diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index dc4e309cdfa..a7e93145218 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -905,7 +905,7 @@ bool ZooKeeper::expired() return impl->isExpired(); } -Coordination::KeeperApiVersion ZooKeeper::getApiVersion() +DB::KeeperApiVersion ZooKeeper::getApiVersion() { return impl->getApiVersion(); } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index c246f8d94ed..c9b5dc69499 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -127,7 +127,7 @@ public: /// Returns true, if the session has expired. bool expired(); - Coordination::KeeperApiVersion getApiVersion(); + DB::KeeperApiVersion getApiVersion(); /// Create a znode. /// Throw an exception if something went wrong. diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index a0544935e25..3d9405dbd8a 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1061,7 +1061,7 @@ void ZooKeeper::pushRequest(RequestInfo && info) ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions); } -Coordination::KeeperApiVersion ZooKeeper::getApiVersion() +KeeperApiVersion ZooKeeper::getApiVersion() { return keeper_api_version; } @@ -1076,7 +1076,7 @@ void ZooKeeper::initApiVersion() promise->set_value(response); }; - get(Coordination::keeper_api_version_path, std::move(callback), {}); + get(keeper_api_version_path, std::move(callback), {}); if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) return; @@ -1088,7 +1088,7 @@ void ZooKeeper::initApiVersion() uint8_t keeper_version{0}; DB::ReadBufferFromOwnString buf(response.data); DB::readIntText(keeper_version, buf); - keeper_api_version = static_cast(keeper_version); + keeper_api_version = static_cast(keeper_version); } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index 023e46f5017..c7e44f2fc9b 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -181,7 +182,7 @@ public: const Requests & requests, MultiCallback callback) override; - Coordination::KeeperApiVersion getApiVersion() override; + DB::KeeperApiVersion getApiVersion() override; /// Without forcefully invalidating (finalizing) ZooKeeper session before /// establishing a new one, there was a possibility that server is using @@ -282,7 +283,7 @@ private: CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; - Coordination::KeeperApiVersion keeper_api_version{Coordination::KeeperApiVersion::V0}; + DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::V0}; }; } diff --git a/src/Coordination/KeeperConstants.h b/src/Coordination/KeeperConstants.h new file mode 100644 
index 00000000000..4582248b7cf
--- /dev/null
+++ b/src/Coordination/KeeperConstants.h
@@ -0,0 +1,17 @@
+#pragma once
+
+namespace DB
+{
+
+const std::string keeper_system_path = "/keeper";
+const std::string keeper_api_version_path = keeper_system_path + "/api_version";
+
+enum class KeeperApiVersion : uint8_t
+{
+    V0 = 0, // ZooKeeper compatible version
+    V1 // added FilteredList request
+};
+
+inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1;
+
+}
diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp
index d07caeaf496..29f5dcd424a 100644
--- a/src/Coordination/KeeperStorage.cpp
+++ b/src/Coordination/KeeperStorage.cpp
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -229,9 +230,30 @@ void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other)
 KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const bool digest_enabled_)
     : session_expiry_queue(tick_time_ms), digest_enabled(digest_enabled_), superdigest(superdigest_)
 {
+
     Node root_node;
     container.insert("/", root_node);
-    nodes_digest += root_node.getDigest("/");
+    addDigest(root_node, "/");
+
+    const auto insert_node = [&](const auto & path, auto data)
+    {
+        // numChildren is updated during preprocessing, while createNode is called on commit,
+        // so we need to update it manually here
+        container.updateValue(
+            parentPath(path),
+            [](KeeperStorage::Node & parent)
+            {
+                ++parent.stat.numChildren;
+            }
+        );
+        createNode(path, std::move(data), {}, false, {});
+    };
+
+    insert_node(keeper_system_path, "");
+
+    assert(Coordination::keeper_api_version_path.starts_with(keeper_system_path));
+    auto api_version_data = toString(static_cast<uint8_t>(DB::current_keeper_api_version));
+    insert_node(keeper_api_version_path, std::move(api_version_data));
 }
 
 template
@@ -924,9 +946,9 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
 
     std::vector new_deltas;
 
-    if (request.path == Coordination::keeper_api_version_path)
+    if (request.path.starts_with(keeper_system_path))
     {
-        LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to delete an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path);
+        LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to delete an internal Keeper path ({}) which is not allowed", request.path);
         return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
     }
 
@@ -1076,9 +1098,9 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce
 
     std::vector new_deltas;
 
-    if (request.path == Coordination::keeper_api_version_path)
+    if (request.path.starts_with(keeper_system_path))
    {
-        LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", Coordination::keeper_api_version_path);
+        LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", request.path);
         return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}};
     }
 
@@ -1343,9 +1365,9 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr
 {
     Coordination::ZooKeeperSetACLRequest & request = dynamic_cast<Coordination::ZooKeeperSetACLRequest &>(*zk_request);
 
-    if (request.path == Coordination::keeper_api_version_path)
+    if (request.path.starts_with(keeper_system_path))
     {
-
LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", request.path); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 63edcf15508..708d99cb011 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2125,13 +2125,13 @@ TEST_P(CoordinationTest, TestCurrentApiVersion) using namespace Coordination; KeeperStorage storage{500, "", true}; auto request = std::make_shared(); - request->path = Coordination::keeper_api_version_path; + request->path = DB::keeper_api_version_path; auto responses = storage.processRequest(request, 0, std::nullopt, true, true); const auto & get_response = getSingleResponse(responses); uint8_t keeper_version{0}; DB::ReadBufferFromOwnString buf(get_response.data); DB::readIntText(keeper_version, buf); - EXPECT_EQ(keeper_version, current_keeper_api_version); + EXPECT_EQ(keeper_version, static_cast(current_keeper_api_version)); } INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, From ce570b6ee3403d815c34d78aa9a2572be882837b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Jul 2022 09:02:57 +0000 Subject: [PATCH 103/227] Add logs and 4LW for api version --- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 7 +++++++ src/Coordination/CoordinationSettings.cpp | 2 +- src/Coordination/FourLetterCommand.cpp | 9 +++++++++ src/Coordination/FourLetterCommand.h | 12 ++++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 3d9405dbd8a..27f2d1c8f52 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1078,17 +1078,24 @@ void ZooKeeper::initApiVersion() get(keeper_api_version_path, std::move(callback), {}); if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) + { + LOG_TRACE(&Poco::Logger::get("ZooKeeper"), "Failed to get API version: timeout"); return; + } auto response = future.get(); if (response.error != Coordination::Error::ZOK) + { + LOG_TRACE(&Poco::Logger::get("ZooKeeper"), "Failed to get API version"); return; + } uint8_t keeper_version{0}; DB::ReadBufferFromOwnString buf(response.data); DB::readIntText(keeper_version, buf); keeper_api_version = static_cast(keeper_version); + LOG_TRACE(&Poco::Logger::get("ZooKeeper"), "Detected server's API version: {}", keeper_api_version); } diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp index 34d69967828..046659af01e 100644 --- a/src/Coordination/CoordinationSettings.cpp +++ b/src/Coordination/CoordinationSettings.cpp @@ -37,7 +37,7 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco } -const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr"; +const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD = "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv"; KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() : server_id(NOT_EXIST) diff --git a/src/Coordination/FourLetterCommand.cpp b/src/Coordination/FourLetterCommand.cpp index cec107806b7..c33630a913b 100644 --- a/src/Coordination/FourLetterCommand.cpp +++ b/src/Coordination/FourLetterCommand.cpp @@ -2,6 +2,7 
@@ #include #include +#include #include #include #include @@ -132,6 +133,9 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat FourLetterCommandPtr recovery_command = std::make_shared(keeper_dispatcher); factory.registerCommand(recovery_command); + FourLetterCommandPtr api_version_command = std::make_shared(keeper_dispatcher); + factory.registerCommand(api_version_command); + factory.initializeAllowList(keeper_dispatcher); factory.setInitialize(true); } @@ -463,4 +467,9 @@ String RecoveryCommand::run() return "ok"; } +String ApiVersionCommand::run() +{ + return toString(static_cast(Coordination::current_keeper_api_version)); +} + } diff --git a/src/Coordination/FourLetterCommand.h b/src/Coordination/FourLetterCommand.h index b5d08f4c250..8a98b94b33a 100644 --- a/src/Coordination/FourLetterCommand.h +++ b/src/Coordination/FourLetterCommand.h @@ -315,4 +315,16 @@ struct RecoveryCommand : public IFourLetterCommand String run() override; ~RecoveryCommand() override = default; }; + +struct ApiVersionCommand : public IFourLetterCommand +{ + explicit ApiVersionCommand(KeeperDispatcher & keeper_dispatcher_) + : IFourLetterCommand(keeper_dispatcher_) + { + } + + String name() override { return "apiv"; } + String run() override; + ~ApiVersionCommand() override = default; +}; } From de2a0ca05e52126e7f9ccdec45e00a3b21d91895 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Jul 2022 09:03:58 +0000 Subject: [PATCH 104/227] black --- tests/integration/test_keeper_force_recovery/test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_keeper_force_recovery/test.py b/tests/integration/test_keeper_force_recovery/test.py index 8eb759fae47..3109562f1c3 100644 --- a/tests/integration/test_keeper_force_recovery/test.py +++ b/tests/integration/test_keeper_force_recovery/test.py @@ -52,8 +52,9 @@ def started_cluster(): def get_fake_zk(nodename, timeout=30.0): _fake_zk_instance = KazooClient( - hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout, - command_retry=KazooRetry(max_tries=10) + hosts=cluster.get_instance_ip(nodename) + ":9181", + timeout=timeout, + command_retry=KazooRetry(max_tries=10), ) _fake_zk_instance.start() From 329acfd6a80194eb1c75e3dec359c2c38a810515 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Jul 2022 12:08:57 +0000 Subject: [PATCH 105/227] Fix tests --- .../0_stateless/02149_external_schema_inference.reference | 1 - .../format_schemas/00825_protobuf_format_persons.proto | 6 ------ 2 files changed, 7 deletions(-) diff --git a/tests/queries/0_stateless/02149_external_schema_inference.reference b/tests/queries/0_stateless/02149_external_schema_inference.reference index 875659c7fb6..ebc30e874da 100644 --- a/tests/queries/0_stateless/02149_external_schema_inference.reference +++ b/tests/queries/0_stateless/02149_external_schema_inference.reference @@ -46,7 +46,6 @@ age String isOnline Enum8(\'offline\' = 0, \'online\' = 1) someRatio Float64 visitTime UInt64 -newMessage Tuple(empty Array(Tuple()), z Float32) randomBigNumber Int64 newFieldInt Array(Int32) color Array(Float32) diff --git a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_persons.proto b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_persons.proto index b588619f488..ecebb1ba452 100644 --- a/tests/queries/0_stateless/format_schemas/00825_protobuf_format_persons.proto +++ b/tests/queries/0_stateless/format_schemas/00825_protobuf_format_persons.proto @@ -75,11 +75,6 @@ message AltPerson { male = 0; 
female = 1; }; - message Dummy { - message Empty {}; - repeated Empty empty = 1; - float z = 2; - }; repeated int32 location = 101 [packed=false]; float pi = 103; bytes uuid = 300; @@ -92,7 +87,6 @@ message AltPerson { OnlineStatus isOnline = 1; double someRatio = 100; fixed64 visitTime = 15; - Dummy newMessage = 1000; sfixed64 randomBigNumber = 140; repeated int32 newFieldInt = 104; repeated float color = 14; From a3e00faaf9ca302decdd1f2319fd4c95244a5c64 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 19 Jul 2022 13:14:36 +0000 Subject: [PATCH 106/227] Fix unit tests --- src/Coordination/KeeperStorage.cpp | 10 ++--- src/Coordination/ZooKeeperDataReader.cpp | 3 +- src/Coordination/tests/gtest_coordination.cpp | 41 ++++++++++--------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 29f5dcd424a..edd4f624ba3 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -235,7 +235,7 @@ KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, container.insert("/", root_node); addDigest(root_node, "/"); - const auto insert_node = [&](const auto & path, auto data) + const auto create_system_node = [&](const auto & path, auto data) { // we update numChildren during preprocessing so and createNode is called during // commit so we need to update it manually here @@ -249,11 +249,11 @@ KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, createNode(path, std::move(data), {}, false, {}); }; - insert_node(keeper_system_path, ""); + create_system_node(keeper_system_path, ""); - assert(Coordination::keeper_api_version_path.starts_with(keeper_system_path)); - auto api_version_data = toString(static_cast(DB::current_keeper_api_version)); - insert_node(keeper_api_version_path, std::move(api_version_data)); + assert(keeper_api_version_path.starts_with(keeper_system_path)); + auto api_version_data = toString(static_cast(current_keeper_api_version)); + create_system_node(keeper_api_version_path, std::move(api_version_data)); } template diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index 6702c4cc718..fc39b569cc1 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -136,7 +137,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L for (const auto & itr : storage.container) { - if (itr.key != "/") + if (itr.key != "/" && !itr.key.toView().starts_with(keeper_system_path)) { auto parent_path = parentPath(itr.key); storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(path)); value.stat.numChildren++; }); diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 708d99cb011..30cb455e0f4 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1099,7 +1099,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 2); EXPECT_EQ(snapshot.session_id, 7); - EXPECT_EQ(snapshot.snapshot_container_size, 3); + EXPECT_EQ(snapshot.snapshot_container_size, 5); EXPECT_EQ(snapshot.session_and_timeout.size(), 2); auto buf = manager.serializeSnapshotToBuffer(snapshot); @@ -1111,8 +1111,8 @@ TEST_P(CoordinationTest, 
TestStorageSnapshotSimple) auto [restored_storage, snapshot_meta, _] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage->container.size(), 3); - EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1); + EXPECT_EQ(restored_storage->container.size(), 5); + EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2); EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); @@ -1143,14 +1143,14 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) DB::KeeperStorageSnapshot snapshot(&storage, 50); EXPECT_EQ(snapshot.snapshot_meta->get_last_log_idx(), 50); - EXPECT_EQ(snapshot.snapshot_container_size, 51); + EXPECT_EQ(snapshot.snapshot_container_size, 53); for (size_t i = 50; i < 100; ++i) { addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); } - EXPECT_EQ(storage.container.size(), 101); + EXPECT_EQ(storage.container.size(), 103); auto buf = manager.serializeSnapshotToBuffer(snapshot); manager.serializeSnapshotBufferToDisk(*buf, 50); @@ -1160,7 +1160,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) auto debuf = manager.deserializeSnapshotBufferFromDisk(50); auto [restored_storage, meta, _] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage->container.size(), 51); + EXPECT_EQ(restored_storage->container.size(), 53); for (size_t i = 0; i < 50; ++i) { EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i)); @@ -1199,7 +1199,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots) auto [restored_storage, meta, _] = manager.restoreFromLatestSnapshot(); - EXPECT_EQ(restored_storage->container.size(), 251); + EXPECT_EQ(restored_storage->container.size(), 253); for (size_t i = 0; i < 250; ++i) { @@ -1233,16 +1233,16 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) if (i % 2 == 0) storage.container.erase("/hello_" + std::to_string(i)); } - EXPECT_EQ(storage.container.size(), 26); - EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 102); + EXPECT_EQ(storage.container.size(), 28); + EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 104); EXPECT_EQ(storage.container.snapshotSizeWithVersion().second, 1); auto buf = manager.serializeSnapshotToBuffer(snapshot); manager.serializeSnapshotBufferToDisk(*buf, 50); } EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin" + params.extension)); - EXPECT_EQ(storage.container.size(), 26); + EXPECT_EQ(storage.container.size(), 28); storage.clearGarbageAfterSnapshot(); - EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 26); + EXPECT_EQ(storage.container.snapshotSizeWithVersion().first, 28); for (size_t i = 0; i < 50; ++i) { if (i % 2 != 0) @@ -1658,8 +1658,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) auto [restored_storage, snapshot_meta, _] = new_manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage->container.size(), 3); - EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1); + EXPECT_EQ(restored_storage->container.size(), 5); + EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 2); EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); @@ -1980,28 +1980,31 @@ TEST_P(CoordinationTest, TestListRequestTypes) 
int64_t zxid = 0; - static constexpr std::string_view path = "/test"; + static constexpr std::string_view test_path = "/list_request_type/node"; - const auto create_path = [&](bool is_ephemeral) + const auto create_path = [&](const auto & path, bool is_ephemeral, bool is_sequential = true) { const auto create_request = std::make_shared(); int new_zxid = ++zxid; create_request->path = path; - create_request->is_sequential = true; + create_request->is_sequential = is_sequential; create_request->is_ephemeral = is_ephemeral; storage.preprocessRequest(create_request, 1, 0, new_zxid); auto responses = storage.processRequest(create_request, 1, new_zxid); EXPECT_GE(responses.size(), 1); + EXPECT_EQ(responses[0].response->error, Coordination::Error::ZOK) << "Failed to create " << path; const auto & create_response = dynamic_cast(*responses[0].response); return create_response.path_created; }; + create_path(parentPath(StringRef{test_path}).toString(), false, false); + static constexpr size_t persistent_num = 5; std::unordered_set expected_persistent_children; for (size_t i = 0; i < persistent_num; ++i) { - expected_persistent_children.insert(getBaseName(create_path(false)).toString()); + expected_persistent_children.insert(getBaseName(create_path(test_path, false)).toString()); } ASSERT_EQ(expected_persistent_children.size(), persistent_num); @@ -2009,7 +2012,7 @@ TEST_P(CoordinationTest, TestListRequestTypes) std::unordered_set expected_ephemeral_children; for (size_t i = 0; i < ephemeral_num; ++i) { - expected_ephemeral_children.insert(getBaseName(create_path(true)).toString()); + expected_ephemeral_children.insert(getBaseName(create_path(test_path, true)).toString()); } ASSERT_EQ(expected_ephemeral_children.size(), ephemeral_num); @@ -2017,7 +2020,7 @@ TEST_P(CoordinationTest, TestListRequestTypes) { const auto list_request = std::make_shared(); int new_zxid = ++zxid; - list_request->path = parentPath(StringRef{path}).toString(); + list_request->path = parentPath(StringRef{test_path}).toString(); list_request->list_request_type = list_request_type; storage.preprocessRequest(list_request, 1, 0, new_zxid); auto responses = storage.processRequest(list_request, 1, new_zxid); From 150e058be959ea8fc641417cc2e8277edac291b5 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 20 Jul 2022 09:04:18 +0200 Subject: [PATCH 107/227] lockTablesForReading() comes back. 
--- src/Backups/BackupEntriesCollector.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index 6c1b0413368..d2e4b1f8c4b 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -239,6 +239,7 @@ bool BackupEntriesCollector::tryGatherMetadataAndCompareWithPrevious(std::option table_infos.clear(); gatherDatabasesMetadata(); gatherTablesMetadata(); + lockTablesForReading(); } catch (Exception & e) { From 84ef2c9a1f8ef6cb7d5f2a26c72ac38b08295162 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 20 Jul 2022 07:55:19 +0000 Subject: [PATCH 108/227] Fix integration tests --- .../integration/test_keeper_four_word_command/test.py | 8 ++++---- .../test_keeper_zookeeper_converter/test.py | 11 ++++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_keeper_four_word_command/test.py b/tests/integration/test_keeper_four_word_command/test.py index b60a8389cd0..e8136d322d3 100644 --- a/tests/integration/test_keeper_four_word_command/test.py +++ b/tests/integration/test_keeper_four_word_command/test.py @@ -227,8 +227,8 @@ def test_cmd_mntr(started_cluster): # contains: # 10 nodes created by test # 3 nodes created by clickhouse "/clickhouse/task_queue/ddl" - # 1 root node - assert int(result["zk_znode_count"]) == 11 + # 1 root node, 2 keeper system nodes + assert int(result["zk_znode_count"]) == 13 assert int(result["zk_watch_count"]) == 2 assert int(result["zk_ephemerals_count"]) == 2 assert int(result["zk_approximate_data_size"]) > 0 @@ -369,7 +369,7 @@ def test_cmd_srvr(started_cluster): assert int(result["Connections"]) == 1 assert int(result["Zxid"]) > 14 assert result["Mode"] == "leader" - assert result["Node count"] == "11" + assert result["Node count"] == "13" finally: destroy_zk_client(zk) @@ -407,7 +407,7 @@ def test_cmd_stat(started_cluster): assert int(result["Connections"]) == 1 assert int(result["Zxid"]) > 14 assert result["Mode"] == "leader" - assert result["Node count"] == "11" + assert result["Node count"] == "13" # filter connection statistics cons = [n for n in data.split("\n") if "=" in n] diff --git a/tests/integration/test_keeper_zookeeper_converter/test.py b/tests/integration/test_keeper_zookeeper_converter/test.py index 72b0d0cdd87..69e12102377 100644 --- a/tests/integration/test_keeper_zookeeper_converter/test.py +++ b/tests/integration/test_keeper_zookeeper_converter/test.py @@ -193,11 +193,16 @@ def compare_states(zk1, zk2, path="/", exclude_paths=[]): second_children = list(sorted(zk2.get_children(path))) print("Got children left", first_children) print("Got children rigth", second_children) - assert first_children == second_children, "Childrens are not equal on path " + path + + if path == "/": + assert set(first_children) ^ set(second_children) == set(["keeper"]) + else: + assert first_children == second_children, "Childrens are not equal on path " + path for children in first_children: - print("Checking child", os.path.join(path, children)) - compare_states(zk1, zk2, os.path.join(path, children), exclude_paths) + if path != "/" or children != "keeper": + print("Checking child", os.path.join(path, children)) + compare_states(zk1, zk2, os.path.join(path, children), exclude_paths) @pytest.mark.parametrize(("create_snapshots"), [True, False]) From 179d04518a4e3c7271a8501ad556bc708fa130ac Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 20 Jul 2022 08:10:52 +0000 Subject: [PATCH 109/227] Black --- 
tests/integration/test_keeper_zookeeper_converter/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_keeper_zookeeper_converter/test.py b/tests/integration/test_keeper_zookeeper_converter/test.py index 69e12102377..50a9ee6a4a7 100644 --- a/tests/integration/test_keeper_zookeeper_converter/test.py +++ b/tests/integration/test_keeper_zookeeper_converter/test.py @@ -197,7 +197,9 @@ def compare_states(zk1, zk2, path="/", exclude_paths=[]): if path == "/": assert set(first_children) ^ set(second_children) == set(["keeper"]) else: - assert first_children == second_children, "Childrens are not equal on path " + path + assert first_children == second_children, ( + "Childrens are not equal on path " + path + ) for children in first_children: if path != "/" or children != "keeper": From 784ee115945761ca7fc1e8262891d6cc18728070 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 20 Jul 2022 11:16:25 +0000 Subject: [PATCH 110/227] Add settings to skip fields with unsupported types in Protobuf/CapnProto schema inference --- src/Core/Settings.h | 2 + src/Formats/CapnProtoUtils.cpp | 43 +++++++++-- src/Formats/CapnProtoUtils.h | 2 +- src/Formats/FormatFactory.cpp | 2 + src/Formats/FormatSettings.h | 2 + src/Formats/ProtobufSerializer.cpp | 74 +++++++++++++------ src/Formats/ProtobufSerializer.h | 2 +- .../Formats/Impl/CapnProtoRowInputFormat.cpp | 2 +- .../Formats/Impl/ProtobufListInputFormat.cpp | 3 +- .../Formats/Impl/ProtobufListInputFormat.h | 1 + .../Formats/Impl/ProtobufRowInputFormat.cpp | 6 +- .../Formats/Impl/ProtobufRowInputFormat.h | 1 + ...apnproto_protobuf_empty_messages.reference | 8 +- ...02327_capnproto_protobuf_empty_messages.sh | 17 ++++- .../format_schemas/02327_schema.capnp | 5 +- .../format_schemas/02327_schema.proto | 5 +- 16 files changed, 129 insertions(+), 46 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index bda72f089eb..17514839b58 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -686,6 +686,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \ + M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Allow to skip fields with unsupported types while schema inference for format Protobuf", 0) \ + M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format CapnProto", 0) \ M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \ M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \ M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. 
The format: 'column1,column2,column3,...'", 0) \ diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index 65954315c0d..774ab00e2e3 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -450,7 +450,7 @@ static DataTypePtr getEnumDataTypeFromEnumSchema(const capnp::EnumSchema & enum_ throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "ClickHouse supports only 8 and 16-bit Enums"); } -static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) +static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type, bool skip_unsupported_fields) { switch (capnp_type.which()) { @@ -483,7 +483,9 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) case capnp::schema::Type::LIST: { auto list_schema = capnp_type.asList(); - auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType()); + auto nested_type = getDataTypeFromCapnProtoType(list_schema.getElementType(), skip_unsupported_fields); + if (!nested_type) + return nullptr; return std::make_shared(nested_type); } case capnp::schema::Type::STRUCT: @@ -492,19 +494,33 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) if (struct_schema.getFields().size() == 0) + { + if (skip_unsupported_fields) + return nullptr; throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Empty messages are not supported"); + } /// Check if it can be Nullable. if (checkIfStructIsNamedUnion(struct_schema)) { auto fields = struct_schema.getUnionFields(); if (fields.size() != 2 || (!fields[0].getType().isVoid() && !fields[1].getType().isVoid())) + { + if (skip_unsupported_fields) + return nullptr; throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unions are not supported"); + } auto value_type = fields[0].getType().isVoid() ? 
fields[1].getType() : fields[0].getType(); if (value_type.isStruct() || value_type.isList()) + { + if (skip_unsupported_fields) + return nullptr; throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Tuples and Lists cannot be inside Nullable"); + } - auto nested_type = getDataTypeFromCapnProtoType(value_type); + auto nested_type = getDataTypeFromCapnProtoType(value_type, skip_unsupported_fields); + if (!nested_type) + return nullptr; return std::make_shared(nested_type); } @@ -516,17 +532,26 @@ static DataTypePtr getDataTypeFromCapnProtoType(const capnp::Type & capnp_type) Names nested_names; for (auto field : struct_schema.getNonUnionFields()) { + auto nested_type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (!nested_type) + continue; nested_names.push_back(field.getProto().getName()); - nested_types.push_back(getDataTypeFromCapnProtoType(field.getType())); + nested_types.push_back(nested_type); } + if (nested_types.empty()) + return nullptr; return std::make_shared(std::move(nested_types), std::move(nested_names)); } default: + { + if (skip_unsupported_fields) + return nullptr; throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unsupported CapnProtoType: {}", getCapnProtoFullTypeName(capnp_type)); + } } } -NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields) { if (checkIfStructContainsUnnamedUnion(schema)) throw Exception(ErrorCodes::CAPN_PROTO_BAD_TYPE, "Unnamed union is not supported"); @@ -535,9 +560,13 @@ NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema) for (auto field : schema.getNonUnionFields()) { auto name = field.getProto().getName(); - auto type = getDataTypeFromCapnProtoType(field.getType()); - names_and_types.emplace_back(name, type); + auto type = getDataTypeFromCapnProtoType(field.getType(), skip_unsupported_fields); + if (type) + names_and_types.emplace_back(name, type); } + if (names_and_types.empty()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); + return names_and_types; } diff --git a/src/Formats/CapnProtoUtils.h b/src/Formats/CapnProtoUtils.h index 47fe3ada7cd..50f146a05f6 100644 --- a/src/Formats/CapnProtoUtils.h +++ b/src/Formats/CapnProtoUtils.h @@ -38,7 +38,7 @@ capnp::DynamicValue::Reader getReaderByColumnName(const capnp::DynamicStruct::Re void checkCapnProtoSchemaStructure(const capnp::StructSchema & schema, const Block & header, FormatSettings::EnumComparingMode mode); -NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema); +NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, bool skip_unsupported_fields); } #endif diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 756b33d3eb2..50d2ee12082 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -110,6 +110,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.pretty.output_format_pretty_row_numbers = settings.output_format_pretty_row_numbers; format_settings.protobuf.input_flatten_google_wrappers = settings.input_format_protobuf_flatten_google_wrappers; format_settings.protobuf.output_nullables_with_google_wrappers = settings.output_format_protobuf_nullables_with_google_wrappers; + format_settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference = 
settings.input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference; format_settings.regexp.escaping_rule = settings.format_regexp_escaping_rule; format_settings.regexp.regexp = settings.format_regexp; format_settings.regexp.skip_unmatched = settings.format_regexp_skip_unmatched; @@ -151,6 +152,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.orc.output_string_as_string = settings.output_format_orc_string_as_string; format_settings.defaults_for_omitted_fields = settings.input_format_defaults_for_omitted_fields; format_settings.capn_proto.enum_comparing_mode = settings.format_capn_proto_enum_comparising_mode; + format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference = settings.input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference; format_settings.seekable_read = settings.input_format_allow_seeks; format_settings.msgpack.number_of_columns = settings.input_format_msgpack_number_of_columns; format_settings.msgpack.output_uuid_representation = settings.output_format_msgpack_uuid_representation; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 70bf8979383..b7c55d11beb 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -185,6 +185,7 @@ struct FormatSettings * because Protobuf without delimiters is not generally useful. */ bool allow_multiple_rows_without_delimiter = false; + bool skip_fields_with_unsupported_types_in_schema_inference = false; } protobuf; struct @@ -255,6 +256,7 @@ struct FormatSettings struct { EnumComparingMode enum_comparing_mode = EnumComparingMode::BY_VALUES; + bool skip_fields_with_unsupported_types_in_schema_inference = false; } capn_proto; enum class MsgPackUUIDRepresentation diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 02054d0c1ed..203502150ad 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -3427,19 +3427,23 @@ namespace return std::make_shared>(std::move(values)); } - NameAndTypePair getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool allow_repeat = true) + std::optional getNameAndDataTypeFromField(const google::protobuf::FieldDescriptor * field_descriptor, bool skip_unsupported_fields, bool allow_repeat = true) { if (allow_repeat && field_descriptor->is_map()) { - auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); - const auto * tuple_type = assert_cast(name_and_type.type.get()); - return {name_and_type.name, std::make_shared(tuple_type->getElements())}; + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, skip_unsupported_fields, false); + if (!name_and_type) + return std::nullopt; + const auto * tuple_type = assert_cast(name_and_type->type.get()); + return NameAndTypePair{name_and_type->name, std::make_shared(tuple_type->getElements())}; } if (allow_repeat && field_descriptor->is_repeated()) { - auto name_and_type = getNameAndDataTypeFromField(field_descriptor, false); - return {name_and_type.name, std::make_shared(name_and_type.type)}; + auto name_and_type = getNameAndDataTypeFromField(field_descriptor, skip_unsupported_fields, false); + if (!name_and_type) + return std::nullopt; + return NameAndTypePair{name_and_type->name, std::make_shared(name_and_type->type)}; } switch (field_descriptor->type()) @@ -3447,31 +3451,35 @@ namespace case FieldTypeId::TYPE_SFIXED32: case FieldTypeId::TYPE_SINT32: case 
FieldTypeId::TYPE_INT32: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_SFIXED64: case FieldTypeId::TYPE_SINT64: case FieldTypeId::TYPE_INT64: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_BOOL: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_FLOAT: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_DOUBLE: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_UINT32: case FieldTypeId::TYPE_FIXED32: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_UINT64: case FieldTypeId::TYPE_FIXED64: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_BYTES: case FieldTypeId::TYPE_STRING: - return {field_descriptor->name(), std::make_shared()}; + return NameAndTypePair{field_descriptor->name(), std::make_shared()}; case FieldTypeId::TYPE_ENUM: { const auto * enum_descriptor = field_descriptor->enum_type(); if (enum_descriptor->value_count() == 0) + { + if (skip_unsupported_fields) + return std::nullopt; throw Exception("Empty enum field", ErrorCodes::BAD_ARGUMENTS); + } int max_abs = std::abs(enum_descriptor->value(0)->number()); for (int i = 1; i != enum_descriptor->value_count(); ++i) { @@ -3479,11 +3487,15 @@ namespace max_abs = std::abs(enum_descriptor->value(i)->number()); } if (max_abs < 128) - return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + return NameAndTypePair{field_descriptor->name(), getEnumDataType(enum_descriptor)}; else if (max_abs < 32768) - return {field_descriptor->name(), getEnumDataType(enum_descriptor)}; + return NameAndTypePair{field_descriptor->name(), getEnumDataType(enum_descriptor)}; else + { + if (skip_unsupported_fields) + return std::nullopt; throw Exception("ClickHouse supports only 8-bit and 16-bit enums", ErrorCodes::BAD_ARGUMENTS); + } } case FieldTypeId::TYPE_GROUP: case FieldTypeId::TYPE_MESSAGE: @@ -3491,13 +3503,17 @@ namespace const auto * message_descriptor = field_descriptor->message_type(); if (message_descriptor->field_count() == 0) { + if (skip_unsupported_fields) + return std::nullopt; throw Exception("Empty messages are not supported", ErrorCodes::BAD_ARGUMENTS); } else if (message_descriptor->field_count() == 1) { const auto * nested_field_descriptor = message_descriptor->field(0); - auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor); - return {field_descriptor->name() + "_" + nested_name_and_type.name, nested_name_and_type.type}; + auto nested_name_and_type = getNameAndDataTypeFromField(nested_field_descriptor, skip_unsupported_fields); + if (!nested_name_and_type) + return std::nullopt; + return NameAndTypePair{field_descriptor->name() + "_" + nested_name_and_type->name, nested_name_and_type->type}; } else { @@ -3505,11 +3521,16 @@ namespace Strings nested_names; for (int i = 0; i != message_descriptor->field_count(); ++i) { - auto nested_name_and_type = 
getNameAndDataTypeFromField(message_descriptor->field(i)); - nested_types.push_back(nested_name_and_type.type); - nested_names.push_back(nested_name_and_type.name); + auto nested_name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i), skip_unsupported_fields); + if (!nested_name_and_type) + continue; + nested_types.push_back(nested_name_and_type->type); + nested_names.push_back(nested_name_and_type->name); } - return {field_descriptor->name(), std::make_shared(std::move(nested_types), std::move(nested_names))}; + + if (nested_types.empty()) + return std::nullopt; + return NameAndTypePair{field_descriptor->name(), std::make_shared(std::move(nested_types), std::move(nested_names))}; } } } @@ -3544,11 +3565,16 @@ std::unique_ptr ProtobufSerializer::create( return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter, with_envelope, defaults_for_nullable_google_wrappers); } -NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor) +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor, bool skip_unsupported_fields) { NamesAndTypesList schema; for (int i = 0; i != message_descriptor->field_count(); ++i) - schema.push_back(getNameAndDataTypeFromField(message_descriptor->field(i))); + { + if (auto name_and_type = getNameAndDataTypeFromField(message_descriptor->field(i), skip_unsupported_fields)) + schema.push_back(*name_and_type); + } + if (schema.empty()) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot convert Protobuf schema to ClickHouse table schema, all fields have unsupported types"); return schema; } diff --git a/src/Formats/ProtobufSerializer.h b/src/Formats/ProtobufSerializer.h index 607d0b15b63..7cbfe5fd42c 100644 --- a/src/Formats/ProtobufSerializer.h +++ b/src/Formats/ProtobufSerializer.h @@ -54,7 +54,7 @@ public: ProtobufWriter & writer); }; -NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor); +NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * message_descriptor, bool skip_unsupported_fields); } #endif diff --git a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp index ad173e449d6..8da36fecd92 100644 --- a/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CapnProtoRowInputFormat.cpp @@ -298,7 +298,7 @@ NamesAndTypesList CapnProtoSchemaReader::readSchema() auto schema_parser = CapnProtoSchemaParser(); auto schema = schema_parser.getMessageSchema(schema_info); - return capnProtoSchemaToCHSchema(schema); + return capnProtoSchemaToCHSchema(schema, format_settings.capn_proto.skip_fields_with_unsupported_types_in_schema_inference); } void registerInputFormatCapnProto(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp index 4599734591f..d4d80fe3a23 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.cpp @@ -58,13 +58,14 @@ ProtobufListSchemaReader::ProtobufListSchemaReader(const FormatSettings & format true, format_settings.schema.is_server, format_settings.schema.format_schema_path) + , skip_unsopported_fields(format_settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference) { } NamesAndTypesList 
ProtobufListSchemaReader::readSchema() { const auto * message_descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info, ProtobufSchemas::WithEnvelope::Yes); - return protobufSchemaToCHSchema(message_descriptor); + return protobufSchemaToCHSchema(message_descriptor, skip_unsopported_fields); } void registerInputFormatProtobufList(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ProtobufListInputFormat.h b/src/Processors/Formats/Impl/ProtobufListInputFormat.h index 03f56077dec..2f334048ad2 100644 --- a/src/Processors/Formats/Impl/ProtobufListInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufListInputFormat.h @@ -50,6 +50,7 @@ public: private: const FormatSchemaInfo schema_info; + bool skip_unsopported_fields; }; } diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index 0376bf2c292..f4329b7ecfe 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -78,15 +78,15 @@ ProtobufSchemaReader::ProtobufSchemaReader(const FormatSettings & format_setting format_settings.schema.format_schema, "Protobuf", true, - format_settings.schema.is_server, - format_settings.schema.format_schema_path) + format_settings.schema.is_server, format_settings.schema.format_schema_path) + , skip_unsupported_fields(format_settings.protobuf.skip_fields_with_unsupported_types_in_schema_inference) { } NamesAndTypesList ProtobufSchemaReader::readSchema() { const auto * message_descriptor = ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info, ProtobufSchemas::WithEnvelope::No); - return protobufSchemaToCHSchema(message_descriptor); + return protobufSchemaToCHSchema(message_descriptor, skip_unsupported_fields); } void registerProtobufSchemaReader(FormatFactory & factory) diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index c9ba573f103..3d00ee4794e 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -57,6 +57,7 @@ public: private: const FormatSchemaInfo schema_info; + bool skip_unsupported_fields; }; } diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference index b462a5a7baa..0c7da0c3ce4 100644 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference @@ -1,4 +1,8 @@ OK OK -OK -OK +FAIL +FAIL +str String +text String +str String +text String diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh index 3890f013b3b..a569a6435f6 100755 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -18,7 +18,20 @@ cp -r $CLIENT_SCHEMADIR/02327_* $SCHEMADIR/$SERVER_SCHEMADIR/ $CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT 
--query="create table t engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table t engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; + +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; + +$CLICKHOUSE_CLIENT --query="drop table if exists test_protobuf"; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="desc test_protobuf"; +$CLICKHOUSE_CLIENT --query="drop table test_protobuf"; + +$CLICKHOUSE_CLIENT --query="drop table if exists test_capnp"; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="desc test_capnp"; +$CLICKHOUSE_CLIENT --query="drop table test_capnp"; rm -rf ${SCHEMADIR:?}/${SERVER_SCHEMADIR:?} diff --git a/tests/queries/0_stateless/format_schemas/02327_schema.capnp b/tests/queries/0_stateless/format_schemas/02327_schema.capnp index c882dcab8d4..12ccc7308c9 100644 --- a/tests/queries/0_stateless/format_schemas/02327_schema.capnp +++ b/tests/queries/0_stateless/format_schemas/02327_schema.capnp @@ -1,10 +1,11 @@ @0x9ef128e10a8010b8; -struct Nested1 +struct Empty { } struct Message { - tuple1 @0 : Nested1; + tuple1 @0 : Empty; + text @1 : Text; } diff --git a/tests/queries/0_stateless/format_schemas/02327_schema.proto b/tests/queries/0_stateless/format_schemas/02327_schema.proto index ae1f440d279..b5067393558 100644 --- a/tests/queries/0_stateless/format_schemas/02327_schema.proto +++ b/tests/queries/0_stateless/format_schemas/02327_schema.proto @@ -1,8 +1,9 @@ syntax = "proto3"; -message Nested { +message Empty { } message Message { - Nested nested = 1; + Empty empty = 1; + string str = 2; }; From d43b1d62800ccd65656bea2f8f9dda583b3fcc29 Mon Sep 17 00:00:00 2001 From: Niek <93536181+nikoloko@users.noreply.github.com> Date: Wed, 20 Jul 2022 13:21:06 +0200 Subject: [PATCH 111/227] Corrected structure of the users section I received feedback from a user that the structure of the user's section was not fully correct. I changed accordingly. 
--- docs/en/operations/settings/settings-users.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings-users.md b/docs/en/operations/settings/settings-users.md index 6b3340bfce2..101ad46e55a 100644 --- a/docs/en/operations/settings/settings-users.md +++ b/docs/en/operations/settings/settings-users.md @@ -29,7 +29,7 @@ Structure of the `users` section: profile_name default - default + default From c411763c2fb5208f2828a5ff25f77525c899a2e9 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Wed, 20 Jul 2022 09:41:00 -0400 Subject: [PATCH 112/227] split the note into two notes --- .../external-dicts-dict-sources.md | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index 1a5308b5569..280dc1f54f4 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -5,9 +5,9 @@ sidebar_label: Sources of External Dictionaries # Sources of External Dictionaries -An external dictionary can be connected from many different sources. +An external dictionary can be connected to ClickHouse from many different sources. -If dictionary is configured using xml-file, the configuration looks like this: +If the dictionary is configured using an xml-file, the configuration looks like this: ``` xml @@ -24,7 +24,7 @@ If dictionary is configured using xml-file, the configuration looks like this: ``` -In case of [DDL-query](../../../sql-reference/statements/create/dictionary.md), equal configuration will looks like: +In case of [DDL-query](../../../sql-reference/statements/create/dictionary.md), the configuration described above will look like: ``` sql CREATE DICTIONARY dict_name (...) @@ -96,7 +96,7 @@ Setting fields: - `path` – The absolute path to the file. - `format` – The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. -When dictionary with source `FILE` is created via DDL command (`CREATE DICTIONARY ...`), the source file needs to be located in `user_files` directory, to prevent DB users accessing arbitrary file on ClickHouse node. +When a dictionary with source `FILE` is created via DDL command (`CREATE DICTIONARY ...`), the source file needs to be located in the `user_files` directory to prevent DB users from accessing arbitrary files on the ClickHouse node. **See Also** @@ -104,7 +104,7 @@ When dictionary with source `FILE` is created via DDL command (`CREATE DICTIONAR ## Executable File -Working with executable files depends on [how the dictionary is stored in memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts executable file and treats its output as dictionary data. +Working with executable files depends on [how the dictionary is stored in memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. 
Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. Example of settings: @@ -120,22 +120,22 @@ Example of settings: Setting fields: -- `command` — The absolute path to the executable file, or the file name (if the program directory is written to `PATH`). +- `command` — The absolute path to the executable file, or the file name (if the command's directory is in the `PATH`). - `format` — The file format. All the formats described in [Formats](../../../interfaces/formats.md#formats) are supported. -- `command_termination_timeout` — executable script should contain main read-write loop. After dictionary is destroyed, pipe is closed, and executable file will have `command_termination_timeout` seconds to shutdown, before ClickHouse will send SIGTERM signal to child process. Specified in seconds. Default value is 10. Optional parameter. -- `command_read_timeout` - timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. -- `command_write_timeout` - timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. +- `command_termination_timeout` — The executable script should contain a main read-write loop. After the dictionary is destroyed, the pipe is closed, and the executable file will have `command_termination_timeout` seconds to shutdown before ClickHouse will send a SIGTERM signal to the child process. `command_termination_timeout` is specified in seconds. Default value is 10. Optional parameter. +- `command_read_timeout` - Timeout for reading data from command stdout in milliseconds. Default value 10000. Optional parameter. +- `command_write_timeout` - Timeout for writing data to command stdin in milliseconds. Default value 10000. Optional parameter. - `implicit_key` — The executable source file can return only values, and the correspondence to the requested keys is determined implicitly — by the order of rows in the result. Default value is false. -- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder specified by [user_scripts_path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. Optional parameter. +- `execute_direct` - If `execute_direct` = `1`, then `command` will be searched inside user_scripts folder specified by [user_scripts_path](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_scripts_path). Additional script arguments can be specified using a whitespace separator. Example: `script_name arg1 arg2`. If `execute_direct` = `0`, `command` is passed as argument for `bin/sh -c`. Default value is `0`. Optional parameter. - `send_chunk_header` - controls whether to send row count before sending a chunk of data to process. Optional. Default value is `false`. -That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. +That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node. 
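The read-write loop described above is easy to sketch as a standalone program. The following is only an illustration, not part of the patch: the single String key, the single String attribute, the TabSeparated format and the placeholder value it produces are all assumptions. The process keeps reading keys from STDIN and answers each one with a key, a tab and an attribute value on STDOUT, flushing early because ClickHouse keeps STDIN open between blocks.

// Hypothetical executable dictionary source: one String key in, "key\tvalue" rows out (TabSeparated).
#include <iostream>
#include <string>

int main()
{
    std::string key;
    while (std::getline(std::cin, key))            // main read-write loop, STDIN stays open between key blocks
    {
        std::cout << key << '\t' << "value_for_" << key << '\n';   // placeholder attribute value
        std::cout.flush();                         // flush early so ClickHouse can read the answer for this block
    }
    return 0;
}

With the `cache` layout ClickHouse only sends the keys it is missing, so the pool variant described in the next section simply keeps several such processes running the same loop.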
## Executable Pool -Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](external-dicts-dict-layout.md#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, `complex_key_direct` layouts. +Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](external-dicts-dict-layout.md#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts. -Executable pool will spawn pool of processes with specified command and keep them running until they exit. The program should read data from STDIN while it is available and output result to STDOUT, and it can wait for next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early. +Executable pool will spawn a pool of processes with the specified command and keep them running until they exit. The program should read data from STDIN while it is available and output the result to STDOUT. It can wait for the next block of data on STDIN. ClickHouse will not close STDIN after processing a block of data, but will pipe another chunk of data when needed. The executable script should be ready for this way of data processing — it should poll STDIN and flush data to STDOUT early. Example of settings: @@ -553,10 +553,13 @@ Setting fields: :::note The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. -There is no explicit parameter `secure`. Both cases: when establishing SSL-connection is mandatory and when it's not are handled automatically. ::: -MySQL can be connected on a local host via sockets. To do this, set `host` and `socket`. +:::note +There is no explicit parameter `secure`. When establishing an SSL-connection security is mandatory. +::: + +MySQL can be connected to on a local host via sockets. To do this, set `host` and `socket`. 
Example of settings: From 7d05ae786fc3ed0782bad11a67acac5384bfa74d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 20 Jul 2022 16:27:11 +0200 Subject: [PATCH 113/227] Update 02327_capnproto_protobuf_empty_messages.reference --- .../02327_capnproto_protobuf_empty_messages.reference | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference index 0c7da0c3ce4..842cf482414 100644 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.reference @@ -1,7 +1,7 @@ OK OK -FAIL -FAIL +OK +OK str String text String str String From 17a271ec30a4598d0eb4a29ec27afb58b4746e11 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 20 Jul 2022 14:33:46 +0000 Subject: [PATCH 114/227] Fix error codes --- src/Formats/CapnProtoUtils.cpp | 2 +- src/Formats/ProtobufSerializer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index 774ab00e2e3..3db8672b6f9 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -565,7 +565,7 @@ NamesAndTypesList capnProtoSchemaToCHSchema(const capnp::StructSchema & schema, names_and_types.emplace_back(name, type); } if (names_and_types.empty()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot convert CapnProto schema to ClickHouse table schema, all fields have unsupported types"); return names_and_types; } diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp index 203502150ad..42e02fd4f45 100644 --- a/src/Formats/ProtobufSerializer.cpp +++ b/src/Formats/ProtobufSerializer.cpp @@ -3574,7 +3574,7 @@ NamesAndTypesList protobufSchemaToCHSchema(const google::protobuf::Descriptor * schema.push_back(*name_and_type); } if (schema.empty()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot convert Protobuf schema to ClickHouse table schema, all fields have unsupported types"); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot convert Protobuf schema to ClickHouse table schema, all fields have unsupported types"); return schema; } From 605fc5f12186438277e621166a75b53e48175c38 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 20 Jul 2022 21:02:54 +0200 Subject: [PATCH 115/227] Addressed review comments - Added finalizeImpl() override in ForkWriteBuffer to call finalize() of all the buffers. - Removed clearing buffer in ForkWriteBuffer destructor. 
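The pattern behind this fix can be sketched with simplified stand-in classes (the Sink and FanOutSink names and methods below are illustrative assumptions, not the actual WriteBuffer hierarchy): a fan-out sink that owns several child sinks propagates finalization to every child in finalizeImpl(), and its destructor only calls finalize() instead of resetting the working buffer.

// Simplified stand-in classes, not the real ClickHouse WriteBuffer hierarchy.
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

class Sink
{
public:
    virtual ~Sink() = default;

    void finalize()
    {
        if (finalized)
            return;             // safe to call more than once
        finalized = true;
        finalizeImpl();
    }

protected:
    virtual void finalizeImpl() { std::cout << "flushing one sink\n"; }

private:
    bool finalized = false;
};

class FanOutSink : public Sink
{
public:
    explicit FanOutSink(std::vector<std::shared_ptr<Sink>> children_) : children(std::move(children_)) {}

    ~FanOutSink() override { finalize(); }   // only finalize here, do not clear any working buffer

protected:
    void finalizeImpl() override
    {
        for (const auto & child : children)
            child->finalize();               // propagate finalization to every child exactly once
    }

private:
    std::vector<std::shared_ptr<Sink>> children;
};

int main()
{
    FanOutSink fork({std::make_shared<Sink>(), std::make_shared<Sink>()});
    fork.finalize();                         // both children flush; the later destructor call is a no-op
}

Keeping finalize() idempotent in the base class is what makes it safe for the destructor to call it again after an explicit finalize().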
--- src/IO/ForkWriteBuffer.cpp | 9 ++++++++- src/IO/ForkWriteBuffer.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/IO/ForkWriteBuffer.cpp b/src/IO/ForkWriteBuffer.cpp index e50c8e2409c..e4868d5c9a8 100644 --- a/src/IO/ForkWriteBuffer.cpp +++ b/src/IO/ForkWriteBuffer.cpp @@ -43,10 +43,17 @@ void ForkWriteBuffer::nextImpl() } +void ForkWriteBuffer::finalizeImpl() +{ + for (const WriteBufferPtr & buffer : sources) + { + buffer->finalize(); + } +} + ForkWriteBuffer::~ForkWriteBuffer() { finalize(); - set(nullptr, 0); } diff --git a/src/IO/ForkWriteBuffer.h b/src/IO/ForkWriteBuffer.h index 56e9c445842..17fc82028a9 100644 --- a/src/IO/ForkWriteBuffer.h +++ b/src/IO/ForkWriteBuffer.h @@ -25,6 +25,7 @@ public: protected: void nextImpl() override; + void finalizeImpl() override; private: WriteBufferPtrs sources; From 6578d4f2b4a5f125682195cbf4023000797a5d3f Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Wed, 20 Jul 2022 22:23:52 +0200 Subject: [PATCH 116/227] Mix lightweight deletes with old-style alter-delete's --- ...lightweight_delete_on_merge_tree.reference | 4 +- ...02319_lightweight_delete_on_merge_tree.sql | 3 +- .../02352_lightweight_delete.reference | 41 ++++++++++++------- .../0_stateless/02352_lightweight_delete.sql | 34 +++++++++++++-- 4 files changed, 61 insertions(+), 21 deletions(-) diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference index bc30d677348..fc646843eee 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference @@ -8,8 +8,8 @@ 1 1 1 -1 DELETE WHERE (c % 5) = 1 1 -1 DELETE WHERE c = 4 1 +0 UPDATE _row_exists = 0 WHERE (c % 5) = 1 1 +0 UPDATE _row_exists = 0 WHERE c = 4 1 0 MATERIALIZE INDEX i_c 1 0 UPDATE b = -1 WHERE a < 3 1 0 DROP INDEX i_c 1 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index 3c3df06915f..24afa5fb196 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -5,7 +5,8 @@ CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTr INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); SET mutations_sync = 1; -SET allow_experimental_lightweight_delete = 1; +SET allow_experimental_lightweight_delete = 0; +SET allow_experimental_lightweight_delete_with_row_exists = 1; DELETE FROM merge_table_standard_delete WHERE id = 10; diff --git a/tests/queries/0_stateless/02352_lightweight_delete.reference b/tests/queries/0_stateless/02352_lightweight_delete.reference index 2c62a8cf5ea..3386b3294c3 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.reference +++ b/tests/queries/0_stateless/02352_lightweight_delete.reference @@ -1,23 +1,36 @@ Rows in parts 1000000 Count 1000000 First row 0 10 -Delete 300K rows using lightweight DELETE +Delete 100K rows using lightweight DELETE Rows in parts 1000000 -Count 700000 -First row 300000 10 +Count 900000 +First row 100000 10 Force merge to cleanup deleted rows -Rows in parts 700000 -Count 700000 -First row 300000 10 -Delete 300K more rows using lightweight DELETE -Rows in parts 700000 -Count 400000 -First row 600000 10 +Rows in parts 900000 +Count 900000 
+First row 100000 10 +Delete 100K more rows using lightweight DELETE +Rows in parts 900000 +Count 800000 +First row 200000 10 Do UPDATE mutation -Rows in parts 700000 -Count 400000 -First row 600000 1 +Rows in parts 900000 +Count 800000 +First row 200000 1 +Force merge to cleanup deleted rows +Rows in parts 800000 +Count 800000 +First row 200000 1 +Delete 100K more rows using lightweight DELETE +Rows in parts 800000 +Count 700000 +First row 300000 1 +Do ALTER DELETE mutation that does a "heavyweight" delete +Rows in parts 533333 +Count 466666 +First row 300001 10 +Delete 100K more rows using lightweight DELETE Force merge to cleanup deleted rows Rows in parts 400000 Count 400000 -First row 600000 1 +First row 400000 1 diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index cc66898d749..46336a57c3a 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -13,9 +13,9 @@ SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 300K rows using lightweight DELETE'; +SELECT 'Delete 100K rows using lightweight DELETE'; --ALTER TABLE lwd_test UPDATE _row_exists = 0 WHERE id < 3000000; -DELETE FROM lwd_test WHERE id < 300000; +DELETE FROM lwd_test WHERE id < 100000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; SELECT 'Count', count() FROM lwd_test; @@ -30,8 +30,8 @@ SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; -SELECT 'Delete 300K more rows using lightweight DELETE'; -DELETE FROM lwd_test WHERE id < 600000; +SELECT 'Delete 100K more rows using lightweight DELETE'; +DELETE FROM lwd_test WHERE id < 200000; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; SELECT 'Count', count() FROM lwd_test; @@ -54,5 +54,31 @@ SELECT 'Count', count() FROM lwd_test; SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; +SELECT 'Delete 100K more rows using lightweight DELETE'; +DELETE FROM lwd_test WHERE id < 300000; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; +SELECT 'Count', count() FROM lwd_test; +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + + +SELECT 'Do ALTER DELETE mutation that does a "heavyweight" delete'; +ALTER TABLE lwd_test DELETE WHERE id % 3 == 0; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; +SELECT 'Count', count() FROM lwd_test; +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + +SELECT 'Delete 100K more rows using lightweight DELETE'; +DELETE FROM lwd_test WHERE id >= 300000 and id < 400000; + + +SELECT 'Force merge to cleanup deleted rows'; +OPTIMIZE TABLE lwd_test FINAL; + +SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; +SELECT 'Count', count() FROM lwd_test; +SELECT 'First row', id, length(value) FROM lwd_test ORDER BY id LIMIT 1; + DROP TABLE lwd_test; From 965f96bd8476d0edbc50521029345db03d9f249f Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 20 Jul 2022 20:44:47 +0000 Subject: [PATCH 117/227] DISTINCT in order: perf improvement + reduce allocations in 
DistinctSortedChunkTransform + use it for final distinct as well --- src/Processors/QueryPlan/DistinctStep.cpp | 23 ++------- .../DistinctSortedChunkTransform.cpp | 49 ++++++++++++------- .../Transforms/DistinctSortedChunkTransform.h | 16 +++--- ...ct_in_order_optimization_explain.reference | 4 +- ..._distinct_in_order_optimization_explain.sh | 2 +- 5 files changed, 49 insertions(+), 45 deletions(-) diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index 553732fbcc5..d1ca985bb2a 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -94,8 +94,10 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil SortDescription distinct_sort_desc = getSortDescription(input_stream.sort_description, columns); if (!distinct_sort_desc.empty()) { - /// pre-distinct for sorted chunks - if (pre_distinct) + const bool sorted_stream = input_stream.sort_mode == DataStream::SortMode::Stream; + /// pre-distinct for sorted chunks or + /// final distinct for sorted stream (sorting inside and among chunks) + if (pre_distinct || sorted_stream) { pipeline.addSimpleTransform( [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr @@ -104,22 +106,7 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil return nullptr; return std::make_shared( - header, set_size_limits, limit_hint, distinct_sort_desc, columns); - }); - return; - } - /// final distinct for sorted stream (sorting inside and among chunks) - if (input_stream.sort_mode == DataStream::SortMode::Stream) - { - assert(input_stream.has_single_port); - - pipeline.addSimpleTransform( - [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr - { - if (stream_type != QueryPipelineBuilder::StreamType::Main) - return nullptr; - - return std::make_shared(header, distinct_sort_desc, set_size_limits, limit_hint, columns); + header, set_size_limits, limit_hint, distinct_sort_desc, columns, sorted_stream); }); return; } diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp index 064c827a8cc..8604cca5a5c 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp @@ -13,11 +13,13 @@ DistinctSortedChunkTransform::DistinctSortedChunkTransform( const SizeLimits & output_size_limits_, UInt64 limit_hint_, const SortDescription & sorted_columns_descr_, - const Names & source_columns) + const Names & source_columns, + const bool sorted_stream_) : ISimpleTransform(header_, header_, true) , limit_hint(limit_hint_) , output_size_limits(output_size_limits_) , sorted_columns_descr(sorted_columns_descr_) + , sorted_stream(sorted_stream_) { /// calculate sorted columns positions sorted_columns_pos.reserve(sorted_columns_descr.size()); @@ -43,7 +45,7 @@ DistinctSortedChunkTransform::DistinctSortedChunkTransform( /// reserve space in auxiliary column vectors for processing sorted_columns.reserve(sorted_columns_pos.size()); other_columns.reserve(other_columns_pos.size()); - current_key.reserve(sorted_columns.size()); + prev_chunk_latest_key.reserve(sorted_columns.size()); } void DistinctSortedChunkTransform::initChunkProcessing(const Columns & input_columns) @@ -101,28 +103,40 @@ size_t DistinctSortedChunkTransform::buildFilterForRange( return count; } -void DistinctSortedChunkTransform::setCurrentKey(const size_t 
row_pos) +void DistinctSortedChunkTransform::saveLatestKey(const size_t row_pos) { - current_key.clear(); + prev_chunk_latest_key.clear(); for (auto const & col : sorted_columns) { - current_key.emplace_back(col->cloneEmpty()); - current_key.back()->insertFrom(*col, row_pos); + prev_chunk_latest_key.emplace_back(col->cloneEmpty()); + prev_chunk_latest_key.back()->insertFrom(*col, row_pos); } } -bool DistinctSortedChunkTransform::isCurrentKey(const size_t row_pos) const +bool DistinctSortedChunkTransform::isKey(const size_t key_pos, const size_t row_pos) const { for (size_t i = 0; i < sorted_columns.size(); ++i) { - int res = current_key[i]->compareAt(0, row_pos, *sorted_columns[i], sorted_columns_descr[i].nulls_direction); + const int res = sorted_columns[i]->compareAt(key_pos, row_pos, *sorted_columns[i], sorted_columns_descr[i].nulls_direction); if (res != 0) return false; } return true; } -size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const +bool DistinctSortedChunkTransform::isLatestKeyFromPrevChunk(const size_t row_pos) const +{ + for (size_t i = 0; i < sorted_columns.size(); ++i) + { + const int res = prev_chunk_latest_key[i]->compareAt(0, row_pos, *sorted_columns[i], sorted_columns_descr[i].nulls_direction); + if (res != 0) + return false; + } + return true; +} + +template +size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end, Predicate pred) const { assert(begin < end); @@ -133,7 +147,7 @@ size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const for (size_t pos = begin; pos < linear_probe_end; ++pos) { - if (!isCurrentKey(pos)) + if (!pred(begin, pos)) return pos; } @@ -142,7 +156,7 @@ size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const while (low <= high) { size_t mid = low + (high - low) / 2; - if (isCurrentKey(mid)) + if (pred(begin, mid)) low = mid + 1; else { @@ -155,13 +169,13 @@ size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const std::pair DistinctSortedChunkTransform::continueWithPrevRange(const size_t chunk_rows, IColumn::Filter & filter) { - /// current_key is empty on very first transform() call + /// prev_chunk_latest_key is empty on very first transform() call /// or first row doesn't match a key from previous transform() - if (current_key.empty() || !isCurrentKey(0)) + if (prev_chunk_latest_key.empty() || !isLatestKeyFromPrevChunk(0)) return {0, 0}; size_t output_rows = 0; - const size_t range_end = getRangeEnd(0, chunk_rows); + const size_t range_end = getRangeEnd(0, chunk_rows, [&](size_t, size_t row_pos) { return isLatestKeyFromPrevChunk(row_pos); }); if (other_columns.empty()) std::fill(filter.begin(), filter.begin() + range_end, 0); /// skip rows already included in distinct on previous transform() else @@ -191,11 +205,8 @@ void DistinctSortedChunkTransform::transform(Chunk & chunk) size_t range_end = range_begin; while (range_end != chunk_rows) { - // set current key to find range - setCurrentKey(range_begin); - // find new range [range_begin, range_end) - range_end = getRangeEnd(range_begin, chunk_rows); + range_end = getRangeEnd(range_begin, chunk_rows, [&](size_t key_pos, size_t row_pos) { return isKey(key_pos, row_pos); }); // update filter for range if (other_columns.empty()) @@ -214,6 +225,8 @@ void DistinctSortedChunkTransform::transform(Chunk & chunk) range_begin = range_end; } + saveLatestKey(chunk_rows - 1); + /// apply the built filter for (auto & input_column : input_columns) input_column = input_column->filter(filter, 
output_rows); diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.h b/src/Processors/Transforms/DistinctSortedChunkTransform.h index 2e21c36f7dc..0ce8addbf7e 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.h +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.h @@ -32,9 +32,10 @@ public: const SizeLimits & output_size_limits_, UInt64 limit_hint_, const SortDescription & sorted_columns_descr_, - const Names & source_columns_); + const Names & source_columns_, + bool sorted_stream_); - String getName() const override { return "DistinctSortedChunkTransform"; } + String getName() const override { return (!sorted_stream ? "DistinctSortedChunkTransform" : "DistinctSortedStreamTransform"); } protected: void transform(Chunk & chunk) override; @@ -43,9 +44,11 @@ private: void initChunkProcessing(const Columns & input_columns); std::pair continueWithPrevRange(size_t chunk_rows, IColumn::Filter & filter); size_t ordinaryDistinctOnRange(IColumn::Filter & filter, size_t range_begin, size_t range_end, bool clear_data); - inline void setCurrentKey(size_t row_pos); - inline bool isCurrentKey(size_t row_pos) const; - inline size_t getRangeEnd(size_t range_begin, size_t range_end) const; + inline void saveLatestKey(size_t row_pos); + inline bool isLatestKeyFromPrevChunk(size_t row_pos) const; + inline bool isKey(size_t key_pos, size_t row_pos) const; + template + inline size_t getRangeEnd(size_t range_begin, size_t range_end, Predicate pred) const; template size_t buildFilterForRange(Method & method, IColumn::Filter & filter, size_t range_begin, size_t range_end, bool clear_data); @@ -66,7 +69,8 @@ private: Sizes other_columns_sizes; ColumnRawPtrs other_columns; // used during processing - MutableColumns current_key; + MutableColumns prev_chunk_latest_key; + const bool sorted_stream = false; }; } diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index 2dac69edc41..f30d3fa30ea 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -7,13 +7,13 @@ DistinctSortedChunkTransform -- distinct with primary key prefix -> pre-distinct optimization only DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization -DistinctSortedTransform +DistinctSortedStreamTransform DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only DistinctSortedChunkTransform -- distinct with non-primary key prefix -> no optimizations No optimizations -- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only -DistinctSortedTransform +DistinctSortedStreamTransform -- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations No optimizations diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index 21f50a147ac..9af0e98ecf4 100755 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) DISABLE_OPTIMIZATION="set 
optimize_distinct_in_order=0" ENABLE_OPTIMIZATION="set optimize_distinct_in_order=1" -GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform\|DistinctSortedTransform'" +GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform\|DistinctSortedStreamTransform'" TRIM_LEADING_SPACES="sed -e 's/^[ \t]*//'" FIND_OPTIMIZATIONS="$GREP_OPTIMIZATIONS | $TRIM_LEADING_SPACES" From 635a566bec3317e93955de59bf84d4cd14f1c309 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 20 Jul 2022 21:37:46 +0000 Subject: [PATCH 118/227] Comment change --- src/Processors/QueryPlan/DistinctStep.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index d1ca985bb2a..b9a8932b409 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -95,8 +95,7 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (!distinct_sort_desc.empty()) { const bool sorted_stream = input_stream.sort_mode == DataStream::SortMode::Stream; - /// pre-distinct for sorted chunks or - /// final distinct for sorted stream (sorting inside and among chunks) + /// pre-distinct for sorted chunks or final distinct for sorted stream (sorting inside and among chunks) if (pre_distinct || sorted_stream) { pipeline.addSimpleTransform( From a3eb75becab03810d6c6ec5d4cc1254d2347e34b Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 20 Jul 2022 22:26:17 +0000 Subject: [PATCH 119/227] Test: compare result of DISTINCT with and w/o optimization --- .../02317_distinct_in_order_optimization.reference | 1 + .../02317_distinct_in_order_optimization.sql | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference index b53b561137e..05e65c92805 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference @@ -77,3 +77,4 @@ 2 2 1 1 0 0 +-- check that distinct with and w/o optimization produce the same result diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index afe53a95b26..8b1385768ac 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -44,3 +44,14 @@ select '-- distinct with non-key prefix and non-sorted column, order by non-sort select distinct b,c from distinct_in_order order by c desc; drop table if exists distinct_in_order sync; + +select '-- check that distinct with and w/o optimization produce the same result'; +drop table if exists distinct_in_order sync; +drop table if exists ordinary_distinct sync; +create table distinct_in_order (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); +insert into distinct_in_order select distinct CounterID, EventDate from hits_v1 order by CounterID, EventDate settings optimize_distinct_in_order=1; +create table ordinary_distinct (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); +insert into ordinary_distinct select distinct CounterID, EventDate from hits_v1 settings optimize_distinct_in_order=0; +select distinct * from distinct_in_order except select * from ordinary_distinct; +drop table if exists distinct_in_order sync; 
+drop table if exists ordinary_distinct sync; From 052e7d3fbcdfce833e1709694948b4dea5cb32b1 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 20 Jul 2022 22:43:55 +0000 Subject: [PATCH 120/227] hits_v1 -> test.hits --- .../0_stateless/02317_distinct_in_order_optimization.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index 8b1385768ac..29fcf47fa20 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -49,9 +49,9 @@ select '-- check that distinct with and w/o optimization produce the same result drop table if exists distinct_in_order sync; drop table if exists ordinary_distinct sync; create table distinct_in_order (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); -insert into distinct_in_order select distinct CounterID, EventDate from hits_v1 order by CounterID, EventDate settings optimize_distinct_in_order=1; +insert into distinct_in_order select distinct CounterID, EventDate from test.hits order by CounterID, EventDate settings optimize_distinct_in_order=1; create table ordinary_distinct (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); -insert into ordinary_distinct select distinct CounterID, EventDate from hits_v1 settings optimize_distinct_in_order=0; +insert into ordinary_distinct select distinct CounterID, EventDate from test.hits settings optimize_distinct_in_order=0; select distinct * from distinct_in_order except select * from ordinary_distinct; drop table if exists distinct_in_order sync; drop table if exists ordinary_distinct sync; From 8bd423f7dc662a4878eac76c16a01e11f8992672 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 21 Jul 2022 05:27:31 +0000 Subject: [PATCH 121/227] Fix test: move test to stateful tests --- .../02317_distinct_in_order_optimization.reference | 1 - .../02317_distinct_in_order_optimization.sql | 11 ----------- .../1_stateful/00174_distinct_in_order.reference | 1 + tests/queries/1_stateful/00174_distinct_in_order.sql | 10 ++++++++++ 4 files changed, 11 insertions(+), 12 deletions(-) create mode 100644 tests/queries/1_stateful/00174_distinct_in_order.reference create mode 100644 tests/queries/1_stateful/00174_distinct_in_order.sql diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference index 05e65c92805..b53b561137e 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference @@ -77,4 +77,3 @@ 2 2 1 1 0 0 --- check that distinct with and w/o optimization produce the same result diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index 29fcf47fa20..afe53a95b26 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -44,14 +44,3 @@ select '-- distinct with non-key prefix and non-sorted column, order by non-sort select distinct b,c from distinct_in_order order by c desc; drop table if exists distinct_in_order sync; - -select '-- check that distinct with and w/o optimization produce the same result'; -drop table if 
exists distinct_in_order sync; -drop table if exists ordinary_distinct sync; -create table distinct_in_order (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); -insert into distinct_in_order select distinct CounterID, EventDate from test.hits order by CounterID, EventDate settings optimize_distinct_in_order=1; -create table ordinary_distinct (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); -insert into ordinary_distinct select distinct CounterID, EventDate from test.hits settings optimize_distinct_in_order=0; -select distinct * from distinct_in_order except select * from ordinary_distinct; -drop table if exists distinct_in_order sync; -drop table if exists ordinary_distinct sync; diff --git a/tests/queries/1_stateful/00174_distinct_in_order.reference b/tests/queries/1_stateful/00174_distinct_in_order.reference new file mode 100644 index 00000000000..f3389600167 --- /dev/null +++ b/tests/queries/1_stateful/00174_distinct_in_order.reference @@ -0,0 +1 @@ +-- check that distinct with and w/o optimization produce the same result diff --git a/tests/queries/1_stateful/00174_distinct_in_order.sql b/tests/queries/1_stateful/00174_distinct_in_order.sql new file mode 100644 index 00000000000..6d79990a0e0 --- /dev/null +++ b/tests/queries/1_stateful/00174_distinct_in_order.sql @@ -0,0 +1,10 @@ +select '-- check that distinct with and w/o optimization produce the same result'; +drop table if exists distinct_in_order sync; +drop table if exists ordinary_distinct sync; +create table distinct_in_order (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); +insert into distinct_in_order select distinct CounterID, EventDate from test.hits order by CounterID, EventDate settings optimize_distinct_in_order=1; +create table ordinary_distinct (CounterID UInt32, EventDate Date) engine=MergeTree() order by (CounterID, EventDate); +insert into ordinary_distinct select distinct CounterID, EventDate from test.hits settings optimize_distinct_in_order=0; +select distinct * from distinct_in_order except select * from ordinary_distinct; +drop table if exists distinct_in_order sync; +drop table if exists ordinary_distinct sync; From 1ea9f143ff5bc9ffd4dbd999d1730fd5f7512f3e Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Thu, 21 Jul 2022 09:32:29 +0200 Subject: [PATCH 122/227] Leave only _row_exists-based implementation of lightweight delete --- src/Compression/CompressionFactory.cpp | 4 +- src/Compression/CompressionFactory.h | 2 +- src/Core/Settings.h | 1 - src/Formats/NativeWriter.cpp | 2 +- src/Formats/NativeWriter.h | 2 - src/Interpreters/Context.h | 6 +- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterDeleteQuery.cpp | 64 ++--- src/Interpreters/MutationsInterpreter.cpp | 84 +----- src/Interpreters/MutationsInterpreter.h | 13 +- .../QueryPlan/ReadFromMergeTree.cpp | 2 +- .../MergeTree/DataPartStorageOnDisk.cpp | 26 -- .../MergeTree/DataPartStorageOnDisk.h | 3 - .../MergeTree/FutureMergedMutatedPart.h | 2 - src/Storages/MergeTree/IDataPartStorage.h | 4 - src/Storages/MergeTree/IMergeTreeDataPart.cpp | 29 -- src/Storages/MergeTree/IMergeTreeDataPart.h | 5 - src/Storages/MergeTree/IMergeTreeReader.h | 2 +- .../MergeTreeBaseSelectProcessor.cpp | 5 +- .../MergeTreeDataPartDeletedMask.cpp | 162 ----------- .../MergeTree/MergeTreeDataPartDeletedMask.h | 34 --- src/Storages/MergeTree/MergeTreeIOSettings.h | 4 +- 
.../MergeTree/MergeTreeMutationEntry.cpp | 26 +- .../MergeTree/MergeTreeMutationEntry.h | 10 +- .../MergeTree/MergeTreeMutationStatus.h | 2 - .../MergeTree/MergeTreeRangeReader.cpp | 102 +------ src/Storages/MergeTree/MergeTreeRangeReader.h | 8 - .../MergeTree/MergeTreeSequentialSource.cpp | 121 +++----- .../MergeTree/MergeTreeSequentialSource.h | 4 - src/Storages/MergeTree/MutateTask.cpp | 258 +----------------- .../MergeTree/ReplicatedMergeTreeQueue.cpp | 1 - .../MergeTree/StorageFromMergeTreeDataPart.h | 5 + src/Storages/StorageJoin.cpp | 2 +- src/Storages/StorageMemory.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 30 +- src/Storages/StorageMergeTree.h | 4 +- .../System/StorageSystemMutations.cpp | 2 - .../02117_show_create_table_system.reference | 1 - ...lightweight_delete_on_merge_tree.reference | 10 +- ...02319_lightweight_delete_on_merge_tree.sql | 4 +- .../0_stateless/02352_lightweight_delete.sql | 2 +- 41 files changed, 132 insertions(+), 920 deletions(-) delete mode 100644 src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp delete mode 100644 src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp index 94fb30af1bc..b8a1c5877a4 100644 --- a/src/Compression/CompressionFactory.cpp +++ b/src/Compression/CompressionFactory.cpp @@ -98,14 +98,14 @@ CompressionCodecPtr CompressionCodecFactory::get( } -CompressionCodecPtr CompressionCodecFactory::get(uint8_t byte_code, const IDataType * column_type) const +CompressionCodecPtr CompressionCodecFactory::get(uint8_t byte_code) const { const auto family_code_and_creator = family_code_with_codec.find(byte_code); if (family_code_and_creator == family_code_with_codec.end()) throw Exception("Unknown codec family code: " + toString(byte_code), ErrorCodes::UNKNOWN_CODEC); - return family_code_and_creator->second({}, column_type); + return family_code_and_creator->second({}, nullptr); } diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index c386784686e..a4451f9ed2e 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -63,7 +63,7 @@ public: } /// Get codec by method byte (no params available) - CompressionCodecPtr get(uint8_t byte_code, const IDataType * column_type = nullptr) const; + CompressionCodecPtr get(uint8_t byte_code) const; /// For backward compatibility with config settings CompressionCodecPtr get(const String & family_name, std::optional level) const; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index cb932843fc2..672b8c5b1fb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -458,7 +458,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ - M(Bool, allow_experimental_lightweight_delete, false, "Enable lightweight DELETE mutations for mergetree tables. 
Work in progress", 0) \ M(Bool, allow_experimental_lightweight_delete_with_row_exists, false, "Enable lightweight DELETE mutations using __rows_exists column for mergetree tables. Work in progress", 0) \ M(Bool, lightweight_delete_mutation, true, "Enable to make ordinary ALTER DELETE queries lightweight for mergetree tables", 0) \ M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \ diff --git a/src/Formats/NativeWriter.cpp b/src/Formats/NativeWriter.cpp index 004c75182a7..77692eec6b6 100644 --- a/src/Formats/NativeWriter.cpp +++ b/src/Formats/NativeWriter.cpp @@ -46,7 +46,7 @@ void NativeWriter::flush() } -void NativeWriter::writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) +static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit) { /** If there are columns-constants - then we materialize them. * (Since the data type does not know how to serialize / deserialize constants.) diff --git a/src/Formats/NativeWriter.h b/src/Formats/NativeWriter.h index 02fc53b60fe..010a03ec722 100644 --- a/src/Formats/NativeWriter.h +++ b/src/Formats/NativeWriter.h @@ -32,8 +32,6 @@ public: static String getContentType() { return "application/octet-stream"; } - static void writeData(const ISerialization & serialization, const ColumnPtr & column, WriteBuffer & ostr, UInt64 offset, UInt64 limit); - private: WriteBuffer & ostr; UInt64 client_revision; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index c83e38a0ed1..7e3bbf43f39 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -361,7 +361,7 @@ private: inline static ContextPtr global_context_instance; /// A flag, used to mark if reader needs to apply deleted rows mask. - bool skip_deleted_mask = false; + bool apply_deleted_mask = true; public: // Top-level OpenTelemetry trace context for the query. Makes sense only for a query context. 
@@ -915,8 +915,8 @@ public: bool isInternalQuery() const { return is_internal_query; } void setInternalQuery(bool internal) { is_internal_query = internal; } - bool skipDeletedMask() const { return skip_deleted_mask; } - void setSkipDeletedMask(bool skip) { skip_deleted_mask = skip; } + bool applyDeletedMask() const { return apply_deleted_mask; } + void setApplyDeletedMask(bool apply) { apply_deleted_mask = apply; } ActionLocksManagerPtr getActionLocksManager(); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 03b0e1d5894..056a3d9f7b4 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -144,7 +144,7 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) if (!mutation_commands.empty()) { table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); - MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); table->mutate(mutation_commands, getContext()); } diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index aeeb72ad06c..cb4bc363d18 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -33,8 +33,7 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex BlockIO InterpreterDeleteQuery::execute() { - if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete && - !getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) + if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) { throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. 
Set `allow_experimental_lightweight_delete` setting to enable it"); } @@ -73,50 +72,29 @@ BlockIO InterpreterDeleteQuery::execute() MutationCommands mutation_commands; MutationCommand mut_command; - if (getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) - { - /// Build "UPDATE _row_exists = 0 WHERE predicate" query - mut_command.type = MutationCommand::Type::UPDATE; - mut_command.predicate = delete_query.predicate; + /// Build "UPDATE _row_exists = 0 WHERE predicate" query + mut_command.type = MutationCommand::Type::UPDATE; + mut_command.predicate = delete_query.predicate; - auto command = std::make_shared(); - command->type = ASTAlterCommand::UPDATE; - command->predicate = delete_query.predicate; - command->update_assignments = std::make_shared(); - auto set_row_does_not_exist = std::make_shared(); - set_row_does_not_exist->column_name = metadata_snapshot->lightweight_delete_description.filter_column.name; - auto zero_value = std::make_shared(DB::Field(UInt8(0))); - set_row_does_not_exist->children.push_back(zero_value); - command->update_assignments->children.push_back(set_row_does_not_exist); - command->children.push_back(command->predicate); - command->children.push_back(command->update_assignments); - mut_command.column_to_update_expression[set_row_does_not_exist->column_name] = zero_value; - mut_command.ast = command->ptr(); + auto command = std::make_shared(); + command->type = ASTAlterCommand::UPDATE; + command->predicate = delete_query.predicate; + command->update_assignments = std::make_shared(); + auto set_row_does_not_exist = std::make_shared(); + set_row_does_not_exist->column_name = metadata_snapshot->lightweight_delete_description.filter_column.name; + auto zero_value = std::make_shared(DB::Field(UInt8(0))); + set_row_does_not_exist->children.push_back(zero_value); + command->update_assignments->children.push_back(set_row_does_not_exist); + command->children.push_back(command->predicate); + command->children.push_back(command->update_assignments); + mut_command.column_to_update_expression[set_row_does_not_exist->column_name] = zero_value; + mut_command.ast = command->ptr(); - mutation_commands.emplace_back(mut_command); - - table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); - MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); - storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Ordinary); - } - else - { - mut_command.type = MutationCommand::Type::DELETE; - mut_command.predicate = delete_query.predicate; - - auto command = std::make_shared(); - command->type = ASTAlterCommand::DELETE; - command->predicate = delete_query.predicate; - command->children.push_back(command->predicate); - mut_command.ast = command->ptr(); - - mutation_commands.emplace_back(mut_command); - - table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); - MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false, false).validate(); - storage_merge_tree->mutate(mutation_commands, getContext(), MutationType::Lightweight); - } + mutation_commands.emplace_back(mut_command); + table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); + storage_merge_tree->mutate(mutation_commands, getContext()); return {}; } diff --git a/src/Interpreters/MutationsInterpreter.cpp 
b/src/Interpreters/MutationsInterpreter.cpp index f896c2269e8..7778e316b8b 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -285,20 +285,15 @@ MutationsInterpreter::MutationsInterpreter( const StorageMetadataPtr & metadata_snapshot_, MutationCommands commands_, ContextPtr context_, - bool can_execute_, - bool is_lightweight_) + bool can_execute_) : storage(std::move(storage_)) , metadata_snapshot(metadata_snapshot_) , commands(std::move(commands_)) , context(Context::createCopy(context_)) , can_execute(can_execute_) , select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections()) - , is_lightweight(is_lightweight_) { - if (is_lightweight) - mutation_ast = prepareLightweightDelete(!can_execute); - else - mutation_ast = prepare(!can_execute); + mutation_ast = prepare(!can_execute); } static NameSet getKeyColumns(const StoragePtr & storage, const StorageMetadataPtr & metadata_snapshot) @@ -777,6 +772,13 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); auto all_columns = storage_snapshot->getColumns(options); + // TODO: add _row_exists column if it is present in the part??? + if (auto part_storage = dynamic_pointer_cast(storage)) + { + if (part_storage->hasLightweightDeleteColumn()) + all_columns.push_back({metadata_snapshot->lightweight_delete_description.filter_column}); + } + /// Next, for each stage calculate columns changed by this and previous stages. for (size_t i = 0; i < prepared_stages.size(); ++i) { @@ -905,70 +907,6 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & return select; } -/// Prepare for lightweight delete -ASTPtr MutationsInterpreter::prepareLightweightDelete(bool dry_run) -{ - if (is_prepared) - throw Exception("MutationsInterpreter is already prepared. It is a bug.", ErrorCodes::LOGICAL_ERROR); - - if (commands.empty()) - throw Exception("Empty mutation commands list", ErrorCodes::LOGICAL_ERROR); - - /// For lightweight DELETE, we use predicate expression to get deleted rows. - /// Collect predicates in the commands - for (auto & command : commands) - { - if (command.type == MutationCommand::DELETE) - { - mutation_kind.set(MutationKind::MUTATE_OTHER); - if (stages.empty()) - stages.emplace_back(context); - - auto mask_predicate = getPartitionAndPredicateExpressionForMutationCommand(command); - stages.back().filters.push_back(mask_predicate); - } - else - throw Exception("Unsupported lightweight mutation command type: " + DB::toString(command.type), ErrorCodes::UNKNOWN_MUTATION_COMMAND); - } - - /// The updated_header is empty for lightweight delete. 
- updated_header = std::make_unique(); - - is_prepared = true; - - return prepareInterpreterSelectQueryLightweight(stages, dry_run); -} - -ASTPtr MutationsInterpreter::prepareInterpreterSelectQueryLightweight(std::vector & prepared_stages, bool) -{ - /// Construct a SELECT statement for lightweight delete is like "select _part_offset from db.table where " - auto select = std::make_shared(); - - /// DELETEs only query just need the _part_offset virtual column without real columns - select->setExpression(ASTSelectQuery::Expression::SELECT, std::make_shared()); - select->select()->children.push_back(std::make_shared("_part_offset")); - - ASTPtr where_expression; - if (!prepared_stages[0].filters.empty()) - { - if (prepared_stages[0].filters.size() == 1) - where_expression = prepared_stages[0].filters[0]; - else - { - auto coalesced_predicates = std::make_shared(); - coalesced_predicates->name = "or"; - coalesced_predicates->arguments = std::make_shared(); - coalesced_predicates->children.push_back(coalesced_predicates->arguments); - coalesced_predicates->arguments->children = prepared_stages[0].filters; - where_expression = std::move(coalesced_predicates); - } - - select->setExpression(ASTSelectQuery::Expression::WHERE, std::move(where_expression)); - } - - return select; -} - QueryPipelineBuilder MutationsInterpreter::addStreamsForLaterStages(const std::vector & prepared_stages, QueryPlan & plan) const { for (size_t i_stage = 1; i_stage < prepared_stages.size(); ++i_stage) @@ -1053,10 +991,10 @@ QueryPipelineBuilder MutationsInterpreter::execute() if (!select_interpreter) { /// Skip to apply deleted mask for MutateSomePartColumn cases when part has lightweight delete. - if (!is_lightweight && skip_deleted_mask) + if (!apply_deleted_mask) { auto context_for_reading = Context::createCopy(context); - context_for_reading->setSkipDeletedMask(skip_deleted_mask); + context_for_reading->setApplyDeletedMask(apply_deleted_mask); select_interpreter = std::make_unique(mutation_ast, context_for_reading, storage, metadata_snapshot, select_limits); } else diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 49165c6f9ad..10f764caaee 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -43,8 +43,7 @@ public: const StorageMetadataPtr & metadata_snapshot_, MutationCommands commands_, ContextPtr context_, - bool can_execute_, - bool is_lightweight_); + bool can_execute_); void validate(); @@ -79,16 +78,14 @@ public: MutationKind::MutationKindEnum getMutationKind() const { return mutation_kind.mutation_kind; } - void setSkipDeletedMask(bool skip) { skip_deleted_mask = skip; } + void setApplyDeletedMask(bool apply) { apply_deleted_mask = apply; } private: ASTPtr prepare(bool dry_run); - ASTPtr prepareLightweightDelete(bool dry_run); struct Stage; ASTPtr prepareInterpreterSelectQuery(std::vector &prepared_stages, bool dry_run); - static ASTPtr prepareInterpreterSelectQueryLightweight(std::vector &prepared_stages, bool dry_run); QueryPipelineBuilder addStreamsForLaterStages(const std::vector & prepared_stages, QueryPlan & plan) const; @@ -103,10 +100,8 @@ private: bool can_execute; SelectQueryOptions select_limits; - /// True for lightweight delete. - bool is_lightweight = false; - /// True for MutateSomePartColumns on part with lightweight. - bool skip_deleted_mask = false; + /// TODO: is it needed? 
+ bool apply_deleted_mask = true; ASTPtr mutation_ast; diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 9a65cd4f17e..f060d42c718 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -62,7 +62,7 @@ static MergeTreeReaderSettings getMergeTreeReaderSettings( .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, .read_in_order = query_info.input_order_info != nullptr, - .skip_deleted_mask = context->skipDeletedMask(), + .apply_deleted_mask = context->applyDeletedMask(), }; } diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp index 1c4d8d9186a..03d24d84bb0 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -652,32 +652,6 @@ bool DataPartStorageOnDisk::shallParticipateInMerges(const IStoragePolicy & stor return !volume_ptr->areMergesAvoided(); } -void DataPartStorageOnDisk::loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const -{ - String deleted_mask_path = fs::path(getRelativePath()) / deleted_mask.name; - auto disk = volume->getDisk(); - - if (disk->isFile(deleted_mask_path)) - { - auto read_buf = openForReading(disk, deleted_mask_path); - deleted_mask.read(*read_buf); - assertEOF(*read_buf); - } -} - -void DataPartStorageOnDisk::writeDeletedRowsMask(const MergeTreeDataPartDeletedMask & deleted_mask) const -{ - const String final_path = fs::path(getRelativePath()) / deleted_mask.name; - const String tmp_path = final_path + ".tmp"; - - { - auto out = volume->getDisk()->writeFile(tmp_path, 4096); - deleted_mask.write(*out); - } - - volume->getDisk()->moveFile(tmp_path, final_path); -} - void DataPartStorageOnDisk::backup( TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h index 2362d30a92d..2426b5eee80 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.h +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -86,9 +86,6 @@ public: bool shallParticipateInMerges(const IStoragePolicy &) const override; - void loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const override; - void writeDeletedRowsMask(const MergeTreeDataPartDeletedMask & deleted_mask) const override; - void backup( TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, diff --git a/src/Storages/MergeTree/FutureMergedMutatedPart.h b/src/Storages/MergeTree/FutureMergedMutatedPart.h index 06659249cae..4447687c3d9 100644 --- a/src/Storages/MergeTree/FutureMergedMutatedPart.h +++ b/src/Storages/MergeTree/FutureMergedMutatedPart.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB @@ -24,7 +23,6 @@ struct FutureMergedMutatedPart MergeTreePartInfo part_info; MergeTreeData::DataPartsVector parts; MergeType merge_type = MergeType::Regular; - MutationType mutation_type = MutationType::Ordinary; const MergeTreePartition & getPartition() const { return parts.front()->partition; } diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index cfd29a550e1..f0173baecb7 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -3,7 +3,6 @@ #include #include #include -#include #include namespace DB @@ -170,9 +169,6 @@ public: /// A leak of abstraction virtual bool 
shallParticipateInMerges(const IStoragePolicy &) const { return true; } - virtual void loadDeletedRowsMask(MergeTreeDataPartDeletedMask & deleted_mask) const = 0; - virtual void writeDeletedRowsMask(const MergeTreeDataPartDeletedMask & deleted_mask) const = 0; - /// Create a backup of a data part. /// This method adds a new entry to backup_entries. /// Also creates a new tmp_dir for internal disk (if disk is mentioned the first time). diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index b8fd70193f1..3f8000f3136 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1217,35 +1217,6 @@ bool IMergeTreeDataPart::supportLightweightDeleteMutate() const return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.empty(); } -bool IMergeTreeDataPart::getDeletedMask(MergeTreeDataPartDeletedMask & deleted_mask) const -{ - bool found = false; - - /// Check if deleted mask file exists. - if (data_part_storage->exists(String(deleted_mask.name))) - { - data_part_storage->loadDeletedRowsMask(deleted_mask); - - if (deleted_mask.getDeletedRows().size() != rows_count) - throw Exception(ErrorCodes::CORRUPTED_DATA, - "Size of deleted mask loaded from '{}':'{}' doesn't match expected " - "for part {}" - "(loaded {} rows, expected {} rows).", - data_part_storage->getDiskPath(), deleted_mask.name, name, deleted_mask.getDeletedRows().size(), rows_count); - - found = true; - } - - return found; -} - -void IMergeTreeDataPart::writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const -{ - MergeTreeDataPartDeletedMask deleted_mask {}; - deleted_mask.setDeletedRows(new_mask); - data_part_storage->writeDeletedRowsMask(deleted_mask); -} - void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) const { TransactionID expected_tid = txn ? txn->tid : Tx::PrehistoricTID; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 7869ca52969..a9c4590c045 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -465,10 +464,6 @@ public: /// True if here is lightweight deleted mask file in part. bool hasLightweightDelete() const { return data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME); } - /// Read lightweight deleted mask when needed. - bool getDeletedMask(MergeTreeDataPartDeletedMask & deleted_mask) const; - void writeDeletedMask(MergeTreeDataPartDeletedMask::DeletedRows new_mask) const; - protected: /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 229f62da293..41030e522ac 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -62,7 +62,7 @@ public: MergeTreeData::DataPartPtr data_part; - bool needReadDeletedMask() { return !settings.skip_deleted_mask && data_part->hasLightweightDelete(); } + bool needReadDeletedMask() { return settings.apply_deleted_mask && data_part->hasLightweightDelete(); } protected: /// Returns actual column type in part, which can differ from table metadata. 
diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index a5649c0b41b..4cae54492c8 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -243,7 +243,7 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart( /// Add lightweight delete filtering step const auto & lightweigth_delete_info = metadata_snapshot->lightweight_delete_description; - if (!reader_settings.skip_deleted_mask && data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) + if (reader_settings.apply_deleted_mask && data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) { pre_reader_for_step.push_back(data_part->getReader({lightweigth_delete_info.filter_column}, metadata_snapshot, mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, @@ -269,7 +269,7 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu /// Add filtering step with lightweight delete mask const auto & lightweigth_delete_info = storage_snapshot->metadata->lightweight_delete_description; - if (!reader_settings.skip_deleted_mask && current_task.data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) + if (reader_settings.apply_deleted_mask && current_task.data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) { current_task.pre_range_readers.push_back( MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lwd_filter_step, last_reader, non_const_virtual_column_names)); @@ -292,7 +292,6 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu prev_reader = ¤t_task.pre_range_readers.back(); } - } if (!last_reader) diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp deleted file mode 100644 index d1a78623278..00000000000 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - - -namespace DB::ErrorCodes -{ - extern const int UNKNOWN_FORMAT_VERSION; - extern const int CORRUPTED_DATA; -} - -namespace DB -{ - -namespace -{ -struct DeletedRowsHash -{ -private: - char value[16]; - -public: - DeletedRowsHash() = default; - explicit DeletedRowsHash(SipHash & hasher) - { - hasher.get128(value); - - static_assert(std::is_pod_v, "Expected to be a POD-type"); - static_assert(sizeof(DeletedRowsHash) * 8 == 128); - } - - bool operator==(const DeletedRowsHash & other) const - { - return memcmp(value, other.value, sizeof(value)) == 0; - } -}; - -constexpr UInt8 FORMAT_VERSION = 1; -constexpr UInt8 DEFAULT_CODEC = static_cast(CompressionMethodByte::T64); -constexpr UInt8 PADDING_SIZE = 7; // just in case -constexpr UInt8 HEADER_SIZE = 0 - + sizeof(FORMAT_VERSION) - + sizeof(UInt64) // number of rows in mask - + sizeof(DeletedRowsHash) // column data hash - + PADDING_SIZE // padding: zero-bytes - + sizeof(DeletedRowsHash); // header hash -} - -MergeTreeDataPartDeletedMask::MergeTreeDataPartDeletedMask() - : deleted_rows(ColumnUInt8::create()) -{} - -const ColumnUInt8 & MergeTreeDataPartDeletedMask::getDeletedRows() const -{ - return *deleted_rows; -} - -void MergeTreeDataPartDeletedMask::setDeletedRows(DeletedRows new_rows) -{ - deleted_rows.swap(new_rows); -} - -void 
MergeTreeDataPartDeletedMask::setDeletedRows(size_t rows, bool value) -{ - setDeletedRows(ColumnUInt8::create(rows, value)); -} - -void MergeTreeDataPartDeletedMask::read(ReadBuffer & in) -{ - std::array header_buffer_data; - in.readStrict(header_buffer_data.data(), header_buffer_data.size()); - {// validate hash of the header first - SipHash hash; - hash.update(header_buffer_data.data(), header_buffer_data.size()); - const DeletedRowsHash computed_hash(hash); - - DeletedRowsHash read_hash; - readPODBinary(read_hash, in); - if (read_hash != computed_hash) - throw Exception(ErrorCodes::CORRUPTED_DATA, - "Invalid deleted masks file header hash"); - } - - UInt8 format_version = FORMAT_VERSION; - UInt64 stored_rows = 0; - DeletedRowsHash column_hash; - {// Read header values - ReadBuffer header(header_buffer_data.data(), header_buffer_data.size(), 0); - readBinary(format_version, header); - if (format_version != FORMAT_VERSION) - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, - "Unknown deleted mask file format version {}", - static_cast(format_version)); - - readBinary(stored_rows, header); - readPODBinary(column_hash, header); - header.ignore(PADDING_SIZE); - assertEOF(header); - } - - auto data_read_buffer = std::make_unique(in); - - auto res_column = DeletedRows(ColumnUInt8::create()); - ColumnPtr res_col_ptr = res_column; - SerializationPtr serialization = DataTypeUInt8().getDefaultSerialization(); - NativeReader::readData(*serialization, res_col_ptr, *data_read_buffer, stored_rows, 0); - assertEOF(*data_read_buffer); - - // we probably don't want to check column hash here, since codec verifies data integrity. - deleted_rows = std::move(res_column); -} - -void MergeTreeDataPartDeletedMask::write(WriteBuffer & out) const -{ - {// Header - std::array header_buffer_data; - WriteBuffer header(header_buffer_data.data(), header_buffer_data.size()); - - writeBinary(FORMAT_VERSION, header); - writeBinary(static_cast(deleted_rows->size()), header); - - { - SipHash hash; - deleted_rows->updateHashFast(hash); - writePODBinary(DeletedRowsHash(hash), header); - } - - { - const char padding[PADDING_SIZE] = {'\0'}; - writePODBinary(padding, header); - } - assert(header_buffer_data.max_size() == header.count()); - - writePODBinary(header_buffer_data, out); - {// header hash - SipHash hash; - hash.update(header_buffer_data.data(), header_buffer_data.size()); - writePODBinary(DeletedRowsHash(hash), out); - } - } - assert(HEADER_SIZE == out.count()); - - const DataTypeUInt8 col_datatype; - auto codec = CompressionCodecFactory::instance().get(static_cast(DEFAULT_CODEC), &col_datatype); - auto data_write_buffer = std::make_unique(out, codec); - SerializationPtr serialization = col_datatype.getDefaultSerialization(); - - NativeWriter::writeData(*serialization, deleted_rows, *data_write_buffer, 0, deleted_rows->size()); - data_write_buffer->finalize(); -} - -} diff --git a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h b/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h deleted file mode 100644 index c8652746d98..00000000000 --- a/src/Storages/MergeTree/MergeTreeDataPartDeletedMask.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace DB -{ -class ReadBuffer; -class WriteBuffer; - -/// Per-part info about rows deleted by lightweight mutations. 
-struct MergeTreeDataPartDeletedMask -{ - explicit MergeTreeDataPartDeletedMask(); - using DeletedRows = ColumnUInt8::Ptr; - - static constexpr std::string_view name = "deleted_rows_mask.bin"; - - const ColumnUInt8 & getDeletedRows() const; - const DeletedRows & getDeletedRowsPtr() const { return deleted_rows; } - void setDeletedRows(DeletedRows new_rows); - void setDeletedRows(size_t rows, bool value); - - void read(ReadBuffer & in); - void write(WriteBuffer & out) const; - -private: - ColumnUInt8::Ptr deleted_rows; -}; - -}; diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index 02372011876..55848e09434 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -23,8 +23,8 @@ struct MergeTreeReaderSettings bool checksum_on_read = true; /// True if we read in order of sorting key. bool read_in_order = false; - /// Do not apply deleted mask for internal select from mutate some part columns. - bool skip_deleted_mask = false; + /// Deleted mask is applied to all reads except internal select from mutate some part columns. + bool apply_deleted_mask = true; }; struct MergeTreeWriterSettings diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index 08f45e85d23..a222f2a8ad8 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -46,7 +46,7 @@ UInt64 MergeTreeMutationEntry::parseFileName(const String & file_name_) } MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk_, const String & path_prefix_, UInt64 tmp_number, - const TransactionID & tid_, const WriteSettings & settings, MutationType type_) + const TransactionID & tid_, const WriteSettings & settings) : create_time(time(nullptr)) , commands(std::move(commands_)) , disk(std::move(disk_)) @@ -54,13 +54,11 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskP , file_name("tmp_mutation_" + toString(tmp_number) + ".txt") , is_temp(true) , tid(tid_) - , type(type_) { try { auto out = disk->writeFile(std::filesystem::path(path_prefix) / file_name, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, settings); - *out << "format version: 2\n" - << "type: " << type << "\n" + *out << "format version: 1\n" << "create time: " << LocalDateTime(create_time) << "\n"; *out << "commands: "; commands.writeText(*out); @@ -123,25 +121,7 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & pat block_number = parseFileName(file_name); auto buf = disk->readFile(path_prefix + file_name); - int format_version; - *buf >> "format version: " >> format_version >> "\n"; - - /// Allow format_version = 1 for backward compatibility. 
- if (format_version != 1 && format_version != 2) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported format version in mutation.txt, expected '1' or '2', got '{}'", format_version); - - type = MutationType::Ordinary; - if (format_version == 2) - { - String type_str; - *buf >> "type: " >> type_str >> "\n"; - - auto type_value = magic_enum::enum_cast(type_str); - if (type_value.has_value()) - type = type_value.value(); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported mutation type in mutation.txt, expected 'Lightweight' or 'Ordinary', got '{}'", type_str); - } + *buf >> "format version: 1\n"; LocalDateTime create_time_dt; *buf >> "create time: " >> create_time_dt >> "\n"; diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.h b/src/Storages/MergeTree/MergeTreeMutationEntry.h index 37dbca9de7b..04297f2852a 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.h +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.h @@ -11,11 +11,6 @@ namespace DB { class IBackupEntry; -/// Type of Mutate. Used to control different mutates during mutates -/// assignment. Also allows to apply special logic during mutate process -/// Stored in FutureMergedMutatedPart and MergeTreeMutationEntry. -enum class MutationType { Ordinary, Lightweight }; - /// A mutation entry for non-replicated MergeTree storage engines. /// Stores information about mutation in file mutation_*.txt. struct MergeTreeMutationEntry @@ -41,12 +36,9 @@ struct MergeTreeMutationEntry /// or UnknownCSN if it's not committed (yet) or RolledBackCSN if it's rolled back or PrehistoricCSN if there is no transaction. CSN csn = Tx::UnknownCSN; - /// Type of mutation, used for lightweight delete. - MutationType type; - /// Create a new entry and write it to a temporary file. MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk, const String & path_prefix_, UInt64 tmp_number, - const TransactionID & tid_, const WriteSettings & settings, MutationType type_); + const TransactionID & tid_, const WriteSettings & settings); MergeTreeMutationEntry(const MergeTreeMutationEntry &) = delete; MergeTreeMutationEntry(MergeTreeMutationEntry &&) = default; diff --git a/src/Storages/MergeTree/MergeTreeMutationStatus.h b/src/Storages/MergeTree/MergeTreeMutationStatus.h index f0949047f6e..acda43b9254 100644 --- a/src/Storages/MergeTree/MergeTreeMutationStatus.h +++ b/src/Storages/MergeTree/MergeTreeMutationStatus.h @@ -5,7 +5,6 @@ #include #include #include -#include namespace DB @@ -14,7 +13,6 @@ namespace DB struct MergeTreeMutationStatus { - MutationType type; String id; String command; time_t create_time = 0; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index cbc409af4e8..cb2ead8a025 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -321,7 +321,6 @@ void MergeTreeRangeReader::ReadResult::clear() total_rows_per_granule = 0; filter_holder = nullptr; filter = nullptr; - deleted_mask_filter_holder = nullptr; } void MergeTreeRangeReader::ReadResult::shrink(Columns & old_columns) @@ -666,7 +665,7 @@ MergeTreeRangeReader::MergeTreeRangeReader( for (const auto & name_and_type : merge_tree_reader->getColumns()) sample_block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); - + for (const auto & column_name : non_const_virtual_column_names_) { if (sample_block.has(column_name)) @@ -681,9 +680,6 @@ MergeTreeRangeReader::MergeTreeRangeReader( // 
sample_block.insert(ColumnWithTypeAndName(ColumnUInt8::create(), std::make_shared(), column_name)); } - if (merge_tree_reader->needReadDeletedMask()) - need_apply_deleted_mask = merge_tree_reader->data_part->getDeletedMask(deleted_mask); - if (prewhere_info) { const auto & step = *prewhere_info; @@ -863,15 +859,13 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar read_result = startReadingChain(max_rows, ranges); read_result.num_rows = read_result.numReadRows(); - executeDeletedRowMaskFilterColumns(read_result); - if (read_result.num_rows) { /// Physical columns go first and then some virtual columns follow size_t physical_columns_count = read_result.columns.size() - read_result.extra_columns_filled.size(); /////////// -// TODO: properly account for "virtual columns" that are overridden with real data in the part - +// TODO: properly account for "virtual columns" that are overridden with real data in the part + ///////////// Columns physical_columns(read_result.columns.begin(), read_result.columns.begin() + physical_columns_count); @@ -968,10 +962,6 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t fillPartOffsetColumn(result, leading_begin_part_offset, leading_end_part_offset); } - /// Do similar as part_offset for deleted mask. - if (need_apply_deleted_mask) - fillDeletedRowMaskColumn(result, leading_begin_part_offset, leading_end_part_offset); - return result; } @@ -1003,47 +993,6 @@ void MergeTreeRangeReader::fillPartOffsetColumn(ReadResult & result, UInt64 lead result.extra_columns_filled.push_back("_part_offset"); } -/// Fill deleted_row_mask column, referenced from fillPartOffsetColumn(). -void MergeTreeRangeReader::fillDeletedRowMaskColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset) -{ - size_t num_rows = result.numReadRows(); - - auto mask_column = ColumnUInt8::create(num_rows); - ColumnUInt8::Container & vec = mask_column->getData(); - - UInt8 * pos = vec.data(); - UInt8 * end = &vec[num_rows]; - - const auto & deleted_rows_col = deleted_mask.getDeletedRows(); - const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); - - while (pos < end && leading_begin_part_offset < leading_end_part_offset) - { - if (deleted_rows_mask[leading_begin_part_offset++]) - *pos++ = 0; - else - *pos++ = 1; - } - - const auto start_ranges = result.startedRanges(); - - for (const auto & start_range : start_ranges) - { - UInt64 start_part_offset = index_granularity->getMarkStartingRow(start_range.range.begin); - UInt64 end_part_offset = index_granularity->getMarkStartingRow(start_range.range.end); - - while (pos < end && start_part_offset < end_part_offset) - { - if (deleted_rows_mask[start_part_offset++]) - *pos++ = 0; - else - *pos++ = 1; - } - } - - result.deleted_mask_filter_holder = std::move(mask_column); -} - Columns MergeTreeRangeReader::continueReadingChain(const ReadResult & result, size_t & num_rows) { Columns columns; @@ -1158,36 +1107,6 @@ static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second) return mut_first; } - -/// Implicitly apply deleted mask filter to columns. -/// If there is no prewhere_info, apply directly the deleted mask filter. -/// If prewhere_info exists, only apply to the first prewhere filter. 
-void MergeTreeRangeReader::executeDeletedRowMaskFilterColumns(ReadResult & result) -{ - if (prewhere_info || !need_apply_deleted_mask || !result.deleted_mask_filter_holder) - return; - - const ColumnUInt8 * mask_filter = typeid_cast(result.deleted_mask_filter_holder.get()); - filterColumns(result.columns, mask_filter->getData()); - - bool has_column = false; - for (auto & column : result.columns) - { - if (column) - { - has_column = true; - result.num_rows = column->size(); - break; - } - } - - /// There is only one filter column. Record the actual number. - if (!has_column) - result.num_rows = result.countBytesInResultFilter(mask_filter->getData()); - - result.need_filter = true; -} - void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result) { if (!prewhere_info) @@ -1243,7 +1162,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r else if (column_name == "_row_exists") { /// do nothing, it will be added later - /// TODO: properly implement reading non-const virtual columns or filling them with default values + /// TODO: properly implement reading non-const virtual columns or filling them with default values } else throw Exception("Unexpected non-const virtual column: " + column_name, ErrorCodes::LOGICAL_ERROR); @@ -1253,19 +1172,6 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. result.block_before_prewhere = block; - /// Apply deleted mask filter for the first prewhere step - if (!result.getFilter() && result.deleted_mask_filter_holder) - { - auto columns = block.getColumns(); - filterColumns(columns, result.deleted_mask_filter_holder); - if (columns.empty()) - block = block.cloneEmpty(); - else - block.setColumns(columns); - - result.setFilter(result.deleted_mask_filter_holder); - } - if (prewhere_info->actions) prewhere_info->actions->execute(block); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index ba71f1898f6..06f3f5760fb 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -2,7 +2,6 @@ #include #include #include -#include namespace DB { @@ -242,9 +241,6 @@ public: std::map filter_bytes_map; - /// Similar as filter that you need to apply to newly-read columns - ColumnPtr deleted_mask_filter_holder; - Names extra_columns_filled; }; @@ -257,8 +253,6 @@ private: Columns continueReadingChain(const ReadResult & result, size_t & num_rows); void executePrewhereActionsAndFilterColumns(ReadResult & result); void fillPartOffsetColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset); - void fillDeletedRowMaskColumn(ReadResult & result, UInt64 leading_begin_part_offset, UInt64 leading_end_part_offset); - void executeDeletedRowMaskFilterColumns(ReadResult & result); IMergeTreeReader * merge_tree_reader = nullptr; const MergeTreeIndexGranularity * index_granularity = nullptr; @@ -272,8 +266,6 @@ private: bool last_reader_in_chain = false; bool is_initialized = false; Names non_const_virtual_column_names; - bool need_apply_deleted_mask = false; - MergeTreeDataPartDeletedMask deleted_mask; }; } diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 53a56bad97e..de48b96edd6 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ 
b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -46,7 +46,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { - auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withVirtuals(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withVirtuals(); /// TODO: only add _rows_exists column (if it's present on disk) columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read); } else @@ -68,9 +68,6 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( reader = data_part->getReader(columns_for_reader, storage_snapshot->metadata, MarkRanges{MarkRange(0, data_part->getMarksCount())}, /* uncompressed_cache = */ nullptr, mark_cache.get(), reader_settings, {}, {}); - - if (data_part->hasLightweightDelete()) - need_apply_deleted_mask = data_part->getDeletedMask(deleted_mask); } Chunk MergeTreeSequentialSource::generate() @@ -78,91 +75,53 @@ try { const auto & header = getPort().getHeader(); - /// The chunk after deleted mask applied maybe empty. But the empty chunk means done of read rows. - do + if (!isCancelled() && current_row < data_part->rows_count) { - if (!isCancelled() && current_row < data_part->rows_count) + size_t rows_to_read = data_part->index_granularity.getMarkRows(current_mark); + bool continue_reading = (current_mark != 0); + + const auto & sample = reader->getColumns(); + Columns columns(sample.size()); + size_t rows_read = reader->readRows(current_mark, data_part->getMarksCount(), continue_reading, rows_to_read, columns); + + if (rows_read) { - size_t rows_to_read = data_part->index_granularity.getMarkRows(current_mark); - bool continue_reading = (current_mark != 0); + current_row += rows_read; + current_mark += (rows_to_read == rows_read); - const auto & sample = reader->getColumns(); - Columns columns(sample.size()); - size_t rows_read = reader->readRows(current_mark, data_part->getMarksCount(), continue_reading, rows_to_read, columns); + bool should_evaluate_missing_defaults = false; + reader->fillMissingColumns(columns, should_evaluate_missing_defaults, rows_read); - if (rows_read) + if (should_evaluate_missing_defaults) { - current_row += rows_read; - current_mark += (rows_to_read == rows_read); - - if (need_apply_deleted_mask) - { - const auto & deleted_rows_col = deleted_mask.getDeletedRows(); - const ColumnUInt8::Container & deleted_rows_mask = deleted_rows_col.getData(); - - size_t pos = current_row - rows_read; - - /// Get deleted mask for rows_read - IColumn::Filter deleted_rows_filter(rows_read, true); - for (size_t i = 0; i < rows_read; i++) - { - if (deleted_rows_mask[pos++]) - deleted_rows_filter[i] = 0; - } - - // Filter only if some items were deleted - if (auto num_deleted_rows = std::count(deleted_rows_filter.begin(), deleted_rows_filter.end(), 0)) - { - const auto remaining_rows = deleted_rows_filter.size() - num_deleted_rows; - - /// If we return {} here, it means finished, no reading of the following rows. - /// Continue to read until remaining rows are not zero or reach the end (REAL finish). 
- if (!remaining_rows) - continue; - - for (auto & col : columns) - col = col->filter(deleted_rows_filter, remaining_rows); - - /// Update rows_read with actual rows in columns - rows_read = remaining_rows; - } - } - - bool should_evaluate_missing_defaults = false; - reader->fillMissingColumns(columns, should_evaluate_missing_defaults, rows_read); - - if (should_evaluate_missing_defaults) - { - reader->evaluateMissingDefaults({}, columns); - } - - reader->performRequiredConversions(columns); - - /// Reorder columns and fill result block. - size_t num_columns = sample.size(); - Columns res_columns; - res_columns.reserve(num_columns); - - auto it = sample.begin(); - for (size_t i = 0; i < num_columns; ++i) - { - if (header.has(it->name)) - res_columns.emplace_back(std::move(columns[i])); - - ++it; - } - - return Chunk(std::move(res_columns), rows_read); + reader->evaluateMissingDefaults({}, columns); } - } - else - { - finish(); - } - return {}; - } while (true); + reader->performRequiredConversions(columns); + /// Reorder columns and fill result block. + size_t num_columns = sample.size(); + Columns res_columns; + res_columns.reserve(num_columns); + + auto it = sample.begin(); + for (size_t i = 0; i < num_columns; ++i) + { + if (header.has(it->name)) + res_columns.emplace_back(std::move(columns[i])); + + ++it; + } + + return Chunk(std::move(res_columns), rows_read); + } + } + else + { + finish(); + } + + return {}; } catch (...) { diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.h b/src/Storages/MergeTree/MergeTreeSequentialSource.h index 5a571825db5..a3e4f5fa856 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.h +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.h @@ -3,7 +3,6 @@ #include #include #include -#include #include namespace DB @@ -59,9 +58,6 @@ private: /// current row at which we stop reading size_t current_row = 0; - bool need_apply_deleted_mask = false; - MergeTreeDataPartDeletedMask deleted_mask {}; - /// Closes readers and unlock part locks void finish(); }; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index ae64b08e351..0e61f499202 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -173,7 +173,7 @@ getColumnsForNewDataPart( const auto all_virtual_columns = source_part->storage.getVirtuals(); /// Preserve virtual columns that have persisted values in the source_part -/// TODO: only allow LWD mask to be overriden!!!!! +/// TODO: only allow LWD mask to be overridden! for (const auto & virtual_column : all_virtual_columns) { if (part_columns.has(virtual_column.name) && !storage_columns.contains(virtual_column.name)) @@ -188,7 +188,7 @@ getColumnsForNewDataPart( for (const auto & [column_name, _] : command.column_to_update_expression) { /// Allow to update and persist values of virtual column -/// TODO: only allow LWD mask to be overriden!!!!! +/// TODO: only allow LWD mask to be overridden! auto virtual_column = all_virtual_columns.tryGetByName(column_name); if (virtual_column && !storage_columns.contains(column_name)) storage_columns.emplace_back(column_name, virtual_column->type); @@ -1379,206 +1379,6 @@ private: std::unique_ptr part_merger_writer_task{nullptr}; }; -/// LightweightDeleteTask works for lightweight delete mutate. -/// The MutationsInterpreter returns a simple select like "select _part_offset where predicates". -/// The prepare() and execute() has special logics for LWD mutate. 
-class LightweightDeleteTask : public IExecutableTask -{ -public: - - explicit LightweightDeleteTask(MutationContextPtr ctx_) : ctx(ctx_) {} - - void onCompleted() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } - StorageID getStorageID() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } - UInt64 getPriority() override { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not implemented"); } - - bool executeStep() override - { - switch (state) - { - case State::NEED_PREPARE: - { - prepare(); - - state = State::NEED_EXECUTE; - return true; - } - case State::NEED_EXECUTE: - { - execute(); - - state = State::NEED_FINALIZE; - return true; - } - case State::NEED_FINALIZE: - { - finalize(); - - state = State::SUCCESS; - return true; - } - case State::SUCCESS: - { - return false; - } - } - return false; - } - -private: - - void prepare() - { - if (ctx->execute_ttl_type != ExecuteTTLType::NONE) - ctx->files_to_skip.insert("ttl.txt"); - - ctx->data_part_storage_builder->createDirectories(); - - /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. - TransactionID tid = ctx->txn ? ctx->txn->tid : Tx::PrehistoricTID; - /// NOTE do not pass context for writing to system.transactions_info_log, - /// because part may have temporary name (with temporary block numbers). Will write it later. - ctx->new_data_part->version.setCreationTID(tid, nullptr); - ctx->new_data_part->storeVersionMetadata(); - - NameSet hardlinked_files; - /// Create hardlinks for unchanged files - for (auto it = ctx->source_part->data_part_storage->iterate(); it->isValid(); it->next()) - { - if (ctx->files_to_skip.contains(it->name())) - continue; - - String destination; - destination = it->name(); - - if (it->isFile()) - { - ctx->data_part_storage_builder->createHardLinkFrom( - *ctx->source_part->data_part_storage, it->name(), destination); - hardlinked_files.insert(it->name()); - } - else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir - { - // it's a projection part directory - ctx->data_part_storage_builder->createProjection(destination); - - auto projection_data_part_storage = ctx->source_part->data_part_storage->getProjection(destination); - auto projection_data_part_storage_builder = ctx->data_part_storage_builder->getProjection(destination); - - for (auto p_it = projection_data_part_storage->iterate(); p_it->isValid(); p_it->next()) - { - projection_data_part_storage_builder->createHardLinkFrom( - *projection_data_part_storage, p_it->name(), p_it->name()); - hardlinked_files.insert(p_it->name()); - } - } - } - - /// Tracking of hardlinked files required for zero-copy replication. - /// We don't remove them when we delete last copy of source part because - /// new part can use them. - ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID(); - ctx->hardlinked_files.source_part_name = ctx->source_part->name; - ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files; - - /// Only the _delete mask column will be written. 
- (*ctx->mutate_entry)->columns_written = 1; - - ctx->new_data_part->checksums = ctx->source_part->checksums; - - ctx->compression_codec = ctx->source_part->default_codec; - - if (ctx->mutating_pipeline_builder.initialized()) - { - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); - - if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); - - if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); - - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); - ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); - /// Is calculated inside MergeProgressCallback. - ctx->mutating_pipeline.disableProfileEventUpdate(); - ctx->mutating_executor = std::make_unique(ctx->mutating_pipeline); - } - } - - void execute() - { - Block block; - bool has_deleted_rows = false; - - auto new_deleted_rows = ColumnUInt8::create(); - auto & data = new_deleted_rows->getData(); - - /// If this part has already applied lightweight mutation, load the past latest bitmap to merge with current bitmap - if (ctx->source_part->hasLightweightDelete()) - { - MergeTreeDataPartDeletedMask deleted_mask {}; - if (ctx->source_part->getDeletedMask(deleted_mask)) - { - const auto & deleted_rows_col = deleted_mask.getDeletedRows(); - const auto & source_data = deleted_rows_col.getData(); - data.insert(source_data.begin(), source_data.begin() + ctx->source_part->rows_count); - - has_deleted_rows = true; - } - } - - if (!has_deleted_rows) - new_deleted_rows->insertManyDefaults(ctx->source_part->rows_count); - - /// Mark the data corresponding to the offset in the as deleted. - while (MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry) && ctx->mutating_executor && ctx->mutating_executor->pull(block)) - { - size_t block_rows = block.rows(); - - if (block_rows && !has_deleted_rows) - has_deleted_rows = true; - - const auto & cols = block.getColumns(); - const auto * offset_col = typeid_cast(cols[0].get()); - const UInt64 * offset = offset_col->getData().data(); - - /// Fill 1 for rows in offset - for (size_t current_row = 0; current_row < block_rows; current_row++) - data[offset[current_row]] = 1; - } - - if (has_deleted_rows) - { - ctx->new_data_part->writeDeletedMask(ColumnUInt8::Ptr(std::move(new_deleted_rows))); - } - } - - void finalize() - { - if (ctx->mutating_executor) - { - ctx->mutating_executor.reset(); - ctx->mutating_pipeline.reset(); - } - - MutationHelpers::finalizeMutatedPart(ctx->source_part, ctx->data_part_storage_builder, ctx->new_data_part, ctx->execute_ttl_type, ctx->compression_codec, ctx->context); - } - - enum class State - { - NEED_PREPARE, - NEED_EXECUTE, - NEED_FINALIZE, - - SUCCESS - }; - - State state{State::NEED_PREPARE}; - - MutationContextPtr ctx; -}; - MutateTask::MutateTask( FutureMergedMutatedPartPtr future_part_, @@ -1611,13 +1411,6 @@ MutateTask::MutateTask( ctx->source_part = ctx->future_part->parts[0]; ctx->storage_from_source_part = std::make_shared(ctx->source_part); - /// part is checked for lightweight delete in selectPartsToMutate(). - ctx->is_lightweight_mutation = ctx->future_part->mutation_type == MutationType::Lightweight; - - /// Empty mutation commands mean that the mutation is killed. 
Just work as ordinary, clone the part. - if (ctx->commands->empty()) - ctx->is_lightweight_mutation = false; - auto storage_snapshot = ctx->storage_from_source_part->getStorageSnapshot(ctx->metadata_snapshot, context_); extendObjectColumns(ctx->storage_columns, storage_snapshot->object_columns, /*with_subcolumns=*/ false); } @@ -1673,7 +1466,7 @@ bool MutateTask::prepare() ctx->commands_for_part.emplace_back(command); } - if (ctx->source_part->isStoredOnDisk() && !ctx->is_lightweight_mutation && !isStorageTouchedByMutations( + if (ctx->source_part->isStoredOnDisk() && !isStorageTouchedByMutations( ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading))) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation); @@ -1689,25 +1482,15 @@ bool MutateTask::prepare() ctx->stage_progress = std::make_unique(1.0); - bool need_mutate_all_columns = !isWidePart(ctx->source_part); - if (!ctx->for_interpreter.empty()) { ctx->interpreter = std::make_unique( - ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->for_interpreter, context_for_reading, true, ctx->is_lightweight_mutation); + ctx->storage_from_source_part, ctx->metadata_snapshot, ctx->for_interpreter, context_for_reading, true); ctx->materialized_indices = ctx->interpreter->grabMaterializedIndices(); ctx->materialized_projections = ctx->interpreter->grabMaterializedProjections(); ctx->mutation_kind = ctx->interpreter->getMutationKind(); - - /// Skip to apply deleted mask when reading for MutateSomePartColumns. - need_mutate_all_columns = need_mutate_all_columns || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter->isAffectingAllColumns()); - if (!need_mutate_all_columns && ctx->source_part->hasLightweightDelete() && !ctx->is_lightweight_mutation) - ctx->interpreter->setSkipDeletedMask(true); - -///// - ctx->interpreter->setSkipDeletedMask(true); -///// - + /// Always disable filtering in mutations, we want to read all rows + ctx->interpreter->setApplyDeletedMask(false); ctx->mutating_pipeline_builder = ctx->interpreter->execute(); ctx->updated_header = ctx->interpreter->getUpdatedHeader(); ctx->progress_callback = MergeProgressCallback((*ctx->mutate_entry)->ptr(), ctx->watch_prev_elapsed, *ctx->stage_progress); @@ -1737,21 +1520,12 @@ bool MutateTask::prepare() /// It shouldn't be changed by mutation. ctx->new_data_part->index_granularity_info = ctx->source_part->index_granularity_info; - if (ctx->is_lightweight_mutation) - { - /// The metadata alter will update the metadata snapshot, we should use same as source part. 
- ctx->new_data_part->setColumns(ctx->source_part->getColumns()); - ctx->new_data_part->setSerializationInfos(ctx->source_part->getSerializationInfos()); - } - else - { - auto [new_columns, new_infos] = MutationHelpers::getColumnsForNewDataPart( - ctx->source_part, ctx->updated_header, ctx->storage_columns, - ctx->source_part->getSerializationInfos(), ctx->commands_for_part); + auto [new_columns, new_infos] = MutationHelpers::getColumnsForNewDataPart( + ctx->source_part, ctx->updated_header, ctx->storage_columns, + ctx->source_part->getSerializationInfos(), ctx->commands_for_part); - ctx->new_data_part->setColumns(new_columns); - ctx->new_data_part->setSerializationInfos(new_infos); - } + ctx->new_data_part->setColumns(new_columns); + ctx->new_data_part->setSerializationInfos(new_infos); ctx->new_data_part->partition.assign(ctx->source_part->partition); @@ -1768,17 +1542,11 @@ bool MutateTask::prepare() /// All columns from part are changed and may be some more that were missing before in part /// TODO We can materialize compact part without copying data - if (need_mutate_all_columns) + if (!isWidePart(ctx->source_part) + || (ctx->mutation_kind == MutationsInterpreter::MutationKind::MUTATE_OTHER && ctx->interpreter && ctx->interpreter->isAffectingAllColumns())) { task = std::make_unique(ctx); } - else if (ctx->is_lightweight_mutation) - { - ctx->files_to_skip = ctx->source_part->getFileNamesWithoutChecksums(); - - /// We will modify or create only deleted_row_mask for lightweight delete. Other columns and key values are copied as-is. - task = std::make_unique(ctx); - } else /// TODO: check that we modify only non-key columns in this case. { /// We will modify only some of the columns. Other columns and key values can be copied as-is. diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index cd31d356b4b..f6c80baba05 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1920,7 +1920,6 @@ std::vector ReplicatedMergeTreeQueue::getMutationsStatu formatAST(*command.ast, buf, false, true); result.push_back(MergeTreeMutationStatus { - MutationType::Ordinary, /// TODO: ReplicatedMergeTree supports lightweight delete. entry.znode_name, buf.str(), entry.create_time, diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index b9b5874b3e6..b2f62c2bf02 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -106,6 +106,11 @@ public: return parts.front()->storage.getSettings()->materialize_ttl_recalculate_only; } + bool hasLightweightDeleteColumn() const + { + return parts.front()->getColumns().contains("_row_exists"); // TODO: fix hardcoded column name + } + private: MergeTreeData::DataPartsVector parts; const MergeTreeData & storage; diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 0604bb304d0..5e161fc2e6a 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -123,7 +123,7 @@ void StorageJoin::mutate(const MutationCommands & commands, ContextPtr context) // New scope controls lifetime of pipeline. 
{ auto storage_ptr = DatabaseCatalog::instance().getTable(getStorageID(), context); - auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true, false); + auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index fc4a671c071..f3f1162287f 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -318,7 +318,7 @@ void StorageMemory::mutate(const MutationCommands & commands, ContextPtr context new_context->setSetting("max_streams_to_max_threads_ratio", 1); new_context->setSetting("max_threads", 1); - auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, new_context, true, false); + auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, new_context, true); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index bf731f09428..34fcd4c7a78 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -332,7 +332,7 @@ void StorageMergeTree::alter( DatabaseCatalog::instance().getDatabase(table_id.database_name)->alterTable(local_context, table_id, new_metadata); if (!maybe_mutation_commands.empty()) - mutation_version = startMutation(maybe_mutation_commands, local_context, MutationType::Ordinary); + mutation_version = startMutation(maybe_mutation_commands, local_context); } /// Always execute required mutations synchronously, because alters @@ -429,7 +429,7 @@ CurrentlyMergingPartsTagger::~CurrentlyMergingPartsTagger() storage.currently_processing_in_background_condition.notify_all(); } -Int64 StorageMergeTree::startMutation(const MutationCommands & commands, ContextPtr query_context, MutationType type) +Int64 StorageMergeTree::startMutation(const MutationCommands & commands, ContextPtr query_context) { /// Choose any disk, because when we load mutations we search them at each disk /// where storage can be placed. See loadMutations(). @@ -447,7 +447,7 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, Context { std::lock_guard lock(currently_processing_in_background_mutex); - MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings(), type); + MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings()); version = increment.get(); entry.commit(version); String mutation_id = entry.file_name; @@ -554,21 +554,11 @@ void StorageMergeTree::setMutationCSN(const String & mutation_id, CSN csn) } void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context) -{ - /// Make ordinary ALTER DELETE queries lightweight to check all tests. 
- if (query_context->getSettingsRef().lightweight_delete_mutation - && commands.size() == 1 && commands.begin()->type == MutationCommand::DELETE) - mutate(commands, query_context, MutationType::Lightweight); - else - mutate(commands, query_context, MutationType::Ordinary); -} - -void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context, MutationType type) { /// Validate partition IDs (if any) before starting mutation getPartitionIdsAffectedByCommands(commands, query_context); - Int64 version = startMutation(commands, query_context, type); + Int64 version = startMutation(commands, query_context); if (query_context->getSettingsRef().mutations_sync > 0 || query_context->getCurrentTransaction()) waitForMutation(version); @@ -667,7 +657,6 @@ std::vector StorageMergeTree::getMutationsStatus() cons formatAST(*command.ast, buf, false, true); result.push_back(MergeTreeMutationStatus { - entry.type, entry.file_name, buf.str(), entry.create_time, @@ -1034,20 +1023,12 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( auto commands = std::make_shared(); size_t current_ast_elements = 0; auto last_mutation_to_apply = mutations_end_it; - - bool support_lightweight_mutate = part->supportLightweightDeleteMutate(); - MutationType first_mutation_type = support_lightweight_mutate ? mutations_begin_it->second.type : MutationType::Ordinary; for (auto it = mutations_begin_it; it != mutations_end_it; ++it) { /// Do not squash mutations from different transactions to be able to commit/rollback them independently. if (first_mutation_tid != it->second.tid) break; - /// Do not combine mutations with different types. - /// TODO: compact part support lightweight delete. - if (support_lightweight_mutate && it->second.type != first_mutation_type) - break; - size_t commands_size = 0; MutationCommands commands_for_size_validation; for (const auto & command : it->second.commands) @@ -1073,7 +1054,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( fake_query_context->makeQueryContext(); fake_query_context->setCurrentQueryId(""); MutationsInterpreter interpreter( - shared_from_this(), metadata_snapshot, commands_for_size_validation, fake_query_context, false, false); + shared_from_this(), metadata_snapshot, commands_for_size_validation, fake_query_context, false); commands_size += interpreter.evaluateCommandsSize(); } catch (...) @@ -1132,7 +1113,6 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( future_part->part_info = new_part_info; future_part->name = part->getNewName(new_part_info); future_part->type = part->getType(); - future_part->mutation_type = first_mutation_type; tagger = std::make_unique(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true); return std::make_shared(future_part, std::move(tagger), commands, txn); diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 379e15d5bdf..632884db033 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -87,8 +87,6 @@ public: void mutate(const MutationCommands & commands, ContextPtr context) override; - /// Support lightweight delete. - void mutate(const MutationCommands & commands, ContextPtr context, MutationType type); bool hasLightweightDeletedMask() const override; /// Return introspection information about currently processing or recently processed mutations. @@ -184,7 +182,7 @@ private: /// Allocate block number for new mutation, write mutation to disk /// and into in-memory structures. 
Wake up merge-mutation task. - Int64 startMutation(const MutationCommands & commands, ContextPtr query_context, MutationType type); + Int64 startMutation(const MutationCommands & commands, ContextPtr query_context); /// Wait until mutation with version will finish mutation for all parts void waitForMutation(Int64 version); void waitForMutation(const String & mutation_id) override; diff --git a/src/Storages/System/StorageSystemMutations.cpp b/src/Storages/System/StorageSystemMutations.cpp index 907376a4936..fa521c632b8 100644 --- a/src/Storages/System/StorageSystemMutations.cpp +++ b/src/Storages/System/StorageSystemMutations.cpp @@ -20,7 +20,6 @@ NamesAndTypesList StorageSystemMutations::getNamesAndTypes() return { { "database", std::make_shared() }, { "table", std::make_shared() }, - { "is_lightweight", std::make_shared() }, { "mutation_id", std::make_shared() }, { "command", std::make_shared() }, { "create_time", std::make_shared() }, @@ -131,7 +130,6 @@ void StorageSystemMutations::fillData(MutableColumns & res_columns, ContextPtr c res_columns[col_num++]->insert(database); res_columns[col_num++]->insert(table); - res_columns[col_num++]->insert(status.type == MutationType::Lightweight); res_columns[col_num++]->insert(status.id); res_columns[col_num++]->insert(status.command); res_columns[col_num++]->insert(UInt64(status.create_time)); diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index acc9b08da29..6e9d9188962 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -378,7 +378,6 @@ CREATE TABLE system.mutations ( `database` String, `table` String, - `is_lightweight` UInt8, `mutation_id` String, `command` String, `create_time` DateTime, diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference index fc646843eee..31960e2ecea 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference @@ -8,11 +8,11 @@ 1 1 1 -0 UPDATE _row_exists = 0 WHERE (c % 5) = 1 1 -0 UPDATE _row_exists = 0 WHERE c = 4 1 -0 MATERIALIZE INDEX i_c 1 -0 UPDATE b = -1 WHERE a < 3 1 -0 DROP INDEX i_c 1 +UPDATE _row_exists = 0 WHERE (c % 5) = 1 1 +UPDATE _row_exists = 0 WHERE c = 4 1 +MATERIALIZE INDEX i_c 1 +UPDATE b = -1 WHERE a < 3 1 +DROP INDEX i_c 1 -----Check that select and merge with lightweight delete.----- 7 0 -1 0 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index 24afa5fb196..6f78e1fe464 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -5,7 +5,7 @@ CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTr INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); SET mutations_sync = 1; -SET allow_experimental_lightweight_delete = 0; +--SET allow_experimental_lightweight_delete = 0; SET allow_experimental_lightweight_delete_with_row_exists = 1; DELETE FROM merge_table_standard_delete WHERE id = 10; @@ -60,7 +60,7 @@ DETACH TABLE t_light; ATTACH TABLE t_light; CHECK TABLE t_light; -SELECT is_lightweight, command, is_done FROM 
system.mutations WHERE database = currentDatabase() AND table = 't_light'; +SELECT command, is_done FROM system.mutations WHERE database = currentDatabase() AND table = 't_light'; SELECT '-----Check that select and merge with lightweight delete.-----'; select count(*) from t_light; diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index 46336a57c3a..4468a25448c 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -6,7 +6,7 @@ INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 1 SET mutations_sync = 1; SET allow_experimental_lightweight_delete_with_row_exists = 1; -SET allow_experimental_lightweight_delete = 0; +--SET allow_experimental_lightweight_delete = 0; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_test' AND active; SELECT 'Count', count() FROM lwd_test; From 7d30ab80c43b3181da8bb6fa6f00c80798bb4596 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 21 Jul 2022 09:31:06 +0000 Subject: [PATCH 123/227] Fix compatibility --- programs/keeper-converter/KeeperConverter.cpp | 4 +- src/Coordination/KeeperSnapshotManager.cpp | 6 +- src/Coordination/KeeperStorage.cpp | 55 +++++++++++++++---- src/Coordination/KeeperStorage.h | 6 +- src/Coordination/ZooKeeperDataReader.cpp | 2 +- 5 files changed, 57 insertions(+), 16 deletions(-) diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index dda84a4d2ae..ae47287d8ee 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -39,9 +39,11 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) try { - DB::KeeperStorage storage(500, "", true); + DB::KeeperStorage storage(500, "", true, false); DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as(), logger); + storage.initializeSystemNodes(); + DB::deserializeLogsAndApplyToStorage(storage, options["zookeeper-logs-dir"].as(), logger); DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(storage.getZXID(), 1, std::make_shared()); DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta); diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 07d351cb73c..8fbc7df1484 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB { @@ -188,6 +189,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr { const auto & path = it->key; const auto & node = it->value; + /// Benign race condition possible while taking snapshot: NuRaft decide to create snapshot at some log id /// and only after some time we lock storage and enable snapshot mode. So snapshot_container_size can be /// slightly bigger than required. 
@@ -323,6 +325,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial readBinary(path, in); KeeperStorage::Node node{}; readNode(node, in, current_version, storage.acl_map); + storage.container.insertOrReplace(path, node); if (node.stat.ephemeralOwner != 0) storage.ephemerals[node.stat.ephemeralOwner].insert(path); @@ -583,8 +586,9 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff compressed_reader = std::make_unique(*reader); SnapshotDeserializationResult result; - result.storage = std::make_unique(storage_tick_time, superdigest, digest_enabled); + result.storage = std::make_unique(storage_tick_time, superdigest, digest_enabled, false); KeeperStorageSnapshot::deserialize(result, *compressed_reader); + result.storage->initializeSystemNodes(); return result; } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index edd4f624ba3..dd43d006979 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -227,26 +227,49 @@ void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other) cached_digest = other.cached_digest; } -KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const bool digest_enabled_) +KeeperStorage::KeeperStorage( + int64_t tick_time_ms, const String & superdigest_, const bool digest_enabled_, const bool initialize_system_nodes) : session_expiry_queue(tick_time_ms), digest_enabled(digest_enabled_), superdigest(superdigest_) { - Node root_node; container.insert("/", root_node); addDigest(root_node, "/"); + if (initialize_system_nodes) + initializeSystemNodes(); +} + +void KeeperStorage::initializeSystemNodes() +{ + if (initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes initialized twice"); + const auto create_system_node = [&](const auto & path, auto data) { - // we update numChildren during preprocessing so and createNode is called during - // commit so we need to update it manually here - container.updateValue( - parentPath(path), - [](KeeperStorage::Node & parent) - { - ++parent.stat.numChildren; - } - ); - createNode(path, std::move(data), {}, false, {}); + auto node_it = container.find(path); + if (node_it == container.end()) + { + // we update numChildren during preprocessing so and createNode is called during + // commit so we need to update it manually here + container.updateValue( + parentPath(path), + [](KeeperStorage::Node & parent) + { + ++parent.stat.numChildren; + } + ); + createNode(path, std::move(data), {}, false, {}); + } + else + { + container.updateValue( + path, + [data = std::move(data)](KeeperStorage::Node & node) + { + node.setData(std::move(data)); + } + ); + } }; create_system_node(keeper_system_path, ""); @@ -254,6 +277,8 @@ KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_, assert(keeper_api_version_path.starts_with(keeper_system_path)); auto api_version_data = toString(static_cast(current_keeper_api_version)); create_system_node(keeper_api_version_path, std::move(api_version_data)); + + initialized = true; } template @@ -1847,6 +1872,9 @@ void KeeperStorage::preprocessRequest( bool check_acl, std::optional digest) { + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes are not initialized"); + int64_t last_zxid = getNextZXID() - 1; if (uncommitted_transactions.empty()) @@ -1932,6 +1960,9 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest( bool check_acl, bool 
is_local) { + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes are not initialized"); + if (new_last_zxid) { if (uncommitted_transactions.empty()) diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index f4fccd95adb..a511086110f 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -340,7 +340,11 @@ public: const String superdigest; - KeeperStorage(int64_t tick_time_ms, const String & superdigest_, bool digest_enabled_); + bool initialized{false}; + + KeeperStorage(int64_t tick_time_ms, const String & superdigest_, bool digest_enabled_, bool initialize_system_nodes = true); + + void initializeSystemNodes(); /// Allocate new session id with the specified timeouts int64_t getSessionID(int64_t session_timeout_ms) diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index fc39b569cc1..9e744f4fe1d 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -137,7 +137,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L for (const auto & itr : storage.container) { - if (itr.key != "/" && !itr.key.toView().starts_with(keeper_system_path)) + if (itr.key != "/") { auto parent_path = parentPath(itr.key); storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(path)); value.stat.numChildren++; }); From dc392cd4d34b25122f951418d9c7e187701cf6c7 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 20 Jul 2022 21:44:51 +0200 Subject: [PATCH 124/227] Improve synchronization between hosts in distributed backup. Use ephemeral zk nodes to check other hosts for termination. --- src/Backups/BackupCoordinationLocal.cpp | 8 +- src/Backups/BackupCoordinationLocal.h | 8 +- src/Backups/BackupCoordinationRemote.cpp | 40 +- src/Backups/BackupCoordinationRemote.h | 17 +- src/Backups/BackupCoordinationStageSync.cpp | 228 ++++++ src/Backups/BackupCoordinationStageSync.h | 39 + src/Backups/BackupCoordinationStatusSync.cpp | 182 ----- src/Backups/BackupCoordinationStatusSync.h | 37 - src/Backups/BackupEntriesCollector.cpp | 70 +- src/Backups/BackupEntriesCollector.h | 7 +- src/Backups/BackupsWorker.cpp | 723 +++++++++++-------- src/Backups/BackupsWorker.h | 20 +- src/Backups/IBackupCoordination.h | 13 +- src/Backups/IRestoreCoordination.h | 13 +- src/Backups/RestoreCoordinationLocal.cpp | 8 +- src/Backups/RestoreCoordinationLocal.h | 10 +- src/Backups/RestoreCoordinationRemote.cpp | 39 +- src/Backups/RestoreCoordinationRemote.h | 21 +- src/Backups/RestorerFromBackup.cpp | 37 +- src/Backups/RestorerFromBackup.h | 5 +- src/Interpreters/InterpreterBackupQuery.cpp | 24 +- 21 files changed, 867 insertions(+), 682 deletions(-) create mode 100644 src/Backups/BackupCoordinationStageSync.cpp create mode 100644 src/Backups/BackupCoordinationStageSync.h delete mode 100644 src/Backups/BackupCoordinationStatusSync.cpp delete mode 100644 src/Backups/BackupCoordinationStatusSync.h diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index 417b84c6b5f..d4064902a40 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -13,20 +13,20 @@ using FileInfo = IBackupCoordination::FileInfo; BackupCoordinationLocal::BackupCoordinationLocal() = default; BackupCoordinationLocal::~BackupCoordinationLocal() = default; -void BackupCoordinationLocal::setStatus(const String &, 
const String &, const String &) +void BackupCoordinationLocal::setStage(const String &, const String &, const String &) { } -void BackupCoordinationLocal::setErrorStatus(const String &, const Exception &) +void BackupCoordinationLocal::setError(const String &, const Exception &) { } -Strings BackupCoordinationLocal::waitStatus(const Strings &, const String &) +Strings BackupCoordinationLocal::waitForStage(const Strings &, const String &) { return {}; } -Strings BackupCoordinationLocal::waitStatusFor(const Strings &, const String &, UInt64) +Strings BackupCoordinationLocal::waitForStage(const Strings &, const String &, std::chrono::milliseconds) { return {}; } diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index 8cf5fba5c5c..aca7f71545b 100644 --- a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -20,10 +20,10 @@ public: BackupCoordinationLocal(); ~BackupCoordinationLocal() override; - void setStatus(const String & current_host, const String & new_status, const String & message) override; - void setErrorStatus(const String & current_host, const Exception & exception) override; - Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override; - Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override; + void setStage(const String & current_host, const String & new_stage, const String & message) override; + void setError(const String & current_host, const Exception & exception) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override; void addReplicatedPartNames(const String & table_shared_id, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) override; diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index a180358e088..bac99b0da2d 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -165,15 +165,28 @@ namespace constexpr size_t NUM_ATTEMPTS = 10; } -BackupCoordinationRemote::BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) +BackupCoordinationRemote::BackupCoordinationRemote( + const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_) : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) - , status_sync(zookeeper_path_ + "/status", get_zookeeper_, &Poco::Logger::get("BackupCoordination")) + , remove_zk_nodes_in_destructor(remove_zk_nodes_in_destructor_) + , stage_sync(zookeeper_path_ + "/stage", get_zookeeper_, &Poco::Logger::get("BackupCoordination")) { createRootNodes(); } -BackupCoordinationRemote::~BackupCoordinationRemote() = default; +BackupCoordinationRemote::~BackupCoordinationRemote() +{ + try + { + if (remove_zk_nodes_in_destructor) + removeAllNodes(); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} void BackupCoordinationRemote::createRootNodes() { @@ -196,24 +209,24 @@ void BackupCoordinationRemote::removeAllNodes() } -void BackupCoordinationRemote::setStatus(const String & current_host, const String & new_status, const String & message) +void BackupCoordinationRemote::setStage(const String & current_host, const String & new_stage, const String & message) { - status_sync.set(current_host, new_status, message); + stage_sync.set(current_host, new_stage, message); } -void BackupCoordinationRemote::setErrorStatus(const String & current_host, const Exception & exception) +void BackupCoordinationRemote::setError(const String & current_host, const Exception & exception) { - status_sync.setError(current_host, exception); + stage_sync.setError(current_host, exception); } -Strings BackupCoordinationRemote::waitStatus(const Strings & all_hosts, const String & status_to_wait) +Strings BackupCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait) { - return status_sync.wait(all_hosts, status_to_wait); + return stage_sync.wait(all_hosts, stage_to_wait); } -Strings BackupCoordinationRemote::waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) +Strings BackupCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) { - return status_sync.waitFor(all_hosts, status_to_wait, timeout_ms); + return stage_sync.waitFor(all_hosts, stage_to_wait, timeout); } @@ -565,9 +578,4 @@ Strings BackupCoordinationRemote::getAllArchiveSuffixes() const return node_names; } -void BackupCoordinationRemote::drop() -{ - removeAllNodes(); -} - } diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index 349d04c7d87..d1d206683fa 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include namespace DB @@ -13,13 +13,13 @@ namespace DB class BackupCoordinationRemote : public IBackupCoordination { public: - BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_); + BackupCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_); ~BackupCoordinationRemote() override; - void setStatus(const String & current_host, const String & new_status, const String & message) override; - void setErrorStatus(const String & current_host, const Exception & exception) override; - Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override; - Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override; + void setStage(const String & current_host, const String & new_stage, const String & message) override; + void setError(const String & current_host, const Exception & exception) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override; void addReplicatedPartNames( const String & table_shared_id, @@ -56,8 +56,6 @@ public: String getNextArchiveSuffix() override; Strings getAllArchiveSuffixes() const override; - void drop() override; - private: void createRootNodes(); void removeAllNodes(); @@ -66,8 +64,9 @@ private: const String zookeeper_path; const zkutil::GetZooKeeper 
get_zookeeper; + const bool remove_zk_nodes_in_destructor; - BackupCoordinationStatusSync status_sync; + BackupCoordinationStageSync stage_sync; mutable std::mutex mutex; mutable std::optional replicated_tables; diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp new file mode 100644 index 00000000000..5524029bbf2 --- /dev/null +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -0,0 +1,228 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; +} + + +BackupCoordinationStageSync::BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_) + : zookeeper_path(zookeeper_path_) + , get_zookeeper(get_zookeeper_) + , log(log_) +{ + createRootNodes(); +} + +void BackupCoordinationStageSync::createRootNodes() +{ + auto zookeeper = get_zookeeper(); + zookeeper->createAncestors(zookeeper_path); + zookeeper->createIfNotExists(zookeeper_path, ""); +} + +void BackupCoordinationStageSync::set(const String & current_host, const String & new_stage, const String & message) +{ + auto zookeeper = get_zookeeper(); + + /// Make an ephemeral node so the initiator can track if the current host is still working. + String alive_node_path = zookeeper_path + "/alive|" + current_host; + auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral); + if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS) + throw zkutil::KeeperException(code, alive_node_path); + + zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, ""); + zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message); +} + +void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception) +{ + auto zookeeper = get_zookeeper(); + WriteBufferFromOwnString buf; + writeStringBinary(current_host, buf); + writeException(exception, buf, true); + zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str()); +} + +Strings BackupCoordinationStageSync::wait(const Strings & all_hosts, const String & stage_to_wait) +{ + return waitImpl(all_hosts, stage_to_wait, {}); +} + +Strings BackupCoordinationStageSync::waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) +{ + return waitImpl(all_hosts, stage_to_wait, timeout); +} + +namespace +{ + struct UnreadyHostState + { + bool started = false; + bool alive = false; + }; +} + +struct BackupCoordinationStageSync::State +{ + Strings results; + std::map unready_hosts; + std::optional> error; + std::optional host_terminated; +}; + +BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState( + zkutil::ZooKeeperPtr zookeeper, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const +{ + std::unordered_set zk_nodes_set{zk_nodes.begin(), zk_nodes.end()}; + + State state; + if (zk_nodes_set.contains("error")) + { + ReadBufferFromOwnString buf{zookeeper->get(zookeeper_path + "/error")}; + String host; + readStringBinary(host, buf); + state.error = std::make_pair(host, readException(buf, fmt::format("Got error from {}", host))); + return state; + } + + for (const auto & host : all_hosts) + { + if (!zk_nodes_set.contains("current|" + host + "|" + stage_to_wait)) + { + UnreadyHostState unready_host_state; + unready_host_state.started = 
zk_nodes_set.contains("started|" + host); + unready_host_state.alive = zk_nodes_set.contains("alive|" + host); + state.unready_hosts.emplace(host, unready_host_state); + if (!unready_host_state.alive && unready_host_state.started && !state.host_terminated) + state.host_terminated = host; + } + } + + if (state.host_terminated || !state.unready_hosts.empty()) + return state; + + state.results.reserve(all_hosts.size()); + for (const auto & host : all_hosts) + state.results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait)); + + return state; +} + +Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional timeout) const +{ + if (all_hosts.empty()) + return {}; + + /// Wait until all hosts are ready or an error happens or time is out. + + auto zookeeper = get_zookeeper(); + + struct Watch + { + std::mutex mutex; + std::condition_variable event; + bool zk_nodes_changed = false; + bool watch_set = false; + }; + + /// shared_ptr because `watch_callback` can be called by ZooKeeper after leaving this function's scope. + auto watch = std::make_shared(); + + /// Called by ZooKepper when list of zk nodes have changed. + auto watch_callback = [watch](const Coordination::WatchResponse &) + { + std::lock_guard lock{watch->mutex}; + watch->zk_nodes_changed = true; + watch->watch_set = false; /// When it's triggered ZooKeeper resets the watch so we need to call getChildrenWatch() again. + watch->event.notify_all(); + }; + + auto zk_nodes_changed = [watch] { return watch->zk_nodes_changed; }; + + bool use_timeout = timeout.has_value(); + std::chrono::steady_clock::time_point end_of_timeout; + if (use_timeout) + end_of_timeout = std::chrono::steady_clock::now() + std::chrono::duration_cast(*timeout); + + State state; + + String previous_unready_host; /// Used for logging: we don't want to log the same unready host again. + + for (;;) + { + /// Get zk nodes and subscribe on their changes. + { + std::lock_guard lock{watch->mutex}; + watch->watch_set = true; + watch->zk_nodes_changed = false; + } + Strings zk_nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback); + + /// Read and analyze the current state of zk nodes. + state = readCurrentState(zookeeper, zk_nodes, all_hosts, stage_to_wait); + if (state.error || state.host_terminated || state.unready_hosts.empty()) + break; /// Error happened or everything is ready. + + /// Log that we will wait for another host. + const auto & unready_host = state.unready_hosts.begin()->first; + if (unready_host != previous_unready_host) + { + LOG_TRACE(log, "Waiting for host {}", unready_host); + previous_unready_host = unready_host; + } + + /// Wait until `watch_callback` is called by ZooKeeper meaning that zk nodes have changed. + { + std::unique_lock lock{watch->mutex}; + if (use_timeout) + { + auto current_time = std::chrono::steady_clock::now(); + if ((current_time > end_of_timeout) || !watch->event.wait_for(lock, end_of_timeout - current_time, zk_nodes_changed)) + break; + } + else + { + watch->event.wait(lock, zk_nodes_changed); + } + assert(watch->zk_nodes_changed); + assert(!watch->watch_set); + } + } + + /// Rethrow an error raised originally on another host. + if (state.error) + state.error->second.rethrow(); + + /// Another host terminated without errors. 
+ if (state.host_terminated) + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Host {} suddenly stopped working", *state.host_terminated); + + /// Something's unready, timeout is probably not enough. + if (!state.unready_hosts.empty()) + { + const auto & [unready_host, unready_host_state] = *state.unready_hosts.begin(); + throw Exception( + ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, + "Waited for host {} too long (> {}){}", + unready_host, + to_string(*timeout), + unready_host_state.started ? "" : ": Operation didn't start"); + } + + return state.results; +} + +} diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h new file mode 100644 index 00000000000..623b58fd9fa --- /dev/null +++ b/src/Backups/BackupCoordinationStageSync.h @@ -0,0 +1,39 @@ +#pragma once + +#include + + +namespace DB +{ + +/// Used to coordinate hosts so all hosts would come to a specific stage at around the same time. +class BackupCoordinationStageSync +{ +public: + BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_); + + /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that. + void set(const String & current_host, const String & new_stage, const String & message); + void setError(const String & current_host, const Exception & exception); + + /// Sets the stage of the current host and waits until all hosts come to the same stage. + /// The function returns the messages all hosts set when they come to the required stage. + Strings wait(const Strings & all_hosts, const String & stage_to_wait); + + /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time. 
+ Strings waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout); + +private: + void createRootNodes(); + + struct State; + State readCurrentState(zkutil::ZooKeeperPtr zookeeper, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const; + + Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional timeout) const; + + String zookeeper_path; + zkutil::GetZooKeeper get_zookeeper; + Poco::Logger * log; +}; + +} diff --git a/src/Backups/BackupCoordinationStatusSync.cpp b/src/Backups/BackupCoordinationStatusSync.cpp deleted file mode 100644 index c0ecfdcaebe..00000000000 --- a/src/Backups/BackupCoordinationStatusSync.cpp +++ /dev/null @@ -1,182 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; -} - - -BackupCoordinationStatusSync::BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_) - : zookeeper_path(zookeeper_path_) - , get_zookeeper(get_zookeeper_) - , log(log_) -{ - createRootNodes(); -} - -void BackupCoordinationStatusSync::createRootNodes() -{ - auto zookeeper = get_zookeeper(); - zookeeper->createAncestors(zookeeper_path); - zookeeper->createIfNotExists(zookeeper_path, ""); -} - -void BackupCoordinationStatusSync::set(const String & current_host, const String & new_status, const String & message) -{ - auto zookeeper = get_zookeeper(); - zookeeper->createIfNotExists(zookeeper_path + "/" + current_host + "|" + new_status, message); -} - -void BackupCoordinationStatusSync::setError(const String & current_host, const Exception & exception) -{ - auto zookeeper = get_zookeeper(); - - Exception exception2 = exception; - exception2.addMessage("Host {}", current_host); - WriteBufferFromOwnString buf; - writeException(exception2, buf, true); - - zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str()); -} - -Strings BackupCoordinationStatusSync::wait(const Strings & all_hosts, const String & status_to_wait) -{ - return waitImpl(all_hosts, status_to_wait, {}); -} - -Strings BackupCoordinationStatusSync::waitFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) -{ - return waitImpl(all_hosts, status_to_wait, timeout_ms); -} - -Strings BackupCoordinationStatusSync::waitImpl(const Strings & all_hosts, const String & status_to_wait, std::optional timeout_ms) -{ - if (all_hosts.empty()) - return {}; - - /// Wait for other hosts. - - Strings ready_hosts_results; - ready_hosts_results.resize(all_hosts.size()); - - std::map /* index in `ready_hosts_results` */> unready_hosts; - for (size_t i = 0; i != all_hosts.size(); ++i) - unready_hosts[all_hosts[i]].push_back(i); - - std::optional error; - - auto zookeeper = get_zookeeper(); - - /// Process ZooKeeper's nodes and set `all_hosts_ready` or `unready_host` or `error_message`. 
- auto process_zk_nodes = [&](const Strings & zk_nodes) - { - for (const String & zk_node : zk_nodes) - { - if (zk_node.starts_with("remove_watch-")) - continue; - - if (zk_node == "error") - { - ReadBufferFromOwnString buf{zookeeper->get(zookeeper_path + "/error")}; - error = readException(buf, "", true); - break; - } - - size_t separator_pos = zk_node.find('|'); - if (separator_pos == String::npos) - throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Unexpected zk node {}", zookeeper_path + "/" + zk_node); - - String host = zk_node.substr(0, separator_pos); - String status = zk_node.substr(separator_pos + 1); - - auto it = unready_hosts.find(host); - if ((it != unready_hosts.end()) && (status == status_to_wait)) - { - String result = zookeeper->get(zookeeper_path + "/" + zk_node); - for (size_t i : it->second) - ready_hosts_results[i] = result; - unready_hosts.erase(it); - } - } - }; - - /// Wait until all hosts are ready or an error happens or time is out. - std::atomic watch_set = false; - std::condition_variable watch_triggered_event; - - auto watch_callback = [&](const Coordination::WatchResponse &) - { - watch_set = false; /// After it's triggered it's not set until we call getChildrenWatch() again. - watch_triggered_event.notify_all(); - }; - - auto watch_triggered = [&] { return !watch_set; }; - - bool use_timeout = timeout_ms.has_value(); - std::chrono::milliseconds timeout{timeout_ms.value_or(0)}; - std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); - std::chrono::steady_clock::duration elapsed; - std::mutex dummy_mutex; - String previous_unready_host; - - while (!unready_hosts.empty() && !error) - { - watch_set = true; - Strings nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback); - process_zk_nodes(nodes); - - if (!unready_hosts.empty() && !error) - { - const auto & unready_host = unready_hosts.begin()->first; - if (unready_host != previous_unready_host) - { - LOG_TRACE(log, "Waiting for host {}", unready_host); - previous_unready_host = unready_host; - } - - std::unique_lock dummy_lock{dummy_mutex}; - if (use_timeout) - { - elapsed = std::chrono::steady_clock::now() - start_time; - if ((elapsed > timeout) || !watch_triggered_event.wait_for(dummy_lock, timeout - elapsed, watch_triggered)) - break; - } - else - watch_triggered_event.wait(dummy_lock, watch_triggered); - } - } - - if (watch_set) - { - /// Remove watch by triggering it. - zookeeper->create(zookeeper_path + "/remove_watch-", "", zkutil::CreateMode::EphemeralSequential); - std::unique_lock dummy_lock{dummy_mutex}; - watch_triggered_event.wait(dummy_lock, watch_triggered); - } - - if (error) - error->rethrow(); - - if (!unready_hosts.empty()) - { - throw Exception( - ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Waited for host {} too long ({})", - unready_hosts.begin()->first, - to_string(elapsed)); - } - - return ready_hosts_results; -} - -} diff --git a/src/Backups/BackupCoordinationStatusSync.h b/src/Backups/BackupCoordinationStatusSync.h deleted file mode 100644 index fc03e8ec81c..00000000000 --- a/src/Backups/BackupCoordinationStatusSync.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ - -/// Used to coordinate hosts so all hosts would come to a specific status at around the same time. 
-class BackupCoordinationStatusSync -{ -public: - BackupCoordinationStatusSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_); - - /// Sets the status of the current host and signal other hosts if there were other hosts waiting for that. - void set(const String & current_host, const String & new_status, const String & message); - void setError(const String & current_host, const Exception & exception); - - /// Sets the status of the current host and waits until all hosts come to the same status. - /// The function returns the messages all hosts set when they come to the required status. - Strings wait(const Strings & all_hosts, const String & status_to_wait); - - /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time. - Strings waitFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms); - - static constexpr const char * kErrorStatus = "error"; - -private: - void createRootNodes(); - Strings waitImpl(const Strings & all_hosts, const String & status_to_wait, std::optional timeout_ms); - - String zookeeper_path; - zkutil::GetZooKeeper get_zookeeper; - Poco::Logger * log; -}; - -} diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index d2e4b1f8c4b..3cd9649de61 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -34,16 +34,21 @@ namespace ErrorCodes namespace { /// Finding all tables and databases which we're going to put to the backup and collecting their metadata. - constexpr const char * kGatheringMetadataStatus = "gathering metadata"; + constexpr const char * kGatheringMetadataStage = "gathering metadata"; + + String formatGatheringMetadataStage(size_t pass) + { + return fmt::format("{} ({})", kGatheringMetadataStage, pass); + } /// Making temporary hard links and prepare backup entries. - constexpr const char * kExtractingDataFromTablesStatus = "extracting data from tables"; + constexpr const char * kExtractingDataFromTablesStage = "extracting data from tables"; /// Running special tasks for replicated tables which can also prepare some backup entries. - constexpr const char * kRunningPostTasksStatus = "running post-tasks"; + constexpr const char * kRunningPostTasksStage = "running post-tasks"; /// Writing backup entries to the backup and removing temporary hard links. - constexpr const char * kWritingBackupStatus = "writing backup"; + constexpr const char * kWritingBackupStage = "writing backup"; /// Uppercases the first character of a passed string. String toUpperFirst(const String & str) @@ -90,7 +95,8 @@ BackupEntriesCollector::BackupEntriesCollector( , backup_settings(backup_settings_) , backup_coordination(backup_coordination_) , context(context_) - , consistent_metadata_snapshot_timeout(context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 300000)) + , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000)) + , consistent_metadata_snapshot_timeout(context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000)) , log(&Poco::Logger::get("BackupEntriesCollector")) { } @@ -100,7 +106,7 @@ BackupEntriesCollector::~BackupEntriesCollector() = default; BackupEntries BackupEntriesCollector::run() { /// run() can be called onle once. 
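Because the stage string now carries the pass number, every metadata-consistency pass becomes its own synchronization point instead of all passes reusing one name. A small illustration of the values produced by the helper above; the timeouts are the defaults read in the constructor, and the branching on them is visible in setStage() just below.

    formatGatheringMetadataStage(1);   // "gathering metadata (1)" - first ON CLUSTER sync,
                                       //   bounded by backups.on_cluster_first_sync_timeout (180000 ms by default)
    formatGatheringMetadataStage(2);   // "gathering metadata (2)" - subsequent passes, bounded by the
                                       //   consistent_metadata_snapshot deadline (600000 ms by default)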
- if (!current_status.empty()) + if (!current_stage.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Already making backup entries"); /// Find other hosts working along with us to execute this ON CLUSTER query. @@ -123,36 +129,40 @@ BackupEntries BackupEntriesCollector::run() makeBackupEntriesForTablesDefs(); /// Make backup entries for the data of the found tables. - setStatus(kExtractingDataFromTablesStatus); + setStage(kExtractingDataFromTablesStage); makeBackupEntriesForTablesData(); /// Run all the tasks added with addPostCollectingTask(). - setStatus(kRunningPostTasksStatus); + setStage(kRunningPostTasksStage); runPostTasks(); /// No more backup entries or tasks are allowed after this point. - setStatus(kWritingBackupStatus); + setStage(kWritingBackupStage); return std::move(backup_entries); } -Strings BackupEntriesCollector::setStatus(const String & new_status, const String & message) +Strings BackupEntriesCollector::setStage(const String & new_stage, const String & message) { - LOG_TRACE(log, "{}", toUpperFirst(new_status)); - current_status = new_status; + LOG_TRACE(log, "{}", toUpperFirst(new_stage)); + current_stage = new_stage; - backup_coordination->setStatus(backup_settings.host_id, new_status, message); + backup_coordination->setStage(backup_settings.host_id, new_stage, message); - if (new_status.starts_with(kGatheringMetadataStatus)) + if (new_stage == formatGatheringMetadataStage(1)) { - auto now = std::chrono::steady_clock::now(); - auto end_of_timeout = std::max(now, consistent_metadata_snapshot_start_time + consistent_metadata_snapshot_timeout); - return backup_coordination->waitStatusFor( - all_hosts, new_status, std::chrono::duration_cast(end_of_timeout - now).count()); + return backup_coordination->waitForStage(all_hosts, new_stage, on_cluster_first_sync_timeout); + } + else if (new_stage.starts_with(kGatheringMetadataStage)) + { + auto current_time = std::chrono::steady_clock::now(); + auto end_of_timeout = std::max(current_time, consistent_metadata_snapshot_end_time); + return backup_coordination->waitForStage( + all_hosts, new_stage, std::chrono::duration_cast(end_of_timeout - current_time)); } else { - return backup_coordination->waitStatus(all_hosts, new_status); + return backup_coordination->waitForStage(all_hosts, new_stage); } } @@ -173,18 +183,18 @@ void BackupEntriesCollector::calculateRootPathInBackup() /// Finds databases and tables which we will put to the backup. void BackupEntriesCollector::gatherMetadataAndCheckConsistency() { - consistent_metadata_snapshot_start_time = std::chrono::steady_clock::now(); - auto end_of_timeout = consistent_metadata_snapshot_start_time + consistent_metadata_snapshot_timeout; - setStatus(fmt::format("{} ({})", kGatheringMetadataStatus, 1)); + setStage(formatGatheringMetadataStage(1)); + + consistent_metadata_snapshot_end_time = std::chrono::steady_clock::now() + consistent_metadata_snapshot_timeout; for (size_t pass = 1;; ++pass) { - String new_status = fmt::format("{} ({})", kGatheringMetadataStatus, pass + 1); + String next_stage = formatGatheringMetadataStage(pass + 1); std::optional inconsistency_error; if (tryGatherMetadataAndCompareWithPrevious(inconsistency_error)) { /// Gathered metadata and checked consistency, cool! But we have to check that other hosts cope with that too. 
- auto all_hosts_results = setStatus(new_status, "consistent"); + auto all_hosts_results = setStage(next_stage, "consistent"); std::optional host_with_inconsistency; std::optional inconsistency_error_on_other_host; @@ -210,13 +220,13 @@ void BackupEntriesCollector::gatherMetadataAndCheckConsistency() else { /// Failed to gather metadata or something wasn't consistent. We'll let other hosts know that and try again. - setStatus(new_status, inconsistency_error->displayText()); + setStage(next_stage, inconsistency_error->displayText()); } /// Two passes is minimum (we need to compare with table names with previous ones to be sure we don't miss anything). if (pass >= 2) { - if (std::chrono::steady_clock::now() > end_of_timeout) + if (std::chrono::steady_clock::now() > consistent_metadata_snapshot_end_time) inconsistency_error->rethrow(); else LOG_WARNING(log, "{}", inconsistency_error->displayText()); @@ -713,7 +723,7 @@ void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableN void BackupEntriesCollector::addBackupEntry(const String & file_name, BackupEntryPtr backup_entry) { - if (current_status == kWritingBackupStatus) + if (current_stage == kWritingBackupStage) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed"); backup_entries.emplace_back(file_name, backup_entry); } @@ -725,21 +735,21 @@ void BackupEntriesCollector::addBackupEntry(const std::pair task) { - if (current_status == kWritingBackupStatus) + if (current_stage == kWritingBackupStage) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of post tasks is not allowed"); post_tasks.push(std::move(task)); } diff --git a/src/Backups/BackupEntriesCollector.h b/src/Backups/BackupEntriesCollector.h index 03710605654..c42b5aedad4 100644 --- a/src/Backups/BackupEntriesCollector.h +++ b/src/Backups/BackupEntriesCollector.h @@ -86,12 +86,13 @@ private: void runPostTasks(); - Strings setStatus(const String & new_status, const String & message = ""); + Strings setStage(const String & new_stage, const String & message = ""); const ASTBackupQuery::Elements backup_query_elements; const BackupSettings backup_settings; std::shared_ptr backup_coordination; ContextPtr context; + std::chrono::milliseconds on_cluster_first_sync_timeout; std::chrono::milliseconds consistent_metadata_snapshot_timeout; Poco::Logger * log; @@ -129,8 +130,8 @@ private: std::optional partitions; }; - String current_status; - std::chrono::steady_clock::time_point consistent_metadata_snapshot_start_time; + String current_stage; + std::chrono::steady_clock::time_point consistent_metadata_snapshot_end_time; std::unordered_map database_infos; std::unordered_map table_infos; std::vector> previous_databases_metadata; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 09614886f06..47e1bac3200 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -18,37 +18,86 @@ #include #include #include -#include #include namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - namespace { /// Coordination status meaning that a host finished its work. - constexpr const char * kCompletedCoordinationStatus = "completed"; + constexpr const char * kCompletedStage = "completed"; - /// Sends information about the current exception to IBackupCoordination or IRestoreCoordination. 
- template - void sendErrorToCoordination(std::shared_ptr coordination, const String & current_host) + std::shared_ptr makeBackupCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup) + { + if (!coordination_zk_path.empty()) + { + auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }; + return std::make_shared(coordination_zk_path, get_zookeeper, !is_internal_backup); + } + else + { + return std::make_shared(); + } + } + + std::shared_ptr makeRestoreCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup) + { + if (!coordination_zk_path.empty()) + { + auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }; + return std::make_shared(coordination_zk_path, get_zookeeper, !is_internal_backup); + } + else + { + return std::make_shared(); + } + } + + /// Sends information about an exception to IBackupCoordination or IRestoreCoordination. + template + void sendExceptionToCoordination(std::shared_ptr coordination, const String & current_host, const Exception & exception) { - if (!coordination) - return; try { - coordination->setErrorStatus(current_host, Exception{getCurrentExceptionCode(), getCurrentExceptionMessage(true, true)}); + if (coordination) + coordination->setError(current_host, exception); } catch (...) { } } + + /// Sends information about the current exception to IBackupCoordination or IRestoreCoordination. + template + void sendCurrentExceptionToCoordination(std::shared_ptr coordination, const String & current_host) + { + try + { + throw; + } + catch (const Exception & e) + { + sendExceptionToCoordination(coordination, current_host, e); + } + catch (...) + { + coordination->setError(current_host, Exception{getCurrentExceptionCode(), getCurrentExceptionMessage(true, true)}); + } + } + + /// Used to change num_active_backups. + size_t getNumActiveBackupsChange(BackupStatus status) + { + return status == BackupStatus::MAKING_BACKUP; + } + + /// Used to change num_active_restores. + size_t getNumActiveRestoresChange(BackupStatus status) + { + return status == BackupStatus::RESTORING; + } } @@ -60,6 +109,7 @@ BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threa /// We set max_free_threads = 0 because we don't want to keep any threads if there is no BACKUP or RESTORE query running right now. } + UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) { const ASTBackupQuery & backup_query = typeid_cast(*backup_or_restore_query); @@ -74,308 +124,359 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c { auto backup_query = std::static_pointer_cast(query->clone()); auto backup_settings = BackupSettings::fromBackupQuery(*backup_query); - auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); - bool on_cluster = !backup_query->cluster.empty(); if (!backup_settings.backup_uuid) backup_settings.backup_uuid = UUIDHelpers::generateV4(); UUID backup_uuid = *backup_settings.backup_uuid; - /// Prepare context to use. 
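A note on the two factories introduced above: the choice between ZooKeeper-backed and purely in-process coordination is driven only by whether coordination_zk_path is set, and the `!is_internal_backup` argument presumably plays the same role as remove_zk_nodes_in_destructor_ on the restore side, i.e. only the initiator of an ON CLUSTER operation cleans up the coordination subtree. A hedged sketch with an invented path:

    // ON CLUSTER initiator (internal = false): remote coordination that will also remove
    // "/clickhouse/backups/backup-<uuid>" when it is destroyed.
    auto remote = makeBackupCoordination("/clickhouse/backups/backup-<uuid>", context, /* is_internal_backup = */ false);

    // Plain local BACKUP query: no ZooKeeper involved at all.
    auto local = makeBackupCoordination("", context, /* is_internal_backup = */ false);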
- ContextPtr context_in_use = context; - ContextMutablePtr mutable_context; - if (on_cluster || backup_settings.async) + std::shared_ptr backup_coordination; + if (!backup_settings.coordination_zk_path.empty()) + backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal); + + try { - /// For ON CLUSTER queries we will need to change some settings. - /// For ASYNC queries we have to clone the context anyway. - context_in_use = mutable_context = Context::createCopy(context); + auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); + addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP, backup_settings.internal); + + /// Prepare context to use. + ContextPtr context_in_use = context; + ContextMutablePtr mutable_context; + bool on_cluster = !backup_query->cluster.empty(); + if (on_cluster || backup_settings.async) + { + /// For ON CLUSTER queries we will need to change some settings. + /// For ASYNC queries we have to clone the context anyway. + context_in_use = mutable_context = Context::createCopy(context); + } + + if (backup_settings.async) + { + backups_thread_pool.scheduleOrThrowOnError( + [this, backup_uuid, backup_query, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context] { + doBackup( + backup_uuid, + backup_query, + backup_settings, + backup_info, + backup_coordination, + context_in_use, + mutable_context, + true); + }); + } + else + { + doBackup(backup_uuid, backup_query, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context, false); + } + + return backup_uuid; } - - addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP, backup_settings.internal); - - auto job = [this, - backup_uuid, - backup_query, - backup_settings, - backup_info, - on_cluster, - context_in_use, - mutable_context](bool async) mutable + catch (...) { - std::optional query_scope; - std::shared_ptr backup_coordination; - SCOPE_EXIT_SAFE(if (backup_coordination && !backup_settings.internal) backup_coordination->drop();); + /// Something bad happened, the backup has not built. + setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); + sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); + throw; + } +} - try + +void BackupsWorker::doBackup( + const UUID & backup_uuid, + const std::shared_ptr & backup_query, + BackupSettings backup_settings, + const BackupInfo & backup_info, + std::shared_ptr backup_coordination, + const ContextPtr & context, + ContextMutablePtr mutable_context, + bool called_async) +{ + std::optional query_scope; + try + { + if (called_async) { - if (async) - { - query_scope.emplace(mutable_context); - setThreadName("BackupWorker"); - } - - /// Checks access rights if this is not ON CLUSTER query. - /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.) 
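One detail worth spelling out in the restructured startMakingBackup(): on the async path everything the background job needs is captured by value, so the shared_ptr to the coordination object, the cloned contexts and the copied settings stay alive after the function has already returned the UUID to the client. A simplified sketch of that shape (same names as in the patch):

    backups_thread_pool.scheduleOrThrowOnError(
        [this, backup_uuid, backup_query, backup_settings, backup_info,
         backup_coordination, context_in_use, mutable_context]
        {
            doBackup(backup_uuid, backup_query, backup_settings, backup_info,
                     backup_coordination, context_in_use, mutable_context, /* called_async = */ true);
        });
    return backup_uuid;   // the query returns immediately; progress is visible through the infos map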
- auto required_access = getRequiredAccessToBackup(backup_query->elements); - if (!on_cluster) - context_in_use->checkAccess(required_access); - - ClusterPtr cluster; - if (on_cluster) - { - backup_query->cluster = context_in_use->getMacros()->expand(backup_query->cluster); - cluster = context_in_use->getCluster(backup_query->cluster); - backup_settings.cluster_host_ids = cluster->getHostIDs(); - if (backup_settings.coordination_zk_path.empty()) - { - String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid); - } - } - - /// Make a backup coordination. - if (!backup_settings.coordination_zk_path.empty()) - { - backup_coordination = std::make_shared( - backup_settings.coordination_zk_path, - [global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); }); - } - else - { - backup_coordination = std::make_shared(); - } - - /// Opens a backup for writing. - BackupFactory::CreateParams backup_create_params; - backup_create_params.open_mode = IBackup::OpenMode::WRITE; - backup_create_params.context = context_in_use; - backup_create_params.backup_info = backup_info; - backup_create_params.base_backup_info = backup_settings.base_backup_info; - backup_create_params.compression_method = backup_settings.compression_method; - backup_create_params.compression_level = backup_settings.compression_level; - backup_create_params.password = backup_settings.password; - backup_create_params.is_internal_backup = backup_settings.internal; - backup_create_params.backup_coordination = backup_coordination; - backup_create_params.backup_uuid = backup_uuid; - BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params); - - /// Write the backup. - if (on_cluster) - { - DDLQueryOnClusterParams params; - params.cluster = cluster; - params.only_shard_num = backup_settings.shard_num; - params.only_replica_num = backup_settings.replica_num; - params.access_to_check = required_access; - backup_settings.copySettingsToQuery(*backup_query); - - // executeDDLQueryOnCluster() will return without waiting for completion - mutable_context->setSetting("distributed_ddl_task_timeout", Field{0}); - mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"}); - executeDDLQueryOnCluster(backup_query, mutable_context, params); - - /// Wait until all the hosts have written their backup entries. - auto all_hosts = BackupSettings::Util::filterHostIDs( - backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num); - backup_coordination->waitStatus(all_hosts, kCompletedCoordinationStatus); - } - else - { - backup_query->setCurrentDatabase(context_in_use->getCurrentDatabase()); - - /// Prepare backup entries. - BackupEntries backup_entries; - { - BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, context_in_use}; - backup_entries = backup_entries_collector.run(); - } - - /// Write the backup entries to the backup. - writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool); - - /// We have written our backup entries, we need to tell other hosts (they could be waiting for it). - backup_coordination->setStatus(backup_settings.host_id, kCompletedCoordinationStatus, ""); - } - - /// Finalize backup (write its metadata). - if (!backup_settings.internal) - backup->finalizeWriting(); - - /// Close the backup. 
- backup.reset(); - - setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE); + query_scope.emplace(mutable_context); + setThreadName("BackupWorker"); } - catch (...) + + bool on_cluster = !backup_query->cluster.empty(); + assert(mutable_context || (!on_cluster && !called_async)); + + /// Checks access rights if this is not ON CLUSTER query. + /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.) + auto required_access = getRequiredAccessToBackup(backup_query->elements); + if (!on_cluster) + context->checkAccess(required_access); + + ClusterPtr cluster; + if (on_cluster) { - /// Something bad happened, the backup has not built. + backup_query->cluster = context->getMacros()->expand(backup_query->cluster); + cluster = context->getCluster(backup_query->cluster); + backup_settings.cluster_host_ids = cluster->getHostIDs(); + if (backup_settings.coordination_zk_path.empty()) + { + String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); + backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid); + } + } + + /// Make a backup coordination. + if (!backup_coordination) + backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal); + + /// Opens a backup for writing. + BackupFactory::CreateParams backup_create_params; + backup_create_params.open_mode = IBackup::OpenMode::WRITE; + backup_create_params.context = context; + backup_create_params.backup_info = backup_info; + backup_create_params.base_backup_info = backup_settings.base_backup_info; + backup_create_params.compression_method = backup_settings.compression_method; + backup_create_params.compression_level = backup_settings.compression_level; + backup_create_params.password = backup_settings.password; + backup_create_params.is_internal_backup = backup_settings.internal; + backup_create_params.backup_coordination = backup_coordination; + backup_create_params.backup_uuid = backup_uuid; + BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params); + + /// Write the backup. + if (on_cluster) + { + DDLQueryOnClusterParams params; + params.cluster = cluster; + params.only_shard_num = backup_settings.shard_num; + params.only_replica_num = backup_settings.replica_num; + params.access_to_check = required_access; + backup_settings.copySettingsToQuery(*backup_query); + + // executeDDLQueryOnCluster() will return without waiting for completion + mutable_context->setSetting("distributed_ddl_task_timeout", Field{0}); + mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"}); + executeDDLQueryOnCluster(backup_query, mutable_context, params); + + /// Wait until all the hosts have written their backup entries. + auto all_hosts = BackupSettings::Util::filterHostIDs( + backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num); + backup_coordination->waitForStage(all_hosts, kCompletedStage); + } + else + { + backup_query->setCurrentDatabase(context->getCurrentDatabase()); + + /// Prepare backup entries. + BackupEntries backup_entries; + { + BackupEntriesCollector backup_entries_collector{backup_query->elements, backup_settings, backup_coordination, context}; + backup_entries = backup_entries_collector.run(); + } + + /// Write the backup entries to the backup. 
+ writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool); + + /// We have written our backup entries, we need to tell other hosts (they could be waiting for it). + backup_coordination->setStage(backup_settings.host_id, kCompletedStage, ""); + } + + /// Finalize backup (write its metadata). + if (!backup_settings.internal) + backup->finalizeWriting(); + + /// Close the backup. + backup.reset(); + + LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); + setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE); + } + catch (...) + { + /// Something bad happened, the backup has not built. + if (called_async) + { + tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_info.toString())); setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); - sendErrorToCoordination(backup_coordination, backup_settings.host_id); - if (!async) - throw; + sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); } - }; - - if (backup_settings.async) - backups_thread_pool.scheduleOrThrowOnError([job]() mutable { job(true); }); - else - job(false); - - return backup_uuid; + else + { + /// setStatus() and sendCurrentExceptionToCoordination() will be called by startMakingBackup(). + throw; + } + } } UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) { - UUID restore_uuid = UUIDHelpers::generateV4(); auto restore_query = std::static_pointer_cast(query->clone()); auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); - auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); - bool on_cluster = !restore_query->cluster.empty(); + UUID restore_uuid = UUIDHelpers::generateV4(); - /// Prepare context to use. - ContextMutablePtr context_in_use = context; - if (restore_settings.async || on_cluster) + std::shared_ptr restore_coordination; + if (!restore_settings.coordination_zk_path.empty()) + restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal); + + try { - /// For ON CLUSTER queries we will need to change some settings. - /// For ASYNC queries we have to clone the context anyway. - context_in_use = Context::createCopy(context); + auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); + addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING, restore_settings.internal); + + /// Prepare context to use. + ContextMutablePtr context_in_use = context; + bool on_cluster = !restore_query->cluster.empty(); + if (restore_settings.async || on_cluster) + { + /// For ON CLUSTER queries we will need to change some settings. + /// For ASYNC queries we have to clone the context anyway. 
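Taken together, the ON CLUSTER branch above is a barrier on the "completed" stage. A rough, hypothetical timeline for one initiator and two replicas (not literal code, just the ordering the calls above imply):

    //   initiator                                   each replica (internal query)
    //   ---------                                   -----------------------------
    //   executeDDLQueryOnCluster(backup_query)      doBackup(): collects and writes its own entries
    //   waitForStage(all_hosts, kCompletedStage)    setStage(host_id, kCompletedStage, "")
    //   ...unblocks once every host has reported    (on failure: setError(), which every waiter
    //      the "completed" stage in ZooKeeper...     rethrows instead of returning)
    //   backup->finalizeWriting()                   done (internal backups skip finalizeWriting())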
+ context_in_use = Context::createCopy(context); + } + + if (restore_settings.async) + { + backups_thread_pool.scheduleOrThrowOnError( + [this, restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use] + { doRestore(restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use, true); }); + } + else + { + doRestore(restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use, false); + } + + return restore_uuid; } - - addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING, restore_settings.internal); - - auto job = [this, - restore_uuid, - restore_query, - restore_settings, - backup_info, - on_cluster, - context_in_use](bool async) mutable + catch (...) { - std::optional query_scope; - std::shared_ptr restore_coordination; - SCOPE_EXIT_SAFE(if (restore_coordination && !restore_settings.internal) restore_coordination->drop();); + /// Something bad happened, the backup has not built. + setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); + sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); + throw; + } +} - try + +void BackupsWorker::doRestore( + const UUID & restore_uuid, + const std::shared_ptr & restore_query, + RestoreSettings restore_settings, + const BackupInfo & backup_info, + std::shared_ptr restore_coordination, + ContextMutablePtr context, + bool called_async) +{ + std::optional query_scope; + try + { + if (called_async) { - if (async) - { - query_scope.emplace(context_in_use); - setThreadName("RestoreWorker"); - } - - /// Open the backup for reading. - BackupFactory::CreateParams backup_open_params; - backup_open_params.open_mode = IBackup::OpenMode::READ; - backup_open_params.context = context_in_use; - backup_open_params.backup_info = backup_info; - backup_open_params.base_backup_info = restore_settings.base_backup_info; - backup_open_params.password = restore_settings.password; - BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); - - String current_database = context_in_use->getCurrentDatabase(); - - /// Checks access rights if this is ON CLUSTER query. - /// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.) - ClusterPtr cluster; - if (on_cluster) - { - restore_query->cluster = context_in_use->getMacros()->expand(restore_query->cluster); - cluster = context_in_use->getCluster(restore_query->cluster); - restore_settings.cluster_host_ids = cluster->getHostIDs(); - - /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect - /// because different replicas can contain different set of tables and so the required access rights can differ too. - /// So the right way is pass through the entire cluster and check access for each host. - auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num); - for (const auto * address : addresses) - { - restore_settings.host_id = address->toString(); - auto restore_elements = restore_query->elements; - String addr_database = address->default_database.empty() ? current_database : address->default_database; - for (auto & element : restore_elements) - element.setCurrentDatabase(addr_database); - RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context_in_use}; - dummy_restorer.run(RestorerFromBackup::CHECK_ACCESS_ONLY); - } - } - - /// Make a restore coordination. 
- if (on_cluster && restore_settings.coordination_zk_path.empty()) - { - String root_zk_path = context_in_use->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid); - } - - if (!restore_settings.coordination_zk_path.empty()) - { - restore_coordination = std::make_shared( - restore_settings.coordination_zk_path, - [global_context = context_in_use->getGlobalContext()] { return global_context->getZooKeeper(); }); - } - else - { - restore_coordination = std::make_shared(); - } - - /// Do RESTORE. - if (on_cluster) - { - - DDLQueryOnClusterParams params; - params.cluster = cluster; - params.only_shard_num = restore_settings.shard_num; - params.only_replica_num = restore_settings.replica_num; - restore_settings.copySettingsToQuery(*restore_query); - - // executeDDLQueryOnCluster() will return without waiting for completion - context_in_use->setSetting("distributed_ddl_task_timeout", Field{0}); - context_in_use->setSetting("distributed_ddl_output_mode", Field{"none"}); - - executeDDLQueryOnCluster(restore_query, context_in_use, params); - - /// Wait until all the hosts have written their backup entries. - auto all_hosts = BackupSettings::Util::filterHostIDs( - restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num); - restore_coordination->waitStatus(all_hosts, kCompletedCoordinationStatus); - } - else - { - restore_query->setCurrentDatabase(current_database); - - /// Restore metadata and prepare data restoring tasks. - DataRestoreTasks data_restore_tasks; - { - RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination, - backup, context_in_use}; - data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE); - } - - /// Execute the data restoring tasks. - restoreTablesData(std::move(data_restore_tasks), restores_thread_pool); - - /// We have restored everything, we need to tell other hosts (they could be waiting for it). - restore_coordination->setStatus(restore_settings.host_id, kCompletedCoordinationStatus, ""); - } - - setStatus(restore_uuid, BackupStatus::RESTORED); + query_scope.emplace(context); + setThreadName("RestoreWorker"); } - catch (...) + + /// Open the backup for reading. + BackupFactory::CreateParams backup_open_params; + backup_open_params.open_mode = IBackup::OpenMode::READ; + backup_open_params.context = context; + backup_open_params.backup_info = backup_info; + backup_open_params.base_backup_info = restore_settings.base_backup_info; + backup_open_params.password = restore_settings.password; + BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); + + String current_database = context->getCurrentDatabase(); + + /// Checks access rights if this is ON CLUSTER query. + /// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.) + ClusterPtr cluster; + bool on_cluster = !restore_query->cluster.empty(); + if (on_cluster) { - /// Something bad happened, the backup has not built. + restore_query->cluster = context->getMacros()->expand(restore_query->cluster); + cluster = context->getCluster(restore_query->cluster); + restore_settings.cluster_host_ids = cluster->getHostIDs(); + + /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect + /// because different replicas can contain different set of tables and so the required access rights can differ too. 
+ /// So the right way is pass through the entire cluster and check access for each host. + auto addresses = cluster->filterAddressesByShardOrReplica(restore_settings.shard_num, restore_settings.replica_num); + for (const auto * address : addresses) + { + restore_settings.host_id = address->toString(); + auto restore_elements = restore_query->elements; + String addr_database = address->default_database.empty() ? current_database : address->default_database; + for (auto & element : restore_elements) + element.setCurrentDatabase(addr_database); + RestorerFromBackup dummy_restorer{restore_elements, restore_settings, nullptr, backup, context}; + dummy_restorer.run(RestorerFromBackup::CHECK_ACCESS_ONLY); + } + } + + /// Make a restore coordination. + if (on_cluster && restore_settings.coordination_zk_path.empty()) + { + String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); + restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid); + } + + if (!restore_coordination) + restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal); + + /// Do RESTORE. + if (on_cluster) + { + + DDLQueryOnClusterParams params; + params.cluster = cluster; + params.only_shard_num = restore_settings.shard_num; + params.only_replica_num = restore_settings.replica_num; + restore_settings.copySettingsToQuery(*restore_query); + + // executeDDLQueryOnCluster() will return without waiting for completion + context->setSetting("distributed_ddl_task_timeout", Field{0}); + context->setSetting("distributed_ddl_output_mode", Field{"none"}); + + executeDDLQueryOnCluster(restore_query, context, params); + + /// Wait until all the hosts have written their backup entries. + auto all_hosts = BackupSettings::Util::filterHostIDs( + restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num); + restore_coordination->waitForStage(all_hosts, kCompletedStage); + } + else + { + restore_query->setCurrentDatabase(current_database); + + /// Restore metadata and prepare data restoring tasks. + DataRestoreTasks data_restore_tasks; + { + RestorerFromBackup restorer{restore_query->elements, restore_settings, restore_coordination, + backup, context}; + data_restore_tasks = restorer.run(RestorerFromBackup::RESTORE); + } + + /// Execute the data restoring tasks. + restoreTablesData(std::move(data_restore_tasks), restores_thread_pool); + + /// We have restored everything, we need to tell other hosts (they could be waiting for it). + restore_coordination->setStage(restore_settings.host_id, kCompletedStage, ""); + } + + LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()); + setStatus(restore_uuid, BackupStatus::RESTORED); + } + catch (...) + { + /// Something bad happened, the backup has not built. + if (called_async) + { + tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); - sendErrorToCoordination(restore_coordination, restore_settings.host_id); - if (!async) - throw; + sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); } - }; - - if (restore_settings.async) - backups_thread_pool.scheduleOrThrowOnError([job]() mutable { job(true); }); - else - job(false); - - return restore_uuid; + else + { + /// setStatus() and sendCurrentExceptionToCoordination() will be called by startRestoring(). + throw; + } + } } @@ -387,37 +488,28 @@ void BackupsWorker::addInfo(const UUID & uuid, const String & backup_name, Backu info.status = status; info.status_changed_time = time(nullptr); info.internal = internal; + std::lock_guard lock{infos_mutex}; infos[uuid] = std::move(info); + + num_active_backups += getNumActiveBackupsChange(status); + num_active_restores += getNumActiveRestoresChange(status); } + void BackupsWorker::setStatus(const UUID & uuid, BackupStatus status) { std::lock_guard lock{infos_mutex}; - auto & info = infos.at(uuid); + auto it = infos.find(uuid); + if (it == infos.end()) + return; + + auto & info = it->second; + auto old_status = info.status; info.status = status; info.status_changed_time = time(nullptr); - - if (status == BackupStatus::BACKUP_COMPLETE) - { - LOG_INFO(log, "{} {} was created successfully", (info.internal ? "Internal backup" : "Backup"), info.backup_name); - } - else if (status == BackupStatus::RESTORED) - { - LOG_INFO(log, "Restored from {} {} successfully", (info.internal ? "internal backup" : "backup"), info.backup_name); - } - else if ((status == BackupStatus::FAILED_TO_BACKUP) || (status == BackupStatus::FAILED_TO_RESTORE)) - { - String start_of_message; - if (status == BackupStatus::FAILED_TO_BACKUP) - start_of_message = fmt::format("Failed to create {} {}", (info.internal ? "internal backup" : "backup"), info.backup_name); - else - start_of_message = fmt::format("Failed to restore from {} {}", (info.internal ? 
"internal backup" : "backup"), info.backup_name); - tryLogCurrentException(log, start_of_message); - - info.error_message = getCurrentExceptionMessage(false); - info.exception = std::current_exception(); - } + num_active_backups += getNumActiveBackupsChange(status) - getNumActiveBackupsChange(old_status); + num_active_restores += getNumActiveRestoresChange(status) - getNumActiveRestoresChange(old_status); } @@ -428,7 +520,7 @@ void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_excep { auto it = infos.find(backup_or_restore_uuid); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "BackupsWorker: Unknown UUID {}", toString(backup_or_restore_uuid)); + return true; const auto & info = it->second; auto current_status = info.status; if (rethrow_exception && ((current_status == BackupStatus::FAILED_TO_BACKUP) || (current_status == BackupStatus::FAILED_TO_RESTORE))) @@ -437,12 +529,12 @@ void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_excep }); } -BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid) const +std::optional BackupsWorker::tryGetInfo(const UUID & backup_or_restore_uuid) const { std::lock_guard lock{infos_mutex}; auto it = infos.find(backup_or_restore_uuid); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "BackupsWorker: Unknown UUID {}", toString(backup_or_restore_uuid)); + return std::nullopt; return it->second; } @@ -457,14 +549,15 @@ std::vector BackupsWorker::getAllInfos() const void BackupsWorker::shutdown() { - size_t num_active_backups = backups_thread_pool.active(); - size_t num_active_restores = restores_thread_pool.active(); - if (!num_active_backups && !num_active_restores) - return; - LOG_INFO(log, "Waiting for {} backup and {} restore tasks to be finished", num_active_backups, num_active_restores); + bool has_active_backups_or_restores = (num_active_backups || num_active_restores); + if (has_active_backups_or_restores) + LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores); + backups_thread_pool.wait(); restores_thread_pool.wait(); - LOG_INFO(log, "All backup and restore tasks have finished"); + + if (has_active_backups_or_restores) + LOG_INFO(log, "All backup and restore tasks have finished"); } } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index f546fa2497d..8db9c1367a9 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -11,6 +11,13 @@ namespace Poco::Util { class AbstractConfiguration; } namespace DB { +class ASTBackupQuery; +struct BackupSettings; +struct RestoreSettings; +struct BackupInfo; +class IBackupCoordination; +class IRestoreCoordination; + /// Manager of backups and restores: executes backups and restores' threads in the background. /// Keeps information about backups and restores started in this session. 
class BackupsWorker @@ -47,12 +54,21 @@ public: bool internal = false; }; - Info getInfo(const UUID & backup_or_restore_uuid) const; + std::optional tryGetInfo(const UUID & backup_or_restore_uuid) const; std::vector getAllInfos() const; private: UUID startMakingBackup(const ASTPtr & query, const ContextPtr & context); + + void doBackup(const UUID & backup_uuid, const std::shared_ptr & backup_query, BackupSettings backup_settings, + const BackupInfo & backup_info, std::shared_ptr backup_coordination, const ContextPtr & context, + ContextMutablePtr mutable_context, bool called_async); + UUID startRestoring(const ASTPtr & query, ContextMutablePtr context); + + void doRestore(const UUID & restore_uuid, const std::shared_ptr & restore_query, RestoreSettings restore_settings, + const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, + bool called_async); void addInfo(const UUID & uuid, const String & backup_name, BackupStatus status, bool internal); void setStatus(const UUID & uuid, BackupStatus status); @@ -62,6 +78,8 @@ private: std::unordered_map infos; std::condition_variable status_changed; + std::atomic num_active_backups = 0; + std::atomic num_active_restores = 0; mutable std::mutex infos_mutex; Poco::Logger * log; }; diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h index 7cf43efea74..5e120218544 100644 --- a/src/Backups/IBackupCoordination.h +++ b/src/Backups/IBackupCoordination.h @@ -18,11 +18,11 @@ class IBackupCoordination public: virtual ~IBackupCoordination() = default; - /// Sets the current status and waits for other hosts to come to this status too. - virtual void setStatus(const String & current_host, const String & new_status, const String & message) = 0; - virtual void setErrorStatus(const String & current_host, const Exception & exception) = 0; - virtual Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) = 0; - virtual Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) = 0; + /// Sets the current stage and waits for other hosts to come to this stage too. + virtual void setStage(const String & current_host, const String & new_stage, const String & message) = 0; + virtual void setError(const String & current_host, const Exception & exception) = 0; + virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) = 0; + virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; struct PartNameAndChecksum { @@ -115,9 +115,6 @@ public: /// Returns the list of all the archive suffixes which were generated. virtual Strings getAllArchiveSuffixes() const = 0; - - /// Removes remotely stored information. - virtual void drop() {} }; } diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h index e852fa3c2d4..692054ae267 100644 --- a/src/Backups/IRestoreCoordination.h +++ b/src/Backups/IRestoreCoordination.h @@ -16,11 +16,11 @@ class IRestoreCoordination public: virtual ~IRestoreCoordination() = default; - /// Sets the current status and waits for other hosts to come to this status too. 
- virtual void setStatus(const String & current_host, const String & new_status, const String & message) = 0; - virtual void setErrorStatus(const String & current_host, const Exception & exception) = 0; - virtual Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) = 0; - virtual Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) = 0; + /// Sets the current stage and waits for other hosts to come to this stage too. + virtual void setStage(const String & current_host, const String & new_stage, const String & message) = 0; + virtual void setError(const String & current_host, const Exception & exception) = 0; + virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) = 0; + virtual Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) = 0; static constexpr const char * kErrorStatus = "error"; @@ -34,9 +34,6 @@ public: /// Sets that this replica is going to restore a ReplicatedAccessStorage. /// The function returns false if this access storage is being already restored by another replica. virtual bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) = 0; - - /// Removes remotely stored information. - virtual void drop() {} }; } diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp index deab75dc7de..b2a9849c38d 100644 --- a/src/Backups/RestoreCoordinationLocal.cpp +++ b/src/Backups/RestoreCoordinationLocal.cpp @@ -7,20 +7,20 @@ namespace DB RestoreCoordinationLocal::RestoreCoordinationLocal() = default; RestoreCoordinationLocal::~RestoreCoordinationLocal() = default; -void RestoreCoordinationLocal::setStatus(const String &, const String &, const String &) +void RestoreCoordinationLocal::setStage(const String &, const String &, const String &) { } -void RestoreCoordinationLocal::setErrorStatus(const String &, const Exception &) +void RestoreCoordinationLocal::setError(const String &, const Exception &) { } -Strings RestoreCoordinationLocal::waitStatus(const Strings &, const String &) +Strings RestoreCoordinationLocal::waitForStage(const Strings &, const String &) { return {}; } -Strings RestoreCoordinationLocal::waitStatusFor(const Strings &, const String &, UInt64) +Strings RestoreCoordinationLocal::waitForStage(const Strings &, const String &, std::chrono::milliseconds) { return {}; } diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h index d8b0052cbd2..b4e70d83b72 100644 --- a/src/Backups/RestoreCoordinationLocal.h +++ b/src/Backups/RestoreCoordinationLocal.h @@ -18,11 +18,11 @@ public: RestoreCoordinationLocal(); ~RestoreCoordinationLocal() override; - /// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts. - void setStatus(const String & current_host, const String & new_status, const String & message) override; - void setErrorStatus(const String & current_host, const Exception & exception) override; - Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override; - Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override; + /// Sets the current stage and waits for other hosts to come to this stage too. 
+ void setStage(const String & current_host, const String & new_stage, const String & message) override; + void setError(const String & current_host, const Exception & exception) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override; /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override; diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index 86c8ca6b509..fcc6a2a24b3 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -6,15 +6,27 @@ namespace DB { -RestoreCoordinationRemote::RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_) +RestoreCoordinationRemote::RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_) : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) - , status_sync(zookeeper_path_ + "/status", get_zookeeper_, &Poco::Logger::get("RestoreCoordination")) + , remove_zk_nodes_in_destructor(remove_zk_nodes_in_destructor_) + , stage_sync(zookeeper_path_ + "/stage", get_zookeeper_, &Poco::Logger::get("RestoreCoordination")) { createRootNodes(); } -RestoreCoordinationRemote::~RestoreCoordinationRemote() = default; +RestoreCoordinationRemote::~RestoreCoordinationRemote() +{ + try + { + if (remove_zk_nodes_in_destructor) + removeAllNodes(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} void RestoreCoordinationRemote::createRootNodes() { @@ -27,24 +39,24 @@ void RestoreCoordinationRemote::createRootNodes() } -void RestoreCoordinationRemote::setStatus(const String & current_host, const String & new_status, const String & message) +void RestoreCoordinationRemote::setStage(const String & current_host, const String & new_stage, const String & message) { - status_sync.set(current_host, new_status, message); + stage_sync.set(current_host, new_stage, message); } -void RestoreCoordinationRemote::setErrorStatus(const String & current_host, const Exception & exception) +void RestoreCoordinationRemote::setError(const String & current_host, const Exception & exception) { - status_sync.setError(current_host, exception); + stage_sync.setError(current_host, exception); } -Strings RestoreCoordinationRemote::waitStatus(const Strings & all_hosts, const String & status_to_wait) +Strings RestoreCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait) { - return status_sync.wait(all_hosts, status_to_wait); + return stage_sync.wait(all_hosts, stage_to_wait); } -Strings RestoreCoordinationRemote::waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) +Strings RestoreCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) { - return status_sync.waitFor(all_hosts, status_to_wait, timeout_ms); + return stage_sync.waitFor(all_hosts, stage_to_wait, timeout); } @@ -93,9 +105,4 @@ void RestoreCoordinationRemote::removeAllNodes() zookeeper->removeRecursive(zookeeper_path); } -void RestoreCoordinationRemote::drop() -{ - removeAllNodes(); -} - } diff --git 
a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationRemote.h index 883ea953efc..0cbbb6622ad 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationRemote.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace DB @@ -11,14 +11,14 @@ namespace DB class RestoreCoordinationRemote : public IRestoreCoordination { public: - RestoreCoordinationRemote(const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper); + RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_); ~RestoreCoordinationRemote() override; - /// Sets the current status and waits for other hosts to come to this status too. If status starts with "error:" it'll stop waiting on all the hosts. - void setStatus(const String & current_host, const String & new_status, const String & message) override; - void setErrorStatus(const String & current_host, const Exception & exception) override; - Strings waitStatus(const Strings & all_hosts, const String & status_to_wait) override; - Strings waitStatusFor(const Strings & all_hosts, const String & status_to_wait, UInt64 timeout_ms) override; + /// Sets the current stage and waits for other hosts to come to this stage too. + void setStage(const String & current_host, const String & new_stage, const String & message) override; + void setError(const String & current_host, const Exception & exception) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait) override; + Strings waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) override; /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override; @@ -31,9 +31,6 @@ public: /// The function returns false if this access storage is being already restored by another replica. bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override; - /// Removes remotely stored information. - void drop() override; - private: void createRootNodes(); void removeAllNodes(); @@ -42,7 +39,9 @@ private: const String zookeeper_path; const zkutil::GetZooKeeper get_zookeeper; - BackupCoordinationStatusSync status_sync; + const bool remove_zk_nodes_in_destructor; + + BackupCoordinationStageSync stage_sync; }; } diff --git a/src/Backups/RestorerFromBackup.cpp b/src/Backups/RestorerFromBackup.cpp index b67cdf9c4dd..5e43d59ae56 100644 --- a/src/Backups/RestorerFromBackup.cpp +++ b/src/Backups/RestorerFromBackup.cpp @@ -41,16 +41,16 @@ namespace ErrorCodes namespace { /// Finding databases and tables in the backup which we're going to restore. - constexpr const char * kFindingTablesInBackupStatus = "finding tables in backup"; + constexpr const char * kFindingTablesInBackupStage = "finding tables in backup"; /// Creating databases or finding them and checking their definitions. - constexpr const char * kCreatingDatabasesStatus = "creating databases"; + constexpr const char * kCreatingDatabasesStage = "creating databases"; /// Creating tables or finding them and checking their definition. - constexpr const char * kCreatingTablesStatus = "creating tables"; + constexpr const char * kCreatingTablesStage = "creating tables"; /// Inserting restored data to tables. 
- constexpr const char * kInsertingDataToTablesStatus = "inserting data to tables"; + constexpr const char * kInsertingDataToTablesStage = "inserting data to tables"; /// Uppercases the first character of a passed string. String toUpperFirst(const String & str) @@ -102,6 +102,7 @@ RestorerFromBackup::RestorerFromBackup( , restore_coordination(restore_coordination_) , backup(backup_) , context(context_) + , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000)) , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000)) , log(&Poco::Logger::get("RestorerFromBackup")) { @@ -112,7 +113,7 @@ RestorerFromBackup::~RestorerFromBackup() = default; RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode) { /// run() can be called onle once. - if (!current_status.empty()) + if (!current_stage.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring"); /// Find other hosts working along with us to execute this ON CLUSTER query. @@ -126,7 +127,7 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode) findRootPathsInBackup(); /// Find all the databases and tables which we will read from the backup. - setStatus(kFindingTablesInBackupStatus); + setStage(kFindingTablesInBackupStage); findDatabasesAndTablesInBackup(); /// Check access rights. @@ -136,27 +137,31 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode) return {}; /// Create databases using the create queries read from the backup. - setStatus(kCreatingDatabasesStatus); + setStage(kCreatingDatabasesStage); createDatabases(); /// Create tables using the create queries read from the backup. - setStatus(kCreatingTablesStatus); + setStage(kCreatingTablesStage); createTables(); /// All what's left is to insert data to tables. /// No more data restoring tasks are allowed after this point. 
- setStatus(kInsertingDataToTablesStatus); + setStage(kInsertingDataToTablesStage); return getDataRestoreTasks(); } -void RestorerFromBackup::setStatus(const String & new_status, const String & message) +void RestorerFromBackup::setStage(const String & new_stage, const String & message) { - LOG_TRACE(log, "{}", toUpperFirst(new_status)); - current_status = new_status; + LOG_TRACE(log, "{}", toUpperFirst(new_stage)); + current_stage = new_stage; + if (restore_coordination) { - restore_coordination->setStatus(restore_settings.host_id, new_status, message); - restore_coordination->waitStatus(all_hosts, new_status); + restore_coordination->setStage(restore_settings.host_id, new_stage, message); + if (new_stage == kFindingTablesInBackupStage) + restore_coordination->waitForStage(all_hosts, new_stage, on_cluster_first_sync_timeout); + else + restore_coordination->waitForStage(all_hosts, new_stage); } } @@ -814,14 +819,14 @@ std::vector RestorerFromBackup::findTablesWithoutDependencie void RestorerFromBackup::addDataRestoreTask(DataRestoreTask && new_task) { - if (current_status == kInsertingDataToTablesStatus) + if (current_stage == kInsertingDataToTablesStage) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed"); data_restore_tasks.push_back(std::move(new_task)); } void RestorerFromBackup::addDataRestoreTasks(DataRestoreTasks && new_tasks) { - if (current_status == kInsertingDataToTablesStatus) + if (current_stage == kInsertingDataToTablesStage) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed"); insertAtEnd(data_restore_tasks, std::move(new_tasks)); } diff --git a/src/Backups/RestorerFromBackup.h b/src/Backups/RestorerFromBackup.h index a53477f6e6d..b081e16e2ce 100644 --- a/src/Backups/RestorerFromBackup.h +++ b/src/Backups/RestorerFromBackup.h @@ -73,6 +73,7 @@ private: std::shared_ptr restore_coordination; BackupPtr backup; ContextMutablePtr context; + std::chrono::milliseconds on_cluster_first_sync_timeout; std::chrono::milliseconds create_table_timeout; Poco::Logger * log; @@ -100,7 +101,7 @@ private: DataRestoreTasks getDataRestoreTasks(); - void setStatus(const String & new_status, const String & message = ""); + void setStage(const String & new_stage, const String & message = ""); struct DatabaseInfo { @@ -124,7 +125,7 @@ private: std::vector findTablesWithoutDependencies() const; - String current_status; + String current_stage; std::unordered_map database_infos; std::map table_infos; std::vector data_restore_tasks; diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index 246d4ba24e9..af3c8df8eef 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -17,20 +17,22 @@ namespace DB namespace { - Block getResultRow(const BackupsWorker::Info & info) + Block getResultRow(const std::optional & info) { - Block res_columns; - auto column_uuid = ColumnUUID::create(); - column_uuid->insert(info.uuid); - res_columns.insert(0, {std::move(column_uuid), std::make_shared(), "uuid"}); - auto column_backup_name = ColumnString::create(); - column_backup_name->insert(info.backup_name); - res_columns.insert(1, {std::move(column_backup_name), std::make_shared(), "backup_name"}); - auto column_status = ColumnInt8::create(); - column_status->insert(static_cast(info.status)); + + if (info) + { + column_uuid->insert(info->uuid); + column_backup_name->insert(info->backup_name); + column_status->insert(static_cast(info->status)); 
+ } + + Block res_columns; + res_columns.insert(0, {std::move(column_uuid), std::make_shared(), "uuid"}); + res_columns.insert(1, {std::move(column_backup_name), std::make_shared(), "backup_name"}); res_columns.insert(2, {std::move(column_status), std::make_shared(getBackupStatusEnumValues()), "status"}); return res_columns; @@ -42,7 +44,7 @@ BlockIO InterpreterBackupQuery::execute() auto & backups_worker = context->getBackupsWorker(); UUID uuid = backups_worker.start(query_ptr, context); BlockIO res_io; - res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.getInfo(uuid)))); + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.tryGetInfo(uuid)))); return res_io; } From 619e22fe573a3f58695efeec888b00735ba994fc Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 21 Jul 2022 11:43:42 +0200 Subject: [PATCH 125/227] Add tests for shutdown during backup. --- .../configs/lesser_timeouts.xml | 8 +++ .../test_backup_restore_on_cluster/test.py | 62 +++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml diff --git a/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml b/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml new file mode 100644 index 00000000000..9caf52fcca4 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml @@ -0,0 +1,8 @@ + + + + 1000 + 10000 + 1000 + + diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 438ab87b5c7..df03ebd320b 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -1,5 +1,6 @@ from time import sleep import pytest +import re import os.path from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV, assert_eq_with_retry @@ -11,6 +12,7 @@ main_configs = [ "configs/remote_servers.xml", "configs/replicated_access_storage.xml", "configs/backups_disk.xml", + "configs/lesser_timeouts.xml", # Default timeouts are quite big (a few minutes), the tests don't need them to be that big. 
] user_configs = [ @@ -33,6 +35,7 @@ node2 = cluster.add_instance( external_dirs=["/backups/"], macros={"replica": "node2", "shard": "shard1"}, with_zookeeper=True, + stay_alive=True, # Necessary for the "test_stop_other_host_while_backup" test ) @@ -763,3 +766,62 @@ def test_mutation(): node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + + +def test_get_error_from_other_host(): + node1.query("CREATE TABLE tbl (`x` UInt8) ENGINE = MergeTree ORDER BY x") + node1.query("INSERT INTO tbl VALUES (3)") + + backup_name = new_backup_name() + expected_error = "Got error from node2.*Table default.tbl was not found" + assert re.search( + expected_error, + node1.query_and_get_error( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}" + ), + ) + + +@pytest.mark.parametrize("kill", [False, True]) +def test_stop_other_host_while_backup(kill): + node1.query( + "CREATE TABLE tbl ON CLUSTER 'cluster' (" + "x UInt8" + ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" + "ORDER BY x" + ) + + node1.query("INSERT INTO tbl VALUES (3)") + node2.query("INSERT INTO tbl VALUES (5)") + + backup_name = new_backup_name() + + id = node1.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} ASYNC" + ).split("\t")[0] + + # If kill=False the pending backup must be completed + # If kill=True the pending backup might be completed or failed + node2.stop_clickhouse(kill=kill) + + assert_eq_with_retry( + node1, + f"SELECT status FROM system.backups WHERE uuid='{id}' AND status == 'MAKING_BACKUP'", + "", + ) + + status = node1.query(f"SELECT status FROM system.backups WHERE uuid='{id}'").strip() + + if kill: + assert status in ["BACKUP_COMPLETE", "FAILED_TO_BACKUP"] + else: + assert status == "BACKUP_COMPLETE" + + node2.start_clickhouse() + + if status == "BACKUP_COMPLETE": + node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5]) + elif status == "FAILED_TO_BACKUP": + assert not os.path.exists(get_path_to_backup(backup_name)) From 708d0eb34cf1f6d3c248b5fb0c7f87b59bdf353a Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 18 Jul 2022 20:07:37 +0200 Subject: [PATCH 126/227] Add concurrent tests for backups. 
--- .../test_concurrency.py | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 tests/integration/test_backup_restore_on_cluster/test_concurrency.py diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py new file mode 100644 index 00000000000..95d4f27e5e8 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -0,0 +1,162 @@ +import pytest +import os.path +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV, assert_eq_with_retry + + +cluster = ClickHouseCluster(__file__) + +num_nodes = 10 + + +def generate_cluster_def(): + path = os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "./_gen/cluster_for_concurrency_test.xml", + ) + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as f: + f.write("\n\t\n\t\t\n\t\t\t\n") + for i in range(num_nodes): + f.write( + f"\t\t\t\t\n\t\t\t\t\tnode{i}\n\t\t\t\t\t9000\n\t\t\t\t\n" + ) + f.write("\t\t\t\n\t\t\n\t\n") + return path + + +main_configs = ["configs/backups_disk.xml", generate_cluster_def()] + + +nodes = [] +for i in range(num_nodes): + nodes.append( + cluster.add_instance( + f"node{i}", + main_configs=main_configs, + external_dirs=["/backups/"], + macros={"replica": f"node{i}", "shard": "shard1"}, + with_zookeeper=True, + ) + ) + +node0 = nodes[0] + + +@pytest.fixture(scope="module", autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(autouse=True) +def drop_after_test(): + try: + yield + finally: + node0.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' NO DELAY") + + +backup_id_counter = 0 + + +def new_backup_name(): + global backup_id_counter + backup_id_counter += 1 + return f"Disk('backups', '{backup_id_counter}')" + + +def create_and_fill_table(): + node0.query( + "CREATE TABLE tbl ON CLUSTER 'cluster' (" + "x Int32" + ") ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')" + "ORDER BY x" + ) + for i in range(num_nodes): + nodes[i].query(f"INSERT INTO tbl VALUES ({i})") + + +expected_sum = num_nodes * (num_nodes - 1) // 2 + + +def test_replicated_table(): + create_and_fill_table() + + backup_name = new_backup_name() + node0.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}") + + node0.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + node0.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + node0.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") + + for i in range(num_nodes): + assert nodes[i].query("SELECT sum(x) FROM tbl") == TSV([expected_sum]) + + +num_concurrent_backups = 4 + + +def test_concurrent_backups_on_same_node(): + create_and_fill_table() + + backup_names = [new_backup_name() for _ in range(num_concurrent_backups)] + + ids = [] + for backup_name in backup_names: + id = node0.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name} ASYNC" + ).split("\t")[0] + ids.append(id) + + ids_list = "[" + ", ".join([f"'{id}'" for id in ids]) + "]" + + assert_eq_with_retry( + node0, + f"SELECT status, error FROM system.backups WHERE status != 'BACKUP_COMPLETE' AND status != 'FAILED_TO_BACKUP' AND uuid IN {ids_list}", + "", + ) + + for backup_name in backup_names: + node0.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + node0.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + node0.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") + for i in 
range(num_nodes): + assert nodes[i].query("SELECT sum(x) FROM tbl") == TSV([expected_sum]) + + +def test_concurrent_backups_on_different_nodes(): + create_and_fill_table() + + backup_names = [new_backup_name() for _ in range(num_concurrent_backups)] + + ids = [] + for i in range(num_concurrent_backups): + id = ( + nodes[i] + .query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_names[i]} ASYNC") + .split("\t")[0] + ) + ids.append(id) + + for i in range(num_concurrent_backups): + assert_eq_with_retry( + nodes[i], + f"SELECT status, error FROM system.backups WHERE status != 'BACKUP_COMPLETE' AND status != 'FAILED_TO_BACKUP' AND uuid = '{ids[i]}'", + "", + ) + + for i in range(num_concurrent_backups): + assert nodes[i].query( + f"SELECT status, error FROM system.backups WHERE uuid = '{ids[i]}'" + ) == TSV([["BACKUP_COMPLETE", ""]]) + + for i in range(num_concurrent_backups): + nodes[i].query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") + nodes[i].query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_names[i]}") + nodes[i].query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") + for j in range(num_nodes): + assert nodes[j].query("SELECT sum(x) FROM tbl") == TSV([expected_sum]) From 96bcae419cad2a957c981d83b7f293dabed4b7bd Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 15 Jul 2022 14:57:58 +0000 Subject: [PATCH 127/227] Cleanup logic around join_algorithm setting --- src/Core/Settings.h | 2 +- src/Core/SettingsEnums.cpp | 3 +- src/Core/SettingsEnums.h | 3 +- src/Interpreters/ExpressionAnalyzer.cpp | 106 ++++++++++-------- src/Interpreters/FullSortingMergeJoin.h | 22 +++- src/Interpreters/HashJoin.cpp | 2 +- src/Interpreters/JoinedTables.cpp | 3 +- src/Interpreters/MergeJoin.cpp | 14 +++ src/Interpreters/MergeJoin.h | 2 + src/Interpreters/TableJoin.cpp | 16 +-- src/Interpreters/TableJoin.h | 26 ++--- src/Interpreters/TreeRewriter.cpp | 5 +- .../0_stateless/02242_join_rocksdb.sql | 8 +- 13 files changed, 120 insertions(+), 92 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9034fb924ba..3f079189491 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -355,7 +355,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, max_bytes_in_join, 0, "Maximum size of the hash table for JOIN (in number of bytes in memory).", 0) \ M(OverflowMode, join_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \ M(Bool, join_any_take_last_row, false, "When disabled (default) ANY JOIN will take the first found row for a key. When enabled, it will take the last row seen if there are multiple rows for the same key.", IMPORTANT) \ - M(JoinAlgorithm, join_algorithm, JoinAlgorithm::HASH, "Specify join algorithm: 'auto', 'hash', 'partial_merge', 'prefer_partial_merge', 'parallel_hash'. 'auto' tries to change HashJoin to MergeJoin on the fly to avoid out of memory.", 0) \ + M(JoinAlgorithm, join_algorithm, JoinAlgorithm::DEFAULT, "Specify join algorithm.", 0) \ M(UInt64, default_max_bytes_in_join, 1000000000, "Maximum size of right-side table if limit is required but max_bytes_in_join is not set.", 0) \ M(UInt64, partial_merge_join_left_table_buffer_bytes, 0, "If not 0 group left table blocks in bigger ones for left-side table in partial merge join. It uses up to 2x of specified memory per joining thread.", 0) \ M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. 
It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 3585cffb8ec..b832096c86c 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -31,7 +31,8 @@ IMPLEMENT_SETTING_ENUM(JoinStrictness, ErrorCodes::UNKNOWN_JOIN, IMPLEMENT_SETTING_MULTI_ENUM(JoinAlgorithm, ErrorCodes::UNKNOWN_JOIN, - {{"auto", JoinAlgorithm::AUTO}, + {{"default", JoinAlgorithm::DEFAULT}, + {"auto", JoinAlgorithm::AUTO}, {"hash", JoinAlgorithm::HASH}, {"partial_merge", JoinAlgorithm::PARTIAL_MERGE}, {"prefer_partial_merge", JoinAlgorithm::PREFER_PARTIAL_MERGE}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 6b9ff8277b1..b8a2bdb48b0 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -38,7 +38,8 @@ DECLARE_SETTING_ENUM(JoinStrictness) enum class JoinAlgorithm { - AUTO = 0, + DEFAULT = 0, + AUTO, HASH, PARTIAL_MERGE, PREFER_PARTIAL_MERGE, diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index a4bdc4ed252..a6e3f88b2b8 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -89,7 +89,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int UNKNOWN_IDENTIFIER; extern const int UNKNOWN_TYPE_OF_AST_NODE; - extern const int UNSUPPORTED_METHOD; } namespace @@ -1079,34 +1078,58 @@ static ActionsDAGPtr createJoinedBlockActions(ContextPtr context, const TableJoi return ExpressionAnalyzer(expression_list, syntax_result, context).getActionsDAG(true, false); } -static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr analyzed_join, const Block & right_sample_block, ContextPtr context) -{ - /// HashJoin with Dictionary optimisation - if (analyzed_join->tryInitDictJoin(right_sample_block, context)) - return std::make_shared(analyzed_join, right_sample_block); +std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block); - bool allow_merge_join = analyzed_join->allowMergeJoin(); - if (analyzed_join->forceHashJoin() || (analyzed_join->preferMergeJoin() && !allow_merge_join)) +static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr analyzed_join, std::unique_ptr & joined_plan, ContextPtr context) +{ + Block right_sample_block = joined_plan->getCurrentDataStream().header; + + if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT)) + { + if (JoinPtr kvjoin = tryKeyValueJoin(analyzed_join, right_sample_block)) + { + /// Do not need to execute plan for right part + joined_plan.reset(); + return kvjoin; + } + + /// It's not a hash join actually, that's why we check JoinAlgorithm::DIRECT + /// It's would be fixed in https://github.com/ClickHouse/ClickHouse/pull/38956 + if (analyzed_join->tryInitDictJoin(right_sample_block, context)) + { + joined_plan.reset(); + return std::make_shared(analyzed_join, right_sample_block); + } + } + + if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::PARTIAL_MERGE) || + analyzed_join->isEnabledAlgorithm(JoinAlgorithm::PREFER_PARTIAL_MERGE)) + { + if (MergeJoin::isSupported(analyzed_join)) + return std::make_shared(analyzed_join, right_sample_block); + } + + if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::HASH) || + /// partial_merge is preferred, but can't be used for specified kind of join, fallback to hash + analyzed_join->isEnabledAlgorithm(JoinAlgorithm::PREFER_PARTIAL_MERGE) || + analyzed_join->isEnabledAlgorithm(JoinAlgorithm::PARALLEL_HASH)) { if 
(analyzed_join->allowParallelHashJoin()) - { return std::make_shared(context, analyzed_join, context->getSettings().max_threads, right_sample_block); - } return std::make_shared(analyzed_join, right_sample_block); } - else if (analyzed_join->forceMergeJoin() || (analyzed_join->preferMergeJoin() && allow_merge_join)) + + if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::FULL_SORTING_MERGE)) { - return std::make_shared(analyzed_join, right_sample_block); + if (FullSortingMergeJoin::isSupported(analyzed_join)) + return std::make_shared(analyzed_join, right_sample_block); } - else if (analyzed_join->forceFullSortingMergeJoin()) - { - if (analyzed_join->getClauses().size() != 1) - throw Exception("Full sorting merge join is supported only for single-condition joins", ErrorCodes::NOT_IMPLEMENTED); - if (analyzed_join->isSpecialStorage()) - throw Exception("Full sorting merge join is not supported for special storage", ErrorCodes::NOT_IMPLEMENTED); - return std::make_shared(analyzed_join, right_sample_block); - } - return std::make_shared(analyzed_join, right_sample_block); + + if (analyzed_join->isEnabledAlgorithm(JoinAlgorithm::AUTO)) + return std::make_shared(analyzed_join, right_sample_block); + + throw Exception("Can't execute any of specified algorithms for specified strictness/kind and right storage type", + ErrorCodes::NOT_IMPLEMENTED); } static std::unique_ptr buildJoinedPlan( @@ -1164,27 +1187,26 @@ static std::unique_ptr buildJoinedPlan( std::shared_ptr tryKeyValueJoin(std::shared_ptr analyzed_join, const Block & right_sample_block) { - auto error_or_null = [&](const String & msg) - { - if (analyzed_join->isForcedAlgorithm(JoinAlgorithm::DIRECT)) - throw DB::Exception(ErrorCodes::UNSUPPORTED_METHOD, "Can't use '{}' join algorithm: {}", JoinAlgorithm::DIRECT, msg); - return nullptr; - }; - - if (!analyzed_join->isAllowedAlgorithm(JoinAlgorithm::DIRECT)) + if (!analyzed_join->isEnabledAlgorithm(JoinAlgorithm::DIRECT)) return nullptr; auto storage = analyzed_join->getStorageKeyValue(); if (!storage) - return error_or_null("unsupported storage"); + { + return nullptr; + } if (!isInnerOrLeft(analyzed_join->kind())) - return error_or_null("illegal kind"); + { + return nullptr; + } if (analyzed_join->strictness() != ASTTableJoin::Strictness::All && analyzed_join->strictness() != ASTTableJoin::Strictness::Any && analyzed_join->strictness() != ASTTableJoin::Strictness::RightAny) - return error_or_null("illegal strictness"); + { + return nullptr; + } const auto & clauses = analyzed_join->getClauses(); bool only_one_key = clauses.size() == 1 && @@ -1194,15 +1216,16 @@ std::shared_ptr tryKeyValueJoin(std::shared_ptr a !clauses[0].on_filter_condition_right; if (!only_one_key) - return error_or_null("multiple keys is not allowed"); + { + return nullptr; + } String key_name = clauses[0].key_names_right[0]; String original_key_name = analyzed_join->getOriginalName(key_name); const auto & storage_primary_key = storage->getPrimaryKey(); if (storage_primary_key.size() != 1 || storage_primary_key[0] != original_key_name) { - return error_or_null(fmt::format("key '{}'{} doesn't match storage '{}'", - key_name, (key_name != original_key_name ? 
" (aka '" + original_key_name + "')" : ""), fmt::join(storage_primary_key, ","))); + return nullptr; } return std::make_shared(analyzed_join, right_sample_block, storage); @@ -1240,18 +1263,7 @@ JoinPtr SelectQueryExpressionAnalyzer::makeJoin( joined_plan->addStep(std::move(converting_step)); } - const Block & right_sample_block = joined_plan->getCurrentDataStream().header; - if (JoinPtr kvjoin = tryKeyValueJoin(analyzed_join, right_sample_block)) - { - joined_plan.reset(); - return kvjoin; - } - - JoinPtr join = chooseJoinAlgorithm(analyzed_join, right_sample_block, getContext()); - - /// Do not make subquery for join over dictionary. - if (analyzed_join->getDictionaryReader()) - joined_plan.reset(); + JoinPtr join = chooseJoinAlgorithm(analyzed_join, joined_plan, getContext()); return join; } diff --git a/src/Interpreters/FullSortingMergeJoin.h b/src/Interpreters/FullSortingMergeJoin.h index 3ee6ce1c1fb..14c81259159 100644 --- a/src/Interpreters/FullSortingMergeJoin.h +++ b/src/Interpreters/FullSortingMergeJoin.h @@ -34,14 +34,26 @@ public: throw Exception(ErrorCodes::LOGICAL_ERROR, "FullSortingMergeJoin::addJoinedBlock should not be called"); } - void checkTypesOfKeys(const Block & left_block) const override + static bool isSupported(const std::shared_ptr & table_join) { - if (table_join->getClauses().size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "FullSortingMergeJoin supports only one join key"); + if (!table_join->oneDisjunct()) + return false; + + bool support_storage = !table_join->isSpecialStorage(); + + const auto & on_expr = table_join->getOnlyClause(); + bool support_conditions = !on_expr.on_filter_condition_left && !on_expr.on_filter_condition_right; /// Key column can change nullability and it's not handled on type conversion stage, so algorithm should be aware of it - if (table_join->hasUsing() && table_join->joinUseNulls()) - throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "FullSortingMergeJoin doesn't support USING with join_use_nulls"); + bool support_using_and_nulls = !table_join->hasUsing() || !table_join->joinUseNulls(); + + return support_conditions && support_using_and_nulls && support_storage; + } + + void checkTypesOfKeys(const Block & left_block) const override + { + if (!isSupported(table_join)) + throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "FullSortingMergeJoin doesn't support specified query"); const auto & onexpr = table_join->getOnlyClause(); for (size_t i = 0; i < onexpr.key_names_left.size(); ++i) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index b54c77b385f..1691f7ec10c 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -718,7 +718,7 @@ void HashJoin::initRightBlockStructure(Block & saved_block_sample) bool multiple_disjuncts = !table_join->oneDisjunct(); /// We could remove key columns for LEFT | INNER HashJoin but we should keep them for JoinSwitcher (if any). 
- bool save_key_columns = !table_join->forceHashJoin() || isRightOrFull(kind) || multiple_disjuncts; + bool save_key_columns = table_join->isEnabledAlgorithm(JoinAlgorithm::AUTO) || isRightOrFull(kind) || multiple_disjuncts; if (save_key_columns) { saved_block_sample = right_table_keys.cloneEmpty(); diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index df47e8acdca..9e15a525cb4 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -311,7 +311,8 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se { table_join->setStorageJoin(storage_join); } - else if (auto storage_dict = std::dynamic_pointer_cast(storage); storage_dict) + else if (auto storage_dict = std::dynamic_pointer_cast(storage); + storage_dict && join_algorithm.isSet(JoinAlgorithm::DIRECT)) { table_join->setStorageJoin(storage_dict); } diff --git a/src/Interpreters/MergeJoin.cpp b/src/Interpreters/MergeJoin.cpp index 1dea769f724..711b71a2b3d 100644 --- a/src/Interpreters/MergeJoin.cpp +++ b/src/Interpreters/MergeJoin.cpp @@ -1135,6 +1135,20 @@ void MergeJoin::addConditionJoinColumn(Block & block, JoinTableSide block_side) } } +bool MergeJoin::isSupported(const std::shared_ptr & table_join) +{ + auto kind = table_join->kind(); + auto strictness = table_join->strictness(); + + bool is_any = (strictness == ASTTableJoin::Strictness::Any); + bool is_all = (strictness == ASTTableJoin::Strictness::All); + bool is_semi = (strictness == ASTTableJoin::Strictness::Semi); + + bool all_join = is_all && (isInner(kind) || isLeft(kind) || isRight(kind) || isFull(kind)); + bool special_left = isInnerOrLeft(kind) && (is_any || is_semi); + + return (all_join || special_left) && table_join->oneDisjunct(); +} MergeJoin::RightBlockInfo::RightBlockInfo(std::shared_ptr block_, size_t block_number_, size_t & skip_, RowBitmaps * bitmaps_) : block(block_) diff --git a/src/Interpreters/MergeJoin.h b/src/Interpreters/MergeJoin.h index ab36599e6f4..3b8ad6063e3 100644 --- a/src/Interpreters/MergeJoin.h +++ b/src/Interpreters/MergeJoin.h @@ -37,6 +37,8 @@ public: std::shared_ptr getNonJoinedBlocks(const Block & left_sample_block, const Block & result_sample_block, UInt64 max_block_size) const override; + static bool isSupported(const std::shared_ptr & table_join); + private: friend class NotJoinedMerge; diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 17869e2084b..029038357c1 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -363,7 +363,7 @@ void TableJoin::addJoinedColumnsAndCorrectTypesImpl(TColumns & left_columns, boo * For `JOIN ON expr1 == expr2` we will infer common type later in makeTableJoin, * when part of plan built and types of expression will be known. 
*/ - inferJoinKeyCommonType(left_columns, columns_from_joined_table, !isSpecialStorage(), forceFullSortingMergeJoin()); + inferJoinKeyCommonType(left_columns, columns_from_joined_table, !isSpecialStorage(), isEnabledAlgorithm(JoinAlgorithm::FULL_SORTING_MERGE)); if (auto it = left_type_map.find(col.name); it != left_type_map.end()) { @@ -409,18 +409,6 @@ bool TableJoin::oneDisjunct() const return clauses.size() == 1; } -bool TableJoin::allowMergeJoin() const -{ - bool is_any = (strictness() == ASTTableJoin::Strictness::Any); - bool is_all = (strictness() == ASTTableJoin::Strictness::All); - bool is_semi = (strictness() == ASTTableJoin::Strictness::Semi); - - bool all_join = is_all && (isInner(kind()) || isLeft(kind()) || isRight(kind()) || isFull(kind())); - bool special_left = isLeft(kind()) && (is_any || is_semi); - - return (all_join || special_left) && oneDisjunct(); -} - bool TableJoin::needStreamWithNonJoinedRows() const { if (strictness() == ASTTableJoin::Strictness::Asof || @@ -511,7 +499,7 @@ TableJoin::createConvertingActions( const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { - inferJoinKeyCommonType(left_sample_columns, right_sample_columns, !isSpecialStorage(), forceFullSortingMergeJoin()); + inferJoinKeyCommonType(left_sample_columns, right_sample_columns, !isSpecialStorage(), isEnabledAlgorithm(JoinAlgorithm::FULL_SORTING_MERGE)); NameToNameMap left_key_column_rename; NameToNameMap right_key_column_rename; diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 4210da6ae76..57895d6d1c1 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -193,24 +193,20 @@ public: bool sameStrictnessAndKind(ASTTableJoin::Strictness, ASTTableJoin::Kind) const; const SizeLimits & sizeLimits() const { return size_limits; } VolumePtr getTemporaryVolume() { return tmp_volume; } - bool allowMergeJoin() const; - bool isAllowedAlgorithm(JoinAlgorithm val) const { return join_algorithm.isSet(val) || join_algorithm.isSet(JoinAlgorithm::AUTO); } - bool isForcedAlgorithm(JoinAlgorithm val) const { return join_algorithm == MultiEnum(val); } - - bool preferMergeJoin() const { return join_algorithm == MultiEnum(JoinAlgorithm::PREFER_PARTIAL_MERGE); } - bool forceMergeJoin() const { return join_algorithm == MultiEnum(JoinAlgorithm::PARTIAL_MERGE); } + bool isEnabledAlgorithm(JoinAlgorithm val) const + { + /// When join_algorithm = 'default' (not specified by user) we use hash or direct algorithm. + /// It's behaviour that was initially supported by clickhouse. 
+ bool is_enbaled_by_default = val == JoinAlgorithm::DEFAULT + || val == JoinAlgorithm::HASH + || val == JoinAlgorithm::DIRECT; + if (join_algorithm.isSet(JoinAlgorithm::DEFAULT) && is_enbaled_by_default) + return true; + return join_algorithm.isSet(val); + } bool allowParallelHashJoin() const; - bool forceFullSortingMergeJoin() const { return !isSpecialStorage() && join_algorithm.isSet(JoinAlgorithm::FULL_SORTING_MERGE); } - - bool forceHashJoin() const - { - /// HashJoin always used for DictJoin - return dictionary_reader - || join_algorithm == MultiEnum(JoinAlgorithm::HASH) - || join_algorithm == MultiEnum(JoinAlgorithm::PARALLEL_HASH); - } bool joinUseNulls() const { return join_use_nulls; } bool forceNullableRight() const { return join_use_nulls && isLeftOrFull(table_join.kind); } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index b389c3eb705..bd18984faed 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -683,7 +684,7 @@ bool tryJoinOnConst(TableJoin & analyzed_join, ASTPtr & on_expression, ContextPt else return false; - if (!analyzed_join.forceHashJoin()) + if (!analyzed_join.isEnabledAlgorithm(JoinAlgorithm::HASH)) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "JOIN ON constant ({}) supported only with join algorithm 'hash'", queryToString(on_expression)); @@ -770,7 +771,7 @@ void collectJoinedColumns(TableJoin & analyzed_join, ASTTableJoin & table_join, data.asofToJoinKeys(); } - if (!analyzed_join.oneDisjunct() && !analyzed_join.forceHashJoin()) + if (!analyzed_join.oneDisjunct() && !analyzed_join.isEnabledAlgorithm(JoinAlgorithm::HASH)) throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "Only `hash` join supports multiple ORs for keys in JOIN ON section"); } } diff --git a/tests/queries/0_stateless/02242_join_rocksdb.sql b/tests/queries/0_stateless/02242_join_rocksdb.sql index 1759311163b..34b3d120eae 100644 --- a/tests/queries/0_stateless/02242_join_rocksdb.sql +++ b/tests/queries/0_stateless/02242_join_rocksdb.sql @@ -47,16 +47,16 @@ SELECT '--- totals'; SELECT rdb.key % 2, sum(k), max(value2) FROM t2 INNER JOIN rdb ON rdb.key == t2.k GROUP BY (rdb.key % 2) WITH TOTALS; SELECT '---'; -SELECT * FROM t1 RIGHT JOIN rdb ON rdb.key == t1.k; -- { serverError UNSUPPORTED_METHOD } +SELECT * FROM t1 RIGHT JOIN rdb ON rdb.key == t1.k; -- { serverError NOT_IMPLEMENTED } SELECT * FROM t1 RIGHT JOIN rdb ON rdb.key == t1.k FORMAT Null SETTINGS join_algorithm = 'direct,hash'; -SELECT * FROM t1 FULL JOIN rdb ON rdb.key == t1.k; -- { serverError UNSUPPORTED_METHOD } +SELECT * FROM t1 FULL JOIN rdb ON rdb.key == t1.k; -- { serverError NOT_IMPLEMENTED } SELECT * FROM t1 FULL JOIN rdb ON rdb.key == t1.k FORMAT Null SETTINGS join_algorithm = 'direct,hash'; -SELECT * FROM t1 INNER JOIN rdb ON rdb.key + 1 == t1.k; -- { serverError UNSUPPORTED_METHOD } +SELECT * FROM t1 INNER JOIN rdb ON rdb.key + 1 == t1.k; -- { serverError NOT_IMPLEMENTED } SELECT * FROM t1 INNER JOIN rdb ON rdb.key + 1 == t1.k FORMAT Null SETTINGS join_algorithm = 'direct,hash'; -SELECT * FROM t1 INNER JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k; -- { serverError UNSUPPORTED_METHOD } +SELECT * FROM t1 INNER JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k; -- { serverError NOT_IMPLEMENTED } SELECT * FROM t1 INNER JOIN (SELECT * FROM rdb) AS rdb ON rdb.key == t1.k FORMAT Null SETTINGS join_algorithm = 'direct,hash'; DROP TABLE IF EXISTS rdb; From 0deae9b4a366f617e64b14c3c1c2c3281fe2d595 
Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 15 Jul 2022 14:58:19 +0000 Subject: [PATCH 128/227] Update doc about join_algorithm --- docs/en/operations/settings/settings.md | 30 +++++++++++++++++++------ 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 9f66d5d29a9..59ac34bd6f1 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -302,18 +302,34 @@ Default value: `ALL`. Specifies [JOIN](../../sql-reference/statements/select/join.md) algorithm. +Several algorithms can be specified, and an available one will be chosen for a particular query based on its kind/strictness and the table engine. + Possible values: -- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. -- `partial_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) is used. -- `prefer_partial_merge` — ClickHouse always tries to use `merge` join if possible. -- `auto` — ClickHouse tries to change `hash` join to `merge` join on the fly to avoid out of memory. +- `default` — `hash` or `direct`, if possible (same as `direct,hash`) -Default value: `hash`. +- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section. -When using `hash` algorithm the right part of `JOIN` is uploaded into RAM. +- `parallel_hash` - a variation of `hash` join that splits the data into buckets and concurrently builds several hash tables instead of one to speed up this process. + +When using the `hash` algorithm, the right part of `JOIN` is uploaded into RAM. + +- `partial_merge` — a variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted. + +The `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). + +When using the `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic implementation. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks. + +- `direct` - can be applied when the right storage supports key-value requests. + +The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It is supported only by special storages such as [Dictionary](../../engines/table-engines/special/dictionary.md#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only for `LEFT` and `INNER` JOINs. + +- `auto` — try `hash` join and switch on the fly to another algorithm if the memory limit is violated. + +- `full_sorting_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting of the joined tables before joining. + +- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise uses `hash`. *Deprecated*, same as `partial_merge,hash`. -When using `partial_merge` algorithm ClickHouse sorts the data and dumps it to the disk. The `merge` algorithm in ClickHouse differs a bit from the classic realization.
First ClickHouse sorts the right table by [join key](../../sql-reference/statements/select/join.md#select-join) in blocks and creates min-max index for sorted blocks. Then it sorts parts of left table by `join key` and joins them over right table. The min-max index is also used to skip unneeded right table blocks. ## join_any_take_last_row {#settings-join_any_take_last_row} From e8834c5ea3598f573e27ebd4d05b3dc72aa3f00a Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 15 Jul 2022 15:03:32 +0000 Subject: [PATCH 129/227] Remove trailing whitespaces from docs/en/sql-reference/statements/select/join.md --- docs/en/sql-reference/statements/select/join.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index b029cf4bac8..a36004566a5 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -36,7 +36,7 @@ Additional join types available in ClickHouse: - `LEFT ANY JOIN`, `RIGHT ANY JOIN` and `INNER ANY JOIN`, partially (for opposite side of `LEFT` and `RIGHT`) or completely (for `INNER` and `FULL`) disables the cartesian product for standard `JOIN` types. - `ASOF JOIN` and `LEFT ASOF JOIN`, joining sequences with a non-exact match. `ASOF JOIN` usage is described below. -:::note +:::note When [join_algorithm](../../../operations/settings/settings.md#settings-join_algorithm) is set to `partial_merge`, `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). ::: @@ -64,7 +64,7 @@ Rows are joined if the whole complex condition is met. If the conditions are not The `OR` operator inside the `ON` clause works using the hash join algorithm — for each `OR` argument with join keys for `JOIN`, a separate hash table is created, so memory consumption and query execution time grow linearly with an increase in the number of expressions `OR` of the `ON` clause. -:::note +:::note If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far. ::: @@ -83,7 +83,7 @@ Consider `table_1` and `table_2`: Query with one join key condition and an additional condition for `table_2`: ``` sql -SELECT name, text FROM table_1 LEFT OUTER JOIN table_2 +SELECT name, text FROM table_1 LEFT OUTER JOIN table_2 ON table_1.Id = table_2.Id AND startsWith(table_2.text, 'Text'); ``` @@ -100,7 +100,7 @@ Note that the result contains the row with the name `C` and the empty text colum Query with `INNER` type of a join and multiple conditions: ``` sql -SELECT name, text, scores FROM table_1 INNER JOIN table_2 +SELECT name, text, scores FROM table_1 INNER JOIN table_2 ON table_1.Id = table_2.Id AND table_2.scores > 10 AND startsWith(table_2.text, 'Text'); ``` @@ -199,7 +199,7 @@ For example, consider the following tables: `ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest to the timestamp of the event from `table_1` corresponding to the closest match condition. Equal timestamp values are the closest if available. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can’t be joined. 
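For illustration only, a query using the closest-match condition described above could look roughly like the following sketch. It reuses the `table_1`/`table_2` and `user_id`/`ev_time` names mentioned in the paragraph above and is not taken from the patched page:

``` sql
SELECT table_1.user_id, table_1.ev_time, table_2.ev_time
FROM table_1
ASOF INNER JOIN table_2
    ON table_1.user_id = table_2.user_id      -- equality part of the join condition
    AND table_1.ev_time >= table_2.ev_time;   -- closest match: latest table_2 event not after table_1's
```

With `>=`, each `table_1` row is matched to the `table_2` row having the greatest `ev_time` that does not exceed its own; the other comparison operators (`>`, `<=`, `<`) can typically be used for the closest-match column as well, as long as exactly one such condition is present.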
-:::note +:::note `ASOF` join is **not** supported in the [Join](../../../engines/table-engines/special/join.md) table engine. ::: From 122a1123b232d2b85a76ae3684fda63d2f810db3 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 21 Jul 2022 14:57:31 +0000 Subject: [PATCH 130/227] - disable the worst case for distinct in order in perf test for now + functional test for query with the worst performance + debug logging in DistinctStep --- src/Processors/QueryPlan/DistinctStep.cpp | 25 +++++++++++++++++++ tests/performance/distinct_in_order.xml | 3 ++- ...tinct_in_order_optimization_long.reference | 1 + ...17_distinct_in_order_optimization_long.sql | 14 +++++++++++ 4 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference create mode 100644 tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index b9a8932b409..103f0f064a0 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -52,6 +52,27 @@ static SortDescription getSortDescription(const SortDescription & input_sort_des return distinct_sort_desc; } +static Poco::Logger * getLogger() +{ + static Poco::Logger & logger = Poco::Logger::get("DistinctStep"); + return &logger; +} + +static String dumpColumnNames(const Names & columns) +{ + WriteBufferFromOwnString wb; + bool first = true; + + for (const auto & name : columns) + { + if (!first) + wb << ", "; + first = false; + + wb << name; + } + return wb.str(); +} DistinctStep::DistinctStep( const DataStream & input_stream_, @@ -91,7 +112,11 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (optimize_distinct_in_order) { + LOG_DEBUG(getLogger(), "Input sort description ({}): {}", input_stream.sort_description.size(), dumpSortDescription(input_stream.sort_description)); + LOG_DEBUG(getLogger(), "Distinct columns ({}): {}", columns.size(), dumpColumnNames(columns)); SortDescription distinct_sort_desc = getSortDescription(input_stream.sort_description, columns); + LOG_DEBUG(getLogger(), "Distinct sort description ({}): {}", distinct_sort_desc.size(), dumpSortDescription(distinct_sort_desc)); + if (!distinct_sort_desc.empty()) { const bool sorted_stream = input_stream.sort_mode == DataStream::SortMode::Stream; diff --git a/tests/performance/distinct_in_order.xml b/tests/performance/distinct_in_order.xml index 834a6945622..b2c117785f9 100644 --- a/tests/performance/distinct_in_order.xml +++ b/tests/performance/distinct_in_order.xml @@ -27,7 +27,8 @@ SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY low FORMAT Null SELECT DISTINCT low, high FROM distinct_cardinality_low ORDER BY high FORMAT Null - SELECT DISTINCT low, medium, high FROM distinct_cardinality_low ORDER BY high FORMAT Null + + DROP TABLE IF EXISTS distinct_cardinality_low diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference new file mode 100644 index 00000000000..0d4d005b74c --- /dev/null +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference @@ -0,0 +1 @@ +-- check that slow query with distinct in order has the same result as ordinary distinct diff --git
a/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql new file mode 100644 index 00000000000..7034538a256 --- /dev/null +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql @@ -0,0 +1,14 @@ +select '-- check that slow query with distinct in order has the same result as ordinary distinct'; +drop table if exists distinct_cardinality_low sync; +drop table if exists distinct_in_order sync; +drop table if exists ordinary_distinct sync; +CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium); +INSERT INTO distinct_cardinality_low SELECT number % 1e2, number % 1e4, number % 1e6 FROM numbers_mt(1e8); +create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); +insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1; +create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); +insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; +select distinct * from distinct_in_order except select * from ordinary_distinct; +drop table if exists distinct_in_order; +drop table if exists ordinary_distinct; +drop table if exists distinct_cardinality_low; From 50bd8147fd92c3e5442407b43f7c5c414598e58f Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Thu, 21 Jul 2022 17:59:34 +0200 Subject: [PATCH 131/227] Make lightweight delete work with compact parts (not in an optimal way yet) --- src/Interpreters/MutationsInterpreter.cpp | 10 +- ...lete_on_merge_tree_compact_parts.reference | 44 ++++++++ ...ght_delete_on_merge_tree_compact_parts.sql | 103 ++++++++++++++++++ 3 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.reference create mode 100644 tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 7778e316b8b..18f8b493ad6 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -792,11 +792,11 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & if (i > 0) prepared_stages[i].output_columns = prepared_stages[i - 1].output_columns; - if (prepared_stages[i].output_columns.size() < all_columns.size()) - { - for (const auto & kv : prepared_stages[i].column_to_updated) - prepared_stages[i].output_columns.insert(kv.first); - } + /// Make sure that all updated columns are included in the output_columns set. + /// This is important for a "hidden" column like _row_exists because it is a virtual column + /// and so it is not in the list of AllPhysical columns. + for (const auto & kv : prepared_stages[i].column_to_updated) + prepared_stages[i].output_columns.insert(kv.first); } /// Now, calculate `expressions_chain` for each stage except the first.
diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.reference b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.reference new file mode 100644 index 00000000000..f2b9f06ebff --- /dev/null +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.reference @@ -0,0 +1,44 @@ +1 Compact +1 Compact +99 +1 +1 Compact +95 +1 +1 Compact +0 +1 +5 Compact +-----lightweight mutation type----- +1 +1 +1 +UPDATE _row_exists = 0 WHERE (c % 5) = 1 1 +UPDATE _row_exists = 0 WHERE c = 4 1 +MATERIALIZE INDEX i_c 1 +UPDATE b = -1 WHERE a < 3 1 +DROP INDEX i_c 1 +-----Check that select and merge with lightweight delete.----- +7 +0 -1 0 +2 -1 2 +3 3 3 +5 5 5 +7 7 7 +8 8 8 +9 9 9 +t_light 0 0_1_1_0_10 2 +t_light 1 1_2_2_0_10 2 +t_light 2 2_3_3_0_10 2 +t_light 3 3_4_4_0_10 2 +t_light 4 4_5_5_0_10 2 +7 +t_light 0 0_1_1_1_10 2 +t_light 2 2_3_3_1_10 2 +t_light 3 3_4_4_1_10 2 +t_light 4 4_5_5_1_10 1 +-----Test lightweight delete in multi blocks----- +1 +1 +1000 -2 +1005 -2 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql new file mode 100644 index 00000000000..f47560ba95b --- /dev/null +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql @@ -0,0 +1,103 @@ +DROP TABLE IF EXISTS merge_table_standard_delete; + +CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTree order by id settings min_bytes_for_wide_part=10000000; + +INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); + +SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 'merge_table_standard_delete' AND active GROUP BY part_type ORDER BY part_type; + +SET mutations_sync = 1; +--SET allow_experimental_lightweight_delete = 0; +SET allow_experimental_lightweight_delete_with_row_exists = 1; + +DELETE FROM merge_table_standard_delete WHERE id = 10; +SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 'merge_table_standard_delete' AND active GROUP BY part_type ORDER BY part_type; + +SELECT COUNT() FROM merge_table_standard_delete; + +DETACH TABLE merge_table_standard_delete; +ATTACH TABLE merge_table_standard_delete; +CHECK TABLE merge_table_standard_delete; + +DELETE FROM merge_table_standard_delete WHERE name IN ('1','2','3','4'); +SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 'merge_table_standard_delete' AND active GROUP BY part_type ORDER BY part_type; + +SELECT COUNT() FROM merge_table_standard_delete; + +DETACH TABLE merge_table_standard_delete; +ATTACH TABLE merge_table_standard_delete; +CHECK TABLE merge_table_standard_delete; + +DELETE FROM merge_table_standard_delete WHERE 1; +SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 'merge_table_standard_delete' AND active GROUP BY part_type ORDER BY part_type; + +SELECT COUNT() FROM merge_table_standard_delete; + +DETACH TABLE merge_table_standard_delete; +ATTACH TABLE merge_table_standard_delete; +CHECK TABLE merge_table_standard_delete; + +DROP TABLE merge_table_standard_delete; + +drop table if exists t_light; +create table t_light(a int, b int, c int, index i_c(b) type minmax granularity 4) engine = MergeTree order by a partition by c % 5 settings min_bytes_for_wide_part=10000000; +INSERT INTO t_light SELECT number, 
number, number FROM numbers(10); +SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 't_light' AND active GROUP BY part_type ORDER BY part_type; + +SELECT '-----lightweight mutation type-----'; + +DELETE FROM t_light WHERE c%5=1; + +DETACH TABLE t_light; +ATTACH TABLE t_light; +CHECK TABLE t_light; + +DELETE FROM t_light WHERE c=4; + +DETACH TABLE t_light; +ATTACH TABLE t_light; +CHECK TABLE t_light; + +alter table t_light MATERIALIZE INDEX i_c; +alter table t_light update b=-1 where a<3; +alter table t_light drop index i_c; + +DETACH TABLE t_light; +ATTACH TABLE t_light; +CHECK TABLE t_light; + +SELECT command, is_done FROM system.mutations WHERE database = currentDatabase() AND table = 't_light'; + +SELECT '-----Check that select and merge with lightweight delete.-----'; +select count(*) from t_light; +select * from t_light order by a; + +select table, partition, name, rows from system.parts where database = currentDatabase() AND active and table ='t_light' order by name; + +optimize table t_light final; +select count(*) from t_light; + +select table, partition, name, rows from system.parts where database = currentDatabase() AND active and table ='t_light' and rows > 0 order by name; + +drop table t_light; + +SELECT '-----Test lightweight delete in multi blocks-----'; +CREATE TABLE t_large(a UInt32, b int) ENGINE=MergeTree order BY a settings min_bytes_for_wide_part=0; +INSERT INTO t_large SELECT number + 1, number + 1 FROM numbers(100000); + +DELETE FROM t_large WHERE a = 50000; + +DETACH TABLE t_large; +ATTACH TABLE t_large; +CHECK TABLE t_large; + +ALTER TABLE t_large UPDATE b = -2 WHERE a between 1000 and 1005; +ALTER TABLE t_large DELETE WHERE a=1; + +DETACH TABLE t_large; +ATTACH TABLE t_large; +CHECK TABLE t_large; + +SELECT * FROM t_large WHERE a in (1,1000,1005,50000) order by a; + +DROP TABLE t_large; From 79a76abf96167b0396b4cd93d58a0f17d480b8bc Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Jul 2022 17:47:10 +0000 Subject: [PATCH 132/227] Try fix tests --- .../02327_capnproto_protobuf_empty_messages.sh | 16 ++++++++-------- .../format_schemas/02327_schema.capnp | 2 +- .../format_schemas/02327_schema.proto | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh index a569a6435f6..3d41c9bf721 100755 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -15,22 +15,22 @@ mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR cp -r $CLIENT_SCHEMADIR/02327_* $SCHEMADIR/$SERVER_SCHEMADIR/ -$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create 
table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; -$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; $CLICKHOUSE_CLIENT --query="drop table if exists test_protobuf"; -$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; $CLICKHOUSE_CLIENT --query="desc test_protobuf"; $CLICKHOUSE_CLIENT --query="drop table test_protobuf"; $CLICKHOUSE_CLIENT --query="drop table if exists test_capnp"; -$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:Message', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; $CLICKHOUSE_CLIENT --query="desc test_capnp"; $CLICKHOUSE_CLIENT --query="drop table test_capnp"; diff --git a/tests/queries/0_stateless/format_schemas/02327_schema.capnp b/tests/queries/0_stateless/format_schemas/02327_schema.capnp index 12ccc7308c9..0d4caa99fc6 100644 --- a/tests/queries/0_stateless/format_schemas/02327_schema.capnp +++ b/tests/queries/0_stateless/format_schemas/02327_schema.capnp @@ -4,7 +4,7 @@ struct Empty { } -struct Message +struct MessageWithEmpty { tuple1 @0 : Empty; text @1 : Text; diff --git a/tests/queries/0_stateless/format_schemas/02327_schema.proto b/tests/queries/0_stateless/format_schemas/02327_schema.proto index b5067393558..d478ccf6550 100644 --- 
a/tests/queries/0_stateless/format_schemas/02327_schema.proto +++ b/tests/queries/0_stateless/format_schemas/02327_schema.proto @@ -3,7 +3,7 @@ syntax = "proto3"; message Empty { } -message Message { +message MessageWithEmpty { Empty empty = 1; string str = 2; }; From 7795b2cec3fd099d5b780ceb66a90c23a4085b54 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 21 Jul 2022 20:29:49 +0200 Subject: [PATCH 133/227] Fix system.backups: now it can show duplicate UUIDs with different flag. --- src/Backups/BackupsWorker.cpp | 42 ++++++++++----------- src/Backups/BackupsWorker.h | 16 ++++---- src/Interpreters/InterpreterBackupQuery.cpp | 4 +- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 47e1bac3200..df501258db6 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -110,7 +110,7 @@ BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threa } -UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) +std::pair BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) { const ASTBackupQuery & backup_query = typeid_cast(*backup_or_restore_query); if (backup_query.kind == ASTBackupQuery::Kind::BACKUP) @@ -120,7 +120,7 @@ UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutable } -UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) +std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) { auto backup_query = std::static_pointer_cast(query->clone()); auto backup_settings = BackupSettings::fromBackupQuery(*backup_query); @@ -136,7 +136,7 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c try { auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); - addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP, backup_settings.internal); + addInfo(backup_uuid, backup_settings.internal, backup_info.toString(), BackupStatus::MAKING_BACKUP); /// Prepare context to use. ContextPtr context_in_use = context; @@ -169,12 +169,12 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c doBackup(backup_uuid, backup_query, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context, false); } - return backup_uuid; + return {backup_uuid, backup_settings.internal}; } catch (...) { /// Something bad happened, the backup has not built. - setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); + setStatus(backup_uuid, backup_settings.internal, BackupStatus::FAILED_TO_BACKUP); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); throw; } @@ -286,7 +286,7 @@ void BackupsWorker::doBackup( backup.reset(); LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); - setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE); + setStatus(backup_uuid, backup_settings.internal, BackupStatus::BACKUP_COMPLETE); } catch (...) { @@ -294,7 +294,7 @@ void BackupsWorker::doBackup( if (called_async) { tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); - setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); + setStatus(backup_uuid, backup_settings.internal, BackupStatus::FAILED_TO_BACKUP); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); } else @@ -306,7 +306,7 @@ void BackupsWorker::doBackup( } -UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) +std::pair BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) { auto restore_query = std::static_pointer_cast(query->clone()); auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); @@ -319,7 +319,7 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte try { auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); - addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING, restore_settings.internal); + addInfo(restore_uuid, restore_settings.internal, backup_info.toString(), BackupStatus::RESTORING); /// Prepare context to use. ContextMutablePtr context_in_use = context; @@ -342,12 +342,12 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte doRestore(restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use, false); } - return restore_uuid; + return {restore_uuid, restore_settings.internal}; } catch (...) { /// Something bad happened, the backup has not built. - setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); + setStatus(restore_uuid, restore_settings.internal, BackupStatus::FAILED_TO_RESTORE); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); throw; } @@ -460,7 +460,7 @@ void BackupsWorker::doRestore( } LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()); - setStatus(restore_uuid, BackupStatus::RESTORED); + setStatus(restore_uuid, restore_settings.internal, BackupStatus::RESTORED); } catch (...) { @@ -468,7 +468,7 @@ void BackupsWorker::doRestore( if (called_async) { tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); - setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); + setStatus(restore_uuid, restore_settings.internal, BackupStatus::FAILED_TO_RESTORE); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); } else @@ -480,7 +480,7 @@ void BackupsWorker::doRestore( } -void BackupsWorker::addInfo(const UUID & uuid, const String & backup_name, BackupStatus status, bool internal) +void BackupsWorker::addInfo(const UUID & uuid, bool internal, const String & backup_name, BackupStatus status) { Info info; info.uuid = uuid; @@ -490,17 +490,17 @@ void BackupsWorker::addInfo(const UUID & uuid, const String & backup_name, Backu info.internal = internal; std::lock_guard lock{infos_mutex}; - infos[uuid] = std::move(info); + infos[{uuid, internal}] = std::move(info); num_active_backups += getNumActiveBackupsChange(status); num_active_restores += getNumActiveRestoresChange(status); } -void BackupsWorker::setStatus(const UUID & uuid, BackupStatus status) +void BackupsWorker::setStatus(const UUID & uuid, bool internal, BackupStatus status) { std::lock_guard lock{infos_mutex}; - auto it = infos.find(uuid); + auto it = infos.find({uuid, internal}); if (it == infos.end()) return; @@ -513,12 +513,12 @@ void BackupsWorker::setStatus(const UUID & uuid, BackupStatus status) } -void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_exception) +void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool internal, bool rethrow_exception) { std::unique_lock lock{infos_mutex}; status_changed.wait(lock, [&] { - auto it = infos.find(backup_or_restore_uuid); + auto it = infos.find({backup_or_restore_uuid, internal}); if (it == infos.end()) return true; const auto & info = it->second; @@ -529,10 +529,10 @@ void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_excep }); } -std::optional BackupsWorker::tryGetInfo(const UUID & backup_or_restore_uuid) const +std::optional BackupsWorker::tryGetInfo(const UUID & backup_or_restore_uuid, bool internal) const { std::lock_guard lock{infos_mutex}; - auto it = infos.find(backup_or_restore_uuid); + auto it = infos.find({backup_or_restore_uuid, internal}); if (it == infos.end()) return std::nullopt; return it->second; diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 8db9c1367a9..f471dc822e0 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -29,11 +29,11 @@ public: void shutdown(); /// Starts executing a BACKUP or RESTORE query. Returns UUID of the operation. - UUID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); + std::pair start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); /// Waits until a BACKUP or RESTORE query started by start() is finished. /// The function returns immediately if the operation is already finished. - void wait(const UUID & backup_or_restore_uuid, bool rethrow_exception = true); + void wait(const UUID & backup_or_restore_uuid, bool internal, bool rethrow_exception = true); /// Information about executing a BACKUP or RESTORE query started by calling start(). 
struct Info @@ -54,29 +54,29 @@ public: bool internal = false; }; - std::optional tryGetInfo(const UUID & backup_or_restore_uuid) const; + std::optional tryGetInfo(const UUID & backup_or_restore_uuid, bool internal) const; std::vector getAllInfos() const; private: - UUID startMakingBackup(const ASTPtr & query, const ContextPtr & context); + std::pair startMakingBackup(const ASTPtr & query, const ContextPtr & context); void doBackup(const UUID & backup_uuid, const std::shared_ptr & backup_query, BackupSettings backup_settings, const BackupInfo & backup_info, std::shared_ptr backup_coordination, const ContextPtr & context, ContextMutablePtr mutable_context, bool called_async); - UUID startRestoring(const ASTPtr & query, ContextMutablePtr context); + std::pair startRestoring(const ASTPtr & query, ContextMutablePtr context); void doRestore(const UUID & restore_uuid, const std::shared_ptr & restore_query, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); - void addInfo(const UUID & uuid, const String & backup_name, BackupStatus status, bool internal); - void setStatus(const UUID & uuid, BackupStatus status); + void addInfo(const UUID & uuid, bool internal, const String & backup_name, BackupStatus status); + void setStatus(const UUID & uuid, bool internal, BackupStatus status); ThreadPool backups_thread_pool; ThreadPool restores_thread_pool; - std::unordered_map infos; + std::map, Info> infos; std::condition_variable status_changed; std::atomic num_active_backups = 0; std::atomic num_active_restores = 0; diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index af3c8df8eef..a2bb3acf073 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -42,9 +42,9 @@ namespace BlockIO InterpreterBackupQuery::execute() { auto & backups_worker = context->getBackupsWorker(); - UUID uuid = backups_worker.start(query_ptr, context); + auto [uuid, internal] = backups_worker.start(query_ptr, context); BlockIO res_io; - res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.tryGetInfo(uuid)))); + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.tryGetInfo(uuid, internal)))); return res_io; } From 36d3923622c79dab910ede3420d84d8dbcebaf51 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 21 Jul 2022 20:31:03 +0200 Subject: [PATCH 134/227] Add one more concurrency test. 
--- .../test_concurrency.py | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 95d4f27e5e8..d399723c8b1 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -1,5 +1,8 @@ +from random import randint import pytest import os.path +import time +import concurrent from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV, assert_eq_with_retry @@ -27,6 +30,7 @@ def generate_cluster_def(): main_configs = ["configs/backups_disk.xml", generate_cluster_def()] +user_configs = ["configs/allow_experimental_database_replicated.xml"] nodes = [] for i in range(num_nodes): @@ -34,6 +38,7 @@ for i in range(num_nodes): cluster.add_instance( f"node{i}", main_configs=main_configs, + user_configs=user_configs, external_dirs=["/backups/"], macros={"replica": f"node{i}", "shard": "shard1"}, with_zookeeper=True, @@ -160,3 +165,87 @@ def test_concurrent_backups_on_different_nodes(): nodes[i].query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' tbl") for j in range(num_nodes): assert nodes[j].query("SELECT sum(x) FROM tbl") == TSV([expected_sum]) + + +def test_create_or_drop_tables_during_backup(): + node0.query( + "CREATE DATABASE mydb ON CLUSTER 'cluster' ENGINE=Replicated('/clickhouse/path/','{shard}','{replica}')" + ) + + # Will do this test for 60 seconds + start_time = time.time() + end_time = start_time + 60 + + def create_table(): + while time.time() < end_time: + node = nodes[randint(0, num_nodes - 1)] + table_name = f"mydb.tbl{randint(1, num_nodes)}" + node.query( + f"CREATE TABLE IF NOT EXISTS {table_name}(x Int32) ENGINE=ReplicatedMergeTree ORDER BY x" + ) + node.query_and_get_answer_with_error( + f"INSERT INTO {table_name} SELECT rand32() FROM numbers(10)" + ) + + def drop_table(): + while time.time() < end_time: + table_name = f"mydb.tbl{randint(1, num_nodes)}" + node = nodes[randint(0, num_nodes - 1)] + node.query(f"DROP TABLE IF EXISTS {table_name} NO DELAY") + + def rename_table(): + while time.time() < end_time: + table_name1 = f"mydb.tbl{randint(1, num_nodes)}" + table_name2 = f"mydb.tbl{randint(1, num_nodes)}" + node = nodes[randint(0, num_nodes - 1)] + node.query_and_get_answer_with_error( + f"RENAME TABLE {table_name1} TO {table_name2}" + ) + + def make_backup(): + ids = [] + while time.time() < end_time: + time.sleep( + 5 + ) # 1 minute total, and around 5 seconds per each backup => around 12 backups should be created + backup_name = new_backup_name() + id = node0.query( + f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} ASYNC" + ).split("\t")[0] + ids.append(id) + return ids + + ids = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + futures = [] + ids_future = executor.submit(make_backup) + futures.append(ids_future) + futures.append(executor.submit(create_table)) + futures.append(executor.submit(drop_table)) + futures.append(executor.submit(rename_table)) + for future in futures: + future.result() + ids = ids_future.result() + + ids_list = "[" + ", ".join([f"'{id}'" for id in ids]) + "]" + for node in nodes: + assert_eq_with_retry( + node, + f"SELECT status, error from system.backups WHERE uuid IN {ids_list} AND (status == 'MAKING_BACKUP')", + "", + ) + + backup_names = {} + for node in nodes: + for id in ids: + backup_name = node.query( + f"SELECT backup_name FROM 
system.backups WHERE uuid='{id}' FORMAT RawBLOB" + ).strip() + if backup_name: + backup_names[id] = backup_name + + for id in ids: + node0.query("DROP DATABASE mydb ON CLUSTER 'cluster'") + node0.query( + f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_names[id]}" + ) From 7db5d54820154a0287b4f1b20a699ea34862c99f Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 21 Jul 2022 20:58:33 +0000 Subject: [PATCH 135/227] Adopt to the case when not all columns in distinct are part of sorting description --- src/Processors/QueryPlan/DistinctStep.cpp | 38 +++++++++++++++++-- tests/performance/distinct_in_order.xml | 3 +- ...7_distinct_in_order_optimization.reference | 1 + .../02317_distinct_in_order_optimization.sql | 15 ++++++++ ...ct_in_order_optimization_explain.reference | 18 +++++++-- ..._distinct_in_order_optimization_explain.sh | 23 ++++++----- ...tinct_in_order_optimization_long.reference | 1 - ...17_distinct_in_order_optimization_long.sql | 14 ------- 8 files changed, 78 insertions(+), 35 deletions(-) delete mode 100644 tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference delete mode 100644 tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index 103f0f064a0..c268cb44267 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -119,9 +119,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (!distinct_sort_desc.empty()) { - const bool sorted_stream = input_stream.sort_mode == DataStream::SortMode::Stream; - /// pre-distinct for sorted chunks or final distinct for sorted stream (sorting inside and among chunks) - if (pre_distinct || sorted_stream) + /// pre-distinct for sorted chunks + if (pre_distinct) { pipeline.addSimpleTransform( [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr @@ -130,10 +129,41 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil return nullptr; return std::make_shared( - header, set_size_limits, limit_hint, distinct_sort_desc, columns, sorted_stream); + header, set_size_limits, limit_hint, distinct_sort_desc, columns, false); }); return; } + /// final distinct for sorted stream (sorting inside and among chunks) + if (input_stream.sort_mode == DataStream::SortMode::Stream) + { + assert(input_stream.has_single_port); + + if (distinct_sort_desc.size() < columns.size()) + { + pipeline.addSimpleTransform( + [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr + { + if (stream_type != QueryPipelineBuilder::StreamType::Main) + return nullptr; + + return std::make_shared( + header, distinct_sort_desc, set_size_limits, limit_hint, columns); + }); + } + else + { + pipeline.addSimpleTransform( + [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr + { + if (stream_type != QueryPipelineBuilder::StreamType::Main) + return nullptr; + + return std::make_shared( + header, set_size_limits, limit_hint, distinct_sort_desc, columns, true); + }); + } + return; + } } } diff --git a/tests/performance/distinct_in_order.xml b/tests/performance/distinct_in_order.xml index b2c117785f9..834a6945622 100644 --- a/tests/performance/distinct_in_order.xml +++ b/tests/performance/distinct_in_order.xml @@ -27,8 +27,7 @@ SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null SELECT DISTINCT low, 
medium FROM distinct_cardinality_low ORDER BY low FORMAT Null SELECT DISTINCT low, high FROM distinct_cardinality_low ORDER BY high FORMAT Null - - + SELECT DISTINCT low, medium, high FROM distinct_cardinality_low ORDER BY high FORMAT Null DROP TABLE IF EXISTS distinct_cardinality_low diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference index b53b561137e..a5ae3759d5d 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference @@ -77,3 +77,4 @@ 2 2 1 1 0 0 +-- check that distinct in order has the same result as ordinary distinct diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index afe53a95b26..2da81b50f6c 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -44,3 +44,18 @@ select '-- distinct with non-key prefix and non-sorted column, order by non-sort select distinct b,c from distinct_in_order order by c desc; drop table if exists distinct_in_order sync; + +select '-- check that distinct in order has the same result as ordinary distinct'; +drop table if exists distinct_cardinality_low sync; +drop table if exists distinct_in_order sync; +drop table if exists ordinary_distinct sync; +CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium); +INSERT INTO distinct_cardinality_low SELECT number % 1e2, number % 1e4, number % 1e6 FROM numbers_mt(1e8); +create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); +insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1; +create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); +insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; +select distinct * from distinct_in_order except select * from ordinary_distinct; +drop table if exists distinct_in_order; +drop table if exists ordinary_distinct; +drop table if exists distinct_cardinality_low; diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index f30d3fa30ea..327d0d52dc2 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -1,19 +1,29 @@ -- disable optimize_distinct_in_order -- distinct all primary key columns -> no optimizations -No optimizations +DistinctTransform +DistinctTransform -- enable optimize_distinct_in_order -- distinct with all primary key columns -> pre-distinct optimization only +DistinctTransform DistinctSortedChunkTransform -- distinct with primary key prefix -> pre-distinct optimization only +DistinctTransform DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization +DistinctSortedTransform +DistinctSortedChunkTransform +-- distinct with primary key prefix and order by on the same columns -> pre-distinct and 
final distinct optimization DistinctSortedStreamTransform DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only +DistinctTransform DistinctSortedChunkTransform -- distinct with non-primary key prefix -> no optimizations -No optimizations +DistinctTransform +DistinctTransform -- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only -DistinctSortedStreamTransform +DistinctSortedTransform +DistinctTransform -- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations -No optimizations +DistinctTransform +DistinctTransform diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index 9af0e98ecf4..e822f9695b9 100755 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -8,9 +8,9 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) DISABLE_OPTIMIZATION="set optimize_distinct_in_order=0" ENABLE_OPTIMIZATION="set optimize_distinct_in_order=1" -GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform\|DistinctSortedStreamTransform'" +GREP_DISTINCT="grep 'DistinctSortedChunkTransform\|DistinctSortedStreamTransform\|DistinctSortedTransform\|DistinctTransform'" TRIM_LEADING_SPACES="sed -e 's/^[ \t]*//'" -FIND_OPTIMIZATIONS="$GREP_OPTIMIZATIONS | $TRIM_LEADING_SPACES" +FIND_DISTINCT="$GREP_DISTINCT | $TRIM_LEADING_SPACES" $CLICKHOUSE_CLIENT -q "drop table if exists distinct_in_order_explain sync" $CLICKHOUSE_CLIENT -q "create table distinct_in_order_explain (a int, b int, c int) engine=MergeTree() order by (a, b, c)" @@ -18,26 +18,29 @@ $CLICKHOUSE_CLIENT -q "insert into distinct_in_order_explain select number % num $CLICKHOUSE_CLIENT -q "select '-- disable optimize_distinct_in_order'" $CLICKHOUSE_CLIENT -q "select '-- distinct all primary key columns -> no optimizations'" -$CLICKHOUSE_CLIENT -nq "$DISABLE_OPTIMIZATION;explain pipeline select distinct * from distinct_in_order_explain" | eval $GREP_OPTIMIZATIONS || echo "No optimizations" +$CLICKHOUSE_CLIENT -nq "$DISABLE_OPTIMIZATION;explain pipeline select distinct * from distinct_in_order_explain" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- enable optimize_distinct_in_order'" $CLICKHOUSE_CLIENT -q "select '-- distinct with all primary key columns -> pre-distinct optimization only'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct * from distinct_in_order_explain" | eval $FIND_OPTIMIZATIONS +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct * from distinct_in_order_explain" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix -> pre-distinct optimization only'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain" | eval $FIND_OPTIMIZATIONS +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by c" | eval $FIND_OPTIMIZATIONS +$CLICKHOUSE_CLIENT 
-nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT + +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_OPTIMIZATIONS +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix -> no optimizations'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain" | eval $GREP_OPTIMIZATIONS || echo "No optimizations" +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by b" | eval $FIND_OPTIMIZATIONS +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by a" | eval $GREP_OPTIMIZATIONS || echo "No optimizations" +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by a" | eval $FIND_DISTINCT diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference deleted file mode 100644 index 0d4d005b74c..00000000000 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.reference +++ /dev/null @@ -1 +0,0 @@ --- check that slow query with distinct in order has the same result as ordinary distinct diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql deleted file mode 100644 index 7034538a256..00000000000 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_long.sql +++ /dev/null @@ -1,14 +0,0 @@ -select '-- check that slow query with distinct in order has the same result as ordinary distinct'; -drop table if exists distinct_cardinality_low sync; -drop table if exists distinct_in_order sync; -drop table if exists ordinary_distinct sync; -CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium); -INSERT INTO distinct_cardinality_low SELECT number % 1e2, number % 1e4, number % 1e6 FROM numbers_mt(1e8); -create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); -insert into 
distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1; -create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); -insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; -select distinct * from distinct_in_order except select * from ordinary_distinct; -drop table if exists distinct_in_order; -drop table if exists ordinary_distinct; -drop table if exists distinct_cardinality_low; From 75476d51107d207dcfe1f4de8aaae36732a4d589 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 22 Jul 2022 08:07:38 +0000 Subject: [PATCH 136/227] Store only root system node --- src/Coordination/KeeperConstants.h | 14 ++++- src/Coordination/KeeperSnapshotManager.cpp | 17 +++++- src/Coordination/KeeperStorage.cpp | 71 ++++++++++++---------- 3 files changed, 66 insertions(+), 36 deletions(-) diff --git a/src/Coordination/KeeperConstants.h b/src/Coordination/KeeperConstants.h index 4582248b7cf..ec0bf34246f 100644 --- a/src/Coordination/KeeperConstants.h +++ b/src/Coordination/KeeperConstants.h @@ -1,11 +1,10 @@ #pragma once +#include + namespace DB { -const std::string keeper_system_path = "/keeper"; -const std::string keeper_api_version_path = keeper_system_path + "/api_version"; - enum class KeeperApiVersion : uint8_t { V0 = 0, // ZooKeeper compatible version @@ -14,4 +13,13 @@ enum class KeeperApiVersion : uint8_t inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1; +const std::string keeper_system_path = "/keeper"; +const std::string keeper_api_version_path = keeper_system_path + "/api_version"; + +using PathWithData = std::pair; +const std::vector data_for_system_paths +{ + {keeper_api_version_path, toString(static_cast(current_keeper_api_version))} +}; + } diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 8fbc7df1484..1b863cac7dd 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -145,6 +145,16 @@ namespace } } +namespace +{ + +bool isChildSystemPath(const std::string_view path) +{ + auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), keeper_system_path.begin(), keeper_system_path.end()); + return first_it != path.end() && *first_it == '/' && second_it == keeper_system_path.end(); +} + +} void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out) { @@ -183,11 +193,16 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr } /// Serialize data tree - writeBinary(snapshot.snapshot_container_size, out); + writeBinary(snapshot.snapshot_container_size - data_for_system_paths.size(), out); size_t counter = 0; for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++counter) { const auto & path = it->key; + + // write only the root system path because of digest + if (isChildSystemPath(path.toView())) + continue; + const auto & node = it->value; /// Benign race condition possible while taking snapshot: NuRaft decide to create snapshot at some log id diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index dd43d006979..d67717e33ff 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -244,39 +244,46 @@ void KeeperStorage::initializeSystemNodes() if (initialized) throw Exception(ErrorCodes::LOGICAL_ERROR, 
"KeeperStorage system nodes initialized twice"); - const auto create_system_node = [&](const auto & path, auto data) + // insert root system path + Node system_node; + system_node.setData(""); + container.insertOrReplace(keeper_system_path, system_node); + // store digest for the empty node because we won't update + // its stats + addDigest(system_node, keeper_system_path); + + // update root and the digest based on it + auto current_root_it = container.find("/"); + assert(current_root_it != container.end()); + removeDigest(current_root_it->value, "/"); + auto updated_root_it = container.updateValue( + "/", + [](auto & node) + { + ++node.stat.numChildren; + node.addChild(keeper_system_path); + } + ); + addDigest(updated_root_it->value, "/"); + + // insert child system nodes + for (const auto & [path, data] : data_for_system_paths) { - auto node_it = container.find(path); - if (node_it == container.end()) - { - // we update numChildren during preprocessing so and createNode is called during - // commit so we need to update it manually here - container.updateValue( - parentPath(path), - [](KeeperStorage::Node & parent) - { - ++parent.stat.numChildren; - } - ); - createNode(path, std::move(data), {}, false, {}); - } - else - { - container.updateValue( - path, - [data = std::move(data)](KeeperStorage::Node & node) - { - node.setData(std::move(data)); - } - ); - } - }; - - create_system_node(keeper_system_path, ""); - - assert(keeper_api_version_path.starts_with(keeper_system_path)); - auto api_version_data = toString(static_cast(current_keeper_api_version)); - create_system_node(keeper_api_version_path, std::move(api_version_data)); + assert(keeper_api_version_path.starts_with(keeper_system_path)); + Node child_system_node; + system_node.setData(data); + auto [map_key, _] = container.insert(std::string{path}, child_system_node); + /// Take child path from key owned by map. + auto child_path = getBaseName(map_key->getKey()); + container.updateValue( + parentPath(child_path), + [child_path](auto & parent) + { + // don't update stats so digest is okay + parent.addChild(child_path); + } + ); + } initialized = true; } From 17b504b0d48ed5d76eb8bd958b0ad022d9a61b83 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 21 Jul 2022 12:19:46 +0200 Subject: [PATCH 137/227] Build thread sanitizer binaries with clang-13 --- docker/packager/binary/Dockerfile | 15 +++++++++++++++ docker/packager/packager | 1 + tests/ci/ci_config.py | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 1dff4b1a2d4..6602e294327 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -118,5 +118,20 @@ ENV GOCACHE=/workdir/ RUN mkdir /workdir && chmod 777 /workdir WORKDIR /workdir +# FIXME: thread sanitizer is broken in clang-14, we have to build it with clang-13 +# https://github.com/ClickHouse/ClickHouse/pull/39450 +# https://github.com/google/sanitizers/issues/1540 +# https://github.com/google/sanitizers/issues/1552 + +RUN export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ + && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-13 main" >> \ + /etc/apt/sources.list.d/clang.list \ + && apt-get update \ + && apt-get install \ + clang-13 \ + clang-tidy-13 \ + --yes --no-install-recommends \ + && apt-get clean + COPY build.sh / CMD ["bash", "-c", "/build.sh 2>&1"] diff --git a/docker/packager/packager b/docker/packager/packager index 7c0f046b76c..0b00bc4e9c0 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -323,6 +323,7 @@ if __name__ == "__main__": parser.add_argument( "--compiler", choices=( + "clang-13", # For TSAN builds, see #39450 "clang-14", "clang-14-darwin", "clang-14-darwin-aarch64", diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index a530b395130..2b22ccd9c7c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -63,7 +63,7 @@ CI_CONFIG = { "with_coverage": False, }, "package_tsan": { - "compiler": "clang-14", + "compiler": "clang-13", "build_type": "", "sanitizer": "thread", "package_type": "deb", From c6b251d0cb54693a9ca58b53312b0184cd31dacf Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Fri, 22 Jul 2022 11:58:15 +0200 Subject: [PATCH 138/227] Revert "Merge pull request #38106 from ClickHouse/tsan_less_flaky_tests" This reverts commit 0ae0cc54aa6b1fe479d68310a06d74b392d7f051, reversing changes made to 1d7cf28cabdc7ad62d0079582e763d630f3c27e1. --- docker/test/stress/run.sh | 37 +++++++++---------- .../00984_parser_stack_overflow.sh | 2 - .../01172_transaction_counters.sql | 3 +- .../01183_custom_separated_format_http.sh | 2 - .../01184_long_insert_values_huge_strings.sh | 3 +- .../0_stateless/01651_lc_insert_tiny_log.sql | 3 -- ..._long_zstd_http_compression_json_format.sh | 3 +- .../0_stateless/01926_order_by_desc_limit.sql | 3 +- .../00159_parallel_formatting_http.sh | 2 - 9 files changed, 21 insertions(+), 37 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index ffa0b12b8a3..33b60ae7ef9 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -7,29 +7,26 @@ set -x # Thread Fuzzer allows to check more permutations of possible thread scheduling # and find more potential issues. -# -# But under thread fuzzer, TSan build is too slow and this produces some flaky -# tests, so for now, as a temporary solution it had been disabled. -if ! 
test -f package_folder/clickhouse-server*tsan*.deb; then - export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 - export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 - export THREAD_FUZZER_SLEEP_TIME_US=100000 - export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 - export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 - export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 - export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 +export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000 +export THREAD_FUZZER_SLEEP_PROBABILITY=0.1 +export THREAD_FUZZER_SLEEP_TIME_US=100000 - export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 - export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 - export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 - export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 - export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 +export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1 +export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1 +export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1 +export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1 + +export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001 +export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001 +export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001 +export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001 +export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000 + +export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 +export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 +export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 - export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000 - export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000 - export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000 -fi function install_packages() { diff --git a/tests/queries/0_stateless/00984_parser_stack_overflow.sh b/tests/queries/0_stateless/00984_parser_stack_overflow.sh index 168ef155d9b..329e51e774a 100755 --- a/tests/queries/0_stateless/00984_parser_stack_overflow.sh +++ b/tests/queries/0_stateless/00984_parser_stack_overflow.sh @@ -1,6 +1,4 @@ #!/usr/bin/env bash -# Tags: no-tsan -# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan # Such a huge timeout mostly for debug build. 
CLICKHOUSE_CURL_TIMEOUT=60 diff --git a/tests/queries/0_stateless/01172_transaction_counters.sql b/tests/queries/0_stateless/01172_transaction_counters.sql index ed40ec3f4b6..cc6212ae4c1 100644 --- a/tests/queries/0_stateless/01172_transaction_counters.sql +++ b/tests/queries/0_stateless/01172_transaction_counters.sql @@ -1,6 +1,5 @@ --- Tags: no-s3-storage, no-tsan, no-ordinary-database +-- Tags: no-s3-storage, no-ordinary-database -- FIXME this test fails with S3 due to a bug in DiskCacheWrapper --- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan drop table if exists txn_counters; create table txn_counters (n Int64, creation_tid DEFAULT transactionID()) engine=MergeTree order by n; diff --git a/tests/queries/0_stateless/01183_custom_separated_format_http.sh b/tests/queries/0_stateless/01183_custom_separated_format_http.sh index 744cf0c08bd..8eaa22f4ecc 100755 --- a/tests/queries/0_stateless/01183_custom_separated_format_http.sh +++ b/tests/queries/0_stateless/01183_custom_separated_format_http.sh @@ -1,6 +1,4 @@ #!/usr/bin/env bash -# Tags: no-tsan -# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh index f4bad961f21..09a43d13a42 100755 --- a/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh +++ b/tests/queries/0_stateless/01184_long_insert_values_huge_strings.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-tsan -# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan +# Tags: long CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql index ec2a1850594..22532529812 100644 --- a/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql +++ b/tests/queries/0_stateless/01651_lc_insert_tiny_log.sql @@ -1,6 +1,3 @@ --- Tags: no-tsan --- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan - drop table if exists perf_lc_num; CREATE TABLE perf_lc_num(  num UInt8,  arr Array(LowCardinality(Int64)) default [num]  ) ENGINE = TinyLog; diff --git a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh index 16f5211f012..e10032e04fd 100755 --- a/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh +++ b/tests/queries/0_stateless/01746_long_zstd_http_compression_json_format.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-fasttest, no-tsan -# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan +# Tags: long, no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/01926_order_by_desc_limit.sql b/tests/queries/0_stateless/01926_order_by_desc_limit.sql index 223dbf70fc4..86468b4fcd6 100644 --- a/tests/queries/0_stateless/01926_order_by_desc_limit.sql +++ b/tests/queries/0_stateless/01926_order_by_desc_limit.sql @@ -1,5 +1,4 @@ --- Tags: no-random-settings, no-tsan --- FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan +-- Tags: no-random-settings DROP TABLE IF EXISTS order_by_desc; diff --git 
a/tests/queries/1_stateful/00159_parallel_formatting_http.sh b/tests/queries/1_stateful/00159_parallel_formatting_http.sh index 7b949cf23e6..ea4a4d12867 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_http.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_http.sh @@ -1,6 +1,4 @@ #!/usr/bin/env bash -# Tags: no-tsan -# FIXME It became flaky after upgrading to llvm-14 due to obscure freezes in tsan CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 51f63c3a0c1885604f25eb90db8ab96de4192646 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 22 Jul 2022 10:57:57 +0000 Subject: [PATCH 139/227] Add setting optimize_distinct_in_order_memory_usage --- src/Core/Settings.h | 1 + src/Interpreters/InterpreterSelectQuery.cpp | 3 ++- .../InterpreterSelectWithUnionQuery.cpp | 3 ++- src/Processors/QueryPlan/DistinctStep.cpp | 14 ++++++++------ src/Processors/QueryPlan/DistinctStep.h | 4 +++- .../02317_distinct_in_order_optimization.sql | 2 +- ...istinct_in_order_optimization_explain.reference | 6 ++++++ ...02317_distinct_in_order_optimization_explain.sh | 6 ++++++ 8 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 8a1f47ec00e..4f5645de7b3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -621,6 +621,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ + M(Bool, optimize_distinct_in_order_memory_usage, false, "Try to use less memory for DISTINCT in order but can be slower", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. 
diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index a05d353ac73..8590889d6f8 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2622,7 +2622,8 @@ void InterpreterSelectQuery::executeDistinct(QueryPlan & query_plan, bool before limit_for_distinct, columns, pre_distinct, - settings.optimize_distinct_in_order); + settings.optimize_distinct_in_order, + settings.optimize_distinct_in_order_memory_usage); if (pre_distinct) distinct_step->setStepDescription("Preliminary DISTINCT"); diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 9f87a47fced..0590ce8f467 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -335,7 +335,8 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan) 0, result_header.getNames(), false, - settings.optimize_distinct_in_order); + settings.optimize_distinct_in_order, + settings.optimize_distinct_in_order_memory_usage); query_plan.addStep(std::move(distinct_step)); } diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index c268cb44267..9223218d82b 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -80,7 +80,8 @@ DistinctStep::DistinctStep( UInt64 limit_hint_, const Names & columns_, bool pre_distinct_, - bool optimize_distinct_in_order_) + bool optimize_distinct_in_order_, + bool optimize_distinct_in_order_memory_usage_) : ITransformingStep( input_stream_, input_stream_.header, @@ -90,6 +91,7 @@ DistinctStep::DistinctStep( , columns(columns_) , pre_distinct(pre_distinct_) , optimize_distinct_in_order(optimize_distinct_in_order_) + , optimize_distinct_in_order_memory_usage(optimize_distinct_in_order_memory_usage_) { if (!output_stream->distinct_columns.empty() /// Columns already distinct, do nothing && (!pre_distinct /// Main distinct @@ -138,7 +140,7 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil { assert(input_stream.has_single_port); - if (distinct_sort_desc.size() < columns.size()) + if (distinct_sort_desc.size() >= columns.size() || optimize_distinct_in_order_memory_usage) { pipeline.addSimpleTransform( [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr @@ -146,8 +148,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (stream_type != QueryPipelineBuilder::StreamType::Main) return nullptr; - return std::make_shared( - header, distinct_sort_desc, set_size_limits, limit_hint, columns); + return std::make_shared( + header, set_size_limits, limit_hint, distinct_sort_desc, columns, true); }); } else @@ -158,8 +160,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (stream_type != QueryPipelineBuilder::StreamType::Main) return nullptr; - return std::make_shared( - header, set_size_limits, limit_hint, distinct_sort_desc, columns, true); + return std::make_shared( + header, distinct_sort_desc, set_size_limits, limit_hint, columns); }); } return; diff --git a/src/Processors/QueryPlan/DistinctStep.h b/src/Processors/QueryPlan/DistinctStep.h index dc734a58704..0d3b7e3b7e0 100644 --- a/src/Processors/QueryPlan/DistinctStep.h +++ b/src/Processors/QueryPlan/DistinctStep.h @@ -15,7 +15,8 @@ public: UInt64 limit_hint_, const Names & columns_, bool 
pre_distinct_, /// If is enabled, execute distinct for separate streams. Otherwise, merge streams. - bool optimize_distinct_in_order_); + bool optimize_distinct_in_order_, + bool optimize_distinct_in_order_memory_usage_); String getName() const override { return "Distinct"; } @@ -32,6 +33,7 @@ private: Names columns; bool pre_distinct; bool optimize_distinct_in_order; + bool optimize_distinct_in_order_memory_usage; }; } diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index 2da81b50f6c..d43366a55fe 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -50,7 +50,7 @@ drop table if exists distinct_cardinality_low sync; drop table if exists distinct_in_order sync; drop table if exists ordinary_distinct sync; CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium); -INSERT INTO distinct_cardinality_low SELECT number % 1e2, number % 1e4, number % 1e6 FROM numbers_mt(1e8); +INSERT INTO distinct_cardinality_low SELECT number % 1e1, number % 1e3, number % 1e5 FROM numbers_mt(1e6); create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1; create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index 327d0d52dc2..2a2ff8799f0 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -15,6 +15,12 @@ DistinctSortedChunkTransform -- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization DistinctSortedStreamTransform DistinctSortedChunkTransform +-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization +DistinctSortedTransform +DistinctSortedChunkTransform +-- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization +DistinctSortedStreamTransform +DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only DistinctTransform DistinctSortedChunkTransform diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index e822f9695b9..f3f39997f0d 100755 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -33,6 +33,12 @@ $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" 
| eval $FIND_DISTINCT +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c settings optimize_distinct_in_order_memory_usage=0" | eval $FIND_DISTINCT + +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c settings optimize_distinct_in_order_memory_usage=1" | eval $FIND_DISTINCT + $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT From 46a6fbd7aa52a85bb925afd6ed1b7e5c229429d7 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 22 Jul 2022 10:55:13 +0000 Subject: [PATCH 140/227] Small polish --- src/Common/ZooKeeper/TestKeeper.h | 2 +- src/Common/ZooKeeper/ZooKeeper.cpp | 2 - src/Common/ZooKeeper/ZooKeeperImpl.cpp | 9 ++-- src/Common/ZooKeeper/ZooKeeperImpl.h | 2 +- src/Coordination/KeeperConstants.h | 8 +-- src/Coordination/KeeperSnapshotManager.cpp | 5 +- src/Coordination/KeeperStorage.cpp | 58 ++++++++++------------ src/Coordination/ZooKeeperDataReader.cpp | 1 - 8 files changed, 39 insertions(+), 48 deletions(-) diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index 2492d2d6ff9..5fcd00b01b0 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -92,7 +92,7 @@ public: DB::KeeperApiVersion getApiVersion() override { - return KeeperApiVersion::V0; + return KeeperApiVersion::ZOOKEEPER_COMPATIBLE; } struct Node diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index a7e93145218..96abf3b543a 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -3,8 +3,6 @@ #include "KeeperException.h" #include "TestKeeper.h" -#include -#include #include #include diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 27f2d1c8f52..6e62fccb6fd 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1,4 +1,3 @@ -#include "Common/ZooKeeper/IKeeper.h" #include #include #include @@ -1079,7 +1078,7 @@ void ZooKeeper::initApiVersion() get(keeper_api_version_path, std::move(callback), {}); if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) { - LOG_TRACE(&Poco::Logger::get("ZooKeeper"), "Failed to get API version: timeout"); + LOG_TRACE(log, "Failed to get API version: timeout"); return; } @@ -1087,7 +1086,7 @@ void ZooKeeper::initApiVersion() if (response.error != Coordination::Error::ZOK) { - LOG_TRACE(&Poco::Logger::get("ZooKeeper"), "Failed to get API version"); + LOG_TRACE(log, "Failed to get API version"); return; } @@ -1095,7 +1094,7 @@ void ZooKeeper::initApiVersion() DB::ReadBufferFromOwnString buf(response.data); DB::readIntText(keeper_version, buf); keeper_api_version = static_cast(keeper_version); - LOG_TRACE(&Poco::Logger::get("ZooKeeper"), "Detected 
server's API version: {}", keeper_api_version); + LOG_TRACE(log, "Detected server's API version: {}", keeper_api_version); } @@ -1215,7 +1214,7 @@ void ZooKeeper::list( WatchCallback watch) { std::shared_ptr request{nullptr}; - if (keeper_api_version < Coordination::KeeperApiVersion::V1) + if (keeper_api_version < Coordination::KeeperApiVersion::WITH_FILTERED_LIST) { if (list_request_type != ListRequestType::ALL) throw Exception("Filtered list request type cannot be used because it's not supported by the server", Error::ZBADARGUMENTS); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index c7e44f2fc9b..4f2098f25ca 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -283,7 +283,7 @@ private: CurrentMetrics::Increment active_session_metric_increment{CurrentMetrics::ZooKeeperSession}; std::shared_ptr zk_log; - DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::V0}; + DB::KeeperApiVersion keeper_api_version{DB::KeeperApiVersion::ZOOKEEPER_COMPATIBLE}; }; } diff --git a/src/Coordination/KeeperConstants.h b/src/Coordination/KeeperConstants.h index ec0bf34246f..eb75fda4547 100644 --- a/src/Coordination/KeeperConstants.h +++ b/src/Coordination/KeeperConstants.h @@ -7,17 +7,17 @@ namespace DB enum class KeeperApiVersion : uint8_t { - V0 = 0, // ZooKeeper compatible version - V1 // added FilteredList request + ZOOKEEPER_COMPATIBLE = 0, + WITH_FILTERED_LIST }; -inline constexpr auto current_keeper_api_version = KeeperApiVersion::V1; +inline constexpr auto current_keeper_api_version = KeeperApiVersion::WITH_FILTERED_LIST; const std::string keeper_system_path = "/keeper"; const std::string keeper_api_version_path = keeper_system_path + "/api_version"; using PathWithData = std::pair; -const std::vector data_for_system_paths +const std::vector child_system_paths_with_data { {keeper_api_version_path, toString(static_cast(current_keeper_api_version))} }; diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 1b863cac7dd..3c5439c23da 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -193,7 +193,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr } /// Serialize data tree - writeBinary(snapshot.snapshot_container_size - data_for_system_paths.size(), out); + writeBinary(snapshot.snapshot_container_size - child_system_paths_with_data.size(), out); size_t counter = 0; for (auto it = snapshot.begin; counter < snapshot.snapshot_container_size; ++counter) { @@ -201,7 +201,10 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr // write only the root system path because of digest if (isChildSystemPath(path.toView())) + { + ++it; continue; + } const auto & node = it->value; diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index d67717e33ff..899f215f72a 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -244,39 +244,41 @@ void KeeperStorage::initializeSystemNodes() if (initialized) throw Exception(ErrorCodes::LOGICAL_ERROR, "KeeperStorage system nodes initialized twice"); - // insert root system path - Node system_node; - system_node.setData(""); - container.insertOrReplace(keeper_system_path, system_node); - // store digest for the empty node because we won't update - // its stats - addDigest(system_node, keeper_system_path); + // insert root system path if it isn't already 
inserted + if (container.find(keeper_system_path) == container.end()) + { + Node system_node; + container.insert(keeper_system_path, system_node); + // store digest for the empty node because we won't update + // its stats + addDigest(system_node, keeper_system_path); - // update root and the digest based on it - auto current_root_it = container.find("/"); - assert(current_root_it != container.end()); - removeDigest(current_root_it->value, "/"); - auto updated_root_it = container.updateValue( - "/", - [](auto & node) - { - ++node.stat.numChildren; - node.addChild(keeper_system_path); - } - ); - addDigest(updated_root_it->value, "/"); + // update root and the digest based on it + auto current_root_it = container.find("/"); + assert(current_root_it != container.end()); + removeDigest(current_root_it->value, "/"); + auto updated_root_it = container.updateValue( + "/", + [](auto & node) + { + ++node.stat.numChildren; + node.addChild(getBaseName(keeper_system_path)); + } + ); + addDigest(updated_root_it->value, "/"); + } // insert child system nodes - for (const auto & [path, data] : data_for_system_paths) + for (const auto & [path, data] : child_system_paths_with_data) { assert(keeper_api_version_path.starts_with(keeper_system_path)); Node child_system_node; - system_node.setData(data); + child_system_node.setData(data); auto [map_key, _] = container.insert(std::string{path}, child_system_node); /// Take child path from key owned by map. auto child_path = getBaseName(map_key->getKey()); container.updateValue( - parentPath(child_path), + parentPath(StringRef(path)), [child_path](auto & parent) { // don't update stats so digest is okay @@ -922,16 +924,6 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce } } - // We cannot store the node because the result should be connected to the binary itself - // this way we avoid incorrect results when we read a snapshot from older Keeper that can have - // lower API version - if (request.path == Coordination::keeper_api_version_path) - { - response.data = std::to_string(static_cast(Coordination::current_keeper_api_version)); - response.error = Coordination::Error::ZOK; - return response_ptr; - } - auto & container = storage.container; auto node_it = container.find(request.path); if (node_it == container.end()) diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index 9e744f4fe1d..6702c4cc718 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include From c6d0ca2940b04a22a5f7b34fda03e05e09e13ecd Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 22 Jul 2022 14:23:05 +0000 Subject: [PATCH 141/227] Fix data race in Avro format --- contrib/avro | 2 +- .../02372_data_race_in_avro.reference | 0 .../0_stateless/02372_data_race_in_avro.sh | 24 +++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02372_data_race_in_avro.reference create mode 100755 tests/queries/0_stateless/02372_data_race_in_avro.sh diff --git a/contrib/avro b/contrib/avro index e43c46e87fd..aac4e605f07 160000 --- a/contrib/avro +++ b/contrib/avro @@ -1 +1 @@ -Subproject commit e43c46e87fd32eafdc09471e95344555454c5ef8 +Subproject commit aac4e605f070f2abd9a5f02ae70c17e4011588e8 diff --git a/tests/queries/0_stateless/02372_data_race_in_avro.reference b/tests/queries/0_stateless/02372_data_race_in_avro.reference new file mode 100644 index 00000000000..e69de29bb2d 
diff --git a/tests/queries/0_stateless/02372_data_race_in_avro.sh b/tests/queries/0_stateless/02372_data_race_in_avro.sh new file mode 100755 index 00000000000..d469e95fa78 --- /dev/null +++ b/tests/queries/0_stateless/02372_data_race_in_avro.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +for i in $(seq 1 10); +do + $CLICKHOUSE_CLIENT -q "drop table if exists t_avro_$i" + $CLICKHOUSE_CLIENT -q "create table t_avro_$i (x UInt32, s String) engine=File(Avro)" +done + +for i in $(seq 1 10); +do + $CLICKHOUSE_CLIENT -q "insert into t_avro_$i select number, 'str' from numbers(1000) settings engine_file_truncate_on_insert=1" > /dev/null & +done + +sleep 5 + +for i in $(seq 1 10); +do + $CLICKHOUSE_CLIENT -q "drop table t_avro_$i" +done + From e016f73b318f3676b1e60621200e9bdaf4742ef1 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Fri, 22 Jul 2022 11:42:14 -0400 Subject: [PATCH 142/227] update Grafana plugin --- docs/en/interfaces/third-party/gui.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index 7f1cd3a411e..c5cc8c7c34b 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -67,7 +67,7 @@ Features: ### Grafana {#grafana} -[Grafana](https://grafana.com/grafana/plugins/vertamedia-clickhouse-datasource) is a platform for monitoring and visualization. +[Grafana](https://grafana.com/grafana/plugins/grafana-clickhouse-datasource/) is a platform for monitoring and visualization. "Grafana allows you to query, visualize, alert on and understand your metrics no matter where they are stored. Create, explore, and share dashboards with your team and foster a data driven culture. Trusted and loved by the community" — grafana.com. From 30a32371eba31a41c4cbcc3327d6c96024328cf6 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Fri, 22 Jul 2022 11:50:59 -0400 Subject: [PATCH 143/227] Remove experimental status from projections --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 3e5a0635339..e216a99f986 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -483,8 +483,6 @@ For example: ## Projections {#projections} Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries. -Projections are an experimental feature. To enable them you must set the [allow_experimental_projection_optimization](../../../operations/settings/settings.md#allow-experimental-projection-optimization) to `1`. See also the [force_optimize_projection](../../../operations/settings/settings.md#force-optimize-projection) setting. - Projections are not supported in the `SELECT` statements with the [FINAL](../../../sql-reference/statements/select/from.md#select-from-final) modifier. ### Projection Query {#projection-query} From d23da91fe7eee79a6e89634ef782fdaea4eee365 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Jul 2022 10:08:37 +0200 Subject: [PATCH 144/227] Fix tests.
--- src/Backups/BackupCoordinationStageSync.cpp | 4 +- src/Backups/BackupsWorker.cpp | 10 ++--- src/Backups/BackupsWorker.h | 4 +- src/Backups/RestorerFromBackup.cpp | 2 +- .../test_backup_restore_new/test.py | 45 +++++++++++-------- .../test_backup_restore_on_cluster/test.py | 40 ++++++++++++----- .../test_concurrency.py | 20 +++++++-- 7 files changed, 80 insertions(+), 45 deletions(-) diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index 5524029bbf2..4b94e474345 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -15,7 +15,7 @@ namespace DB namespace ErrorCodes { extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; -} +} BackupCoordinationStageSync::BackupCoordinationStageSync(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, Poco::Logger * log_) @@ -42,7 +42,7 @@ void BackupCoordinationStageSync::set(const String & current_host, const String auto code = zookeeper->tryCreate(alive_node_path, "", zkutil::CreateMode::Ephemeral); if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNODEEXISTS) throw zkutil::KeeperException(code, alive_node_path); - + zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, ""); zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message); } diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index df501258db6..b0d3fb55f2a 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -488,10 +488,10 @@ void BackupsWorker::addInfo(const UUID & uuid, bool internal, const String & bac info.status = status; info.status_changed_time = time(nullptr); info.internal = internal; - + std::lock_guard lock{infos_mutex}; infos[{uuid, internal}] = std::move(info); - + num_active_backups += getNumActiveBackupsChange(status); num_active_restores += getNumActiveRestoresChange(status); } @@ -503,7 +503,7 @@ void BackupsWorker::setStatus(const UUID & uuid, bool internal, BackupStatus sta auto it = infos.find({uuid, internal}); if (it == infos.end()) return; - + auto & info = it->second; auto old_status = info.status; info.status = status; @@ -552,10 +552,10 @@ void BackupsWorker::shutdown() bool has_active_backups_or_restores = (num_active_backups || num_active_restores); if (has_active_backups_or_restores) LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores); - + backups_thread_pool.wait(); restores_thread_pool.wait(); - + if (has_active_backups_or_restores) LOG_INFO(log, "All backup and restore tasks have finished"); } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index f471dc822e0..7ae69271d26 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -59,13 +59,13 @@ public: private: std::pair startMakingBackup(const ASTPtr & query, const ContextPtr & context); - + void doBackup(const UUID & backup_uuid, const std::shared_ptr & backup_query, BackupSettings backup_settings, const BackupInfo & backup_info, std::shared_ptr backup_coordination, const ContextPtr & context, ContextMutablePtr mutable_context, bool called_async); std::pair startRestoring(const ASTPtr & query, ContextMutablePtr context); - + void doRestore(const UUID & restore_uuid, const std::shared_ptr & restore_query, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); 
diff --git a/src/Backups/RestorerFromBackup.cpp b/src/Backups/RestorerFromBackup.cpp index 5e43d59ae56..3a2f5273611 100644 --- a/src/Backups/RestorerFromBackup.cpp +++ b/src/Backups/RestorerFromBackup.cpp @@ -154,7 +154,7 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa { LOG_TRACE(log, "{}", toUpperFirst(new_stage)); current_stage = new_stage; - + if (restore_coordination) { restore_coordination->setStage(restore_settings.host_id, new_stage, message); diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index e490634e552..47f7c47d608 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -304,11 +304,13 @@ def test_async(): [id, _, status] = instance.query( f"BACKUP TABLE test.table TO {backup_name} ASYNC" ).split("\t") + assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" + assert_eq_with_retry( instance, - f"SELECT status FROM system.backups WHERE uuid='{id}'", - "BACKUP_COMPLETE\n", + f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + TSV([["BACKUP_COMPLETE", ""]]), ) instance.query("DROP TABLE test.table") @@ -316,9 +318,13 @@ def test_async(): [id, _, status] = instance.query( f"RESTORE TABLE test.table FROM {backup_name} ASYNC" ).split("\t") + assert status == "RESTORING\n" or status == "RESTORED\n" + assert_eq_with_retry( - instance, f"SELECT status FROM system.backups WHERE uuid='{id}'", "RESTORED\n" + instance, + f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + TSV([["RESTORED", ""]]), ) assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" @@ -341,14 +347,13 @@ def test_async_backups_to_same_destination(interface): assert_eq_with_retry( instance, - f"SELECT count() FROM system.backups WHERE uuid IN ['{id1}', '{id2}'] AND status != 'BACKUP_COMPLETE' AND status != 'FAILED_TO_BACKUP'", - "0\n", + f"SELECT status FROM system.backups WHERE uuid IN ['{id1}', '{id2}'] AND status == 'MAKING_BACKUP'", + "", ) - assert ( - instance.query(f"SELECT status FROM system.backups WHERE uuid='{id1}'") - == "BACKUP_COMPLETE\n" - ) + assert instance.query( + f"SELECT status, error FROM system.backups WHERE uuid='{id1}'" + ) == TSV([["BACKUP_COMPLETE", ""]]) assert ( instance.query(f"SELECT status FROM system.backups WHERE uuid='{id2}'") @@ -747,24 +752,26 @@ def test_system_users_async(): instance.query("CREATE USER u1 IDENTIFIED BY 'qwe123' SETTINGS custom_c = 3") backup_name = new_backup_name() - [id, _, status] = instance.query( + id = instance.query( f"BACKUP DATABASE default, TABLE system.users, TABLE system.roles, TABLE system.settings_profiles, TABLE system.row_policies, TABLE system.quotas TO {backup_name} ASYNC" - ).split("\t") + ).split("\t")[0] + assert_eq_with_retry( instance, - f"SELECT status FROM system.backups WHERE uuid='{id}'", - "BACKUP_COMPLETE\n", + f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + TSV([["BACKUP_COMPLETE", ""]]), ) instance.query("DROP USER u1") - [id, _, status] = instance.query( + id = instance.query( f"RESTORE DATABASE default, TABLE system.users, TABLE system.roles, TABLE system.settings_profiles, TABLE system.row_policies, TABLE system.quotas FROM {backup_name} ASYNC" - ).split("\t") + ).split("\t")[0] + assert_eq_with_retry( instance, - f"SELECT status FROM system.backups WHERE uuid='{id}'", - "RESTORED\n", + f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + TSV([["RESTORED", ""]]), ) assert ( 
@@ -889,8 +896,8 @@ def test_mutation(): ) instance.query("ALTER TABLE test.table UPDATE x=x+1 WHERE 1") - instance.query("ALTER TABLE test.table UPDATE x=x+1+sleep(1) WHERE 1") - instance.query("ALTER TABLE test.table UPDATE x=x+1+sleep(2) WHERE 1") + instance.query("ALTER TABLE test.table UPDATE x=x+1+sleep(3) WHERE 1") + instance.query("ALTER TABLE test.table UPDATE x=x+1+sleep(3) WHERE 1") backup_name = new_backup_name() instance.query(f"BACKUP TABLE test.table TO {backup_name}") diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index df03ebd320b..58fac12f041 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -404,8 +404,8 @@ def test_replicated_database_async(): assert_eq_with_retry( node1, - f"SELECT status FROM system.backups WHERE uuid='{id}'", - "BACKUP_COMPLETE\n", + f"SELECT status, error FROM system.backups WHERE uuid='{id}' AND NOT internal", + TSV([["BACKUP_COMPLETE", ""]]), ) node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' NO DELAY") @@ -417,7 +417,9 @@ def test_replicated_database_async(): assert status == "RESTORING\n" or status == "RESTORED\n" assert_eq_with_retry( - node1, f"SELECT status FROM system.backups WHERE uuid='{id}'", "RESTORED\n" + node1, + f"SELECT status, error FROM system.backups WHERE uuid='{id}' AND NOT internal", + TSV([["RESTORED", ""]]), ) node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl") @@ -460,8 +462,8 @@ def test_async_backups_to_same_destination(interface, on_cluster): for i in range(len(nodes)): assert_eq_with_retry( nodes[i], - f"SELECT count() FROM system.backups WHERE uuid='{ids[i]}' AND status != 'BACKUP_COMPLETE' AND status != 'FAILED_TO_BACKUP'", - "0\n", + f"SELECT status FROM system.backups WHERE uuid='{ids[i]}' AND status == 'MAKING_BACKUP'", + "", ) num_completed_backups = sum( @@ -469,7 +471,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): int( nodes[i] .query( - f"SELECT count() FROM system.backups WHERE uuid='{ids[i]}' AND status == 'BACKUP_COMPLETE'" + f"SELECT count() FROM system.backups WHERE uuid='{ids[i]}' AND status == 'BACKUP_COMPLETE' AND NOT internal" ) .strip() ) @@ -477,7 +479,16 @@ def test_async_backups_to_same_destination(interface, on_cluster): ] ) + if num_completed_backups != 1: + for i in range(len(nodes)): + print( + nodes[i].query( + f"SELECT status, error FROM system.backups WHERE uuid='{ids[i]}' AND NOT internal" + ) + ) + assert num_completed_backups == 1 + node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") node1.query(f"RESTORE TABLE tbl FROM {backup_name}") assert node1.query("SELECT * FROM tbl") == "1\n" @@ -752,8 +763,8 @@ def test_mutation(): node1.query("INSERT INTO tbl SELECT number, toString(number) FROM numbers(10, 5)") node1.query("ALTER TABLE tbl UPDATE x=x+1 WHERE 1") - node1.query("ALTER TABLE tbl UPDATE x=x+1+sleep(1) WHERE 1") - node1.query("ALTER TABLE tbl UPDATE x=x+1+sleep(2) WHERE 1") + node1.query("ALTER TABLE tbl UPDATE x=x+1+sleep(3) WHERE 1") + node1.query("ALTER TABLE tbl UPDATE x=x+1+sleep(3) WHERE 1") backup_name = new_backup_name() node1.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {backup_name}") @@ -783,7 +794,7 @@ def test_get_error_from_other_host(): @pytest.mark.parametrize("kill", [False, True]) -def test_stop_other_host_while_backup(kill): +def test_stop_other_host_during_backup(kill): node1.query( "CREATE TABLE tbl ON CLUSTER 'cluster' (" "x UInt8" @@ -806,11 
+817,14 @@ def test_stop_other_host_while_backup(kill): assert_eq_with_retry( node1, - f"SELECT status FROM system.backups WHERE uuid='{id}' AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups WHERE uuid='{id}' AND status == 'MAKING_BACKUP' AND NOT internal", "", + retry_count=100, ) - status = node1.query(f"SELECT status FROM system.backups WHERE uuid='{id}'").strip() + status = node1.query( + f"SELECT status FROM system.backups WHERE uuid='{id}' AND NOT internal" + ).strip() if kill: assert status in ["BACKUP_COMPLETE", "FAILED_TO_BACKUP"] @@ -824,4 +838,6 @@ def test_stop_other_host_while_backup(kill): node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5]) elif status == "FAILED_TO_BACKUP": - assert not os.path.exists(get_path_to_backup(backup_name)) + assert not os.path.exists( + os.path.join(get_path_to_backup(backup_name), ".backup") + ) diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index d399723c8b1..ee26f08f14e 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -121,10 +121,14 @@ def test_concurrent_backups_on_same_node(): assert_eq_with_retry( node0, - f"SELECT status, error FROM system.backups WHERE status != 'BACKUP_COMPLETE' AND status != 'FAILED_TO_BACKUP' AND uuid IN {ids_list}", + f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND uuid IN {ids_list}", "", ) + assert node0.query( + f"SELECT status, error FROM system.backups WHERE uuid IN {ids_list} AND NOT internal" + ) == TSV([["BACKUP_COMPLETE", ""]] * num_concurrent_backups) + for backup_name in backup_names: node0.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") node0.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") @@ -136,6 +140,7 @@ def test_concurrent_backups_on_same_node(): def test_concurrent_backups_on_different_nodes(): create_and_fill_table() + assert num_concurrent_backups <= num_nodes backup_names = [new_backup_name() for _ in range(num_concurrent_backups)] ids = [] @@ -150,13 +155,13 @@ def test_concurrent_backups_on_different_nodes(): for i in range(num_concurrent_backups): assert_eq_with_retry( nodes[i], - f"SELECT status, error FROM system.backups WHERE status != 'BACKUP_COMPLETE' AND status != 'FAILED_TO_BACKUP' AND uuid = '{ids[i]}'", + f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND uuid = '{ids[i]}'", "", ) for i in range(num_concurrent_backups): assert nodes[i].query( - f"SELECT status, error FROM system.backups WHERE uuid = '{ids[i]}'" + f"SELECT status, error FROM system.backups WHERE uuid = '{ids[i]}' AND NOT internal" ) == TSV([["BACKUP_COMPLETE", ""]]) for i in range(num_concurrent_backups): @@ -231,7 +236,14 @@ def test_create_or_drop_tables_during_backup(): for node in nodes: assert_eq_with_retry( node, - f"SELECT status, error from system.backups WHERE uuid IN {ids_list} AND (status == 'MAKING_BACKUP')", + f"SELECT status from system.backups WHERE uuid IN {ids_list} AND (status == 'MAKING_BACKUP')", + "", + ) + + for node in nodes: + assert_eq_with_retry( + node, + f"SELECT status, error from system.backups WHERE uuid IN {ids_list} AND (status == 'FAILED_TO_BACKUP')", "", ) From d23da91fe7eee79a6e89634ef782fdaea4eee365 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: 
Fri, 22 Jul 2022 18:41:31 +0200 Subject: [PATCH 145/227] Add tag --- tests/queries/0_stateless/02372_data_race_in_avro.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02372_data_race_in_avro.sh b/tests/queries/0_stateless/02372_data_race_in_avro.sh index d469e95fa78..49c34e31923 100755 --- a/tests/queries/0_stateless/02372_data_race_in_avro.sh +++ b/tests/queries/0_stateless/02372_data_race_in_avro.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 8283cc9e2cc242b2339d357134f9ccb17edc1fdb Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 22 Jul 2022 17:59:43 +0000 Subject: [PATCH 146/227] Fix possible heap-buffer-overflow in Avro --- contrib/avro | 2 +- .../0_stateless/02372_data_race_in_avro.sh | 2 +- .../02373_heap_buffer_overflow_in_avro.reference | 1 + .../02373_heap_buffer_overflow_in_avro.sh | 13 +++++++++++++ .../queries/0_stateless/data_avro/corrupted.avro | Bin 0 -> 161 bytes 5 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.reference create mode 100755 tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.sh create mode 100644 tests/queries/0_stateless/data_avro/corrupted.avro diff --git a/contrib/avro b/contrib/avro index aac4e605f07..7832659ec98 160000 --- a/contrib/avro +++ b/contrib/avro @@ -1 +1 @@ -Subproject commit aac4e605f070f2abd9a5f02ae70c17e4011588e8 +Subproject commit 7832659ec986075d560f930c288e973c64679552 diff --git a/tests/queries/0_stateless/02372_data_race_in_avro.sh b/tests/queries/0_stateless/02372_data_race_in_avro.sh index 49c34e31923..50a7ae1e3c5 100755 --- a/tests/queries/0_stateless/02372_data_race_in_avro.sh +++ b/tests/queries/0_stateless/02372_data_race_in_avro.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest +# Tags: no-fasttest, no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh diff --git a/tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.reference b/tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.sh b/tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.sh new file mode 100755 index 00000000000..23d6b722c09 --- /dev/null +++ b/tests/queries/0_stateless/02373_heap_buffer_overflow_in_avro.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +cp $CURDIR/data_avro/corrupted.avro $USER_FILES_PATH/ + +$CLICKHOUSE_CLIENT -q "select * from file(corrupted.avro)" 2>&1 | grep -F -q "Cannot read compressed data" && echo "OK" || echo "FAIL" + diff --git a/tests/queries/0_stateless/data_avro/corrupted.avro b/tests/queries/0_stateless/data_avro/corrupted.avro new file mode 100644 index 0000000000000000000000000000000000000000..0393b231d0824a7fb28d5c934c225f817c48ee82 GIT binary patch literal 161 zcmeZI%3@>@Nh~YM*GtY%NloS{&PyyPs1yT>6enk-<|g(qRx6cM7Nja!DHWwA=NF|Y z=_utT=7Rb8naW@=7Kv65AEHB7t$C<@h5TvC*om#$PBTg$++>VihV92qah>r*}6 gcX0|b$#E(O*cCndG_T#y=l9c_lh74x5xc_$07-p0x&QzG literal 0 HcmV?d00001 From 739ff34c6e15f5f355937b610bd100509549c06d Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 22 Jul 2022 22:46:56 +0000 Subject: [PATCH 147/227] Add some tests, still not sure about optimize_memory_usage option --- .../Transforms/DistinctSortedChunkTransform.cpp | 17 ++++++++++++----- tests/performance/distinct_in_order.xml | 6 ++++-- ...317_distinct_in_order_optimization.reference | 1 + .../02317_distinct_in_order_optimization.sql | 16 +++++++++++++--- ...inct_in_order_optimization_explain.reference | 4 ++-- ...17_distinct_in_order_optimization_explain.sh | 4 ++-- 6 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp index 8604cca5a5c..e989fad3d1f 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp @@ -62,7 +62,8 @@ void DistinctSortedChunkTransform::initChunkProcessing(const Columns & input_col data.init(ClearableSetVariants::chooseMethod(other_columns, other_columns_sizes)); } -size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange(IColumn::Filter & filter, size_t range_begin, size_t range_end, bool clear_data) +size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange( + IColumn::Filter & filter, const size_t range_begin, const size_t range_end, const bool clear_data) { size_t count = 0; switch (data.type) @@ -84,7 +85,7 @@ size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange(IColumn::Filter & f template size_t DistinctSortedChunkTransform::buildFilterForRange( - Method & method, IColumn::Filter & filter, size_t range_begin, size_t range_end, bool clear_data) + Method & method, IColumn::Filter & filter, const size_t range_begin, const size_t range_end, const bool clear_data) { typename Method::State state(other_columns, other_columns_sizes, nullptr); if (clear_data) @@ -93,11 +94,11 @@ size_t DistinctSortedChunkTransform::buildFilterForRange( size_t count = 0; for (size_t i = range_begin; i < range_end; ++i) { - auto emplace_result = state.emplaceKey(method.data, i, data.string_pool); + const auto emplace_result = state.emplaceKey(method.data, i, data.string_pool); /// emit the record if there is no such key in the current set, skip otherwise filter[i] = emplace_result.isInserted(); - if (filter[i]) + if (emplace_result.isInserted()) ++count; } return count; @@ -106,7 +107,7 @@ size_t DistinctSortedChunkTransform::buildFilterForRange( void DistinctSortedChunkTransform::saveLatestKey(const size_t row_pos) { prev_chunk_latest_key.clear(); - for (auto const & col : sorted_columns) + for (const auto & 
col : sorted_columns) { prev_chunk_latest_key.emplace_back(col->cloneEmpty()); prev_chunk_latest_key.back()->insertFrom(*col, row_pos); @@ -224,6 +225,12 @@ void DistinctSortedChunkTransform::transform(Chunk & chunk) // set where next range start range_begin = range_end; } + /// if there is no any new rows in this chunk, just skip it + // if (output_rows) + // { + // chunk.clear(); + // return; + // } saveLatestKey(chunk_rows - 1); diff --git a/tests/performance/distinct_in_order.xml b/tests/performance/distinct_in_order.xml index 834a6945622..ee1094ed395 100644 --- a/tests/performance/distinct_in_order.xml +++ b/tests/performance/distinct_in_order.xml @@ -8,8 +8,9 @@ SELECT DISTINCT high, medium FROM distinct_cardinality_high FORMAT Null SELECT DISTINCT high, medium, low FROM distinct_cardinality_high FORMAT Null - SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY medium FORMAT Null + SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY high, medium FORMAT Null SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY high FORMAT Null + SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY medium FORMAT Null SELECT DISTINCT high, low FROM distinct_cardinality_high ORDER BY low FORMAT Null SELECT DISTINCT high, medium, low FROM distinct_cardinality_high ORDER BY low FORMAT Null @@ -24,8 +25,9 @@ SELECT DISTINCT low, high FROM distinct_cardinality_low FORMAT Null SELECT DISTINCT low, medium, high FROM distinct_cardinality_low FORMAT Null - SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null + SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY low, medium FORMAT Null SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY low FORMAT Null + SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null SELECT DISTINCT low, high FROM distinct_cardinality_low ORDER BY high FORMAT Null SELECT DISTINCT low, medium, high FROM distinct_cardinality_low ORDER BY high FORMAT Null diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference index a5ae3759d5d..a82e9b5cc2e 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference @@ -78,3 +78,4 @@ 1 1 0 0 -- check that distinct in order has the same result as ordinary distinct +-- check that distinct in order has the same result as ordinary distinct, optimize memory usage diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index d43366a55fe..14618baff95 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -47,15 +47,25 @@ drop table if exists distinct_in_order sync; select '-- check that distinct in order has the same result as ordinary distinct'; drop table if exists distinct_cardinality_low sync; +CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium); +INSERT INTO distinct_cardinality_low SELECT number % 1e1, number % 1e2, number % 1e3 FROM numbers_mt(1e4); drop table if exists distinct_in_order sync; drop table if exists ordinary_distinct sync; -CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() 
ORDER BY (low, medium); -INSERT INTO distinct_cardinality_low SELECT number % 1e1, number % 1e3, number % 1e5 FROM numbers_mt(1e6); create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); -insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1; +insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1, optimize_distinct_in_order_memory_usage=0; create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; select distinct * from distinct_in_order except select * from ordinary_distinct; + +select '-- check that distinct in order has the same result as ordinary distinct, optimize memory usage'; +drop table if exists distinct_in_order; +drop table if exists ordinary_distinct; +create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); +insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1, optimize_distinct_in_order_memory_usage=1; +create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); +insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; +select distinct * from distinct_in_order except select * from ordinary_distinct; + drop table if exists distinct_in_order; drop table if exists ordinary_distinct; drop table if exists distinct_cardinality_low; diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index 2a2ff8799f0..b41d853e3eb 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -15,10 +15,10 @@ DistinctSortedChunkTransform -- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization DistinctSortedStreamTransform DistinctSortedChunkTransform --- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization +-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization DistinctSortedTransform DistinctSortedChunkTransform --- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization +-- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization DistinctSortedStreamTransform DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index f3f39997f0d..c19bc3e6aaf 100755 --- 
a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -33,10 +33,10 @@ $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c settings optimize_distinct_in_order_memory_usage=0" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c settings optimize_distinct_in_order_memory_usage=1" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'" From 95511428b303010676d3229d7d8bed0404149299 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sat, 23 Jul 2022 00:03:26 +0000 Subject: [PATCH 148/227] Couple optimizations + do not apply filter to chunk if there is no data for output + checking clear_data flag at compile time --- .../DistinctSortedChunkTransform.cpp | 29 ++++++++++--------- .../Transforms/DistinctSortedChunkTransform.h | 5 ++-- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp index e989fad3d1f..28a3260d742 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp @@ -62,8 +62,8 @@ void DistinctSortedChunkTransform::initChunkProcessing(const Columns & input_col data.init(ClearableSetVariants::chooseMethod(other_columns, other_columns_sizes)); } -size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange( - IColumn::Filter & filter, const size_t range_begin, const size_t range_end, const bool clear_data) +template +size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange(IColumn::Filter & filter, const size_t range_begin, const size_t range_end) { size_t count = 0; switch (data.type) @@ -73,7 +73,8 @@ size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange( // clang-format off #define M(NAME) \ case ClearableSetVariants::Type::NAME: \ - count = buildFilterForRange(*data.NAME, filter, range_begin, range_end, clear_data); \ + if constexpr (clear_data) data.NAME->data.clear(); \ + count = buildFilterForRange(*data.NAME, filter, range_begin, range_end); \ break; 
APPLY_FOR_SET_VARIANTS(M) @@ -85,11 +86,9 @@ size_t DistinctSortedChunkTransform::ordinaryDistinctOnRange( template size_t DistinctSortedChunkTransform::buildFilterForRange( - Method & method, IColumn::Filter & filter, const size_t range_begin, const size_t range_end, const bool clear_data) + Method & method, IColumn::Filter & filter, const size_t range_begin, const size_t range_end) { typename Method::State state(other_columns, other_columns_sizes, nullptr); - if (clear_data) - method.data.clear(); size_t count = 0; for (size_t i = range_begin; i < range_end; ++i) @@ -180,7 +179,10 @@ std::pair DistinctSortedChunkTransform::continueWithPrevRange(co if (other_columns.empty()) std::fill(filter.begin(), filter.begin() + range_end, 0); /// skip rows already included in distinct on previous transform() else - output_rows = ordinaryDistinctOnRange(filter, 0, range_end, false); + { + constexpr bool clear_data = false; + output_rows = ordinaryDistinctOnRange(filter, 0, range_end); + } return {range_end, output_rows}; } @@ -219,18 +221,19 @@ void DistinctSortedChunkTransform::transform(Chunk & chunk) else { // ordinary distinct in range if there are "non-sorted" columns - output_rows += ordinaryDistinctOnRange(filter, range_begin, range_end, true); + constexpr bool clear_data = true; + output_rows += ordinaryDistinctOnRange(filter, range_begin, range_end); } // set where next range start range_begin = range_end; } /// if there is no any new rows in this chunk, just skip it - // if (output_rows) - // { - // chunk.clear(); - // return; - // } + if (!output_rows) + { + chunk.clear(); + return; + } saveLatestKey(chunk_rows - 1); diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.h b/src/Processors/Transforms/DistinctSortedChunkTransform.h index 0ce8addbf7e..188e3d5c4c7 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.h +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.h @@ -43,7 +43,8 @@ protected: private: void initChunkProcessing(const Columns & input_columns); std::pair continueWithPrevRange(size_t chunk_rows, IColumn::Filter & filter); - size_t ordinaryDistinctOnRange(IColumn::Filter & filter, size_t range_begin, size_t range_end, bool clear_data); + template + size_t ordinaryDistinctOnRange(IColumn::Filter & filter, size_t range_begin, size_t range_end); inline void saveLatestKey(size_t row_pos); inline bool isLatestKeyFromPrevChunk(size_t row_pos) const; inline bool isKey(size_t key_pos, size_t row_pos) const; @@ -51,7 +52,7 @@ private: inline size_t getRangeEnd(size_t range_begin, size_t range_end, Predicate pred) const; template - size_t buildFilterForRange(Method & method, IColumn::Filter & filter, size_t range_begin, size_t range_end, bool clear_data); + size_t buildFilterForRange(Method & method, IColumn::Filter & filter, size_t range_begin, size_t range_end); ClearableSetVariants data; From 6aff87d4b5ab8186dfede2b0b39583db60e3d689 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Sat, 23 Jul 2022 13:46:59 +0000 Subject: [PATCH 149/227] Ignore system paths from snapshots and logstore --- src/Common/ZooKeeper/IKeeper.h | 2 + src/Coordination/KeeperSnapshotManager.cpp | 47 ++++++++++++++++++---- src/Coordination/KeeperStorage.cpp | 7 +++- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index a6ed21bc1d3..c6aa954688b 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -58,6 +58,8 @@ struct Stat int32_t dataLength{0}; /// NOLINT 
int32_t numChildren{0}; /// NOLINT int64_t pzxid{0}; + + bool operator==(const Stat &) const = default; }; enum class Error : int32_t diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 3c5439c23da..bf7550b7d17 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -148,10 +148,26 @@ namespace namespace { -bool isChildSystemPath(const std::string_view path) +enum class PathMatchResult { - auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), keeper_system_path.begin(), keeper_system_path.end()); - return first_it != path.end() && *first_it == '/' && second_it == keeper_system_path.end(); + NOT_MATCH, + EXACT, + IS_CHILD +}; + +PathMatchResult matchPath(const std::string_view path, const std::string_view match_to) +{ + using enum PathMatchResult; + + auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); + + if (second_it != match_to.end()) + return NOT_MATCH; + + if (first_it == path.end()) + return EXACT; + + return *first_it == '/' ? IS_CHILD : NOT_MATCH; } } @@ -200,7 +216,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr const auto & path = it->key; // write only the root system path because of digest - if (isChildSystemPath(path.toView())) + if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD) { ++it; continue; @@ -336,20 +352,35 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial if (recalculate_digest) storage.nodes_digest = 0; - size_t current_size = 0; - while (current_size < snapshot_container_size) + const auto is_node_empty = [](const auto & node) + { + return node.getData().empty() && node.stat == Coordination::Stat{}; + }; + + for (size_t nodes_read = 0; nodes_read < snapshot_container_size; ++nodes_read) { std::string path; readBinary(path, in); KeeperStorage::Node node{}; readNode(node, in, current_version, storage.acl_map); + using enum PathMatchResult; + auto match_result = matchPath(path, keeper_system_path); + if ((match_result == EXACT && !is_node_empty(node)) || match_result == IS_CHILD) + { + LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "Cannot read node on path {} from a snapshot because it is used as a system node.", path); + + if (match_result == IS_CHILD) + continue; + + node = KeeperStorage::Node{}; + } + + storage.container.insertOrReplace(path, node); if (node.stat.ephemeralOwner != 0) storage.ephemerals[node.stat.ephemeralOwner].insert(path); - current_size++; - if (recalculate_digest) storage.nodes_digest += node.getDigest(path); } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 899f215f72a..3c848baa630 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -808,8 +808,11 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr path_created += seq_num_str.str(); } - if (storage.uncommitted_state.getNode(path_created)) - return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; + if (path_created.starts_with(keeper_system_path)) + { + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to create a node inside the internal Keeper path ({}) which is not allowed. 
Path: {}", keeper_system_path, path_created); + return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; + } if (getBaseName(path_created).size == 0) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; From afb6cb6824bfc1e24761724ab8c8564c1af6f714 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Sat, 23 Jul 2022 14:27:44 +0000 Subject: [PATCH 150/227] Add KeeperContext --- programs/keeper-converter/KeeperConverter.cpp | 7 +- src/Coordination/KeeperContext.h | 20 ++++++ src/Coordination/KeeperServer.cpp | 30 +++++--- src/Coordination/KeeperServer.h | 3 + src/Coordination/KeeperSnapshotManager.cpp | 23 ++++--- src/Coordination/KeeperSnapshotManager.h | 12 ++-- src/Coordination/KeeperStateMachine.cpp | 18 +++-- src/Coordination/KeeperStateMachine.h | 7 +- src/Coordination/KeeperStorage.cpp | 17 +++-- src/Coordination/KeeperStorage.h | 5 +- src/Coordination/tests/gtest_coordination.cpp | 68 ++++++++++--------- 11 files changed, 129 insertions(+), 81 deletions(-) create mode 100644 src/Coordination/KeeperContext.h diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index ae47287d8ee..42e0894257a 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -39,7 +39,10 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) try { - DB::KeeperStorage storage(500, "", true, false); + auto keeper_context = std::make_shared(); + keeper_context->digest_enabled = true; + + DB::KeeperStorage storage(500, "", keeper_context, false); DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as(), logger); storage.initializeSystemNodes(); @@ -48,7 +51,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(storage.getZXID(), 1, std::make_shared()); DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta); - DB::KeeperSnapshotManager manager(options["output-dir"].as(), 1); + DB::KeeperSnapshotManager manager(options["output-dir"].as(), 1, keeper_context); auto snp = manager.serializeSnapshotToBuffer(snapshot); auto path = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID()); std::cout << "Snapshot serialized to path:" << path << std::endl; diff --git a/src/Coordination/KeeperContext.h b/src/Coordination/KeeperContext.h new file mode 100644 index 00000000000..b53893039a1 --- /dev/null +++ b/src/Coordination/KeeperContext.h @@ -0,0 +1,20 @@ +#pragma once + +namespace DB +{ + +struct KeeperContext +{ + enum class Phase : uint8_t + { + INIT, + RUNNING + }; + + Phase server_state{Phase::INIT}; + bool digest_enabled{true}; +}; + +using KeeperContextPtr = std::shared_ptr; + +} diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 8261f5d1e26..4378fe6b09c 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -106,20 +106,30 @@ KeeperServer::KeeperServer( SnapshotsQueue & snapshots_queue_) : server_id(configuration_and_settings_->server_id) , coordination_settings(configuration_and_settings_->coordination_settings) - , state_machine(nuraft::cs_new( - responses_queue_, - snapshots_queue_, - configuration_and_settings_->snapshot_storage_path, - coordination_settings, - checkAndGetSuperdigest(configuration_and_settings_->super_digest), - config.getBool("keeper_server.digest_enabled", false))) - , state_manager(nuraft::cs_new( - server_id, "keeper_server", 
configuration_and_settings_->log_storage_path, configuration_and_settings_->state_file_path, config, coordination_settings)) , log(&Poco::Logger::get("KeeperServer")) , is_recovering(config.has("keeper_server.force_recovery") && config.getBool("keeper_server.force_recovery")) + , keeper_context{std::make_shared()} { if (coordination_settings->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); + + keeper_context->digest_enabled = config.getBool("keeper_server.digest_enabled", false); + + state_machine = nuraft::cs_new( + responses_queue_, + snapshots_queue_, + configuration_and_settings_->snapshot_storage_path, + coordination_settings, + keeper_context, + checkAndGetSuperdigest(configuration_and_settings_->super_digest)); + + state_manager = nuraft::cs_new( + server_id, + "keeper_server", + configuration_and_settings_->log_storage_path, + configuration_and_settings_->state_file_path, + config, + coordination_settings); } /** @@ -341,6 +351,8 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config; launchRaftServer(enable_ipv6); + + keeper_context->server_state = KeeperContext::Phase::RUNNING; } void KeeperServer::shutdownRaftServer() diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index f6524ce97a1..74dd05631f0 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB { @@ -61,6 +62,8 @@ private: std::atomic_bool is_recovering = false; + std::shared_ptr keeper_context; + public: KeeperServer( const KeeperConfigurationAndSettingsPtr & settings_, diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index bf7550b7d17..8491c662a17 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -13,6 +13,7 @@ #include #include #include +#include "Coordination/KeeperContext.h" #include namespace DB @@ -172,7 +173,7 @@ PathMatchResult matchPath(const std::string_view path, const std::string_view ma } -void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out) +void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context) { writeBinary(static_cast(snapshot.version), out); serializeSnapshotMetadata(snapshot.snapshot_meta, out); @@ -180,7 +181,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr if (snapshot.version >= SnapshotVersion::V5) { writeBinary(snapshot.zxid, out); - if (snapshot.storage->digest_enabled) + if (keeper_context->digest_enabled) { writeBinary(static_cast(KeeperStorage::CURRENT_DIGEST_VERSION), out); writeBinary(snapshot.nodes_digest, out); @@ -277,7 +278,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr } } -void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in) +void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context) { uint8_t version; readBinary(version, in); @@ -288,7 +289,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in); KeeperStorage & storage = *deserialization_result.storage; - bool 
recalculate_digest = storage.digest_enabled; + bool recalculate_digest = keeper_context->digest_enabled; if (version >= SnapshotVersion::V5) { readBinary(storage.zxid, in); @@ -503,16 +504,16 @@ KeeperStorageSnapshot::~KeeperStorageSnapshot() KeeperSnapshotManager::KeeperSnapshotManager( const std::string & snapshots_path_, size_t snapshots_to_keep_, + const KeeperContextPtr & keeper_context_, bool compress_snapshots_zstd_, const std::string & superdigest_, - size_t storage_tick_time_, - const bool digest_enabled_) + size_t storage_tick_time_) : snapshots_path(snapshots_path_) , snapshots_to_keep(snapshots_to_keep_) , compress_snapshots_zstd(compress_snapshots_zstd_) , superdigest(superdigest_) , storage_tick_time(storage_tick_time_) - , digest_enabled(digest_enabled_) + , keeper_context(keeper_context_) { namespace fs = std::filesystem; @@ -606,7 +607,7 @@ nuraft::ptr KeeperSnapshotManager::serializeSnapshotToBuffer(con else compressed_writer = std::make_unique(*writer); - KeeperStorageSnapshot::serialize(snapshot, *compressed_writer); + KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context); compressed_writer->finalize(); return buffer_raw_ptr->getBuffer(); } @@ -635,8 +636,8 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff compressed_reader = std::make_unique(*reader); SnapshotDeserializationResult result; - result.storage = std::make_unique(storage_tick_time, superdigest, digest_enabled, false); - KeeperStorageSnapshot::deserialize(result, *compressed_reader); + result.storage = std::make_unique(storage_tick_time, superdigest, keeper_context, false); + KeeperStorageSnapshot::deserialize(result, *compressed_reader, keeper_context); result.storage->initializeSystemNodes(); return result; } @@ -682,7 +683,7 @@ std::pair KeeperSnapshotManager::serializeSnapshot else compressed_writer = std::make_unique(*writer); - KeeperStorageSnapshot::serialize(snapshot, *compressed_writer); + KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context); compressed_writer->finalize(); compressed_writer->sync(); diff --git a/src/Coordination/KeeperSnapshotManager.h b/src/Coordination/KeeperSnapshotManager.h index 57174cbc6f5..4984e54f15f 100644 --- a/src/Coordination/KeeperSnapshotManager.h +++ b/src/Coordination/KeeperSnapshotManager.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -55,9 +56,9 @@ public: ~KeeperStorageSnapshot(); - static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out); + static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context); - static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in); + static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context); KeeperStorage * storage; @@ -99,10 +100,10 @@ public: KeeperSnapshotManager( const std::string & snapshots_path_, size_t snapshots_to_keep_, + const KeeperContextPtr & keeper_context_, bool compress_snapshots_zstd_ = true, const std::string & superdigest_ = "", - size_t storage_tick_time_ = 500, - bool digest_enabled_ = true); + size_t storage_tick_time_ = 500); /// Restore storage from latest available snapshot SnapshotDeserializationResult restoreFromLatestSnapshot(); @@ -168,7 +169,8 @@ private: const std::string superdigest; /// Storage sessions timeout check interval (also for deserializatopn) size_t storage_tick_time; - const bool digest_enabled; + 
+ KeeperContextPtr keeper_context; }; /// Keeper create snapshots in background thread. KeeperStateMachine just create diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 3c899a268d8..f43a3dbb319 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -41,22 +41,22 @@ KeeperStateMachine::KeeperStateMachine( SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_, - const std::string & superdigest_, - const bool digest_enabled_) + const KeeperContextPtr & keeper_context_, + const std::string & superdigest_) : coordination_settings(coordination_settings_) , snapshot_manager( snapshots_path_, coordination_settings->snapshots_to_keep, + keeper_context_, coordination_settings->compress_snapshots_with_zstd_format, superdigest_, - coordination_settings->dead_session_check_period_ms.totalMilliseconds(), - digest_enabled_) + coordination_settings->dead_session_check_period_ms.totalMilliseconds()) , responses_queue(responses_queue_) , snapshots_queue(snapshots_queue_) , last_committed_idx(0) , log(&Poco::Logger::get("KeeperStateMachine")) , superdigest(superdigest_) - , digest_enabled(digest_enabled_) + , keeper_context(keeper_context_) { } @@ -109,7 +109,7 @@ void KeeperStateMachine::init() if (!storage) storage = std::make_unique( - coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, digest_enabled); + coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context); } namespace @@ -204,7 +204,7 @@ void KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req true /* check_acl */, request_for_session.digest); - if (digest_enabled && request_for_session.digest) + if (keeper_context->digest_enabled && request_for_session.digest) assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false); } @@ -253,10 +253,8 @@ nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, n response_for_session.session_id); } - if (digest_enabled && request_for_session.digest) - { + if (keeper_context->digest_enabled && request_for_session.digest) assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true); - } } ProfileEvents::increment(ProfileEvents::KeeperCommits); diff --git a/src/Coordination/KeeperStateMachine.h b/src/Coordination/KeeperStateMachine.h index c80b35bb704..adcf34c2aba 100644 --- a/src/Coordination/KeeperStateMachine.h +++ b/src/Coordination/KeeperStateMachine.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -24,8 +25,8 @@ public: SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_, - const std::string & superdigest_ = "", - bool digest_enabled_ = true); + const KeeperContextPtr & keeper_context_, + const std::string & superdigest_ = ""); /// Read state from the latest snapshot void init(); @@ -140,7 +141,7 @@ private: /// Special part of ACL system -- superdigest specified in server config. 
const std::string superdigest; - const bool digest_enabled; + KeeperContextPtr keeper_context; }; } diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 3c848baa630..fc55fe65f35 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -228,8 +228,8 @@ void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other) } KeeperStorage::KeeperStorage( - int64_t tick_time_ms, const String & superdigest_, const bool digest_enabled_, const bool initialize_system_nodes) - : session_expiry_queue(tick_time_ms), digest_enabled(digest_enabled_), superdigest(superdigest_) + int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, const bool initialize_system_nodes) + : session_expiry_queue(tick_time_ms), keeper_context(keeper_context_), superdigest(superdigest_) { Node root_node; container.insert("/", root_node); @@ -808,6 +808,9 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr path_created += seq_num_str.str(); } + if (storage.uncommitted_state.getNode(path_created)) + return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; + if (path_created.starts_with(keeper_system_path)) { LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created); @@ -1807,7 +1810,7 @@ KeeperStorageRequestProcessorsFactory::KeeperStorageRequestProcessorsFactory() UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vector & new_deltas) const { - if (!digest_enabled) + if (!keeper_context->digest_enabled) return current_digest; std::unordered_map> updated_nodes; @@ -1901,7 +1904,7 @@ void KeeperStorage::preprocessRequest( TransactionInfo transaction{.zxid = new_last_zxid}; uint64_t new_digest = getNodesDigest(false).value; SCOPE_EXIT({ - if (digest_enabled) + if (keeper_context->digest_enabled) // if the version of digest we got from the leader is the same as the one this instances has, we can simply copy the value // and just check the digest on the commit // a mistake can happen while applying the changes to the uncommitted_state so for now let's just recalculate the digest here also @@ -2101,7 +2104,7 @@ void KeeperStorage::rollbackRequest(int64_t rollback_zxid) KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const { - if (!digest_enabled) + if (!keeper_context->digest_enabled) return {.version = DigestVersion::NO_DIGEST}; if (committed || uncommitted_transactions.empty()) @@ -2112,13 +2115,13 @@ KeeperStorage::Digest KeeperStorage::getNodesDigest(bool committed) const void KeeperStorage::removeDigest(const Node & node, const std::string_view path) { - if (digest_enabled) + if (keeper_context->digest_enabled) nodes_digest -= node.getDigest(path); } void KeeperStorage::addDigest(const Node & node, const std::string_view path) { - if (digest_enabled) + if (keeper_context->digest_enabled) { node.invalidateDigestCache(); nodes_digest += node.getDigest(path); diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index a511086110f..1fbe52cfbea 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -336,13 +337,13 @@ public: Digest getNodesDigest(bool committed) const; - const bool digest_enabled; + KeeperContextPtr keeper_context; const String superdigest; bool initialized{false}; - 
KeeperStorage(int64_t tick_time_ms, const String & superdigest_, bool digest_enabled_, bool initialize_system_nodes = true); + KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, bool initialize_system_nodes = true); void initializeSystemNodes(); diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 30cb455e0f4..4beb40f6efd 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2,6 +2,7 @@ #include #include "Common/ZooKeeper/IKeeper.h" +#include "Coordination/KeeperContext.h" #include "Coordination/KeeperStorage.h" #include "Core/Defines.h" #include "IO/WriteHelpers.h" @@ -63,7 +64,10 @@ struct CompressionParam }; class CoordinationTest : public ::testing::TestWithParam -{}; +{ +protected: + DB::KeeperContextPtr keeper_context = std::make_shared(); +}; TEST_P(CoordinationTest, BuildTest) { @@ -1083,9 +1087,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperStorage storage(500, "", keeper_context); addNode(storage, "/hello", "world", 1); addNode(storage, "/hello/somepath", "somedata", 3); storage.session_id_counter = 5; @@ -1131,9 +1135,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperStorage storage(500, "", keeper_context); storage.getSessionID(130); for (size_t i = 0; i < 50; ++i) @@ -1172,9 +1176,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperStorage storage(500, "", keeper_context); storage.getSessionID(130); for (size_t j = 1; j <= 5; ++j) @@ -1211,8 +1215,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + DB::KeeperStorage storage(500, "", keeper_context); for (size_t i = 0; i < 50; ++i) { addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); @@ -1264,8 +1268,8 @@ TEST_P(CoordinationTest, TestStorageSnapshotBroken) { auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); + DB::KeeperStorage storage(500, "", keeper_context); for (size_t i = 0; i < 50; ++i) { addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i)); @@ -1304,7 
+1308,7 @@ nuraft::ptr getLogEntryFromZKRequest(size_t term, int64_t ses return nuraft::cs_new(term, buffer); } -void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression) +void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint64_t total_logs, bool enable_compression, Coordination::KeeperContextPtr keeper_context) { using namespace Coordination; using namespace DB; @@ -1314,7 +1318,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context); state_machine->init(); DB::KeeperLogStore changelog("./logs", settings->rotate_log_storage_interval, true, enable_compression); changelog.init(state_machine->last_commit_index() + 1, settings->reserved_log_items); @@ -1355,7 +1359,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint } SnapshotsQueue snapshots_queue1{1}; - auto restore_machine = std::make_shared(queue, snapshots_queue1, "./snapshots", settings); + auto restore_machine = std::make_shared(queue, snapshots_queue1, "./snapshots", settings, keeper_context); restore_machine->init(); EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance); @@ -1397,63 +1401,63 @@ TEST_P(CoordinationTest, TestStateMachineAndLogStore) settings->snapshot_distance = 10; settings->reserved_log_items = 10; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 37, params.enable_compression); + testLogAndStateMachine(settings, 37, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 10; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 11, params.enable_compression); + testLogAndStateMachine(settings, 11, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 10; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 40, params.enable_compression); + testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 20; settings->rotate_log_storage_interval = 30; - testLogAndStateMachine(settings, 40, params.enable_compression); + testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 10; settings->reserved_log_items = 0; settings->rotate_log_storage_interval = 10; - testLogAndStateMachine(settings, 40, params.enable_compression); + testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 1; settings->reserved_log_items = 1; settings->rotate_log_storage_interval = 32; - testLogAndStateMachine(settings, 32, params.enable_compression); + testLogAndStateMachine(settings, 32, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); 
settings->snapshot_distance = 10; settings->reserved_log_items = 7; settings->rotate_log_storage_interval = 1; - testLogAndStateMachine(settings, 33, params.enable_compression); + testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 37; settings->reserved_log_items = 1000; settings->rotate_log_storage_interval = 5000; - testLogAndStateMachine(settings, 33, params.enable_compression); + testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context); } { CoordinationSettingsPtr settings = std::make_shared(); settings->snapshot_distance = 37; settings->reserved_log_items = 1000; settings->rotate_log_storage_interval = 5000; - testLogAndStateMachine(settings, 45, params.enable_compression); + testLogAndStateMachine(settings, 45, params.enable_compression, keeper_context); } } @@ -1467,7 +1471,7 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove) ResponsesQueue queue(std::numeric_limits::max()); SnapshotsQueue snapshots_queue{1}; - auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings, keeper_context); state_machine->init(); std::shared_ptr request_c = std::make_shared(); @@ -1634,9 +1638,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) auto params = GetParam(); ChangelogDirTest test("./snapshots"); - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperStorage storage(500, "", keeper_context); addNode(storage, "/hello", "world", 1); addNode(storage, "/hello/somepath", "somedata", 3); storage.session_id_counter = 5; @@ -1652,7 +1656,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) manager.serializeSnapshotBufferToDisk(*buf, 2); EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin" + params.extension)); - DB::KeeperSnapshotManager new_manager("./snapshots", 3, !params.enable_compression); + DB::KeeperSnapshotManager new_manager("./snapshots", 3, keeper_context, !params.enable_compression); auto debuf = new_manager.deserializeSnapshotBufferFromDisk(2); @@ -1786,9 +1790,9 @@ TEST_P(CoordinationTest, TestStorageSnapshotEqual) std::optional snapshot_hash; for (size_t i = 0; i < 15; ++i) { - DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression); + DB::KeeperSnapshotManager manager("./snapshots", 3, keeper_context, params.enable_compression); - DB::KeeperStorage storage(500, "", true); + DB::KeeperStorage storage(500, "", keeper_context); addNode(storage, "/hello", ""); for (size_t j = 0; j < 5000; ++j) { @@ -1859,7 +1863,7 @@ TEST_P(CoordinationTest, TestUncommittedStateBasicCrud) using namespace DB; using namespace Coordination; - DB::KeeperStorage storage{500, "", true}; + DB::KeeperStorage storage{500, "", keeper_context}; constexpr std::string_view path = "/test"; @@ -1976,7 +1980,7 @@ TEST_P(CoordinationTest, TestListRequestTypes) using namespace DB; using namespace Coordination; - KeeperStorage storage{500, "", true}; + KeeperStorage storage{500, "", keeper_context}; int64_t zxid = 0; @@ -2126,7 +2130,7 @@ TEST_P(CoordinationTest, TestDurableState) TEST_P(CoordinationTest, TestCurrentApiVersion) { using namespace Coordination; - KeeperStorage storage{500, "", true}; + KeeperStorage storage{500, 
"", keeper_context}; auto request = std::make_shared(); request->path = DB::keeper_api_version_path; auto responses = storage.processRequest(request, 0, std::nullopt, true, true); From 594195451ee5d75ed1425c6770ed81399a39ac8e Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Thu, 21 Jul 2022 21:50:19 +0200 Subject: [PATCH 151/227] Cleanups --- src/Core/Settings.h | 3 +-- src/Interpreters/InterpreterDeleteQuery.cpp | 5 ++-- src/Interpreters/MutationsInterpreter.cpp | 14 +++++----- src/Interpreters/MutationsInterpreter.h | 2 -- src/Storages/AlterCommands.cpp | 5 ++-- src/Storages/ColumnsDescription.h | 7 +++++ src/Storages/IStorage.h | 2 +- src/Storages/LightweightDeleteDescription.cpp | 9 +++++++ src/Storages/LightweightDeleteDescription.h | 13 ++++++++++ .../MergeTree/DataPartStorageOnDisk.cpp | 3 --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 --- src/Storages/MergeTree/IMergeTreeDataPart.h | 5 ++-- src/Storages/MergeTree/IMergeTreeReader.h | 2 -- src/Storages/MergeTree/MergeTask.cpp | 11 ++++---- .../MergeTreeBaseSelectProcessor.cpp | 18 ++++++------- .../MergeTree/MergeTreeBaseSelectProcessor.h | 2 +- .../MergeTree/MergeTreeBlockReadUtils.cpp | 20 +++++++------- .../MergeTree/MergeTreeBlockReadUtils.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 6 ++--- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- .../MergeTree/MergeTreeRangeReader.cpp | 16 +++--------- src/Storages/MergeTree/MergeTreeReadPool.cpp | 2 +- .../MergeTree/MergeTreeSequentialSource.cpp | 4 ++- src/Storages/MergeTree/MutateTask.cpp | 7 ----- .../MergeTree/StorageFromMergeTreeDataPart.h | 4 +-- .../MergeTree/registerStorageMergeTree.cpp | 3 --- src/Storages/StorageInMemoryMetadata.cpp | 2 -- src/Storages/StorageInMemoryMetadata.h | 2 -- src/Storages/StorageSnapshot.cpp | 26 ++++++++++++++++++- src/Storages/StorageSnapshot.h | 4 +++ src/Storages/TTLDescription.h | 7 ----- tests/performance/lightweight_delete.xml | 3 +-- ...02319_lightweight_delete_on_merge_tree.sql | 3 +-- ...ght_delete_on_merge_tree_compact_parts.sql | 3 +-- .../0_stateless/02352_lightweight_delete.sql | 3 +-- 35 files changed, 120 insertions(+), 104 deletions(-) create mode 100644 src/Storages/LightweightDeleteDescription.cpp create mode 100644 src/Storages/LightweightDeleteDescription.h diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 672b8c5b1fb..4bf476befb2 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -458,8 +458,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ M(Bool, optimize_respect_aliases, true, "If it is set to true, it will respect aliases in WHERE/GROUP BY/ORDER BY, that will help with partition pruning/secondary indexes/optimize_aggregation_in_order/optimize_read_in_order/optimize_trivial_count", 0) \ M(UInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ - M(Bool, allow_experimental_lightweight_delete_with_row_exists, false, "Enable lightweight DELETE mutations using __rows_exists column for mergetree tables. 
Work in progress", 0) \ - M(Bool, lightweight_delete_mutation, true, "Enable to make ordinary ALTER DELETE queries lightweight for mergetree tables", 0) \ + M(Bool, allow_experimental_lightweight_delete, false, "Enable lightweight DELETE mutations for mergetree tables. Work in progress", 0) \ M(Bool, optimize_move_functions_out_of_any, false, "Move functions out of aggregate functions 'any', 'anyLast'.", 0) \ M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index cb4bc363d18..8c8030c6a51 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB @@ -33,7 +34,7 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex BlockIO InterpreterDeleteQuery::execute() { - if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete_with_row_exists) + if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete) { throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it"); } @@ -81,7 +82,7 @@ BlockIO InterpreterDeleteQuery::execute() command->predicate = delete_query.predicate; command->update_assignments = std::make_shared(); auto set_row_does_not_exist = std::make_shared(); - set_row_does_not_exist->column_name = metadata_snapshot->lightweight_delete_description.filter_column.name; + set_row_does_not_exist->column_name = LightweightDeleteDescription::filter_column.name; auto zero_value = std::make_shared(DB::Field(UInt8(0))); set_row_does_not_exist->children.push_back(zero_value); command->update_assignments->children.push_back(set_row_does_not_exist); diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 18f8b493ad6..180e160aca6 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -28,6 +28,8 @@ #include #include #include +#include + namespace DB { @@ -349,7 +351,7 @@ static void validateUpdateColumns( } /// Allow to override value of lightweight delete filter virtual column - if (!found && column_name == metadata_snapshot->lightweight_delete_description.filter_column.name) + if (!found && column_name == LightweightDeleteDescription::filter_column.name) found = true; if (!found) @@ -508,8 +510,8 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) DataTypePtr type; if (auto physical_column = columns_desc.tryGetPhysical(column)) type = physical_column->type; - else if (column == metadata_snapshot->lightweight_delete_description.filter_column.name) - type = metadata_snapshot->lightweight_delete_description.filter_column.type; + else if (column == LightweightDeleteDescription::filter_column.name) + type = LightweightDeleteDescription::filter_column.type; else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown column {}", column); @@ -772,11 +774,11 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); auto all_columns = storage_snapshot->getColumns(options); - // TODO: add _row_exists column if it is present in 
the part??? + /// Add _row_exists column if it is present in the part if (auto part_storage = dynamic_pointer_cast(storage)) { - if (part_storage->hasLightweightDeleteColumn()) - all_columns.push_back({metadata_snapshot->lightweight_delete_description.filter_column}); + if (part_storage->hasLightweightDeletedMask()) + all_columns.push_back({LightweightDeleteDescription::filter_column}); } /// Next, for each stage calculate columns changed by this and previous stages. diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 10f764caaee..94525bf6b8c 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -86,7 +86,6 @@ private: struct Stage; ASTPtr prepareInterpreterSelectQuery(std::vector &prepared_stages, bool dry_run); - QueryPipelineBuilder addStreamsForLaterStages(const std::vector & prepared_stages, QueryPlan & plan) const; std::optional getStorageSortDescriptionIfPossible(const Block & header) const; @@ -100,7 +99,6 @@ private: bool can_execute; SelectQueryOptions select_limits; - /// TODO: is it needed? bool apply_deleted_mask = true; ASTPtr mutation_ast; diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 03053eb9b18..808d634b1ea 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -786,7 +787,7 @@ bool AlterCommand::isRequireMutationStage(const StorageInMemoryMetadata & metada /// Drop alias is metadata alter, in other case mutation is required. if (type == DROP_COLUMN) return metadata.columns.hasColumnOrNested(GetColumnsOptions::AllPhysical, column_name) || - column_name == metadata.lightweight_delete_description.filter_column.name; + column_name == LightweightDeleteDescription::filter_column.name; if (type != MODIFY_COLUMN || data_type == nullptr) return false; @@ -1152,7 +1153,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const { if (all_columns.has(command.column_name) || all_columns.hasNested(command.column_name) || - (command.clear && column_name == metadata.lightweight_delete_description.filter_column.name)) + (command.clear && column_name == LightweightDeleteDescription::filter_column.name)) { if (!command.clear) /// CLEAR column is Ok even if there are dependencies. { diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 209dee885f4..eea5dc7fcbb 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -60,10 +60,17 @@ struct GetColumnsOptions return *this; } + GetColumnsOptions & withSystemColumns(bool value = true) + { + with_system_columns = value; + return *this; + } + Kind kind; bool with_subcolumns = false; bool with_virtuals = false; bool with_extended_objects = false; + bool with_system_columns = false; }; /// Description of a single table column (in CREATE TABLE for example). diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 3647941cc57..0a25a30ec6f 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -235,7 +235,7 @@ public: /// Returns true if the storage supports backup/restore for specific partitions. virtual bool supportsBackupPartition() const { return false; } - /// Return true if there are at least one part containing lightweight deleted mask. + /// Return true if there is at least one part containing lightweight deleted mask. 
 virtual bool hasLightweightDeletedMask() const { return false; }
 
 private:
     StorageID storage_id;
diff --git a/src/Storages/LightweightDeleteDescription.cpp b/src/Storages/LightweightDeleteDescription.cpp
new file mode 100644
index 00000000000..0ffb7766c80
--- /dev/null
+++ b/src/Storages/LightweightDeleteDescription.cpp
@@ -0,0 +1,9 @@
+#include <Storages/LightweightDeleteDescription.h>
+#include <DataTypes/DataTypesNumber.h>
+
+namespace DB
+{
+
+const NameAndTypePair LightweightDeleteDescription::filter_column {"_row_exists", std::make_shared<DataTypeUInt8>()};
+
+}
diff --git a/src/Storages/LightweightDeleteDescription.h b/src/Storages/LightweightDeleteDescription.h
new file mode 100644
index 00000000000..7177b2a5b54
--- /dev/null
+++ b/src/Storages/LightweightDeleteDescription.h
@@ -0,0 +1,13 @@
+#pragma once
+#include <Core/NamesAndTypes.h>
+#include "Storages/TTLDescription.h"
+
+namespace DB
+{
+
+struct LightweightDeleteDescription
+{
+    static const NameAndTypePair filter_column;
+};
+
+}
diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp
index 03d24d84bb0..f3b228a0748 100644
--- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp
+++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp
@@ -368,10 +368,7 @@ void DataPartStorageOnDisk::clearDirectory(
         request.emplace_back(fs::path(dir) / "delete-on-destroy.txt", true);
 
         if (!is_projection)
-        {
             request.emplace_back(fs::path(dir) / "txn_version.txt", true);
-            request.emplace_back(fs::path(dir) / "deleted_rows_mask.bin", true);
-        }
 
         disk->removeSharedFiles(request, !can_remove_shared_data, names_not_to_remove);
         disk->removeDirectory(dir);
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 3f8000f3136..36855fe48bb 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -294,7 +294,6 @@ static void decrementTypeMetric(MergeTreeDataPartType type)
     {
     }
 }
-
 IMergeTreeDataPart::IMergeTreeDataPart(
     const MergeTreeData & storage_,
     const String & name_,
@@ -790,9 +789,6 @@ NameSet IMergeTreeDataPart::getFileNamesWithoutChecksums() const
     if (data_part_storage->exists(TXN_VERSION_METADATA_FILE_NAME))
         result.emplace(TXN_VERSION_METADATA_FILE_NAME);
 
-    if (data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME))
-        result.emplace(DELETED_ROWS_MARK_FILE_NAME);
-
     return result;
 }
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index a9c4590c045..b205da4d4c2 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include <Storages/LightweightDeleteDescription.h>
 #include 
 #include 
 #include 
@@ -404,8 +405,6 @@ public:
 
     static inline constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt";
 
-    static inline constexpr auto DELETED_ROWS_MARK_FILE_NAME = "deleted_rows_mask.bin";
-
     /// One of part files which is used to check how many references (I'd like
     /// to say hardlinks, but it will confuse even more) we have for the part
     /// for zero copy replication. Sadly it's very complex.
@@ -462,7 +461,7 @@ public:
 
     bool supportLightweightDeleteMutate() const;
 
     /// True if there is a lightweight deleted mask file in part.
- bool hasLightweightDelete() const { return data_part_storage->exists(DELETED_ROWS_MARK_FILE_NAME); } + bool hasLightweightDelete() const { return columns.contains(LightweightDeleteDescription::filter_column.name); } protected: diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 41030e522ac..b13db9c3255 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -62,8 +62,6 @@ public: MergeTreeData::DataPartPtr data_part; - bool needReadDeletedMask() { return settings.apply_deleted_mask && data_part->hasLightweightDelete(); } - protected: /// Returns actual column type in part, which can differ from table metadata. NameAndTypePair getColumnFromPart(const NameAndTypePair & required_column) const; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 65c9523f861..20992ff2a99 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -814,10 +815,10 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() auto columns = global_ctx->merging_column_names; /// The part might have some rows masked by lightweight deletes - const auto lwd_filter_column = global_ctx->metadata_snapshot->lightweight_delete_description.filter_column.name; - const bool need_to_filter_deleted_rows = !lwd_filter_column.empty() && part->getColumns().contains(lwd_filter_column); + const auto lightweight_delete_filter_column = LightweightDeleteDescription::filter_column.name; + const bool need_to_filter_deleted_rows = part->hasLightweightDelete(); if (need_to_filter_deleted_rows) - columns.emplace_back(lwd_filter_column); + columns.emplace_back(lightweight_delete_filter_column); auto input = std::make_unique( *global_ctx->data, global_ctx->storage_snapshot, part, columns, ctx->read_with_direct_io, true); @@ -827,9 +828,9 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() /// Add filtering step that discards deleted rows if (need_to_filter_deleted_rows) { - pipe.addSimpleTransform([lwd_filter_column](const Block & header) + pipe.addSimpleTransform([lightweight_delete_filter_column](const Block & header) { - return std::make_shared(header, nullptr, lwd_filter_column, true); + return std::make_shared(header, nullptr, lightweight_delete_filter_column, true); }); } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 4cae54492c8..00b9959739f 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -61,7 +61,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( { non_const_virtual_column_names.emplace_back(*it); } - else if (*it == "_row_exists") + else if (*it == LightweightDeleteDescription::filter_column.name) { non_const_virtual_column_names.emplace_back(*it); } @@ -242,10 +242,9 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart( pre_reader_for_step.clear(); /// Add lightweight delete filtering step - const auto & lightweigth_delete_info = metadata_snapshot->lightweight_delete_description; - if (reader_settings.apply_deleted_mask && data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) + if (reader_settings.apply_deleted_mask && data_part->hasLightweightDelete()) { - 
pre_reader_for_step.push_back(data_part->getReader({lightweigth_delete_info.filter_column}, metadata_snapshot, mark_ranges, + pre_reader_for_step.push_back(data_part->getReader({LightweightDeleteDescription::filter_column}, metadata_snapshot, mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, value_size_map, profile_callback)); } @@ -268,11 +267,10 @@ void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & cu size_t pre_readers_shift = 0; /// Add filtering step with lightweight delete mask - const auto & lightweigth_delete_info = storage_snapshot->metadata->lightweight_delete_description; - if (reader_settings.apply_deleted_mask && current_task.data_part->getColumns().contains(lightweigth_delete_info.filter_column.name)) + if (reader_settings.apply_deleted_mask && current_task.data_part->hasLightweightDelete()) { current_task.pre_range_readers.push_back( - MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lwd_filter_step, last_reader, non_const_virtual_column_names)); + MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names)); prev_reader = ¤t_task.pre_range_readers.back(); pre_readers_shift++; } @@ -471,14 +469,14 @@ static void injectNonConstVirtualColumns( } } - if (virtual_column_name == "_row_exists") + if (virtual_column_name == LightweightDeleteDescription::filter_column.name) { /// If _row_exists column isn't present in the part then fill it here with 1s ColumnPtr column; if (rows) - column = DataTypeUInt8().createColumnConst(rows, 1)->convertToFullColumnIfConst(); + column = LightweightDeleteDescription::filter_column.type->createColumnConst(rows, 1)->convertToFullColumnIfConst(); else - column = DataTypeUInt8().createColumn(); + column = LightweightDeleteDescription::filter_column.type->createColumn(); inserter.insertUInt8Column(column, virtual_column_name); } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 51805fa83a2..1a04c2ef25f 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -103,7 +103,7 @@ protected: StorageSnapshotPtr storage_snapshot; /// This step is added when the part has lightweight delete mask - const PrewhereExprStep lwd_filter_step { nullptr, "_row_exists", true, true }; + const PrewhereExprStep lightweight_delete_filter_step { nullptr, LightweightDeleteDescription::filter_column.name, true, true }; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index ad208f6b041..56007c5c5e7 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -101,16 +101,16 @@ NameSet injectRequiredColumns( if (!part->isProjectionPart()) alter_conversions = storage.getAlterConversionsForPart(part); - auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical) + .withExtendedObjects() + .withSystemColumns(); if (with_subcolumns) options.withSubcolumns(); for (size_t i = 0; i < columns.size(); ++i) { - /// We are going to fetch only physical columns - const bool is_real_column = storage_snapshot->tryGetColumn(options, columns[i]).has_value(); - const bool is_virtual_column = 
storage.isVirtualColumn(columns[i], storage_snapshot->getMetadataForQuery()); - if (!is_real_column && !is_virtual_column) + /// We are going to fetch only physical columns and system columns + if (!storage_snapshot->tryGetColumn(options, columns[i])) throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no physical column or subcolumn {} in table", columns[i]); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( @@ -274,15 +274,15 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, - const Names & non_const_virtual_columns, + const Names & system_columns, const PrewhereInfoPtr & prewhere_info, bool with_subcolumns) { Names column_names = required_columns; Names pre_column_names; - /// read non-const virtual column from data if it exists - for (const auto & name : non_const_virtual_columns) + /// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part + for (const auto & name : system_columns) { if (data_part->getColumns().contains(name)) column_names.push_back(name); @@ -293,7 +293,9 @@ MergeTreeReadTaskColumns getReadTaskColumns( storage, storage_snapshot, data_part, with_subcolumns, column_names); MergeTreeReadTaskColumns result; - auto options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); + auto options = GetColumnsOptions(GetColumnsOptions::All) + .withExtendedObjects() + .withSystemColumns(); if (with_subcolumns) options.withSubcolumns(); diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index d6277167555..5a36955b4d3 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -79,7 +79,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, - const Names & non_const_virtual_columns, + const Names & system_columns, const PrewhereInfoPtr & prewhere_info, bool with_subcolumns); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 0223561cdb6..bd52ca3dfd7 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1077,7 +1077,7 @@ void MergeTreeData::loadDataPartsFromDisk( has_adaptive_parts.store(true, std::memory_order_relaxed); /// Check if there is lightweight delete in part - if (part->hasLightweightDelete() || part->getColumns().contains("_row_exists")) // TODO: implement properly + if (part->hasLightweightDelete()) has_lightweight_in_parts.store(true, std::memory_order_relaxed); part->modification_time = part_disk_ptr->getLastModified(fs::path(relative_data_path) / part_name).epochTime(); @@ -2872,7 +2872,7 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( throw Exception("MergeTreeData::Transaction for one table cannot be used with another. 
It is a bug.", ErrorCodes::LOGICAL_ERROR); - if (part->hasLightweightDelete() || part->getColumns().contains("_row_exists")) // TODO: implement properly + if (part->hasLightweightDelete()) has_lightweight_delete_parts.store(true); checkPartCanBeAddedToTable(part, lock); @@ -6562,7 +6562,7 @@ NamesAndTypesList MergeTreeData::getVirtuals() const NameAndTypePair("_partition_value", getPartitionValueType()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_part_offset", std::make_shared()), - NameAndTypePair("_row_exists", std::make_shared()), + LightweightDeleteDescription::filter_column, }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index c78c187db8f..e205bb581d4 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1214,7 +1214,7 @@ static void selectColumnNames( { virt_column_names.push_back(name); } - else if (name == "_row_exists") + else if (name == LightweightDeleteDescription::filter_column.name) { virt_column_names.push_back(name); } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index cb2ead8a025..acc90fe7313 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -656,10 +656,7 @@ MergeTreeRangeReader::MergeTreeRangeReader( , prewhere_info(prewhere_info_) , last_reader_in_chain(last_reader_in_chain_) , is_initialized(true) -// , non_const_virtual_column_names() { - - if (prev_reader) sample_block = prev_reader->getSampleBlock(); @@ -675,9 +672,6 @@ MergeTreeRangeReader::MergeTreeRangeReader( if (column_name == "_part_offset") sample_block.insert(ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)); - -// if (column_name == "_row_exists") -// sample_block.insert(ColumnWithTypeAndName(ColumnUInt8::create(), std::make_shared(), column_name)); } if (prewhere_info) @@ -862,11 +856,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar if (read_result.num_rows) { /// Physical columns go first and then some virtual columns follow + /// TODO: is there a better way to account for virtual columns that were filled by previous readers? 
size_t physical_columns_count = read_result.columns.size() - read_result.extra_columns_filled.size(); -/////////// -// TODO: properly account for "virtual columns" that are overridden with real data in the part - -///////////// Columns physical_columns(read_result.columns.begin(), read_result.columns.begin() + physical_columns_count); bool should_evaluate_missing_defaults; @@ -1159,10 +1150,9 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r block.insert({result.columns[pos], std::make_shared(), column_name}); } - else if (column_name == "_row_exists") + else if (column_name == LightweightDeleteDescription::filter_column.name) { - /// do nothing, it will be added later - /// TODO: properly implement reading non-const virtual columns or filling them with default values + /// Do nothing, it will be added later } else throw Exception("Unexpected non-const virtual column: " + column_name, ErrorCodes::LOGICAL_ERROR); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index 9bcc6535abb..cc2c20eda5a 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -215,7 +215,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & auto task_columns = getReadTaskColumns( data, storage_snapshot, part.data_part, - column_names, virtual_column_names /*TODO: fill non-const virtual columns*/, prewhere_info, /*with_subcolumns=*/ true); + column_names, virtual_column_names, prewhere_info, /*with_subcolumns=*/ true); auto size_predictor = !predict_block_size_bytes ? nullptr : MergeTreeBaseSelectProcessor::getSizePredictor(part.data_part, task_columns, sample_block); diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index de48b96edd6..c86acf11547 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -46,7 +46,9 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { - auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withVirtuals(); /// TODO: only add _rows_exists column (if it's present on disk) + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical) + .withExtendedObjects() + .withSystemColumns(); columns_for_reader = storage_snapshot->getColumnsByNames(options, columns_to_read); } else diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 0e61f499202..cb041775ca6 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -453,10 +453,6 @@ NameSet collectFilesToSkip( { NameSet files_to_skip = source_part->getFileNamesWithoutChecksums(); - /// Remove deleted rows mask file name to create hard link for it when mutate some columns. 
- if (files_to_skip.contains(IMergeTreeDataPart::DELETED_ROWS_MARK_FILE_NAME)) - files_to_skip.erase(IMergeTreeDataPart::DELETED_ROWS_MARK_FILE_NAME); - /// Skip updated files for (const auto & entry : updated_header) { @@ -650,8 +646,6 @@ struct MutationContext MergeTreeData::DataPartPtr source_part; StoragePtr storage_from_source_part; - bool is_lightweight_mutation{false}; - StorageMetadataPtr metadata_snapshot; MutationCommandsConstPtr commands; @@ -1526,7 +1520,6 @@ bool MutateTask::prepare() ctx->new_data_part->setColumns(new_columns); ctx->new_data_part->setSerializationInfos(new_infos); - ctx->new_data_part->partition.assign(ctx->source_part->partition); /// Don't change granularity type while mutating subset of columns diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index b2f62c2bf02..9249306293d 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -106,9 +106,9 @@ public: return parts.front()->storage.getSettings()->materialize_ttl_recalculate_only; } - bool hasLightweightDeleteColumn() const + bool hasLightweightDeletedMask() const override { - return parts.front()->getColumns().contains("_row_exists"); // TODO: fix hardcoded column name + return parts.front()->hasLightweightDelete(); } private: diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index beeb980a786..e52a0fed674 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -10,7 +10,6 @@ #include #include #include -#include "DataTypes/DataTypesNumber.h" #include #include @@ -678,8 +677,6 @@ static StoragePtr create(const StorageFactory::Arguments & args) if (arg_num != arg_cnt) throw Exception("Wrong number of engine arguments.", ErrorCodes::BAD_ARGUMENTS); - metadata.lightweight_delete_description.filter_column = { "_row_exists", std::make_shared() }; - if (replicated) { auto storage_policy = args.getContext()->getStoragePolicy(storage_settings->storage_policy); diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index de12467bdec..66dcc938aef 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -38,7 +38,6 @@ StorageInMemoryMetadata::StorageInMemoryMetadata(const StorageInMemoryMetadata & , sampling_key(other.sampling_key) , column_ttls_by_name(other.column_ttls_by_name) , table_ttl(other.table_ttl) - , lightweight_delete_description(other.lightweight_delete_description) , settings_changes(other.settings_changes ? 
other.settings_changes->clone() : nullptr) , select(other.select) , comment(other.comment) @@ -64,7 +63,6 @@ StorageInMemoryMetadata & StorageInMemoryMetadata::operator=(const StorageInMemo sampling_key = other.sampling_key; column_ttls_by_name = other.column_ttls_by_name; table_ttl = other.table_ttl; - lightweight_delete_description = other.lightweight_delete_description; if (other.settings_changes) settings_changes = other.settings_changes->clone(); else diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h index 84a3bcb3046..a9ab96909f4 100644 --- a/src/Storages/StorageInMemoryMetadata.h +++ b/src/Storages/StorageInMemoryMetadata.h @@ -43,8 +43,6 @@ struct StorageInMemoryMetadata TTLColumnsDescription column_ttls_by_name; /// TTL expressions for table (Move and Rows) TTLTableDescription table_ttl; - /// Lightweight delete filter column if the storage supports it. - LightweightDeleteDescription lightweight_delete_description; /// SETTINGS expression. Supported for MergeTree, Buffer, Kafka, RabbitMQ. ASTPtr settings_changes; /// SELECT QUERY. Supported for MaterializedView and View (have to support LiveView). diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index b47623db50b..5f9857b28ef 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -19,6 +20,9 @@ void StorageSnapshot::init() { for (const auto & [name, type] : storage.getVirtuals()) virtual_columns[name] = type; + + if (storage.hasLightweightDeletedMask()) + system_columns[LightweightDeleteDescription::filter_column.name] = LightweightDeleteDescription::filter_column.type; } NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) const @@ -28,13 +32,13 @@ NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) if (options.with_extended_objects) extendObjectColumns(all_columns, object_columns, options.with_subcolumns); + NameSet column_names; if (options.with_virtuals) { /// Virtual columns must be appended after ordinary, /// because user can override them. if (!virtual_columns.empty()) { - NameSet column_names; for (const auto & column : all_columns) column_names.insert(column.name); @@ -44,6 +48,19 @@ NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) } } + if (options.with_system_columns) + { + if (!system_columns.empty() && column_names.empty()) + { + for (const auto & column : all_columns) + column_names.insert(column.name); + } + + for (const auto & [name, type] : system_columns) + if (!column_names.contains(name)) + all_columns.emplace_back(name, type); + } + return all_columns; } @@ -76,6 +93,13 @@ std::optional StorageSnapshot::tryGetColumn(const GetColumnsOpt return NameAndTypePair(column_name, it->second); } + if (options.with_system_columns) + { + auto it = system_columns.find(column_name); + if (it != system_columns.end()) + return NameAndTypePair(column_name, it->second); + } + return {}; } diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h index 6dad82dffd2..badf0d3a1e8 100644 --- a/src/Storages/StorageSnapshot.h +++ b/src/Storages/StorageSnapshot.h @@ -85,6 +85,10 @@ private: void init(); std::unordered_map virtual_columns; + + /// System columns are not visible in the schema but might be persisted in the data. + /// One example of such column is lightweight delete mask '_row_exists'. 
+ std::unordered_map system_columns; }; using StorageSnapshotPtr = std::shared_ptr; diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index 5170b7d326c..8f60eb604b5 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -6,8 +6,6 @@ #include #include #include -#include "Core/NamesAndTypes.h" -#include "DataTypes/Serializations/ISerialization.h" namespace DB { @@ -129,9 +127,4 @@ struct TTLTableDescription static TTLTableDescription parse(const String & str, const ColumnsDescription & columns, ContextPtr context, const KeyDescription & primary_key); }; -struct LightweightDeleteDescription -{ - NameAndTypePair filter_column; -}; - } diff --git a/tests/performance/lightweight_delete.xml b/tests/performance/lightweight_delete.xml index af7103f02d0..b29684f177f 100644 --- a/tests/performance/lightweight_delete.xml +++ b/tests/performance/lightweight_delete.xml @@ -19,8 +19,7 @@ 1 1 - 0 - 1 + 1 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index 6f78e1fe464..a0cdec12157 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -5,8 +5,7 @@ CREATE TABLE merge_table_standard_delete(id Int32, name String) ENGINE = MergeTr INSERT INTO merge_table_standard_delete select number, toString(number) from numbers(100); SET mutations_sync = 1; ---SET allow_experimental_lightweight_delete = 0; -SET allow_experimental_lightweight_delete_with_row_exists = 1; +SET allow_experimental_lightweight_delete = 1; DELETE FROM merge_table_standard_delete WHERE id = 10; diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql index f47560ba95b..c032c5bb9d1 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree_compact_parts.sql @@ -7,8 +7,7 @@ INSERT INTO merge_table_standard_delete select number, toString(number) from num SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 'merge_table_standard_delete' AND active GROUP BY part_type ORDER BY part_type; SET mutations_sync = 1; ---SET allow_experimental_lightweight_delete = 0; -SET allow_experimental_lightweight_delete_with_row_exists = 1; +SET allow_experimental_lightweight_delete = 1; DELETE FROM merge_table_standard_delete WHERE id = 10; SELECT COUNT(), part_type FROM system.parts WHERE database = currentDatabase() AND table = 'merge_table_standard_delete' AND active GROUP BY part_type ORDER BY part_type; diff --git a/tests/queries/0_stateless/02352_lightweight_delete.sql b/tests/queries/0_stateless/02352_lightweight_delete.sql index 4468a25448c..4d0d53bb997 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete.sql @@ -5,8 +5,7 @@ CREATE TABLE lwd_test (id UInt64 , value String) ENGINE MergeTree() ORDER BY id; INSERT INTO lwd_test SELECT number, randomString(10) FROM system.numbers LIMIT 1000000; SET mutations_sync = 1; -SET allow_experimental_lightweight_delete_with_row_exists = 1; ---SET allow_experimental_lightweight_delete = 0; +SET allow_experimental_lightweight_delete = 1; SELECT 'Rows in parts', SUM(rows) FROM system.parts WHERE database = currentDatabase() AND table 
= 'lwd_test' AND active; SELECT 'Count', count() FROM lwd_test; From be64b45583dc9c97dd63cb09cd0a51a003f3c0ba Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Sat, 23 Jul 2022 01:14:03 +0200 Subject: [PATCH 152/227] Properly handle empty parts list --- .../MergeTree/StorageFromMergeTreeDataPart.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index 9249306293d..79535d2b4ff 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -9,11 +9,17 @@ #include #include #include +#include namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + /// A Storage that allows reading from a single MergeTree data part. class StorageFromMergeTreeDataPart final : public IStorage { @@ -103,19 +109,21 @@ public: bool materializeTTLRecalculateOnly() const { + if (parts.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "parts must not be empty for materializeTTLRecalculateOnly"); return parts.front()->storage.getSettings()->materialize_ttl_recalculate_only; } bool hasLightweightDeletedMask() const override { - return parts.front()->hasLightweightDelete(); + return !parts.empty() && parts.front()->hasLightweightDelete(); } private: - MergeTreeData::DataPartsVector parts; + const MergeTreeData::DataPartsVector parts; const MergeTreeData & storage; - String partition_id; - MergeTreeDataSelectAnalysisResultPtr analysis_result_ptr; + const String partition_id; + const MergeTreeDataSelectAnalysisResultPtr analysis_result_ptr; static StorageID getIDFromPart(const MergeTreeData::DataPartPtr & part_) { From c8b3c574a435e31b9f32df13a4e9f46e7c8f33e0 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Sat, 23 Jul 2022 13:55:43 +0200 Subject: [PATCH 153/227] Disable lightweight delete if table has projections --- src/Interpreters/MutationsInterpreter.cpp | 4 ++++ src/Storages/IStorage.h | 3 +++ src/Storages/MergeTree/IMergeTreeDataPart.cpp | 3 ++- src/Storages/MergeTree/MergeTreeData.cpp | 11 +++++++++++ src/Storages/MergeTree/MergeTreeData.h | 2 ++ .../MergeTree/StorageFromMergeTreeDataPart.h | 5 +++++ ...2319_lightweight_delete_on_merge_tree.reference | 2 ++ .../02319_lightweight_delete_on_merge_tree.sql | 14 ++++++++++++++ 8 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 180e160aca6..64ea313a1af 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -352,7 +352,11 @@ static void validateUpdateColumns( /// Allow to override value of lightweight delete filter virtual column if (!found && column_name == LightweightDeleteDescription::filter_column.name) + { + if (!storage->supportsLightweightDelete()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Lightweight delete is not supported for table"); found = true; + } if (!found) { diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 0a25a30ec6f..03e10f98b49 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -238,6 +238,9 @@ public: /// Return true if there is at least one part containing lightweight deleted mask. virtual bool hasLightweightDeletedMask() const { return false; } + /// Return true if storage can execute lightweight delete mutations. 
+ virtual bool supportsLightweightDelete() const { return false; } + private: StorageID storage_id; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 36855fe48bb..f6b6d7bccdb 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1210,7 +1210,8 @@ void IMergeTreeDataPart::loadColumns(bool require) /// Project part / part with project parts / compact part doesn't support LWD. bool IMergeTreeDataPart::supportLightweightDeleteMutate() const { - return part_type == MergeTreeDataPartType::Wide && parent_part == nullptr && projection_parts.empty(); + return (part_type == MergeTreeDataPartType::Wide || part_type == MergeTreeDataPartType::Compact) && + parent_part == nullptr && projection_parts.empty(); } void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) const diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index bd52ca3dfd7..d0f1fe75560 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4496,6 +4496,17 @@ MergeTreeData::DataPartsVector MergeTreeData::getAllDataPartsVector(MergeTreeDat return res; } +bool MergeTreeData::supportsLightweightDelete() const +{ + auto lock = lockParts(); + for (const auto & part : data_parts_by_info) + { + if (!part->supportLightweightDeleteMutate()) + return false; + } + return true; +} + MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const { ProjectionPartsVector res; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index e18a3b20b74..ed5f6cc8b74 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -440,6 +440,8 @@ public: bool supportsDynamicSubcolumns() const override { return true; } + bool supportsLightweightDelete() const override; + NamesAndTypesList getVirtuals() const override; bool mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, ContextPtr, const StorageMetadataPtr & metadata_snapshot) const override; diff --git a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h index 79535d2b4ff..fd313a10bc8 100644 --- a/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h +++ b/src/Storages/MergeTree/StorageFromMergeTreeDataPart.h @@ -119,6 +119,11 @@ public: return !parts.empty() && parts.front()->hasLightweightDelete(); } + bool supportsLightweightDelete() const override + { + return !parts.empty() && parts.front()->supportLightweightDeleteMutate(); + } + private: const MergeTreeData::DataPartsVector parts; const MergeTreeData & storage; diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference index 31960e2ecea..fec37f4306e 100644 --- a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.reference @@ -37,3 +37,5 @@ t_light 4 4_5_5_1_10 1 1 1000 -2 1005 -2 +----Test lighweight delete is disabled if table has projections----- +500.5 500.5 1000 diff --git a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql index a0cdec12157..67513a1cdff 100644 --- 
a/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql +++ b/tests/queries/0_stateless/02319_lightweight_delete_on_merge_tree.sql @@ -94,3 +94,17 @@ CHECK TABLE t_large; SELECT * FROM t_large WHERE a in (1,1000,1005,50000) order by a; DROP TABLE t_large; + +SELECT '----Test lighweight delete is disabled if table has projections-----'; + +CREATE TABLE t_proj(a UInt32, b int) ENGINE=MergeTree order BY a settings min_bytes_for_wide_part=0; + +ALTER TABLE t_proj ADD PROJECTION p_1 (SELECT avg(a), avg(b), count()); + +INSERT INTO t_proj SELECT number + 1, number + 1 FROM numbers(1000); + +DELETE FROM t_proj WHERE a < 100; -- { serverError NOT_IMPLEMENTED } + +SELECT avg(a), avg(b), count() FROM t_proj; + +DROP TABLE t_proj; From 66927701139641e8826e4f10c2f2b278346238fb Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Sat, 23 Jul 2022 16:42:21 +0200 Subject: [PATCH 154/227] Allow to update "system" virtual columns such as _row_exists --- src/Storages/MergeTree/MutateTask.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index cb041775ca6..59c784208c5 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -170,14 +170,15 @@ getColumnsForNewDataPart( NameToNameMap renamed_columns_to_from; NameToNameMap renamed_columns_from_to; ColumnsDescription part_columns(source_part->getColumns()); - const auto all_virtual_columns = source_part->storage.getVirtuals(); + NamesAndTypesList system_columns; + if (source_part->supportLightweightDeleteMutate()) + system_columns.push_back(LightweightDeleteDescription::filter_column); - /// Preserve virtual columns that have persisted values in the source_part -/// TODO: only allow LWD mask to be overridden! - for (const auto & virtual_column : all_virtual_columns) + /// Preserve system columns that have persisted values in the source_part + for (const auto & column : system_columns) { - if (part_columns.has(virtual_column.name) && !storage_columns.contains(virtual_column.name)) - storage_columns.emplace_back(virtual_column); + if (part_columns.has(column.name) && !storage_columns.contains(column.name)) + storage_columns.emplace_back(column); } /// All commands are validated in AlterCommand so we don't care about order @@ -187,11 +188,10 @@ getColumnsForNewDataPart( { for (const auto & [column_name, _] : command.column_to_update_expression) { - /// Allow to update and persist values of virtual column -/// TODO: only allow LWD mask to be overridden! 
- auto virtual_column = all_virtual_columns.tryGetByName(column_name); - if (virtual_column && !storage_columns.contains(column_name)) - storage_columns.emplace_back(column_name, virtual_column->type); + /// Allow to update and persist values of system column + auto column = system_columns.tryGetByName(column_name); + if (column && !storage_columns.contains(column_name)) + storage_columns.emplace_back(column_name, column->type); } } From 2785783defbf63ef25ed7b214baa30ce6da84e04 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 20 Jul 2022 09:05:43 +0000 Subject: [PATCH 155/227] Mark DEFLAPT_QPL as 'experimental' codec + cosmetics --- .../sql-reference/statements/create/table.md | 8 ++-- .../CompressionCodecDeflateQpl.cpp | 13 +++--- src/Compression/CompressionCodecDeflateQpl.h | 43 ++++++++----------- .../02372_qpl_is_experimental.reference | 0 .../0_stateless/02372_qpl_is_experimental.sql | 9 ++++ 5 files changed, 39 insertions(+), 34 deletions(-) create mode 100644 tests/queries/0_stateless/02372_qpl_is_experimental.reference create mode 100644 tests/queries/0_stateless/02372_qpl_is_experimental.sql diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 0c2e87fbcac..0e033456998 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -250,10 +250,12 @@ High compression levels are useful for asymmetric scenarios, like compress once, #### DEFLATE_QPL -`DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library, which has dependency on Intel Hardware: +`DEFLATE_QPL` — [Deflate compression algorithm](https://github.com/intel/qpl) implemented by Intel® Query Processing Library. Some limitations apply: -- DEFLATE_QPL is only supported on systems with AVX2/AVX512/IAA. -- DEFLATE_QPL-compressed data can only be transferred between nodes with AVX2/AVX512/IAA. +- DEFLATE_QPL is experimental and can only be used after setting configuration parameter `allow_experimental_codecs=1`. 
+- DEFLATE_QPL only works if ClickHouse was compiled with support for AVX2 or AVX512 instructions +- DEFLATE_QPL works best if the system has a Intel® IAA (In-Memory Analytics Accelerator) offloading device +- DEFLATE_QPL-compressed data can only be transferred between ClickHouse nodes compiled with support for AVX2/AVX512 ### Specialized Codecs diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index 81ec7ee5dca..9e165a9c913 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -28,8 +28,8 @@ DeflateQplJobHWPool & DeflateQplJobHWPool::instance() } DeflateQplJobHWPool::DeflateQplJobHWPool() - :random_engine(std::random_device()()) - ,distribution(0, MAX_HW_JOB_NUMBER-1) + : random_engine(std::random_device()()) + , distribution(0, MAX_HW_JOB_NUMBER - 1) { Poco::Logger * log = &Poco::Logger::get("DeflateQplJobHWPool"); UInt32 job_size = 0; @@ -73,7 +73,7 @@ DeflateQplJobHWPool::~DeflateQplJobHWPool() job_pool_ready = false; } -qpl_job * DeflateQplJobHWPool::acquireJob(UInt32 &job_id) +qpl_job * DeflateQplJobHWPool::acquireJob(UInt32 & job_id) { if (isJobPoolReady()) { @@ -141,7 +141,7 @@ HardwareCodecDeflateQpl::~HardwareCodecDeflateQpl() Int32 HardwareCodecDeflateQpl::doCompressData(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) const { UInt32 job_id = 0; - qpl_job* job_ptr = nullptr; + qpl_job * job_ptr = nullptr; UInt32 compressed_size = 0; if (!(job_ptr = DeflateQplJobHWPool::instance().acquireJob(job_id))) { @@ -330,10 +330,9 @@ void SoftwareCodecDeflateQpl::doDecompressData(const char * source, UInt32 sourc "Execution of DeflateQpl software fallback codec failed. (Details: qpl_execute_job with error code: {} - please refer to qpl_status in ./contrib/qpl/include/qpl/c_api/status.h)", status); } -//CompressionCodecDeflateQpl CompressionCodecDeflateQpl::CompressionCodecDeflateQpl() - :hw_codec(std::make_unique()) - ,sw_codec(std::make_unique()) + : hw_codec(std::make_unique()) + , sw_codec(std::make_unique()) { setCodecDescription("DEFLATE_QPL"); } diff --git a/src/Compression/CompressionCodecDeflateQpl.h b/src/Compression/CompressionCodecDeflateQpl.h index c15f537fd3f..1a13a1ca42c 100644 --- a/src/Compression/CompressionCodecDeflateQpl.h +++ b/src/Compression/CompressionCodecDeflateQpl.h @@ -1,8 +1,9 @@ #pragma once #include -#include +#include #include +#include namespace Poco { @@ -18,20 +19,16 @@ class DeflateQplJobHWPool { public: DeflateQplJobHWPool(); - ~DeflateQplJobHWPool(); - qpl_job * acquireJob(UInt32 &job_id); - - static void releaseJob(UInt32 job_id); - - static const bool & isJobPoolReady() { return job_pool_ready; } - static DeflateQplJobHWPool & instance(); + qpl_job * acquireJob(UInt32 & job_id); + static void releaseJob(UInt32 job_id); + static const bool & isJobPoolReady() { return job_pool_ready; } + private: static bool tryLockJob(UInt32 index); - static void unLockJob(UInt32 index); /// Maximum jobs running in parallel supported by IAA hardware @@ -39,9 +36,9 @@ private: /// Entire buffer for storing all job objects static std::unique_ptr hw_jobs_buffer; /// Job pool for storing all job object pointers - static std::array hw_job_ptr_pool; + static std::array hw_job_ptr_pool; /// Locks for accessing each job object pointers - static std::array hw_job_ptr_locks; + static std::array hw_job_ptr_locks; static bool job_pool_ready; std::mt19937 random_engine; std::uniform_int_distribution distribution; @@ -57,23 +54,25 @@ public: 
private: qpl_job * sw_job = nullptr; std::unique_ptr sw_buffer; + qpl_job * getJobCodecPtr(); }; class HardwareCodecDeflateQpl { public: - /// RET_ERROR stands for hardware codec fail,need fallback to software codec. + /// RET_ERROR stands for hardware codec fail, needs fallback to software codec. static constexpr Int32 RET_ERROR = -1; HardwareCodecDeflateQpl(); ~HardwareCodecDeflateQpl(); + Int32 doCompressData(const char * source, UInt32 source_size, char * dest, UInt32 dest_size) const; - ///Submit job request to the IAA hardware and then busy waiting till it complete. + /// Submit job request to the IAA hardware and then busy waiting till it complete. Int32 doDecompressDataSynchronous(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size); - ///Submit job request to the IAA hardware and return immediately. IAA hardware will process decompression jobs automatically. + /// Submit job request to the IAA hardware and return immediately. IAA hardware will process decompression jobs automatically. Int32 doDecompressDataAsynchronous(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size); /// Flush result for all previous requests which means busy waiting till all the jobs in "decomp_async_job_map" are finished. @@ -96,23 +95,19 @@ public: void updateHash(SipHash & hash) const override; protected: - bool isCompression() const override - { - return true; - } - - bool isGenericCompression() const override - { - return true; - } + bool isCompression() const override { return true; } + bool isGenericCompression() const override { return true; } + bool isExperimental() const override { return true; } UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; - ///Flush result for previous asynchronous decompression requests on asynchronous mode. + + /// Flush result for previous asynchronous decompression requests on asynchronous mode. void flushAsynchronousDecompressRequests() override; private: UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; + std::unique_ptr hw_codec; std::unique_ptr sw_codec; }; diff --git a/tests/queries/0_stateless/02372_qpl_is_experimental.reference b/tests/queries/0_stateless/02372_qpl_is_experimental.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02372_qpl_is_experimental.sql b/tests/queries/0_stateless/02372_qpl_is_experimental.sql new file mode 100644 index 00000000000..c2bebbd8e9c --- /dev/null +++ b/tests/queries/0_stateless/02372_qpl_is_experimental.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS qpl_codec; + +CREATE TABLE qpl_codec (id Int32 CODEC(DEFLATE_QPL)) ENGINE = MergeTree() ORDER BY id; -- { serverError 36 } + +SET allow_experimental_codecs = 1; +CREATE TABLE qpl_codec (id Int32 CODEC(DEFLATE_QPL)) ENGINE = MergeTree() ORDER BY id; + +DROP TABLE IF EXISTS qpl_codec; + From b1da2eb9e63ad521f84244e237c33376a93879b6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 24 Jul 2022 20:42:38 +0000 Subject: [PATCH 156/227] Remove tests again ... They broke because QPL is compiled only when AVX2/512 is on. This is not the case in CI. 
--- .../0_stateless/02372_qpl_is_experimental.reference | 0 tests/queries/0_stateless/02372_qpl_is_experimental.sql | 9 --------- 2 files changed, 9 deletions(-) delete mode 100644 tests/queries/0_stateless/02372_qpl_is_experimental.reference delete mode 100644 tests/queries/0_stateless/02372_qpl_is_experimental.sql diff --git a/tests/queries/0_stateless/02372_qpl_is_experimental.reference b/tests/queries/0_stateless/02372_qpl_is_experimental.reference deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/queries/0_stateless/02372_qpl_is_experimental.sql b/tests/queries/0_stateless/02372_qpl_is_experimental.sql deleted file mode 100644 index c2bebbd8e9c..00000000000 --- a/tests/queries/0_stateless/02372_qpl_is_experimental.sql +++ /dev/null @@ -1,9 +0,0 @@ -DROP TABLE IF EXISTS qpl_codec; - -CREATE TABLE qpl_codec (id Int32 CODEC(DEFLATE_QPL)) ENGINE = MergeTree() ORDER BY id; -- { serverError 36 } - -SET allow_experimental_codecs = 1; -CREATE TABLE qpl_codec (id Int32 CODEC(DEFLATE_QPL)) ENGINE = MergeTree() ORDER BY id; - -DROP TABLE IF EXISTS qpl_codec; - From dc96a6270d557d4c2bb65a473fffe50b36582576 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 25 Jul 2022 10:27:33 +0000 Subject: [PATCH 157/227] Remove optimize_memory_usage flag Memory usage with DistinctSortedTransform fixed in #39538 --- src/Core/Settings.h | 1 - src/Interpreters/InterpreterSelectQuery.cpp | 3 +-- .../InterpreterSelectWithUnionQuery.cpp | 3 +-- src/Processors/QueryPlan/DistinctStep.cpp | 14 ++++++-------- src/Processors/QueryPlan/DistinctStep.h | 4 +--- ...02317_distinct_in_order_optimization.reference | 1 - .../02317_distinct_in_order_optimization.sql | 11 ++--------- ...stinct_in_order_optimization_explain.reference | 9 +++------ ...2317_distinct_in_order_optimization_explain.sh | 15 +++++++-------- 9 files changed, 21 insertions(+), 40 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 0a6f4bee5ba..17e4d27bbcd 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -624,7 +624,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ - M(Bool, optimize_distinct_in_order_memory_usage, false, "Try to use less memory for DISTINCT in order but can be slower", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. 
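For context, the effect of optimize_distinct_in_order can be observed with EXPLAIN PIPELINE. The sketch below is illustrative only (the table name and data are made up) and mirrors the checks in 02317_distinct_in_order_optimization_explain.sh updated further down, where DISTINCT over a prefix of the sorting key is expected to use DistinctSortedChunkTransform rather than the generic DistinctTransform.

set optimize_distinct_in_order = 1;
create table distinct_demo (a int, b int, c int) engine=MergeTree() order by (a, b);
insert into distinct_demo select number % 3, number % 5, number from numbers(100);
-- for a sorting-key prefix the pipeline should show DistinctSortedChunkTransform
explain pipeline select distinct a, b from distinct_demo;
drop table distinct_demo;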
diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 279694f917e..24bbaea7dcf 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2684,8 +2684,7 @@ void InterpreterSelectQuery::executeDistinct(QueryPlan & query_plan, bool before limit_for_distinct, columns, pre_distinct, - settings.optimize_distinct_in_order, - settings.optimize_distinct_in_order_memory_usage); + settings.optimize_distinct_in_order); if (pre_distinct) distinct_step->setStepDescription("Preliminary DISTINCT"); diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index f01a1210c3a..b2622607760 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -334,8 +334,7 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan) 0, result_header.getNames(), false, - settings.optimize_distinct_in_order, - settings.optimize_distinct_in_order_memory_usage); + settings.optimize_distinct_in_order); query_plan.addStep(std::move(distinct_step)); } diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index 9223218d82b..c268cb44267 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -80,8 +80,7 @@ DistinctStep::DistinctStep( UInt64 limit_hint_, const Names & columns_, bool pre_distinct_, - bool optimize_distinct_in_order_, - bool optimize_distinct_in_order_memory_usage_) + bool optimize_distinct_in_order_) : ITransformingStep( input_stream_, input_stream_.header, @@ -91,7 +90,6 @@ DistinctStep::DistinctStep( , columns(columns_) , pre_distinct(pre_distinct_) , optimize_distinct_in_order(optimize_distinct_in_order_) - , optimize_distinct_in_order_memory_usage(optimize_distinct_in_order_memory_usage_) { if (!output_stream->distinct_columns.empty() /// Columns already distinct, do nothing && (!pre_distinct /// Main distinct @@ -140,7 +138,7 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil { assert(input_stream.has_single_port); - if (distinct_sort_desc.size() >= columns.size() || optimize_distinct_in_order_memory_usage) + if (distinct_sort_desc.size() < columns.size()) { pipeline.addSimpleTransform( [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr @@ -148,8 +146,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (stream_type != QueryPipelineBuilder::StreamType::Main) return nullptr; - return std::make_shared( - header, set_size_limits, limit_hint, distinct_sort_desc, columns, true); + return std::make_shared( + header, distinct_sort_desc, set_size_limits, limit_hint, columns); }); } else @@ -160,8 +158,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (stream_type != QueryPipelineBuilder::StreamType::Main) return nullptr; - return std::make_shared( - header, distinct_sort_desc, set_size_limits, limit_hint, columns); + return std::make_shared( + header, set_size_limits, limit_hint, distinct_sort_desc, columns, true); }); } return; diff --git a/src/Processors/QueryPlan/DistinctStep.h b/src/Processors/QueryPlan/DistinctStep.h index 0d3b7e3b7e0..dc734a58704 100644 --- a/src/Processors/QueryPlan/DistinctStep.h +++ b/src/Processors/QueryPlan/DistinctStep.h @@ -15,8 +15,7 @@ public: UInt64 limit_hint_, const Names & columns_, bool 
pre_distinct_, /// If is enabled, execute distinct for separate streams. Otherwise, merge streams. - bool optimize_distinct_in_order_, - bool optimize_distinct_in_order_memory_usage_); + bool optimize_distinct_in_order_); String getName() const override { return "Distinct"; } @@ -33,7 +32,6 @@ private: Names columns; bool pre_distinct; bool optimize_distinct_in_order; - bool optimize_distinct_in_order_memory_usage; }; } diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference index a82e9b5cc2e..a5ae3759d5d 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.reference @@ -78,4 +78,3 @@ 1 1 0 0 -- check that distinct in order has the same result as ordinary distinct --- check that distinct in order has the same result as ordinary distinct, optimize memory usage diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql index 14618baff95..f1de07e3db2 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization.sql @@ -49,19 +49,12 @@ select '-- check that distinct in order has the same result as ordinary distinct drop table if exists distinct_cardinality_low sync; CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium); INSERT INTO distinct_cardinality_low SELECT number % 1e1, number % 1e2, number % 1e3 FROM numbers_mt(1e4); + drop table if exists distinct_in_order sync; drop table if exists ordinary_distinct sync; -create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); -insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1, optimize_distinct_in_order_memory_usage=0; -create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); -insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; -select distinct * from distinct_in_order except select * from ordinary_distinct; -select '-- check that distinct in order has the same result as ordinary distinct, optimize memory usage'; -drop table if exists distinct_in_order; -drop table if exists ordinary_distinct; create table distinct_in_order (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); -insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1, optimize_distinct_in_order_memory_usage=1; +insert into distinct_in_order select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=1; create table ordinary_distinct (low UInt64, medium UInt64, high UInt64) engine=MergeTree() order by (low, medium); insert into ordinary_distinct select distinct * from distinct_cardinality_low order by high settings optimize_distinct_in_order=0; select distinct * from distinct_in_order except select * from ordinary_distinct; diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index b41d853e3eb..3e57d4de586 
100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -1,5 +1,5 @@ -- disable optimize_distinct_in_order --- distinct all primary key columns -> no optimizations +-- distinct all primary key columns -> ordinary distinct DistinctTransform DistinctTransform -- enable optimize_distinct_in_order @@ -18,18 +18,15 @@ DistinctSortedChunkTransform -- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization DistinctSortedTransform DistinctSortedChunkTransform --- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization -DistinctSortedStreamTransform -DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only DistinctTransform DistinctSortedChunkTransform --- distinct with non-primary key prefix -> no optimizations +-- distinct with non-primary key prefix -> ordinary distinct DistinctTransform DistinctTransform -- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only DistinctSortedTransform DistinctTransform --- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations +-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct DistinctTransform DistinctTransform diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index c19bc3e6aaf..903f3bb9e11 100755 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -13,11 +13,11 @@ TRIM_LEADING_SPACES="sed -e 's/^[ \t]*//'" FIND_DISTINCT="$GREP_DISTINCT | $TRIM_LEADING_SPACES" $CLICKHOUSE_CLIENT -q "drop table if exists distinct_in_order_explain sync" -$CLICKHOUSE_CLIENT -q "create table distinct_in_order_explain (a int, b int, c int) engine=MergeTree() order by (a, b, c)" +$CLICKHOUSE_CLIENT -q "create table distinct_in_order_explain (a int, b int, c int) engine=MergeTree() order by (a, b)" $CLICKHOUSE_CLIENT -q "insert into distinct_in_order_explain select number % number, number % 5, number % 10 from numbers(1,10)" $CLICKHOUSE_CLIENT -q "select '-- disable optimize_distinct_in_order'" -$CLICKHOUSE_CLIENT -q "select '-- distinct all primary key columns -> no optimizations'" +$CLICKHOUSE_CLIENT -q "select '-- distinct all primary key columns -> ordinary distinct'" $CLICKHOUSE_CLIENT -nq "$DISABLE_OPTIMIZATION;explain pipeline select distinct * from distinct_in_order_explain" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- enable optimize_distinct_in_order'" @@ -34,19 +34,18 @@ $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c 
settings optimize_distinct_in_order_memory_usage=0" | eval $FIND_DISTINCT - -$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix, optimize memory usage -> pre-distinct and final distinct optimization'" -$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c settings optimize_distinct_in_order_memory_usage=1" | eval $FIND_DISTINCT +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix -> no optimizations'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix -> ordinary distinct'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by a" | eval $FIND_DISTINCT + +$CLICKHOUSE_CLIENT -q "drop table if exists distinct_in_order_explain sync" From dd8667d3f995bcfa92ec546ad51cfb6ad1f7e89e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 25 Jul 2022 13:02:25 +0200 Subject: [PATCH 158/227] Update settings description --- src/Core/Settings.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index fcb23fe2d39..61ea25bcc3d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -696,11 +696,11 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, input_format_max_rows_to_read_for_schema_inference, 25000, "The maximum rows of data to read for automatic schema inference", 0) \ M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ - M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Parquet", 0) \ - M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Allow to skip fields with unsupported types while schema inference for format Protobuf", 0) \ - M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for 
format CapnProto", 0) \ - M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format ORC", 0) \ - M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Allow to skip columns with unsupported types while schema inference for format Arrow", 0) \ + M(Bool, input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Parquet", 0) \ + M(Bool, input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format Protobuf", 0) \ + M(Bool, input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format CapnProto", 0) \ + M(Bool, input_format_orc_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format ORC", 0) \ + M(Bool, input_format_arrow_skip_columns_with_unsupported_types_in_schema_inference, false, "Skip columns with unsupported types while schema inference for format Arrow", 0) \ M(String, column_names_for_schema_inference, "", "The list of column names to use in schema inference for formats without column names. The format: 'column1,column2,column3,...'", 0) \ M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \ From 33ee858d18bcc4d9d80e8754d01e5d1cce507b75 Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Mon, 25 Jul 2022 13:11:02 +0200 Subject: [PATCH 159/227] Fix bug with maxsplit in the splitByChar --- src/Functions/FunctionsStringArray.h | 1 + tests/queries/0_stateless/02185_split_by_char.reference | 2 ++ tests/queries/0_stateless/02185_split_by_char.sql | 4 +++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index b0f415be58a..95c06ae74ca 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -337,6 +337,7 @@ public: { pos = pos_; end = end_; + curr_split = 0; } bool get(Pos & token_begin, Pos & token_end) diff --git a/tests/queries/0_stateless/02185_split_by_char.reference b/tests/queries/0_stateless/02185_split_by_char.reference index f69d8d35006..6afd0824a73 100644 --- a/tests/queries/0_stateless/02185_split_by_char.reference +++ b/tests/queries/0_stateless/02185_split_by_char.reference @@ -3,3 +3,5 @@ ['1','2,3'] ['1','2','3'] ['1','2','3'] +['expr1','1+1=2'] +['expr2','2+2=4=1+3'] diff --git a/tests/queries/0_stateless/02185_split_by_char.sql b/tests/queries/0_stateless/02185_split_by_char.sql index 6c490654813..c45f3de07eb 100644 --- a/tests/queries/0_stateless/02185_split_by_char.sql +++ b/tests/queries/0_stateless/02185_split_by_char.sql @@ -5,4 +5,6 @@ select splitByChar(',', '1,2,3', 2); select splitByChar(',', '1,2,3', 3); select splitByChar(',', '1,2,3', -2); -- { serverError 44 } -select splitByChar(',', '1,2,3', ''); -- { serverError 43 } \ No newline at end of file +select splitByChar(',', '1,2,3', ''); -- { 
serverError 43 } + +SELECT splitByChar('=', s, 1) FROM values('s String', 'expr1=1+1=2', 'expr2=2+2=4=1+3') From 04b03b6a9092e066b1575b50e78ed0693b025d15 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 25 Jul 2022 12:38:48 +0000 Subject: [PATCH 160/227] Don't allow to overwrite on startup --- src/Coordination/KeeperContext.h | 2 + src/Coordination/KeeperServer.cpp | 1 + src/Coordination/KeeperSnapshotManager.cpp | 34 ++++++++-- src/Coordination/KeeperStorage.cpp | 78 ++++++++++++++++------ 4 files changed, 87 insertions(+), 28 deletions(-) diff --git a/src/Coordination/KeeperContext.h b/src/Coordination/KeeperContext.h index b53893039a1..84ec65cecde 100644 --- a/src/Coordination/KeeperContext.h +++ b/src/Coordination/KeeperContext.h @@ -12,6 +12,8 @@ struct KeeperContext }; Phase server_state{Phase::INIT}; + + bool ignore_system_path_on_startup{false}; bool digest_enabled{true}; }; diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 4378fe6b09c..587ab9c8f66 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -114,6 +114,7 @@ KeeperServer::KeeperServer( LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); keeper_context->digest_enabled = config.getBool("keeper_server.digest_enabled", false); + keeper_context->ignore_system_path_on_startup = config.getBool("keeper_server.ignore_system_path_on_startup", false); state_machine = nuraft::cs_new( responses_queue_, diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 8491c662a17..90281e6fc5a 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -367,16 +367,36 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial using enum PathMatchResult; auto match_result = matchPath(path, keeper_system_path); - if ((match_result == EXACT && !is_node_empty(node)) || match_result == IS_CHILD) + + const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path); + if (match_result == IS_CHILD) { - LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "Cannot read node on path {} from a snapshot because it is used as a system node.", path); - - if (match_result == IS_CHILD) + if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT) + { + LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg); continue; - - node = KeeperStorage::Node{}; + } + else + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. " + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true", + error_msg); + } + else if (match_result == EXACT && !is_node_empty(node)) + { + if (keeper_context->ignore_system_path_on_startup || keeper_context->server_state != KeeperContext::Phase::INIT) + { + LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "{}. Ignoring it", error_msg); + node = KeeperStorage::Node{}; + } + else + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. 
" + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true", + error_msg); } - storage.container.insertOrReplace(path, node); if (node.stat.ephemeralOwner != 0) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index fc55fe65f35..3956bb0e930 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -666,7 +666,7 @@ struct KeeperStorageRequestProcessor explicit KeeperStorageRequestProcessor(const Coordination::ZooKeeperRequestPtr & zk_request_) : zk_request(zk_request_) { } virtual Coordination::ZooKeeperResponsePtr process(KeeperStorage & storage, int64_t zxid) const = 0; virtual std::vector - preprocess(KeeperStorage & /*storage*/, int64_t /*zxid*/, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const + preprocess(KeeperStorage & /*storage*/, int64_t /*zxid*/, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const { return {}; } @@ -782,7 +782,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr } std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override { Coordination::ZooKeeperCreateRequest & request = dynamic_cast(*zk_request); @@ -808,15 +808,24 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr path_created += seq_num_str.str(); } - if (storage.uncommitted_state.getNode(path_created)) - return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; - if (path_created.starts_with(keeper_system_path)) { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created); + auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created); + + if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. 
" + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", + error_msg); + + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } + if (storage.uncommitted_state.getNode(path_created)) + return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; + if (getBaseName(path_created).size == 0) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; @@ -901,7 +910,7 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override { Coordination::ZooKeeperGetRequest & request = dynamic_cast(*zk_request); @@ -970,7 +979,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & digest) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & digest, const KeeperContext & keeper_context) const override { Coordination::ZooKeeperRemoveRequest & request = dynamic_cast(*zk_request); @@ -978,7 +987,16 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr if (request.path.starts_with(keeper_system_path)) { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to delete an internal Keeper path ({}) which is not allowed", request.path); + auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path); + + if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. 
" + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", + error_msg); + + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } @@ -1058,7 +1076,7 @@ struct KeeperStorageExistsRequestProcessor final : public KeeperStorageRequestPr using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override { Coordination::ZooKeeperExistsRequest & request = dynamic_cast(*zk_request); @@ -1122,7 +1140,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t time, uint64_t & digest) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override { Coordination::ZooKeeperSetRequest & request = dynamic_cast(*zk_request); @@ -1130,7 +1148,16 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce if (request.path.starts_with(keeper_system_path)) { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", request.path); + auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); + + if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. 
" + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", + error_msg); + + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } @@ -1212,7 +1239,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override { Coordination::ZooKeeperListRequest & request = dynamic_cast(*zk_request); @@ -1313,7 +1340,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override { Coordination::ZooKeeperCheckRequest & request = dynamic_cast(*zk_request); @@ -1391,13 +1418,22 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & digest) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & digest, const KeeperContext & keeper_context) const override { Coordination::ZooKeeperSetACLRequest & request = dynamic_cast(*zk_request); if (request.path.starts_with(keeper_system_path)) { - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), "Trying to update an internal Keeper path ({}) which is not allowed", request.path); + auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); + + if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. 
" + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", + error_msg); + + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } @@ -1471,7 +1507,7 @@ struct KeeperStorageGetACLRequestProcessor final : public KeeperStorageRequestPr using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t /*session_id*/, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override { Coordination::ZooKeeperGetACLRequest & request = dynamic_cast(*zk_request); @@ -1568,14 +1604,14 @@ struct KeeperStorageMultiRequestProcessor final : public KeeperStorageRequestPro } std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t time, uint64_t & digest, const KeeperContext & keeper_context) const override { std::vector response_errors; response_errors.reserve(concrete_requests.size()); uint64_t current_digest = digest; for (size_t i = 0; i < concrete_requests.size(); ++i) { - auto new_deltas = concrete_requests[i]->preprocess(storage, zxid, session_id, time, current_digest); + auto new_deltas = concrete_requests[i]->preprocess(storage, zxid, session_id, time, current_digest, keeper_context); if (!new_deltas.empty()) { @@ -1694,7 +1730,7 @@ struct KeeperStorageAuthRequestProcessor final : public KeeperStorageRequestProc { using KeeperStorageRequestProcessor::KeeperStorageRequestProcessor; std::vector - preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & /*digest*/) const override + preprocess(KeeperStorage & storage, int64_t zxid, int64_t session_id, int64_t /*time*/, uint64_t & /*digest*/, const KeeperContext & /*keeper_context*/) const override { Coordination::ZooKeeperAuthRequest & auth_request = dynamic_cast(*zk_request); Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); @@ -1955,7 +1991,7 @@ void KeeperStorage::preprocessRequest( return; } - new_deltas = request_processor->preprocess(*this, transaction.zxid, session_id, time, new_digest); + new_deltas = request_processor->preprocess(*this, transaction.zxid, session_id, time, new_digest, *keeper_context); } KeeperStorage::ResponsesForSessions KeeperStorage::processRequest( From b6f640013d8d6d96b3e78e42561d5667de1535e8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 25 Jul 2022 12:40:11 +0000 Subject: [PATCH 161/227] Fix bug in ASOF JOIN with enable_optimize_predicate_expression --- src/Interpreters/JoinedTables.cpp | 11 ++++++++++- ..._asof_optimize_predicate_bug_37813.reference | 2 ++ .../02366_asof_optimize_predicate_bug_37813.sql | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.reference create mode 100644 tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.sql diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index df47e8acdca..27c372ab5c7 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -59,11 +59,20 @@ void replaceJoinedTable(const ASTSelectQuery & 
select_query) if (!join || !join->table_expression) return; - /// TODO: Push down for CROSS JOIN is not OK [disabled] const auto & table_join = join->table_join->as(); + + /// TODO: Push down for CROSS JOIN is not OK [disabled] if (table_join.kind == ASTTableJoin::Kind::Cross) return; + /* Do not push down predicates for ASOF because it can lead to incorrect results + * (for example, if we will filter a suitable row before joining and will choose another, not the closest row). + * ANY join behavior can also be different with this optimization, + * but it's ok because we don't guarantee which row to choose for ANY, unlike ASOF, where we have to pick the closest one. + */ + if (table_join.strictness == ASTTableJoin::Strictness::Asof) + return; + auto & table_expr = join->table_expression->as(); if (table_expr.database_and_table_name) { diff --git a/tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.reference b/tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.sql b/tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.sql new file mode 100644 index 00000000000..4db42a22560 --- /dev/null +++ b/tests/queries/0_stateless/02366_asof_optimize_predicate_bug_37813.sql @@ -0,0 +1,17 @@ +DROP TABLE IF EXISTS t1; +CREATE TABLE t1 (c1 Int32, c2 Int32) ENGINE MergeTree ORDER BY c1; +INSERT INTO t1 (c1, c2) VALUES (1, 10), (1, 20), (1, 30); + +DROP TABLE IF EXISTS t2; +CREATE TABLE t2 (c1 Int32, c2 Int32, c3 String) ENGINE MergeTree ORDER BY (c1, c2, c3); +INSERT INTO t2 (c1, c2, c3) VALUES (1, 5, 'a'), (1, 15, 'b'), (1, 25, 'c'); + +SET enable_optimize_predicate_expression = 1; +WITH + v1 AS (SELECT t1.c2, t2.c2, t2.c3 FROM t1 ASOF JOIN t2 USING (c1, c2)) + SELECT count() FROM v1 WHERE c3 = 'b'; + +SET enable_optimize_predicate_expression = 0; +WITH + v1 AS (SELECT t1.c2, t2.c2, t2.c3 FROM t1 ASOF JOIN t2 USING (c1, c2)) + SELECT count() FROM v1 WHERE c3 = 'b'; From 7b44950d1dca791b02a6154095aea1635ec9dd2c Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:14:33 +0200 Subject: [PATCH 162/227] More detailed comment --- src/Storages/MergeTree/MutateTask.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 59c784208c5..e09c87311aa 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1483,7 +1483,8 @@ bool MutateTask::prepare() ctx->materialized_indices = ctx->interpreter->grabMaterializedIndices(); ctx->materialized_projections = ctx->interpreter->grabMaterializedProjections(); ctx->mutation_kind = ctx->interpreter->getMutationKind(); - /// Always disable filtering in mutations, we want to read all rows + /// Always disable filtering in mutations: we want to read and write all rows because for updates we rewrite only some of the + /// columns and preserve the columns that are not affected, but after the update all columns must have the same number of rows. 
ctx->interpreter->setApplyDeletedMask(false); ctx->mutating_pipeline_builder = ctx->interpreter->execute(); ctx->updated_header = ctx->interpreter->getUpdatedHeader(); From 48de02a7b89e868ae44b25d8abb4dd95d1710336 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:15:15 +0200 Subject: [PATCH 163/227] Capitalized const name --- src/Interpreters/InterpreterDeleteQuery.cpp | 2 +- src/Interpreters/MutationsInterpreter.cpp | 8 ++++---- src/Storages/AlterCommands.cpp | 4 ++-- src/Storages/LightweightDeleteDescription.cpp | 2 +- src/Storages/LightweightDeleteDescription.h | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/MergeTree/MergeTask.cpp | 2 +- .../MergeTree/MergeTreeBaseSelectProcessor.cpp | 10 +++++----- src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 2 +- src/Storages/StorageSnapshot.cpp | 2 +- 14 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 8c8030c6a51..47b0050e46c 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -82,7 +82,7 @@ BlockIO InterpreterDeleteQuery::execute() command->predicate = delete_query.predicate; command->update_assignments = std::make_shared(); auto set_row_does_not_exist = std::make_shared(); - set_row_does_not_exist->column_name = LightweightDeleteDescription::filter_column.name; + set_row_does_not_exist->column_name = LightweightDeleteDescription::FILTER_COLUMN.name; auto zero_value = std::make_shared(DB::Field(UInt8(0))); set_row_does_not_exist->children.push_back(zero_value); command->update_assignments->children.push_back(set_row_does_not_exist); diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 64ea313a1af..7b2d55911d1 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -351,7 +351,7 @@ static void validateUpdateColumns( } /// Allow to override value of lightweight delete filter virtual column - if (!found && column_name == LightweightDeleteDescription::filter_column.name) + if (!found && column_name == LightweightDeleteDescription::FILTER_COLUMN.name) { if (!storage->supportsLightweightDelete()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Lightweight delete is not supported for table"); @@ -514,8 +514,8 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) DataTypePtr type; if (auto physical_column = columns_desc.tryGetPhysical(column)) type = physical_column->type; - else if (column == LightweightDeleteDescription::filter_column.name) - type = LightweightDeleteDescription::filter_column.type; + else if (column == LightweightDeleteDescription::FILTER_COLUMN.name) + type = LightweightDeleteDescription::FILTER_COLUMN.type; else throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown column {}", column); @@ -782,7 +782,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & if (auto part_storage = dynamic_pointer_cast(storage)) { if (part_storage->hasLightweightDeletedMask()) - all_columns.push_back({LightweightDeleteDescription::filter_column}); + all_columns.push_back({LightweightDeleteDescription::FILTER_COLUMN}); } /// Next, for each stage calculate columns 
changed by this and previous stages. diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 808d634b1ea..7f99abf31fc 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -787,7 +787,7 @@ bool AlterCommand::isRequireMutationStage(const StorageInMemoryMetadata & metada /// Drop alias is metadata alter, in other case mutation is required. if (type == DROP_COLUMN) return metadata.columns.hasColumnOrNested(GetColumnsOptions::AllPhysical, column_name) || - column_name == LightweightDeleteDescription::filter_column.name; + column_name == LightweightDeleteDescription::FILTER_COLUMN.name; if (type != MODIFY_COLUMN || data_type == nullptr) return false; @@ -1153,7 +1153,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const { if (all_columns.has(command.column_name) || all_columns.hasNested(command.column_name) || - (command.clear && column_name == LightweightDeleteDescription::filter_column.name)) + (command.clear && column_name == LightweightDeleteDescription::FILTER_COLUMN.name)) { if (!command.clear) /// CLEAR column is Ok even if there are dependencies. { diff --git a/src/Storages/LightweightDeleteDescription.cpp b/src/Storages/LightweightDeleteDescription.cpp index 0ffb7766c80..ae5e68da9c2 100644 --- a/src/Storages/LightweightDeleteDescription.cpp +++ b/src/Storages/LightweightDeleteDescription.cpp @@ -4,6 +4,6 @@ namespace DB { -const NameAndTypePair LightweightDeleteDescription::filter_column {"_row_exists", std::make_shared()}; +const NameAndTypePair LightweightDeleteDescription::FILTER_COLUMN {"_row_exists", std::make_shared()}; } diff --git a/src/Storages/LightweightDeleteDescription.h b/src/Storages/LightweightDeleteDescription.h index 7177b2a5b54..45bde59ea71 100644 --- a/src/Storages/LightweightDeleteDescription.h +++ b/src/Storages/LightweightDeleteDescription.h @@ -7,7 +7,7 @@ namespace DB struct LightweightDeleteDescription { - static const NameAndTypePair filter_column; + static const NameAndTypePair FILTER_COLUMN; }; } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index b205da4d4c2..1218740daa0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -461,7 +461,7 @@ public: bool supportLightweightDeleteMutate() const; /// True if here is lightweight deleted mask file in part. 
- bool hasLightweightDelete() const { return columns.contains(LightweightDeleteDescription::filter_column.name); } + bool hasLightweightDelete() const { return columns.contains(LightweightDeleteDescription::FILTER_COLUMN.name); } protected: diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 77d3089d37b..e4eadf9adf7 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -814,7 +814,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() auto columns = global_ctx->merging_column_names; /// The part might have some rows masked by lightweight deletes - const auto lightweight_delete_filter_column = LightweightDeleteDescription::filter_column.name; + const auto lightweight_delete_filter_column = LightweightDeleteDescription::FILTER_COLUMN.name; const bool need_to_filter_deleted_rows = part->hasLightweightDelete(); if (need_to_filter_deleted_rows) columns.emplace_back(lightweight_delete_filter_column); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 00b9959739f..08142bd8dd1 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -61,7 +61,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( { non_const_virtual_column_names.emplace_back(*it); } - else if (*it == LightweightDeleteDescription::filter_column.name) + else if (*it == LightweightDeleteDescription::FILTER_COLUMN.name) { non_const_virtual_column_names.emplace_back(*it); } @@ -244,7 +244,7 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart( /// Add lightweight delete filtering step if (reader_settings.apply_deleted_mask && data_part->hasLightweightDelete()) { - pre_reader_for_step.push_back(data_part->getReader({LightweightDeleteDescription::filter_column}, metadata_snapshot, mark_ranges, + pre_reader_for_step.push_back(data_part->getReader({LightweightDeleteDescription::FILTER_COLUMN}, metadata_snapshot, mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, value_size_map, profile_callback)); } @@ -469,14 +469,14 @@ static void injectNonConstVirtualColumns( } } - if (virtual_column_name == LightweightDeleteDescription::filter_column.name) + if (virtual_column_name == LightweightDeleteDescription::FILTER_COLUMN.name) { /// If _row_exists column isn't present in the part then fill it here with 1s ColumnPtr column; if (rows) - column = LightweightDeleteDescription::filter_column.type->createColumnConst(rows, 1)->convertToFullColumnIfConst(); + column = LightweightDeleteDescription::FILTER_COLUMN.type->createColumnConst(rows, 1)->convertToFullColumnIfConst(); else - column = LightweightDeleteDescription::filter_column.type->createColumn(); + column = LightweightDeleteDescription::FILTER_COLUMN.type->createColumn(); inserter.insertUInt8Column(column, virtual_column_name); } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 1a04c2ef25f..aa1b9d3541e 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -103,7 +103,7 @@ protected: StorageSnapshotPtr storage_snapshot; /// This step is added when the part has lightweight delete mask - const PrewhereExprStep lightweight_delete_filter_step { nullptr, LightweightDeleteDescription::filter_column.name, true, true }; + const 
PrewhereExprStep lightweight_delete_filter_step { nullptr, LightweightDeleteDescription::FILTER_COLUMN.name, true, true }; PrewhereInfoPtr prewhere_info; std::unique_ptr prewhere_actions; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f20121898ee..53425bc02f3 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6575,7 +6575,7 @@ NamesAndTypesList MergeTreeData::getVirtuals() const NameAndTypePair("_partition_value", getPartitionValueType()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_part_offset", std::make_shared()), - LightweightDeleteDescription::filter_column, + LightweightDeleteDescription::FILTER_COLUMN, }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 0a7b3071559..a96486c79ae 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1215,7 +1215,7 @@ static void selectColumnNames( { virt_column_names.push_back(name); } - else if (name == LightweightDeleteDescription::filter_column.name) + else if (name == LightweightDeleteDescription::FILTER_COLUMN.name) { virt_column_names.push_back(name); } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index acc90fe7313..a10192c4cc1 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -1150,7 +1150,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r block.insert({result.columns[pos], std::make_shared(), column_name}); } - else if (column_name == LightweightDeleteDescription::filter_column.name) + else if (column_name == LightweightDeleteDescription::FILTER_COLUMN.name) { /// Do nothing, it will be added later } diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index e09c87311aa..7d0c37051e7 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -172,7 +172,7 @@ getColumnsForNewDataPart( ColumnsDescription part_columns(source_part->getColumns()); NamesAndTypesList system_columns; if (source_part->supportLightweightDeleteMutate()) - system_columns.push_back(LightweightDeleteDescription::filter_column); + system_columns.push_back(LightweightDeleteDescription::FILTER_COLUMN); /// Preserve system columns that have persisted values in the source_part for (const auto & column : system_columns) diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index 5f9857b28ef..a99fec8c154 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -22,7 +22,7 @@ void StorageSnapshot::init() virtual_columns[name] = type; if (storage.hasLightweightDeletedMask()) - system_columns[LightweightDeleteDescription::filter_column.name] = LightweightDeleteDescription::filter_column.type; + system_columns[LightweightDeleteDescription::FILTER_COLUMN.name] = LightweightDeleteDescription::FILTER_COLUMN.type; } NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) const From 5aae0a2e044627f67e8889f2d7b24c0b2f532d7d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 25 Jul 2022 17:20:01 +0200 Subject: [PATCH 164/227] Fix style --- src/Formats/CapnProtoUtils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/Formats/CapnProtoUtils.cpp b/src/Formats/CapnProtoUtils.cpp index 84af46de5de..b8702380aa7 100644 --- a/src/Formats/CapnProtoUtils.cpp +++ b/src/Formats/CapnProtoUtils.cpp @@ -29,6 +29,7 @@ namespace ErrorCodes extern const int UNKNOWN_EXCEPTION; extern const int INCORRECT_DATA; extern const int CAPN_PROTO_BAD_TYPE; + extern const int BAD_ARGUMENTS; } capnp::StructSchema CapnProtoSchemaParser::getMessageSchema(const FormatSchemaInfo & schema_info) From 9ffaf2fef2de730d9724615cae6235a381ff3086 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 25 Jul 2022 17:32:51 +0200 Subject: [PATCH 165/227] temporarily disable all tests with materialized postgres --- .../test_postgresql_replica_database_engine_1/test.py | 8 ++++++++ .../test_postgresql_replica_database_engine_2/test.py | 8 ++++++++ tests/integration/test_storage_postgresql_replica/test.py | 8 ++++++++ 3 files changed, 24 insertions(+) diff --git a/tests/integration/test_postgresql_replica_database_engine_1/test.py b/tests/integration/test_postgresql_replica_database_engine_1/test.py index e7b642b5028..5072c261cf7 100644 --- a/tests/integration/test_postgresql_replica_database_engine_1/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_1/test.py @@ -1,4 +1,12 @@ import pytest + +# FIXME Tests with MaterializedPostgresSQL are temporarily disabled +# https://github.com/ClickHouse/ClickHouse/issues/36898 +# https://github.com/ClickHouse/ClickHouse/issues/38677 +# https://github.com/ClickHouse/ClickHouse/pull/39272#issuecomment-1190087190 + +pytestmark = pytest.mark.skip + import time import os.path as p import random diff --git a/tests/integration/test_postgresql_replica_database_engine_2/test.py b/tests/integration/test_postgresql_replica_database_engine_2/test.py index 796edf04f06..9b4de5356bf 100644 --- a/tests/integration/test_postgresql_replica_database_engine_2/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_2/test.py @@ -1,4 +1,12 @@ import pytest + +# FIXME Tests with MaterializedPostgresSQL are temporarily disabled +# https://github.com/ClickHouse/ClickHouse/issues/36898 +# https://github.com/ClickHouse/ClickHouse/issues/38677 +# https://github.com/ClickHouse/ClickHouse/pull/39272#issuecomment-1190087190 + +pytestmark = pytest.mark.skip + import time import psycopg2 import os.path as p diff --git a/tests/integration/test_storage_postgresql_replica/test.py b/tests/integration/test_storage_postgresql_replica/test.py index 64f41022aef..5df8b9029e6 100644 --- a/tests/integration/test_storage_postgresql_replica/test.py +++ b/tests/integration/test_storage_postgresql_replica/test.py @@ -1,4 +1,12 @@ import pytest + +# FIXME Tests with MaterializedPostgresSQL are temporarily disabled +# https://github.com/ClickHouse/ClickHouse/issues/36898 +# https://github.com/ClickHouse/ClickHouse/issues/38677 +# https://github.com/ClickHouse/ClickHouse/pull/39272#issuecomment-1190087190 + +pytestmark = pytest.mark.skip + import time import psycopg2 import os.path as p From f1e1cff27235b97b98a1dc445a589daadf76f07c Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 25 Jul 2022 18:00:54 +0200 Subject: [PATCH 166/227] fix create/drop index on cluster --- src/Interpreters/DDLWorker.cpp | 8 ++++++- ...9_sql_standard_create_drop_index.reference | 6 +++++ .../02319_sql_standard_create_drop_index.sql | 22 +++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 51932ad051b..13432940c1b 100644 --- 
a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include #include @@ -652,7 +654,11 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, const Stora if (auto * query = ast_ddl->as(); query && query->kind != ASTDropQuery::Kind::Truncate) return false; - if (!ast_ddl->as() && !ast_ddl->as() && !ast_ddl->as()) + if (!ast_ddl->as() && + !ast_ddl->as() && + !ast_ddl->as() && + !ast_ddl->as() && + !ast_ddl->as()) return false; if (auto * alter = ast_ddl->as()) diff --git a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference index 6565857f89d..a4a924fd229 100644 --- a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference +++ b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference @@ -2,3 +2,9 @@ CREATE TABLE default.t_index\n(\n `a` Int32,\n `b` String,\n INDEX i_a t_index i_a minmax a 4 t_index i_b bloom_filter b 2 t_index i_b bloom_filter b 2 +CREATE TABLE default.t_index\n(\n `a` Int32,\n `b` String,\n INDEX i_a a TYPE minmax GRANULARITY 4,\n INDEX i_b b TYPE bloom_filter GRANULARITY 2\n)\nENGINE = ReplicatedMergeTree(\'/test/2319/default/\', \'1\')\nORDER BY a\nSETTINGS index_granularity = 8192 +CREATE TABLE default.t_index_replica\n(\n `a` Int32,\n `b` String,\n INDEX i_a a TYPE minmax GRANULARITY 4,\n INDEX i_b b TYPE bloom_filter GRANULARITY 2\n)\nENGINE = ReplicatedMergeTree(\'/test/2319/default/\', \'2\')\nORDER BY a\nSETTINGS index_granularity = 8192 +t_index i_a minmax a 4 +t_index i_b bloom_filter b 2 +t_index i_b bloom_filter b 2 +t_index_replica i_b bloom_filter b 2 diff --git a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql index a33505ced3a..bb01dcf2e64 100644 --- a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql +++ b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql @@ -15,3 +15,25 @@ drop index if exists i_a on t_index; select table, name, type, expr, granularity from system.data_skipping_indices where database = currentDatabase() and table = 't_index'; drop table t_index; + +create table t_index(a int, b String) engine=ReplicatedMergeTree('/test/2319/{database}/', '1') order by a; +create table t_index_replica(a int, b String) engine=ReplicatedMergeTree('/test/2319/{database}/', '2') order by a; + +create index i_a on t_index(a) TYPE minmax GRANULARITY 4; +create index if not exists i_a on t_index(a) TYPE minmax GRANULARITY 2; + +create index i_b on t_index(b) TYPE bloom_filter GRANULARITY 2; + +show create table t_index; +system sync replica t_index_replica; +show create table t_index_replica; +select table, name, type, expr, granularity from system.data_skipping_indices where database = currentDatabase() and table = 't_index'; + +drop index i_a on t_index; +drop index if exists i_a on t_index; + +select table, name, type, expr, granularity from system.data_skipping_indices where database = currentDatabase() and table = 't_index'; +system sync replica t_index_replica; +select table, name, type, expr, granularity from system.data_skipping_indices where database = currentDatabase() and table = 't_index_replica'; + +drop table t_index; From 72efcc65c15b3b36ab2ec3fc9c6209894702c307 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Mon, 25 Jul 2022 12:58:57 -0400 Subject: [PATCH 167/227] Update 
docs/en/engines/table-engines/mergetree-family/mergetree.md --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index e216a99f986..2ca07276e63 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -482,7 +482,9 @@ For example: ## Projections {#projections} Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries. - +::: note +When you are implementing projections you should also consider the [force_optimize_projection](../../../operations/settings/settings.md#force-optimize-projection) setting. +::: Projections are not supported in the `SELECT` statements with the [FINAL](../../../sql-reference/statements/select/from.md#select-from-final) modifier. ### Projection Query {#projection-query} From 813438a29d1a6a66efdafe4b24d5820f531a0db2 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Mon, 25 Jul 2022 14:48:53 -0300 Subject: [PATCH 168/227] Update settings.md --- docs/ru/operations/settings/settings.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 1d57b688217..5ad6e4ef3b6 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -656,8 +656,9 @@ ClickHouse может парсить только базовый формат `Y Изменяет поведение операций, выполняемых со строгостью `ANY`. -:::danger "Внимание" +:::warning "Внимание" Настройка применяется только для операций `JOIN`, выполняемых над таблицами с движком [Join](../../engines/table-engines/special/join.md). +::: Возможные значения: @@ -2112,8 +2113,9 @@ SELECT * FROM test_table Устанавливает приоритет ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) для потоков, исполняющих запросы. Планировщик ОС учитывает эти приоритеты при выборе следующего потока для исполнения на доступном ядре CPU. -:::danger "Предупреждение" +:::warning "Предупреждение" Для использования этой настройки необходимо установить свойство `CAP_SYS_NICE`. Пакет `clickhouse-server` устанавливает его во время инсталляции. Некоторые виртуальные окружения не позволяют установить `CAP_SYS_NICE`. В этом случае, `clickhouse-server` выводит сообщение при запуске. 
+::: Допустимые значения: From ddff6c86df62231469dac714c27b26f4064bcc96 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 25 Jul 2022 19:13:10 +0000 Subject: [PATCH 169/227] Update version_date.tsv after v22.3.9.19-lts --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index e4c7aae8b25..e8b2adce2b6 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -9,6 +9,7 @@ v22.4.5.9-stable 2022-05-06 v22.4.4.7-stable 2022-04-29 v22.4.3.3-stable 2022-04-26 v22.4.2.1-stable 2022-04-22 +v22.3.9.19-lts 2022-07-25 v22.3.8.39-lts 2022-07-07 v22.3.7.28-lts 2022-06-20 v22.3.6.5-lts 2022-05-06 From 250f19378dea3007b6d2e25a0d2d0545f9f36419 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Mon, 25 Jul 2022 15:22:48 -0400 Subject: [PATCH 170/227] free compression and decompression contexts --- utils/self-extracting-executable/compressor.cpp | 9 +++++++++ utils/self-extracting-executable/decompressor.cpp | 2 ++ 2 files changed, 11 insertions(+) diff --git a/utils/self-extracting-executable/compressor.cpp b/utils/self-extracting-executable/compressor.cpp index 58674818c44..f40c4725c32 100644 --- a/utils/self-extracting-executable/compressor.cpp +++ b/utils/self-extracting-executable/compressor.cpp @@ -103,12 +103,14 @@ int compress(int in_fd, int out_fd, int level, off_t & pointer, const struct sta if (ZSTD_isError(check_result)) { std::cerr << "Error (ZSTD): " << check_result << " " << ZSTD_getErrorName(check_result) << std::endl; + ZSTD_freeCCtx(cctx); return 1; } check_result = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1); if (ZSTD_isError(check_result)) { std::cerr << "Error (ZSTD): " << check_result << " " << ZSTD_getErrorName(check_result) << std::endl; + ZSTD_freeCCtx(cctx); return 1; } @@ -129,11 +131,13 @@ int compress(int in_fd, int out_fd, int level, off_t & pointer, const struct sta if (output == MAP_FAILED) { perror(nullptr); + ZSTD_freeCCtx(cctx); return 1; } if (-1 == lseek(out_fd, 0, SEEK_END)) { perror(nullptr); + ZSTD_freeCCtx(cctx); return 1; } @@ -154,6 +158,7 @@ int compress(int in_fd, int out_fd, int level, off_t & pointer, const struct sta perror(nullptr); if (0 != munmap(output, 2 * max_block_size)) perror(nullptr); + ZSTD_freeCCtx(cctx); return 1; } @@ -161,6 +166,7 @@ int compress(int in_fd, int out_fd, int level, off_t & pointer, const struct sta if (current_block_size != write_data(out_fd, output, current_block_size)) { perror(nullptr); + ZSTD_freeCCtx(cctx); return 1; } pointer += current_block_size; @@ -172,8 +178,11 @@ int compress(int in_fd, int out_fd, int level, off_t & pointer, const struct sta 0 != munmap(output, 2 * max_block_size)) { perror(nullptr); + ZSTD_freeCCtx(cctx); return 1; } + + ZSTD_freeCCtx(cctx); return 0; } diff --git a/utils/self-extracting-executable/decompressor.cpp b/utils/self-extracting-executable/decompressor.cpp index 3ec06e91176..679dc144f13 100644 --- a/utils/self-extracting-executable/decompressor.cpp +++ b/utils/self-extracting-executable/decompressor.cpp @@ -151,6 +151,8 @@ int decompress(char * input, char * output, off_t start, off_t end, size_t max_n --number_of_forks; } + ZSTD_freeDCtx(dctx); + /// If error happen end of processed part will not reach end if (in_pointer < end || error_happened) return 1; From f04ad30f0d49873711dafb6aca0341b12f3418e0 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 25 Jul 2022 19:25:27 +0000 Subject: [PATCH 171/227] Update 
version_date.tsv and changelogs after v22.6.4.35-stable --- docs/changelogs/v22.6.4.35-stable.md | 36 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 3 +++ 2 files changed, 39 insertions(+) create mode 100644 docs/changelogs/v22.6.4.35-stable.md diff --git a/docs/changelogs/v22.6.4.35-stable.md b/docs/changelogs/v22.6.4.35-stable.md new file mode 100644 index 00000000000..d70d20d6134 --- /dev/null +++ b/docs/changelogs/v22.6.4.35-stable.md @@ -0,0 +1,36 @@ +--- +sidebar_position: 1 +sidebar_label: 2022 +--- + +# 2022 Changelog + +### ClickHouse release v22.6.4.35-stable FIXME as compared to v22.6.3.35-stable + +#### Build/Testing/Packaging Improvement +* Backported in [#38822](https://github.com/ClickHouse/ClickHouse/issues/38822): - Change `all|noarch` packages to architecture-dependent - Fix some documentation for it - Push aarch64|arm64 packages to artifactory and release assets - Fixes [#36443](https://github.com/ClickHouse/ClickHouse/issues/36443). [#38580](https://github.com/ClickHouse/ClickHouse/pull/38580) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#38242](https://github.com/ClickHouse/ClickHouse/issues/38242): Fix possible crash in `Distributed` async insert in case of removing a replica from config. [#38029](https://github.com/ClickHouse/ClickHouse/pull/38029) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#38865](https://github.com/ClickHouse/ClickHouse/issues/38865): Fix s3 seekable reads with parallel read buffer. (Affected memory usage during query). Closes [#38258](https://github.com/ClickHouse/ClickHouse/issues/38258). [#38802](https://github.com/ClickHouse/ClickHouse/pull/38802) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#38853](https://github.com/ClickHouse/ClickHouse/issues/38853): Update `simdjson`. This fixes [#38621](https://github.com/ClickHouse/ClickHouse/issues/38621). [#38838](https://github.com/ClickHouse/ClickHouse/pull/38838) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#38942](https://github.com/ClickHouse/ClickHouse/issues/38942): - Fix settings profile with seconds unit. [#38896](https://github.com/ClickHouse/ClickHouse/pull/38896) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#39063](https://github.com/ClickHouse/ClickHouse/issues/39063): Any allocations inside OvercommitTracker may lead to deadlock. Logging was not very informative so it's easier just to remove logging. Fixes [#37794](https://github.com/ClickHouse/ClickHouse/issues/37794). [#39030](https://github.com/ClickHouse/ClickHouse/pull/39030) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#39077](https://github.com/ClickHouse/ClickHouse/issues/39077): Fix bug in filesystem cache that could happen in some corner case which coincided with cache capacity hitting the limit. Closes [#39066](https://github.com/ClickHouse/ClickHouse/issues/39066). [#39070](https://github.com/ClickHouse/ClickHouse/pull/39070) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#39151](https://github.com/ClickHouse/ClickHouse/issues/39151): Fix error `Block structure mismatch` which could happen for INSERT into table with attached MATERIALIZED VIEW and enabled setting `extremes = 1`. Closes [#29759](https://github.com/ClickHouse/ClickHouse/issues/29759) and [#38729](https://github.com/ClickHouse/ClickHouse/issues/38729). 
[#39125](https://github.com/ClickHouse/ClickHouse/pull/39125) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#39275](https://github.com/ClickHouse/ClickHouse/issues/39275): Fixed error `Not found column Type in block` in selects with `PREWHERE` and read-in-order optimizations. [#39157](https://github.com/ClickHouse/ClickHouse/pull/39157) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#39371](https://github.com/ClickHouse/ClickHouse/issues/39371): Declare RabbitMQ queue without default arguments `x-max-length` and `x-overflow`. [#39259](https://github.com/ClickHouse/ClickHouse/pull/39259) ([rnbondarenko](https://github.com/rnbondarenko)). +* Backported in [#39352](https://github.com/ClickHouse/ClickHouse/issues/39352): Fix incorrect fetch postgresql tables query fro PostgreSQL database engine. Closes [#33502](https://github.com/ClickHouse/ClickHouse/issues/33502). [#39283](https://github.com/ClickHouse/ClickHouse/pull/39283) ([Kseniia Sumarokova](https://github.com/kssenii)). + +#### NO CL CATEGORY + +* Backported in [#38685](https://github.com/ClickHouse/ClickHouse/issues/38685):. [#38449](https://github.com/ClickHouse/ClickHouse/pull/38449) ([Maksim Kita](https://github.com/kitaisreal)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Use native Map type for OpenTelemetry attributes [#38814](https://github.com/ClickHouse/ClickHouse/pull/38814) ([Ilya Yatsishin](https://github.com/qoega)). +* Retry docker buildx commands with progressive sleep in between [#38898](https://github.com/ClickHouse/ClickHouse/pull/38898) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add docker_server.py running to backport and release CIs [#39011](https://github.com/ClickHouse/ClickHouse/pull/39011) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix meilisearch tests [#39110](https://github.com/ClickHouse/ClickHouse/pull/39110) ([Kseniia Sumarokova](https://github.com/kssenii)). 
+ diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index e4c7aae8b25..777fd424321 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,7 +1,9 @@ v22.7.1.2484-stable 2022-07-21 +v22.6.4.35-stable 2022-07-25 v22.6.3.35-stable 2022-07-06 v22.6.2.12-stable 2022-06-29 v22.6.1.1985-stable 2022-06-16 +v22.5.3.21-stable 2022-07-25 v22.5.2.53-stable 2022-07-07 v22.5.1.2079-stable 2022-05-19 v22.4.6.53-stable 2022-07-07 @@ -9,6 +11,7 @@ v22.4.5.9-stable 2022-05-06 v22.4.4.7-stable 2022-04-29 v22.4.3.3-stable 2022-04-26 v22.4.2.1-stable 2022-04-22 +v22.3.9.19-lts 2022-07-25 v22.3.8.39-lts 2022-07-07 v22.3.7.28-lts 2022-06-20 v22.3.6.5-lts 2022-05-06 From 9abbb35cda0709408e9c3bd70a4b80caa3afee44 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 25 Jul 2022 23:40:23 +0300 Subject: [PATCH 172/227] Fix integration test --- src/IO/ReadWriteBufferFromHTTP.h | 55 +++++++++++++++----------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index bbd1f92f0ad..d5abc4609ed 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -515,44 +515,39 @@ namespace detail for (size_t i = 0; i < settings.http_max_tries; ++i) { + exception = nullptr; + try { if (!impl) { initialize(); - switch (initialization_error) - { - case InitializeError::NON_RETRIABLE_ERROR: - { - assert(exception); - break; - } - case InitializeError::SKIP_NOT_FOUND_URL: - { - return false; - } - case InitializeError::RETRIABLE_ERROR: - { - LOG_ERROR( - log, - "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. " - "(Current backoff wait is {}/{} ms)", - uri.toString(), i + 1, settings.http_max_tries, getOffset(), - read_range.end ? toString(*read_range.end) : "unknown", - milliseconds_to_wait, settings.http_retry_max_backoff_ms); - assert(exception); - on_retriable_error(); - continue; - } - case InitializeError::NONE: - { - break; - } + if (initialization_error == InitializeError::NON_RETRIABLE_ERROR) + { + assert(exception); + break; + } + else if (initialization_error == InitializeError::SKIP_NOT_FOUND_URL) + { + return false; + } + else if (initialization_error == InitializeError::RETRIABLE_ERROR) + { + LOG_ERROR( + log, + "HTTP request to `{}` failed at try {}/{} with bytes read: {}/{}. " + "(Current backoff wait is {}/{} ms)", + uri.toString(), i + 1, settings.http_max_tries, getOffset(), + read_range.end ? toString(*read_range.end) : "unknown", + milliseconds_to_wait, settings.http_retry_max_backoff_ms); + + assert(exception); + on_retriable_error(); + continue; } - if (exception) - break; + assert(!exception); if (use_external_buffer) { From f32d9c5539de0fe89bdf50d881b857741b1ebf25 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotaryov <108669454+quickhouse@users.noreply.github.com> Date: Mon, 25 Jul 2022 23:53:53 +0300 Subject: [PATCH 173/227] Uppercase `ROWS`, `GROUPS`, `RANGE` in queries with windows. 
(#39410) --- src/Client/QueryFuzzer.cpp | 8 ++++---- src/Interpreters/ExpressionAnalyzer.cpp | 4 ++-- src/Interpreters/WindowDescription.cpp | 6 +++--- src/Interpreters/WindowDescription.h | 4 ++-- src/Parsers/ASTWindowDefinition.h | 2 +- src/Parsers/ExpressionElementParsers.cpp | 6 +++--- src/Processors/Transforms/WindowTransform.cpp | 14 +++++++------- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp index 703e6de82c6..787fad5990a 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Client/QueryFuzzer.cpp @@ -329,9 +329,9 @@ void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def) case 0: { const auto r = fuzz_rand() % 3; - def.frame_type = r == 0 ? WindowFrame::FrameType::Rows - : r == 1 ? WindowFrame::FrameType::Range - : WindowFrame::FrameType::Groups; + def.frame_type = r == 0 ? WindowFrame::FrameType::ROWS + : r == 1 ? WindowFrame::FrameType::RANGE + : WindowFrame::FrameType::GROUPS; break; } case 1: @@ -385,7 +385,7 @@ void QueryFuzzer::fuzzWindowFrame(ASTWindowDefinition & def) break; } - if (def.frame_type == WindowFrame::FrameType::Range + if (def.frame_type == WindowFrame::FrameType::RANGE && def.frame_begin_type == WindowFrame::BoundaryType::Unbounded && def.frame_begin_preceding && def.frame_end_type == WindowFrame::BoundaryType::Current) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 8a14c09819a..f16922f9772 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -828,8 +828,8 @@ void ExpressionAnalyzer::makeWindowDescriptionFromAST(const Context & context_, desc.full_sort_description.insert(desc.full_sort_description.end(), desc.order_by.begin(), desc.order_by.end()); - if (definition.frame_type != WindowFrame::FrameType::Rows - && definition.frame_type != WindowFrame::FrameType::Range) + if (definition.frame_type != WindowFrame::FrameType::ROWS + && definition.frame_type != WindowFrame::FrameType::RANGE) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Window frame '{}' is not implemented (while processing '{}')", diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index 4661a148d70..335610b2be9 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -90,8 +90,8 @@ void WindowFrame::toString(WriteBuffer & buf) const void WindowFrame::checkValid() const { // Check the validity of offsets. - if (type == WindowFrame::FrameType::Rows - || type == WindowFrame::FrameType::Groups) + if (type == WindowFrame::FrameType::ROWS + || type == WindowFrame::FrameType::GROUPS) { if (begin_type == BoundaryType::Offset && !((begin_offset.getType() == Field::Types::UInt64 @@ -197,7 +197,7 @@ void WindowDescription::checkValid() const frame.checkValid(); // RANGE OFFSET requires exactly one ORDER BY column. 
- if (frame.type == WindowFrame::FrameType::Range + if (frame.type == WindowFrame::FrameType::RANGE && (frame.begin_type == WindowFrame::BoundaryType::Offset || frame.end_type == WindowFrame::BoundaryType::Offset) && order_by.size() != 1) diff --git a/src/Interpreters/WindowDescription.h b/src/Interpreters/WindowDescription.h index 65c8cb9423c..e7bc0473c26 100644 --- a/src/Interpreters/WindowDescription.h +++ b/src/Interpreters/WindowDescription.h @@ -28,7 +28,7 @@ struct WindowFunctionDescription struct WindowFrame { - enum class FrameType { Rows, Groups, Range }; + enum class FrameType { ROWS, GROUPS, RANGE }; enum class BoundaryType { Unbounded, Current, Offset }; // This flag signifies that the frame properties were not set explicitly by @@ -36,7 +36,7 @@ struct WindowFrame // for the default frame of RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW. bool is_default = true; - FrameType type = FrameType::Range; + FrameType type = FrameType::RANGE; // UNBOUNDED FOLLOWING for the frame end is forbidden by the standard, but for // uniformity the begin_preceding still has to be set to true for UNBOUNDED diff --git a/src/Parsers/ASTWindowDefinition.h b/src/Parsers/ASTWindowDefinition.h index c53f8ef856e..507825f11d2 100644 --- a/src/Parsers/ASTWindowDefinition.h +++ b/src/Parsers/ASTWindowDefinition.h @@ -17,7 +17,7 @@ struct ASTWindowDefinition : public IAST ASTPtr order_by; bool frame_is_default = true; - WindowFrame::FrameType frame_type = WindowFrame::FrameType::Range; + WindowFrame::FrameType frame_type = WindowFrame::FrameType::RANGE; WindowFrame::BoundaryType frame_begin_type = WindowFrame::BoundaryType::Unbounded; ASTPtr frame_begin_offset; bool frame_begin_preceding = true; diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index bd65305cc52..1de9adb834e 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -1198,15 +1198,15 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p node->frame_is_default = false; if (keyword_rows.ignore(pos, expected)) { - node->frame_type = WindowFrame::FrameType::Rows; + node->frame_type = WindowFrame::FrameType::ROWS; } else if (keyword_groups.ignore(pos, expected)) { - node->frame_type = WindowFrame::FrameType::Groups; + node->frame_type = WindowFrame::FrameType::GROUPS; } else if (keyword_range.ignore(pos, expected)) { - node->frame_type = WindowFrame::FrameType::Range; + node->frame_type = WindowFrame::FrameType::RANGE; } else { diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 5c833cf8f69..5e0d896599c 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -263,7 +263,7 @@ WindowTransform::WindowTransform(const Block & input_header_, // Choose a row comparison function for RANGE OFFSET frame based on the // type of the ORDER BY column. 
- if (window_description.frame.type == WindowFrame::FrameType::Range + if (window_description.frame.type == WindowFrame::FrameType::RANGE && (window_description.frame.begin_type == WindowFrame::BoundaryType::Offset || window_description.frame.end_type @@ -612,10 +612,10 @@ void WindowTransform::advanceFrameStart() case WindowFrame::BoundaryType::Offset: switch (window_description.frame.type) { - case WindowFrame::FrameType::Rows: + case WindowFrame::FrameType::ROWS: advanceFrameStartRowsOffset(); break; - case WindowFrame::FrameType::Range: + case WindowFrame::FrameType::RANGE: advanceFrameStartRangeOffset(); break; default: @@ -659,14 +659,14 @@ bool WindowTransform::arePeers(const RowNumber & x, const RowNumber & y) const return true; } - if (window_description.frame.type == WindowFrame::FrameType::Rows) + if (window_description.frame.type == WindowFrame::FrameType::ROWS) { // For ROWS frame, row is only peers with itself (checked above); return false; } // For RANGE and GROUPS frames, rows that compare equal w/ORDER BY are peers. - assert(window_description.frame.type == WindowFrame::FrameType::Range); + assert(window_description.frame.type == WindowFrame::FrameType::RANGE); const size_t n = order_by_indices.size(); if (n == 0) { @@ -844,10 +844,10 @@ void WindowTransform::advanceFrameEnd() case WindowFrame::BoundaryType::Offset: switch (window_description.frame.type) { - case WindowFrame::FrameType::Rows: + case WindowFrame::FrameType::ROWS: advanceFrameEndRowsOffset(); break; - case WindowFrame::FrameType::Range: + case WindowFrame::FrameType::RANGE: advanceFrameEndRangeOffset(); break; default: From c1834d183b4a6daff298d1531301d715fec94717 Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 26 Jul 2022 11:50:09 +0800 Subject: [PATCH 174/227] ReplicatedMergeTree support lightweight delete --- src/Interpreters/InterpreterDeleteQuery.cpp | 8 ++- src/Storages/StorageReplicatedMergeTree.cpp | 5 ++ src/Storages/StorageReplicatedMergeTree.h | 2 + ..._delete_on_replicated_merge_tree.reference | 30 +++++++++ ...weight_delete_on_replicated_merge_tree.sql | 64 +++++++++++++++++++ 5 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.reference create mode 100644 tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 47b0050e46c..497fae8f573 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -50,8 +51,9 @@ BlockIO InterpreterDeleteQuery::execute() /// First check table storage for validations. 
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); auto storage_merge_tree = std::dynamic_pointer_cast(table); - if (!storage_merge_tree) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree tables are supported"); + auto storage_replicated_merge_tree = std::dynamic_pointer_cast(table); + if (!storage_merge_tree && !storage_replicated_merge_tree) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree or ReplicatedMergeTree tables are supported"); checkStorageSupportsTransactionsIfNeeded(table, getContext()); if (table->isStaticStorage()) @@ -95,7 +97,7 @@ BlockIO InterpreterDeleteQuery::execute() table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); - storage_merge_tree->mutate(mutation_commands, getContext()); + table->mutate(mutation_commands, getContext()); return {}; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 219093e8d75..1ec14f643d4 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -6015,6 +6015,11 @@ CancellationCode StorageReplicatedMergeTree::killMutation(const String & mutatio return CancellationCode::CancelSent; } +bool StorageReplicatedMergeTree::hasLightweightDeletedMask() const +{ + return has_lightweight_delete_parts.load(std::memory_order_relaxed); +} + void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() { auto table_lock = lockForShare( diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 18b9ef54777..c35e2d5cf5c 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -152,6 +152,8 @@ public: std::vector getMutationsStatus() const override; CancellationCode killMutation(const String & mutation_id) override; + bool hasLightweightDeletedMask() const override; + /** Removes a replica from ZooKeeper. If there are no other replicas, it deletes the entire table from ZooKeeper. 
*/ void drop() override; diff --git a/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.reference b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.reference new file mode 100644 index 00000000000..0153257a80b --- /dev/null +++ b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.reference @@ -0,0 +1,30 @@ +99 +99 +95 +0 +0 +-----Check that select and merge with lightweight delete.----- +7 +0 0 0 +2 2 2 +3 3 3 +5 5 5 +7 7 7 +8 8 8 +9 9 9 +0 0 0 +2 2 2 +3 3 3 +5 5 5 +7 7 7 +8 8 8 +9 9 9 +7 +-----Check fetch part with lightweight delete----- +0 0 0 +2 2 2 +3 3 3 +5 5 5 +6 6 6 +8 8 8 +9 9 9 diff --git a/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql new file mode 100644 index 00000000000..3df9acbee87 --- /dev/null +++ b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql @@ -0,0 +1,64 @@ +DROP TABLE IF EXISTS replicated_table_r1 SYNC; +DROP TABLE IF EXISTS replicated_table_r2 SYNC; + +CREATE TABLE replicated_table_r1(id Int32, name String) ENGINE = ReplicatedMergeTree('/test/02352/{database}/t_rep','1') ORDER BY id; +CREATE TABLE replicated_table_r2(id Int32, name String) ENGINE = ReplicatedMergeTree('/test/02352/{database}/t_rep','2') ORDER BY id; + +INSERT INTO replicated_table_r1 select number, toString(number) FROM numbers(100); + +SET mutations_sync = 1; +SET allow_experimental_lightweight_delete = 1; + +DELETE FROM replicated_table_r1 WHERE id = 10; + +SELECT COUNT() FROM replicated_table_r1; +SELECT COUNT() FROM replicated_table_r2; + +DELETE FROM replicated_table_r2 WHERE name IN ('1','2','3','4'); + +SELECT COUNT() FROM replicated_table_r1; + +DELETE FROM replicated_table_r1 WHERE 1; + +SELECT COUNT() FROM replicated_table_r1; +SELECT COUNT() FROM replicated_table_r2; + +DROP TABLE IF EXISTS replicated_table_r1 SYNC; +DROP TABLE IF EXISTS replicated_table_r2 SYNC; + +DROP TABLE IF EXISTS t_light_r1 SYNC; +DROP TABLE IF EXISTS t_light_r2 SYNC; + +CREATE TABLE t_light_r1(a int, b int, c int, index i_c(b) TYPE minmax granularity 4) ENGINE = ReplicatedMergeTree('/test/02352/{database}/t_light','1') ORDER BY a PARTITION BY c % 5; +CREATE TABLE t_light_r2(a int, b int, c int, index i_c(b) TYPE minmax granularity 4) ENGINE = ReplicatedMergeTree('/test/02352/{database}/t_light','2') ORDER BY a PARTITION BY c % 5; + +INSERT INTO t_light_r1 SELECT number, number, number FROM numbers(10); + +DELETE FROM t_light_r1 WHERE c%5=1; +DELETE FROM t_light_r2 WHERE c=4; + +SELECT '-----Check that select and merge with lightweight delete.-----'; +SELECT count(*) FROM t_light_r1; +SELECT * FROM t_light_r1 ORDER BY a; +SELECT * FROM t_light_r2 ORDER BY a; + +OPTIMIZE TABLE t_light_r1 FINAL; +SELECT count(*) FROM t_light_r1; + +DROP TABLE IF EXISTS t_light_r1 SYNC; +DROP TABLE IF EXISTS t_light_r2 SYNC; + +CREATE TABLE t_light_sync_r1(a int, b int, c int, index i_c(b) TYPE minmax granularity 4) ENGINE = ReplicatedMergeTree('/test/02352/{database}/t_sync','1') ORDER BY a PARTITION BY c % 5 SETTINGS min_bytes_for_wide_part=0; + +INSERT INTO t_light_sync_r1 SELECT number, number, number FROM numbers(10); + +DELETE FROM t_light_sync_r1 WHERE c%3=1; + +SELECT '-----Check fetch part with lightweight delete-----'; +CREATE TABLE t_light_sync_r2(a int, b int, c int, index i_c(b) TYPE minmax granularity 4) ENGINE = ReplicatedMergeTree('/test/02352/{database}/t_sync','2') ORDER BY a PARTITION BY c % 
5 SETTINGS min_bytes_for_wide_part=0; +SYSTEM SYNC REPLICA t_light_sync_r2; + +SELECT * FROM t_light_sync_r2 ORDER BY a; + +DROP TABLE IF EXISTS t_light_sync_r1 SYNC; +DROP TABLE IF EXISTS t_light_sync_r2 SYNC; From 196b517e79e00b4d868dab379194ae3d1450623d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 22 Jul 2022 10:38:37 +0300 Subject: [PATCH 175/227] tests: add echo for 01601_custom_tld Signed-off-by: Azat Khuzhin --- .../0_stateless/01601_custom_tld.reference | 39 +++++++++++++++++++ .../queries/0_stateless/01601_custom_tld.sql | 2 + 2 files changed, 41 insertions(+) diff --git a/tests/queries/0_stateless/01601_custom_tld.reference b/tests/queries/0_stateless/01601_custom_tld.reference index ee326a77834..b4afe625dac 100644 --- a/tests/queries/0_stateless/01601_custom_tld.reference +++ b/tests/queries/0_stateless/01601_custom_tld.reference @@ -1,34 +1,73 @@ +-- { echo } + +select '-- no-tld'; -- no-tld +-- even if there is no TLD, 2-nd level by default anyway +-- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real) +select cutToFirstSignificantSubdomain('there-is-no-such-domain'); +select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain'); foo.there-is-no-such-domain +select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain'); foo.there-is-no-such-domain +select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list'); foo.there-is-no-such-domain +select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); foo.there-is-no-such-domain +select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); foo +select '-- generic'; -- generic +select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel kernel +select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss kernel.biz.ss +select '-- difference'; -- difference +-- biz.ss is not in the default TLD list, hence: +select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss biz.ss +select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss kernel.biz.ss +select '-- 3+level'; -- 3+level +select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at xx.blogspot.co.at +select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot blogspot +select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at xx.blogspot.co.at +select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot blogspot +select '-- url'; -- url +select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list'); foobar.com +select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list'); foobar.com +select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list'); foobar.com +select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list'); xx.blogspot.co.at +select '-- www'; -- www +select cutToFirstSignificantSubdomainCustomWithWWW('http://www.foo', 'public_suffix_list'); www.foo +select cutToFirstSignificantSubdomainCustom('http://www.foo', 'public_suffix_list'); foo +select '-- 
vector'; -- vector +select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at/' || toString(number), 'public_suffix_list') from numbers(1); xx.blogspot.co.at +select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain' || toString(number), 'public_suffix_list') from numbers(1); +select '-- no new line'; -- no new line +select cutToFirstSignificantSubdomainCustom('foo.bar', 'no_new_line_list'); foo.bar +select cutToFirstSignificantSubdomainCustom('a.foo.bar', 'no_new_line_list'); a.foo.bar +select cutToFirstSignificantSubdomainCustom('a.foo.baz', 'no_new_line_list'); foo.baz diff --git a/tests/queries/0_stateless/01601_custom_tld.sql b/tests/queries/0_stateless/01601_custom_tld.sql index 92ce28828f8..fd55e229fbc 100644 --- a/tests/queries/0_stateless/01601_custom_tld.sql +++ b/tests/queries/0_stateless/01601_custom_tld.sql @@ -1,3 +1,5 @@ +-- { echo } + select '-- no-tld'; -- even if there is no TLD, 2-nd level by default anyway -- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real) From 1d4a7c72903a1d698954919b0ebcb107bb49aae4 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 22 Jul 2022 10:36:50 +0300 Subject: [PATCH 176/227] Add support of !/* (exclamation/asterisk) in custom TLDs Public suffix list may contain special characters (you may find format here - [1]): - asterisk (*) - exclamation mark (!) [1]: https://github.com/publicsuffix/list/wiki/Format It is easier to describe how it should be interpreted with an examples. Consider the following part of the list: *.sch.uk *.kawasaki.jp !city.kawasaki.jp And here are the results for `cutToFirstSignificantSubdomainCustom()`: If you have only asterisk (*): foo.something.sheffield.sch.uk -> something.sheffield.sch.uk sheffield.sch.uk -> sheffield.sch.uk If you have exclamation mark (!) too: foo.kawasaki.jp -> foo.kawasaki.jp foo.foo.kawasaki.jp -> foo.foo.kawasaki.jp city.kawasaki.jp -> city.kawasaki.jp some.city.kawasaki.jp -> city.kawasaki.jp TLDs had been verified wit the following script [2], to match with python publicsuffix2 module. [2]: https://gist.github.com/azat/c1a7a9f1e3519793134ef4b1df5461a6 v2: fix StringHashTable padding requirements Fixes: #39468 Follow-up for: #17748 Signed-off-by: Azat Khuzhin --- src/Common/TLDListsHolder.cpp | 57 +++++++--- src/Common/TLDListsHolder.h | 31 ++++-- .../URL/ExtractFirstSignificantSubdomain.h | 104 +++++++++++------- .../URL/FirstSignificantSubdomainCustomImpl.h | 4 +- .../0_stateless/01601_custom_tld.reference | 18 +++ .../queries/0_stateless/01601_custom_tld.sql | 11 ++ 6 files changed, 159 insertions(+), 66 deletions(-) diff --git a/src/Common/TLDListsHolder.cpp b/src/Common/TLDListsHolder.cpp index a3019ac1c49..75e57d9b9d4 100644 --- a/src/Common/TLDListsHolder.cpp +++ b/src/Common/TLDListsHolder.cpp @@ -15,20 +15,31 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +constexpr size_t StringHashTablePadRequirement = 8; + /// TLDList TLDList::TLDList(size_t size) : tld_container(size) - , pool(std::make_unique(10 << 20)) -{} -bool TLDList::insert(StringRef host) + , memory_pool(std::make_unique()) { - bool inserted; - tld_container.emplace(DB::ArenaKeyHolder{host, *pool}, inserted); - return inserted; + /// StringHashTable requires padded to 8 bytes key, + /// and Arena (memory_pool here) does satisfies this, + /// since it has padding with 15 bytes at the right. 
+ /// + /// However, StringHashTable may reference -1 byte of the key, + /// so left padding is also required: + memory_pool->alignedAlloc(StringHashTablePadRequirement, StringHashTablePadRequirement); } -bool TLDList::has(StringRef host) const +void TLDList::insert(const String & host, TLDType type) { - return tld_container.has(host); + StringRef owned_host{memory_pool->insert(host.data(), host.size()), host.size()}; + tld_container[owned_host] = type; +} +TLDType TLDList::lookup(StringRef host) const +{ + if (auto it = tld_container.find(host); it != nullptr) + return it->getMapped(); + return TLDType::TLD_NONE; } /// TLDListsHolder @@ -57,32 +68,44 @@ void TLDListsHolder::parseConfig(const std::string & top_level_domains_path, con size_t TLDListsHolder::parseAndAddTldList(const std::string & name, const std::string & path) { - std::unordered_set tld_list_tmp; + std::unordered_map tld_list_tmp; ReadBufferFromFile in(path); - String line; + String buffer; while (!in.eof()) { - readEscapedStringUntilEOL(line, in); + readEscapedStringUntilEOL(buffer, in); if (!in.eof()) ++in.position(); + std::string_view line(buffer); /// Skip comments - if (line.size() > 2 && line[0] == '/' && line[1] == '/') + if (line.starts_with("//")) continue; - line = trim(line, [](char c) { return std::isspace(c); }); + line = line.substr(0, line.rend() - std::find_if_not(line.rbegin(), line.rend(), ::isspace)); /// Skip empty line if (line.empty()) continue; - tld_list_tmp.emplace(line); + /// Validate special symbols. + if (line.starts_with("*.")) + { + line = line.substr(2); + tld_list_tmp.emplace(line, TLDType::TLD_ANY); + } + else if (line[0] == '!') + { + line = line.substr(1); + tld_list_tmp.emplace(line, TLDType::TLD_EXCLUDE); + } + else + tld_list_tmp.emplace(line, TLDType::TLD_REGULAR); } if (!in.eof()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Not all list had been read", name); TLDList tld_list(tld_list_tmp.size()); - for (const auto & host : tld_list_tmp) + for (const auto & [host, type] : tld_list_tmp) { - StringRef host_ref{host.data(), host.size()}; - tld_list.insert(host_ref); + tld_list.insert(host, type); } size_t tld_list_size = tld_list.size(); diff --git a/src/Common/TLDListsHolder.h b/src/Common/TLDListsHolder.h index e8acefb1b5e..5ea8c5afe9f 100644 --- a/src/Common/TLDListsHolder.h +++ b/src/Common/TLDListsHolder.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -12,25 +12,35 @@ namespace DB { +enum TLDType +{ + /// Does not exist marker + TLD_NONE, + /// For regular lines + TLD_REGULAR, + /// For asterisk (*) + TLD_ANY, + /// For exclamation mark (!) + TLD_EXCLUDE, +}; + /// Custom TLD List /// -/// Unlike tldLookup (which uses gperf) this one uses plain StringHashSet. +/// Unlike tldLookup (which uses gperf) this one uses plain StringHashMap. class TLDList { public: - using Container = StringHashSet<>; + using Container = StringHashMap; explicit TLDList(size_t size); - /// Return true if the tld_container does not contains such element. - bool insert(StringRef host); - /// Check is there such TLD - bool has(StringRef host) const; + void insert(const String & host, TLDType type); + TLDType lookup(StringRef host) const; size_t size() const { return tld_container.size(); } private: Container tld_container; - std::unique_ptr pool; + std::unique_ptr memory_pool; }; class TLDListsHolder @@ -48,6 +58,11 @@ public: /// - "//" -- comment, /// - empty lines will be ignored. /// + /// Treats the following special symbols: + /// - "*" + /// - "!" 
+ /// + /// Format : https://github.com/publicsuffix/list/wiki/Format /// Example: https://publicsuffix.org/list/public_suffix_list.dat /// /// Return size of the list. diff --git a/src/Functions/URL/ExtractFirstSignificantSubdomain.h b/src/Functions/URL/ExtractFirstSignificantSubdomain.h index 70c9c25e4f3..73137da474f 100644 --- a/src/Functions/URL/ExtractFirstSignificantSubdomain.h +++ b/src/Functions/URL/ExtractFirstSignificantSubdomain.h @@ -3,15 +3,16 @@ #include #include "domain.h" #include "tldLookup.h" +#include /// TLDType namespace DB { struct FirstSignificantSubdomainDefaultLookup { - bool operator()(const char *src, size_t len) const + bool operator()(StringRef host) const { - return tldLookup::isValid(src, len); + return tldLookup::isValid(host.data, host.size); } }; @@ -51,44 +52,46 @@ struct ExtractFirstSignificantSubdomain const auto * begin = tmp; const auto * end = begin + domain_length; - const char * last_3_periods[3]{}; + std::array last_periods{}; const auto * pos = find_first_symbols<'.'>(begin, end); while (pos < end) { - last_3_periods[2] = last_3_periods[1]; - last_3_periods[1] = last_3_periods[0]; - last_3_periods[0] = pos; + last_periods[2] = last_periods[1]; + last_periods[1] = last_periods[0]; + last_periods[0] = pos; pos = find_first_symbols<'.'>(pos + 1, end); } - if (!last_3_periods[0]) + if (!last_periods[0]) return; - if (!last_3_periods[1]) + if (!last_periods[1]) { - res_size = last_3_periods[0] - begin; + res_size = last_periods[0] - begin; return; } - if (!last_3_periods[2]) - last_3_periods[2] = begin - 1; + if (!last_periods[2]) + last_periods[2] = begin - 1; - const auto * end_of_level_domain = find_first_symbols<'/'>(last_3_periods[0], end); + const auto * end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end); if (!end_of_level_domain) { end_of_level_domain = end; } - if (lookup(last_3_periods[1] + 1, end_of_level_domain - last_3_periods[1] - 1)) + size_t host_len = static_cast(end_of_level_domain - last_periods[1] - 1); + StringRef host{last_periods[1] + 1, host_len}; + if (lookup(host)) { - res_data += last_3_periods[2] + 1 - begin; - res_size = last_3_periods[1] - last_3_periods[2] - 1; + res_data += last_periods[2] + 1 - begin; + res_size = last_periods[1] - last_periods[2] - 1; } else { - res_data += last_3_periods[1] + 1 - begin; - res_size = last_3_periods[0] - last_3_periods[1] - 1; + res_data += last_periods[1] + 1 - begin; + res_size = last_periods[0] - last_periods[1] - 1; } } @@ -119,40 +122,63 @@ struct ExtractFirstSignificantSubdomain const auto * begin = tmp; const auto * end = begin + domain_length; - const char * last_2_periods[2]{}; - const char * prev = begin - 1; + std::array last_periods{}; + last_periods[0] = begin - 1; + StringRef excluded_host{}; const auto * pos = find_first_symbols<'.'>(begin, end); while (pos < end) { - if (lookup(pos + 1, end - pos - 1)) + size_t host_len = static_cast(end - pos - 1); + StringRef host{pos + 1, host_len}; + TLDType tld_type = lookup(host); + switch (tld_type) { - res_data += prev + 1 - begin; - res_size = end - 1 - prev; - return; + case TLDType::TLD_NONE: + break; + case TLDType::TLD_REGULAR: + res_data += last_periods[0] + 1 - begin; + res_size = end - 1 - last_periods[0]; + return; + case TLDType::TLD_ANY: + { + StringRef regular_host{last_periods[0] + 1, static_cast(end - 1 - last_periods[0])}; + if (last_periods[1] && excluded_host != regular_host) + { + /// Return TLD_REGULAR + 1 + res_data += last_periods[1] + 1 - begin; + res_size = end - 1 - last_periods[1]; + } + 
else + { + /// Same as TLD_REGULAR + res_data += last_periods[0] + 1 - begin; + res_size = end - 1 - last_periods[0]; + } + return; + } + case TLDType::TLD_EXCLUDE: + excluded_host = host; + break; } - last_2_periods[1] = last_2_periods[0]; - last_2_periods[0] = pos; - prev = pos; + last_periods[1] = last_periods[0]; + last_periods[0] = pos; pos = find_first_symbols<'.'>(pos + 1, end); } - /// if there is domain of the first level (i.e. no dots in the hostname) -> return nothing - if (!last_2_periods[0]) + /// - if there is domain of the first level (i.e. no dots in the hostname) -> + /// return nothing + if (last_periods[0] == begin - 1) return; - /// if there is domain of the second level -> always return itself - if (!last_2_periods[1]) - { - res_size = last_2_periods[0] - begin; - return; - } - - /// if there is domain of the 3+ level, and zero records in TLD list -> - /// fallback to domain of the second level - res_data += last_2_periods[1] + 1 - begin; - res_size = last_2_periods[0] - last_2_periods[1] - 1; + /// - if there is domain of the second level -> + /// always return itself + /// + /// - if there is domain of the 3+ level, and zero records in TLD list -> + /// fallback to domain of the second level + res_data += last_periods[1] + 1 - begin; + res_size = last_periods[0] - last_periods[1] - 1; } }; diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h index 5d78500c252..88aa2e72db9 100644 --- a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h +++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h @@ -25,9 +25,9 @@ struct FirstSignificantSubdomainCustomLookup { } - bool operator()(const char *pos, size_t len) const + TLDType operator()(StringRef host) const { - return tld_list.has(StringRef{pos, len}); + return tld_list.lookup(host); } }; diff --git a/tests/queries/0_stateless/01601_custom_tld.reference b/tests/queries/0_stateless/01601_custom_tld.reference index b4afe625dac..981067606a2 100644 --- a/tests/queries/0_stateless/01601_custom_tld.reference +++ b/tests/queries/0_stateless/01601_custom_tld.reference @@ -71,3 +71,21 @@ select cutToFirstSignificantSubdomainCustom('a.foo.bar', 'no_new_line_list'); a.foo.bar select cutToFirstSignificantSubdomainCustom('a.foo.baz', 'no_new_line_list'); foo.baz +select '-- asterisk'; +-- asterisk +select cutToFirstSignificantSubdomainCustom('foo.something.sheffield.sch.uk', 'public_suffix_list'); +something.sheffield.sch.uk +select cutToFirstSignificantSubdomainCustom('something.sheffield.sch.uk', 'public_suffix_list'); +something.sheffield.sch.uk +select cutToFirstSignificantSubdomainCustom('sheffield.sch.uk', 'public_suffix_list'); +sheffield.sch.uk +select '-- exclamation mark'; +-- exclamation mark +select cutToFirstSignificantSubdomainCustom('foo.kawasaki.jp', 'public_suffix_list'); +foo.kawasaki.jp +select cutToFirstSignificantSubdomainCustom('foo.foo.kawasaki.jp', 'public_suffix_list'); +foo.foo.kawasaki.jp +select cutToFirstSignificantSubdomainCustom('city.kawasaki.jp', 'public_suffix_list'); +city.kawasaki.jp +select cutToFirstSignificantSubdomainCustom('some.city.kawasaki.jp', 'public_suffix_list'); +city.kawasaki.jp diff --git a/tests/queries/0_stateless/01601_custom_tld.sql b/tests/queries/0_stateless/01601_custom_tld.sql index fd55e229fbc..69ae209af2c 100644 --- a/tests/queries/0_stateless/01601_custom_tld.sql +++ b/tests/queries/0_stateless/01601_custom_tld.sql @@ -44,3 +44,14 @@ select '-- no new line'; select 
cutToFirstSignificantSubdomainCustom('foo.bar', 'no_new_line_list'); select cutToFirstSignificantSubdomainCustom('a.foo.bar', 'no_new_line_list'); select cutToFirstSignificantSubdomainCustom('a.foo.baz', 'no_new_line_list'); + +select '-- asterisk'; +select cutToFirstSignificantSubdomainCustom('foo.something.sheffield.sch.uk', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('something.sheffield.sch.uk', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('sheffield.sch.uk', 'public_suffix_list'); + +select '-- exclamation mark'; +select cutToFirstSignificantSubdomainCustom('foo.kawasaki.jp', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('foo.foo.kawasaki.jp', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('city.kawasaki.jp', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('some.city.kawasaki.jp', 'public_suffix_list'); From bb292b6aeb4556de394a9d6e206f04e7ead88a9c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 26 Jul 2022 07:59:58 +0200 Subject: [PATCH 177/227] Fix wrong REGEXP_REPLACE alias --- tests/queries/0_stateless/02374_regexp_replace.reference | 1 + tests/queries/0_stateless/02374_regexp_replace.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/02374_regexp_replace.reference create mode 100644 tests/queries/0_stateless/02374_regexp_replace.sql diff --git a/tests/queries/0_stateless/02374_regexp_replace.reference b/tests/queries/0_stateless/02374_regexp_replace.reference new file mode 100644 index 00000000000..18915fea169 --- /dev/null +++ b/tests/queries/0_stateless/02374_regexp_replace.reference @@ -0,0 +1 @@ +https://www.clickhouse.com/ clickhouse.com diff --git a/tests/queries/0_stateless/02374_regexp_replace.sql b/tests/queries/0_stateless/02374_regexp_replace.sql new file mode 100644 index 00000000000..326adb7e618 --- /dev/null +++ b/tests/queries/0_stateless/02374_regexp_replace.sql @@ -0,0 +1 @@ +SELECT 'https://www.clickhouse.com/' AS s, REGEXP_REPLACE(s, '^https?://(?:www\.)?([^/]+)/.*$', '\1'); From a2522d6196969a71e5d47157be35f4f3a960bc1d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 26 Jul 2022 08:00:49 +0200 Subject: [PATCH 178/227] Revert "Fix wrong REGEXP_REPLACE alias" This reverts commit bb292b6aeb4556de394a9d6e206f04e7ead88a9c. 
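
Context for this revert: [PATCH 177/227] above only adds the test files and does not touch the
alias registration itself, which is presumably why it is reverted here and then re-applied
together with the actual alias change in [PATCH 179/227] and [PATCH 180/227] below. A rough
illustration of what the fix is about, based on the test added in this series (the behaviour of
the wrong alias is our reading of replaceAll, not something stated in the patches):

    -- With REGEXP_REPLACE aliased to replaceRegexpAll (the intended binding), the regular
    -- expression is applied and the host name is extracted:
    SELECT REGEXP_REPLACE('https://www.clickhouse.com/', '^https?://(?:www\.)?([^/]+)/.*$', '\1');
    -- expected result (see 02374_regexp_replace.reference): clickhouse.com
    --
    -- With the alias wrongly bound to replaceAll, the second argument would be treated as a
    -- literal substring; it never occurs in the input, so the URL would come back unchanged.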
--- tests/queries/0_stateless/02374_regexp_replace.reference | 1 - tests/queries/0_stateless/02374_regexp_replace.sql | 1 - 2 files changed, 2 deletions(-) delete mode 100644 tests/queries/0_stateless/02374_regexp_replace.reference delete mode 100644 tests/queries/0_stateless/02374_regexp_replace.sql diff --git a/tests/queries/0_stateless/02374_regexp_replace.reference b/tests/queries/0_stateless/02374_regexp_replace.reference deleted file mode 100644 index 18915fea169..00000000000 --- a/tests/queries/0_stateless/02374_regexp_replace.reference +++ /dev/null @@ -1 +0,0 @@ -https://www.clickhouse.com/ clickhouse.com diff --git a/tests/queries/0_stateless/02374_regexp_replace.sql b/tests/queries/0_stateless/02374_regexp_replace.sql deleted file mode 100644 index 326adb7e618..00000000000 --- a/tests/queries/0_stateless/02374_regexp_replace.sql +++ /dev/null @@ -1 +0,0 @@ -SELECT 'https://www.clickhouse.com/' AS s, REGEXP_REPLACE(s, '^https?://(?:www\.)?([^/]+)/.*$', '\1'); From bbb5e1306dae3533a87ce5d0eef00b1a6dadf60b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 26 Jul 2022 07:59:58 +0200 Subject: [PATCH 179/227] Fix wrong REGEXP_REPLACE alias --- tests/queries/0_stateless/02374_regexp_replace.reference | 1 + tests/queries/0_stateless/02374_regexp_replace.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/02374_regexp_replace.reference create mode 100644 tests/queries/0_stateless/02374_regexp_replace.sql diff --git a/tests/queries/0_stateless/02374_regexp_replace.reference b/tests/queries/0_stateless/02374_regexp_replace.reference new file mode 100644 index 00000000000..18915fea169 --- /dev/null +++ b/tests/queries/0_stateless/02374_regexp_replace.reference @@ -0,0 +1 @@ +https://www.clickhouse.com/ clickhouse.com diff --git a/tests/queries/0_stateless/02374_regexp_replace.sql b/tests/queries/0_stateless/02374_regexp_replace.sql new file mode 100644 index 00000000000..326adb7e618 --- /dev/null +++ b/tests/queries/0_stateless/02374_regexp_replace.sql @@ -0,0 +1 @@ +SELECT 'https://www.clickhouse.com/' AS s, REGEXP_REPLACE(s, '^https?://(?:www\.)?([^/]+)/.*$', '\1'); From 833b24b4868e7fa8e882d1638bf700d0cb409b49 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 26 Jul 2022 08:01:49 +0200 Subject: [PATCH 180/227] Fix the wrong REGEXP_REPLACE alias --- src/Functions/replaceAll.cpp | 1 - src/Functions/replaceRegexpAll.cpp | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/replaceAll.cpp b/src/Functions/replaceAll.cpp index 25a5b33c3a0..cc29e57ea69 100644 --- a/src/Functions/replaceAll.cpp +++ b/src/Functions/replaceAll.cpp @@ -21,7 +21,6 @@ void registerFunctionReplaceAll(FunctionFactory & factory) { factory.registerFunction(); factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive); - factory.registerAlias("REGEXP_REPLACE", NameReplaceAll::name, FunctionFactory::CaseInsensitive); } } diff --git a/src/Functions/replaceRegexpAll.cpp b/src/Functions/replaceRegexpAll.cpp index ad67efa82f4..07ffbdae792 100644 --- a/src/Functions/replaceRegexpAll.cpp +++ b/src/Functions/replaceRegexpAll.cpp @@ -20,6 +20,7 @@ using FunctionReplaceRegexpAll = FunctionStringReplace, void registerFunctionReplaceRegexpAll(FunctionFactory & factory) { factory.registerFunction(); + factory.registerAlias("REGEXP_REPLACE", NameReplaceRegexpAll::name, FunctionFactory::CaseInsensitive); } } From 4c98a7bc0f3fa9c2d65a262415a5ec303b87abdf Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 
08:46:51 +0200 Subject: [PATCH 181/227] Add test for Ordinary database. --- ...eplicated.xml => allow_database_types.xml} | 1 + .../test_backup_restore_on_cluster/test.py | 2 +- .../test_concurrency.py | 20 +++++++++++++------ 3 files changed, 16 insertions(+), 7 deletions(-) rename tests/integration/test_backup_restore_on_cluster/configs/{allow_experimental_database_replicated.xml => allow_database_types.xml} (68%) diff --git a/tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml b/tests/integration/test_backup_restore_on_cluster/configs/allow_database_types.xml similarity index 68% rename from tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml rename to tests/integration/test_backup_restore_on_cluster/configs/allow_database_types.xml index 0434df06457..e0e026210b1 100644 --- a/tests/integration/test_backup_restore_on_cluster/configs/allow_experimental_database_replicated.xml +++ b/tests/integration/test_backup_restore_on_cluster/configs/allow_database_types.xml @@ -2,6 +2,7 @@ 1 + 1
diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 58fac12f041..d1898213725 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -16,7 +16,7 @@ main_configs = [ ] user_configs = [ - "configs/allow_experimental_database_replicated.xml", + "configs/allow_database_types.xml", ] node1 = cluster.add_instance( diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index ee26f08f14e..8eaed5ac486 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -30,7 +30,7 @@ def generate_cluster_def(): main_configs = ["configs/backups_disk.xml", generate_cluster_def()] -user_configs = ["configs/allow_experimental_database_replicated.xml"] +user_configs = ["configs/allow_database_types.xml"] nodes = [] for i in range(num_nodes): @@ -63,6 +63,7 @@ def drop_after_test(): yield finally: node0.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' NO DELAY") + node0.query("DROP DATABASE IF EXISTS mydb ON CLUSTER 'cluster' NO DELAY") backup_id_counter = 0 @@ -172,10 +173,17 @@ def test_concurrent_backups_on_different_nodes(): assert nodes[j].query("SELECT sum(x) FROM tbl") == TSV([expected_sum]) -def test_create_or_drop_tables_during_backup(): - node0.query( - "CREATE DATABASE mydb ON CLUSTER 'cluster' ENGINE=Replicated('/clickhouse/path/','{shard}','{replica}')" - ) +@pytest.mark.parametrize( + "db_engine, table_engine", + [("Replicated", "ReplicatedMergeTree"), ("Ordinary", "MergeTree")], +) +def test_create_or_drop_tables_during_backup(db_engine, table_engine): + if db_engine == "Replicated": + db_engine = "Replicated('/clickhouse/path/','{shard}','{replica}')" + if table_engine.endswith("MergeTree"): + table_engine += " ORDER BY tuple()" + + node0.query(f"CREATE DATABASE mydb ON CLUSTER 'cluster' ENGINE={db_engine}") # Will do this test for 60 seconds start_time = time.time() @@ -186,7 +194,7 @@ def test_create_or_drop_tables_during_backup(): node = nodes[randint(0, num_nodes - 1)] table_name = f"mydb.tbl{randint(1, num_nodes)}" node.query( - f"CREATE TABLE IF NOT EXISTS {table_name}(x Int32) ENGINE=ReplicatedMergeTree ORDER BY x" + f"CREATE TABLE IF NOT EXISTS {table_name}(x Int32) ENGINE={table_engine}" ) node.query_and_get_answer_with_error( f"INSERT INTO {table_name} SELECT rand32() FROM numbers(10)" From 76599d123190701b9e73367ce71609c6bdcb043e Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 08:58:33 +0200 Subject: [PATCH 182/227] Finally fix locking storages for reading during backup. 
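
Judging from the one-line diff below, this is an RAII lifetime fix: the share lock was assigned
to a local TableLockHolder inside the loop, so it was released at the end of each iteration
instead of being kept in table_info for the whole duration of the backup. A minimal,
self-contained sketch of the pitfall with toy stand-ins (TableLock, TableInfo and the shared_ptr
holder are illustrative only, not the actual ClickHouse types):

    #include <iostream>
    #include <memory>

    struct TableLock { ~TableLock() { std::cout << "lock released\n"; } };
    using TableLockHolder = std::shared_ptr<TableLock>;
    struct TableInfo { TableLockHolder table_lock; };

    int main()
    {
        TableInfo table_info;
        {
            /// Before the fix: the holder is a local variable, so the lock is dropped
            /// as soon as this scope ends, long before the backup has finished reading.
            TableLockHolder table_lock = std::make_shared<TableLock>();
        }   /// "lock released" is printed here

        /// After the fix: the holder is stored in the long-lived table_info, so the lock
        /// survives until table_info itself is destroyed at the end of the backup.
        table_info.table_lock = std::make_shared<TableLock>();
        std::cout << "backup still holds its lock\n";
    }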
--- src/Backups/BackupEntriesCollector.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index 3cd9649de61..695b5a0fbb4 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -537,12 +537,11 @@ void BackupEntriesCollector::lockTablesForReading() for (auto & [table_name, table_info] : table_infos) { auto storage = table_info.storage; - TableLockHolder table_lock; if (storage) { try { - table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout); + table_info.table_lock = storage->lockForShare(context->getInitialQueryId(), context->getSettingsRef().lock_acquire_timeout); } catch (Exception & e) { From c0ec6fd9130e7bd397a6a11b38284c28a5f23e84 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 09:53:32 +0200 Subject: [PATCH 183/227] Use Poco::Event to simplify code. --- src/Backups/BackupCoordinationStageSync.cpp | 39 ++++----------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index 4b94e474345..5e0efbdd7b9 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -1,5 +1,4 @@ #include -#include #include #include #include @@ -130,27 +129,8 @@ Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const S auto zookeeper = get_zookeeper(); - struct Watch - { - std::mutex mutex; - std::condition_variable event; - bool zk_nodes_changed = false; - bool watch_set = false; - }; - - /// shared_ptr because `watch_callback` can be called by ZooKeeper after leaving this function's scope. - auto watch = std::make_shared(); - - /// Called by ZooKepper when list of zk nodes have changed. - auto watch_callback = [watch](const Coordination::WatchResponse &) - { - std::lock_guard lock{watch->mutex}; - watch->zk_nodes_changed = true; - watch->watch_set = false; /// When it's triggered ZooKeeper resets the watch so we need to call getChildrenWatch() again. - watch->event.notify_all(); - }; - - auto zk_nodes_changed = [watch] { return watch->zk_nodes_changed; }; + /// Set by ZooKepper when list of zk nodes have changed. + auto watch = std::make_shared(); bool use_timeout = timeout.has_value(); std::chrono::steady_clock::time_point end_of_timeout; @@ -164,12 +144,7 @@ Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const S for (;;) { /// Get zk nodes and subscribe on their changes. - { - std::lock_guard lock{watch->mutex}; - watch->watch_set = true; - watch->zk_nodes_changed = false; - } - Strings zk_nodes = zookeeper->getChildrenWatch(zookeeper_path, nullptr, watch_callback); + Strings zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch); /// Read and analyze the current state of zk nodes. state = readCurrentState(zookeeper, zk_nodes, all_hosts, stage_to_wait); @@ -186,19 +161,17 @@ Strings BackupCoordinationStageSync::waitImpl(const Strings & all_hosts, const S /// Wait until `watch_callback` is called by ZooKeeper meaning that zk nodes have changed. 
{ - std::unique_lock lock{watch->mutex}; if (use_timeout) { auto current_time = std::chrono::steady_clock::now(); - if ((current_time > end_of_timeout) || !watch->event.wait_for(lock, end_of_timeout - current_time, zk_nodes_changed)) + if ((current_time > end_of_timeout) + || !watch->tryWait(std::chrono::duration_cast(end_of_timeout - current_time).count())) break; } else { - watch->event.wait(lock, zk_nodes_changed); + watch->wait(); } - assert(watch->zk_nodes_changed); - assert(!watch->watch_set); } } From 162be5acbeff28d5947f717bc5fd9a9e9729b63a Mon Sep 17 00:00:00 2001 From: jianmei zhang Date: Tue, 26 Jul 2022 16:10:00 +0800 Subject: [PATCH 184/227] set mutations_sync to 2 to wait for mutations to complete on all replicas --- .../02352_lightweight_delete_on_replicated_merge_tree.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql index 3df9acbee87..417dfeea094 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql @@ -6,7 +6,7 @@ CREATE TABLE replicated_table_r2(id Int32, name String) ENGINE = ReplicatedMerge INSERT INTO replicated_table_r1 select number, toString(number) FROM numbers(100); -SET mutations_sync = 1; +SET mutations_sync = 2; SET allow_experimental_lightweight_delete = 1; DELETE FROM replicated_table_r1 WHERE id = 10; From 6e3a4b0a3d49085946d1d5b225380c191769d634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 25 Jul 2022 18:12:41 +0200 Subject: [PATCH 185/227] Add result_rows and result_bytes to progress reports and summary --- src/IO/Progress.cpp | 22 +++++++++++++++++++ src/IO/Progress.h | 20 ++++++++++++++++- src/Interpreters/executeQuery.cpp | 13 ++++++----- ...copatch_progress_in_http_headers.reference | 12 +++++----- ...h_format_jsoneachrowwithprogress.reference | 6 ++--- ...0_live_view_watch_events_http_heartbeat.py | 6 ++--- .../02136_scalar_progress.reference | 12 +++++----- 7 files changed, 67 insertions(+), 24 deletions(-) diff --git a/src/IO/Progress.cpp b/src/IO/Progress.cpp index 45710db5c92..eb6eb7fe573 100644 --- a/src/IO/Progress.cpp +++ b/src/IO/Progress.cpp @@ -61,6 +61,10 @@ void ProgressValues::writeJSON(WriteBuffer & out) const writeText(this->written_bytes, out); writeCString("\",\"total_rows_to_read\":\"", out); writeText(this->total_rows_to_read, out); + writeCString("\",\"result_rows\":\"", out); + writeText(this->result_rows, out); + writeCString("\",\"result_bytes\":\"", out); + writeText(this->result_bytes, out); writeCString("\"}", out); } @@ -75,6 +79,9 @@ bool Progress::incrementPiecewiseAtomically(const Progress & rhs) written_rows += rhs.written_rows; written_bytes += rhs.written_bytes; + result_rows += rhs.result_rows; + result_bytes += rhs.result_bytes; + return rhs.read_rows || rhs.written_rows; } @@ -88,6 +95,9 @@ void Progress::reset() written_rows = 0; written_bytes = 0; + + result_rows = 0; + result_bytes = 0; } ProgressValues Progress::getValues() const @@ -103,6 +113,9 @@ ProgressValues Progress::getValues() const res.written_rows = written_rows.load(std::memory_order_relaxed); res.written_bytes = written_bytes.load(std::memory_order_relaxed); + res.result_rows = result_rows.load(std::memory_order_relaxed); + res.result_bytes = result_bytes.load(std::memory_order_relaxed); + return res; } @@ -119,6 
+132,9 @@ ProgressValues Progress::fetchValuesAndResetPiecewiseAtomically() res.written_rows = written_rows.fetch_and(0); res.written_bytes = written_bytes.fetch_and(0); + res.result_rows = result_rows.fetch_and(0); + res.result_bytes = result_bytes.fetch_and(0); + return res; } @@ -135,6 +151,9 @@ Progress Progress::fetchAndResetPiecewiseAtomically() res.written_rows = written_rows.fetch_and(0); res.written_bytes = written_bytes.fetch_and(0); + res.result_rows = result_rows.fetch_and(0); + res.result_bytes = result_bytes.fetch_and(0); + return res; } @@ -149,6 +168,9 @@ Progress & Progress::operator=(Progress && other) noexcept written_rows = other.written_rows.load(std::memory_order_relaxed); written_bytes = other.written_bytes.load(std::memory_order_relaxed); + result_rows = other.result_rows.load(std::memory_order_relaxed); + result_bytes = other.result_bytes.load(std::memory_order_relaxed); + return *this; } diff --git a/src/IO/Progress.h b/src/IO/Progress.h index f04822f26bb..8340974b03d 100644 --- a/src/IO/Progress.h +++ b/src/IO/Progress.h @@ -25,6 +25,9 @@ struct ProgressValues size_t written_rows; size_t written_bytes; + size_t result_rows; + size_t result_bytes; + void read(ReadBuffer & in, UInt64 server_revision); void write(WriteBuffer & out, UInt64 client_revision) const; void writeJSON(WriteBuffer & out) const; @@ -49,6 +52,15 @@ struct WriteProgress : written_rows(written_rows_), written_bytes(written_bytes_) {} }; +struct ResultProgress +{ + size_t result_rows; + size_t result_bytes; + + ResultProgress(size_t result_rows_, size_t result_bytes_) + : result_rows(result_rows_), result_bytes(result_bytes_) {} +}; + struct FileProgress { /// Here read_bytes (raw bytes) - do not equal ReadProgress::read_bytes, which are calculated according to column types. 
@@ -77,6 +89,9 @@ struct Progress std::atomic written_rows {0}; std::atomic written_bytes {0}; + std::atomic result_rows {0}; + std::atomic result_bytes {0}; + Progress() = default; Progress(size_t read_rows_, size_t read_bytes_, size_t total_rows_to_read_ = 0) @@ -86,7 +101,10 @@ struct Progress : read_rows(read_progress.read_rows), read_bytes(read_progress.read_bytes), total_rows_to_read(read_progress.total_rows_to_read) {} explicit Progress(WriteProgress write_progress) - : written_rows(write_progress.written_rows), written_bytes(write_progress.written_bytes) {} + : written_rows(write_progress.written_rows), written_bytes(write_progress.written_bytes) {} + + explicit Progress(ResultProgress result_progress) + : result_rows(result_progress.result_rows), result_bytes(result_progress.result_bytes) {} explicit Progress(FileProgress file_progress) : read_bytes(file_progress.read_bytes), total_bytes_to_read(file_progress.total_bytes_to_read) {} diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index ae915aab867..0cfc0a88356 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -863,11 +863,6 @@ static std::tuple executeQueryImpl( elem.event_time_microseconds = time_in_microseconds(finish_time); status_info_to_query_log(elem, info, ast, context); - auto progress_callback = context->getProgressCallback(); - - if (progress_callback) - progress_callback(Progress(WriteProgress(info.written_rows, info.written_bytes))); - if (pulling_pipeline) { query_pipeline.tryGetResultRowsAndBytes(elem.result_rows, elem.result_bytes); @@ -879,6 +874,14 @@ static std::tuple executeQueryImpl( elem.result_bytes = progress_out.written_bytes; } + auto progress_callback = context->getProgressCallback(); + if (progress_callback) + { + Progress p(WriteProgress{info.written_rows, info.written_bytes}); + p.incrementPiecewiseAtomically(Progress{ResultProgress{elem.result_rows, elem.result_bytes}}); + progress_callback(p); + } + if (elem.read_rows != 0) { LOG_INFO(&Poco::Logger::get("executeQuery"), "Read {} rows, {} in {} sec., {} rows/sec., {}/sec.", diff --git a/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.reference b/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.reference index 07c736f3bb0..c6db9cc1614 100644 --- a/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.reference +++ b/tests/queries/0_stateless/00416_pocopatch_progress_in_http_headers.reference @@ -1,9 +1,9 @@ -< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"10"} -< X-ClickHouse-Progress: {"read_rows":"5","read_bytes":"40","written_rows":"0","written_bytes":"0","total_rows_to_read":"10"} -< X-ClickHouse-Progress: {"read_rows":"10","read_bytes":"80","written_rows":"0","written_bytes":"0","total_rows_to_read":"10"} +< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"10","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"5","read_bytes":"40","written_rows":"0","written_bytes":"0","total_rows_to_read":"10","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"10","read_bytes":"80","written_rows":"0","written_bytes":"0","total_rows_to_read":"10","result_rows":"0","result_bytes":"0"} 9 -< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"10"} -< X-ClickHouse-Progress: 
{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"10"} +< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"10","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"10","result_rows":"0","result_bytes":"0"} 0 1 2 @@ -24,4 +24,4 @@ 7 8 9 -< X-ClickHouse-Summary: {"read_rows":"10","read_bytes":"80","written_rows":"10","written_bytes":"40","total_rows_to_read":"0"} +< X-ClickHouse-Summary: {"read_rows":"10","read_bytes":"80","written_rows":"10","written_bytes":"40","total_rows_to_read":"0","result_rows":"10","result_bytes":"40"} diff --git a/tests/queries/0_stateless/00969_live_view_watch_format_jsoneachrowwithprogress.reference b/tests/queries/0_stateless/00969_live_view_watch_format_jsoneachrowwithprogress.reference index 287a1ced92d..80ec35990d6 100644 --- a/tests/queries/0_stateless/00969_live_view_watch_format_jsoneachrowwithprogress.reference +++ b/tests/queries/0_stateless/00969_live_view_watch_format_jsoneachrowwithprogress.reference @@ -1,6 +1,6 @@ {"row":{"sum(a)":"0","_version":"1"}} -{"progress":{"read_rows":"1","read_bytes":"16","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}} +{"progress":{"read_rows":"1","read_bytes":"16","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}} {"row":{"sum(a)":"6","_version":"2"}} -{"progress":{"read_rows":"1","read_bytes":"16","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}} +{"progress":{"read_rows":"1","read_bytes":"16","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}} {"row":{"sum(a)":"21","_version":"3"}} -{"progress":{"read_rows":"1","read_bytes":"16","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}} +{"progress":{"read_rows":"1","read_bytes":"16","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}} diff --git a/tests/queries/0_stateless/00970_live_view_watch_events_http_heartbeat.py b/tests/queries/0_stateless/00970_live_view_watch_events_http_heartbeat.py index 8c5126bbaf3..febe439e63b 100755 --- a/tests/queries/0_stateless/00970_live_view_watch_events_http_heartbeat.py +++ b/tests/queries/0_stateless/00970_live_view_watch_events_http_heartbeat.py @@ -38,17 +38,17 @@ with client(name="client1>", log=log) as client1: log=log, ) as client2: client2.expect( - '{"progress":{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}}\n', + '{"progress":{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}}\n', escape=True, ) client2.expect('{"row":{"version":"1"}', escape=True) client2.expect( - '{"progress":{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}}', + '{"progress":{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}}', escape=True, ) # heartbeat is provided by progress message client2.expect( - '{"progress":{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}}', + '{"progress":{"read_rows":"1","read_bytes":"8","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}}', escape=True, 
) diff --git a/tests/queries/0_stateless/02136_scalar_progress.reference b/tests/queries/0_stateless/02136_scalar_progress.reference index 21f6d3e0043..e9204f2d02e 100644 --- a/tests/queries/0_stateless/02136_scalar_progress.reference +++ b/tests/queries/0_stateless/02136_scalar_progress.reference @@ -1,6 +1,6 @@ -< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} -< X-ClickHouse-Progress: {"read_rows":"65505","read_bytes":"524040","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} -< X-ClickHouse-Progress: {"read_rows":"131010","read_bytes":"1048080","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} -< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} -< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} -< X-ClickHouse-Summary: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000"} +< X-ClickHouse-Progress: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"65505","read_bytes":"524040","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"131010","read_bytes":"1048080","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000","result_rows":"0","result_bytes":"0"} +< X-ClickHouse-Progress: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000","result_rows":"1","result_bytes":"80"} +< X-ClickHouse-Summary: {"read_rows":"131011","read_bytes":"1048081","written_rows":"0","written_bytes":"0","total_rows_to_read":"100000","result_rows":"1","result_bytes":"80"} From 06341e1f172d00c7518247243c984ad7a3d87257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 25 Jul 2022 18:20:10 +0200 Subject: [PATCH 186/227] Check result_rows and result_bytes in X-ClickHouse-Summary header --- .../0_stateless/02373_progress_contain_result.reference | 1 + .../queries/0_stateless/02373_progress_contain_result.sh | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 tests/queries/0_stateless/02373_progress_contain_result.reference create mode 100755 tests/queries/0_stateless/02373_progress_contain_result.sh diff --git a/tests/queries/0_stateless/02373_progress_contain_result.reference b/tests/queries/0_stateless/02373_progress_contain_result.reference new file mode 100644 index 00000000000..1e7492e2829 --- /dev/null +++ b/tests/queries/0_stateless/02373_progress_contain_result.reference @@ -0,0 +1 @@ +< X-ClickHouse-Summary: {"read_rows":"100","read_bytes":"800","written_rows":"0","written_bytes":"0","total_rows_to_read":"100","result_rows":"100","result_bytes":"131"} diff --git a/tests/queries/0_stateless/02373_progress_contain_result.sh b/tests/queries/0_stateless/02373_progress_contain_result.sh new file mode 100755 index 00000000000..1b257b699f5 --- /dev/null +++ b/tests/queries/0_stateless/02373_progress_contain_result.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + 
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +echo 'SELECT 1 FROM numbers(100)' | + ${CLICKHOUSE_CURL_COMMAND} -v "${CLICKHOUSE_URL}&wait_end_of_query=1&send_progress_in_http_headers=0" --data-binary @- 2>&1 | + grep 'X-ClickHouse-Summary' From 9908bff6675be6a7c81e85bb42ddf9b509cfdb37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 26 Jul 2022 10:39:47 +0200 Subject: [PATCH 187/227] Fix missing tests --- ...68_live_view_select_format_jsoneachrowwithprogress.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00968_live_view_select_format_jsoneachrowwithprogress.reference b/tests/queries/0_stateless/00968_live_view_select_format_jsoneachrowwithprogress.reference index 5ae423d90d1..5f48ead3147 100644 --- a/tests/queries/0_stateless/00968_live_view_select_format_jsoneachrowwithprogress.reference +++ b/tests/queries/0_stateless/00968_live_view_select_format_jsoneachrowwithprogress.reference @@ -1,4 +1,4 @@ {"row":{"a":1}} {"row":{"a":2}} {"row":{"a":3}} -{"progress":{"read_rows":"3","read_bytes":"36","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}} +{"progress":{"read_rows":"3","read_bytes":"36","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","result_rows":"0","result_bytes":"0"}} From d0183de34a6142a49087cae25ccd773efd958285 Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Tue, 26 Jul 2022 11:00:08 +0200 Subject: [PATCH 188/227] Update docs/en/operations/settings/settings.md Co-authored-by: Antonio Andelic --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 59ac34bd6f1..5d8ab5683c6 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -308,7 +308,7 @@ Possible values: - `default` — `hash` or `direct`, if possible (same as `direct,hash`) -- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section. +- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section. - `parallel_hash` - a variation of `hash` join that splits the data into buckets and builds several hashtables instead of one concurrently to speed up this process. From 0d65415086d0e31c7071d83a4d70f69e8f62026a Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Tue, 26 Jul 2022 11:00:12 +0200 Subject: [PATCH 189/227] Update docs/en/operations/settings/settings.md Co-authored-by: Antonio Andelic --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 5d8ab5683c6..94fc8512bc8 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -328,7 +328,7 @@ The `direct` algorithm performs a lookup in the right table using rows from the - `full_sorting_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting joined tables before joining. 
-- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise uses `hash`. *Deprecated*, same as `partial_merge,hash`. +- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`. ## join_any_take_last_row {#settings-join_any_take_last_row} From 24ab5fbb86e89d248089e42c5bc6225d9e90abed Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Tue, 26 Jul 2022 11:32:45 +0200 Subject: [PATCH 190/227] fix finish() condition to account only active threads --- src/Common/ConcurrencyControl.h | 5 ++++ src/Processors/Executors/ExecutorTasks.cpp | 30 ++++++++++++++++--- src/Processors/Executors/ExecutorTasks.h | 7 ++++- src/Processors/Executors/PipelineExecutor.cpp | 17 ++++++++--- 4 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index 9ea5efd53d0..6f37bb45c84 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -96,6 +96,11 @@ public: return {}; // avoid unnecessary locking } + SlotCount grantedCount() const + { + return granted; + } + private: friend struct Slot; // for release() friend class ConcurrencyControl; // for grant(), free() and ctor diff --git a/src/Processors/Executors/ExecutorTasks.cpp b/src/Processors/Executors/ExecutorTasks.cpp index 824b4e962d2..3b5fdac3ee3 100644 --- a/src/Processors/Executors/ExecutorTasks.cpp +++ b/src/Processors/Executors/ExecutorTasks.cpp @@ -32,7 +32,7 @@ void ExecutorTasks::tryWakeUpAnyOtherThreadWithTasks(ExecutionThreadContext & se { if (!task_queue.empty() && !threads_queue.empty() && !finished) { - size_t next_thread = self.thread_number + 1 == num_threads ? 0 : (self.thread_number + 1); + size_t next_thread = (self.thread_number + 1) % use_threads; auto thread_to_wake = task_queue.getAnyThreadWithTasks(next_thread); if (threads_queue.has(thread_to_wake)) @@ -40,6 +40,9 @@ void ExecutorTasks::tryWakeUpAnyOtherThreadWithTasks(ExecutionThreadContext & se else thread_to_wake = threads_queue.popAny(); + if (thread_to_wake >= use_threads) + throw Exception("Non-empty queue without allocated thread", ErrorCodes::LOGICAL_ERROR); + lock.unlock(); executor_contexts[thread_to_wake]->wakeUp(); } @@ -50,6 +53,7 @@ void ExecutorTasks::tryGetTask(ExecutionThreadContext & context) { std::unique_lock lock(mutex); + /// Try get async task assigned to this thread or any other task from queue. if (auto * async_task = context.tryPopAsyncTask()) { context.setTask(async_task); @@ -58,13 +62,18 @@ void ExecutorTasks::tryGetTask(ExecutionThreadContext & context) else if (!task_queue.empty()) context.setTask(task_queue.pop(context.thread_number)); + /// Task found. if (context.hasTask()) { + /// We have to wake up at least one thread if there are pending tasks. + /// That thread will wake up other threads during its `tryGetTask()` call if any. tryWakeUpAnyOtherThreadWithTasks(context, lock); return; } - if (threads_queue.size() + 1 == num_threads && async_task_queue.empty() && num_waiting_async_tasks == 0) + /// This thread has no tasks to do and is going to wait. + /// Finish execution if this was the last active thread. + if (threads_queue.size() + 1 == use_threads && async_task_queue.empty() && num_waiting_async_tasks == 0) { lock.unlock(); finish(); @@ -88,6 +97,7 @@ void ExecutorTasks::tryGetTask(ExecutionThreadContext & context) } #endif + /// Enqueue thread into stack of waiting threads. 
threads_queue.push(context.thread_number); } @@ -124,13 +134,15 @@ void ExecutorTasks::pushTasks(Queue & queue, Queue & async_queue, ExecutionThrea queue.pop(); } + /// Wake up at least one thread that will wake up other threads if required tryWakeUpAnyOtherThreadWithTasks(context, lock); } } -void ExecutorTasks::init(size_t num_threads_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback) +void ExecutorTasks::init(size_t num_threads_, size_t use_threads_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback) { num_threads = num_threads_; + use_threads = use_threads_; threads_queue.init(num_threads); task_queue.init(num_threads); @@ -154,11 +166,21 @@ void ExecutorTasks::fill(Queue & queue) queue.pop(); ++next_thread; - if (next_thread >= num_threads) + + /// It is important to keep queues empty for threads that are not started yet. + /// Otherwise that thread can be selected by `tryWakeUpAnyOtherThreadWithTasks()`, leading to deadlock. + if (next_thread >= use_threads) next_thread = 0; } } +void ExecutorTasks::upscale(size_t use_threads_) +{ + std::lock_guard lock(mutex); + if (use_threads < use_threads_) + use_threads = use_threads_; +} + void ExecutorTasks::processAsyncTasks() { #if defined(OS_LINUX) diff --git a/src/Processors/Executors/ExecutorTasks.h b/src/Processors/Executors/ExecutorTasks.h index 668470e7b11..d35f8de94d1 100644 --- a/src/Processors/Executors/ExecutorTasks.h +++ b/src/Processors/Executors/ExecutorTasks.h @@ -32,8 +32,12 @@ class ExecutorTasks /// For single thread, will wait for async tasks only when task_queue is empty. PollingQueue async_task_queue; + /// Maximum amount of threads. Constant after initialization, based on `max_threads` setting. size_t num_threads = 0; + /// Started thread count (allocated by `ConcurrencyControl`). Can increase during execution up to `num_threads`. + size_t use_threads = 0; + /// This is the total number of waited async tasks which are not executed yet. /// sum(executor_contexts[i].async_tasks.size()) size_t num_waiting_async_tasks = 0; @@ -54,8 +58,9 @@ public: void tryGetTask(ExecutionThreadContext & context); void pushTasks(Queue & queue, Queue & async_queue, ExecutionThreadContext & context); - void init(size_t num_threads_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback); + void init(size_t num_threads_, size_t use_threads_, bool profile_processors, bool trace_processors, ReadProgressCallback * callback); void fill(Queue & queue); + void upscale(size_t use_threads_); void processAsyncTasks(); diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 6ee2aa54658..ae20d97604b 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -210,7 +210,6 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie Stopwatch total_time_watch; #endif - // auto & node = tasks.getNode(thread_num); auto & context = tasks.getThreadContext(thread_num); bool yield = false; @@ -256,7 +255,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie context.processing_time_ns += processing_time_watch.elapsed(); #endif - // Upscale if possible + /// Upscale if possible. spawnThreads(); /// We have executed single processor. Check if we need to yield execution. 
@@ -275,13 +274,17 @@ void PipelineExecutor::initializeExecution(size_t num_threads) { is_execution_initialized = true; + /// Allocate CPU slots from concurrency control + constexpr size_t min_threads = 1; + slots = ConcurrencyControl::instance().allocate(min_threads, num_threads); + size_t use_threads = slots->grantedCount(); + Queue queue; graph->initializeExecution(queue); - tasks.init(num_threads, profile_processors, trace_processors, read_progress_callback.get()); + tasks.init(num_threads, use_threads, profile_processors, trace_processors, read_progress_callback.get()); tasks.fill(queue); - slots = ConcurrencyControl::instance().allocate(1, num_threads); std::unique_lock lock{threads_mutex}; threads.reserve(num_threads); } @@ -292,6 +295,12 @@ void PipelineExecutor::spawnThreads() { std::unique_lock lock{threads_mutex}; size_t thread_num = threads.size(); + + /// Count of threads in use should be updated for proper finish() condition. + /// NOTE: this will not decrease `use_threads` below initially granted count + tasks.upscale(thread_num + 1); + + /// Start new thread threads.emplace_back([this, thread_num, thread_group = CurrentThread::getGroup(), slot = std::move(slot)] { /// ThreadStatus thread_status; From 683a8866ef4798e1c0695ac13afb2b16b0de28c4 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 26 Jul 2022 09:36:39 +0000 Subject: [PATCH 191/227] Fix Chain::addSink --- src/QueryPipeline/Chain.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/QueryPipeline/Chain.cpp b/src/QueryPipeline/Chain.cpp index c33713dbe81..e5f2556a44f 100644 --- a/src/QueryPipeline/Chain.cpp +++ b/src/QueryPipeline/Chain.cpp @@ -96,7 +96,7 @@ void Chain::addSink(ProcessorPtr processor) if (!processors.empty()) connect(getOutputPort(), processor->getInputs().front()); - processors.emplace_front(std::move(processor)); + processors.emplace_back(std::move(processor)); } IProcessor & Chain::getSource() From f0cd564648c7dfdd772fbd7884e6ca86ef5a0ae5 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 11:17:08 +0200 Subject: [PATCH 192/227] Changes after review and added comments. --- src/Backups/BackupCoordinationRemote.cpp | 6 ++ src/Backups/BackupCoordinationStage.cpp | 13 ++++ src/Backups/BackupCoordinationStage.h | 41 +++++++++++ src/Backups/BackupCoordinationStageSync.cpp | 2 +- src/Backups/BackupEntriesCollector.cpp | 43 ++++------- src/Backups/BackupsWorker.cpp | 80 ++++++++++++++++----- src/Backups/BackupsWorker.h | 2 +- src/Backups/RestoreCoordinationRemote.cpp | 7 ++ src/Backups/RestorerFromBackup.cpp | 29 +++----- src/Interpreters/InterpreterBackupQuery.cpp | 13 ++-- 10 files changed, 160 insertions(+), 76 deletions(-) create mode 100644 src/Backups/BackupCoordinationStage.cpp create mode 100644 src/Backups/BackupCoordinationStage.h diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index bac99b0da2d..8ef2db5d6f1 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -204,6 +204,12 @@ void BackupCoordinationRemote::createRootNodes() void BackupCoordinationRemote::removeAllNodes() { + /// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore. + /// + /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query + /// while some hosts are still making the backup. 
Removing all the nodes will remove the parent node of the backup coordination + /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part + /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that). auto zookeeper = get_zookeeper(); zookeeper->removeRecursive(zookeeper_path); } diff --git a/src/Backups/BackupCoordinationStage.cpp b/src/Backups/BackupCoordinationStage.cpp new file mode 100644 index 00000000000..bb8abdf95c4 --- /dev/null +++ b/src/Backups/BackupCoordinationStage.cpp @@ -0,0 +1,13 @@ +#include +#include + + +namespace DB +{ + +String BackupCoordinationStage::formatGatheringMetadata(size_t pass) +{ + return fmt::format("{} ({})", GATHERING_METADATA, pass); +} + +} diff --git a/src/Backups/BackupCoordinationStage.h b/src/Backups/BackupCoordinationStage.h new file mode 100644 index 00000000000..091c1f11463 --- /dev/null +++ b/src/Backups/BackupCoordinationStage.h @@ -0,0 +1,41 @@ +#pragma once + +#include + + +namespace DB +{ + +namespace BackupCoordinationStage +{ + /// Finding all tables and databases which we're going to put to the backup and collecting their metadata. + constexpr const char * GATHERING_METADATA = "gathering metadata"; + + String formatGatheringMetadata(size_t pass); + + /// Making temporary hard links and prepare backup entries. + constexpr const char * EXTRACTING_DATA_FROM_TABLES = "extracting data from tables"; + + /// Running special tasks for replicated tables which can also prepare some backup entries. + constexpr const char * RUNNING_POST_TASKS = "running post-tasks"; + + /// Writing backup entries to the backup and removing temporary hard links. + constexpr const char * WRITING_BACKUP = "writing backup"; + + /// Finding databases and tables in the backup which we're going to restore. + constexpr const char * FINDING_TABLES_IN_BACKUP = "finding tables in backup"; + + /// Creating databases or finding them and checking their definitions. + constexpr const char * CREATING_DATABASES = "creating databases"; + + /// Creating tables or finding them and checking their definition. + constexpr const char * CREATING_TABLES = "creating tables"; + + /// Inserting restored data to tables. + constexpr const char * INSERTING_DATA_TO_TABLES = "inserting data to tables"; + + /// Coordination stage meaning that a host finished its work. 
+ constexpr const char * COMPLETED = "completed"; +} + +} diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index 5e0efbdd7b9..e4773223075 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -43,7 +43,7 @@ void BackupCoordinationStageSync::set(const String & current_host, const String throw zkutil::KeeperException(code, alive_node_path); zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, ""); - zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message); + zookeeper->create(zookeeper_path + "/current|" + current_host + "|" + new_stage, message, zkutil::CreateMode::Persistent); } void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception) diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index 695b5a0fbb4..22245f7056a 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -31,25 +32,11 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } + +namespace Stage = BackupCoordinationStage; + namespace { - /// Finding all tables and databases which we're going to put to the backup and collecting their metadata. - constexpr const char * kGatheringMetadataStage = "gathering metadata"; - - String formatGatheringMetadataStage(size_t pass) - { - return fmt::format("{} ({})", kGatheringMetadataStage, pass); - } - - /// Making temporary hard links and prepare backup entries. - constexpr const char * kExtractingDataFromTablesStage = "extracting data from tables"; - - /// Running special tasks for replicated tables which can also prepare some backup entries. - constexpr const char * kRunningPostTasksStage = "running post-tasks"; - - /// Writing backup entries to the backup and removing temporary hard links. - constexpr const char * kWritingBackupStage = "writing backup"; - /// Uppercases the first character of a passed string. String toUpperFirst(const String & str) { @@ -129,15 +116,15 @@ BackupEntries BackupEntriesCollector::run() makeBackupEntriesForTablesDefs(); /// Make backup entries for the data of the found tables. - setStage(kExtractingDataFromTablesStage); + setStage(Stage::EXTRACTING_DATA_FROM_TABLES); makeBackupEntriesForTablesData(); /// Run all the tasks added with addPostCollectingTask(). - setStage(kRunningPostTasksStage); + setStage(Stage::RUNNING_POST_TASKS); runPostTasks(); /// No more backup entries or tasks are allowed after this point. 
- setStage(kWritingBackupStage); + setStage(Stage::WRITING_BACKUP); return std::move(backup_entries); } @@ -149,11 +136,11 @@ Strings BackupEntriesCollector::setStage(const String & new_stage, const String backup_coordination->setStage(backup_settings.host_id, new_stage, message); - if (new_stage == formatGatheringMetadataStage(1)) + if (new_stage == Stage::formatGatheringMetadata(1)) { return backup_coordination->waitForStage(all_hosts, new_stage, on_cluster_first_sync_timeout); } - else if (new_stage.starts_with(kGatheringMetadataStage)) + else if (new_stage.starts_with(Stage::GATHERING_METADATA)) { auto current_time = std::chrono::steady_clock::now(); auto end_of_timeout = std::max(current_time, consistent_metadata_snapshot_end_time); @@ -183,13 +170,13 @@ void BackupEntriesCollector::calculateRootPathInBackup() /// Finds databases and tables which we will put to the backup. void BackupEntriesCollector::gatherMetadataAndCheckConsistency() { - setStage(formatGatheringMetadataStage(1)); + setStage(Stage::formatGatheringMetadata(1)); consistent_metadata_snapshot_end_time = std::chrono::steady_clock::now() + consistent_metadata_snapshot_timeout; for (size_t pass = 1;; ++pass) { - String next_stage = formatGatheringMetadataStage(pass + 1); + String next_stage = Stage::formatGatheringMetadata(pass + 1); std::optional inconsistency_error; if (tryGatherMetadataAndCompareWithPrevious(inconsistency_error)) { @@ -722,7 +709,7 @@ void BackupEntriesCollector::makeBackupEntriesForTableData(const QualifiedTableN void BackupEntriesCollector::addBackupEntry(const String & file_name, BackupEntryPtr backup_entry) { - if (current_stage == kWritingBackupStage) + if (current_stage == Stage::WRITING_BACKUP) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding backup entries is not allowed"); backup_entries.emplace_back(file_name, backup_entry); } @@ -734,21 +721,21 @@ void BackupEntriesCollector::addBackupEntry(const std::pair task) { - if (current_stage == kWritingBackupStage) + if (current_stage == Stage::WRITING_BACKUP) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of post tasks is not allowed"); post_tasks.push(std::move(task)); } diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index b0d3fb55f2a..cd505ed587c 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -24,11 +25,15 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace Stage = BackupCoordinationStage; + namespace { - /// Coordination status meaning that a host finished its work. - constexpr const char * kCompletedStage = "completed"; - std::shared_ptr makeBackupCoordination(const String & coordination_zk_path, const ContextPtr & context, bool is_internal_backup) { if (!coordination_zk_path.empty()) @@ -130,8 +135,14 @@ std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, con UUID backup_uuid = *backup_settings.backup_uuid; std::shared_ptr backup_coordination; - if (!backup_settings.coordination_zk_path.empty()) + + if (backup_settings.internal) + { + /// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination + /// if it's not created here. However to handle errors better it's better to make a coordination here because this way + /// if an exception will be thrown in startMakingBackup() other hosts will know about that. 
backup_coordination = makeBackupCoordination(backup_settings.coordination_zk_path, context, backup_settings.internal); + } try { @@ -161,12 +172,20 @@ std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, con backup_coordination, context_in_use, mutable_context, - true); + /* called_async= */ true); }); } else { - doBackup(backup_uuid, backup_query, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context, false); + doBackup( + backup_uuid, + backup_query, + backup_settings, + backup_info, + backup_coordination, + context_in_use, + mutable_context, + /* called_async= */ false); } return {backup_uuid, backup_settings.internal}; @@ -258,7 +277,7 @@ void BackupsWorker::doBackup( /// Wait until all the hosts have written their backup entries. auto all_hosts = BackupSettings::Util::filterHostIDs( backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num); - backup_coordination->waitForStage(all_hosts, kCompletedStage); + backup_coordination->waitForStage(all_hosts, Stage::COMPLETED); } else { @@ -275,7 +294,7 @@ void BackupsWorker::doBackup( writeBackupEntries(backup, std::move(backup_entries), backups_thread_pool); /// We have written our backup entries, we need to tell other hosts (they could be waiting for it). - backup_coordination->setStage(backup_settings.host_id, kCompletedStage, ""); + backup_coordination->setStage(backup_settings.host_id, Stage::COMPLETED, ""); } /// Finalize backup (write its metadata). @@ -313,8 +332,14 @@ std::pair BackupsWorker::startRestoring(const ASTPtr & query, Contex UUID restore_uuid = UUIDHelpers::generateV4(); std::shared_ptr restore_coordination; - if (!restore_settings.coordination_zk_path.empty()) + + if (restore_settings.internal) + { + /// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination + /// if it's not created here. However to handle errors better it's better to make a coordination here because this way + /// if an exception will be thrown in startRestoring() other hosts will know about that. restore_coordination = makeRestoreCoordination(restore_settings.coordination_zk_path, context, restore_settings.internal); + } try { @@ -334,12 +359,27 @@ std::pair BackupsWorker::startRestoring(const ASTPtr & query, Contex if (restore_settings.async) { backups_thread_pool.scheduleOrThrowOnError( - [this, restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use] - { doRestore(restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use, true); }); + [this, restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use] { + doRestore( + restore_uuid, + restore_query, + restore_settings, + backup_info, + restore_coordination, + context_in_use, + /* called_async= */ true); + }); } else { - doRestore(restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use, false); + doRestore( + restore_uuid, + restore_query, + restore_settings, + backup_info, + restore_coordination, + context_in_use, + /* called_async= */ false); } return {restore_uuid, restore_settings.internal}; @@ -438,7 +478,7 @@ void BackupsWorker::doRestore( /// Wait until all the hosts have written their backup entries. 
auto all_hosts = BackupSettings::Util::filterHostIDs( restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num); - restore_coordination->waitForStage(all_hosts, kCompletedStage); + restore_coordination->waitForStage(all_hosts, Stage::COMPLETED); } else { @@ -456,7 +496,7 @@ void BackupsWorker::doRestore( restoreTablesData(std::move(data_restore_tasks), restores_thread_pool); /// We have restored everything, we need to tell other hosts (they could be waiting for it). - restore_coordination->setStage(restore_settings.host_id, kCompletedStage, ""); + restore_coordination->setStage(restore_settings.host_id, Stage::COMPLETED, ""); } LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()); @@ -490,7 +530,9 @@ void BackupsWorker::addInfo(const UUID & uuid, bool internal, const String & bac info.internal = internal; std::lock_guard lock{infos_mutex}; - infos[{uuid, internal}] = std::move(info); + bool inserted = infos.try_emplace({uuid, internal}, std::move(info)).second; + if (!inserted) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Pair of UUID={} and internal={} is already in use", uuid, internal); num_active_backups += getNumActiveBackupsChange(status); num_active_restores += getNumActiveRestoresChange(status); @@ -502,7 +544,7 @@ void BackupsWorker::setStatus(const UUID & uuid, bool internal, BackupStatus sta std::lock_guard lock{infos_mutex}; auto it = infos.find({uuid, internal}); if (it == infos.end()) - return; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", uuid, internal); auto & info = it->second; auto old_status = info.status; @@ -520,7 +562,7 @@ void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool internal, boo { auto it = infos.find({backup_or_restore_uuid, internal}); if (it == infos.end()) - return true; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", backup_or_restore_uuid, internal); const auto & info = it->second; auto current_status = info.status; if (rethrow_exception && ((current_status == BackupStatus::FAILED_TO_BACKUP) || (current_status == BackupStatus::FAILED_TO_RESTORE))) @@ -529,12 +571,12 @@ void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool internal, boo }); } -std::optional BackupsWorker::tryGetInfo(const UUID & backup_or_restore_uuid, bool internal) const +BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid, bool internal) const { std::lock_guard lock{infos_mutex}; auto it = infos.find({backup_or_restore_uuid, internal}); if (it == infos.end()) - return std::nullopt; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", backup_or_restore_uuid, internal); return it->second; } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 7ae69271d26..7db62633412 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -54,7 +54,7 @@ public: bool internal = false; }; - std::optional tryGetInfo(const UUID & backup_or_restore_uuid, bool internal) const; + Info getInfo(const UUID & backup_or_restore_uuid, bool internal) const; std::vector getAllInfos() const; private: diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index fcc6a2a24b3..e1ec8313cb5 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -101,6 +101,13 @@ bool 
RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & ac void RestoreCoordinationRemote::removeAllNodes() { + /// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore. + /// + /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query + /// while some hosts are still restoring something. Removing all the nodes will remove the parent node of the restore coordination + /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part + /// of their restore work before that. + auto zookeeper = get_zookeeper(); zookeeper->removeRecursive(zookeeper_path); } diff --git a/src/Backups/RestorerFromBackup.cpp b/src/Backups/RestorerFromBackup.cpp index 3a2f5273611..185c23a479e 100644 --- a/src/Backups/RestorerFromBackup.cpp +++ b/src/Backups/RestorerFromBackup.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -38,20 +39,10 @@ namespace ErrorCodes } +namespace Stage = BackupCoordinationStage; + namespace { - /// Finding databases and tables in the backup which we're going to restore. - constexpr const char * kFindingTablesInBackupStage = "finding tables in backup"; - - /// Creating databases or finding them and checking their definitions. - constexpr const char * kCreatingDatabasesStage = "creating databases"; - - /// Creating tables or finding them and checking their definition. - constexpr const char * kCreatingTablesStage = "creating tables"; - - /// Inserting restored data to tables. - constexpr const char * kInsertingDataToTablesStage = "inserting data to tables"; - /// Uppercases the first character of a passed string. String toUpperFirst(const String & str) { @@ -127,7 +118,7 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode) findRootPathsInBackup(); /// Find all the databases and tables which we will read from the backup. - setStage(kFindingTablesInBackupStage); + setStage(Stage::FINDING_TABLES_IN_BACKUP); findDatabasesAndTablesInBackup(); /// Check access rights. @@ -137,16 +128,16 @@ RestorerFromBackup::DataRestoreTasks RestorerFromBackup::run(Mode mode) return {}; /// Create databases using the create queries read from the backup. - setStage(kCreatingDatabasesStage); + setStage(Stage::CREATING_DATABASES); createDatabases(); /// Create tables using the create queries read from the backup. - setStage(kCreatingTablesStage); + setStage(Stage::CREATING_TABLES); createTables(); /// All what's left is to insert data to tables. /// No more data restoring tasks are allowed after this point. 
- setStage(kInsertingDataToTablesStage); + setStage(Stage::INSERTING_DATA_TO_TABLES); return getDataRestoreTasks(); } @@ -158,7 +149,7 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa if (restore_coordination) { restore_coordination->setStage(restore_settings.host_id, new_stage, message); - if (new_stage == kFindingTablesInBackupStage) + if (new_stage == Stage::FINDING_TABLES_IN_BACKUP) restore_coordination->waitForStage(all_hosts, new_stage, on_cluster_first_sync_timeout); else restore_coordination->waitForStage(all_hosts, new_stage); @@ -819,14 +810,14 @@ std::vector RestorerFromBackup::findTablesWithoutDependencie void RestorerFromBackup::addDataRestoreTask(DataRestoreTask && new_task) { - if (current_stage == kInsertingDataToTablesStage) + if (current_stage == Stage::INSERTING_DATA_TO_TABLES) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed"); data_restore_tasks.push_back(std::move(new_task)); } void RestorerFromBackup::addDataRestoreTasks(DataRestoreTasks && new_tasks) { - if (current_stage == kInsertingDataToTablesStage) + if (current_stage == Stage::INSERTING_DATA_TO_TABLES) throw Exception(ErrorCodes::LOGICAL_ERROR, "Adding of data-restoring tasks is not allowed"); insertAtEnd(data_restore_tasks, std::move(new_tasks)); } diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index a2bb3acf073..8f2060d2d02 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -17,18 +17,15 @@ namespace DB namespace { - Block getResultRow(const std::optional & info) + Block getResultRow(const BackupsWorker::Info & info) { auto column_uuid = ColumnUUID::create(); auto column_backup_name = ColumnString::create(); auto column_status = ColumnInt8::create(); - if (info) - { - column_uuid->insert(info->uuid); - column_backup_name->insert(info->backup_name); - column_status->insert(static_cast(info->status)); - } + column_uuid->insert(info.uuid); + column_backup_name->insert(info.backup_name); + column_status->insert(static_cast(info.status)); Block res_columns; res_columns.insert(0, {std::move(column_uuid), std::make_shared(), "uuid"}); @@ -44,7 +41,7 @@ BlockIO InterpreterBackupQuery::execute() auto & backups_worker = context->getBackupsWorker(); auto [uuid, internal] = backups_worker.start(query_ptr, context); BlockIO res_io; - res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.tryGetInfo(uuid, internal)))); + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.getInfo(uuid, internal)))); return res_io; } From 1b1fa54db1546712491b19b1aa0985abbee979d4 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov <440544+davenger@users.noreply.github.com> Date: Tue, 26 Jul 2022 12:42:37 +0200 Subject: [PATCH 193/227] Added 'long' tag --- .../02352_lightweight_delete_on_replicated_merge_tree.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql index 417dfeea094..15e4d45581d 100644 --- a/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql +++ b/tests/queries/0_stateless/02352_lightweight_delete_on_replicated_merge_tree.sql @@ -1,3 +1,5 @@ +-- Tags: long + DROP TABLE IF EXISTS replicated_table_r1 SYNC; DROP TABLE IF EXISTS replicated_table_r2 SYNC; From 98cb7d2dfeaddae79cdea746369dc083b784ef02 Mon Sep 17 
00:00:00 2001 From: Alexander Tokmakov Date: Tue, 26 Jul 2022 14:59:07 +0300 Subject: [PATCH 194/227] Update 02319_sql_standard_create_drop_index.sql --- .../0_stateless/02319_sql_standard_create_drop_index.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql index bb01dcf2e64..581b170ee65 100644 --- a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql +++ b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.sql @@ -16,8 +16,8 @@ select table, name, type, expr, granularity from system.data_skipping_indices wh drop table t_index; -create table t_index(a int, b String) engine=ReplicatedMergeTree('/test/2319/{database}/', '1') order by a; -create table t_index_replica(a int, b String) engine=ReplicatedMergeTree('/test/2319/{database}/', '2') order by a; +create table t_index(a int, b String) engine=ReplicatedMergeTree('/test/2319/{database}', '1') order by a; +create table t_index_replica(a int, b String) engine=ReplicatedMergeTree('/test/2319/{database}', '2') order by a; create index i_a on t_index(a) TYPE minmax GRANULARITY 4; create index if not exists i_a on t_index(a) TYPE minmax GRANULARITY 2; From e302bb8f38442e4a1ab3519bd0ddf348286f26cc Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 26 Jul 2022 14:59:32 +0300 Subject: [PATCH 195/227] Update 02319_sql_standard_create_drop_index.reference --- .../02319_sql_standard_create_drop_index.reference | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference index a4a924fd229..bb0c387976a 100644 --- a/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference +++ b/tests/queries/0_stateless/02319_sql_standard_create_drop_index.reference @@ -2,8 +2,8 @@ CREATE TABLE default.t_index\n(\n `a` Int32,\n `b` String,\n INDEX i_a t_index i_a minmax a 4 t_index i_b bloom_filter b 2 t_index i_b bloom_filter b 2 -CREATE TABLE default.t_index\n(\n `a` Int32,\n `b` String,\n INDEX i_a a TYPE minmax GRANULARITY 4,\n INDEX i_b b TYPE bloom_filter GRANULARITY 2\n)\nENGINE = ReplicatedMergeTree(\'/test/2319/default/\', \'1\')\nORDER BY a\nSETTINGS index_granularity = 8192 -CREATE TABLE default.t_index_replica\n(\n `a` Int32,\n `b` String,\n INDEX i_a a TYPE minmax GRANULARITY 4,\n INDEX i_b b TYPE bloom_filter GRANULARITY 2\n)\nENGINE = ReplicatedMergeTree(\'/test/2319/default/\', \'2\')\nORDER BY a\nSETTINGS index_granularity = 8192 +CREATE TABLE default.t_index\n(\n `a` Int32,\n `b` String,\n INDEX i_a a TYPE minmax GRANULARITY 4,\n INDEX i_b b TYPE bloom_filter GRANULARITY 2\n)\nENGINE = ReplicatedMergeTree(\'/test/2319/default\', \'1\')\nORDER BY a\nSETTINGS index_granularity = 8192 +CREATE TABLE default.t_index_replica\n(\n `a` Int32,\n `b` String,\n INDEX i_a a TYPE minmax GRANULARITY 4,\n INDEX i_b b TYPE bloom_filter GRANULARITY 2\n)\nENGINE = ReplicatedMergeTree(\'/test/2319/default\', \'2\')\nORDER BY a\nSETTINGS index_granularity = 8192 t_index i_a minmax a 4 t_index i_b bloom_filter b 2 t_index i_b bloom_filter b 2 From 413024b4f44b42684fd5e706c9e441cec4b6ca32 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 13:15:34 +0200 Subject: [PATCH 196/227] Add call ZooKeeper::sync(). 
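The backup/restore coordination objects used to capture a single ZooKeeper session in
their constructors and reuse it for every operation. If that session expires, the client
may reconnect to a different [Zoo]Keeper instance and read slightly stale state. The
session is therefore now (re)created lazily on each access, and ZooKeeper::sync() is
called on the coordination path right after reconnecting, so later reads see the latest
committed state. As a rough sketch of the pattern (it mirrors the restore-side getter in
the diff below in simplified form; the backup side also adds a getZooKeeperNoLock()
variant for callers that already hold the mutex):

    zkutil::ZooKeeperPtr RestoreCoordinationRemote::getZooKeeper() const
    {
        std::lock_guard lock{mutex};
        if (!zookeeper || zookeeper->expired())
        {
            /// (Re)create the session lazily instead of keeping the one captured at construction time.
            zookeeper = get_zookeeper();

            /// We may have connected to a different [Zoo]Keeper instance,
            /// so make it catch up on our coordination subtree before reading from it.
            zookeeper->sync(zookeeper_path);
        }
        return zookeeper;
    }

With this change stage_sync becomes a std::optional that is emplaced in the constructor
after createRootNodes(), using a lambda that returns the cached session shown above.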
--- src/Backups/BackupCoordinationRemote.cpp | 158 ++++++++++++---------- src/Backups/BackupCoordinationRemote.h | 5 +- src/Backups/RestoreCoordinationRemote.cpp | 59 +++++--- src/Backups/RestoreCoordinationRemote.h | 6 +- 4 files changed, 136 insertions(+), 92 deletions(-) diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index 8ef2db5d6f1..8d8cfc4225e 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -170,9 +170,10 @@ BackupCoordinationRemote::BackupCoordinationRemote( : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) , remove_zk_nodes_in_destructor(remove_zk_nodes_in_destructor_) - , stage_sync(zookeeper_path_ + "/stage", get_zookeeper_, &Poco::Logger::get("BackupCoordination")) { createRootNodes(); + stage_sync.emplace( + zookeeper_path_ + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("BackupCoordination")); } BackupCoordinationRemote::~BackupCoordinationRemote() @@ -188,18 +189,37 @@ BackupCoordinationRemote::~BackupCoordinationRemote() } } +zkutil::ZooKeeperPtr BackupCoordinationRemote::getZooKeeper() const +{ + std::lock_guard lock{mutex}; + return getZooKeeperNoLock(); +} + +zkutil::ZooKeeperPtr BackupCoordinationRemote::getZooKeeperNoLock() const +{ + if (!zookeeper || zookeeper->expired()) + { + zookeeper = get_zookeeper(); + + /// It's possible that we connected to different [Zoo]Keeper instance + /// so we may read a bit stale state. + zookeeper->sync(zookeeper_path); + } + return zookeeper; +} + void BackupCoordinationRemote::createRootNodes() { - auto zookeeper = get_zookeeper(); - zookeeper->createAncestors(zookeeper_path); - zookeeper->createIfNotExists(zookeeper_path, ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_part_names", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_mutations", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_data_paths", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_access", ""); - zookeeper->createIfNotExists(zookeeper_path + "/file_names", ""); - zookeeper->createIfNotExists(zookeeper_path + "/file_infos", ""); - zookeeper->createIfNotExists(zookeeper_path + "/archive_suffixes", ""); + auto zk = getZooKeeper(); + zk->createAncestors(zookeeper_path); + zk->createIfNotExists(zookeeper_path, ""); + zk->createIfNotExists(zookeeper_path + "/repl_part_names", ""); + zk->createIfNotExists(zookeeper_path + "/repl_mutations", ""); + zk->createIfNotExists(zookeeper_path + "/repl_data_paths", ""); + zk->createIfNotExists(zookeeper_path + "/repl_access", ""); + zk->createIfNotExists(zookeeper_path + "/file_names", ""); + zk->createIfNotExists(zookeeper_path + "/file_infos", ""); + zk->createIfNotExists(zookeeper_path + "/archive_suffixes", ""); } void BackupCoordinationRemote::removeAllNodes() @@ -210,29 +230,29 @@ void BackupCoordinationRemote::removeAllNodes() /// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that). 
- auto zookeeper = get_zookeeper(); - zookeeper->removeRecursive(zookeeper_path); + auto zk = getZooKeeper(); + zk->removeRecursive(zookeeper_path); } void BackupCoordinationRemote::setStage(const String & current_host, const String & new_stage, const String & message) { - stage_sync.set(current_host, new_stage, message); + stage_sync->set(current_host, new_stage, message); } void BackupCoordinationRemote::setError(const String & current_host, const Exception & exception) { - stage_sync.setError(current_host, exception); + stage_sync->setError(current_host, exception); } Strings BackupCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait) { - return stage_sync.wait(all_hosts, stage_to_wait); + return stage_sync->wait(all_hosts, stage_to_wait); } Strings BackupCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) { - return stage_sync.waitFor(all_hosts, stage_to_wait, timeout); + return stage_sync->waitFor(all_hosts, stage_to_wait, timeout); } @@ -248,11 +268,11 @@ void BackupCoordinationRemote::addReplicatedPartNames( throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedPartNames() must not be called after preparing"); } - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_part_names/" + escapeForFileName(table_shared_id); - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(replica_name); - zookeeper->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent); + zk->create(path, ReplicatedPartNames::serialize(part_names_and_checksums, table_name_for_logs), zkutil::CreateMode::Persistent); } Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_shared_id, const String & replica_name) const @@ -274,11 +294,11 @@ void BackupCoordinationRemote::addReplicatedMutations( throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedMutations() must not be called after preparing"); } - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_mutations/" + escapeForFileName(table_shared_id); - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(replica_name); - zookeeper->create(path, ReplicatedMutations::serialize(mutations, table_name_for_logs), zkutil::CreateMode::Persistent); + zk->create(path, ReplicatedMutations::serialize(mutations, table_name_for_logs), zkutil::CreateMode::Persistent); } std::vector BackupCoordinationRemote::getReplicatedMutations(const String & table_shared_id, const String & replica_name) const @@ -298,11 +318,11 @@ void BackupCoordinationRemote::addReplicatedDataPath( throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedDataPath() must not be called after preparing"); } - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_data_paths/" + escapeForFileName(table_shared_id); - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(data_path); - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); } Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_shared_id) const @@ -319,18 +339,18 @@ void BackupCoordinationRemote::prepareReplicatedTables() const return; replicated_tables.emplace(); - auto zookeeper = get_zookeeper(); + auto zk = 
getZooKeeperNoLock(); { String path = zookeeper_path + "/repl_part_names"; - for (const String & escaped_table_shared_id : zookeeper->getChildren(path)) + for (const String & escaped_table_shared_id : zk->getChildren(path)) { String table_shared_id = unescapeForFileName(escaped_table_shared_id); String path2 = path + "/" + escaped_table_shared_id; - for (const String & escaped_replica_name : zookeeper->getChildren(path2)) + for (const String & escaped_replica_name : zk->getChildren(path2)) { String replica_name = unescapeForFileName(escaped_replica_name); - auto part_names = ReplicatedPartNames::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name)); + auto part_names = ReplicatedPartNames::deserialize(zk->get(path2 + "/" + escaped_replica_name)); replicated_tables->addPartNames(table_shared_id, part_names.table_name_for_logs, replica_name, part_names.part_names_and_checksums); } } @@ -338,14 +358,14 @@ void BackupCoordinationRemote::prepareReplicatedTables() const { String path = zookeeper_path + "/repl_mutations"; - for (const String & escaped_table_shared_id : zookeeper->getChildren(path)) + for (const String & escaped_table_shared_id : zk->getChildren(path)) { String table_shared_id = unescapeForFileName(escaped_table_shared_id); String path2 = path + "/" + escaped_table_shared_id; - for (const String & escaped_replica_name : zookeeper->getChildren(path2)) + for (const String & escaped_replica_name : zk->getChildren(path2)) { String replica_name = unescapeForFileName(escaped_replica_name); - auto mutations = ReplicatedMutations::deserialize(zookeeper->get(path2 + "/" + escaped_replica_name)); + auto mutations = ReplicatedMutations::deserialize(zk->get(path2 + "/" + escaped_replica_name)); replicated_tables->addMutations(table_shared_id, mutations.table_name_for_logs, replica_name, mutations.mutations); } } @@ -353,11 +373,11 @@ void BackupCoordinationRemote::prepareReplicatedTables() const { String path = zookeeper_path + "/repl_data_paths"; - for (const String & escaped_table_shared_id : zookeeper->getChildren(path)) + for (const String & escaped_table_shared_id : zk->getChildren(path)) { String table_shared_id = unescapeForFileName(escaped_table_shared_id); String path2 = path + "/" + escaped_table_shared_id; - for (const String & escaped_data_path : zookeeper->getChildren(path2)) + for (const String & escaped_data_path : zk->getChildren(path2)) { String data_path = unescapeForFileName(escaped_data_path); replicated_tables->addDataPath(table_shared_id, data_path); @@ -375,13 +395,13 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access throw Exception(ErrorCodes::LOGICAL_ERROR, "addReplicatedAccessFilePath() must not be called after preparing"); } - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_access/" + escapeForFileName(access_zk_path); - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); path += "/" + AccessEntityTypeInfo::get(access_entity_type).name; - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); path += "/" + host_id; - zookeeper->createIfNotExists(path, file_path); + zk->createIfNotExists(path, file_path); } Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type, const String & host_id) const @@ -397,20 +417,20 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const return; replicated_access.emplace(); - auto zookeeper = get_zookeeper(); + auto zk 
= getZooKeeperNoLock(); String path = zookeeper_path + "/repl_access"; - for (const String & escaped_access_zk_path : zookeeper->getChildren(path)) + for (const String & escaped_access_zk_path : zk->getChildren(path)) { String access_zk_path = unescapeForFileName(escaped_access_zk_path); String path2 = path + "/" + escaped_access_zk_path; - for (const String & type_str : zookeeper->getChildren(path2)) + for (const String & type_str : zk->getChildren(path2)) { AccessEntityType type = AccessEntityTypeInfo::parseType(type_str); String path3 = path2 + "/" + type_str; - for (const String & host_id : zookeeper->getChildren(path3)) + for (const String & host_id : zk->getChildren(path3)) { - String file_path = zookeeper->get(path3 + "/" + host_id); + String file_path = zk->get(path3 + "/" + host_id); replicated_access->addFilePath(access_zk_path, type, host_id, file_path); } } @@ -420,11 +440,11 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const void BackupCoordinationRemote::addFileInfo(const FileInfo & file_info, bool & is_data_file_required) { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String full_path = zookeeper_path + "/file_names/" + escapeForFileName(file_info.file_name); String size_and_checksum = serializeSizeAndChecksum(std::pair{file_info.size, file_info.checksum}); - zookeeper->create(full_path, size_and_checksum, zkutil::CreateMode::Persistent); + zk->create(full_path, size_and_checksum, zkutil::CreateMode::Persistent); if (!file_info.size) { @@ -433,7 +453,7 @@ void BackupCoordinationRemote::addFileInfo(const FileInfo & file_info, bool & is } full_path = zookeeper_path + "/file_infos/" + size_and_checksum; - auto code = zookeeper->tryCreate(full_path, serializeFileInfo(file_info), zkutil::CreateMode::Persistent); + auto code = zk->tryCreate(full_path, serializeFileInfo(file_info), zkutil::CreateMode::Persistent); if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) throw zkutil::KeeperException(code, full_path); @@ -445,15 +465,15 @@ void BackupCoordinationRemote::updateFileInfo(const FileInfo & file_info) if (!file_info.size) return; /// we don't keep FileInfos for empty files, nothing to update - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String size_and_checksum = serializeSizeAndChecksum(std::pair{file_info.size, file_info.checksum}); String full_path = zookeeper_path + "/file_infos/" + size_and_checksum; for (size_t attempt = 0; attempt < NUM_ATTEMPTS; ++attempt) { Coordination::Stat stat; - auto new_info = deserializeFileInfo(zookeeper->get(full_path, &stat)); + auto new_info = deserializeFileInfo(zk->get(full_path, &stat)); new_info.archive_suffix = file_info.archive_suffix; - auto code = zookeeper->trySet(full_path, serializeFileInfo(new_info), stat.version); + auto code = zk->trySet(full_path, serializeFileInfo(new_info), stat.version); if (code == Coordination::Error::ZOK) return; bool is_last_attempt = (attempt == NUM_ATTEMPTS - 1); @@ -464,16 +484,16 @@ void BackupCoordinationRemote::updateFileInfo(const FileInfo & file_info) std::vector BackupCoordinationRemote::getAllFileInfos() const { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); std::vector file_infos; - Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + Strings escaped_names = zk->getChildren(zookeeper_path + "/file_names"); for (const String & escaped_name : escaped_names) { - String size_and_checksum = zookeeper->get(zookeeper_path + "/file_names/" + escaped_name); + String 
size_and_checksum = zk->get(zookeeper_path + "/file_names/" + escaped_name); UInt64 size = deserializeSizeAndChecksum(size_and_checksum).first; FileInfo file_info; if (size) /// we don't keep FileInfos for empty files - file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + size_and_checksum)); + file_info = deserializeFileInfo(zk->get(zookeeper_path + "/file_infos/" + size_and_checksum)); file_info.file_name = unescapeForFileName(escaped_name); file_infos.emplace_back(std::move(file_info)); } @@ -482,8 +502,8 @@ std::vector BackupCoordinationRemote::getAllFileInfos() const Strings BackupCoordinationRemote::listFiles(const String & directory, bool recursive) const { - auto zookeeper = get_zookeeper(); - Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + auto zk = getZooKeeper(); + Strings escaped_names = zk->getChildren(zookeeper_path + "/file_names"); String prefix = directory; if (!prefix.empty() && !prefix.ends_with('/')) @@ -515,8 +535,8 @@ Strings BackupCoordinationRemote::listFiles(const String & directory, bool recur bool BackupCoordinationRemote::hasFiles(const String & directory) const { - auto zookeeper = get_zookeeper(); - Strings escaped_names = zookeeper->getChildren(zookeeper_path + "/file_names"); + auto zk = getZooKeeper(); + Strings escaped_names = zk->getChildren(zookeeper_path + "/file_names"); String prefix = directory; if (!prefix.empty() && !prefix.ends_with('/')) @@ -534,42 +554,42 @@ bool BackupCoordinationRemote::hasFiles(const String & directory) const std::optional BackupCoordinationRemote::getFileInfo(const String & file_name) const { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String size_and_checksum; - if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum)) + if (!zk->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum)) return std::nullopt; UInt64 size = deserializeSizeAndChecksum(size_and_checksum).first; FileInfo file_info; if (size) /// we don't keep FileInfos for empty files - file_info = deserializeFileInfo(zookeeper->get(zookeeper_path + "/file_infos/" + size_and_checksum)); + file_info = deserializeFileInfo(zk->get(zookeeper_path + "/file_infos/" + size_and_checksum)); file_info.file_name = file_name; return file_info; } std::optional BackupCoordinationRemote::getFileInfo(const SizeAndChecksum & size_and_checksum) const { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String file_info_str; - if (!zookeeper->tryGet(zookeeper_path + "/file_infos/" + serializeSizeAndChecksum(size_and_checksum), file_info_str)) + if (!zk->tryGet(zookeeper_path + "/file_infos/" + serializeSizeAndChecksum(size_and_checksum), file_info_str)) return std::nullopt; return deserializeFileInfo(file_info_str); } std::optional BackupCoordinationRemote::getFileSizeAndChecksum(const String & file_name) const { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String size_and_checksum; - if (!zookeeper->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum)) + if (!zk->tryGet(zookeeper_path + "/file_names/" + escapeForFileName(file_name), size_and_checksum)) return std::nullopt; return deserializeSizeAndChecksum(size_and_checksum); } String BackupCoordinationRemote::getNextArchiveSuffix() { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/archive_suffixes/a"; String path_created; - auto code = 
zookeeper->tryCreate(path, "", zkutil::CreateMode::PersistentSequential, path_created); + auto code = zk->tryCreate(path, "", zkutil::CreateMode::PersistentSequential, path_created); if (code != Coordination::Error::ZOK) throw zkutil::KeeperException(code, path); return formatArchiveSuffix(extractCounterFromSequentialNodeName(path_created)); @@ -577,8 +597,8 @@ String BackupCoordinationRemote::getNextArchiveSuffix() Strings BackupCoordinationRemote::getAllArchiveSuffixes() const { - auto zookeeper = get_zookeeper(); - Strings node_names = zookeeper->getChildren(zookeeper_path + "/archive_suffixes"); + auto zk = getZooKeeper(); + Strings node_names = zk->getChildren(zookeeper_path + "/archive_suffixes"); for (auto & node_name : node_names) node_name = formatArchiveSuffix(extractCounterFromSequentialNodeName(node_name)); return node_names; diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationRemote.h index d1d206683fa..83ddd7b16dc 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationRemote.h @@ -57,6 +57,8 @@ public: Strings getAllArchiveSuffixes() const override; private: + zkutil::ZooKeeperPtr getZooKeeper() const; + zkutil::ZooKeeperPtr getZooKeeperNoLock() const; void createRootNodes(); void removeAllNodes(); void prepareReplicatedTables() const; @@ -66,9 +68,10 @@ private: const zkutil::GetZooKeeper get_zookeeper; const bool remove_zk_nodes_in_destructor; - BackupCoordinationStageSync stage_sync; + std::optional stage_sync; mutable std::mutex mutex; + mutable zkutil::ZooKeeperPtr zookeeper; mutable std::optional replicated_tables; mutable std::optional replicated_access; }; diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp index e1ec8313cb5..89a9950aad2 100644 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ b/src/Backups/RestoreCoordinationRemote.cpp @@ -6,13 +6,16 @@ namespace DB { -RestoreCoordinationRemote::RestoreCoordinationRemote(const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_) +RestoreCoordinationRemote::RestoreCoordinationRemote( + const String & zookeeper_path_, zkutil::GetZooKeeper get_zookeeper_, bool remove_zk_nodes_in_destructor_) : zookeeper_path(zookeeper_path_) , get_zookeeper(get_zookeeper_) , remove_zk_nodes_in_destructor(remove_zk_nodes_in_destructor_) - , stage_sync(zookeeper_path_ + "/stage", get_zookeeper_, &Poco::Logger::get("RestoreCoordination")) { createRootNodes(); + + stage_sync.emplace( + zookeeper_path_ + "/stage", [this] { return getZooKeeper(); }, &Poco::Logger::get("RestoreCoordination")); } RestoreCoordinationRemote::~RestoreCoordinationRemote() @@ -28,47 +31,61 @@ RestoreCoordinationRemote::~RestoreCoordinationRemote() } } +zkutil::ZooKeeperPtr RestoreCoordinationRemote::getZooKeeper() const +{ + std::lock_guard lock{mutex}; + if (!zookeeper || zookeeper->expired()) + { + zookeeper = get_zookeeper(); + + /// It's possible that we connected to different [Zoo]Keeper instance + /// so we may read a bit stale state. 
+ zookeeper->sync(zookeeper_path); + } + return zookeeper; +} + void RestoreCoordinationRemote::createRootNodes() { - auto zookeeper = get_zookeeper(); - zookeeper->createAncestors(zookeeper_path); - zookeeper->createIfNotExists(zookeeper_path, ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", ""); - zookeeper->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", ""); + auto zk = getZooKeeper(); + zk->createAncestors(zookeeper_path); + zk->createIfNotExists(zookeeper_path, ""); + zk->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", ""); + zk->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", ""); + zk->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", ""); } void RestoreCoordinationRemote::setStage(const String & current_host, const String & new_stage, const String & message) { - stage_sync.set(current_host, new_stage, message); + stage_sync->set(current_host, new_stage, message); } void RestoreCoordinationRemote::setError(const String & current_host, const Exception & exception) { - stage_sync.setError(current_host, exception); + stage_sync->setError(current_host, exception); } Strings RestoreCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait) { - return stage_sync.wait(all_hosts, stage_to_wait); + return stage_sync->wait(all_hosts, stage_to_wait); } Strings RestoreCoordinationRemote::waitForStage(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) { - return stage_sync.waitFor(all_hosts, stage_to_wait, timeout); + return stage_sync->waitFor(all_hosts, stage_to_wait, timeout); } bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path); - zookeeper->createIfNotExists(path, ""); + zk->createIfNotExists(path, ""); path += "/" + escapeForFileName(table_name); - auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent); + auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent); if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) throw zkutil::KeeperException(code, path); @@ -77,10 +94,10 @@ bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const S bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path); - auto code = zookeeper->tryCreate(path, "", zkutil::CreateMode::Persistent); + auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent); if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) throw zkutil::KeeperException(code, path); @@ -89,10 +106,10 @@ bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const St bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path) { - auto zookeeper = get_zookeeper(); + auto zk = getZooKeeper(); String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path); - auto code = zookeeper->tryCreate(path, "", 
zkutil::CreateMode::Persistent); + auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent); if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) throw zkutil::KeeperException(code, path); @@ -108,8 +125,8 @@ void RestoreCoordinationRemote::removeAllNodes() /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part /// of their restore work before that. - auto zookeeper = get_zookeeper(); - zookeeper->removeRecursive(zookeeper_path); + auto zk = getZooKeeper(); + zk->removeRecursive(zookeeper_path); } } diff --git a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationRemote.h index 0cbbb6622ad..83760a2d883 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationRemote.h @@ -32,6 +32,7 @@ public: bool acquireReplicatedAccessStorage(const String & access_storage_zk_path) override; private: + zkutil::ZooKeeperPtr getZooKeeper() const; void createRootNodes(); void removeAllNodes(); @@ -41,7 +42,10 @@ private: const zkutil::GetZooKeeper get_zookeeper; const bool remove_zk_nodes_in_destructor; - BackupCoordinationStageSync stage_sync; + std::optional stage_sync; + + mutable std::mutex mutex; + mutable zkutil::ZooKeeperPtr zookeeper; }; } From 142f7d4b44b6634ec8885bbdd6b81c362ee7228b Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 20 Jul 2022 19:33:35 +0200 Subject: [PATCH 197/227] Require clear style check to continue building --- .github/workflows/pull_request.yml | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 958916b2d16..c2e816b44dc 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -254,7 +254,7 @@ jobs: #################################### ORDINARY BUILDS #################################### ######################################################################################### BuilderDebRelease: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -301,7 +301,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" BuilderBinRelease: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -346,7 +346,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" # BuilderBinGCC: - # needs: [DockerHubPush, FastTest] + # needs: [DockerHubPush, FastTest, StyleCheck] # runs-on: [self-hosted, builder] # steps: # - name: Set envs @@ -391,7 +391,7 @@ jobs: # docker rm -f $(docker ps -a -q) ||: # sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebAarch64: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -438,7 +438,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebAsan: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -483,7 +483,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebUBsan: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -528,7 +528,7 @@ jobs: docker rm -f $(docker 
ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebTsan: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -573,7 +573,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebMsan: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -618,7 +618,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebDebug: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -666,7 +666,7 @@ jobs: ##################################### SPECIAL BUILDS ##################################### ########################################################################################## BuilderDebSplitted: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -711,7 +711,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinClangTidy: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -756,7 +756,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinDarwin: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -801,7 +801,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinAarch64: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -846,7 +846,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinFreeBSD: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -891,7 +891,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinDarwinAarch64: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs @@ -936,7 +936,7 @@ jobs: docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinPPC64: - needs: [DockerHubPush, FastTest] + needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] steps: - name: Set envs From db0653758e612a93fca0398f0d6eb84100885300 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 21 Jul 2022 18:12:07 +0200 Subject: [PATCH 198/227] Add typing to rerun and commit_status helpers --- tests/ci/commit_status_helper.py | 14 ++++++--- tests/ci/rerun_helper.py | 50 +++++++++++++++++++------------- 2 files changed, 40 insertions(+), 24 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 420ca7a0ff7..83b6203c050 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -1,10 +1,14 @@ #!/usr/bin/env python3 -import time -import os import csv -from env_helper import GITHUB_REPOSITORY, GITHUB_RUN_URL +import os +import time +from typing import Optional + from ci_config import CI_CONFIG +from env_helper import GITHUB_REPOSITORY, GITHUB_RUN_URL +from github import Github +from github.Commit import Commit from pr_info import SKIP_SIMPLE_CHECK_LABEL RETRY = 5 @@ -22,7 +26,9 @@ def override_status(status, check_name, invert=False): return status -def get_commit(gh, commit_sha, retry_count=RETRY): +def get_commit( + gh: Github, commit_sha: str, retry_count: int = RETRY +) -> Optional[Commit]: for i in range(retry_count): try: repo = gh.get_repo(GITHUB_REPOSITORY) diff --git a/tests/ci/rerun_helper.py b/tests/ci/rerun_helper.py index 0d523640f56..c4ae70eadb9 100644 --- a/tests/ci/rerun_helper.py +++ b/tests/ci/rerun_helper.py @@ -1,33 +1,26 @@ #!/usr/bin/env python3 +from typing import List, Optional from commit_status_helper import get_commit +from github import Github +from github.CommitStatus import CommitStatus +from pr_info import PRInfo - -def _filter_statuses(statuses): - """ - Squash statuses to latest state - 1. context="first", state="success", update_time=1 - 2. context="second", state="success", update_time=2 - 3. context="first", stat="failure", update_time=3 - =========> - 1. context="second", state="success" - 2. context="first", stat="failure" - """ - filt = {} - for status in sorted(statuses, key=lambda x: x.updated_at): - filt[status.context] = status - return filt.values() +CommitStatuses = List[CommitStatus] class RerunHelper: - def __init__(self, gh, pr_info, check_name): + def __init__(self, gh: Github, pr_info: PRInfo, check_name: str): self.gh = gh self.pr_info = pr_info self.check_name = check_name - self.pygh_commit = get_commit(gh, self.pr_info.sha) - self.statuses = _filter_statuses(self.pygh_commit.get_statuses()) + commit = get_commit(gh, self.pr_info.sha) + if commit is None: + raise ValueError(f"unable to receive commit for {pr_info.sha}") + self.pygh_commit = commit + self.statuses = self.ger_filtered_statuses() - def is_already_finished_by_status(self): + def is_already_finished_by_status(self) -> bool: # currently we agree even for failed statuses for status in self.statuses: if self.check_name in status.context and status.state in ( @@ -37,8 +30,25 @@ class RerunHelper: return True return False - def get_finished_status(self): + def get_finished_status(self) -> Optional[CommitStatus]: for status in self.statuses: if self.check_name in status.context: return status return None + + def ger_filtered_statuses(self) -> CommitStatuses: + """ + Squash statuses to latest state + 1. context="first", state="success", update_time=1 + 2. context="second", state="success", update_time=2 + 3. context="first", stat="failure", update_time=3 + =========> + 1. context="second", state="success" + 2. 
context="first", stat="failure" + """ + filt = {} + for status in sorted( + self.pygh_commit.get_statuses(), key=lambda x: x.updated_at + ): + filt[status.context] = status + return list(filt.values()) From d9baa0b1f7f9440f59dd7609ac1fdf772636996b Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 21 Jul 2022 18:25:22 +0200 Subject: [PATCH 199/227] Finish following runs with the same exit code as the first --- tests/ci/style_check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 0d619d28526..7ba0dc2a297 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -153,7 +153,11 @@ if __name__ == "__main__": rerun_helper = RerunHelper(gh, pr_info, NAME) if rerun_helper.is_already_finished_by_status(): logging.info("Check is already finished according to github status, exiting") - sys.exit(0) + # Finish with the same code as previous + state = rerun_helper.get_finished_status().state # type: ignore + # state == "success" -> code = 0 + code = int(state != "success") + sys.exit(code) if not os.path.exists(temp_path): os.makedirs(temp_path) From b9cda3d19283a3a46fa6e2cac34967fb49812c77 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 24 Jul 2022 00:43:15 +0300 Subject: [PATCH 200/227] Update pull_request.yml --- .github/workflows/pull_request.yml | 81 ------------------------------ 1 file changed, 81 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index c2e816b44dc..26726302beb 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -345,51 +345,6 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" - # BuilderBinGCC: - # needs: [DockerHubPush, FastTest, StyleCheck] - # runs-on: [self-hosted, builder] - # steps: - # - name: Set envs - # run: | - # cat >> "$GITHUB_ENV" << 'EOF' - # TEMP_PATH=${{runner.temp}}/build_check - # IMAGES_PATH=${{runner.temp}}/images_path - # REPO_COPY=${{runner.temp}}/build_check/ClickHouse - # CACHES_PATH=${{runner.temp}}/../ccaches - # BUILD_NAME=binary_gcc - # EOF - # - name: Download changed images - # uses: actions/download-artifact@v2 - # with: - # name: changed_images - # path: ${{ runner.temp }}/images_path - # - name: Clear repository - # run: | - # sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - # - name: Check out repository code - # uses: actions/checkout@v2 - # - name: Build - # run: | - # git -C "$GITHUB_WORKSPACE" submodule sync --recursive - # git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 - # sudo rm -fr "$TEMP_PATH" - # mkdir -p "$TEMP_PATH" - # cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - # cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" - # - name: Upload build URLs to artifacts - # if: ${{ success() || failure() }} - # uses: actions/upload-artifact@v2 - # with: - # name: ${{ env.BUILD_URLS }} - # path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json - # - name: Cleanup - # if: always() - # run: | - # # shellcheck disable=SC2046 - # docker kill $(docker ps -q) ||: - # # shellcheck disable=SC2046 - # docker rm -f $(docker ps -a -q) ||: - # sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderDebAarch64: needs: [DockerHubPush, FastTest, StyleCheck] runs-on: [self-hosted, builder] @@ -2974,42 +2929,6 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" - # UnitTestsReleaseGCC: - # needs: 
[BuilderBinGCC] - # runs-on: [self-hosted, fuzzer-unit-tester] - # steps: - # - name: Set envs - # run: | - # cat >> "$GITHUB_ENV" << 'EOF' - # TEMP_PATH=${{runner.temp}}/unit_tests_asan - # REPORTS_PATH=${{runner.temp}}/reports_dir - # CHECK_NAME=Unit tests (release-gcc) - # REPO_COPY=${{runner.temp}}/unit_tests_asan/ClickHouse - # EOF - # - name: Download json reports - # uses: actions/download-artifact@v2 - # with: - # path: ${{ env.REPORTS_PATH }} - # - name: Clear repository - # run: | - # sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - # - name: Check out repository code - # uses: actions/checkout@v2 - # - name: Unit test - # run: | - # sudo rm -fr "$TEMP_PATH" - # mkdir -p "$TEMP_PATH" - # cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - # cd "$REPO_COPY/tests/ci" - # python3 unit_tests_check.py "$CHECK_NAME" - # - name: Cleanup - # if: always() - # run: | - # # shellcheck disable=SC2046 - # docker kill $(docker ps -q) ||: - # # shellcheck disable=SC2046 - # docker rm -f $(docker ps -a -q) ||: - # sudo rm -fr "$TEMP_PATH" UnitTestsTsan: needs: [BuilderDebTsan] runs-on: [self-hosted, fuzzer-unit-tester] From 39d6327fe1d2a1f3ae0f360cb74ea6fc380207a5 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Tue, 26 Jul 2022 11:10:16 -0400 Subject: [PATCH 201/227] Update docs/en/sql-reference/aggregate-functions/reference/any.md --- docs/en/sql-reference/aggregate-functions/reference/any.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/any.md b/docs/en/sql-reference/aggregate-functions/reference/any.md index 0707a2a4f4d..c0af7a2a59e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/any.md +++ b/docs/en/sql-reference/aggregate-functions/reference/any.md @@ -4,7 +4,7 @@ sidebar_position: 6 # any -Selects the first encountered (non-NULL) value. +Selects the first encountered (non-NULL) value, unless all rows have NULL values in that column. The query can be executed in any order and even in a different order each time, so the result of this function is indeterminate. To get a determinate result, you can use the ‘min’ or ‘max’ function instead of ‘any’. From 33ce662d3ebb0b393a45155299492d7520aa34dc Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 26 Jul 2022 15:22:00 +0000 Subject: [PATCH 202/227] Replace MemoryTrackerBlockerInThread to LockMemoryExceptionInThread in some places. Reduced MemoryTrackerBlockerInThread level to User. --- src/Common/SystemLogBase.cpp | 2 +- src/Interpreters/executeQuery.cpp | 3 ++- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 18 +----------------- src/Storages/StorageBuffer.cpp | 2 +- 4 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index 67aedbd5670..791f976d5e0 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -79,7 +79,7 @@ void SystemLogBase::add(const LogElement & element) /// The size of allocation can be in order of a few megabytes. /// But this should not be accounted for query memory usage. /// Otherwise the tests like 01017_uniqCombined_memory_usage.sql will be flacky. - MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); + MemoryTrackerBlockerInThread temporarily_disable_memory_tracker; /// Should not log messages under mutex. 
bool queue_is_half_full = false; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index ae915aab867..7c0a95e5fcc 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -180,7 +180,8 @@ static void setExceptionStackTrace(QueryLogElement & elem) { /// Disable memory tracker for stack trace. /// Because if exception is "Memory limit (for query) exceed", then we probably can't allocate another one string. - MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); + + LockMemoryExceptionInThread lock(VariableContext::Global); try { diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 60941108f00..aa7f16245a2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -603,22 +603,6 @@ String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize( return *minimum_size_column; } -// String IMergeTreeDataPart::getFullPath() const -// { -// if (relative_path.empty()) -// throw Exception("Part relative_path cannot be empty. It's bug.", ErrorCodes::LOGICAL_ERROR); - -// return fs::path(storage.getFullPathOnDisk(volume->getDisk())) / (parent_part ? parent_part->relative_path : "") / relative_path / ""; -// } - -// String IMergeTreeDataPart::getRelativePath() const -// { -// if (relative_path.empty()) -// throw Exception("Part relative_path cannot be empty. It's bug.", ErrorCodes::LOGICAL_ERROR); - -// return fs::path(storage.relative_data_path) / (parent_part ? parent_part->relative_path : "") / relative_path / ""; -// } - void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checksums, bool check_consistency) { assertOnDisk(); @@ -626,7 +610,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks /// Memory should not be limited during ATTACH TABLE query. /// This is already true at the server startup but must be also ensured for manual table ATTACH. /// Motivation: memory for index is shared between queries - not belong to the query itself. - MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); + MemoryTrackerBlockerInThread temporarily_disable_memory_tracker; try { diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 85fb20d6571..c14584a382b 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -465,7 +465,7 @@ static void appendBlock(const Block & from, Block & to) /// In case of rollback, it is better to ignore memory limits instead of abnormal server termination. /// So ignore any memory limits, even global (since memory tracking has drift). 
- MemoryTrackerBlockerInThread temporarily_ignore_any_memory_limits(VariableContext::Global); + LockMemoryExceptionInThread temporarily_ignore_any_memory_limits(VariableContext::Global); try { From 9f053935b639f274e0f80a2ce53dc7991418a723 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 26 Jul 2022 20:49:41 +0300 Subject: [PATCH 203/227] Update 02327_capnproto_protobuf_empty_messages.sh --- .../0_stateless/02327_capnproto_protobuf_empty_messages.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh index 3d41c9bf721..9de01dbe294 100755 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest, no-parallel, no-replicated-database +# Tags: no-fasttest, no-parallel, no-replicated-database, no-ordinary-database CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From fe0db1e859f76cf6378dd4ccc97c11650aa602f9 Mon Sep 17 00:00:00 2001 From: Tyler Hannan Date: Tue, 26 Jul 2022 20:57:49 +0200 Subject: [PATCH 204/227] add Dell PowerEdge R740XD results --- website/benchmark/hardware/index.html | 1 + .../benchmark/hardware/results/dell_r740.json | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 website/benchmark/hardware/results/dell_r740.json diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index a8ab49c2c3f..1dcb7d84cfd 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -103,6 +103,7 @@ Results for Hetzner EX62-NVME are from Talles Airan.
Results for AMD Ryzen 9 5950X are from Stefan.
Results for ScaleFlux CSD 3000 are from Cliicy Luo of ScaleFlux.
Results for Equinix metal n3.xlarge.x84 are from Dave Cottlehuber.
+Results for Dell PowerEdge R740xd are from Yu ZiChange at EioTek.
diff --git a/website/benchmark/hardware/results/dell_r740.json b/website/benchmark/hardware/results/dell_r740.json new file mode 100644 index 00000000000..21242e5400a --- /dev/null +++ b/website/benchmark/hardware/results/dell_r740.json @@ -0,0 +1,54 @@ +[ + { + "system": "Dell PowerEdge R740xd", + "system_full": "Dell PowerEdge R740xd, 256GB, 2 * Intel(R) Xeon(R) Silver 4214R CPU @ 2.40GHz, 48 vCPU", + "time": "2022-06-18 00:00:00", + "kind": "server", + "result": + [ + [0.004, 0.003, 0.070], + [0.086, 0.019, 0.021], + [0.220, 0.038, 0.037], + [0.596, 0.051, 0.050], + [0.189, 0.149, 0.148], + [0.991, 0.233, 0.230], + [0.004, 0.004, 0.004], + [0.022, 0.018, 0.017], + [0.519, 0.315, 0.305], + [0.469, 0.341, 0.337], + [0.252, 0.158, 0.166], + [0.252, 0.201, 0.184], + [0.532, 0.500, 0.479], + [0.642, 0.613, 0.596], + [0.635, 0.506, 0.508], + [0.579, 0.556, 0.560], + [1.587, 1.532, 1.518], + [0.813, 0.752, 0.737], + [3.990, 3.826, 3.737], + [0.114, 0.073, 0.054], + [4.866, 0.513, 0.514], + [3.822, 0.580, 0.569], + [7.784, 1.550, 1.535], + [17.171, 1.168, 0.834], + [0.511, 0.184, 0.185], + [0.190, 0.181, 0.169], + [0.214, 0.182, 0.183], + [4.611, 0.620, 0.616], + [4.234, 0.793, 0.779], + [1.823, 1.767, 1.737], + [0.813, 0.412, 0.371], + [2.306, 0.772, 0.737], + [3.995, 4.061, 4.041], + [5.142, 2.523, 2.562], + [4.803, 2.595, 2.482], + [1.172, 0.982, 0.990], + [0.454, 0.248, 0.275], + [0.200, 0.145, 0.153], + [0.232, 0.135, 0.134], + [0.621, 0.548, 0.478], + [0.130, 0.088, 0.047], + [0.112, 0.044, 0.049], + [0.048, 0.023, 0.011] + ] + } +] From dcb86eca0baee06fc9cdca0723e9075bc4b777b2 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 26 Jul 2022 19:03:26 +0000 Subject: [PATCH 205/227] Make only one possible public ctor in MemoryTrackerBlockerInThread --- src/Common/MemoryTrackerBlockerInThread.cpp | 6 ++++++ src/Common/MemoryTrackerBlockerInThread.h | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/Common/MemoryTrackerBlockerInThread.cpp b/src/Common/MemoryTrackerBlockerInThread.cpp index 8eb119b2fe5..3d763576c7b 100644 --- a/src/Common/MemoryTrackerBlockerInThread.cpp +++ b/src/Common/MemoryTrackerBlockerInThread.cpp @@ -3,12 +3,18 @@ // MemoryTrackerBlockerInThread thread_local uint64_t MemoryTrackerBlockerInThread::counter = 0; thread_local VariableContext MemoryTrackerBlockerInThread::level = VariableContext::Global; + MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread(VariableContext level_) : previous_level(level) { ++counter; level = level_; } + +MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread() : MemoryTrackerBlockerInThread(VariableContext::User) +{ +} + MemoryTrackerBlockerInThread::~MemoryTrackerBlockerInThread() { --counter; diff --git a/src/Common/MemoryTrackerBlockerInThread.h b/src/Common/MemoryTrackerBlockerInThread.h index 381eb80df0c..7711988625f 100644 --- a/src/Common/MemoryTrackerBlockerInThread.h +++ b/src/Common/MemoryTrackerBlockerInThread.h @@ -11,9 +11,13 @@ private: static thread_local VariableContext level; VariableContext previous_level; + + /// level_ - block in level and above + explicit MemoryTrackerBlockerInThread(VariableContext level_); + public: /// level_ - block in level and above - explicit MemoryTrackerBlockerInThread(VariableContext level_ = VariableContext::User); + explicit MemoryTrackerBlockerInThread(); ~MemoryTrackerBlockerInThread(); MemoryTrackerBlockerInThread(const MemoryTrackerBlockerInThread &) = delete; @@ -23,4 +27,6 @@ public: { return counter > 0 && current_level >= level; } + + 
friend class MemoryTracker; }; From f1818a9103408d919942fe11ae1f75da540012b5 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 26 Jul 2022 19:04:20 +0000 Subject: [PATCH 206/227] Make only one possible public ctor in MemoryTrackerBlockerInThread --- src/Common/MemoryTrackerBlockerInThread.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Common/MemoryTrackerBlockerInThread.h b/src/Common/MemoryTrackerBlockerInThread.h index 7711988625f..d3882056f54 100644 --- a/src/Common/MemoryTrackerBlockerInThread.h +++ b/src/Common/MemoryTrackerBlockerInThread.h @@ -16,7 +16,6 @@ private: explicit MemoryTrackerBlockerInThread(VariableContext level_); public: - /// level_ - block in level and above explicit MemoryTrackerBlockerInThread(); ~MemoryTrackerBlockerInThread(); From d93c67e3030a9b49e499d0bd8add43ac8d949926 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 26 Jul 2022 15:28:11 -0400 Subject: [PATCH 207/227] comment and test added --- src/Processors/Transforms/FillingTransform.cpp | 1 + tests/queries/0_stateless/02366_with_fill_date.reference | 0 tests/queries/0_stateless/02366_with_fill_date.sql | 6 ++++++ 3 files changed, 7 insertions(+) create mode 100644 tests/queries/0_stateless/02366_with_fill_date.reference create mode 100644 tests/queries/0_stateless/02366_with_fill_date.sql diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index 502fb81149c..bd4842b3361 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -54,6 +54,7 @@ static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr & WhichDataType which(type); DataTypePtr to_type; + /// For Date/DateTime types TO/FROM type should match column type if (descr.fill_from_type) { WhichDataType which_from(descr.fill_from_type); diff --git a/tests/queries/0_stateless/02366_with_fill_date.reference b/tests/queries/0_stateless/02366_with_fill_date.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02366_with_fill_date.sql b/tests/queries/0_stateless/02366_with_fill_date.sql new file mode 100644 index 00000000000..64e23b845f8 --- /dev/null +++ b/tests/queries/0_stateless/02366_with_fill_date.sql @@ -0,0 +1,6 @@ +-- Tags: no-backward-compatibility-check + +SELECT toDate('2022-02-01') AS d1 +FROM numbers(18) AS number +ORDER BY d1 ASC WITH FILL FROM toDateTime('2022-02-01') TO toDateTime('2022-07-01') STEP toIntervalMonth(1); -- { serverError 475 } + From d475086100c837e84ca43f0bf57540c63cd3a0ce Mon Sep 17 00:00:00 2001 From: HarryLeeIBM Date: Tue, 26 Jul 2022 12:57:45 -0700 Subject: [PATCH 208/227] Fix wide integer unit test --- src/Common/tests/gtest_wide_integer.cpp | 29 +++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/Common/tests/gtest_wide_integer.cpp b/src/Common/tests/gtest_wide_integer.cpp index 4021ae0ea91..fa614e9390a 100644 --- a/src/Common/tests/gtest_wide_integer.cpp +++ b/src/Common/tests/gtest_wide_integer.cpp @@ -61,8 +61,11 @@ GTEST_TEST(WideInteger, Conversions) ASSERT_EQ(zero, minus_one); zero += minus_one; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + ASSERT_EQ(0, memcmp(&zero, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero))); +#else ASSERT_EQ(0, memcmp(&zero, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero))); - +#endif zero += 2; ASSERT_EQ(zero, 0); @@ -156,8 +159,11 @@ GTEST_TEST(WideInteger, Arithmetic) 
ASSERT_EQ(zero, minus_one); zero += minus_one; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + ASSERT_EQ(0, memcmp(&zero, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero))); +#else ASSERT_EQ(0, memcmp(&zero, "\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(zero))); - +#endif zero += 2; ASSERT_EQ(zero, 0); @@ -236,8 +242,12 @@ GTEST_TEST(WideInteger, Shift) Int128 x = 1; auto y = x << 64; - ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", sizeof(Int128))); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", sizeof(Int128))); +#else + ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00", sizeof(Int128))); +#endif auto z = y << 11; ASSERT_EQ(toString(z), "37778931862957161709568"); @@ -250,8 +260,11 @@ GTEST_TEST(WideInteger, Shift) x = -1; y = x << 16; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\xFF\xFF\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128))); +#else ASSERT_EQ(0, memcmp(&y, "\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128))); - +#endif y >>= 16; ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128))); @@ -259,10 +272,18 @@ GTEST_TEST(WideInteger, Shift) ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128))); y >>= 32; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + ASSERT_EQ(0, memcmp(&y, "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128))); +#else ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", sizeof(Int128))); +#endif y <<= 64; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF\x00\x00\x00\x00", sizeof(Int128))); +#else ASSERT_EQ(0, memcmp(&y, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF\xFF\xFF", sizeof(Int128))); +#endif } From 0055c9307d606b3144097c78f926db40d2d62116 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Tue, 26 Jul 2022 16:08:03 -0400 Subject: [PATCH 209/227] style fix --- src/Processors/Transforms/FillingTransform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Transforms/FillingTransform.cpp b/src/Processors/Transforms/FillingTransform.cpp index bd4842b3361..311c88e46e8 100644 --- a/src/Processors/Transforms/FillingTransform.cpp +++ b/src/Processors/Transforms/FillingTransform.cpp @@ -54,7 +54,7 @@ static bool tryConvertFields(FillColumnDescription & descr, const DataTypePtr & WhichDataType which(type); DataTypePtr to_type; - /// For Date/DateTime types TO/FROM type should match column type + /// For Date/DateTime types TO/FROM type should match column type if (descr.fill_from_type) { WhichDataType which_from(descr.fill_from_type); From 2dbb159b53cb046198f9bf975f96f05628cb2d83 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 26 Jul 2022 20:35:45 +0000 Subject: [PATCH 210/227] Add optimize_distinct_in_order in settings randomizer --- tests/clickhouse-test | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index e060535c1ae..952fc7fb0a9 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -447,6 +447,7 @@ class SettingsRandomizer: 
"compile_aggregate_expressions": lambda: random.randint(0, 1), "compile_sort_description": lambda: random.randint(0, 1), "merge_tree_coarse_index_granularity": lambda: random.randint(2, 32), + "optimize_distinct_in_order": lambda: random.randint(0, 1), } @staticmethod From afd0982187324a0d77914d0a91f1b57c120d0339 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Jul 2022 11:01:22 +0200 Subject: [PATCH 211/227] Remove column "internal" from system.backups --- src/Backups/BackupsWorker.cpp | 60 +++++++++++--------- src/Backups/BackupsWorker.h | 20 +++---- src/Interpreters/InterpreterBackupQuery.cpp | 4 +- src/Storages/System/StorageSystemBackups.cpp | 3 - 4 files changed, 44 insertions(+), 43 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index cd505ed587c..fbd68f077e8 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -115,7 +115,7 @@ BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threa } -std::pair BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) +UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) { const ASTBackupQuery & backup_query = typeid_cast(*backup_or_restore_query); if (backup_query.kind == ASTBackupQuery::Kind::BACKUP) @@ -125,7 +125,7 @@ std::pair BackupsWorker::start(const ASTPtr & backup_or_restore_quer } -std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) +UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) { auto backup_query = std::static_pointer_cast(query->clone()); auto backup_settings = BackupSettings::fromBackupQuery(*backup_query); @@ -147,7 +147,9 @@ std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, con try { auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); - addInfo(backup_uuid, backup_settings.internal, backup_info.toString(), BackupStatus::MAKING_BACKUP); + + if (!backup_settings.internal) + addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP); /// Prepare context to use. ContextPtr context_in_use = context; @@ -188,12 +190,13 @@ std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, con /* called_async= */ false); } - return {backup_uuid, backup_settings.internal}; + return backup_uuid; } catch (...) { /// Something bad happened, the backup has not built. - setStatus(backup_uuid, backup_settings.internal, BackupStatus::FAILED_TO_BACKUP); + if (!backup_settings.internal) + setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); throw; } @@ -305,7 +308,8 @@ void BackupsWorker::doBackup( backup.reset(); LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); - setStatus(backup_uuid, backup_settings.internal, BackupStatus::BACKUP_COMPLETE); + if (!backup_settings.internal) + setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE); } catch (...) { @@ -313,7 +317,8 @@ void BackupsWorker::doBackup( if (called_async) { tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); - setStatus(backup_uuid, backup_settings.internal, BackupStatus::FAILED_TO_BACKUP); + if (!backup_settings.internal) + setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); } else @@ -325,7 +330,7 @@ void BackupsWorker::doBackup( } -std::pair BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) +UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) { auto restore_query = std::static_pointer_cast(query->clone()); auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); @@ -344,7 +349,8 @@ std::pair BackupsWorker::startRestoring(const ASTPtr & query, Contex try { auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); - addInfo(restore_uuid, restore_settings.internal, backup_info.toString(), BackupStatus::RESTORING); + if (!restore_settings.internal) + addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING); /// Prepare context to use. ContextMutablePtr context_in_use = context; @@ -382,12 +388,13 @@ std::pair BackupsWorker::startRestoring(const ASTPtr & query, Contex /* called_async= */ false); } - return {restore_uuid, restore_settings.internal}; + return restore_uuid; } catch (...) { /// Something bad happened, the backup has not built. - setStatus(restore_uuid, restore_settings.internal, BackupStatus::FAILED_TO_RESTORE); + if (!restore_settings.internal) + setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); throw; } @@ -500,7 +507,8 @@ void BackupsWorker::doRestore( } LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()); - setStatus(restore_uuid, restore_settings.internal, BackupStatus::RESTORED); + if (!restore_settings.internal) + setStatus(restore_uuid, BackupStatus::RESTORED); } catch (...) { @@ -508,7 +516,8 @@ void BackupsWorker::doRestore( if (called_async) { tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); - setStatus(restore_uuid, restore_settings.internal, BackupStatus::FAILED_TO_RESTORE); + if (!restore_settings.internal) + setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); } else @@ -520,31 +529,30 @@ void BackupsWorker::doRestore( } -void BackupsWorker::addInfo(const UUID & uuid, bool internal, const String & backup_name, BackupStatus status) +void BackupsWorker::addInfo(const UUID & uuid, const String & backup_name, BackupStatus status) { Info info; info.uuid = uuid; info.backup_name = backup_name; info.status = status; info.status_changed_time = time(nullptr); - info.internal = internal; std::lock_guard lock{infos_mutex}; - bool inserted = infos.try_emplace({uuid, internal}, std::move(info)).second; + bool inserted = infos.try_emplace(uuid, std::move(info)).second; if (!inserted) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Pair of UUID={} and internal={} is already in use", uuid, internal); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup's UUID={} is already in use", uuid); num_active_backups += getNumActiveBackupsChange(status); num_active_restores += getNumActiveRestoresChange(status); } -void BackupsWorker::setStatus(const UUID & uuid, bool internal, BackupStatus status) +void BackupsWorker::setStatus(const UUID & uuid, BackupStatus status) { std::lock_guard lock{infos_mutex}; - auto it = infos.find({uuid, internal}); + auto it = infos.find(uuid); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", uuid, internal); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's UUID={}", uuid); auto & info = it->second; auto old_status = info.status; @@ -555,14 +563,14 @@ void BackupsWorker::setStatus(const UUID & uuid, bool internal, BackupStatus sta } -void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool internal, bool rethrow_exception) +void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_exception) { std::unique_lock lock{infos_mutex}; status_changed.wait(lock, [&] { - auto it = infos.find({backup_or_restore_uuid, internal}); + auto it = infos.find(backup_or_restore_uuid); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", backup_or_restore_uuid, internal); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's UUID={}", backup_or_restore_uuid); const auto & info = it->second; auto current_status = info.status; if (rethrow_exception && ((current_status == BackupStatus::FAILED_TO_BACKUP) || (current_status == BackupStatus::FAILED_TO_RESTORE))) @@ -571,12 +579,12 @@ void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool internal, boo }); } -BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid, bool internal) const +BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid) const { std::lock_guard lock{infos_mutex}; - auto it = infos.find({backup_or_restore_uuid, internal}); + auto it = infos.find(backup_or_restore_uuid); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown pair of UUID={} and internal={}", backup_or_restore_uuid, internal); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's UUID={}", backup_or_restore_uuid); return it->second; } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 7db62633412..2b1ac0d68f0 100644 --- 
a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -29,11 +29,11 @@ public: void shutdown(); /// Starts executing a BACKUP or RESTORE query. Returns UUID of the operation. - std::pair start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); + UUID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); /// Waits until a BACKUP or RESTORE query started by start() is finished. /// The function returns immediately if the operation is already finished. - void wait(const UUID & backup_or_restore_uuid, bool internal, bool rethrow_exception = true); + void wait(const UUID & backup_or_restore_uuid, bool rethrow_exception = true); /// Information about executing a BACKUP or RESTORE query started by calling start(). struct Info @@ -48,35 +48,31 @@ public: String error_message; std::exception_ptr exception; - - /// Whether this operation is internal, i.e. caused by another BACKUP or RESTORE operation. - /// For example BACKUP ON CLUSTER executes an internal BACKUP commands per each node. - bool internal = false; }; - Info getInfo(const UUID & backup_or_restore_uuid, bool internal) const; + Info getInfo(const UUID & backup_or_restore_uuid) const; std::vector getAllInfos() const; private: - std::pair startMakingBackup(const ASTPtr & query, const ContextPtr & context); + UUID startMakingBackup(const ASTPtr & query, const ContextPtr & context); void doBackup(const UUID & backup_uuid, const std::shared_ptr & backup_query, BackupSettings backup_settings, const BackupInfo & backup_info, std::shared_ptr backup_coordination, const ContextPtr & context, ContextMutablePtr mutable_context, bool called_async); - std::pair startRestoring(const ASTPtr & query, ContextMutablePtr context); + UUID startRestoring(const ASTPtr & query, ContextMutablePtr context); void doRestore(const UUID & restore_uuid, const std::shared_ptr & restore_query, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); - void addInfo(const UUID & uuid, bool internal, const String & backup_name, BackupStatus status); - void setStatus(const UUID & uuid, bool internal, BackupStatus status); + void addInfo(const UUID & uuid, const String & backup_name, BackupStatus status); + void setStatus(const UUID & uuid, BackupStatus status); ThreadPool backups_thread_pool; ThreadPool restores_thread_pool; - std::map, Info> infos; + std::unordered_map infos; std::condition_variable status_changed; std::atomic num_active_backups = 0; std::atomic num_active_restores = 0; diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index 8f2060d2d02..bda37878fe3 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -39,9 +39,9 @@ namespace BlockIO InterpreterBackupQuery::execute() { auto & backups_worker = context->getBackupsWorker(); - auto [uuid, internal] = backups_worker.start(query_ptr, context); + auto uuid = backups_worker.start(query_ptr, context); BlockIO res_io; - res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.getInfo(uuid, internal)))); + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.getInfo(uuid)))); return res_io; } diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index ccdd6678a88..80bd2049422 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ 
b/src/Storages/System/StorageSystemBackups.cpp @@ -20,7 +20,6 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() {"status", std::make_shared(getBackupStatusEnumValues())}, {"status_changed_time", std::make_shared()}, {"error", std::make_shared()}, - {"internal", std::make_shared()}, }; return names_and_types; } @@ -34,7 +33,6 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con auto & column_status = assert_cast(*res_columns[column_index++]); auto & column_status_changed_time = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); - auto & column_internal = assert_cast(*res_columns[column_index++]); auto add_row = [&](const BackupsWorker::Info & info) { @@ -43,7 +41,6 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con column_status.insertValue(static_cast(info.status)); column_status_changed_time.insertValue(info.status_changed_time); column_error.insertData(info.error_message.data(), info.error_message.size()); - column_internal.insertValue(info.internal); }; for (const auto & entry : context->getBackupsWorker().getAllInfos()) From d129be2a9e9d4540161efdc19bd13070e0de8481 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Jul 2022 11:05:10 +0200 Subject: [PATCH 212/227] Remove column "backup_name" from the result of a BACKUP/RESTORE query --- src/Interpreters/InterpreterBackupQuery.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index bda37878fe3..613455aedd9 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -20,17 +20,14 @@ namespace Block getResultRow(const BackupsWorker::Info & info) { auto column_uuid = ColumnUUID::create(); - auto column_backup_name = ColumnString::create(); auto column_status = ColumnInt8::create(); column_uuid->insert(info.uuid); - column_backup_name->insert(info.backup_name); column_status->insert(static_cast(info.status)); Block res_columns; res_columns.insert(0, {std::move(column_uuid), std::make_shared(), "uuid"}); - res_columns.insert(1, {std::move(column_backup_name), std::make_shared(), "backup_name"}); - res_columns.insert(2, {std::move(column_status), std::make_shared(getBackupStatusEnumValues()), "status"}); + res_columns.insert(1, {std::move(column_status), std::make_shared(getBackupStatusEnumValues()), "status"}); return res_columns; } @@ -40,8 +37,13 @@ BlockIO InterpreterBackupQuery::execute() { auto & backups_worker = context->getBackupsWorker(); auto uuid = backups_worker.start(query_ptr, context); + + auto info = backups_worker.getInfo(uuid); + if (info.exception) + std::rethrow_exception(info.exception); + BlockIO res_io; - res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(backups_worker.getInfo(uuid)))); + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(info))); return res_io; } From 131019ba49e28bcf29509f748b3a90b40824ccba Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Jul 2022 13:45:50 +0200 Subject: [PATCH 213/227] Rename column "backup_name" -> "name" in system.backups. 
--- src/Backups/BackupsWorker.cpp | 4 ++-- src/Backups/BackupsWorker.h | 8 ++++---- src/Storages/System/StorageSystemBackups.cpp | 6 +++--- .../test_backup_restore_new/test.py | 4 ++-- .../test_backup_restore_on_cluster/test.py | 18 ++++++++---------- .../test_concurrency.py | 6 +++--- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index fbd68f077e8..bff5762bbae 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -529,11 +529,11 @@ void BackupsWorker::doRestore( } -void BackupsWorker::addInfo(const UUID & uuid, const String & backup_name, BackupStatus status) +void BackupsWorker::addInfo(const UUID & uuid, const String & name, BackupStatus status) { Info info; info.uuid = uuid; - info.backup_name = backup_name; + info.name = name; info.status = status; info.status_changed_time = time(nullptr); diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 2b1ac0d68f0..461f4d2ad69 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -38,10 +38,10 @@ public: /// Information about executing a BACKUP or RESTORE query started by calling start(). struct Info { - UUID uuid; - /// Backup's name, a string like "Disk('backups', 'my_backup')" - String backup_name; + String name; + + UUID uuid; BackupStatus status; time_t status_changed_time; @@ -66,7 +66,7 @@ private: const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); - void addInfo(const UUID & uuid, const String & backup_name, BackupStatus status); + void addInfo(const UUID & uuid, const String & name, BackupStatus status); void setStatus(const UUID & uuid, BackupStatus status); ThreadPool backups_thread_pool; diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index 80bd2049422..e638fd31dc2 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -15,8 +15,8 @@ namespace DB NamesAndTypesList StorageSystemBackups::getNamesAndTypes() { NamesAndTypesList names_and_types{ + {"name", std::make_shared()}, {"uuid", std::make_shared()}, - {"backup_name", std::make_shared()}, {"status", std::make_shared(getBackupStatusEnumValues())}, {"status_changed_time", std::make_shared()}, {"error", std::make_shared()}, @@ -28,16 +28,16 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const { size_t column_index = 0; + auto & column_name = assert_cast(*res_columns[column_index++]); auto & column_uuid = assert_cast(*res_columns[column_index++]); - auto & column_backup_name = assert_cast(*res_columns[column_index++]); auto & column_status = assert_cast(*res_columns[column_index++]); auto & column_status_changed_time = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); auto add_row = [&](const BackupsWorker::Info & info) { + column_name.insertData(info.name.data(), info.name.size()); column_uuid.insertValue(info.uuid); - column_backup_name.insertData(info.backup_name.data(), info.backup_name.size()); column_status.insertValue(static_cast(info.status)); column_status_changed_time.insertValue(info.status_changed_time); column_error.insertData(info.error_message.data(), info.error_message.size()); diff --git a/tests/integration/test_backup_restore_new/test.py 
b/tests/integration/test_backup_restore_new/test.py index 47f7c47d608..97389b52e7f 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -301,7 +301,7 @@ def test_async(): assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" backup_name = new_backup_name() - [id, _, status] = instance.query( + [id, status] = instance.query( f"BACKUP TABLE test.table TO {backup_name} ASYNC" ).split("\t") @@ -315,7 +315,7 @@ def test_async(): instance.query("DROP TABLE test.table") - [id, _, status] = instance.query( + [id, status] = instance.query( f"RESTORE TABLE test.table FROM {backup_name} ASYNC" ).split("\t") diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index d1898213725..9d9604b6fa9 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -396,7 +396,7 @@ def test_replicated_database_async(): node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl") backup_name = new_backup_name() - [id, _, status] = node1.query( + [id, status] = node1.query( f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} ASYNC" ).split("\t") @@ -404,13 +404,13 @@ def test_replicated_database_async(): assert_eq_with_retry( node1, - f"SELECT status, error FROM system.backups WHERE uuid='{id}' AND NOT internal", + f"SELECT status, error FROM system.backups WHERE uuid='{id}'", TSV([["BACKUP_COMPLETE", ""]]), ) node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' NO DELAY") - [id, _, status] = node1.query( + [id, status] = node1.query( f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} ASYNC" ).split("\t") @@ -418,7 +418,7 @@ def test_replicated_database_async(): assert_eq_with_retry( node1, - f"SELECT status, error FROM system.backups WHERE uuid='{id}' AND NOT internal", + f"SELECT status, error FROM system.backups WHERE uuid='{id}'", TSV([["RESTORED", ""]]), ) @@ -471,7 +471,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): int( nodes[i] .query( - f"SELECT count() FROM system.backups WHERE uuid='{ids[i]}' AND status == 'BACKUP_COMPLETE' AND NOT internal" + f"SELECT count() FROM system.backups WHERE uuid='{ids[i]}' AND status == 'BACKUP_COMPLETE'" ) .strip() ) @@ -483,7 +483,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): for i in range(len(nodes)): print( nodes[i].query( - f"SELECT status, error FROM system.backups WHERE uuid='{ids[i]}' AND NOT internal" + f"SELECT status, error FROM system.backups WHERE uuid='{ids[i]}'" ) ) @@ -817,14 +817,12 @@ def test_stop_other_host_during_backup(kill): assert_eq_with_retry( node1, - f"SELECT status FROM system.backups WHERE uuid='{id}' AND status == 'MAKING_BACKUP' AND NOT internal", + f"SELECT status FROM system.backups WHERE uuid='{id}' AND status == 'MAKING_BACKUP'", "", retry_count=100, ) - status = node1.query( - f"SELECT status FROM system.backups WHERE uuid='{id}' AND NOT internal" - ).strip() + status = node1.query(f"SELECT status FROM system.backups WHERE uuid='{id}'").strip() if kill: assert status in ["BACKUP_COMPLETE", "FAILED_TO_BACKUP"] diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 8eaed5ac486..34c50998977 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ 
b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -127,7 +127,7 @@ def test_concurrent_backups_on_same_node(): ) assert node0.query( - f"SELECT status, error FROM system.backups WHERE uuid IN {ids_list} AND NOT internal" + f"SELECT status, error FROM system.backups WHERE uuid IN {ids_list}" ) == TSV([["BACKUP_COMPLETE", ""]] * num_concurrent_backups) for backup_name in backup_names: @@ -162,7 +162,7 @@ def test_concurrent_backups_on_different_nodes(): for i in range(num_concurrent_backups): assert nodes[i].query( - f"SELECT status, error FROM system.backups WHERE uuid = '{ids[i]}' AND NOT internal" + f"SELECT status, error FROM system.backups WHERE uuid = '{ids[i]}'" ) == TSV([["BACKUP_COMPLETE", ""]]) for i in range(num_concurrent_backups): @@ -259,7 +259,7 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): for node in nodes: for id in ids: backup_name = node.query( - f"SELECT backup_name FROM system.backups WHERE uuid='{id}' FORMAT RawBLOB" + f"SELECT name FROM system.backups WHERE uuid='{id}' FORMAT RawBLOB" ).strip() if backup_name: backup_names[id] = backup_name From fc16a15ecf89fa86e1b5efb38db947d9d7623a80 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Fri, 22 Jul 2022 18:01:24 +0200 Subject: [PATCH 214/227] Rename column "uuid" -> "id" in system.backups and allow user to set it in a query. --- src/Backups/BackupImpl.cpp | 6 - src/Backups/BackupImpl.h | 2 +- src/Backups/BackupSettings.cpp | 1 + src/Backups/BackupSettings.h | 3 + src/Backups/BackupsWorker.cpp | 128 ++++++++++++------ src/Backups/BackupsWorker.h | 42 +++--- src/Backups/RestoreSettings.cpp | 1 + src/Backups/RestoreSettings.h | 3 + src/Interpreters/InterpreterBackupQuery.cpp | 10 +- src/Storages/System/StorageSystemBackups.cpp | 6 +- .../test_backup_restore_new/test.py | 58 +++++++- .../test_backup_restore_on_cluster/test.py | 14 +- .../test_concurrency.py | 14 +- 13 files changed, 190 insertions(+), 98 deletions(-) diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index d445ef9d52c..b5d19ef8f1a 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -232,12 +232,6 @@ void BackupImpl::close() coordination.reset(); } -time_t BackupImpl::getTimestamp() const -{ - std::lock_guard lock{mutex}; - return timestamp; -} - void BackupImpl::writeBackupMetadata() { assert(!is_internal_backup); diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index ac0662c62c1..14b9d38835a 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -55,7 +55,7 @@ public: const String & getName() const override { return backup_name; } OpenMode getOpenMode() const override { return open_mode; } - time_t getTimestamp() const override; + time_t getTimestamp() const override { return timestamp; } UUID getUUID() const override { return *uuid; } Strings listFiles(const String & directory, bool recursive) const override; bool hasFiles(const String & directory) const override; diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index a9ba7cb5f74..a4b20e0b863 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -60,6 +60,7 @@ namespace /// List of backup settings except base_backup_name and cluster_host_ids. 
#define LIST_OF_BACKUP_SETTINGS(M) \ + M(String, id) \ M(String, compression_method) \ M(Int64, compression_level) \ M(String, password) \ diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index 4e2bad67fce..5c5f336aa45 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -11,6 +11,9 @@ class ASTBackupQuery; /// Settings specified in the "SETTINGS" clause of a BACKUP query. struct BackupSettings { + /// ID of the backup operation, to identify it in the system.backups table. Auto-generated if not set. + String id; + /// Base backup, if it's set an incremental backup will be built. That means only differences made after the base backup will be put /// into a new backup. std::optional base_backup_info; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index bff5762bbae..3c524635687 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -27,9 +27,10 @@ namespace DB namespace ErrorCodes { - extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } +using OperationID = BackupsWorker::OperationID; namespace Stage = BackupCoordinationStage; namespace @@ -92,6 +93,17 @@ namespace } } + bool isFinalStatus(BackupStatus status) + { + return (status == BackupStatus::BACKUP_COMPLETE) || (status == BackupStatus::FAILED_TO_BACKUP) || (status == BackupStatus::RESTORED) + || (status == BackupStatus::FAILED_TO_RESTORE); + } + + bool isErrorStatus(BackupStatus status) + { + return (status == BackupStatus::FAILED_TO_BACKUP) || (status == BackupStatus::FAILED_TO_RESTORE); + } + /// Used to change num_active_backups. size_t getNumActiveBackupsChange(BackupStatus status) { @@ -115,7 +127,7 @@ BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threa } -UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) +OperationID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) { const ASTBackupQuery & backup_query = typeid_cast(*backup_or_restore_query); if (backup_query.kind == ASTBackupQuery::Kind::BACKUP) @@ -125,14 +137,17 @@ UUID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutable } -UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) +OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) { auto backup_query = std::static_pointer_cast(query->clone()); auto backup_settings = BackupSettings::fromBackupQuery(*backup_query); if (!backup_settings.backup_uuid) backup_settings.backup_uuid = UUIDHelpers::generateV4(); - UUID backup_uuid = *backup_settings.backup_uuid; + + OperationID backup_id = backup_settings.id; + if (backup_id.empty()) + backup_id = toString(*backup_settings.backup_uuid); std::shared_ptr backup_coordination; @@ -149,7 +164,7 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); if (!backup_settings.internal) - addInfo(backup_uuid, backup_info.toString(), BackupStatus::MAKING_BACKUP); + addInfo(backup_id, backup_info.toString(), BackupStatus::MAKING_BACKUP); /// Prepare context to use. 
ContextPtr context_in_use = context; @@ -165,10 +180,11 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c if (backup_settings.async) { backups_thread_pool.scheduleOrThrowOnError( - [this, backup_uuid, backup_query, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context] { + [this, backup_query, backup_id, backup_settings, backup_info, backup_coordination, context_in_use, mutable_context] + { doBackup( - backup_uuid, backup_query, + backup_id, backup_settings, backup_info, backup_coordination, @@ -180,8 +196,8 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c else { doBackup( - backup_uuid, backup_query, + backup_id, backup_settings, backup_info, backup_coordination, @@ -190,13 +206,13 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c /* called_async= */ false); } - return backup_uuid; + return backup_id; } catch (...) { /// Something bad happened, the backup has not built. if (!backup_settings.internal) - setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); + setStatus(backup_id, BackupStatus::FAILED_TO_BACKUP); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); throw; } @@ -204,8 +220,8 @@ UUID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & c void BackupsWorker::doBackup( - const UUID & backup_uuid, const std::shared_ptr & backup_query, + const OperationID & backup_id, BackupSettings backup_settings, const BackupInfo & backup_info, std::shared_ptr backup_coordination, @@ -240,7 +256,7 @@ void BackupsWorker::doBackup( if (backup_settings.coordination_zk_path.empty()) { String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(backup_uuid); + backup_settings.coordination_zk_path = root_zk_path + "/backup-" + toString(*backup_settings.backup_uuid); } } @@ -259,7 +275,7 @@ void BackupsWorker::doBackup( backup_create_params.password = backup_settings.password; backup_create_params.is_internal_backup = backup_settings.internal; backup_create_params.backup_coordination = backup_coordination; - backup_create_params.backup_uuid = backup_uuid; + backup_create_params.backup_uuid = backup_settings.backup_uuid; BackupMutablePtr backup = BackupFactory::instance().createBackup(backup_create_params); /// Write the backup. @@ -309,7 +325,7 @@ void BackupsWorker::doBackup( LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); if (!backup_settings.internal) - setStatus(backup_uuid, BackupStatus::BACKUP_COMPLETE); + setStatus(backup_id, BackupStatus::BACKUP_COMPLETE); } catch (...) { @@ -318,7 +334,7 @@ void BackupsWorker::doBackup( { tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); if (!backup_settings.internal) - setStatus(backup_uuid, BackupStatus::FAILED_TO_BACKUP); + setStatus(backup_id, BackupStatus::FAILED_TO_BACKUP); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); } else @@ -330,12 +346,17 @@ void BackupsWorker::doBackup( } -UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) +OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) { auto restore_query = std::static_pointer_cast(query->clone()); auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); + UUID restore_uuid = UUIDHelpers::generateV4(); + OperationID restore_id = restore_settings.id; + if (restore_id.empty()) + restore_id = toString(restore_uuid); + std::shared_ptr restore_coordination; if (restore_settings.internal) @@ -350,7 +371,7 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte { auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); if (!restore_settings.internal) - addInfo(restore_uuid, backup_info.toString(), BackupStatus::RESTORING); + addInfo(restore_id, backup_info.toString(), BackupStatus::RESTORING); /// Prepare context to use. ContextMutablePtr context_in_use = context; @@ -365,10 +386,11 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte if (restore_settings.async) { backups_thread_pool.scheduleOrThrowOnError( - [this, restore_uuid, restore_query, restore_settings, backup_info, restore_coordination, context_in_use] { + [this, restore_query, restore_id, restore_uuid, restore_settings, backup_info, restore_coordination, context_in_use] { doRestore( - restore_uuid, restore_query, + restore_id, + restore_uuid, restore_settings, backup_info, restore_coordination, @@ -379,8 +401,9 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte else { doRestore( - restore_uuid, restore_query, + restore_id, + restore_uuid, restore_settings, backup_info, restore_coordination, @@ -388,13 +411,13 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte /* called_async= */ false); } - return restore_uuid; + return restore_id; } catch (...) { /// Something bad happened, the backup has not built. if (!restore_settings.internal) - setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); + setStatus(restore_id, BackupStatus::FAILED_TO_RESTORE); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); throw; } @@ -402,8 +425,9 @@ UUID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr conte void BackupsWorker::doRestore( - const UUID & restore_uuid, const std::shared_ptr & restore_query, + const OperationID & restore_id, + const UUID & restore_uuid, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, @@ -508,7 +532,7 @@ void BackupsWorker::doRestore( LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()); if (!restore_settings.internal) - setStatus(restore_uuid, BackupStatus::RESTORED); + setStatus(restore_id, BackupStatus::RESTORED); } catch (...) { @@ -517,7 +541,7 @@ void BackupsWorker::doRestore( { tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); if (!restore_settings.internal) - setStatus(restore_uuid, BackupStatus::FAILED_TO_RESTORE); + setStatus(restore_id, BackupStatus::FAILED_TO_RESTORE); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); } else @@ -529,62 +553,78 @@ void BackupsWorker::doRestore( } -void BackupsWorker::addInfo(const UUID & uuid, const String & name, BackupStatus status) +void BackupsWorker::addInfo(const OperationID & id, const String & name, BackupStatus status) { Info info; - info.uuid = uuid; + info.id = id; info.name = name; info.status = status; info.status_changed_time = time(nullptr); std::lock_guard lock{infos_mutex}; - bool inserted = infos.try_emplace(uuid, std::move(info)).second; - if (!inserted) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup's UUID={} is already in use", uuid); + auto it = infos.find(id); + if (it != infos.end()) + { + /// It's better not allow to overwrite the current status if it's in progress. + auto current_status = it->second.status; + if (!isFinalStatus(current_status)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot start a backup or restore: it's id='{}' is already in use", id); + } + + infos[id] = std::move(info); + num_active_backups += getNumActiveBackupsChange(status); num_active_restores += getNumActiveRestoresChange(status); } -void BackupsWorker::setStatus(const UUID & uuid, BackupStatus status) +void BackupsWorker::setStatus(const String & id, BackupStatus status) { std::lock_guard lock{infos_mutex}; - auto it = infos.find(uuid); + auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's UUID={}", uuid); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); auto & info = it->second; auto old_status = info.status; + info.status = status; info.status_changed_time = time(nullptr); + + if (isErrorStatus(status)) + { + info.error_message = getCurrentExceptionMessage(false); + info.exception = std::current_exception(); + } + num_active_backups += getNumActiveBackupsChange(status) - getNumActiveBackupsChange(old_status); num_active_restores += getNumActiveRestoresChange(status) - getNumActiveRestoresChange(old_status); } -void BackupsWorker::wait(const UUID & backup_or_restore_uuid, bool rethrow_exception) +void BackupsWorker::wait(const OperationID & id, bool rethrow_exception) { std::unique_lock lock{infos_mutex}; status_changed.wait(lock, [&] { - auto it = infos.find(backup_or_restore_uuid); + auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's UUID={}", backup_or_restore_uuid); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); const auto & info = it->second; auto current_status = info.status; - if (rethrow_exception && ((current_status == BackupStatus::FAILED_TO_BACKUP) || (current_status == BackupStatus::FAILED_TO_RESTORE))) + if (rethrow_exception && isErrorStatus(current_status)) std::rethrow_exception(info.exception); - return (current_status == BackupStatus::BACKUP_COMPLETE) || (current_status == BackupStatus::RESTORED); + return isFinalStatus(current_status); }); } -BackupsWorker::Info BackupsWorker::getInfo(const UUID & backup_or_restore_uuid) const +BackupsWorker::Info BackupsWorker::getInfo(const OperationID & id) const { std::lock_guard lock{infos_mutex}; - auto it = infos.find(backup_or_restore_uuid); + auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, 
"Unknown backup's UUID={}", backup_or_restore_uuid); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); return it->second; } @@ -599,14 +639,14 @@ std::vector BackupsWorker::getAllInfos() const void BackupsWorker::shutdown() { - bool has_active_backups_or_restores = (num_active_backups || num_active_restores); - if (has_active_backups_or_restores) + bool has_active_backups_and_restores = (num_active_backups || num_active_restores); + if (has_active_backups_and_restores) LOG_INFO(log, "Waiting for {} backups and {} restores to be finished", num_active_backups, num_active_restores); backups_thread_pool.wait(); restores_thread_pool.wait(); - if (has_active_backups_or_restores) + if (has_active_backups_and_restores) LOG_INFO(log, "All backup and restore tasks have finished"); } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 461f4d2ad69..4e2a98fd602 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -28,51 +28,57 @@ public: /// Waits until all tasks have been completed. void shutdown(); - /// Starts executing a BACKUP or RESTORE query. Returns UUID of the operation. - UUID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); + /// Backup's or restore's operation ID, can be either passed via SETTINGS id=... or be randomly generated UUID. + using OperationID = String; + + /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation. + OperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); /// Waits until a BACKUP or RESTORE query started by start() is finished. /// The function returns immediately if the operation is already finished. - void wait(const UUID & backup_or_restore_uuid, bool rethrow_exception = true); + void wait(const OperationID & backup_or_restore_id, bool rethrow_exception = true); /// Information about executing a BACKUP or RESTORE query started by calling start(). struct Info { + /// Backup's or restore's operation ID, can be either passed via SETTINGS id=... or be randomly generated UUID. + OperationID id; + /// Backup's name, a string like "Disk('backups', 'my_backup')" String name; - UUID uuid; - + /// Status of backup or restore operation. BackupStatus status; time_t status_changed_time; - String error_message; + /// Set only if there was an error. 
std::exception_ptr exception; + String error_message; }; - Info getInfo(const UUID & backup_or_restore_uuid) const; + Info getInfo(const OperationID & id) const; std::vector getAllInfos() const; private: - UUID startMakingBackup(const ASTPtr & query, const ContextPtr & context); - - void doBackup(const UUID & backup_uuid, const std::shared_ptr & backup_query, BackupSettings backup_settings, + OperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context); + + void doBackup(const std::shared_ptr & backup_query, const OperationID & backup_id, BackupSettings backup_settings, const BackupInfo & backup_info, std::shared_ptr backup_coordination, const ContextPtr & context, ContextMutablePtr mutable_context, bool called_async); - UUID startRestoring(const ASTPtr & query, ContextMutablePtr context); + OperationID startRestoring(const ASTPtr & query, ContextMutablePtr context); + + void doRestore(const std::shared_ptr & restore_query, const OperationID & restore_id, const UUID & restore_uuid, + RestoreSettings restore_settings, const BackupInfo & backup_info, + std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); - void doRestore(const UUID & restore_uuid, const std::shared_ptr & restore_query, RestoreSettings restore_settings, - const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, - bool called_async); - - void addInfo(const UUID & uuid, const String & name, BackupStatus status); - void setStatus(const UUID & uuid, BackupStatus status); + void addInfo(const OperationID & id, const String & name, BackupStatus status); + void setStatus(const OperationID & id, BackupStatus status); ThreadPool backups_thread_pool; ThreadPool restores_thread_pool; - std::unordered_map infos; + std::unordered_map infos; std::condition_variable status_changed; std::atomic num_active_backups = 0; std::atomic num_active_restores = 0; diff --git a/src/Backups/RestoreSettings.cpp b/src/Backups/RestoreSettings.cpp index efa1fe2cfb8..63915670fa4 100644 --- a/src/Backups/RestoreSettings.cpp +++ b/src/Backups/RestoreSettings.cpp @@ -143,6 +143,7 @@ namespace /// List of restore settings except base_backup_name and cluster_host_ids. #define LIST_OF_RESTORE_SETTINGS(M) \ + M(String, id) \ M(String, password) \ M(Bool, structure_only) \ M(RestoreTableCreationMode, create_table) \ diff --git a/src/Backups/RestoreSettings.h b/src/Backups/RestoreSettings.h index 1bc5d867a37..713adbe8029 100644 --- a/src/Backups/RestoreSettings.h +++ b/src/Backups/RestoreSettings.h @@ -41,6 +41,9 @@ using RestoreUDFCreationMode = RestoreAccessCreationMode; /// Settings specified in the "SETTINGS" clause of a RESTORE query. struct RestoreSettings { + /// ID of the restore operation, to identify it in the system.backups table. Auto-generated if not set. + String id; + /// Base backup, with this setting we can override the location of the base backup while restoring. /// Any incremental backup keeps inside the information about its base backup, so using this setting is optional. 
std::optional base_backup_info; diff --git a/src/Interpreters/InterpreterBackupQuery.cpp b/src/Interpreters/InterpreterBackupQuery.cpp index 613455aedd9..e238286a33c 100644 --- a/src/Interpreters/InterpreterBackupQuery.cpp +++ b/src/Interpreters/InterpreterBackupQuery.cpp @@ -19,14 +19,14 @@ namespace { Block getResultRow(const BackupsWorker::Info & info) { - auto column_uuid = ColumnUUID::create(); + auto column_id = ColumnString::create(); auto column_status = ColumnInt8::create(); - column_uuid->insert(info.uuid); + column_id->insert(info.id); column_status->insert(static_cast(info.status)); Block res_columns; - res_columns.insert(0, {std::move(column_uuid), std::make_shared(), "uuid"}); + res_columns.insert(0, {std::move(column_id), std::make_shared(), "id"}); res_columns.insert(1, {std::move(column_status), std::make_shared(getBackupStatusEnumValues()), "status"}); return res_columns; @@ -36,9 +36,9 @@ namespace BlockIO InterpreterBackupQuery::execute() { auto & backups_worker = context->getBackupsWorker(); - auto uuid = backups_worker.start(query_ptr, context); + auto id = backups_worker.start(query_ptr, context); - auto info = backups_worker.getInfo(uuid); + auto info = backups_worker.getInfo(id); if (info.exception) std::rethrow_exception(info.exception); diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index e638fd31dc2..a2f935749e8 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -15,8 +15,8 @@ namespace DB NamesAndTypesList StorageSystemBackups::getNamesAndTypes() { NamesAndTypesList names_and_types{ + {"id", std::make_shared()}, {"name", std::make_shared()}, - {"uuid", std::make_shared()}, {"status", std::make_shared(getBackupStatusEnumValues())}, {"status_changed_time", std::make_shared()}, {"error", std::make_shared()}, @@ -28,16 +28,16 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const { size_t column_index = 0; + auto & column_id = assert_cast(*res_columns[column_index++]); auto & column_name = assert_cast(*res_columns[column_index++]); - auto & column_uuid = assert_cast(*res_columns[column_index++]); auto & column_status = assert_cast(*res_columns[column_index++]); auto & column_status_changed_time = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); auto add_row = [&](const BackupsWorker::Info & info) { + column_id.insertData(info.id.data(), info.id.size()); column_name.insertData(info.name.data(), info.name.size()); - column_uuid.insertValue(info.uuid); column_status.insertValue(static_cast(info.status)); column_status_changed_time.insertValue(info.status_changed_time); column_error.insertData(info.error_message.data(), info.error_message.size()); diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 97389b52e7f..9bf3d2729c0 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -309,7 +309,7 @@ def test_async(): assert_eq_with_retry( instance, - f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + f"SELECT status, error FROM system.backups WHERE id='{id}'", TSV([["BACKUP_COMPLETE", ""]]), ) @@ -323,7 +323,7 @@ def test_async(): assert_eq_with_retry( instance, - f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + 
f"SELECT status, error FROM system.backups WHERE id='{id}'", TSV([["RESTORED", ""]]), ) @@ -347,16 +347,16 @@ def test_async_backups_to_same_destination(interface): assert_eq_with_retry( instance, - f"SELECT status FROM system.backups WHERE uuid IN ['{id1}', '{id2}'] AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups WHERE id IN ['{id1}', '{id2}'] AND status == 'MAKING_BACKUP'", "", ) assert instance.query( - f"SELECT status, error FROM system.backups WHERE uuid='{id1}'" + f"SELECT status, error FROM system.backups WHERE id='{id1}'" ) == TSV([["BACKUP_COMPLETE", ""]]) assert ( - instance.query(f"SELECT status FROM system.backups WHERE uuid='{id2}'") + instance.query(f"SELECT status FROM system.backups WHERE id='{id2}'") == "FAILED_TO_BACKUP\n" ) @@ -758,7 +758,7 @@ def test_system_users_async(): assert_eq_with_retry( instance, - f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + f"SELECT status, error FROM system.backups WHERE id='{id}'", TSV([["BACKUP_COMPLETE", ""]]), ) @@ -770,7 +770,7 @@ def test_system_users_async(): assert_eq_with_retry( instance, - f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + f"SELECT status, error FROM system.backups WHERE id='{id}'", TSV([["RESTORED", ""]]), ) @@ -884,6 +884,50 @@ def test_restore_partition(): ) +def test_operation_id(): + create_and_fill_table(n=30) + + backup_name = new_backup_name() + + [id, status] = instance.query( + f"BACKUP TABLE test.table TO {backup_name} SETTINGS id='first' ASYNC" + ).split("\t") + + assert id == "first" + assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" + + assert_eq_with_retry( + instance, + f"SELECT status, error FROM system.backups WHERE id='first'", + TSV([["BACKUP_COMPLETE", ""]]), + ) + + instance.query("DROP TABLE test.table") + + [id, status] = instance.query( + f"RESTORE TABLE test.table FROM {backup_name} SETTINGS id='second' ASYNC" + ).split("\t") + + assert id == "second" + assert status == "RESTORING\n" or status == "RESTORED\n" + + assert_eq_with_retry( + instance, + f"SELECT status, error FROM system.backups WHERE id='second'", + TSV([["RESTORED", ""]]), + ) + + # Reuse the same ID again + instance.query("DROP TABLE test.table") + + [id, status] = instance.query( + f"RESTORE TABLE test.table FROM {backup_name} SETTINGS id='first'" + ).split("\t") + + assert id == "first" + assert status == "RESTORED\n" + + def test_mutation(): create_and_fill_table(engine="MergeTree ORDER BY tuple()", n=5) diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 9d9604b6fa9..e39c2a6bf94 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -404,7 +404,7 @@ def test_replicated_database_async(): assert_eq_with_retry( node1, - f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + f"SELECT status, error FROM system.backups WHERE id='{id}'", TSV([["BACKUP_COMPLETE", ""]]), ) @@ -418,7 +418,7 @@ def test_replicated_database_async(): assert_eq_with_retry( node1, - f"SELECT status, error FROM system.backups WHERE uuid='{id}'", + f"SELECT status, error FROM system.backups WHERE id='{id}'", TSV([["RESTORED", ""]]), ) @@ -462,7 +462,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): for i in range(len(nodes)): assert_eq_with_retry( nodes[i], - f"SELECT status FROM system.backups WHERE uuid='{ids[i]}' AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups 
WHERE id='{ids[i]}' AND status == 'MAKING_BACKUP'", "", ) @@ -471,7 +471,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): int( nodes[i] .query( - f"SELECT count() FROM system.backups WHERE uuid='{ids[i]}' AND status == 'BACKUP_COMPLETE'" + f"SELECT count() FROM system.backups WHERE id='{ids[i]}' AND status == 'BACKUP_COMPLETE'" ) .strip() ) @@ -483,7 +483,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): for i in range(len(nodes)): print( nodes[i].query( - f"SELECT status, error FROM system.backups WHERE uuid='{ids[i]}'" + f"SELECT status, error FROM system.backups WHERE id='{ids[i]}'" ) ) @@ -817,12 +817,12 @@ def test_stop_other_host_during_backup(kill): assert_eq_with_retry( node1, - f"SELECT status FROM system.backups WHERE uuid='{id}' AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups WHERE id='{id}' AND status == 'MAKING_BACKUP'", "", retry_count=100, ) - status = node1.query(f"SELECT status FROM system.backups WHERE uuid='{id}'").strip() + status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip() if kill: assert status in ["BACKUP_COMPLETE", "FAILED_TO_BACKUP"] diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 34c50998977..d4f3b98ca90 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -122,12 +122,12 @@ def test_concurrent_backups_on_same_node(): assert_eq_with_retry( node0, - f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND uuid IN {ids_list}", + f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND id IN {ids_list}", "", ) assert node0.query( - f"SELECT status, error FROM system.backups WHERE uuid IN {ids_list}" + f"SELECT status, error FROM system.backups WHERE id IN {ids_list}" ) == TSV([["BACKUP_COMPLETE", ""]] * num_concurrent_backups) for backup_name in backup_names: @@ -156,13 +156,13 @@ def test_concurrent_backups_on_different_nodes(): for i in range(num_concurrent_backups): assert_eq_with_retry( nodes[i], - f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND uuid = '{ids[i]}'", + f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND id = '{ids[i]}'", "", ) for i in range(num_concurrent_backups): assert nodes[i].query( - f"SELECT status, error FROM system.backups WHERE uuid = '{ids[i]}'" + f"SELECT status, error FROM system.backups WHERE id = '{ids[i]}'" ) == TSV([["BACKUP_COMPLETE", ""]]) for i in range(num_concurrent_backups): @@ -244,14 +244,14 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): for node in nodes: assert_eq_with_retry( node, - f"SELECT status from system.backups WHERE uuid IN {ids_list} AND (status == 'MAKING_BACKUP')", + f"SELECT status from system.backups WHERE id IN {ids_list} AND (status == 'MAKING_BACKUP')", "", ) for node in nodes: assert_eq_with_retry( node, - f"SELECT status, error from system.backups WHERE uuid IN {ids_list} AND (status == 'FAILED_TO_BACKUP')", + f"SELECT status, error from system.backups WHERE id IN {ids_list} AND (status == 'FAILED_TO_BACKUP')", "", ) @@ -259,7 +259,7 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): for node in nodes: for id in ids: backup_name = node.query( - f"SELECT name FROM system.backups WHERE uuid='{id}' FORMAT RawBLOB" + f"SELECT name FROM system.backups WHERE id='{id}' FORMAT 
RawBLOB" ).strip() if backup_name: backup_names[id] = backup_name From 35c267b3b1ccfdec6d9a1223e9cf36cbc8d74970 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 21:35:25 +0200 Subject: [PATCH 215/227] Replace column "status_changed_time" with columns "start_time" and "end_time". --- src/Backups/BackupsWorker.cpp | 9 +++++++-- src/Backups/BackupsWorker.h | 4 +++- src/Storages/System/StorageSystemBackups.cpp | 9 ++++++--- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 3c524635687..eddc0e95564 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -559,7 +559,10 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, BackupS info.id = id; info.name = name; info.status = status; - info.status_changed_time = time(nullptr); + info.start_time = std::chrono::system_clock::now(); + + if (isFinalStatus(status)) + info.end_time = info.start_time; std::lock_guard lock{infos_mutex}; @@ -590,7 +593,9 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status) auto old_status = info.status; info.status = status; - info.status_changed_time = time(nullptr); + + if (isFinalStatus(status)) + info.end_time = std::chrono::system_clock::now(); if (isErrorStatus(status)) { diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 4e2a98fd602..4b6308ff146 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -49,11 +49,13 @@ public: /// Status of backup or restore operation. BackupStatus status; - time_t status_changed_time; /// Set only if there was an error. std::exception_ptr exception; String error_message; + + std::chrono::system_clock::time_point start_time; + std::chrono::system_clock::time_point end_time; }; Info getInfo(const OperationID & id) const; diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index a2f935749e8..32f4fd51933 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -18,8 +18,9 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() {"id", std::make_shared()}, {"name", std::make_shared()}, {"status", std::make_shared(getBackupStatusEnumValues())}, - {"status_changed_time", std::make_shared()}, {"error", std::make_shared()}, + {"start_time", std::make_shared()}, + {"end_time", std::make_shared()}, }; return names_and_types; } @@ -31,16 +32,18 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con auto & column_id = assert_cast(*res_columns[column_index++]); auto & column_name = assert_cast(*res_columns[column_index++]); auto & column_status = assert_cast(*res_columns[column_index++]); - auto & column_status_changed_time = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); + auto & column_start_time = assert_cast(*res_columns[column_index++]); + auto & column_end_time = assert_cast(*res_columns[column_index++]); auto add_row = [&](const BackupsWorker::Info & info) { column_id.insertData(info.id.data(), info.id.size()); column_name.insertData(info.name.data(), info.name.size()); column_status.insertValue(static_cast(info.status)); - column_status_changed_time.insertValue(info.status_changed_time); column_error.insertData(info.error_message.data(), info.error_message.size()); + column_start_time.insertValue(std::chrono::system_clock::to_time_t(info.start_time)); + 
column_end_time.insertValue(std::chrono::system_clock::to_time_t(info.end_time)); }; for (const auto & entry : context->getBackupsWorker().getAllInfos()) From 1cfe0b10f7f518f4adaa08f78d451e862290cf70 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 21:35:37 +0200 Subject: [PATCH 216/227] Add columns "total_size" and "num_files" to system.backups --- src/Backups/BackupImpl.cpp | 29 ++++++++++++++++++++ src/Backups/BackupImpl.h | 8 ++++++ src/Backups/BackupsWorker.cpp | 26 ++++++++++++++++++ src/Backups/BackupsWorker.h | 7 +++++ src/Backups/IBackup.h | 6 ++++ src/Storages/System/StorageSystemBackups.cpp | 6 ++++ 6 files changed, 82 insertions(+) diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index b5d19ef8f1a..adc558b5c11 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -232,6 +232,18 @@ void BackupImpl::close() coordination.reset(); } +size_t BackupImpl::getTotalNumFiles() const +{ + std::lock_guard lock{mutex}; + return total_num_files; +} + +UInt64 BackupImpl::getTotalSize() const +{ + std::lock_guard lock{mutex}; + return total_size; +} + void BackupImpl::writeBackupMetadata() { assert(!is_internal_backup); @@ -284,6 +296,7 @@ void BackupImpl::writeBackupMetadata() if (info.pos_in_archive != static_cast(-1)) config->setUInt64(prefix + "pos_in_archive", info.pos_in_archive); } + updateTotals(info); ++index; } @@ -300,6 +313,8 @@ void BackupImpl::writeBackupMetadata() out = writer->writeFile(".backup"); out->write(str.data(), str.size()); out->finalize(); + + updateTotals(str.size()); } void BackupImpl::readBackupMetadata() @@ -320,6 +335,7 @@ void BackupImpl::readBackupMetadata() String str; readStringUntilEOF(str, *in); + updateTotals(str.size()); std::istringstream stream(str); // STYLE_CHECK_ALLOW_STD_STRING_STREAM Poco::AutoPtr config{new Poco::Util::XMLConfiguration()}; config->load(stream); @@ -376,6 +392,7 @@ void BackupImpl::readBackupMetadata() } coordination->addFileInfo(info); + updateTotals(info); } } } @@ -790,6 +807,18 @@ std::shared_ptr BackupImpl::getArchiveWriter(const String & suff return new_archive_writer; } +void BackupImpl::updateTotals(UInt64 file_size) +{ + total_size += file_size; + ++total_num_files; +} + +void BackupImpl::updateTotals(const FileInfo & info) +{ + if ((info.size > info.base_size) && (info.data_file_name.empty() || (info.data_file_name == info.file_name))) + updateTotals(info.size - info.base_size); +} + void BackupImpl::removeAllFilesAfterFailure() { if (is_internal_backup) diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 14b9d38835a..cb02fd33433 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -57,6 +57,8 @@ public: OpenMode getOpenMode() const override { return open_mode; } time_t getTimestamp() const override { return timestamp; } UUID getUUID() const override { return *uuid; } + UInt64 getTotalSize() const override; + size_t getTotalNumFiles() const override; Strings listFiles(const String & directory, bool recursive) const override; bool hasFiles(const String & directory) const override; bool fileExists(const String & file_name) const override; @@ -96,6 +98,10 @@ private: std::shared_ptr getArchiveReader(const String & suffix) const; std::shared_ptr getArchiveWriter(const String & suffix); + /// Updates `total_num_files` and `total_size`. 
+ void updateTotals(UInt64 file_size); + void updateTotals(const FileInfo & info); + const String backup_name; const ArchiveParams archive_params; const bool use_archives; @@ -108,6 +114,8 @@ private: mutable std::mutex mutex; std::optional uuid; time_t timestamp = 0; + size_t total_num_files = 0; + UInt64 total_size = 0; UInt64 version; std::optional base_backup_info; std::shared_ptr base_backup; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index eddc0e95564..1a6e0714424 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -316,16 +316,26 @@ void BackupsWorker::doBackup( backup_coordination->setStage(backup_settings.host_id, Stage::COMPLETED, ""); } + size_t num_files = 0; + UInt64 total_size = 0; + /// Finalize backup (write its metadata). if (!backup_settings.internal) + { backup->finalizeWriting(); + num_files = backup->getTotalNumFiles(); + total_size = backup->getTotalSize(); + } /// Close the backup. backup.reset(); LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); if (!backup_settings.internal) + { setStatus(backup_id, BackupStatus::BACKUP_COMPLETE); + setNumFilesAndTotalSize(backup_id, num_files, total_size); + } } catch (...) { @@ -452,6 +462,9 @@ void BackupsWorker::doRestore( backup_open_params.password = restore_settings.password; BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); + if (!restore_settings.internal) + setNumFilesAndTotalSize(restore_id, backup->getTotalNumFiles(), backup->getTotalSize()); + String current_database = context->getCurrentDatabase(); /// Checks access rights if this is ON CLUSTER query. @@ -608,6 +621,19 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status) } +void BackupsWorker::setNumFilesAndTotalSize(const String & id, size_t num_files, UInt64 total_size) +{ + std::lock_guard lock{infos_mutex}; + auto it = infos.find(id); + if (it == infos.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); + + auto & info = it->second; + info.num_files = num_files; + info.total_size = total_size; +} + + void BackupsWorker::wait(const OperationID & id, bool rethrow_exception) { std::unique_lock lock{infos_mutex}; diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 4b6308ff146..af2b3645f3c 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -50,6 +50,12 @@ public: /// Status of backup or restore operation. BackupStatus status; + /// Number of files in the backup (including backup's metadata; only unique files are counted). + size_t num_files = 0; + + /// Total size of files in the backup (including backup's metadata; only unique files are counted). + UInt64 total_size = 0; + /// Set only if there was an error. std::exception_ptr exception; String error_message; @@ -76,6 +82,7 @@ private: void addInfo(const OperationID & id, const String & name, BackupStatus status); void setStatus(const OperationID & id, BackupStatus status); + void setNumFilesAndTotalSize(const OperationID & id, size_t num_files, UInt64 total_size); ThreadPool backups_thread_pool; ThreadPool restores_thread_pool; diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h index 467c8fea4cd..dd19bd0da2a 100644 --- a/src/Backups/IBackup.h +++ b/src/Backups/IBackup.h @@ -36,6 +36,12 @@ public: /// Returns UUID of the backup. virtual UUID getUUID() const = 0; + /// Returns the total size of unique files in the backup. 
+ virtual UInt64 getTotalSize() const = 0; + + /// Returns the number of unique files in the backup. + virtual size_t getTotalNumFiles() const = 0; + /// Returns names of entries stored in a specified directory in the backup. /// If `directory` is empty or '/' the functions returns entries in the backup's root. virtual Strings listFiles(const String & directory, bool recursive = false) const = 0; diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index 32f4fd51933..ee737266ef6 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -18,6 +18,8 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() {"id", std::make_shared()}, {"name", std::make_shared()}, {"status", std::make_shared(getBackupStatusEnumValues())}, + {"num_files", std::make_shared()}, + {"total_size", std::make_shared()}, {"error", std::make_shared()}, {"start_time", std::make_shared()}, {"end_time", std::make_shared()}, @@ -32,6 +34,8 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con auto & column_id = assert_cast(*res_columns[column_index++]); auto & column_name = assert_cast(*res_columns[column_index++]); auto & column_status = assert_cast(*res_columns[column_index++]); + auto & column_num_files = assert_cast(*res_columns[column_index++]); + auto & column_total_size = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); auto & column_start_time = assert_cast(*res_columns[column_index++]); auto & column_end_time = assert_cast(*res_columns[column_index++]); @@ -41,6 +45,8 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con column_id.insertData(info.id.data(), info.id.size()); column_name.insertData(info.name.data(), info.name.size()); column_status.insertValue(static_cast(info.status)); + column_num_files.insertValue(info.num_files); + column_total_size.insertValue(info.total_size); column_error.insertData(info.error_message.data(), info.error_message.size()); column_start_time.insertValue(std::chrono::system_clock::to_time_t(info.start_time)); column_end_time.insertValue(std::chrono::system_clock::to_time_t(info.end_time)); From 51a2bf33e81f9fb7f9b4f3f96ac50ae2170a6bcc Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 21:46:22 +0200 Subject: [PATCH 217/227] Rename backup statuses to CREATING_BACKUP, BACKUP_CREATED, BACKUP_FAILED, RESTORING, RESTORED, RESTORE_FAILED. 
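For illustration only (this query is a usage sketch, not part of the diff below; the id 'first' is a placeholder operation id set via SETTINGS id=..., as in the integration tests changed by this patch), a status check against system.backups under the new names looks like:

    SELECT status, error FROM system.backups WHERE id = 'first'
    -- status is now one of: CREATING_BACKUP, BACKUP_CREATED, BACKUP_FAILED, RESTORING, RESTORED, RESTORE_FAILED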
--- src/Backups/BackupStatus.cpp | 16 +++++++-------- src/Backups/BackupStatus.h | 8 ++++---- src/Backups/BackupsWorker.cpp | 20 +++++++++---------- .../test_backup_restore_new/test.py | 16 +++++++-------- .../test_backup_restore_on_cluster/test.py | 18 ++++++++--------- .../test_concurrency.py | 12 +++++------ .../test_concurrent_backups_s3/test.py | 2 +- 7 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/Backups/BackupStatus.cpp b/src/Backups/BackupStatus.cpp index dd1f70dd20b..53adaa577ea 100644 --- a/src/Backups/BackupStatus.cpp +++ b/src/Backups/BackupStatus.cpp @@ -15,18 +15,18 @@ std::string_view toString(BackupStatus backup_status) { switch (backup_status) { - case BackupStatus::MAKING_BACKUP: - return "MAKING_BACKUP"; - case BackupStatus::BACKUP_COMPLETE: - return "BACKUP_COMPLETE"; - case BackupStatus::FAILED_TO_BACKUP: - return "FAILED_TO_BACKUP"; + case BackupStatus::CREATING_BACKUP: + return "CREATING_BACKUP"; + case BackupStatus::BACKUP_CREATED: + return "BACKUP_CREATED"; + case BackupStatus::BACKUP_FAILED: + return "BACKUP_FAILED"; case BackupStatus::RESTORING: return "RESTORING"; case BackupStatus::RESTORED: return "RESTORED"; - case BackupStatus::FAILED_TO_RESTORE: - return "FAILED_TO_RESTORE"; + case BackupStatus::RESTORE_FAILED: + return "RESTORE_FAILED"; default: break; } diff --git a/src/Backups/BackupStatus.h b/src/Backups/BackupStatus.h index 3382b6b55d1..0afe6efe899 100644 --- a/src/Backups/BackupStatus.h +++ b/src/Backups/BackupStatus.h @@ -9,14 +9,14 @@ namespace DB enum class BackupStatus { /// Statuses of making backups - MAKING_BACKUP, - BACKUP_COMPLETE, - FAILED_TO_BACKUP, + CREATING_BACKUP, + BACKUP_CREATED, + BACKUP_FAILED, /// Status of restoring RESTORING, RESTORED, - FAILED_TO_RESTORE, + RESTORE_FAILED, MAX, }; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 1a6e0714424..1d58bf87152 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -95,19 +95,19 @@ namespace bool isFinalStatus(BackupStatus status) { - return (status == BackupStatus::BACKUP_COMPLETE) || (status == BackupStatus::FAILED_TO_BACKUP) || (status == BackupStatus::RESTORED) - || (status == BackupStatus::FAILED_TO_RESTORE); + return (status == BackupStatus::BACKUP_CREATED) || (status == BackupStatus::BACKUP_FAILED) || (status == BackupStatus::RESTORED) + || (status == BackupStatus::RESTORE_FAILED); } bool isErrorStatus(BackupStatus status) { - return (status == BackupStatus::FAILED_TO_BACKUP) || (status == BackupStatus::FAILED_TO_RESTORE); + return (status == BackupStatus::BACKUP_FAILED) || (status == BackupStatus::RESTORE_FAILED); } /// Used to change num_active_backups. size_t getNumActiveBackupsChange(BackupStatus status) { - return status == BackupStatus::MAKING_BACKUP; + return status == BackupStatus::CREATING_BACKUP; } /// Used to change num_active_restores. @@ -164,7 +164,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); if (!backup_settings.internal) - addInfo(backup_id, backup_info.toString(), BackupStatus::MAKING_BACKUP); + addInfo(backup_id, backup_info.toString(), BackupStatus::CREATING_BACKUP); /// Prepare context to use. ContextPtr context_in_use = context; @@ -212,7 +212,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context { /// Something bad happened, the backup has not built. 
if (!backup_settings.internal) - setStatus(backup_id, BackupStatus::FAILED_TO_BACKUP); + setStatus(backup_id, BackupStatus::BACKUP_FAILED); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); throw; } @@ -333,7 +333,7 @@ void BackupsWorker::doBackup( LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); if (!backup_settings.internal) { - setStatus(backup_id, BackupStatus::BACKUP_COMPLETE); + setStatus(backup_id, BackupStatus::BACKUP_CREATED); setNumFilesAndTotalSize(backup_id, num_files, total_size); } } @@ -344,7 +344,7 @@ void BackupsWorker::doBackup( { tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_info.toString())); if (!backup_settings.internal) - setStatus(backup_id, BackupStatus::FAILED_TO_BACKUP); + setStatus(backup_id, BackupStatus::BACKUP_FAILED); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); } else @@ -427,7 +427,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt { /// Something bad happened, the backup has not built. if (!restore_settings.internal) - setStatus(restore_id, BackupStatus::FAILED_TO_RESTORE); + setStatus(restore_id, BackupStatus::RESTORE_FAILED); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); throw; } @@ -554,7 +554,7 @@ void BackupsWorker::doRestore( { tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString())); if (!restore_settings.internal) - setStatus(restore_id, BackupStatus::FAILED_TO_RESTORE); + setStatus(restore_id, BackupStatus::RESTORE_FAILED); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); } else diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 9bf3d2729c0..9c69b26efc8 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -305,12 +305,12 @@ def test_async(): f"BACKUP TABLE test.table TO {backup_name} ASYNC" ).split("\t") - assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" + assert status == "CREATING_BACKUP\n" or status == "BACKUP_CREATED\n" assert_eq_with_retry( instance, f"SELECT status, error FROM system.backups WHERE id='{id}'", - TSV([["BACKUP_COMPLETE", ""]]), + TSV([["BACKUP_CREATED", ""]]), ) instance.query("DROP TABLE test.table") @@ -347,17 +347,17 @@ def test_async_backups_to_same_destination(interface): assert_eq_with_retry( instance, - f"SELECT status FROM system.backups WHERE id IN ['{id1}', '{id2}'] AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups WHERE id IN ['{id1}', '{id2}'] AND status == 'CREATING_BACKUP'", "", ) assert instance.query( f"SELECT status, error FROM system.backups WHERE id='{id1}'" - ) == TSV([["BACKUP_COMPLETE", ""]]) + ) == TSV([["BACKUP_CREATED", ""]]) assert ( instance.query(f"SELECT status FROM system.backups WHERE id='{id2}'") - == "FAILED_TO_BACKUP\n" + == "BACKUP_FAILED\n" ) instance.query("DROP TABLE test.table") @@ -759,7 +759,7 @@ def test_system_users_async(): assert_eq_with_retry( instance, f"SELECT status, error FROM system.backups WHERE id='{id}'", - TSV([["BACKUP_COMPLETE", ""]]), + TSV([["BACKUP_CREATED", ""]]), ) instance.query("DROP USER u1") @@ -894,12 +894,12 @@ def test_operation_id(): ).split("\t") 
assert id == "first" - assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" + assert status == "CREATING_BACKUP\n" or status == "BACKUP_CREATED\n" assert_eq_with_retry( instance, f"SELECT status, error FROM system.backups WHERE id='first'", - TSV([["BACKUP_COMPLETE", ""]]), + TSV([["BACKUP_CREATED", ""]]), ) instance.query("DROP TABLE test.table") diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index e39c2a6bf94..ecf713f0f6f 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -400,12 +400,12 @@ def test_replicated_database_async(): f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} ASYNC" ).split("\t") - assert status == "MAKING_BACKUP\n" or status == "BACKUP_COMPLETE\n" + assert status == "CREATING_BACKUP\n" or status == "BACKUP_CREATED\n" assert_eq_with_retry( node1, f"SELECT status, error FROM system.backups WHERE id='{id}'", - TSV([["BACKUP_COMPLETE", ""]]), + TSV([["BACKUP_CREATED", ""]]), ) node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' NO DELAY") @@ -462,7 +462,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): for i in range(len(nodes)): assert_eq_with_retry( nodes[i], - f"SELECT status FROM system.backups WHERE id='{ids[i]}' AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups WHERE id='{ids[i]}' AND status == 'CREATING_BACKUP'", "", ) @@ -471,7 +471,7 @@ def test_async_backups_to_same_destination(interface, on_cluster): int( nodes[i] .query( - f"SELECT count() FROM system.backups WHERE id='{ids[i]}' AND status == 'BACKUP_COMPLETE'" + f"SELECT count() FROM system.backups WHERE id='{ids[i]}' AND status == 'BACKUP_CREATED'" ) .strip() ) @@ -817,7 +817,7 @@ def test_stop_other_host_during_backup(kill): assert_eq_with_retry( node1, - f"SELECT status FROM system.backups WHERE id='{id}' AND status == 'MAKING_BACKUP'", + f"SELECT status FROM system.backups WHERE id='{id}' AND status == 'CREATING_BACKUP'", "", retry_count=100, ) @@ -825,17 +825,17 @@ def test_stop_other_host_during_backup(kill): status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip() if kill: - assert status in ["BACKUP_COMPLETE", "FAILED_TO_BACKUP"] + assert status in ["BACKUP_CREATED", "BACKUP_FAILED"] else: - assert status == "BACKUP_COMPLETE" + assert status == "BACKUP_CREATED" node2.start_clickhouse() - if status == "BACKUP_COMPLETE": + if status == "BACKUP_CREATED": node1.query("DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5]) - elif status == "FAILED_TO_BACKUP": + elif status == "BACKUP_FAILED": assert not os.path.exists( os.path.join(get_path_to_backup(backup_name), ".backup") ) diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index d4f3b98ca90..2269ccda828 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -122,13 +122,13 @@ def test_concurrent_backups_on_same_node(): assert_eq_with_retry( node0, - f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND id IN {ids_list}", + f"SELECT status FROM system.backups WHERE status == 'CREATING_BACKUP' AND id IN {ids_list}", "", ) assert node0.query( 
f"SELECT status, error FROM system.backups WHERE id IN {ids_list}" - ) == TSV([["BACKUP_COMPLETE", ""]] * num_concurrent_backups) + ) == TSV([["BACKUP_CREATED", ""]] * num_concurrent_backups) for backup_name in backup_names: node0.query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") @@ -156,14 +156,14 @@ def test_concurrent_backups_on_different_nodes(): for i in range(num_concurrent_backups): assert_eq_with_retry( nodes[i], - f"SELECT status FROM system.backups WHERE status == 'MAKING_BACKUP' AND id = '{ids[i]}'", + f"SELECT status FROM system.backups WHERE status == 'CREATING_BACKUP' AND id = '{ids[i]}'", "", ) for i in range(num_concurrent_backups): assert nodes[i].query( f"SELECT status, error FROM system.backups WHERE id = '{ids[i]}'" - ) == TSV([["BACKUP_COMPLETE", ""]]) + ) == TSV([["BACKUP_CREATED", ""]]) for i in range(num_concurrent_backups): nodes[i].query(f"DROP TABLE tbl ON CLUSTER 'cluster' NO DELAY") @@ -244,14 +244,14 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): for node in nodes: assert_eq_with_retry( node, - f"SELECT status from system.backups WHERE id IN {ids_list} AND (status == 'MAKING_BACKUP')", + f"SELECT status from system.backups WHERE id IN {ids_list} AND (status == 'CREATING_BACKUP')", "", ) for node in nodes: assert_eq_with_retry( node, - f"SELECT status, error from system.backups WHERE id IN {ids_list} AND (status == 'FAILED_TO_BACKUP')", + f"SELECT status, error from system.backups WHERE id IN {ids_list} AND (status == 'BACKUP_FAILED')", "", ) diff --git a/tests/integration/test_concurrent_backups_s3/test.py b/tests/integration/test_concurrent_backups_s3/test.py index 608144843d9..d3ca1fd35a7 100644 --- a/tests/integration/test_concurrent_backups_s3/test.py +++ b/tests/integration/test_concurrent_backups_s3/test.py @@ -45,7 +45,7 @@ def test_concurrent_backups(start_cluster): assert_eq_with_retry( node, - "SELECT count() FROM system.backups WHERE status != 'BACKUP_COMPLETE' and status != 'FAILED_TO_BACKUP'", + "SELECT count() FROM system.backups WHERE status != 'BACKUP_CREATED' and status != 'BACKUP_FAILED'", "0", retry_count=100, ) From a79fc1723522ecc8c4c8d27fb3bad9331185f507 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 22:20:19 +0200 Subject: [PATCH 218/227] Add test for system.backups --- src/Storages/System/StorageSystemBackups.cpp | 4 +-- .../test_backup_restore_new/test.py | 34 +++++++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index ee737266ef6..0fe04f2a232 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -34,8 +34,8 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con auto & column_id = assert_cast(*res_columns[column_index++]); auto & column_name = assert_cast(*res_columns[column_index++]); auto & column_status = assert_cast(*res_columns[column_index++]); - auto & column_num_files = assert_cast(*res_columns[column_index++]); - auto & column_total_size = assert_cast(*res_columns[column_index++]); + auto & column_num_files = assert_cast(*res_columns[column_index++]); + auto & column_total_size = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); auto & column_start_time = assert_cast(*res_columns[column_index++]); auto & column_end_time = assert_cast(*res_columns[column_index++]); diff --git 
a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 9c69b26efc8..da6a6011614 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -888,11 +888,11 @@ def test_operation_id(): create_and_fill_table(n=30) backup_name = new_backup_name() - + [id, status] = instance.query( f"BACKUP TABLE test.table TO {backup_name} SETTINGS id='first' ASYNC" ).split("\t") - + assert id == "first" assert status == "CREATING_BACKUP\n" or status == "BACKUP_CREATED\n" @@ -923,11 +923,39 @@ def test_operation_id(): [id, status] = instance.query( f"RESTORE TABLE test.table FROM {backup_name} SETTINGS id='first'" ).split("\t") - + assert id == "first" assert status == "RESTORED\n" +def test_system_backups(): + create_and_fill_table(n=30) + + backup_name = new_backup_name() + + id = instance.query(f"BACKUP TABLE test.table TO {backup_name}").split("\t")[0] + + escaped_backup_name = backup_name.replace("'", "\\'") + assert instance.query( + f"SELECT name, status, num_files, total_size, error FROM system.backups WHERE id='{id}'" + ) == TSV([[escaped_backup_name, "BACKUP_CREATED", 56, 19656, ""]]) + + backup_name2 = new_backup_name() + expected_error = "Table test.non_existent_table was not found" + assert expected_error in instance.query_and_get_error( + f"BACKUP TABLE test.non_existent_table TO {backup_name2}" + ) + + escaped_backup_name2 = backup_name2.replace("'", "\\'") + assert instance.query( + f"SELECT status, num_files, total_size FROM system.backups WHERE name='{escaped_backup_name2}'" + ) == TSV([["BACKUP_FAILED", 0, 0]]) + + assert expected_error in instance.query( + f"SELECT error FROM system.backups WHERE name='{escaped_backup_name2}'" + ) + + def test_mutation(): create_and_fill_table(engine="MergeTree ORDER BY tuple()", n=5) From e602e012322a12b9ec09826015afa45d6571a2ec Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 26 Jul 2022 22:32:32 +0200 Subject: [PATCH 219/227] Fix style. 
--- src/Backups/BackupsWorker.cpp | 3 ++- src/Backups/BackupsWorker.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 1d58bf87152..bf28a3953a4 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -28,6 +28,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } using OperationID = BackupsWorker::OperationID; @@ -589,7 +590,7 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, BackupS } infos[id] = std::move(info); - + num_active_backups += getNumActiveBackupsChange(status); num_active_restores += getNumActiveRestoresChange(status); } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index af2b3645f3c..9031cb8d231 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -69,13 +69,13 @@ public: private: OperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context); - + void doBackup(const std::shared_ptr & backup_query, const OperationID & backup_id, BackupSettings backup_settings, const BackupInfo & backup_info, std::shared_ptr backup_coordination, const ContextPtr & context, ContextMutablePtr mutable_context, bool called_async); OperationID startRestoring(const ASTPtr & query, ContextMutablePtr context); - + void doRestore(const std::shared_ptr & restore_query, const OperationID & restore_id, const UUID & restore_uuid, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); From e62526720fc567c1d3f3a3dcebfd1ca3e1143dda Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 27 Jul 2022 07:51:30 +0000 Subject: [PATCH 220/227] Address PR comments --- programs/keeper-converter/KeeperConverter.cpp | 2 +- src/Coordination/KeeperSnapshotManager.cpp | 2 +- src/Coordination/KeeperStorage.cpp | 69 ++++++++----------- 3 files changed, 29 insertions(+), 44 deletions(-) diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index 42e0894257a..7d25c1d5017 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -42,7 +42,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) auto keeper_context = std::make_shared(); keeper_context->digest_enabled = true; - DB::KeeperStorage storage(500, "", keeper_context, false); + DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false); DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as(), logger); storage.initializeSystemNodes(); diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 90281e6fc5a..0057fd7e96e 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -656,7 +656,7 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff compressed_reader = std::make_unique(*reader); SnapshotDeserializationResult result; - result.storage = std::make_unique(storage_tick_time, superdigest, keeper_context, false); + result.storage = std::make_unique(storage_tick_time, superdigest, keeper_context, /* initialize_system_nodes */ false); KeeperStorageSnapshot::deserialize(result, *compressed_reader, keeper_context); result.storage->initializeSystemNodes(); return result; 
diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 3956bb0e930..9c6f54dc5bf 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -714,21 +714,34 @@ struct KeeperStorageSyncRequestProcessor final : public KeeperStorageRequestProc namespace { - Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_local) +Coordination::ACLs getNodeACLs(KeeperStorage & storage, StringRef path, bool is_local) +{ + if (is_local) { - if (is_local) - { - auto node_it = storage.container.find(path); - if (node_it == storage.container.end()) - return {}; + auto node_it = storage.container.find(path); + if (node_it == storage.container.end()) + return {}; - return storage.acl_map.convertNumber(node_it->value.acl_id); - } - - return storage.uncommitted_state.getACLs(path); + return storage.acl_map.convertNumber(node_it->value.acl_id); } + return storage.uncommitted_state.getACLs(path); } + +void handleSystemNodeModification(const KeeperContext & keeper_context, std::string_view error_msg) +{ + if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "{}. Ignoring it can lead to data loss. " + "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", + error_msg); + + LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); +} + +} + bool KeeperStorage::checkACL(StringRef path, int32_t permission, int64_t session_id, bool is_local) { const auto node_acls = getNodeACLs(*this, path, is_local); @@ -812,14 +825,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr { auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created); - if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "{}. Ignoring it can lead to data loss. " - "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", - error_msg); - - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); + handleSystemNodeModification(keeper_context, error_msg); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } @@ -989,14 +995,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr { auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path); - if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "{}. Ignoring it can lead to data loss. 
" - "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", - error_msg); - - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); + handleSystemNodeModification(keeper_context, error_msg); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } @@ -1150,14 +1149,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce { auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); - if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "{}. Ignoring it can lead to data loss. " - "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", - error_msg); - - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); + handleSystemNodeModification(keeper_context, error_msg); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } @@ -1426,14 +1418,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr { auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); - if (keeper_context.server_state == KeeperContext::Phase::INIT && !keeper_context.ignore_system_path_on_startup) - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "{}. Ignoring it can lead to data loss. " - "If you still want to ignore it, you can set 'keeper_server.ignore_system_path_on_startup' to true.", - error_msg); - - LOG_ERROR(&Poco::Logger::get("KeeperStorage"), fmt::runtime(error_msg)); + handleSystemNodeModification(keeper_context, error_msg); return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; } From 794eeb5d51797982028bea082dfbb59f887c77b9 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 27 Jul 2022 10:36:56 +0200 Subject: [PATCH 221/227] Split "total_size" to "uncompressed_size" and "compressed_size". --- src/Backups/BackupIO.h | 7 +- src/Backups/BackupIO_Disk.cpp | 7 +- src/Backups/BackupIO_Disk.h | 3 +- src/Backups/BackupIO_File.cpp | 7 +- src/Backups/BackupIO_File.h | 3 +- src/Backups/BackupImpl.cpp | 75 ++++++++++++------- src/Backups/BackupImpl.h | 20 +++-- src/Backups/BackupsWorker.cpp | 17 +++-- src/Backups/BackupsWorker.h | 9 ++- src/Backups/IBackup.h | 11 ++- src/Storages/System/StorageSystemBackups.cpp | 9 ++- .../test_backup_restore_new/test.py | 49 ++++++++---- 12 files changed, 148 insertions(+), 69 deletions(-) diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index 433e81a70a2..389df97502a 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -8,21 +8,22 @@ class SeekableReadBuffer; class WriteBuffer; /// Represents operations of loading from disk or downloading for reading a backup. -class IBackupReader /// BackupReaderFile, BackupReaderDisk, BackupReaderS3 +class IBackupReader /// BackupReaderFile, BackupReaderDisk { public: virtual ~IBackupReader() = default; virtual bool fileExists(const String & file_name) = 0; - virtual size_t getFileSize(const String & file_name) = 0; + virtual UInt64 getFileSize(const String & file_name) = 0; virtual std::unique_ptr readFile(const String & file_name) = 0; }; /// Represents operations of storing to disk or uploading for writing a backup. 
-class IBackupWriter /// BackupWriterFile, BackupWriterDisk, BackupWriterS3 +class IBackupWriter /// BackupWriterFile, BackupWriterDisk { public: virtual ~IBackupWriter() = default; virtual bool fileExists(const String & file_name) = 0; + virtual UInt64 getFileSize(const String & file_name) = 0; virtual bool fileContentsEqual(const String & file_name, const String & expected_file_contents) = 0; virtual std::unique_ptr writeFile(const String & file_name) = 0; virtual void removeFiles(const Strings & file_names) = 0; diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index 537bc667cd4..22a86825387 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -17,7 +17,7 @@ bool BackupReaderDisk::fileExists(const String & file_name) return disk->exists(path / file_name); } -size_t BackupReaderDisk::getFileSize(const String & file_name) +UInt64 BackupReaderDisk::getFileSize(const String & file_name) { return disk->getFileSize(path / file_name); } @@ -38,6 +38,11 @@ bool BackupWriterDisk::fileExists(const String & file_name) return disk->exists(path / file_name); } +UInt64 BackupWriterDisk::getFileSize(const String & file_name) +{ + return disk->getFileSize(path / file_name); +} + bool BackupWriterDisk::fileContentsEqual(const String & file_name, const String & expected_file_contents) { if (!disk->exists(path / file_name)) diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index 8ba99470938..53412e6d219 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -15,7 +15,7 @@ public: ~BackupReaderDisk() override; bool fileExists(const String & file_name) override; - size_t getFileSize(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; std::unique_ptr readFile(const String & file_name) override; private: @@ -30,6 +30,7 @@ public: ~BackupWriterDisk() override; bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 774d493ee38..7c08c150474 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -18,7 +18,7 @@ bool BackupReaderFile::fileExists(const String & file_name) return fs::exists(path / file_name); } -size_t BackupReaderFile::getFileSize(const String & file_name) +UInt64 BackupReaderFile::getFileSize(const String & file_name) { return fs::file_size(path / file_name); } @@ -39,6 +39,11 @@ bool BackupWriterFile::fileExists(const String & file_name) return fs::exists(path / file_name); } +UInt64 BackupWriterFile::getFileSize(const String & file_name) +{ + return fs::file_size(path / file_name); +} + bool BackupWriterFile::fileContentsEqual(const String & file_name, const String & expected_file_contents) { if (!fs::exists(path / file_name)) diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index aebf2bdab73..5d37408e6d8 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -13,7 +13,7 @@ public: ~BackupReaderFile() override; bool fileExists(const String & file_name) override; - size_t getFileSize(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; std::unique_ptr readFile(const String & file_name) override; private: @@ 
-27,6 +27,7 @@ public: ~BackupWriterFile() override; bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; bool fileContentsEqual(const String & file_name, const String & expected_file_contents) override; std::unique_ptr writeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index adc558b5c11..263aab2bd50 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -219,10 +219,7 @@ void BackupImpl::open(const ContextPtr & context) void BackupImpl::close() { std::lock_guard lock{mutex}; - - archive_readers.clear(); - for (auto & archive_writer : archive_writers) - archive_writer = {"", nullptr}; + closeArchives(); if (!is_internal_backup && writer && !writing_finalized) removeAllFilesAfterFailure(); @@ -232,16 +229,29 @@ void BackupImpl::close() coordination.reset(); } -size_t BackupImpl::getTotalNumFiles() const +void BackupImpl::closeArchives() { - std::lock_guard lock{mutex}; - return total_num_files; + archive_readers.clear(); + for (auto & archive_writer : archive_writers) + archive_writer = {"", nullptr}; } -UInt64 BackupImpl::getTotalSize() const +size_t BackupImpl::getNumFiles() const { std::lock_guard lock{mutex}; - return total_size; + return num_files; +} + +UInt64 BackupImpl::getUncompressedSize() const +{ + std::lock_guard lock{mutex}; + return uncompressed_size; +} + +UInt64 BackupImpl::getCompressedSize() const +{ + std::lock_guard lock{mutex}; + return compressed_size; } void BackupImpl::writeBackupMetadata() @@ -296,7 +306,7 @@ void BackupImpl::writeBackupMetadata() if (info.pos_in_archive != static_cast(-1)) config->setUInt64(prefix + "pos_in_archive", info.pos_in_archive); } - updateTotals(info); + increaseUncompressedSize(info); ++index; } @@ -314,7 +324,7 @@ void BackupImpl::writeBackupMetadata() out->write(str.data(), str.size()); out->finalize(); - updateTotals(str.size()); + increaseUncompressedSize(str.size()); } void BackupImpl::readBackupMetadata() @@ -324,6 +334,7 @@ void BackupImpl::readBackupMetadata() { if (!reader->fileExists(archive_params.archive_name)) throw Exception(ErrorCodes::BACKUP_NOT_FOUND, "Backup {} not found", backup_name); + setCompressedSize(); in = getArchiveReader("")->readFile(".backup"); } else @@ -335,7 +346,7 @@ void BackupImpl::readBackupMetadata() String str; readStringUntilEOF(str, *in); - updateTotals(str.size()); + increaseUncompressedSize(str.size()); std::istringstream stream(str); // STYLE_CHECK_ALLOW_STD_STRING_STREAM Poco::AutoPtr config{new Poco::Util::XMLConfiguration()}; config->load(stream); @@ -392,9 +403,12 @@ void BackupImpl::readBackupMetadata() } coordination->addFileInfo(info); - updateTotals(info); + increaseUncompressedSize(info); } } + + if (!use_archives) + setCompressedSize(); } void BackupImpl::checkBackupDoesntExist() const @@ -761,6 +775,8 @@ void BackupImpl::finalizeWriting() { LOG_TRACE(log, "Finalizing backup {}", backup_name); writeBackupMetadata(); + closeArchives(); + setCompressedSize(); removeLockFile(); LOG_TRACE(log, "Finalized backup {}", backup_name); } @@ -769,12 +785,32 @@ void BackupImpl::finalizeWriting() } +void BackupImpl::increaseUncompressedSize(UInt64 file_size) +{ + uncompressed_size += file_size; + ++num_files; +} + +void BackupImpl::increaseUncompressedSize(const FileInfo & info) +{ + if ((info.size > info.base_size) && (info.data_file_name.empty() || (info.data_file_name == info.file_name))) + 
increaseUncompressedSize(info.size - info.base_size); +} + +void BackupImpl::setCompressedSize() +{ + if (use_archives) + compressed_size = writer ? writer->getFileSize(archive_params.archive_name) : reader->getFileSize(archive_params.archive_name); + else + compressed_size = uncompressed_size; +} + + String BackupImpl::getArchiveNameWithSuffix(const String & suffix) const { return archive_params.archive_name + (suffix.empty() ? "" : ".") + suffix; } - std::shared_ptr BackupImpl::getArchiveReader(const String & suffix) const { auto it = archive_readers.find(suffix); @@ -807,17 +843,6 @@ std::shared_ptr BackupImpl::getArchiveWriter(const String & suff return new_archive_writer; } -void BackupImpl::updateTotals(UInt64 file_size) -{ - total_size += file_size; - ++total_num_files; -} - -void BackupImpl::updateTotals(const FileInfo & info) -{ - if ((info.size > info.base_size) && (info.data_file_name.empty() || (info.data_file_name == info.file_name))) - updateTotals(info.size - info.base_size); -} void BackupImpl::removeAllFilesAfterFailure() { diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index cb02fd33433..525aec2fcd6 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -57,8 +57,9 @@ public: OpenMode getOpenMode() const override { return open_mode; } time_t getTimestamp() const override { return timestamp; } UUID getUUID() const override { return *uuid; } - UInt64 getTotalSize() const override; - size_t getTotalNumFiles() const override; + size_t getNumFiles() const override; + UInt64 getUncompressedSize() const override; + UInt64 getCompressedSize() const override; Strings listFiles(const String & directory, bool recursive) const override; bool hasFiles(const String & directory) const override; bool fileExists(const String & file_name) const override; @@ -78,6 +79,7 @@ private: void open(const ContextPtr & context); void close(); + void closeArchives(); /// Writes the file ".backup" containing backup's metadata. void writeBackupMetadata(); @@ -98,9 +100,12 @@ private: std::shared_ptr getArchiveReader(const String & suffix) const; std::shared_ptr getArchiveWriter(const String & suffix); - /// Updates `total_num_files` and `total_size`. - void updateTotals(UInt64 file_size); - void updateTotals(const FileInfo & info); + /// Increases `uncompressed_size` by a specific value and `num_files` by 1. + void increaseUncompressedSize(UInt64 file_size); + void increaseUncompressedSize(const FileInfo & info); + + /// Calculates and sets `compressed_size`. + void setCompressedSize(); const String backup_name; const ArchiveParams archive_params; @@ -114,8 +119,9 @@ private: mutable std::mutex mutex; std::optional uuid; time_t timestamp = 0; - size_t total_num_files = 0; - UInt64 total_size = 0; + size_t num_files = 0; + UInt64 uncompressed_size = 0; + UInt64 compressed_size = 0; UInt64 version; std::optional base_backup_info; std::shared_ptr base_backup; diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index bf28a3953a4..e2de4763f7c 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -318,14 +318,16 @@ void BackupsWorker::doBackup( } size_t num_files = 0; - UInt64 total_size = 0; + UInt64 uncompressed_size = 0; + UInt64 compressed_size = 0; /// Finalize backup (write its metadata). 
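The new counters follow two rules: increaseUncompressedSize() counts only bytes that are not already covered by the base backup, and only for entries that carry their own data, while setCompressedSize() takes the archive size for archive backups and falls back to the uncompressed size for folder backups. The following self-contained sketch illustrates that accounting; BackupSizes and the trimmed-down FileInfo are stand-ins, not the real classes.

// Minimal sketch of the size accounting introduced above (not BackupImpl itself;
// FileInfo here is a simplified stand-in with only the fields the rule needs).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>

struct FileInfo
{
    std::string file_name;
    std::string data_file_name;  /// empty or equal to file_name when the entry owns its data
    std::uint64_t size = 0;
    std::uint64_t base_size = 0; /// bytes already present in the base backup
};

struct BackupSizes
{
    std::size_t num_files = 0;
    std::uint64_t uncompressed_size = 0;
    std::uint64_t compressed_size = 0;

    void increaseUncompressedSize(std::uint64_t bytes)
    {
        uncompressed_size += bytes;
        ++num_files;
    }

    /// Count only bytes not taken from the base backup, and only for entries
    /// that actually carry their own data.
    void increaseUncompressedSize(const FileInfo & info)
    {
        if (info.size > info.base_size && (info.data_file_name.empty() || info.data_file_name == info.file_name))
            increaseUncompressedSize(info.size - info.base_size);
    }

    /// For archive backups the compressed size is the archive file size;
    /// for folder backups it is simply the uncompressed size.
    void setCompressedSize(std::optional<std::uint64_t> archive_size)
    {
        compressed_size = archive_size ? *archive_size : uncompressed_size;
    }
};

int main()
{
    BackupSizes sizes;
    sizes.increaseUncompressedSize(FileInfo{"a.bin", "", 1000, 0});
    sizes.increaseUncompressedSize(FileInfo{"b.bin", "", 1000, 600}); /// only 400 new bytes
    sizes.setCompressedSize(std::nullopt);                            /// folder backup
    std::cout << sizes.num_files << " files, " << sizes.uncompressed_size << " bytes uncompressed\n";
}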
if (!backup_settings.internal) { backup->finalizeWriting(); - num_files = backup->getTotalNumFiles(); - total_size = backup->getTotalSize(); + num_files = backup->getNumFiles(); + uncompressed_size = backup->getUncompressedSize(); + compressed_size = backup->getCompressedSize(); } /// Close the backup. @@ -335,7 +337,7 @@ void BackupsWorker::doBackup( if (!backup_settings.internal) { setStatus(backup_id, BackupStatus::BACKUP_CREATED); - setNumFilesAndTotalSize(backup_id, num_files, total_size); + setNumFilesAndSize(backup_id, num_files, uncompressed_size, compressed_size); } } catch (...) @@ -464,7 +466,7 @@ void BackupsWorker::doRestore( BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); if (!restore_settings.internal) - setNumFilesAndTotalSize(restore_id, backup->getTotalNumFiles(), backup->getTotalSize()); + setNumFilesAndSize(restore_id, backup->getNumFiles(), backup->getUncompressedSize(), backup->getCompressedSize()); String current_database = context->getCurrentDatabase(); @@ -622,7 +624,7 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status) } -void BackupsWorker::setNumFilesAndTotalSize(const String & id, size_t num_files, UInt64 total_size) +void BackupsWorker::setNumFilesAndSize(const String & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size) { std::lock_guard lock{infos_mutex}; auto it = infos.find(id); @@ -631,7 +633,8 @@ void BackupsWorker::setNumFilesAndTotalSize(const String & id, size_t num_files, auto & info = it->second; info.num_files = num_files; - info.total_size = total_size; + info.uncompressed_size = uncompressed_size; + info.compressed_size = compressed_size; } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 9031cb8d231..9196ea45689 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -53,8 +53,11 @@ public: /// Number of files in the backup (including backup's metadata; only unique files are counted). size_t num_files = 0; - /// Total size of files in the backup (including backup's metadata; only unique files are counted). - UInt64 total_size = 0; + /// Size of all files in the backup (including backup's metadata; only unique files are counted). + UInt64 uncompressed_size = 0; + + /// Size of the backup if it's stored as an archive; or the same as `uncompressed_size` if the backup is stored as a folder. + UInt64 compressed_size = 0; /// Set only if there was an error. std::exception_ptr exception; @@ -82,7 +85,7 @@ private: void addInfo(const OperationID & id, const String & name, BackupStatus status); void setStatus(const OperationID & id, BackupStatus status); - void setNumFilesAndTotalSize(const OperationID & id, size_t num_files, UInt64 total_size); + void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size); ThreadPool backups_thread_pool; ThreadPool restores_thread_pool; diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h index dd19bd0da2a..a8ddbb5b64d 100644 --- a/src/Backups/IBackup.h +++ b/src/Backups/IBackup.h @@ -36,11 +36,14 @@ public: /// Returns UUID of the backup. virtual UUID getUUID() const = 0; - /// Returns the total size of unique files in the backup. - virtual UInt64 getTotalSize() const = 0; - /// Returns the number of unique files in the backup. - virtual size_t getTotalNumFiles() const = 0; + virtual size_t getNumFiles() const = 0; + + /// Returns the total size of unique files in the backup. 
+ virtual UInt64 getUncompressedSize() const = 0; + + /// Returns the compressed size of the backup. If the backup is not stored as an archive it returns the same as getUncompressedSize(). + virtual UInt64 getCompressedSize() const = 0; /// Returns names of entries stored in a specified directory in the backup. /// If `directory` is empty or '/' the functions returns entries in the backup's root. diff --git a/src/Storages/System/StorageSystemBackups.cpp b/src/Storages/System/StorageSystemBackups.cpp index 0fe04f2a232..e7146711c4a 100644 --- a/src/Storages/System/StorageSystemBackups.cpp +++ b/src/Storages/System/StorageSystemBackups.cpp @@ -19,7 +19,8 @@ NamesAndTypesList StorageSystemBackups::getNamesAndTypes() {"name", std::make_shared()}, {"status", std::make_shared(getBackupStatusEnumValues())}, {"num_files", std::make_shared()}, - {"total_size", std::make_shared()}, + {"uncompressed_size", std::make_shared()}, + {"compressed_size", std::make_shared()}, {"error", std::make_shared()}, {"start_time", std::make_shared()}, {"end_time", std::make_shared()}, @@ -35,7 +36,8 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con auto & column_name = assert_cast(*res_columns[column_index++]); auto & column_status = assert_cast(*res_columns[column_index++]); auto & column_num_files = assert_cast(*res_columns[column_index++]); - auto & column_total_size = assert_cast(*res_columns[column_index++]); + auto & column_uncompressed_size = assert_cast(*res_columns[column_index++]); + auto & column_compressed_size = assert_cast(*res_columns[column_index++]); auto & column_error = assert_cast(*res_columns[column_index++]); auto & column_start_time = assert_cast(*res_columns[column_index++]); auto & column_end_time = assert_cast(*res_columns[column_index++]); @@ -46,7 +48,8 @@ void StorageSystemBackups::fillData(MutableColumns & res_columns, ContextPtr con column_name.insertData(info.name.data(), info.name.size()); column_status.insertValue(static_cast(info.status)); column_num_files.insertValue(info.num_files); - column_total_size.insertValue(info.total_size); + column_uncompressed_size.insertValue(info.uncompressed_size); + column_compressed_size.insertValue(info.compressed_size); column_error.insertData(info.error_message.data(), info.error_message.size()); column_start_time.insertValue(std::chrono::system_clock::to_time_t(info.start_time)); column_end_time.insertValue(std::chrono::system_clock::to_time_t(info.end_time)); diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index da6a6011614..fc8ca6ab0b7 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -935,26 +935,49 @@ def test_system_backups(): id = instance.query(f"BACKUP TABLE test.table TO {backup_name}").split("\t")[0] - escaped_backup_name = backup_name.replace("'", "\\'") - assert instance.query( - f"SELECT name, status, num_files, total_size, error FROM system.backups WHERE id='{id}'" - ) == TSV([[escaped_backup_name, "BACKUP_CREATED", 56, 19656, ""]]) + [name, status, num_files, uncompressed_size, compressed_size, error] = ( + instance.query( + f"SELECT name, status, num_files, uncompressed_size, compressed_size, error FROM system.backups WHERE id='{id}'" + ) + .strip("\n") + .split("\t") + ) - backup_name2 = new_backup_name() + escaped_backup_name = backup_name.replace("'", "\\'") + num_files = int(num_files) + compressed_size = int(compressed_size) + uncompressed_size = 
int(uncompressed_size) + assert name == escaped_backup_name + assert status == "BACKUP_CREATED" + assert num_files > 1 + assert uncompressed_size > 1 + assert compressed_size == uncompressed_size + assert error == "" + + backup_name = new_backup_name() expected_error = "Table test.non_existent_table was not found" assert expected_error in instance.query_and_get_error( - f"BACKUP TABLE test.non_existent_table TO {backup_name2}" + f"BACKUP TABLE test.non_existent_table TO {backup_name}" ) - escaped_backup_name2 = backup_name2.replace("'", "\\'") - assert instance.query( - f"SELECT status, num_files, total_size FROM system.backups WHERE name='{escaped_backup_name2}'" - ) == TSV([["BACKUP_FAILED", 0, 0]]) - - assert expected_error in instance.query( - f"SELECT error FROM system.backups WHERE name='{escaped_backup_name2}'" + escaped_backup_name = backup_name.replace("'", "\\'") + [status, num_files, uncompressed_size, compressed_size, error] = ( + instance.query( + f"SELECT status, num_files, uncompressed_size, compressed_size, error FROM system.backups WHERE name='{escaped_backup_name}'" + ) + .strip("\n") + .split("\t") ) + num_files = int(num_files) + compressed_size = int(compressed_size) + uncompressed_size = int(uncompressed_size) + assert status == "BACKUP_FAILED" + assert num_files == 0 + assert uncompressed_size == 0 + assert compressed_size == 0 + assert expected_error in error + def test_mutation(): create_and_fill_table(engine="MergeTree ORDER BY tuple()", n=5) From 32e40e630e923e8a72df70d7069a857df04e363a Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 27 Jul 2022 12:24:21 +0200 Subject: [PATCH 222/227] Fix removing "internal" column. --- src/Backups/BackupsWorker.cpp | 88 +++++++++++++++++------------------ src/Backups/BackupsWorker.h | 11 +++-- 2 files changed, 51 insertions(+), 48 deletions(-) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index e2de4763f7c..add23411d8f 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -146,12 +146,16 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context if (!backup_settings.backup_uuid) backup_settings.backup_uuid = UUIDHelpers::generateV4(); - OperationID backup_id = backup_settings.id; - if (backup_id.empty()) + /// `backup_id` will be used as a key to the `infos` map, so it should be unique. + OperationID backup_id; + if (backup_settings.internal) + backup_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `backup_id` for internal backup to avoid collision if both internal and non-internal backups are on the same host + else if (!backup_settings.id.empty()) + backup_id = backup_settings.id; + else backup_id = toString(*backup_settings.backup_uuid); std::shared_ptr backup_coordination; - if (backup_settings.internal) { /// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination @@ -163,9 +167,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context try { auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); - - if (!backup_settings.internal) - addInfo(backup_id, backup_info.toString(), BackupStatus::CREATING_BACKUP); + addInfo(backup_id, backup_info.toString(), backup_settings.internal, BackupStatus::CREATING_BACKUP); /// Prepare context to use. ContextPtr context_in_use = context; @@ -212,8 +214,7 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context catch (...) 
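With the change above, every internal (per-replica) operation takes a freshly generated ID prefixed with "internal-", so it can never collide in the infos map with the user-visible operation that spawned it. A hedged standalone sketch of that selection logic follows; the real code uses UUIDHelpers::generateV4(), and the randomToken() helper here is only a stand-in so the example runs on its own.

// Sketch of the operation-ID selection described above; names are illustrative.
#include <iomanip>
#include <iostream>
#include <random>
#include <sstream>
#include <string>

std::string randomToken()
{
    std::random_device rd;
    std::mt19937_64 gen(rd());
    std::ostringstream out;
    out << std::hex << std::setw(16) << std::setfill('0') << gen();
    return out.str();
}

/// Internal operations always get a fresh ID so they never collide with the
/// user-visible operation that spawned them; otherwise an explicitly requested
/// ID wins, and a generated one is the fallback.
std::string chooseOperationId(bool internal, const std::string & requested_id)
{
    if (internal)
        return "internal-" + randomToken();
    if (!requested_id.empty())
        return requested_id;
    return randomToken();
}

int main()
{
    std::cout << chooseOperationId(false, "my_backup") << '\n';
    std::cout << chooseOperationId(true, "my_backup") << '\n';
}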
{ /// Something bad happened, the backup has not built. - if (!backup_settings.internal) - setStatus(backup_id, BackupStatus::BACKUP_FAILED); + setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); throw; } @@ -334,11 +335,8 @@ void BackupsWorker::doBackup( backup.reset(); LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_info.toString()); - if (!backup_settings.internal) - { - setStatus(backup_id, BackupStatus::BACKUP_CREATED); - setNumFilesAndSize(backup_id, num_files, uncompressed_size, compressed_size); - } + setStatus(backup_id, BackupStatus::BACKUP_CREATED); + setNumFilesAndSize(backup_id, num_files, uncompressed_size, compressed_size); } catch (...) { @@ -346,8 +344,7 @@ void BackupsWorker::doBackup( if (called_async) { tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings.internal ? "internal backup" : "backup"), backup_info.toString())); - if (!backup_settings.internal) - setStatus(backup_id, BackupStatus::BACKUP_FAILED); + setStatusSafe(backup_id, BackupStatus::BACKUP_FAILED); sendCurrentExceptionToCoordination(backup_coordination, backup_settings.host_id); } else @@ -364,14 +361,16 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt auto restore_query = std::static_pointer_cast(query->clone()); auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); - UUID restore_uuid = UUIDHelpers::generateV4(); - - OperationID restore_id = restore_settings.id; - if (restore_id.empty()) - restore_id = toString(restore_uuid); + /// `restore_id` will be used as a key to the `infos` map, so it should be unique. + OperationID restore_id; + if (restore_settings.internal) + restore_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `restore_id` for internal restore to avoid collision if both internal and non-internal restores are on the same host + else if (!restore_settings.id.empty()) + restore_id = restore_settings.id; + else + restore_id = toString(UUIDHelpers::generateV4()); std::shared_ptr restore_coordination; - if (restore_settings.internal) { /// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination @@ -383,8 +382,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt try { auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); - if (!restore_settings.internal) - addInfo(restore_id, backup_info.toString(), BackupStatus::RESTORING); + addInfo(restore_id, backup_info.toString(), restore_settings.internal, BackupStatus::RESTORING); /// Prepare context to use. 
ContextMutablePtr context_in_use = context; @@ -399,11 +397,10 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt if (restore_settings.async) { backups_thread_pool.scheduleOrThrowOnError( - [this, restore_query, restore_id, restore_uuid, restore_settings, backup_info, restore_coordination, context_in_use] { + [this, restore_query, restore_id, restore_settings, backup_info, restore_coordination, context_in_use] { doRestore( restore_query, restore_id, - restore_uuid, restore_settings, backup_info, restore_coordination, @@ -416,7 +413,6 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt doRestore( restore_query, restore_id, - restore_uuid, restore_settings, backup_info, restore_coordination, @@ -429,8 +425,7 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt catch (...) { /// Something bad happened, the backup has not built. - if (!restore_settings.internal) - setStatus(restore_id, BackupStatus::RESTORE_FAILED); + setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); throw; } @@ -440,7 +435,6 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt void BackupsWorker::doRestore( const std::shared_ptr & restore_query, const OperationID & restore_id, - const UUID & restore_uuid, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, @@ -465,8 +459,7 @@ void BackupsWorker::doRestore( backup_open_params.password = restore_settings.password; BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); - if (!restore_settings.internal) - setNumFilesAndSize(restore_id, backup->getNumFiles(), backup->getUncompressedSize(), backup->getCompressedSize()); + setNumFilesAndSize(restore_id, backup->getNumFiles(), backup->getUncompressedSize(), backup->getCompressedSize()); String current_database = context->getCurrentDatabase(); @@ -500,7 +493,7 @@ void BackupsWorker::doRestore( if (on_cluster && restore_settings.coordination_zk_path.empty()) { String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(restore_uuid); + restore_settings.coordination_zk_path = root_zk_path + "/restore-" + toString(UUIDHelpers::generateV4()); } if (!restore_coordination) @@ -547,8 +540,7 @@ void BackupsWorker::doRestore( } LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_info.toString()); - if (!restore_settings.internal) - setStatus(restore_id, BackupStatus::RESTORED); + setStatus(restore_id, BackupStatus::RESTORED); } catch (...) { @@ -556,8 +548,7 @@ void BackupsWorker::doRestore( if (called_async) { tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings.internal ? 
"internal backup" : "backup"), backup_info.toString())); - if (!restore_settings.internal) - setStatus(restore_id, BackupStatus::RESTORE_FAILED); + setStatusSafe(restore_id, BackupStatus::RESTORE_FAILED); sendCurrentExceptionToCoordination(restore_coordination, restore_settings.host_id); } else @@ -569,11 +560,12 @@ void BackupsWorker::doRestore( } -void BackupsWorker::addInfo(const OperationID & id, const String & name, BackupStatus status) +void BackupsWorker::addInfo(const OperationID & id, const String & name, bool internal, BackupStatus status) { Info info; info.id = id; info.name = name; + info.internal = internal; info.status = status; info.start_time = std::chrono::system_clock::now(); @@ -588,7 +580,7 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, BackupS /// It's better not allow to overwrite the current status if it's in progress. auto current_status = it->second.status; if (!isFinalStatus(current_status)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot start a backup or restore: it's id='{}' is already in use", id); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot start a backup or restore: ID {} is already in use", id); } infos[id] = std::move(info); @@ -598,12 +590,17 @@ void BackupsWorker::addInfo(const OperationID & id, const String & name, BackupS } -void BackupsWorker::setStatus(const String & id, BackupStatus status) +void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw_if_error) { std::lock_guard lock{infos_mutex}; auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); + { + if (throw_if_error) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id); + else + return; + } auto & info = it->second; auto old_status = info.status; @@ -629,7 +626,7 @@ void BackupsWorker::setNumFilesAndSize(const String & id, size_t num_files, UInt std::lock_guard lock{infos_mutex}; auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id); auto & info = it->second; info.num_files = num_files; @@ -645,7 +642,7 @@ void BackupsWorker::wait(const OperationID & id, bool rethrow_exception) { auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id); const auto & info = it->second; auto current_status = info.status; if (rethrow_exception && isErrorStatus(current_status)) @@ -659,7 +656,7 @@ BackupsWorker::Info BackupsWorker::getInfo(const OperationID & id) const std::lock_guard lock{infos_mutex}; auto it = infos.find(id); if (it == infos.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup's id={}", id); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", id); return it->second; } @@ -668,7 +665,10 @@ std::vector BackupsWorker::getAllInfos() const std::vector res_infos; std::lock_guard lock{infos_mutex}; for (const auto & info : infos | boost::adaptors::map_values) - res_infos.push_back(info); + { + if (!info.internal) + res_infos.push_back(info); + } return res_infos; } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 9196ea45689..54b20c1df90 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -47,6 +47,9 @@ public: /// Backup's name, a string like "Disk('backups', 'my_backup')" String 
name; + /// This operation is internal and should not be shown in system.backups + bool internal = false; + /// Status of backup or restore operation. BackupStatus status; @@ -79,12 +82,12 @@ private: OperationID startRestoring(const ASTPtr & query, ContextMutablePtr context); - void doRestore(const std::shared_ptr & restore_query, const OperationID & restore_id, const UUID & restore_uuid, - RestoreSettings restore_settings, const BackupInfo & backup_info, + void doRestore(const std::shared_ptr & restore_query, const OperationID & restore_id, RestoreSettings restore_settings, const BackupInfo & backup_info, std::shared_ptr restore_coordination, ContextMutablePtr context, bool called_async); - void addInfo(const OperationID & id, const String & name, BackupStatus status); - void setStatus(const OperationID & id, BackupStatus status); + void addInfo(const OperationID & id, const String & name, bool internal, BackupStatus status); + void setStatus(const OperationID & id, BackupStatus status, bool throw_if_error = true); + void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); } void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size); ThreadPool backups_thread_pool; From 2fd1bf85482802fb7986b43f68086aab7a7b1ccc Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 27 Jul 2022 14:26:41 +0200 Subject: [PATCH 223/227] run tests with Replicated database in master --- .github/workflows/master.yml | 118 ++++++++++++++++++++++++----------- 1 file changed, 80 insertions(+), 38 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 162c03a37d5..fa046ed40c7 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -1126,6 +1126,84 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestReleaseDatabaseReplicated0: + needs: [BuilderDebRelease] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_database_replicated + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, DatabaseReplicated) + REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestReleaseDatabaseReplicated1: + needs: [BuilderDebRelease] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_database_replicated + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (release, DatabaseReplicated) + REPO_COPY=${{runner.temp}}/stateless_database_replicated/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + 
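Once the internal flag is stored on each Info record, system.backups can be filled from getAllInfos() without showing internal operations, and error paths can use setStatusSafe() so that an unknown ID no longer throws while unwinding. The simplified registry below sketches that behaviour; OperationRegistry is an illustrative stand-in, not BackupsWorker.

// Simplified sketch of the registry behaviour described above: internal entries
// are stored but hidden from the "show all" view, and status updates in error
// paths do not throw for unknown IDs.
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

struct Info
{
    std::string id;
    std::string name;
    bool internal = false;
    std::string status = "CREATING_BACKUP";
};

class OperationRegistry
{
public:
    void addInfo(Info info)
    {
        std::string id = info.id;
        infos[id] = std::move(info);
    }

    void setStatus(const std::string & id, const std::string & status, bool throw_if_error = true)
    {
        auto it = infos.find(id);
        if (it == infos.end())
        {
            if (throw_if_error)
                throw std::runtime_error("Unknown backup ID " + id);
            return; /// "safe" variant used in catch blocks
        }
        it->second.status = status;
    }

    std::vector<Info> getAllInfos() const
    {
        std::vector<Info> res;
        for (const auto & [id, info] : infos)
            if (!info.internal)
                res.push_back(info);
        return res;
    }

private:
    std::map<std::string, Info> infos;
};

int main()
{
    OperationRegistry registry;
    registry.addInfo({"backup-1", "Disk('backups', 'b1')", false});
    registry.addInfo({"internal-abc", "Disk('backups', 'b1')", true});
    registry.setStatus("missing-id", "BACKUP_FAILED", /*throw_if_error=*/false);
    std::cout << "visible operations: " << registry.getAllInfos().size() << '\n'; /// prints 1
}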
RUN_BY_HASH_TOTAL=2 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" FunctionalStatelessTestReleaseS3: needs: [BuilderDebRelease] runs-on: [self-hosted, func-tester] @@ -1706,43 +1784,6 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" - FunctionalStatefulTestReleaseDatabaseOrdinary: - needs: [BuilderDebRelease] - runs-on: [self-hosted, func-tester] - steps: - - name: Set envs - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/stateful_release_database_ordinary - REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (release, DatabaseOrdinary) - REPO_COPY=${{runner.temp}}/stateful_release_database_ordinary/ClickHouse - KILL_TIMEOUT=3600 - EOF - - name: Download json reports - uses: actions/download-artifact@v2 - with: - path: ${{ env.REPORTS_PATH }} - - name: Clear repository - run: | - sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - - name: Check out repository code - uses: actions/checkout@v2 - - name: Functional test - run: | - sudo rm -fr "$TEMP_PATH" - mkdir -p "$TEMP_PATH" - cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" - python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" - - name: Cleanup - if: always() - run: | - # shellcheck disable=SC2046 - docker kill $(docker ps -q) ||: - # shellcheck disable=SC2046 - docker rm -f $(docker ps -a -q) ||: - sudo rm -fr "$TEMP_PATH" FunctionalStatefulTestAarch64: needs: [BuilderDebAarch64] runs-on: [self-hosted, func-tester-aarch64] @@ -3063,6 +3104,8 @@ jobs: - FunctionalStatelessTestDebug2 - FunctionalStatelessTestRelease - FunctionalStatelessTestReleaseDatabaseOrdinary + - FunctionalStatelessTestReleaseDatabaseReplicated0 + - FunctionalStatelessTestReleaseDatabaseReplicated1 - FunctionalStatelessTestAarch64 - FunctionalStatelessTestAsan0 - FunctionalStatelessTestAsan1 @@ -3075,7 +3118,6 @@ jobs: - FunctionalStatelessTestUBsan - FunctionalStatefulTestDebug - FunctionalStatefulTestRelease - - FunctionalStatefulTestReleaseDatabaseOrdinary - FunctionalStatelessTestReleaseS3 - FunctionalStatefulTestAarch64 - FunctionalStatefulTestAsan From 05e4bd6c8b69e403fe94bb95ab3192bc62e953d6 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Wed, 27 Jul 2022 15:36:50 +0200 Subject: [PATCH 224/227] review fix --- src/Processors/Executors/ExecutorTasks.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Executors/ExecutorTasks.cpp b/src/Processors/Executors/ExecutorTasks.cpp index 3b5fdac3ee3..d5c2bfe7399 100644 --- a/src/Processors/Executors/ExecutorTasks.cpp +++ b/src/Processors/Executors/ExecutorTasks.cpp @@ -32,7 +32,7 @@ void ExecutorTasks::tryWakeUpAnyOtherThreadWithTasks(ExecutionThreadContext & se { if (!task_queue.empty() && !threads_queue.empty() && !finished) { - size_t next_thread = (self.thread_number + 1) % use_threads; + size_t next_thread = 
self.thread_number + 1 >= use_threads ? 0 : (self.thread_number + 1); auto thread_to_wake = task_queue.getAnyThreadWithTasks(next_thread); if (threads_queue.has(thread_to_wake)) From 8a3125f2d2803d94c647b8fdbe9bb640124baa99 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Wed, 27 Jul 2022 14:34:10 +0000 Subject: [PATCH 225/227] Fix integration test test_total_max_threads --- tests/integration/test_total_max_threads/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_total_max_threads/test.py index c5e96939f4f..010d1971463 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_total_max_threads/test.py @@ -59,7 +59,7 @@ def test_total_max_threads_defined_50(started_cluster): node2.query( "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_2'" ) - == "51\n" + == "52\n" ) @@ -72,7 +72,7 @@ def test_total_max_threads_defined_1(started_cluster): node3.query( "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_3'" ) - == "2\n" + == "3\n" ) From f35349eb08cbb191a193c789ba5e85bd38a60212 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 28 Jul 2022 07:46:20 +0000 Subject: [PATCH 226/227] Rename total_max_threads to concurrent_threads_soft_limit --- .../settings.md | 10 +++---- programs/server/Server.cpp | 14 +++++----- programs/server/config.xml | 4 +-- .../__init__.py | 0 .../configs/config_default.xml | 0 .../configs/config_defined_1.xml | 2 +- .../configs/config_defined_50.xml | 2 +- .../configs/config_limit_reached.xml | 2 +- .../configs/users.xml | 0 .../test.py | 26 +++++++++---------- 10 files changed, 30 insertions(+), 30 deletions(-) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/__init__.py (100%) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/configs/config_default.xml (100%) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/configs/config_defined_1.xml (68%) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/configs/config_defined_50.xml (67%) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/configs/config_limit_reached.xml (67%) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/configs/users.xml (100%) rename tests/integration/{test_total_max_threads => test_concurrent_threads_soft_limit}/test.py (81%) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 79beaa89320..1714251ac92 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -197,7 +197,7 @@ Default value: `480` (8 minute). Parameter of a task that cleans up garbage from `store/` directory. If some subdirectory is not used by clickhouse-server and this directory was not modified for last -`database_catalog_unused_dir_hide_timeout_sec` seconds, the task will "hide" this directory by +`database_catalog_unused_dir_hide_timeout_sec` seconds, the task will "hide" this directory by removing all access rights. 
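The ExecutorTasks change in PATCH 224 above replaces the modulo with an explicit wrap-around check when picking the next thread to wake. The patch does not state whether this avoids the integer division on a hot path or is purely for clarity; both forms choose the same thread whenever thread_number < use_threads, which the sketch below verifies.

// Standalone check that the two "next thread" formulations agree.
#include <cassert>
#include <cstddef>
#include <iostream>

std::size_t nextThreadModulo(std::size_t thread_number, std::size_t use_threads)
{
    return (thread_number + 1) % use_threads;
}

std::size_t nextThreadBranch(std::size_t thread_number, std::size_t use_threads)
{
    return thread_number + 1 >= use_threads ? 0 : thread_number + 1;
}

int main()
{
    const std::size_t use_threads = 8;
    for (std::size_t thread_number = 0; thread_number < use_threads; ++thread_number)
        assert(nextThreadModulo(thread_number, use_threads) == nextThreadBranch(thread_number, use_threads));
    std::cout << "both forms wrap 7 -> " << nextThreadBranch(7, use_threads) << '\n';
}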
It also works for directories that clickhouse-server does not expect to see inside `store/`. Zero means "immediately". @@ -206,10 +206,10 @@ Default value: `3600` (1 hour). ## database_catalog_unused_dir_rm_timeout_sec {#database_catalog_unused_dir_rm_timeout_sec} Parameter of a task that cleans up garbage from `store/` directory. -If some subdirectory is not used by clickhouse-server and it was previousely "hidden" -(see [database_catalog_unused_dir_hide_timeout_sec](../../operations/server-configuration-parameters/settings.md#database_catalog_unused_dir_hide_timeout_sec)) +If some subdirectory is not used by clickhouse-server and it was previousely "hidden" +(see [database_catalog_unused_dir_hide_timeout_sec](../../operations/server-configuration-parameters/settings.md#database_catalog_unused_dir_hide_timeout_sec)) and this directory was not modified for last -`database_catalog_unused_dir_rm_timeout_sec` seconds, the task will remove this directory. +`database_catalog_unused_dir_rm_timeout_sec` seconds, the task will remove this directory. It also works for directories that clickhouse-server does not expect to see inside `store/`. Zero means "never". @@ -731,7 +731,7 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa - [max_server_memory_usage](#max_server_memory_usage) -## total_max_threads {#total-max-threads} +## concurrent_threads_soft_limit {#concurrent_threads_soft_limit} The maximum number of query processing threads, excluding threads for retrieving data from remote servers, allowed to run all queries. This is not a hard limit. In case if the limit is reached the query will still get one thread to run. Possible values: diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 569c55711a7..9effc23e107 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1125,17 +1125,17 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); - if (config->has("total_max_threads")) + if (config->has("concurrent_threads_soft_limit")) { - auto total_max_threads = config->getInt("total_max_threads", 0); - if (total_max_threads == -1) + auto concurrent_threads_soft_limit = config->getInt("concurrent_threads_soft_limit", 0); + if (concurrent_threads_soft_limit == -1) { - // Based on tests total_max_threads has an optimal value when it's about 3 times of logical CPU cores + // Based on tests concurrent_threads_soft_limit has an optimal value when it's about 3 times of logical CPU cores constexpr size_t thread_factor = 3; - total_max_threads = std::thread::hardware_concurrency() * thread_factor; + concurrent_threads_soft_limit = std::thread::hardware_concurrency() * thread_factor; } - if (total_max_threads) - ConcurrencyControl::instance().setMaxConcurrency(total_max_threads); + if (concurrent_threads_soft_limit) + ConcurrencyControl::instance().setMaxConcurrency(concurrent_threads_soft_limit); else ConcurrencyControl::instance().setMaxConcurrency(ConcurrencyControl::Unlimited); } diff --git a/programs/server/config.xml b/programs/server/config.xml index cdd48e4e6d9..84930230397 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -274,7 +274,7 @@ For value equals to -1 this parameter is initialized by number of logical cores multiplies by 3. Which is a good heuristic for CPU-bound tasks. 
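In Server.cpp above, concurrent_threads_soft_limit is resolved before being handed to ConcurrencyControl: 0 keeps concurrency unlimited and -1 expands to three times the number of logical cores. The standalone sketch below shows that resolution; resolveConcurrentThreadsSoftLimit is an assumed name, and the real code passes the result to ConcurrencyControl::setMaxConcurrency.

// Sketch of the limit resolution; 0 means "unlimited", -1 means "3x logical cores".
#include <iostream>
#include <optional>
#include <string>
#include <thread>

std::optional<unsigned> resolveConcurrentThreadsSoftLimit(long long configured)
{
    if (configured == -1)
    {
        constexpr unsigned thread_factor = 3; /// heuristic from the patch comment
        return std::thread::hardware_concurrency() * thread_factor;
    }
    if (configured == 0)
        return std::nullopt; /// unlimited
    return static_cast<unsigned>(configured);
}

int main()
{
    auto limit = resolveConcurrentThreadsSoftLimit(-1);
    std::cout << "soft limit: " << (limit ? std::to_string(*limit) : "unlimited") << '\n';
}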
--> - 0 + 0 100 @@ -611,7 +611,7 @@ if this setting is true the user B will see all rows, and if this setting is false the user B will see no rows. By default this setting is false for compatibility with earlier access configurations. --> false - + false diff --git a/tests/integration/test_total_max_threads/__init__.py b/tests/integration/test_concurrent_threads_soft_limit/__init__.py similarity index 100% rename from tests/integration/test_total_max_threads/__init__.py rename to tests/integration/test_concurrent_threads_soft_limit/__init__.py diff --git a/tests/integration/test_total_max_threads/configs/config_default.xml b/tests/integration/test_concurrent_threads_soft_limit/configs/config_default.xml similarity index 100% rename from tests/integration/test_total_max_threads/configs/config_default.xml rename to tests/integration/test_concurrent_threads_soft_limit/configs/config_default.xml diff --git a/tests/integration/test_total_max_threads/configs/config_defined_1.xml b/tests/integration/test_concurrent_threads_soft_limit/configs/config_defined_1.xml similarity index 68% rename from tests/integration/test_total_max_threads/configs/config_defined_1.xml rename to tests/integration/test_concurrent_threads_soft_limit/configs/config_defined_1.xml index ff4aa98c3ab..026563ecd53 100644 --- a/tests/integration/test_total_max_threads/configs/config_defined_1.xml +++ b/tests/integration/test_concurrent_threads_soft_limit/configs/config_defined_1.xml @@ -1,6 +1,6 @@ - 1 + 1 system query_log
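As the documentation above notes, the limit is soft: a query that arrives after the limit is exhausted still gets one thread. The sketch below illustrates such an allocator with a plain counter; it is an illustration of the documented semantics, not the actual ConcurrencyControl implementation.

// Hedged sketch of the "soft" limit: grant at most what is left, never less than one.
#include <algorithm>
#include <cstddef>
#include <iostream>

class SoftThreadLimit
{
public:
    explicit SoftThreadLimit(std::size_t limit_) : limit(limit_) {}

    /// Grant up to `desired` threads, bounded by what is left under the limit,
    /// but always at least one so the query can still make progress.
    std::size_t allocate(std::size_t desired)
    {
        std::size_t available = limit > used ? limit - used : 0;
        std::size_t granted = std::max<std::size_t>(1, std::min(desired, available));
        used += granted;
        return granted;
    }

    void release(std::size_t granted) { used -= granted; }

private:
    std::size_t limit;
    std::size_t used = 0;
};

int main()
{
    SoftThreadLimit limit(10);
    std::cout << "first query:  " << limit.allocate(8) << " threads\n"; /// 8
    std::cout << "second query: " << limit.allocate(5) << " threads\n"; /// only 2 left
    std::cout << "third query:  " << limit.allocate(5) << " threads\n"; /// limit reached, still 1
}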
diff --git a/tests/integration/test_total_max_threads/configs/config_defined_50.xml b/tests/integration/test_concurrent_threads_soft_limit/configs/config_defined_50.xml
similarity index 67%
rename from tests/integration/test_total_max_threads/configs/config_defined_50.xml
rename to tests/integration/test_concurrent_threads_soft_limit/configs/config_defined_50.xml
index 09234c9924a..55f1bf32bf6 100644
--- a/tests/integration/test_total_max_threads/configs/config_defined_50.xml
+++ b/tests/integration/test_concurrent_threads_soft_limit/configs/config_defined_50.xml
@@ -1,6 +1,6 @@
-    <total_max_threads>50</total_max_threads>
+    <concurrent_threads_soft_limit>50</concurrent_threads_soft_limit>
     system
     query_log
diff --git a/tests/integration/test_total_max_threads/configs/config_limit_reached.xml b/tests/integration/test_concurrent_threads_soft_limit/configs/config_limit_reached.xml
similarity index 67%
rename from tests/integration/test_total_max_threads/configs/config_limit_reached.xml
rename to tests/integration/test_concurrent_threads_soft_limit/configs/config_limit_reached.xml
index 94afef2d6fb..c7d86765212 100644
--- a/tests/integration/test_total_max_threads/configs/config_limit_reached.xml
+++ b/tests/integration/test_concurrent_threads_soft_limit/configs/config_limit_reached.xml
@@ -1,6 +1,6 @@
-    <total_max_threads>10</total_max_threads>
+    <concurrent_threads_soft_limit>10</concurrent_threads_soft_limit>
     system
     query_log
diff --git a/tests/integration/test_total_max_threads/configs/users.xml b/tests/integration/test_concurrent_threads_soft_limit/configs/users.xml similarity index 100% rename from tests/integration/test_total_max_threads/configs/users.xml rename to tests/integration/test_concurrent_threads_soft_limit/configs/users.xml diff --git a/tests/integration/test_total_max_threads/test.py b/tests/integration/test_concurrent_threads_soft_limit/test.py similarity index 81% rename from tests/integration/test_total_max_threads/test.py rename to tests/integration/test_concurrent_threads_soft_limit/test.py index 010d1971463..5fee8b58eec 100644 --- a/tests/integration/test_total_max_threads/test.py +++ b/tests/integration/test_concurrent_threads_soft_limit/test.py @@ -37,49 +37,49 @@ def started_cluster(): cluster.shutdown() -def test_total_max_threads_default(started_cluster): +def test_concurrent_threads_soft_limit_default(started_cluster): node1.query( - "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_1" + "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_concurrent_threads_soft_limit_1" ) node1.query("SYSTEM FLUSH LOGS") assert ( node1.query( - "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_1'" + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_concurrent_threads_soft_limit_1'" ) == "102\n" ) -def test_total_max_threads_defined_50(started_cluster): +def test_concurrent_threads_soft_limit_defined_50(started_cluster): node2.query( - "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_2" + "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_concurrent_threads_soft_limit_2" ) node2.query("SYSTEM FLUSH LOGS") assert ( node2.query( - "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_2'" + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_concurrent_threads_soft_limit_2'" ) == "52\n" ) -def test_total_max_threads_defined_1(started_cluster): +def test_concurrent_threads_soft_limit_defined_1(started_cluster): node3.query( - "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_total_max_threads_3" + "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_concurrent_threads_soft_limit_3" ) node3.query("SYSTEM FLUSH LOGS") assert ( node3.query( - "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_3'" + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_concurrent_threads_soft_limit_3'" ) == "3\n" ) -# In config_limit_reached.xml there is total_max_threads=10 +# In config_limit_reached.xml there is concurrent_threads_soft_limit=10 # Background query starts in a separate thread to reach this limit. 
# When this limit is reached the foreground query gets less than 5 queries despite the fact that it has settings max_threads=5 -def test_total_max_threads_limit_reached(started_cluster): +def test_concurrent_threads_soft_limit_limit_reached(started_cluster): def background_query(): try: node4.query( @@ -107,12 +107,12 @@ def test_total_max_threads_limit_reached(started_cluster): node4.query( "SELECT count(*) FROM numbers_mt(10000000) settings max_threads=5", - query_id="test_total_max_threads_4", + query_id="test_concurrent_threads_soft_limit_4", ) node4.query("SYSTEM FLUSH LOGS") s_count = node4.query( - "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_total_max_threads_4'" + "select length(thread_ids) from system.query_log where current_database = currentDatabase() and type = 'QueryFinish' and query_id = 'test_concurrent_threads_soft_limit_4'" ).strip() if s_count: count = int(s_count) From 9186b254271374bf9df87abfec1223ca9f27e4e2 Mon Sep 17 00:00:00 2001 From: Roman Vasin Date: Thu, 28 Jul 2022 08:10:44 +0000 Subject: [PATCH 227/227] Fix Style Check in test_concurrent_threads_soft_limit --- .../test_concurrent_threads_soft_limit/test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_concurrent_threads_soft_limit/test.py b/tests/integration/test_concurrent_threads_soft_limit/test.py index 5fee8b58eec..2f76f44ddc2 100644 --- a/tests/integration/test_concurrent_threads_soft_limit/test.py +++ b/tests/integration/test_concurrent_threads_soft_limit/test.py @@ -39,7 +39,8 @@ def started_cluster(): def test_concurrent_threads_soft_limit_default(started_cluster): node1.query( - "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_concurrent_threads_soft_limit_1" + "SELECT count(*) FROM numbers_mt(10000000)", + query_id="test_concurrent_threads_soft_limit_1", ) node1.query("SYSTEM FLUSH LOGS") assert ( @@ -52,7 +53,8 @@ def test_concurrent_threads_soft_limit_default(started_cluster): def test_concurrent_threads_soft_limit_defined_50(started_cluster): node2.query( - "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_concurrent_threads_soft_limit_2" + "SELECT count(*) FROM numbers_mt(10000000)", + query_id="test_concurrent_threads_soft_limit_2", ) node2.query("SYSTEM FLUSH LOGS") assert ( @@ -65,7 +67,8 @@ def test_concurrent_threads_soft_limit_defined_50(started_cluster): def test_concurrent_threads_soft_limit_defined_1(started_cluster): node3.query( - "SELECT count(*) FROM numbers_mt(10000000)", query_id="test_concurrent_threads_soft_limit_3" + "SELECT count(*) FROM numbers_mt(10000000)", + query_id="test_concurrent_threads_soft_limit_3", ) node3.query("SYSTEM FLUSH LOGS") assert (