From a43d3a6902eb0f647f40832778473f4723de2d4a Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov
Date: Fri, 5 Jul 2019 19:50:44 +0300
Subject: [PATCH 01/14] Report memory usage in performance tests.

---
 .../performance-test/ConfigPreprocessor.cpp   | 11 ++--
 .../performance-test/PerformanceTestSuite.cpp |  7 ++-
 .../performance-test/ReportBuilder.cpp        |  3 ++
 dbms/programs/performance-test/TestStats.h    |  4 ++
 .../performance-test/executeQuery.cpp         | 51 ++++++++++++++++++-
 .../DataStreams/RemoteBlockInputStream.cpp    |  2 +-
 dbms/src/DataStreams/RemoteBlockInputStream.h | 15 ++++++
 7 files changed, 84 insertions(+), 9 deletions(-)

diff --git a/dbms/programs/performance-test/ConfigPreprocessor.cpp b/dbms/programs/performance-test/ConfigPreprocessor.cpp
index 3ea095a5175..ab276049682 100644
--- a/dbms/programs/performance-test/ConfigPreprocessor.cpp
+++ b/dbms/programs/performance-test/ConfigPreprocessor.cpp
@@ -14,10 +14,15 @@ std::vector<XMLConfigurationPtr> ConfigPreprocessor::processConfig(
 {
     std::vector<XMLConfigurationPtr> result;
 
-    for (const auto & path : paths)
+    for (const auto & pathStr : paths)
     {
-        result.emplace_back(XMLConfigurationPtr(new XMLConfiguration(path)));
-        result.back()->setString("path", Poco::Path(path).absolute().toString());
+        auto test = XMLConfigurationPtr(new XMLConfiguration(pathStr));
+        result.push_back(test);
+
+        const auto path = Poco::Path(pathStr);
+        test->setString("path", path.absolute().toString());
+        if (test->getString("name", "") == "")
+            test->setString("name", path.getBaseName());
     }
 
     /// Leave tests:
diff --git a/dbms/programs/performance-test/PerformanceTestSuite.cpp b/dbms/programs/performance-test/PerformanceTestSuite.cpp
index cfa7d202d1d..8f87da36081 100644
--- a/dbms/programs/performance-test/PerformanceTestSuite.cpp
+++ b/dbms/programs/performance-test/PerformanceTestSuite.cpp
@@ -259,15 +259,12 @@ static std::vector<std::string> getInputFiles(const po::variables_map & options,
 
         if (input_files.empty())
             throw DB::Exception("Did not find any xml files", DB::ErrorCodes::BAD_ARGUMENTS);
-        else
-            LOG_INFO(log, "Found " << input_files.size() << " files");
     }
     else
     {
         input_files = options["input-files"].as<std::vector<std::string>>();
-        LOG_INFO(log, "Found " + std::to_string(input_files.size()) + " input files");
 
-        std::vector<std::string> collected_files;
+        std::vector<std::string> collected_files;
 
         for (const std::string & filename : input_files)
         {
             fs::path file(filename);
@@ -289,6 +286,8 @@ static std::vector<std::string> getInputFiles(const po::variables_map & options,
 
         input_files = std::move(collected_files);
     }
+
+    LOG_INFO(log, "Found " + std::to_string(input_files.size()) + " input files");
     std::sort(input_files.begin(), input_files.end());
     return input_files;
 }
diff --git a/dbms/programs/performance-test/ReportBuilder.cpp b/dbms/programs/performance-test/ReportBuilder.cpp
index 97d4874ca5d..dd8dc44bd18 100644
--- a/dbms/programs/performance-test/ReportBuilder.cpp
+++ b/dbms/programs/performance-test/ReportBuilder.cpp
@@ -157,6 +157,9 @@ std::string ReportBuilder::buildFullReport(
             runJSON.set("avg_bytes_per_second", statistics.avg_bytes_speed_value);
         }
 
+        runJSON.set("max_memory_usage", statistics.max_memory_usage);
+        runJSON.set("min_memory_usage", statistics.min_memory_usage);
+
         run_infos.push_back(runJSON);
     }
 }
diff --git a/dbms/programs/performance-test/TestStats.h b/dbms/programs/performance-test/TestStats.h
index 5d70edc437c..6ab921d1d33 100644
--- a/dbms/programs/performance-test/TestStats.h
+++ b/dbms/programs/performance-test/TestStats.h
@@ -19,6 +19,7 @@ struct TestStats
     Stopwatch avg_bytes_speed_watch;
 
     bool last_query_was_cancelled = false;
+    std::string query_id;
 
     size_t queries = 0;
 
@@ -49,6 +50,9 @@ struct TestStats
     size_t number_of_rows_speed_info_batches = 0;
     size_t number_of_bytes_speed_info_batches = 0;
 
+    UInt64 max_memory_usage = 0;
+    UInt64 min_memory_usage = std::numeric_limits<UInt64>::max();
+
     bool ready = false; // check if a query wasn't interrupted by SIGINT
     std::string exception;
 
diff --git a/dbms/programs/performance-test/executeQuery.cpp b/dbms/programs/performance-test/executeQuery.cpp
index f12808eac36..0cb83fb72d0 100644
--- a/dbms/programs/performance-test/executeQuery.cpp
+++ b/dbms/programs/performance-test/executeQuery.cpp
@@ -2,9 +2,11 @@
 #include
 #include
 #include
+#include <Poco/UUIDGenerator.h>
 
 namespace DB
 {
+
 namespace
 {
@@ -36,7 +38,7 @@ void checkFulfilledConditionsAndUpdate(
     }
 }
 
-}
+} // anonymous namespace
 
 void executeQuery(
     Connection & connection,
@@ -47,12 +49,19 @@ void executeQuery(
     Context & context,
     const Settings & settings)
 {
+    static const std::string query_id_prefix
+        = Poco::UUIDGenerator::defaultGenerator().create().toString() + "-";
+    static int next_query_id = 1;
+
     statistics.watch_per_query.restart();
     statistics.last_query_was_cancelled = false;
     statistics.last_query_rows_read = 0;
     statistics.last_query_bytes_read = 0;
+    statistics.query_id = query_id_prefix + std::to_string(next_query_id++);
+    fprintf(stderr, "Query id is '%s'\n", statistics.query_id.c_str());
 
     RemoteBlockInputStream stream(connection, query, {}, context, &settings);
+    stream.setQueryId(statistics.query_id);
 
     stream.setProgressCallback(
         [&](const Progress & value)
@@ -69,5 +78,45 @@
     statistics.updateQueryInfo();
     statistics.setTotalTime();
+
+    /// Get max memory usage from the server query log.
+    /// We might have to wait for some time before the query log is updated.
+    int n_waits = 0;
+    const int one_wait_us = 500 * 1000;
+    const int max_waits = (10 * 1000 * 1000) / one_wait_us;
+    for (; n_waits < max_waits; n_waits++)
+    {
+        RemoteBlockInputStream log(connection,
+            "select memory_usage from system.query_log where type = 2 and query_id = '"
+                + statistics.query_id + "'",
+            {}, context, &settings);
+
+        log.readPrefix();
+        Block block = log.read();
+        if (block.columns() == 0)
+        {
+            log.readSuffix();
+            ::usleep(one_wait_us);
+            continue;
+        }
+        assert(block.columns() == 1);
+        assert(block.getDataTypes()[0]->getName() == "UInt64");
+        ColumnPtr column = block.getByPosition(0).column;
+        assert(column->size() == 1);
+        StringRef ref = column->getDataAt(0);
+        assert(ref.size == sizeof(UInt64));
+        const UInt64 memory_usage = *reinterpret_cast<const UInt64 *>(ref.data);
+        statistics.max_memory_usage = std::max(statistics.max_memory_usage,
+            memory_usage);
+        statistics.min_memory_usage = std::min(statistics.min_memory_usage,
+            memory_usage);
+        log.readSuffix();
+
+        fprintf(stderr, "Memory usage is %lu\n", memory_usage);
+        break;
+    }
+    fprintf(stderr, "Waited for query log for %.2fs\n",
+        (n_waits * one_wait_us) / 1e6f);
 }
+
 }
diff --git a/dbms/src/DataStreams/RemoteBlockInputStream.cpp b/dbms/src/DataStreams/RemoteBlockInputStream.cpp
index 740e60ffb09..eb576075f80 100644
--- a/dbms/src/DataStreams/RemoteBlockInputStream.cpp
+++ b/dbms/src/DataStreams/RemoteBlockInputStream.cpp
@@ -292,7 +292,7 @@ void RemoteBlockInputStream::sendQuery()
     established = true;
 
     auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings);
-    multiplexed_connections->sendQuery(timeouts, query, "", stage, &context.getClientInfo(), true);
+    multiplexed_connections->sendQuery(timeouts, query, query_id, stage, &context.getClientInfo(), true);
 
     established = false;
    sent_query = 
true;
diff --git a/dbms/src/DataStreams/RemoteBlockInputStream.h b/dbms/src/DataStreams/RemoteBlockInputStream.h
index 3cef2099030..f8b35f7c11f 100644
--- a/dbms/src/DataStreams/RemoteBlockInputStream.h
+++ b/dbms/src/DataStreams/RemoteBlockInputStream.h
@@ -46,6 +46,20 @@ public:
 
     ~RemoteBlockInputStream() override;
 
+    /// Set the query_id. For now, used by performance test to later find the query
+    /// in the server query_log. Must be called before sending the query to the
+    /// server.
+    ///
+    /// FIXME This should have been a parameter of the constructor, but I can't bring
+    /// myself to add even more parameters. These constructors actually implement
+    /// (in a quite bizarre way) an overloaded function that prepares the multiplexed
+    /// connection wrapper. It should have been a plain function that is run by
+    /// the caller, but apparently that would have been obscenely straightforward,
+    /// too easy to understand and not insane at all, which is a blatant violation
+    /// of our coding conventions.
+    /// I'm not going to rewrite it now, so that I can get at least something done.
+    void setQueryId(std::string _query_id) { assert(!sent_query); query_id = _query_id; }
+
     /// Specify how we allocate connections on a shard.
     void setPoolMode(PoolMode pool_mode_) { pool_mode = pool_mode_; }
 
@@ -95,6 +109,7 @@ private:
     std::unique_ptr<MultiplexedConnections> multiplexed_connections;
 
     const String query;
+    String query_id = "";
     Context context;
 
     /// Temporary tables needed to be sent to remote servers

From bdbb77a0a147cbbb67b946ff177ad7b0d9956e92 Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov
Date: Mon, 8 Jul 2019 16:23:30 +0300
Subject: [PATCH 02/14] Read the query log after running all the queries.

The log entries appear ~10s after a query has finished, so waiting for
them after each query takes too long.
---
 .../performance-test/PerformanceTest.cpp      | 49 +++++++++++++++++++
 .../performance-test/ReportBuilder.cpp        |  3 +-
 dbms/programs/performance-test/TestStats.h    |  3 +-
 .../performance-test/executeQuery.cpp         | 39 ---------------
 4 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/dbms/programs/performance-test/PerformanceTest.cpp b/dbms/programs/performance-test/PerformanceTest.cpp
index c2d8d4f252c..1dd0b095fbe 100644
--- a/dbms/programs/performance-test/PerformanceTest.cpp
+++ b/dbms/programs/performance-test/PerformanceTest.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include <DataStreams/RemoteBlockInputStream.h>
 #include
 #include
 #include
@@ -296,6 +297,54 @@ void PerformanceTest::runQueries(
             break;
         }
     }
+
+    if (got_SIGINT)
+    {
+        return;
+    }
+
+    // Pull memory usage data from query log. The log entries take some time
+    // to become available, so we do this after running all the queries, and
+    // might still have to wait. We will wait in increments of 0.5s, for no more
+    // than 10s in total.
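+    // (In system.query_log, type = 2 corresponds to QUERY_FINISH, so the
+    // select below only matches queries that completed successfully.)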
+    const int one_wait_us = 500 * 1000;
+    const int max_waits = (10 * 1000 * 1000) / one_wait_us;
+    int n_waits = 0;
+    for (auto & statistics : statistics_by_run)
+    {
+retry:
+        RemoteBlockInputStream log(connection,
+            "select memory_usage from system.query_log where type = 2 and query_id = '"
+                + statistics.query_id + "'",
+            {} /* header */, context);
+
+        log.readPrefix();
+        Block block = log.read();
+        if (block.columns() == 0)
+        {
+            log.readSuffix();
+
+            if (n_waits >= max_waits)
+                break;
+            n_waits++;
+
+            ::usleep(one_wait_us);
+            goto retry;
+        }
+        assert(block.columns() == 1);
+        assert(block.getDataTypes()[0]->getName() == "UInt64");
+        ColumnPtr column = block.getByPosition(0).column;
+        assert(column->size() == 1);
+        StringRef ref = column->getDataAt(0);
+        assert(ref.size == sizeof(UInt64));
+        statistics.memory_usage = *reinterpret_cast<const UInt64 *>(ref.data);
+        log.readSuffix();
+
+        fprintf(stderr, "Memory usage for query '%s' is %lu\n",
+            statistics.query_id.c_str(), statistics.memory_usage);
+    }
+    fprintf(stderr, "Waited for query log for %.2fs\n",
+        (n_waits * one_wait_us) / 1e6f);
 }
 
diff --git a/dbms/programs/performance-test/ReportBuilder.cpp b/dbms/programs/performance-test/ReportBuilder.cpp
index dd8dc44bd18..4aa1933a209 100644
--- a/dbms/programs/performance-test/ReportBuilder.cpp
+++ b/dbms/programs/performance-test/ReportBuilder.cpp
@@ -157,8 +157,7 @@ std::string ReportBuilder::buildFullReport(
             runJSON.set("avg_bytes_per_second", statistics.avg_bytes_speed_value);
         }
 
-        runJSON.set("max_memory_usage", statistics.max_memory_usage);
-        runJSON.set("min_memory_usage", statistics.min_memory_usage);
+        runJSON.set("memory_usage", statistics.memory_usage);
 
         run_infos.push_back(runJSON);
     }
diff --git a/dbms/programs/performance-test/TestStats.h b/dbms/programs/performance-test/TestStats.h
index 6ab921d1d33..b38ffa7386a 100644
--- a/dbms/programs/performance-test/TestStats.h
+++ b/dbms/programs/performance-test/TestStats.h
@@ -50,8 +50,7 @@ struct TestStats
     size_t number_of_rows_speed_info_batches = 0;
     size_t number_of_bytes_speed_info_batches = 0;
 
-    UInt64 max_memory_usage = 0;
-    UInt64 min_memory_usage = std::numeric_limits<UInt64>::max();
+    UInt64 memory_usage = 0;
 
     bool ready = false; // check if a query wasn't interrupted by SIGINT
     std::string exception;
diff --git a/dbms/programs/performance-test/executeQuery.cpp b/dbms/programs/performance-test/executeQuery.cpp
index 0cb83fb72d0..d280831c878 100644
--- a/dbms/programs/performance-test/executeQuery.cpp
+++ b/dbms/programs/performance-test/executeQuery.cpp
@@ -78,45 +78,6 @@ void executeQuery(
     statistics.updateQueryInfo();
     statistics.setTotalTime();
-
-    /// Get max memory usage from the server query log.
-    /// We might have to wait for some time before the query log is updated.
-    int n_waits = 0;
-    const int one_wait_us = 500 * 1000;
-    const int max_waits = (10 * 1000 * 1000) / one_wait_us;
-    for (; n_waits < max_waits; n_waits++)
-    {
-        RemoteBlockInputStream log(connection,
-            "select memory_usage from system.query_log where type = 2 and query_id = '"
-                + statistics.query_id + "'",
-            {}, context, &settings);
-
-        log.readPrefix();
-        Block block = log.read();
-        if (block.columns() == 0)
-        {
-            log.readSuffix();
-            ::usleep(one_wait_us);
-            continue;
-        }
-        assert(block.columns() == 1);
-        assert(block.getDataTypes()[0]->getName() == "UInt64");
-        ColumnPtr column = block.getByPosition(0).column;
-        assert(column->size() == 1);
-        StringRef ref = column->getDataAt(0);
-        assert(ref.size == sizeof(UInt64));
-        const UInt64 memory_usage = *reinterpret_cast<const UInt64 *>(ref.data);
-        statistics.max_memory_usage = std::max(statistics.max_memory_usage,
-            memory_usage);
-        statistics.min_memory_usage = std::min(statistics.min_memory_usage,
-            memory_usage);
-        log.readSuffix();
-
-        fprintf(stderr, "Memory usage is %lu\n", memory_usage);
-        break;
-    }
-    fprintf(stderr, "Waited for query log for %.2fs\n",
-        (n_waits * one_wait_us) / 1e6f);
 }
 
 }

From b928d87d8a8d95771f42d17eaf77bbcc36309ef3 Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov
Date: Mon, 8 Jul 2019 18:08:46 +0300
Subject: [PATCH 03/14] error: declaration shadows a field of
 'DB::PerformanceTest'

---
 dbms/programs/performance-test/PerformanceTest.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dbms/programs/performance-test/PerformanceTest.cpp b/dbms/programs/performance-test/PerformanceTest.cpp
index 1dd0b095fbe..db44d283dc8 100644
--- a/dbms/programs/performance-test/PerformanceTest.cpp
+++ b/dbms/programs/performance-test/PerformanceTest.cpp
@@ -313,16 +313,16 @@ void PerformanceTest::runQueries(
     for (auto & statistics : statistics_by_run)
     {
 retry:
-        RemoteBlockInputStream log(connection,
+        RemoteBlockInputStream log_reader(connection,
             "select memory_usage from system.query_log where type = 2 and query_id = '"
                 + statistics.query_id + "'",
             {} /* header */, context);
 
-        log.readPrefix();
-        Block block = log.read();
+        log_reader.readPrefix();
+        Block block = log_reader.read();
         if (block.columns() == 0)
         {
-            log.readSuffix();
+            log_reader.readSuffix();
 
             if (n_waits >= max_waits)
                 break;
@@ -338,7 +338,7 @@ retry:
         StringRef ref = column->getDataAt(0);
         assert(ref.size == sizeof(UInt64));
         statistics.memory_usage = *reinterpret_cast<const UInt64 *>(ref.data);
-        log.readSuffix();
+        log_reader.readSuffix();
 
         fprintf(stderr, "Memory usage for query '%s' is %lu\n",
             statistics.query_id.c_str(), statistics.memory_usage);

From 78df28f0e986a5664eff0b9e6f6f3cb18993fecd Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov
Date: Mon, 8 Jul 2019 18:35:32 +0300
Subject: [PATCH 04/14] Cleanup.

---
 dbms/programs/performance-test/PerformanceTest.cpp | 12 ++++--------
 dbms/programs/performance-test/executeQuery.cpp    |  1 -
 dbms/src/DataStreams/RemoteBlockInputStream.h      |  9 ---------
 3 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/dbms/programs/performance-test/PerformanceTest.cpp b/dbms/programs/performance-test/PerformanceTest.cpp
index db44d283dc8..11dbd053616 100644
--- a/dbms/programs/performance-test/PerformanceTest.cpp
+++ b/dbms/programs/performance-test/PerformanceTest.cpp
@@ -305,10 +305,9 @@ void PerformanceTest::runQueries(
 
     // Pull memory usage data from query log. The log entries take some time
     // to become available, so we do this after running all the queries, and
-    // might still have to wait. We will wait in increments of 0.5s, for no more
-    // than 10s in total.
+    // might still have to wait.
     const int one_wait_us = 500 * 1000;
-    const int max_waits = (10 * 1000 * 1000) / one_wait_us;
+    const int max_waits = (30 * 1000 * 1000) / one_wait_us;
     int n_waits = 0;
     for (auto & statistics : statistics_by_run)
     {
@@ -339,12 +338,9 @@ retry:
         assert(ref.size == sizeof(UInt64));
         statistics.memory_usage = *reinterpret_cast<const UInt64 *>(ref.data);
         log_reader.readSuffix();
-
-        fprintf(stderr, "Memory usage for query '%s' is %lu\n",
-            statistics.query_id.c_str(), statistics.memory_usage);
     }
-    fprintf(stderr, "Waited for query log for %.2fs\n",
-        (n_waits * one_wait_us) / 1e6f);
+
+    printf("Waited for query log for %.2fs\n", (n_waits * one_wait_us) / 1e6f);
 }
diff --git a/dbms/programs/performance-test/executeQuery.cpp b/dbms/programs/performance-test/executeQuery.cpp
index d280831c878..db82a48d0c1 100644
--- a/dbms/programs/performance-test/executeQuery.cpp
+++ b/dbms/programs/performance-test/executeQuery.cpp
@@ -59,7 +59,6 @@ void executeQuery(
     statistics.last_query_bytes_read = 0;
 
     statistics.query_id = query_id_prefix + std::to_string(next_query_id++);
-    fprintf(stderr, "Query id is '%s'\n", statistics.query_id.c_str());
 
     RemoteBlockInputStream stream(connection, query, {}, context, &settings);
     stream.setQueryId(statistics.query_id);
diff --git a/dbms/src/DataStreams/RemoteBlockInputStream.h b/dbms/src/DataStreams/RemoteBlockInputStream.h
index f8b35f7c11f..4c19b44a59e 100644
--- a/dbms/src/DataStreams/RemoteBlockInputStream.h
+++ b/dbms/src/DataStreams/RemoteBlockInputStream.h
@@ -49,15 +49,6 @@ public:
     /// Set the query_id. For now, used by performance test to later find the query
     /// in the server query_log. Must be called before sending the query to the
     /// server.
-    ///
-    /// FIXME This should have been a parameter of the constructor, but I can't bring
-    /// myself to add even more parameters. These constructors actually implement
-    /// (in a quite bizarre way) an overloaded function that prepares the multiplexed
-    /// connection wrapper. It should have been a plain function that is run by
-    /// the caller, but apparently that would have been obscenely straightforward,
-    /// too easy to understand and not insane at all, which is a blatant violation
-    /// of our coding conventions.
-    /// I'm not going to rewrite it now, so that I can get at least something done.
     void setQueryId(std::string _query_id) { assert(!sent_query); query_id = _query_id; }
 
     /// Specify how we allocate connections on a shard.

From f260296f89369a48bd85294feb5eefd7ef2b4736 Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov
Date: Tue, 9 Jul 2019 12:49:34 +0300
Subject: [PATCH 05/14] Print logs to stderr.

---
 dbms/programs/performance-test/PerformanceTest.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dbms/programs/performance-test/PerformanceTest.cpp b/dbms/programs/performance-test/PerformanceTest.cpp
index 11dbd053616..932d47a5115 100644
--- a/dbms/programs/performance-test/PerformanceTest.cpp
+++ b/dbms/programs/performance-test/PerformanceTest.cpp
@@ -340,7 +340,9 @@ retry:
         log_reader.readSuffix();
     }
 
-    printf("Waited for query log for %.2fs\n", (n_waits * one_wait_us) / 1e6f);
+    // LOG_INFO can't format floats. Print to stderr because stdout is the
+    // resulting JSON.
+    fprintf(stderr, "Waited for query log for %.2fs\n", (n_waits * one_wait_us) / 1e6f);
 }

From 8c379fb97206e6ba9ebdd7b84409f7f96ca19f9e Mon Sep 17 00:00:00 2001
From: Alexander Kuzmenkov
Date: Tue, 9 Jul 2019 17:22:42 +0300
Subject: [PATCH 06/14] Flush query log instead of waiting for it.

---
 .../performance-test/ConfigPreprocessor.cpp   |  6 ++--
 .../performance-test/PerformanceTest.cpp      | 33 ++++++++-----------
 dbms/src/DataStreams/RemoteBlockInputStream.h |  2 +-
 3 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/dbms/programs/performance-test/ConfigPreprocessor.cpp b/dbms/programs/performance-test/ConfigPreprocessor.cpp
index ab276049682..5da9650454a 100644
--- a/dbms/programs/performance-test/ConfigPreprocessor.cpp
+++ b/dbms/programs/performance-test/ConfigPreprocessor.cpp
@@ -14,12 +14,12 @@ std::vector<XMLConfigurationPtr> ConfigPreprocessor::processConfig(
 {
     std::vector<XMLConfigurationPtr> result;
 
-    for (const auto & pathStr : paths)
+    for (const auto & path_str : paths)
     {
-        auto test = XMLConfigurationPtr(new XMLConfiguration(pathStr));
+        auto test = XMLConfigurationPtr(new XMLConfiguration(path_str));
         result.push_back(test);
 
-        const auto path = Poco::Path(pathStr);
+        const auto path = Poco::Path(path_str);
         test->setString("path", path.absolute().toString());
         if (test->getString("name", "") == "")
             test->setString("name", path.getBaseName());
diff --git a/dbms/programs/performance-test/PerformanceTest.cpp b/dbms/programs/performance-test/PerformanceTest.cpp
index 932d47a5115..a005fcb5fbb 100644
--- a/dbms/programs/performance-test/PerformanceTest.cpp
+++ b/dbms/programs/performance-test/PerformanceTest.cpp
@@ -303,15 +303,19 @@ void PerformanceTest::runQueries(
         return;
     }
 
-    // Pull memory usage data from query log. The log entries take some time
-    // to become available, so we do this after running all the queries, and
-    // might still have to wait.
-    const int one_wait_us = 500 * 1000;
-    const int max_waits = (30 * 1000 * 1000) / one_wait_us;
-    int n_waits = 0;
+    // Pull memory usage data from query log. The log is normally filled in
+    // the background, so we have to flush it synchronously here to see all
+    // the previous queries.
+    {
+        RemoteBlockInputStream flush_log(connection, "system flush logs",
+            {} /* header */, context);
+        flush_log.readPrefix();
+        while (flush_log.read());
+        flush_log.readSuffix();
+    }
+
     for (auto & statistics : statistics_by_run)
     {
-retry:
         RemoteBlockInputStream log_reader(connection,
             "select memory_usage from system.query_log where type = 2 and query_id = '"
                 + statistics.query_id + "'",
             {} /* header */, context);
 
         log_reader.readPrefix();
         Block block = log_reader.read();
         if (block.columns() == 0)
         {
-            log_reader.readSuffix();
-
-            if (n_waits >= max_waits)
-                break;
-            n_waits++;
-
-            ::usleep(one_wait_us);
-            goto retry;
+            LOG_WARNING(log, "Query '" << statistics.query_id << "' is not found in query log.");
+            continue;
         }
+
         assert(block.columns() == 1);
         assert(block.getDataTypes()[0]->getName() == "UInt64");
         ColumnPtr column = block.getByPosition(0).column;
         assert(column->size() == 1);
         StringRef ref = column->getDataAt(0);
         assert(ref.size == sizeof(UInt64));
         statistics.memory_usage = *reinterpret_cast<const UInt64 *>(ref.data);
         log_reader.readSuffix();
     }
-
-    // LOG_INFO can't format floats. Print to stderr because stdout is the
-    // resulting JSON.
- fprintf(stderr, "Waited for query log for %.2fs\n", (n_waits * one_wait_us) / 1e6f); } diff --git a/dbms/src/DataStreams/RemoteBlockInputStream.h b/dbms/src/DataStreams/RemoteBlockInputStream.h index 4c19b44a59e..af8d79c324c 100644 --- a/dbms/src/DataStreams/RemoteBlockInputStream.h +++ b/dbms/src/DataStreams/RemoteBlockInputStream.h @@ -49,7 +49,7 @@ public: /// Set the query_id. For now, used by performance test to later find the query /// in the server query_log. Must be called before sending the query to the /// server. - void setQueryId(std::string _query_id) { assert(!sent_query); query_id = _query_id; } + void setQueryId(const std::string& query_id_) { assert(!sent_query); query_id = query_id_; } /// Specify how we allocate connections on a shard. void setPoolMode(PoolMode pool_mode_) { pool_mode = pool_mode_; } From 458a171fef8390d5c510226c3806124125d28e28 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Jul 2019 12:33:47 +0300 Subject: [PATCH 07/14] Add dependencies to rpm packages --- utils/release/release_lib.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/utils/release/release_lib.sh b/utils/release/release_lib.sh index 75307dfe0b0..7eb5d033bf4 100644 --- a/utils/release/release_lib.sh +++ b/utils/release/release_lib.sh @@ -226,12 +226,22 @@ function make_rpm { PACKAGE=clickhouse-server ARCH=all TARGET=noarch - unpack_pack + deb_unpack + mv ${PACKAGE}-$VERSION_FULL-2.spec ${PACKAGE}-$VERSION_FULL-2.spec_tmp + echo "Requires: clickhouse-common-static" >> ${PACKAGE}-$VERSION_FULL-2.spec + echo "Requires: tzdata" >> ${PACKAGE}-$VERSION_FULL-2.spec + echo "Requires: initscripts" >> ${PACKAGE}-$VERSION_FULL-2.spec + cat ${PACKAGE}-$VERSION_FULL-2.spec_tmp >> ${PACKAGE}-$VERSION_FULL-2.spec + rpm_pack PACKAGE=clickhouse-client ARCH=all TARGET=noarch - unpack_pack + deb_unpack + mv ${PACKAGE}-$VERSION_FULL-2.spec ${PACKAGE}-$VERSION_FULL-2.spec_tmp + echo "Requires: clickhouse-common-static" >> ${PACKAGE}-$VERSION_FULL-2.spec + cat ${PACKAGE}-$VERSION_FULL-2.spec_tmp >> ${PACKAGE}-$VERSION_FULL-2.spec + rpm_pack PACKAGE=clickhouse-test ARCH=all From f32b3975c77be1dbeaae0764dd6b0100f6ade60c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Jul 2019 13:02:31 +0300 Subject: [PATCH 08/14] Fix version --- utils/release/release_lib.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/release/release_lib.sh b/utils/release/release_lib.sh index 7eb5d033bf4..4eaa4d4ebbd 100644 --- a/utils/release/release_lib.sh +++ b/utils/release/release_lib.sh @@ -228,7 +228,7 @@ function make_rpm { TARGET=noarch deb_unpack mv ${PACKAGE}-$VERSION_FULL-2.spec ${PACKAGE}-$VERSION_FULL-2.spec_tmp - echo "Requires: clickhouse-common-static" >> ${PACKAGE}-$VERSION_FULL-2.spec + echo "Requires: clickhouse-common-static = $VERSION_FULL-2" >> ${PACKAGE}-$VERSION_FULL-2.spec echo "Requires: tzdata" >> ${PACKAGE}-$VERSION_FULL-2.spec echo "Requires: initscripts" >> ${PACKAGE}-$VERSION_FULL-2.spec cat ${PACKAGE}-$VERSION_FULL-2.spec_tmp >> ${PACKAGE}-$VERSION_FULL-2.spec @@ -239,7 +239,7 @@ function make_rpm { TARGET=noarch deb_unpack mv ${PACKAGE}-$VERSION_FULL-2.spec ${PACKAGE}-$VERSION_FULL-2.spec_tmp - echo "Requires: clickhouse-common-static" >> ${PACKAGE}-$VERSION_FULL-2.spec + echo "Requires: clickhouse-common-static = $VERSION_FULL-2" >> ${PACKAGE}-$VERSION_FULL-2.spec cat ${PACKAGE}-$VERSION_FULL-2.spec_tmp >> ${PACKAGE}-$VERSION_FULL-2.spec rpm_pack From f786c45ac419ffc1d69bf1586964545e3800dba2 Mon Sep 17 00:00:00 2001 From: alesapin 
Date: Tue, 16 Jul 2019 13:19:37 +0300
Subject: [PATCH 09/14] Add settings for check query backward compatibility

---
 dbms/src/Core/Settings.h                      |  3 +-
 .../Interpreters/InterpreterCheckQuery.cpp    | 38 ++++++++++++-------
 dbms/src/Interpreters/InterpreterCheckQuery.h |  1 -
 .../0_stateless/00063_check_query.reference   |  7 +---
 .../queries/0_stateless/00063_check_query.sql |  2 +
 .../queries/0_stateless/00961_check_table.sql |  1 +
 .../00077_log_tinylog_stripelog.reference     | 19 +++------
 .../00077_log_tinylog_stripelog.sql           |  2 +
 8 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/dbms/src/Core/Settings.h b/dbms/src/Core/Settings.h
index f49e09ba599..e06e953810b 100644
--- a/dbms/src/Core/Settings.h
+++ b/dbms/src/Core/Settings.h
@@ -331,7 +331,8 @@ struct Settings : public SettingsCollection<Settings>
     M(SettingBool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.") \
     M(SettingBool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.") \
     \
-    M(SettingUInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. This setting is a safety threshold, because using large number of partitions is a common misconception.")
+    M(SettingUInt64, max_partitions_per_insert_block, 100, "Limit maximum number of partitions in single INSERTed block. Zero means unlimited. Throw exception if the block contains too many partitions. This setting is a safety threshold, because using large number of partitions is a common misconception.") \
+    M(SettingBool, check_query_single_value_result, true, "Return check query result as single 1/0 value")
 
 DECLARE_SETTINGS_COLLECTION(LIST_OF_SETTINGS)
 
diff --git a/dbms/src/Interpreters/InterpreterCheckQuery.cpp b/dbms/src/Interpreters/InterpreterCheckQuery.cpp
index 3edcc12fedc..08af398421f 100644
--- a/dbms/src/Interpreters/InterpreterCheckQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterCheckQuery.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include <Columns/ColumnsNumber.h>
 
 namespace DB
 
@@ -42,22 +43,33 @@ BlockIO InterpreterCheckQuery::execute()
     StoragePtr table = context.getTable(database_name, table_name);
     auto check_results = table->checkData(query_ptr, context);
 
-    auto block_structure = getBlockStructure();
-    auto path_column = block_structure[0].type->createColumn();
-    auto is_passed_column = block_structure[1].type->createColumn();
-    auto message_column = block_structure[2].type->createColumn();
-
-    for (const auto & check_result : check_results)
+    Block block;
+    if (context.getSettingsRef().check_query_single_value_result)
     {
-        path_column->insert(check_result.fs_path);
-        is_passed_column->insert(static_cast<UInt8>(check_result.success));
-        message_column->insert(check_result.failure_message);
+        bool result = std::all_of(check_results.begin(), check_results.end(), [](const CheckResult & res ) { return res.success; });
+        auto column = ColumnUInt8::create();
+        column->insertValue(UInt64(result));
+        block = Block{{std::move(column), std::make_shared<DataTypeUInt8>(), "result"}};
     }
+    else
+    {
+        auto block_structure = getBlockStructure();
+        auto path_column = block_structure[0].type->createColumn();
+        auto is_passed_column = block_structure[1].type->createColumn();
+        auto message_column = block_structure[2].type->createColumn();
 
-    Block block({
-        {std::move(path_column), block_structure[0].type, block_structure[0].name},
-        {std::move(is_passed_column), block_structure[1].type, block_structure[1].name},
-        {std::move(message_column), block_structure[2].type, block_structure[2].name}});
+        for (const auto & check_result : check_results)
+        {
+            path_column->insert(check_result.fs_path);
+            is_passed_column->insert(static_cast<UInt8>(check_result.success));
+            message_column->insert(check_result.failure_message);
+        }
+
+        block = Block({
+            {std::move(path_column), block_structure[0].type, block_structure[0].name},
+            {std::move(is_passed_column), block_structure[1].type, block_structure[1].name},
+            {std::move(message_column), block_structure[2].type, block_structure[2].name}});
+    }
 
     BlockIO res;
     res.in = std::make_shared<OneBlockInputStream>(block);
diff --git a/dbms/src/Interpreters/InterpreterCheckQuery.h b/dbms/src/Interpreters/InterpreterCheckQuery.h
index ca6af4af7bc..c667ca74c22 100644
--- a/dbms/src/Interpreters/InterpreterCheckQuery.h
+++ b/dbms/src/Interpreters/InterpreterCheckQuery.h
@@ -21,7 +21,6 @@ private:
     ASTPtr query_ptr;
 
     const Context & context;
-    Block result;
 };
 
 }
diff --git a/dbms/tests/queries/0_stateless/00063_check_query.reference b/dbms/tests/queries/0_stateless/00063_check_query.reference
index 9b20cc02e31..6ed281c757a 100644
--- a/dbms/tests/queries/0_stateless/00063_check_query.reference
+++ b/dbms/tests/queries/0_stateless/00063_check_query.reference
@@ -1,5 +1,2 @@
-N.bin 1
-S.bin 1
-N.bin 1
-S.bin 1
-__marks.mrk 1
+1
+1
diff --git a/dbms/tests/queries/0_stateless/00063_check_query.sql b/dbms/tests/queries/0_stateless/00063_check_query.sql
index c9493d19d34..2b806cb3bf2 100644
--- a/dbms/tests/queries/0_stateless/00063_check_query.sql
+++ b/dbms/tests/queries/0_stateless/00063_check_query.sql
@@ -1,3 +1,5 @@
+SET check_query_single_value_result = 1;
+
 DROP TABLE IF EXISTS check_query_tiny_log;
 
 CREATE TABLE check_query_tiny_log (N UInt32, S String) Engine = TinyLog;
diff --git a/dbms/tests/queries/0_stateless/00961_check_table.sql b/dbms/tests/queries/0_stateless/00961_check_table.sql
index 9752ffc3974..0e0b2c3b483 100644
--- a/dbms/tests/queries/0_stateless/00961_check_table.sql
+++ b/dbms/tests/queries/0_stateless/00961_check_table.sql
@@ -1,3 +1,4 @@
+SET check_query_single_value_result = 0;
 DROP TABLE IF EXISTS mt_table;
 
 CREATE TABLE mt_table (d Date, key UInt64, data String) ENGINE = MergeTree() PARTITION BY toYYYYMM(d) ORDER BY key;
diff --git a/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.reference b/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.reference
index d8ccbd66592..98f50687de4 100644
--- a/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.reference
+++ b/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.reference
@@ -1,17 +1,10 @@
+1
+1
+1
 8873898 12457120258355519194
 8873898 12457120258355519194
 8873898 12457120258355519194
 8873898 12457120258355519194
-AdvEngineID.bin 1
-CounterID.bin 1
-RegionID.bin 1
-SearchPhrase.bin 1
-UserID.bin 1
-__marks.mrk 1
-AdvEngineID.bin 1
-CounterID.bin 1
-RegionID.bin 1
-SearchPhrase.bin 1
-UserID.bin 1
-data.bin 1
-index.mrk 1
+1
+1
+1
diff --git a/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.sql b/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.sql
index d745203ea6b..2324ad69165 100644
--- a/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.sql
+++ b/dbms/tests/queries/1_stateful/00077_log_tinylog_stripelog.sql
@@ -1,3 +1,5 @@
+SET check_query_single_value_result = 1;
+
 DROP TABLE IF EXISTS test.hits_log;
 DROP TABLE IF EXISTS test.hits_tinylog;
 DROP TABLE IF EXISTS 
test.hits_stripelog; From 0df345c2d962b6da5ff5fd27ea917ac8ea46a745 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Jul 2019 13:49:16 +0300 Subject: [PATCH 10/14] Fix integration test --- .../integration/test_check_table/test.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/dbms/tests/integration/test_check_table/test.py b/dbms/tests/integration/test_check_table/test.py index 6d53b423896..83df59b44a0 100644 --- a/dbms/tests/integration/test_check_table/test.py +++ b/dbms/tests/integration/test_check_table/test.py @@ -50,35 +50,35 @@ def remove_part_from_disk(node, table, part_name): def test_check_normal_table_corruption(started_cluster): node1.query("INSERT INTO non_replicated_mt VALUES (toDate('2019-02-01'), 1, 10), (toDate('2019-02-01'), 2, 12)") - assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201902") == "201902_1_1_0\t1\t\n" + assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201902", settings={"check_query_single_value_result": 0}) == "201902_1_1_0\t1\t\n" remove_checksums_on_disk(node1, "non_replicated_mt", "201902_1_1_0") - assert node1.query("CHECK TABLE non_replicated_mt").strip() == "201902_1_1_0\t1\tChecksums recounted and written to disk." + assert node1.query("CHECK TABLE non_replicated_mt", settings={"check_query_single_value_result": 0}).strip() == "201902_1_1_0\t1\tChecksums recounted and written to disk." assert node1.query("SELECT COUNT() FROM non_replicated_mt") == "2\n" remove_checksums_on_disk(node1, "non_replicated_mt", "201902_1_1_0") - assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201902").strip() == "201902_1_1_0\t1\tChecksums recounted and written to disk." + assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201902", settings={"check_query_single_value_result": 0}).strip() == "201902_1_1_0\t1\tChecksums recounted and written to disk." assert node1.query("SELECT COUNT() FROM non_replicated_mt") == "2\n" corrupt_data_part_on_disk(node1, "non_replicated_mt", "201902_1_1_0") - assert node1.query("CHECK TABLE non_replicated_mt").strip() == "201902_1_1_0\t0\tCannot read all data. Bytes read: 2. Bytes expected: 16." + assert node1.query("CHECK TABLE non_replicated_mt", settings={"check_query_single_value_result": 0}).strip() == "201902_1_1_0\t0\tCannot read all data. Bytes read: 2. Bytes expected: 16." - assert node1.query("CHECK TABLE non_replicated_mt").strip() == "201902_1_1_0\t0\tCannot read all data. Bytes read: 2. Bytes expected: 16." + assert node1.query("CHECK TABLE non_replicated_mt", settings={"check_query_single_value_result": 0}).strip() == "201902_1_1_0\t0\tCannot read all data. Bytes read: 2. Bytes expected: 16." node1.query("INSERT INTO non_replicated_mt VALUES (toDate('2019-01-01'), 1, 10), (toDate('2019-01-01'), 2, 12)") - assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201901") == "201901_2_2_0\t1\t\n" + assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "201901_2_2_0\t1\t\n" corrupt_data_part_on_disk(node1, "non_replicated_mt", "201901_2_2_0") remove_checksums_on_disk(node1, "non_replicated_mt", "201901_2_2_0") - assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201901") == "201901_2_2_0\t0\tCheck of part finished with error: \\'Cannot read all data. Bytes read: 2. 
Bytes expected: 16.\\'\n" + assert node1.query("CHECK TABLE non_replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "201901_2_2_0\t0\tCheck of part finished with error: \\'Cannot read all data. Bytes read: 2. Bytes expected: 16.\\'\n" def test_check_replicated_table_simple(started_cluster): @@ -90,16 +90,16 @@ def test_check_replicated_table_simple(started_cluster): assert node1.query("SELECT count() from replicated_mt") == "2\n" assert node2.query("SELECT count() from replicated_mt") == "2\n" - assert node1.query("CHECK TABLE replicated_mt") == "201902_0_0_0\t1\t\n" - assert node2.query("CHECK TABLE replicated_mt") == "201902_0_0_0\t1\t\n" + assert node1.query("CHECK TABLE replicated_mt", settings={"check_query_single_value_result": 0}) == "201902_0_0_0\t1\t\n" + assert node2.query("CHECK TABLE replicated_mt", settings={"check_query_single_value_result": 0}) == "201902_0_0_0\t1\t\n" node2.query("INSERT INTO replicated_mt VALUES (toDate('2019-01-02'), 3, 10), (toDate('2019-01-02'), 4, 12)") node1.query("SYSTEM SYNC REPLICA replicated_mt") assert node1.query("SELECT count() from replicated_mt") == "4\n" assert node2.query("SELECT count() from replicated_mt") == "4\n" - assert node1.query("CHECK TABLE replicated_mt PARTITION 201901") == "201901_0_0_0\t1\t\n" - assert node2.query("CHECK TABLE replicated_mt PARTITION 201901") == "201901_0_0_0\t1\t\n" + assert node1.query("CHECK TABLE replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "201901_0_0_0\t1\t\n" + assert node2.query("CHECK TABLE replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "201901_0_0_0\t1\t\n" def test_check_replicated_table_corruption(started_cluster): @@ -115,15 +115,15 @@ def test_check_replicated_table_corruption(started_cluster): part_name = node1.query("SELECT name from system.parts where table = 'replicated_mt' and partition_id = '201901' and active = 1").strip() corrupt_data_part_on_disk(node1, "replicated_mt", part_name) - assert node1.query("CHECK TABLE replicated_mt PARTITION 201901") == "{p}\t0\tPart {p} looks broken. Removing it and queueing a fetch.\n".format(p=part_name) + assert node1.query("CHECK TABLE replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "{p}\t0\tPart {p} looks broken. Removing it and queueing a fetch.\n".format(p=part_name) node1.query("SYSTEM SYNC REPLICA replicated_mt") - assert node1.query("CHECK TABLE replicated_mt PARTITION 201901") == "{}\t1\t\n".format(part_name) + assert node1.query("CHECK TABLE replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "{}\t1\t\n".format(part_name) assert node1.query("SELECT count() from replicated_mt") == "4\n" remove_part_from_disk(node2, "replicated_mt", part_name) - assert node2.query("CHECK TABLE replicated_mt PARTITION 201901") == "{p}\t0\tPart {p} looks broken. Removing it and queueing a fetch.\n".format(p=part_name) + assert node2.query("CHECK TABLE replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "{p}\t0\tPart {p} looks broken. 
Removing it and queueing a fetch.\n".format(p=part_name) node1.query("SYSTEM SYNC REPLICA replicated_mt") - assert node1.query("CHECK TABLE replicated_mt PARTITION 201901") == "{}\t1\t\n".format(part_name) + assert node1.query("CHECK TABLE replicated_mt PARTITION 201901", settings={"check_query_single_value_result": 0}) == "{}\t1\t\n".format(part_name) assert node1.query("SELECT count() from replicated_mt") == "4\n" From 27447d4816d78fa8cdd023c27c97f42debbfe82e Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Jul 2019 13:50:59 +0300 Subject: [PATCH 11/14] Fix style --- dbms/src/Interpreters/InterpreterCheckQuery.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Interpreters/InterpreterCheckQuery.cpp b/dbms/src/Interpreters/InterpreterCheckQuery.cpp index 08af398421f..8326a627e6c 100644 --- a/dbms/src/Interpreters/InterpreterCheckQuery.cpp +++ b/dbms/src/Interpreters/InterpreterCheckQuery.cpp @@ -46,7 +46,7 @@ BlockIO InterpreterCheckQuery::execute() Block block; if (context.getSettingsRef().check_query_single_value_result) { - bool result = std::all_of(check_results.begin(), check_results.end(), [](const CheckResult & res ) { return res.success; }); + bool result = std::all_of(check_results.begin(), check_results.end(), [] (const CheckResult & res) { return res.success; }); auto column = ColumnUInt8::create(); column->insertValue(UInt64(result)); block = Block{{std::move(column), std::make_shared(), "result"}}; From e7ded837369b5d0b6026fa008c7a38de4cb2f933 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 16 Jul 2019 14:07:52 +0300 Subject: [PATCH 12/14] Change link for next meetup --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6538fbee0eb..3de9abdc333 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,6 @@ ClickHouse is an open-source column-oriented database management system that all * You can also [fill this form](https://forms.yandex.com/surveys/meet-yandex-clickhouse-team/) to meet Yandex ClickHouse team in person. ## Upcoming Events -* [ClickHouse Meetup in Minsk](https://yandex.ru/promo/metrica/clickhouse-minsk) on July 11. +* [ClickHouse Meetup in Saint Petersburg](https://yandex.ru/promo/clickhouse/saint-petersburg-2019) on July 27. * [ClickHouse Meetup in Shenzhen](https://www.huodongxing.com/event/3483759917300) on October 20. * [ClickHouse Meetup in Shanghai](https://www.huodongxing.com/event/4483760336000) on October 27. From 14b7a018cabfecc34f6217b98838b8f207f388e5 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 16 Jul 2019 14:08:53 +0300 Subject: [PATCH 13/14] Change next meetup link --- website/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/index.html b/website/index.html index e87223555fd..9b799638c26 100644 --- a/website/index.html +++ b/website/index.html @@ -94,7 +94,7 @@
- Upcoming Meetups: Minsk on July 11 and Shenzhen on October 20 + Upcoming Meetups: Saint Petersburg on July 27 and Shenzhen on October 20
From dcd86964666ff34fb994d585dd732e1e9a0e8989 Mon Sep 17 00:00:00 2001
From: Dmitry Rubashkin
Date: Tue, 16 Jul 2019 14:40:11 +0300
Subject: [PATCH 14/14] Minor fixes

---
 .../MergeTree/MergeTreeIndexFullText.cpp      | 37 ++++++++-----------
 .../MergeTree/MergeTreeIndexFullText.h        |  2 +
 2 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
index 43a507ae902..be9994ece64 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
@@ -49,7 +49,17 @@ static void likeStringToBloomFilter(
     while (cur < data.size() && token_extractor->nextLike(data, &cur, token))
         bloom_filter.add(token.c_str(), token.size());
 }
+/// Unified condition for equals, startsWith and endsWith
+bool MergeTreeConditionFullText::createFunctionEqualsCondition(RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
+{
+    out.function = RPNElement::FUNCTION_EQUALS;
+    out.bloom_filter = std::make_unique<StringBloomFilter>(
+        idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
+    const auto & str = value.get<String>();
+    stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
+    return true;
+}
 
 MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText(const MergeTreeIndexFullText & index)
     : IMergeTreeIndexGranule()
@@ -129,13 +139,7 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
         "equals",
         [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
         {
-            out.function = RPNElement::FUNCTION_EQUALS;
-            out.bloom_filter = std::make_unique<StringBloomFilter>(
-                idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
-
-            const auto & str = value.get<String>();
-            stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
-            return true;
+            return createFunctionEqualsCondition(out, value, idx);
         }
     },
     {
@@ -168,26 +172,14 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
         "startsWith",
         [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
         {
-            out.function = RPNElement::FUNCTION_EQUALS;
-            out.bloom_filter = std::make_unique<StringBloomFilter>(
-                idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
-
-            const auto & prefix = value.get<String>();
-            stringToBloomFilter(prefix.c_str(), prefix.size(), idx.token_extractor_func, *out.bloom_filter);
-            return true;
+            return createFunctionEqualsCondition(out, value, idx);
         }
     },
     {
         "endsWith",
         [] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
        {
-            out.function = RPNElement::FUNCTION_EQUALS;
-            out.bloom_filter = std::make_unique<StringBloomFilter>(
-                idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
-
-            const auto & suffix = value.get<String>();
-            stringToBloomFilter(suffix.c_str(), suffix.size(), idx.token_extractor_func, *out.bloom_filter);
-            return true;
+            return createFunctionEqualsCondition(out, value, idx);
         }
     },
     {
@@ -196,6 +188,7 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
         {
             out.function = RPNElement::FUNCTION_MULTI_SEARCH;
 
+            /// A 2d vector is not needed here, but is used because one already exists for FUNCTION_IN
             std::vector<std::vector<StringBloomFilter>> bloom_filters;
             bloom_filters.emplace_back();
             for (const auto & element : value.get<Array>())
@@ -405,7 +398,7 @@ bool MergeTreeConditionFullText::atomFromAST(
     size_t key_arg_pos;           /// Position of argument with key column (non-const argument)
     size_t key_column_num = -1;   /// Number of a key column (inside key_column_names array)
 
-    std::string func_name = func->name;
+    const auto & func_name = func->name;
 
     if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out))
     {
diff --git a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h
index af22b83272f..f6230134596 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h
+++ b/dbms/src/Storages/MergeTree/MergeTreeIndexFullText.h
@@ -117,6 +117,8 @@ private:
     bool getKey(const ASTPtr & node, size_t & key_column_num);
     bool tryPrepareSetBloomFilter(const ASTs & args, RPNElement & out);
 
+    static bool createFunctionEqualsCondition(RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx);
+
     static const AtomMap atom_map;
 
     const MergeTreeIndexFullText & index;