From d808fafa8ff7ba692e5eeda28c0574544cf113cb Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 2 Aug 2019 18:35:34 +0300 Subject: [PATCH 01/13] Disable consecutive key optimization for UInt8/16. These types use a FixedHashMap for aggregation, which makes lookup almost free, so we don't have to cache the last lookup result. This is a part of StringHashMap PR #5417 by Amos Bird. --- dbms/src/Interpreters/Aggregator.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/dbms/src/Interpreters/Aggregator.h b/dbms/src/Interpreters/Aggregator.h index 8c67271b1d5..5fb554f74f4 100644 --- a/dbms/src/Interpreters/Aggregator.h +++ b/dbms/src/Interpreters/Aggregator.h @@ -155,7 +155,9 @@ using AggregatedDataWithNullableStringKeyTwoLevel = AggregationDataWithNullKeyTw /// For the case where there is one numeric key. -template /// UInt8/16/32/64 for any type with corresponding bit width. +/// FieldType is UInt8/16/32/64 for any type with corresponding bit width. +template struct AggregationMethodOneNumber { using Data = TData; @@ -172,7 +174,8 @@ struct AggregationMethodOneNumber AggregationMethodOneNumber(const Other & other) : data(other.data) {} /// To use one `Method` in different threads, use different `State`. - using State = ColumnsHashing::HashMethodOneNumber; + using State = ColumnsHashing::HashMethodOneNumber; /// Use optimization for low cardinality. static const bool low_cardinality_optimization = false; @@ -421,8 +424,10 @@ struct AggregatedDataVariants : private boost::noncopyable */ AggregatedDataWithoutKey without_key = nullptr; - std::unique_ptr> key8; - std::unique_ptr> key16; + // Disable consecutive key optimization for Uint8/16, because they use a FixedHashMap + // and the lookup there is almost free, so we don't need to cache the last lookup result + std::unique_ptr> key8; + std::unique_ptr> key16; std::unique_ptr> key32; std::unique_ptr> key64; From 67d91c4b88ba6817899c951c9d1a21840a03236b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 2 Aug 2019 20:14:04 +0300 Subject: [PATCH 02/13] Fixed the possibility of hanging queries when server is overloaded --- dbms/src/Common/ThreadPool.cpp | 34 +++++++- dbms/src/Common/ThreadPool.h | 10 ++- .../tests/gtest_thread_pool_global_full.cpp | 81 +++++++++++++++++++ 3 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 dbms/src/Common/tests/gtest_thread_pool_global_full.cpp diff --git a/dbms/src/Common/ThreadPool.cpp b/dbms/src/Common/ThreadPool.cpp index cb08fa944a9..ce004ed7674 100644 --- a/dbms/src/Common/ThreadPool.cpp +++ b/dbms/src/Common/ThreadPool.cpp @@ -1,7 +1,6 @@ #include #include -#include #include @@ -34,6 +33,28 @@ ThreadPoolImpl::ThreadPoolImpl(size_t max_threads, size_t max_free_threa { } +template +void ThreadPoolImpl::setMaxThreads(size_t value) +{ + std::lock_guard lock(mutex); + max_threads = value; +} + +template +void ThreadPoolImpl::setMaxFreeThreads(size_t value) +{ + std::lock_guard lock(mutex); + max_free_threads = value; +} + +template +void ThreadPoolImpl::setQueueSize(size_t value) +{ + std::lock_guard lock(mutex); + queue_size = value; +} + + template template ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::optional wait_microseconds) @@ -59,7 +80,7 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::opti auto pred = [this] { return !queue_size || scheduled_jobs < queue_size || shutdown; }; - if (wait_microseconds) + if (wait_microseconds) /// Check for optional. Condition is true if the optional is set and the value is zero. { if (!job_finished.wait_for(lock, std::chrono::microseconds(*wait_microseconds), pred)) return on_error(); @@ -83,6 +104,15 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::opti catch (...) { threads.pop_front(); + + /// Remove the job and return error to caller. + /// Note that if we have allocated at least one thread, we may continue + /// (one thread is enough to process all jobs). + /// But this condition indicate an error nevertheless and better to refuse. + + jobs.pop(); + --scheduled_jobs; + return on_error(); } } } diff --git a/dbms/src/Common/ThreadPool.h b/dbms/src/Common/ThreadPool.h index a0dae3f810c..23c0848e931 100644 --- a/dbms/src/Common/ThreadPool.h +++ b/dbms/src/Common/ThreadPool.h @@ -60,14 +60,18 @@ public: /// Returns number of running and scheduled jobs. size_t active() const; + void setMaxThreads(size_t value); + void setMaxFreeThreads(size_t value); + void setQueueSize(size_t value); + private: mutable std::mutex mutex; std::condition_variable job_finished; std::condition_variable new_job_or_shutdown; - const size_t max_threads; - const size_t max_free_threads; - const size_t queue_size; + size_t max_threads; + size_t max_free_threads; + size_t queue_size; size_t scheduled_jobs = 0; bool shutdown = false; diff --git a/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp b/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp new file mode 100644 index 00000000000..48858fa40ef --- /dev/null +++ b/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp @@ -0,0 +1,81 @@ +#include + +#include + +#include + + +/// Test what happens if local ThreadPool cannot create a ThreadFromGlobalPool. +/// There was a bug: if local ThreadPool cannot allocate even a single thread, +/// the job will be scheduled but never get executed. + + +TEST(ThreadPool, GlobalFull1) +{ + GlobalThreadPool & global_pool = GlobalThreadPool::instance(); + + static constexpr size_t capacity = 5; + + global_pool.setMaxThreads(capacity); + global_pool.setMaxFreeThreads(1); + global_pool.setQueueSize(capacity); + global_pool.wait(); + + std::atomic counter = 0; + static constexpr size_t num_jobs = capacity + 1; + + auto func = [&] { ++counter; while (counter != num_jobs) {} }; + + ThreadPool pool(num_jobs); + + for (size_t i = 0; i < capacity; ++i) + pool.schedule(func); + + for (size_t i = capacity; i < num_jobs; ++i) + { + EXPECT_THROW(pool.schedule(func), DB::Exception); + ++counter; + } + + pool.wait(); + EXPECT_EQ(counter, num_jobs); +} + + +TEST(ThreadPool, GlobalFull2) +{ + GlobalThreadPool & global_pool = GlobalThreadPool::instance(); + + static constexpr size_t capacity = 5; + + global_pool.setMaxThreads(capacity); + global_pool.setMaxFreeThreads(1); + global_pool.setQueueSize(capacity); + + /// ThreadFromGlobalPool from local thread pools from previous test case have exited + /// but their threads from global_pool may not have finished (they still have to exit). + /// If we will not wait here, we can get "Cannot schedule a task exception" earlier than we expect in this test. + global_pool.wait(); + + std::atomic counter = 0; + auto func = [&] { ++counter; while (counter != capacity + 1) {} }; + + ThreadPool pool(capacity, 0, capacity); + for (size_t i = 0; i < capacity; ++i) + pool.schedule(func); + + ThreadPool another_pool(1); + EXPECT_THROW(another_pool.schedule(func), DB::Exception); + + ++counter; + + pool.wait(); + + global_pool.wait(); + + for (size_t i = 0; i < capacity; ++i) + another_pool.schedule([&] { ++counter; }); + + another_pool.wait(); + EXPECT_EQ(counter, capacity * 2 + 1); +} From a5105f85cf563d557e4f8bf0c45a1c7be6cb56cd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 1 Aug 2019 20:04:19 +0300 Subject: [PATCH 03/13] Removed table function "catBoostPool" and storage "CatBoostPool" --- dbms/src/Storages/StorageCatBoostPool.cpp | 303 ------------------ dbms/src/Storages/StorageCatBoostPool.h | 82 ----- .../TableFunctionCatBoostPool.cpp | 56 ---- .../TableFunctionCatBoostPool.h | 21 -- .../TableFunctions/registerTableFunctions.cpp | 2 - 5 files changed, 464 deletions(-) delete mode 100644 dbms/src/Storages/StorageCatBoostPool.cpp delete mode 100644 dbms/src/Storages/StorageCatBoostPool.h delete mode 100644 dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp delete mode 100644 dbms/src/TableFunctions/TableFunctionCatBoostPool.h diff --git a/dbms/src/Storages/StorageCatBoostPool.cpp b/dbms/src/Storages/StorageCatBoostPool.cpp deleted file mode 100644 index a9e2acedcce..00000000000 --- a/dbms/src/Storages/StorageCatBoostPool.cpp +++ /dev/null @@ -1,303 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int CANNOT_OPEN_FILE; - extern const int CANNOT_PARSE_TEXT; - extern const int DATABASE_ACCESS_DENIED; -} - -namespace -{ -class CatBoostDatasetBlockInputStream : public IBlockInputStream -{ -public: - - CatBoostDatasetBlockInputStream(const std::string & file_name, const std::string & format_name, - const Block & sample_block, const Context & context, UInt64 max_block_size) - : file_name(file_name), format_name(format_name) - { - read_buf = std::make_unique(file_name); - reader = FormatFactory::instance().getInput(format_name, *read_buf, sample_block, context, max_block_size); - } - - String getName() const override - { - return "CatBoostDataset"; - } - - Block readImpl() override - { - return reader->read(); - } - - void readPrefixImpl() override - { - reader->readPrefix(); - } - - void readSuffixImpl() override - { - reader->readSuffix(); - } - - Block getHeader() const override { return reader->getHeader(); } - -private: - std::unique_ptr read_buf; - BlockInputStreamPtr reader; - std::string file_name; - std::string format_name; -}; - -} - -static boost::filesystem::path canonicalPath(std::string && path) -{ - return boost::filesystem::canonical(boost::filesystem::path(path)); -} - -static std::string resolvePath(const boost::filesystem::path & base_path, std::string && path) -{ - boost::filesystem::path resolved_path(path); - if (!resolved_path.is_absolute()) - return boost::filesystem::canonical(resolved_path, base_path).string(); - return boost::filesystem::canonical(resolved_path).string(); -} - -static void checkCreationIsAllowed(const String & base_path, const String & path) -{ - if (base_path != path.substr(0, base_path.size())) - throw Exception( - "Using file descriptor or user specified path as source of storage isn't allowed for server daemons", - ErrorCodes::DATABASE_ACCESS_DENIED); -} - - -StorageCatBoostPool::StorageCatBoostPool( - const String & database_name_, - const String & table_name_, - const Context & context, - String column_description_file_name_, - String data_description_file_name_) - : table_name(table_name_) - , database_name(database_name_) - , column_description_file_name(std::move(column_description_file_name_)) - , data_description_file_name(std::move(data_description_file_name_)) -{ - auto base_path = canonicalPath(context.getPath()); - column_description_file_name = resolvePath(base_path, std::move(column_description_file_name)); - data_description_file_name = resolvePath(base_path, std::move(data_description_file_name)); - if (context.getApplicationType() == Context::ApplicationType::SERVER) - { - const auto & base_path_str = base_path.string(); - checkCreationIsAllowed(base_path_str, column_description_file_name); - checkCreationIsAllowed(base_path_str, data_description_file_name); - } - - parseColumnDescription(); - createSampleBlockAndColumns(); -} - -std::string StorageCatBoostPool::getColumnTypesString(const ColumnTypesMap & columnTypesMap) -{ - std::string types_string; - bool first = true; - for (const auto & value : columnTypesMap) - { - if (!first) - types_string.append(", "); - - first = false; - types_string += value.first; - } - - return types_string; -} - -void StorageCatBoostPool::checkDatasetDescription() -{ - std::ifstream in(data_description_file_name); - if (!in.good()) - throw Exception("Cannot open file: " + data_description_file_name, ErrorCodes::CANNOT_OPEN_FILE); - - std::string line; - if (!std::getline(in, line)) - throw Exception("File is empty: " + data_description_file_name, ErrorCodes::CANNOT_PARSE_TEXT); - - size_t columns_count = 1; - for (char sym : line) - if (sym == '\t') - ++columns_count; - - columns_description.resize(columns_count); -} - -void StorageCatBoostPool::parseColumnDescription() -{ - /// NOTE: simple parsing - /// TODO: use ReadBufferFromFile - - checkDatasetDescription(); - - std::ifstream in(column_description_file_name); - if (!in.good()) - throw Exception("Cannot open file: " + column_description_file_name, ErrorCodes::CANNOT_OPEN_FILE); - - std::string line; - size_t line_num = 0; - auto column_types_map = getColumnTypesMap(); - auto column_types_string = getColumnTypesString(column_types_map); - - /// Enumerate default names for columns as Auxiliary, Auxiliary1, Auxiliary2, ... - std::map columns_per_type_count; - - while (std::getline(in, line)) - { - ++line_num; - std::string str_line_num = std::to_string(line_num); - - if (line.empty()) - continue; - - std::istringstream iss(line); - std::vector tokens; - std::string token; - while (std::getline(iss, token, '\t')) - tokens.push_back(token); - - if (tokens.size() != 2 && tokens.size() != 3) - throw Exception("Cannot parse column description at line " + str_line_num + " '" + line + "' " - + ": expected 2 or 3 columns, got " + std::to_string(tokens.size()), - ErrorCodes::CANNOT_PARSE_TEXT); - - std::string str_id = tokens[0]; - std::string col_type = tokens[1]; - std::string col_alias = tokens.size() > 2 ? tokens[2] : ""; - - size_t num_id; - try - { - num_id = std::stoull(str_id); - } - catch (std::exception & e) - { - throw Exception("Cannot parse column index at row " + str_line_num + ": " + e.what(), - ErrorCodes::CANNOT_PARSE_TEXT); - } - - if (num_id >= columns_description.size()) - throw Exception("Invalid index at row " + str_line_num + ": " + str_id - + ", expected in range [0, " + std::to_string(columns_description.size()) + ")", - ErrorCodes::CANNOT_PARSE_TEXT); - - if (column_types_map.count(col_type) == 0) - throw Exception("Invalid column type: " + col_type + ", expected: " + column_types_string, - ErrorCodes::CANNOT_PARSE_TEXT); - - auto type = column_types_map[col_type]; - - std::string col_name; - - bool is_feature_column = type == DatasetColumnType::Num || type == DatasetColumnType::Categ; - auto & col_number = columns_per_type_count[type]; - /// If column is not feature skip '0' after the name (to use 'Target' instead of 'Target0'). - col_name = col_type + (is_feature_column || col_number ? std::to_string(col_number) : ""); - ++col_number; - - columns_description[num_id] = ColumnDescription(col_name, col_alias, type); - } -} - -void StorageCatBoostPool::createSampleBlockAndColumns() -{ - ColumnsDescription columns; - NamesAndTypesList cat_columns; - NamesAndTypesList num_columns; - NamesAndTypesList other_columns; - sample_block.clear(); - - auto get_type = [](DatasetColumnType column_type) -> DataTypePtr - { - if (column_type == DatasetColumnType::Categ - || column_type == DatasetColumnType::Auxiliary - || column_type == DatasetColumnType::DocId) - return std::make_shared(); - else - return std::make_shared(); - }; - - for (auto & desc : columns_description) - { - DataTypePtr type = get_type(desc.column_type); - - if (desc.column_type == DatasetColumnType::Categ) - cat_columns.emplace_back(desc.column_name, type); - else if (desc.column_type == DatasetColumnType::Num) - num_columns.emplace_back(desc.column_name, type); - else - other_columns.emplace_back(desc.column_name, type); - - sample_block.insert(ColumnWithTypeAndName(type, desc.column_name)); - } - - /// Order is important: first numeric columns, then categorial, then all others. - for (const auto & column : num_columns) - columns.add(DB::ColumnDescription(column.name, column.type, false)); - for (const auto & column : cat_columns) - columns.add(DB::ColumnDescription(column.name, column.type, false)); - for (const auto & column : other_columns) - { - DB::ColumnDescription column_desc(column.name, column.type, false); - /// We assign Materialized kind to the column so that it doesn't show in SELECT *. - /// Because the table is readonly, we do not need default expression. - column_desc.default_desc.kind = ColumnDefaultKind::Materialized; - columns.add(std::move(column_desc)); - } - - for (auto & desc : columns_description) - { - if (!desc.alias.empty()) - { - DB::ColumnDescription column(desc.alias, get_type(desc.column_type), false); - column.default_desc.kind = ColumnDefaultKind::Alias; - column.default_desc.expression = std::make_shared(desc.column_name); - columns.add(std::move(column)); - } - } - - setColumns(columns); -} - -BlockInputStreams StorageCatBoostPool::read( - const Names & column_names, - const SelectQueryInfo & /*query_info*/, - const Context & context, - QueryProcessingStage::Enum /*processed_stage*/, - size_t max_block_size, - unsigned /*threads*/) -{ - auto stream = std::make_shared( - data_description_file_name, "TSV", sample_block, context, max_block_size); - - auto filter_stream = std::make_shared(stream, column_names, false); - return { filter_stream }; -} - -} diff --git a/dbms/src/Storages/StorageCatBoostPool.h b/dbms/src/Storages/StorageCatBoostPool.h deleted file mode 100644 index 0cc457fabc0..00000000000 --- a/dbms/src/Storages/StorageCatBoostPool.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -class StorageCatBoostPool : public ext::shared_ptr_helper, public IStorage -{ -public: - std::string getName() const override { return "CatBoostPool"; } - std::string getTableName() const override { return table_name; } - std::string getDatabaseName() const override { return database_name; } - - BlockInputStreams read(const Names & column_names, - const SelectQueryInfo & query_info, - const Context & context, - QueryProcessingStage::Enum processed_stage, - size_t max_block_size, - unsigned threads) override; - -private: - String table_name; - String database_name; - - String column_description_file_name; - String data_description_file_name; - Block sample_block; - - enum class DatasetColumnType - { - Target, - Num, - Categ, - Auxiliary, - DocId, - Weight, - Baseline - }; - - using ColumnTypesMap = std::map; - - ColumnTypesMap getColumnTypesMap() const - { - return - { - {"Target", DatasetColumnType::Target}, - {"Num", DatasetColumnType::Num}, - {"Categ", DatasetColumnType::Categ}, - {"Auxiliary", DatasetColumnType::Auxiliary}, - {"DocId", DatasetColumnType::DocId}, - {"Weight", DatasetColumnType::Weight}, - {"Baseline", DatasetColumnType::Baseline}, - }; - } - - std::string getColumnTypesString(const ColumnTypesMap & columnTypesMap); - - struct ColumnDescription - { - std::string column_name; - std::string alias; - DatasetColumnType column_type; - - ColumnDescription() : column_type(DatasetColumnType::Num) {} - ColumnDescription(std::string column_name, std::string alias, DatasetColumnType column_type) - : column_name(std::move(column_name)), alias(std::move(alias)), column_type(column_type) {} - }; - - std::vector columns_description; - - void checkDatasetDescription(); - void parseColumnDescription(); - void createSampleBlockAndColumns(); - -protected: - StorageCatBoostPool(const String & database_name_, const String & table_name_, const Context & context, String column_description_file_name, String data_description_file_name); -}; - -} diff --git a/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp b/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp deleted file mode 100644 index 74fab72fd19..00000000000 --- a/dbms/src/TableFunctions/TableFunctionCatBoostPool.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int BAD_ARGUMENTS; -} - - -StoragePtr TableFunctionCatBoostPool::executeImpl(const ASTPtr & ast_function, const Context & context, const std::string & table_name) const -{ - ASTs & args_func = ast_function->children; - - std::string err = "Table function '" + getName() + "' requires 2 parameters: " - + "column descriptions file, dataset description file"; - - if (args_func.size() != 1) - throw Exception(err, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - ASTs & args = args_func.at(0)->children; - - if (args.size() != 2) - throw Exception(err, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - auto getStringLiteral = [](const IAST & node, const char * description) - { - const auto * lit = node.as(); - if (!lit) - throw Exception(description + String(" must be string literal (in single quotes)."), ErrorCodes::BAD_ARGUMENTS); - - if (lit->value.getType() != Field::Types::String) - throw Exception(description + String(" must be string literal (in single quotes)."), ErrorCodes::BAD_ARGUMENTS); - - return safeGet(lit->value); - }; - String column_descriptions_file = getStringLiteral(*args[0], "Column descriptions file"); - String dataset_description_file = getStringLiteral(*args[1], "Dataset description file"); - - return StorageCatBoostPool::create(getDatabaseName(), table_name, context, column_descriptions_file, dataset_description_file); -} - -void registerTableFunctionCatBoostPool(TableFunctionFactory & factory) -{ - factory.registerFunction(); -} - -} diff --git a/dbms/src/TableFunctions/TableFunctionCatBoostPool.h b/dbms/src/TableFunctions/TableFunctionCatBoostPool.h deleted file mode 100644 index cf93308b60c..00000000000 --- a/dbms/src/TableFunctions/TableFunctionCatBoostPool.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include - - -namespace DB -{ - -/* catboostPool('column_descriptions_file', 'dataset_description_file') - * Create storage from CatBoost dataset. - */ -class TableFunctionCatBoostPool : public ITableFunction -{ -public: - static constexpr auto name = "catBoostPool"; - std::string getName() const override { return name; } -private: - StoragePtr executeImpl(const ASTPtr & ast_function, const Context & context, const std::string & table_name) const override; -}; - -} diff --git a/dbms/src/TableFunctions/registerTableFunctions.cpp b/dbms/src/TableFunctions/registerTableFunctions.cpp index 76eeb23f6cc..61d0ec23f7d 100644 --- a/dbms/src/TableFunctions/registerTableFunctions.cpp +++ b/dbms/src/TableFunctions/registerTableFunctions.cpp @@ -10,7 +10,6 @@ namespace DB void registerTableFunctionMerge(TableFunctionFactory & factory); void registerTableFunctionRemote(TableFunctionFactory & factory); void registerTableFunctionNumbers(TableFunctionFactory & factory); -void registerTableFunctionCatBoostPool(TableFunctionFactory & factory); void registerTableFunctionFile(TableFunctionFactory & factory); void registerTableFunctionURL(TableFunctionFactory & factory); void registerTableFunctionValues(TableFunctionFactory & factory); @@ -37,7 +36,6 @@ void registerTableFunctions() registerTableFunctionMerge(factory); registerTableFunctionRemote(factory); registerTableFunctionNumbers(factory); - registerTableFunctionCatBoostPool(factory); registerTableFunctionFile(factory); registerTableFunctionURL(factory); registerTableFunctionValues(factory); From 5e942d2d797484e79d62e6996106ae42aee77ab9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 2 Aug 2019 23:16:02 +0300 Subject: [PATCH 04/13] Added performance test to show degradation of performance in gcc-9 in more isolated way --- .../empty_string_deserialization.xml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 dbms/tests/performance/empty_string_deserialization.xml diff --git a/dbms/tests/performance/empty_string_deserialization.xml b/dbms/tests/performance/empty_string_deserialization.xml new file mode 100644 index 00000000000..99e7470c5e6 --- /dev/null +++ b/dbms/tests/performance/empty_string_deserialization.xml @@ -0,0 +1,20 @@ + + loop + + + + 10 + + + + + + CREATE TABLE empty_strings (s String) ENGINE = Log; + INSERT INTO empty_strings SELECT '' FROM numbers(1000000000); + + SELECT count() FROM empty_strings + + DROP TABLE IF EXISTS empty_strings + From 2fb71013af23c056344d61308be56607b65bd60f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 2 Aug 2019 23:18:03 +0300 Subject: [PATCH 05/13] Added performance test to show degradation of performance in gcc-9 in more isolated way --- .../performance/empty_string_serialization.xml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 dbms/tests/performance/empty_string_serialization.xml diff --git a/dbms/tests/performance/empty_string_serialization.xml b/dbms/tests/performance/empty_string_serialization.xml new file mode 100644 index 00000000000..46c4bc0275c --- /dev/null +++ b/dbms/tests/performance/empty_string_serialization.xml @@ -0,0 +1,17 @@ + + loop + + + + 10 + + + + + + CREATE TABLE empty_strings (s String) ENGINE = Log; + INSERT INTO empty_strings SELECT '' FROM numbers(100000000); + DROP TABLE IF EXISTS empty_strings + From 21cb7de777656c09d4c08ad903e8dbcced61aada Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 2 Aug 2019 23:19:06 +0300 Subject: [PATCH 06/13] More validation of part_name --- dbms/src/Storages/MergeTree/DataPartsExchange.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dbms/src/Storages/MergeTree/DataPartsExchange.cpp b/dbms/src/Storages/MergeTree/DataPartsExchange.cpp index 80e521b32c1..f01b384d441 100644 --- a/dbms/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/dbms/src/Storages/MergeTree/DataPartsExchange.cpp @@ -55,6 +55,9 @@ void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & /*bo String part_name = params.get("part"); + /// Validation of the input that may come from malicious replica. + MergeTreePartInfo::fromPartName(part_name, data.format_version); + static std::atomic_uint total_sends {0}; if ((data.settings.replicated_max_parallel_sends && total_sends >= data.settings.replicated_max_parallel_sends) @@ -169,6 +172,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( bool to_detached, const String & tmp_prefix_) { + /// Validation of the input that may come from malicious replica. + MergeTreePartInfo::fromPartName(part_name, data.format_version); + Poco::URI uri; uri.setScheme(interserver_scheme); uri.setHost(host); From 5774ef1ed1b587d8e0245c6f2fc1e84061248ec5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Aug 2019 00:49:50 +0300 Subject: [PATCH 07/13] Removed test --- .../00689_catboost_pool_files.reference | 1 - .../0_stateless/00689_catboost_pool_files.sh | 32 ------------------- 2 files changed, 33 deletions(-) delete mode 100644 dbms/tests/queries/0_stateless/00689_catboost_pool_files.reference delete mode 100755 dbms/tests/queries/0_stateless/00689_catboost_pool_files.sh diff --git a/dbms/tests/queries/0_stateless/00689_catboost_pool_files.reference b/dbms/tests/queries/0_stateless/00689_catboost_pool_files.reference deleted file mode 100644 index e965047ad7c..00000000000 --- a/dbms/tests/queries/0_stateless/00689_catboost_pool_files.reference +++ /dev/null @@ -1 +0,0 @@ -Hello diff --git a/dbms/tests/queries/0_stateless/00689_catboost_pool_files.sh b/dbms/tests/queries/0_stateless/00689_catboost_pool_files.sh deleted file mode 100755 index b859a7c3e34..00000000000 --- a/dbms/tests/queries/0_stateless/00689_catboost_pool_files.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -. $CURDIR/../shell_config.sh - -${CLICKHOUSE_CLIENT} --query="drop table if exists catboost_pool_desc;" -${CLICKHOUSE_CLIENT} --query="drop table if exists catboost_pool_vals;" -${CLICKHOUSE_CLIENT} --query="create table catboost_pool_desc (id String, type String) engine = File(TSV);" -${CLICKHOUSE_CLIENT} --query="insert into catboost_pool_desc select '0', 'Categ';" -${CLICKHOUSE_CLIENT} --query="create table catboost_pool_vals (str String) engine = File(TSV);" -${CLICKHOUSE_CLIENT} --query="insert into catboost_pool_vals select 'Hello';" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', 'data/${CLICKHOUSE_DATABASE}/catboost_pool_vals/data.TSV');" - -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../../../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('data/${CLICKHOUSE_DATABASE}/catboost_pool_desc/data.TSV', '../../../../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" - -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('${CURDIR}/00689_file.txt', '${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('../${CURDIR}/00689_file.txt', '../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('../../${CURDIR}/00689_file.txt', '../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('../../../${CURDIR}/00689_file.txt', '../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('../../../../${CURDIR}/00689_file.txt', '../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('../../../../../${CURDIR}/00689_file.txt', '../../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" -${CLICKHOUSE_CLIENT} --query="select * from catBoostPool('../../../../../../${CURDIR}/00689_file.txt', '../../../../../../${CURDIR}/00689_file.txt');" 2>&1 | grep -o "Data" - -${CLICKHOUSE_CLIENT} --query="drop table if exists catboost_pool_desc;" -${CLICKHOUSE_CLIENT} --query="drop table if exists catboost_pool_vals;" From 62053314bbbe5f7ce33f5c0486f662afb7351f1b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Aug 2019 01:40:28 +0300 Subject: [PATCH 08/13] Fixed FPE in yandexConsistentHash --- .../Functions/FunctionsConsistentHashing.h | 24 ++++++++++--------- libs/consistent-hashing/consistent_hashing.h | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/dbms/src/Functions/FunctionsConsistentHashing.h b/dbms/src/Functions/FunctionsConsistentHashing.h index b7282c10dba..31a292bb9f2 100644 --- a/dbms/src/Functions/FunctionsConsistentHashing.h +++ b/dbms/src/Functions/FunctionsConsistentHashing.h @@ -29,11 +29,12 @@ struct YandexConsistentHashImpl static constexpr auto name = "yandexConsistentHash"; using HashType = UInt64; - /// Actually it supports UInt64, but it is effective only if n < 65536 - using ResultType = UInt32; - using BucketsCountType = ResultType; + /// Actually it supports UInt64, but it is efficient only if n <= 32768 + using ResultType = UInt16; + using BucketsType = ResultType; + static constexpr auto max_buckets = 32768; - static inline ResultType apply(UInt64 hash, BucketsCountType n) + static inline ResultType apply(UInt64 hash, BucketsType n) { return ConsistentHashing(hash, n); } @@ -59,9 +60,10 @@ struct JumpConsistentHashImpl using HashType = UInt64; using ResultType = Int32; - using BucketsCountType = ResultType; + using BucketsType = ResultType; + static constexpr auto max_buckets = static_cast(std::numeric_limits::max()); - static inline ResultType apply(UInt64 hash, BucketsCountType n) + static inline ResultType apply(UInt64 hash, BucketsType n) { return JumpConsistentHash(hash, n); } @@ -74,9 +76,10 @@ struct SumburConsistentHashImpl using HashType = UInt32; using ResultType = UInt16; - using BucketsCountType = ResultType; + using BucketsType = ResultType; + static constexpr auto max_buckets = static_cast(std::numeric_limits::max()); - static inline ResultType apply(HashType hash, BucketsCountType n) + static inline ResultType apply(HashType hash, BucketsType n) { return static_cast(sumburConsistentHash(hash, n)); } @@ -143,8 +146,7 @@ public: private: using HashType = typename Impl::HashType; using ResultType = typename Impl::ResultType; - using BucketsType = typename Impl::BucketsCountType; - static constexpr auto max_buckets = static_cast(std::numeric_limits::max()); + using BucketsType = typename Impl::BucketsType; template inline BucketsType checkBucketsRange(T buckets) @@ -153,7 +155,7 @@ private: throw Exception( "The second argument of function " + getName() + " (number of buckets) must be positive number", ErrorCodes::BAD_ARGUMENTS); - if (unlikely(static_cast(buckets) > max_buckets)) + if (unlikely(static_cast(buckets) > Impl::max_buckets)) throw Exception("The value of the second argument of function " + getName() + " (number of buckets) is not fit to " + DataTypeNumber().getName(), ErrorCodes::BAD_ARGUMENTS); diff --git a/libs/consistent-hashing/consistent_hashing.h b/libs/consistent-hashing/consistent_hashing.h index fba229c2bd4..a779dc2f3a6 100644 --- a/libs/consistent-hashing/consistent_hashing.h +++ b/libs/consistent-hashing/consistent_hashing.h @@ -15,5 +15,5 @@ * It requires O(1) memory and cpu to calculate. So, it is faster than classic * consistent hashing algos with points on circle. */ -std::size_t ConsistentHashing(std::uint64_t x, std::size_t n); // Works good for n < 65536 -std::size_t ConsistentHashing(std::uint64_t lo, std::uint64_t hi, std::size_t n); // Works good for n < 4294967296 +std::size_t ConsistentHashing(std::uint64_t x, std::size_t n); // Works for n <= 32768 +std::size_t ConsistentHashing(std::uint64_t lo, std::uint64_t hi, std::size_t n); // Works for n <= 2^31 From 7394d3e73a91bd4ed0b21b7151333c5b7a56abce Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Aug 2019 01:42:21 +0300 Subject: [PATCH 09/13] Fixed exception message --- dbms/src/Functions/FunctionsConsistentHashing.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsConsistentHashing.h b/dbms/src/Functions/FunctionsConsistentHashing.h index 31a292bb9f2..04640e0ec57 100644 --- a/dbms/src/Functions/FunctionsConsistentHashing.h +++ b/dbms/src/Functions/FunctionsConsistentHashing.h @@ -156,9 +156,8 @@ private: "The second argument of function " + getName() + " (number of buckets) must be positive number", ErrorCodes::BAD_ARGUMENTS); if (unlikely(static_cast(buckets) > Impl::max_buckets)) - throw Exception("The value of the second argument of function " + getName() + " (number of buckets) is not fit to " - + DataTypeNumber().getName(), - ErrorCodes::BAD_ARGUMENTS); + throw Exception("The value of the second argument of function " + getName() + " (number of buckets) must not be greater than " + + std::to_string(Impl::max_buckets), ErrorCodes::BAD_ARGUMENTS); return static_cast(buckets); } From 9cdbc6c663e1fd00c35a6be57d06969acb62e36f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Aug 2019 01:45:25 +0300 Subject: [PATCH 10/13] Added a test --- .../0_stateless/00979_yandex_consistent_hash_fpe.reference | 0 .../queries/0_stateless/00979_yandex_consistent_hash_fpe.sql | 1 + 2 files changed, 1 insertion(+) create mode 100644 dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.reference create mode 100644 dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.sql diff --git a/dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.reference b/dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.sql b/dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.sql new file mode 100644 index 00000000000..79fabeae7ef --- /dev/null +++ b/dbms/tests/queries/0_stateless/00979_yandex_consistent_hash_fpe.sql @@ -0,0 +1 @@ +SELECT yandexConsistentHash(-1, 40000); -- { serverError 36 } From 61541a76e66b324475f5aa25400e78fc42a0b8be Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Aug 2019 03:15:17 +0300 Subject: [PATCH 11/13] Update hyperscan; avoid hyperscan rebuilds --- contrib/hyperscan | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/hyperscan b/contrib/hyperscan index 01e6b83f9fb..3058c9c20cb 160000 --- a/contrib/hyperscan +++ b/contrib/hyperscan @@ -1 +1 @@ -Subproject commit 01e6b83f9fbdb4020cd68a5287bf3a0471eeb272 +Subproject commit 3058c9c20cba3accdf92544d8513a26240c4ff70 From 9f6e26f14d805c0b70308cb59a4bb61fabe79561 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 3 Aug 2019 04:10:13 +0300 Subject: [PATCH 12/13] Fixed tests --- dbms/src/Common/tests/gtest_thread_pool_global_full.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp b/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp index 48858fa40ef..5ed84b93a3c 100644 --- a/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp +++ b/dbms/src/Common/tests/gtest_thread_pool_global_full.cpp @@ -39,6 +39,10 @@ TEST(ThreadPool, GlobalFull1) pool.wait(); EXPECT_EQ(counter, num_jobs); + + global_pool.setMaxThreads(10000); + global_pool.setMaxFreeThreads(1000); + global_pool.setQueueSize(10000); } @@ -78,4 +82,8 @@ TEST(ThreadPool, GlobalFull2) another_pool.wait(); EXPECT_EQ(counter, capacity * 2 + 1); + + global_pool.setMaxThreads(10000); + global_pool.setMaxFreeThreads(1000); + global_pool.setQueueSize(10000); } From fe90d499a3bd81495ec747e7ab3a851dd61971f1 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 3 Aug 2019 05:53:34 +0300 Subject: [PATCH 13/13] Update index.html --- website/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/index.html b/website/index.html index 44c5549fccc..26090e5d857 100644 --- a/website/index.html +++ b/website/index.html @@ -416,7 +416,7 @@ clickhouse-client