From 99995f55bc28224facc13baf4557f1ce6e5969f7 Mon Sep 17 00:00:00 2001
From: hexiaoting
Date: Thu, 7 Jul 2022 17:26:58 +0800
Subject: [PATCH 01/69] Allow more situations with final to push where to
 prewhere

---
 .../MergeTree/MergeTreeWhereOptimizer.cpp     | 53 +++++--------------
 .../MergeTree/MergeTreeWhereOptimizer.h       |  5 +-
 ...der_key_to_prewhere_select_final.reference | 48 +++++++++++++++++
 ...ove_order_key_to_prewhere_select_final.sql | 14 ++++-
 4 files changed, 76 insertions(+), 44 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
index b3ff05a960a..5a7030ad5c2 100644
--- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
+++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
@@ -38,6 +38,8 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
     , queried_columns{queried_columns_}
     , sorting_key_names{NameSet(
          metadata_snapshot->getSortingKey().column_names.begin(), metadata_snapshot->getSortingKey().column_names.end())}
+    , pk_names{NameSet(
+         metadata_snapshot->getPrimaryKey().column_names.begin(), metadata_snapshot->getPrimaryKey().column_names.end())}
     , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query->clone(), query_info.syntax_analyzer_result, context)}
     , log{log_}
     , column_sizes{std::move(column_sizes_)}
@@ -193,8 +195,8 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node,
             /// Condition depends on some column. Constant expressions are not moved.
             !cond.identifiers.empty()
             && !cannotBeMoved(node, is_final)
-            /// Do not take into consideration the conditions consisting only of the first primary key column
-            && !hasPrimaryKeyAtoms(node)
+            /// Do not take into consideration conditions made up of columns that do not belong to the primary key
+            && isColumnAllPrimaryKey(node)
             /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases were expanded.
             && isSubsetOfTableColumns(cond.identifiers)
             /// Do not move conditions involving all queried columns.
@@ -271,7 +273,6 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const
         /// Move the best condition to PREWHERE if it is viable.
         auto it = std::min_element(where_conditions.begin(), where_conditions.end());
-
         if (!it->viable)
             break;
 
@@ -289,7 +290,6 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const
             moved_enough = total_number_of_moved_columns > 0
                 && (total_number_of_moved_columns + it->identifiers.size()) * 4 > queried_columns.size();
         }
-
         if (moved_enough)
             break;
 
@@ -320,48 +320,21 @@ UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identif
     return size;
 }
 
-
-bool MergeTreeWhereOptimizer::hasPrimaryKeyAtoms(const ASTPtr & ast) const
+bool MergeTreeWhereOptimizer::isColumnAllPrimaryKey(const ASTPtr & ast) const
 {
     if (const auto * func = ast->as<ASTFunction>())
     {
         const auto & args = func->arguments->children;
-
-        if ((func->name == "not" && 1 == args.size()) || func->name == "and" || func->name == "or")
-        {
-            for (const auto & arg : args)
-                if (hasPrimaryKeyAtoms(arg))
-                    return true;
-
-            return false;
-        }
+        for (const auto & arg : args)
+            if (!isColumnAllPrimaryKey(arg))
+                return false;
+        return true;
     }
 
-    return isPrimaryKeyAtom(ast);
-}
-
-
-bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const
-{
-    if (const auto * func = ast->as<ASTFunction>())
-    {
-        if (!KeyCondition::atom_map.contains(func->name))
-            return false;
-
-        const auto & args = func->arguments->children;
-        if (args.size() != 2)
-            return false;
-
-        const auto & first_arg_name = args.front()->getColumnName();
-        const auto & second_arg_name = args.back()->getColumnName();
-
-        if ((first_primary_key_column == first_arg_name && isConstant(args[1]))
-            || (first_primary_key_column == second_arg_name && isConstant(args[0]))
-            || (first_primary_key_column == first_arg_name && functionIsInOrGlobalInOperator(func->name)))
-            return true;
-    }
-
-    return false;
+    if (isConstant(ast) || pk_names.contains(ast->getColumnName()))
+        return true;
+    else
+        return false;
 }
 
diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h
index fa14fea94d1..ee18ffa4558 100644
--- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h
+++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h
@@ -83,9 +83,7 @@ private:
 
     UInt64 getIdentifiersColumnSize(const NameSet & identifiers) const;
 
-    bool hasPrimaryKeyAtoms(const ASTPtr & ast) const;
-
-    bool isPrimaryKeyAtom(const ASTPtr & ast) const;
+    bool isColumnAllPrimaryKey(const ASTPtr & ast) const;
 
     bool isSortingKey(const String & column_name) const;
 
@@ -109,6 +107,7 @@ private:
     const StringSet table_columns;
     const Names queried_columns;
     const NameSet sorting_key_names;
+    const NameSet pk_names;
     const Block block_with_constants;
     Poco::Logger * log;
     std::unordered_map<std::string, UInt64> column_sizes;
diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference
index 71d10397326..98c76cc2a50 100644
--- a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference
+++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference
@@ -1,5 +1,20 @@
 optimize_move_to_prewhere_if_final = 1
 
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+PREWHERE x > 100
+
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+FINAL
+PREWHERE x > 100
+
 SELECT
     x,
     y,
@@ -15,6 +30,21 @@ FROM prewhere_move_select_final
 FINAL
 PREWHERE y > 100
 
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+PREWHERE (x + y) > 100
+
+SELECT
+    x,
+    y,
+    z
+FROM prewhere_move_select_final
+FINAL
+PREWHERE (x + y) > 100
+
 SELECT
     x,
     y,
@@ -32,6 +62,24
@@ FINAL PREWHERE y > 100 WHERE (y > 100) AND (z > 400) +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +PREWHERE x > 50 +WHERE (x > 50) AND (z > 400) + +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +PREWHERE (x + y) > 50 +WHERE ((x + y) > 50) AND (z > 400) + optimize_move_to_prewhere_if_final = 0 SELECT diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql index 789892dbd38..ede15738c5b 100644 --- a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql +++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql @@ -11,17 +11,29 @@ SET optimize_move_to_prewhere_if_final = 1; -- order key can be pushed down with final select ''; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE x > 100; +select ''; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x > 100; +select ''; EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE y > 100; select ''; EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100; +select ''; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE x + y > 100; +select ''; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x + y > 100; -- can not be pushed down select ''; EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE z > 400; --- only y can be pushed down +-- only condition with x/y can be pushed down select ''; EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100 and z > 400; +select ''; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x > 50 and z > 400; +select ''; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x + y > 50 and z > 400; select ''; select 'optimize_move_to_prewhere_if_final = 0'; From 009fd0cebccc4dc1e92ae7f58f0ea496035c322c Mon Sep 17 00:00:00 2001 From: hexiaoting Date: Thu, 7 Jul 2022 17:30:45 +0800 Subject: [PATCH 02/69] fix style --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 5a7030ad5c2..b36f17a9851 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -273,6 +273,7 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const /// Move the best condition to PREWHERE if it is viable. 
         auto it = std::min_element(where_conditions.begin(), where_conditions.end());
+
         if (!it->viable)
             break;
 
@@ -290,6 +291,7 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const
             moved_enough = total_number_of_moved_columns > 0
                 && (total_number_of_moved_columns + it->identifiers.size()) * 4 > queried_columns.size();
         }
+
         if (moved_enough)
             break;

From ee4137cd7864ddb2bba25f5481d45422e079efa1 Mon Sep 17 00:00:00 2001
From: hexiaoting
Date: Fri, 8 Jul 2022 11:05:01 +0800
Subject: [PATCH 03/69] fix bug

---
 .../MergeTree/MergeTreeWhereOptimizer.cpp        | 16 +++++++++-------
 src/Storages/MergeTree/MergeTreeWhereOptimizer.h |  3 +--
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
index b36f17a9851..3906b5e0c48 100644
--- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
+++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
@@ -38,8 +38,6 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
     , queried_columns{queried_columns_}
     , sorting_key_names{NameSet(
          metadata_snapshot->getSortingKey().column_names.begin(), metadata_snapshot->getSortingKey().column_names.end())}
-    , pk_names{NameSet(
-         metadata_snapshot->getPrimaryKey().column_names.begin(), metadata_snapshot->getPrimaryKey().column_names.end())}
     , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query->clone(), query_info.syntax_analyzer_result, context)}
     , log{log_}
     , column_sizes{std::move(column_sizes_)}
@@ -195,8 +193,8 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node,
             /// Condition depends on some column. Constant expressions are not moved.
             !cond.identifiers.empty()
             && !cannotBeMoved(node, is_final)
-            /// Do not take into consideration conditions made up of columns that do not belong to the primary key
-            && isColumnAllPrimaryKey(node)
+            /// If FINAL is used, do not take into consideration conditions made up of columns that do not belong to the sorting key
+            && (!is_final || isExpressionOverSortingKey(node))
             /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases were expanded.
             && isSubsetOfTableColumns(cond.identifiers)
             /// Do not move conditions involving all queried columns.
@@ -322,18 +320,22 @@ UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identif
     return size;
 }
 
-bool MergeTreeWhereOptimizer::isColumnAllPrimaryKey(const ASTPtr & ast) const
+bool MergeTreeWhereOptimizer::isExpressionOverSortingKey(const ASTPtr & ast) const
 {
     if (const auto * func = ast->as<ASTFunction>())
     {
         const auto & args = func->arguments->children;
         for (const auto & arg : args)
-            if (!isColumnAllPrimaryKey(arg))
+        {
+            if (isConstant(ast) || sorting_key_names.contains(arg->getColumnName()))
+                continue;
+            if (!isExpressionOverSortingKey(arg))
                 return false;
+        }
         return true;
     }
 
-    if (isConstant(ast) || pk_names.contains(ast->getColumnName()))
+    if (isConstant(ast) || sorting_key_names.contains(ast->getColumnName()))
        return true;
     else
        return false;
 }
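The new isExpressionOverSortingKey check implements a simple rule: an expression may move to PREWHERE under FINAL only if every leaf of its AST is either a constant or a sorting-key column (rows that collapse during FINAL share the sorting key, so such a predicate evaluates identically for all of them). A minimal standalone sketch of the same traversal, using a hypothetical simplified Node type in place of ClickHouse's ASTPtr (all names here are illustrative, not the real API):

#include <memory>
#include <set>
#include <string>
#include <vector>

/// Hypothetical, simplified stand-in for the AST: a leaf is either a column
/// identifier or a constant; an inner node is a function call over children.
struct Node
{
    bool is_constant = false;
    std::string column;                       /// set for identifier leaves
    std::vector<std::shared_ptr<Node>> args;  /// set for function nodes
};

/// Mirrors the recursion above: true iff every leaf is a constant
/// or a sorting-key column.
bool overSortingKey(const Node & node, const std::set<std::string> & sorting_key)
{
    if (!node.args.empty())
    {
        for (const auto & arg : node.args)
            if (!overSortingKey(*arg, sorting_key))
                return false;
        return true;
    }
    return node.is_constant || sorting_key.contains(node.column);
}

For example, with sorting key (x, y), the predicate x + y > 100 passes the check (its leaves are x, y and 100), while z > 400 does not, which is exactly what the test cases below exercise.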
diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h
index ee18ffa4558..277994cbd28 100644
--- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h
+++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h
@@ -83,7 +83,7 @@ private:
 
     UInt64 getIdentifiersColumnSize(const NameSet & identifiers) const;
 
-    bool isColumnAllPrimaryKey(const ASTPtr & ast) const;
+    bool isExpressionOverSortingKey(const ASTPtr & ast) const;
 
     bool isSortingKey(const String & column_name) const;
 
@@ -107,7 +107,6 @@ private:
     const StringSet table_columns;
     const Names queried_columns;
     const NameSet sorting_key_names;
-    const NameSet pk_names;
     const Block block_with_constants;
     Poco::Logger * log;
     std::unordered_map<std::string, UInt64> column_sizes;

From f19511a80a92cb7728f6c17c31f915bb52731573 Mon Sep 17 00:00:00 2001
From: hexiaoting
Date: Fri, 8 Jul 2022 18:03:22 +0800
Subject: [PATCH 04/69] fix test cases

---
 .../00808_not_optimize_predicate.reference    |  2 +-
 .../01786_explain_merge_tree.reference        | 18 ++++++------
 ...02149_read_in_order_fixed_prefix.reference | 27 ++++++++++---------
 .../02149_read_in_order_fixed_prefix.sql      |  1 +
 4 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/tests/queries/0_stateless/00808_not_optimize_predicate.reference b/tests/queries/0_stateless/00808_not_optimize_predicate.reference
index 647c6d91890..3110b82db89 100644
--- a/tests/queries/0_stateless/00808_not_optimize_predicate.reference
+++ b/tests/queries/0_stateless/00808_not_optimize_predicate.reference
@@ -24,6 +24,6 @@ FROM
         n,
         finalizeAggregation(s)
     FROM test_00808_push_down_with_finalizeAggregation
-    WHERE (n <= 5) AND (n >= 2)
+    PREWHERE (n <= 5) AND (n >= 2)
 )
 WHERE (n >= 2) AND (n <= 5)
diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.reference b/tests/queries/0_stateless/01786_explain_merge_tree.reference
index 7e0a91b203f..f5efce1fcfc 100644
--- a/tests/queries/0_stateless/01786_explain_merge_tree.reference
+++ b/tests/queries/0_stateless/01786_explain_merge_tree.reference
@@ -97,12 +97,12 @@
             ReadType: InReverseOrder
             Parts: 1
             Granules: 3
-    ReadFromMergeTree (default.idx)
-    Indexes:
-      PrimaryKey
-        Keys:
-          x
-          plus(x, y)
-        Condition: or((x in 2-element set), (plus(plus(x, y), 1) in (-Inf, 2]))
-        Parts: 1/1
-        Granules: 1/1
+        ReadFromMergeTree (default.idx)
+        Indexes:
+          PrimaryKey
+            Keys:
+              x
+              plus(x, y)
+            Condition: or((x in 2-element set), (plus(plus(x, y), 1) in (-Inf, 2]))
+            Parts: 1/1
+            Granules: 1/1
diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference
index 437b934c28c..e787cadd048 100644
--- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference
+++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference
@@ -60,10 +60,8 @@ ExpressionTransform
       (Sorting)
       (Expression)
       ExpressionTransform
-        (Filter)
-        FilterTransform
-          (ReadFromMergeTree)
-          MergeTreeInOrder 0 → 1
+        (ReadFromMergeTree)
+        MergeTreeInOrder 0 → 1
 2020-10-11 0 0
 2020-10-11 0 10
 2020-10-11 0 20
@@ -78,15 +76,20 @@ ExpressionTransform
     PartialSortingTransform
       (Expression)
       ExpressionTransform
-        (Filter)
-        FilterTransform
-          (ReadFromMergeTree)
-          MergeTreeInOrder 0 → 1
+        (ReadFromMergeTree)
+        MergeTreeInOrder 0 → 1
 2020-10-12 0
 2020-10-12 1
 2020-10-12 2
 2020-10-12 3
 2020-10-12 4
+SELECT
+    date,
+    i
+FROM t_read_in_order
+PREWHERE date = \'2020-10-12\'
+ORDER BY i DESC
+LIMIT 5
 (Expression)
 ExpressionTransform
   (Limit)
@@ -94,11 +97,9 @@ ExpressionTransform
     (Sorting)
     (Expression)
     ExpressionTransform
-      (Filter)
-      FilterTransform
-        (ReadFromMergeTree)
-        ReverseTransform
-          MergeTreeReverse 0 → 1
+      (ReadFromMergeTree)
+      ReverseTransform
+        MergeTreeReverse 0 → 1
 2020-10-12 99999
 2020-10-12 99998
 2020-10-12 99997
diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql
index 4dfcbb9bf80..c141f6601c2 100644
--- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql
+++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql
@@ -30,6 +30,7 @@ INSERT INTO t_read_in_order SELECT '2020-10-12', number, number FROM numbers(100
 
 SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i LIMIT 5;
 
+EXPLAIN SYNTAX SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i DESC LIMIT 5;
 EXPLAIN PIPELINE SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i DESC LIMIT 5;
 
 SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i DESC LIMIT 5;

From b589f8733801e0b5bccf7bdaadc9526159cfd000 Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov
Date: Mon, 18 Jul 2022 10:52:27 +0000
Subject: [PATCH 05/69] Do not sleep at key condition analysis.

---
 src/Functions/sleep.h                   | 22 +++++++++++++++++-----
 src/Storages/MergeTree/KeyCondition.cpp |  2 +-
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/Functions/sleep.h b/src/Functions/sleep.h
index 6582f75b0e8..9ec1f6c784f 100644
--- a/src/Functions/sleep.h
+++ b/src/Functions/sleep.h
@@ -77,8 +77,17 @@ public:
         return std::make_shared<DataTypeUInt8>();
     }
 
+    ColumnPtr executeImplDryRun(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
+    {
+        return execute(arguments, result_type, true);
+    }
 
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
+    {
+        return execute(arguments, result_type, false);
+    }
+
+    ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, bool dry_run) const
     {
         const IColumn * col = arguments[0].column.get();
 
@@ -99,11 +108,14 @@ public:
             if (seconds > 3.0)   /// The choice is arbitrary
                 throw Exception("The maximum sleep time is 3 seconds. Requested: " + toString(seconds), ErrorCodes::TOO_SLOW);
 
-            UInt64 count = (variant == FunctionSleepVariant::PerBlock ? 1 : size);
-            UInt64 microseconds = seconds * count * 1e6;
-            sleepForMicroseconds(microseconds);
-            ProfileEvents::increment(ProfileEvents::SleepFunctionCalls, count);
-            ProfileEvents::increment(ProfileEvents::SleepFunctionMicroseconds, microseconds);
+            if (!dry_run)
+            {
+                UInt64 count = (variant == FunctionSleepVariant::PerBlock ?
1 : size); + UInt64 microseconds = seconds * count * 1e6; + sleepForMicroseconds(microseconds); + ProfileEvents::increment(ProfileEvents::SleepFunctionCalls, count); + ProfileEvents::increment(ProfileEvents::SleepFunctionMicroseconds, microseconds); + } } /// convertToFullColumn needed, because otherwise (constant expression case) function will not get called on each columns. diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index f6baae723c9..a66f0fe5456 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -419,7 +419,7 @@ Block KeyCondition::getBlockWithConstants( const auto expr_for_constant_folding = ExpressionAnalyzer(query, syntax_analyzer_result, context).getConstActions(); - expr_for_constant_folding->execute(result); + expr_for_constant_folding->execute(result, true); return result; } From 88880581e5da017f3671d4757895aca5e68893f9 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 5 Aug 2022 11:07:08 +0000 Subject: [PATCH 06/69] Fixing tests. --- tests/queries/0_stateless/01786_explain_merge_tree.reference | 2 +- .../01834_alias_columns_laziness_filimonov.reference | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.reference b/tests/queries/0_stateless/01786_explain_merge_tree.reference index f5efce1fcfc..4a3fe99710b 100644 --- a/tests/queries/0_stateless/01786_explain_merge_tree.reference +++ b/tests/queries/0_stateless/01786_explain_merge_tree.reference @@ -100,7 +100,7 @@ ReadFromMergeTree (default.idx) Indexes: PrimaryKey - Keys: + Keys: x plus(x, y) Condition: or((x in 2-element set), (plus(plus(x, y), 1) in (-Inf, 2])) diff --git a/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference index 5290ffc5dbe..d8e51196fc7 100644 --- a/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference +++ b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference @@ -1,2 +1,2 @@ -SleepFunctionCalls: 4 (increment) -SleepFunctionMicroseconds: 400000 (increment) +SleepFunctionCalls: 1 (increment) +SleepFunctionMicroseconds: 100000 (increment) From ffe8c86ec09d0e2e7eb068aeda28e1bd647c025d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 5 Aug 2022 18:21:05 +0000 Subject: [PATCH 07/69] Fixing tests. 
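These reference updates follow from the first patch in the series: once a pushed-down condition consists only of primary/sorting-key columns, MergeTreeWhereOptimizer moves it from WHERE to PREWHERE, so EXPLAIN SYNTAX now prints PREWHERE id = 1 inside the rewritten subqueries. Roughly, the rewrite amounts to the following sketch (simplified; the real MergeTreeWhereOptimizer::optimize also rebuilds WHERE from the conditions that were not moved, and the helper name here is hypothetical):

#include <Parsers/ASTSelectQuery.h>

/// Simplified sketch: attach the chosen condition as PREWHERE and clear WHERE.
void moveConditionToPrewhere(ASTSelectQuery & select, ASTPtr condition)
{
    select.setExpression(ASTSelectQuery::Expression::PREWHERE, condition->clone());
    /// In the real optimizer, WHERE keeps any conditions that stayed behind.
    select.setExpression(ASTSelectQuery::Expression::WHERE, nullptr);
}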
--- .../00597_push_down_predicate_long.reference | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/queries/0_stateless/00597_push_down_predicate_long.reference b/tests/queries/0_stateless/00597_push_down_predicate_long.reference index af4e9b5496e..215febe5c07 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate_long.reference +++ b/tests/queries/0_stateless/00597_push_down_predicate_long.reference @@ -179,7 +179,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) WHERE id = 1 2000-01-01 1 test string 1 1 @@ -203,7 +203,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) WHERE id = 1 ) @@ -229,7 +229,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) AS b WHERE id = 1 ) @@ -248,7 +248,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) WHERE id = 1 2000-01-01 1 test string 1 1 @@ -272,7 +272,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) WHERE id = 1 ) @@ -291,7 +291,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) AS b WHERE id = 1 2000-01-01 1 test string 1 1 @@ -315,7 +315,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) AS a WHERE id = 1 ) AS b @@ -332,7 +332,7 @@ FROM date, min(value) AS value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 GROUP BY id, date @@ -352,7 +352,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 UNION ALL SELECT date, @@ -360,7 +360,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) WHERE id = 1 2000-01-01 1 test string 1 1 @@ -381,7 +381,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) ANY LEFT JOIN ( @@ -441,7 +441,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) ANY LEFT JOIN ( @@ -532,7 +532,7 @@ FROM name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) AS a ANY LEFT JOIN ( @@ -579,7 +579,7 @@ SEMI LEFT JOIN name, value FROM test_00597 - WHERE id = 1 + PREWHERE id = 1 ) WHERE id = 1 ) AS r USING (id) From 7bb8db4bb350db56aadd948410ec539b9b5c3438 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Wed, 28 Dec 2022 00:54:50 +0000 Subject: [PATCH 08/69] Add generateULID function --- .gitmodules | 3 + contrib/CMakeLists.txt | 2 + contrib/ulid-c | 1 + .../sql-reference/functions/uuid-functions.md | 45 +++++++++++ src/CMakeLists.txt | 2 + src/Functions/generateULID.cpp | 76 +++++++++++++++++++ .../0_stateless/02515_generate_ulid.reference | 1 + .../0_stateless/02515_generate_ulid.sql | 1 + 8 files changed, 131 insertions(+) create mode 160000 contrib/ulid-c create mode 100644 src/Functions/generateULID.cpp create mode 100644 tests/queries/0_stateless/02515_generate_ulid.reference create mode 100644 tests/queries/0_stateless/02515_generate_ulid.sql diff --git a/.gitmodules b/.gitmodules index 0805b6d5492..6a35b497826 100644 --- a/.gitmodules +++ b/.gitmodules @@ -294,3 +294,6 @@ [submodule "contrib/libdivide"] path = contrib/libdivide url = https://github.com/ridiculousfish/libdivide.git +[submodule "contrib/ulid-c"] + path = contrib/ulid-c + url = https://github.com/evillique/ulid-c.git diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 6f80059498e..9dc0663e58b 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -172,6 +172,8 @@ add_contrib (xxHash-cmake xxHash) add_contrib (google-benchmark-cmake google-benchmark) +add_contrib (ulid-c-cmake ulid-c) + # Put all targets defined here and in subdirectories under 
"contrib/" folders in GUI-based IDEs. # Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear # in "contrib/..." as originally planned, so we workaround this by fixing FOLDER properties of all targets manually, diff --git a/contrib/ulid-c b/contrib/ulid-c new file mode 160000 index 00000000000..8b8bc280bc3 --- /dev/null +++ b/contrib/ulid-c @@ -0,0 +1 @@ +Subproject commit 8b8bc280bc305f0d3dbdca49131677f2a208c48c diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 43542367cd5..d1b3db09e6c 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -315,6 +315,51 @@ serverUUID() Type: [UUID](../data-types/uuid.md). + +# Functions for Working with ULID + +## generateULID + +Generates the [ULID](https://github.com/ulid/spec). + +**Syntax** + +``` sql +generateULID([x]) +``` + +**Arguments** + +- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself if used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter. + +**Returned value** + +The [FixedString](../data-types/fixedstring.md) type value. + +**Usage example** + +``` sql +SELECT generateULID() +``` + +``` text +┌─generateULID()─────────────┐ +│ 01GNB2S2FGN2P93QPXDNB4EN2R │ +└────────────────────────────┘ +``` + +**Usage example if it is needed to generate multiple values in one row** + +```sql +SELECT generateULID(1), generateULID(2) +``` + +``` text +┌─generateULID(1)────────────┬─generateULID(2)────────────┐ +│ 01GNB2SGG4RHKVNT9ZGA4FFMNP │ 01GNB2SGG4V0HMQVH4VBVPSSRB │ +└────────────────────────────┴────────────────────────────┘ +``` + ## See Also - [dictGetUUID](../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions-other) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7e3364dc92e..63c566df145 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -567,6 +567,8 @@ if (ENABLE_NLP) dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data) endif() +dbms_target_link_libraries (PUBLIC ch_contrib::ulid) + if (TARGET ch_contrib::bzip2) target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::bzip2) endif() diff --git a/src/Functions/generateULID.cpp b/src/Functions/generateULID.cpp new file mode 100644 index 00000000000..95f00b9dab9 --- /dev/null +++ b/src/Functions/generateULID.cpp @@ -0,0 +1,76 @@ +#include +#include +#include +#include +#include +#include + +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +class FunctionGenerateULID : public IFunction +{ +public: + static constexpr size_t ULID_LENGTH = 26; + + static constexpr auto name = "generateULID"; + + static FunctionPtr create(ContextPtr /*context*/) + { + return std::make_shared(); + } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 0; } + + bool isVariadic() const override { return true; } + bool isDeterministic() const override { return false; } + bool isDeterministicInScopeOfQuery() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) 
diff --git a/tests/queries/0_stateless/02515_generate_ulid.reference b/tests/queries/0_stateless/02515_generate_ulid.reference
new file mode 100644
index 00000000000..23aed8ce678
--- /dev/null
+++ b/tests/queries/0_stateless/02515_generate_ulid.reference
@@ -0,0 +1 @@
+1 FixedString(26)
diff --git a/tests/queries/0_stateless/02515_generate_ulid.sql b/tests/queries/0_stateless/02515_generate_ulid.sql
new file mode 100644
index 00000000000..3e179a45bac
--- /dev/null
+++ b/tests/queries/0_stateless/02515_generate_ulid.sql
@@ -0,0 +1 @@
+SELECT generateULID(1) != generateULID(2), toTypeName(generateULID());

From 945d50af879803ee86fc232d7d7d185e2867b684 Mon Sep 17 00:00:00 2001
From: Nikolay Degterinsky
Date: Thu, 29 Dec 2022 02:17:52 +0000
Subject: [PATCH 09/69] Fix build

---
 src/CMakeLists.txt                                | 4 +++-
 src/Common/config.h.in                            | 1 +
 src/Functions/generateULID.cpp                    | 6 ++++++
 src/configure_config.cmake                        | 3 +++
 tests/queries/0_stateless/02515_generate_ulid.sql | 2 ++
 5 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 63c566df145..208bc3ea2f7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -567,7 +567,9 @@ if (ENABLE_NLP)
     dbms_target_link_libraries (PUBLIC ch_contrib::nlp_data)
 endif()
 
-dbms_target_link_libraries (PUBLIC ch_contrib::ulid)
+if (TARGET ch_contrib::ulid)
+    dbms_target_link_libraries (PUBLIC ch_contrib::ulid)
+endif()
 
 if (TARGET ch_contrib::bzip2)
     target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::bzip2)
diff --git a/src/Common/config.h.in b/src/Common/config.h.in
index baa480a6545..eb674865298 100644
--- a/src/Common/config.h.in
+++ b/src/Common/config.h.in
@@ -54,3 +54,4 @@
 #cmakedefine01 USE_BLAKE3
 #cmakedefine01 USE_SKIM
 #cmakedefine01 USE_OPENSSL_INTREE
+#cmakedefine01 USE_ULID
diff --git a/src/Functions/generateULID.cpp b/src/Functions/generateULID.cpp
index 95f00b9dab9..46b8d190496 100644
--- a/src/Functions/generateULID.cpp
+++ b/src/Functions/generateULID.cpp
@@ -1,3 +1,7 @@
+#include "config.h"
+
+#if USE_ULID
+
 #include <Columns/ColumnFixedString.h>
 #include <DataTypes/DataTypeFixedString.h>
 #include <Functions/FunctionFactory.h>
@@ -74,3 +78,5 @@ REGISTER_FUNCTION(GenerateULID)
 }
 
 }
+
+#endif
diff --git a/src/configure_config.cmake b/src/configure_config.cmake
index 58cb34b7d67..ae9725dff63 100644
--- a/src/configure_config.cmake
+++ b/src/configure_config.cmake
@@ -94,6 +94,9 @@ endif()
 if (ENABLE_NLP)
     set(USE_NLP 1)
 endif()
+if (TARGET ch_contrib::ulid)
+    set(USE_ULID 1)
+endif()
 if (TARGET ch_contrib::llvm)
     set(USE_EMBEDDED_COMPILER 1)
 endif()
diff --git a/tests/queries/0_stateless/02515_generate_ulid.sql b/tests/queries/0_stateless/02515_generate_ulid.sql
index 3e179a45bac..4059090a72b 100644
--- a/tests/queries/0_stateless/02515_generate_ulid.sql
+++ b/tests/queries/0_stateless/02515_generate_ulid.sql
@@ -1 +1,3 @@
+-- Tags: no-fasttest
+
 SELECT generateULID(1) != generateULID(2), toTypeName(generateULID());

From 57b3fbf206a160edb9e7cce0e9bdf45187fa3de8 Mon Sep 17 00:00:00 2001
From: Nikolay Degterinsky
Date: Wed, 4 Jan 2023 02:24:14 +0000
Subject: [PATCH 10/69] Fix build

---
 contrib/ulid-c-cmake/CMakeLists.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 contrib/ulid-c-cmake/CMakeLists.txt

diff --git a/contrib/ulid-c-cmake/CMakeLists.txt b/contrib/ulid-c-cmake/CMakeLists.txt
new file mode 100644
index 00000000000..9a68e09faa2
--- /dev/null
+++ b/contrib/ulid-c-cmake/CMakeLists.txt
@@ -0,0 +1,17 @@
+option (ENABLE_ULID "Enable ulid" ${ENABLE_LIBRARIES})
+
+if (NOT ENABLE_ULID)
+    message(STATUS "Not using ulid")
+    return()
+endif()
+
+set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/ulid-c")
+
+set (SRCS
+    "${LIBRARY_DIR}/include/ulid.h"
+    "${LIBRARY_DIR}/include/ulid.c"
+)
+
+add_library(_ulid ${SRCS})
+target_include_directories(_ulid SYSTEM PUBLIC "${LIBRARY_DIR}/include")
+add_library(ch_contrib::ulid ALIAS _ulid)
\ No newline at end of file

From 4cd361747d7717e07499bfdd984f8deaae5a1052 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Mon, 6 Feb 2023 09:51:11 +0000
Subject: [PATCH 11/69] Address PR comment

---
 src/IO/S3/Client.cpp               | 61 ++++++++++++++++
 src/IO/S3/Client.h                 | 81 ++++++++++--------------
 src/Storages/StorageS3Settings.cpp |  2 -
 src/Storages/StorageS3Settings.h   | 10 ----
 4 files changed, 94 insertions(+), 60 deletions(-)

diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp
index 2e07dfb19ff..8a3ff366dd9 100644
--- a/src/IO/S3/Client.cpp
+++ b/src/IO/S3/Client.cpp
@@ -76,6 +76,67 @@ void Client::RetryStrategy::RequestBookkeeping(const Aws::Client::HttpResponseOu
     return wrapped_strategy->RequestBookkeeping(httpResponseOutcome, lastError);
 }
 
+namespace
+{
+
+void verifyClientConfiguration(const Aws::Client::ClientConfiguration & client_config)
+{
+    if (!client_config.retryStrategy)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "The S3 client can only be used with Client::RetryStrategy, define it in the client configuration");
+
+    assert_cast<const Client::RetryStrategy &>(*client_config.retryStrategy);
+}
+
+}
+
+std::unique_ptr<Client> Client::create(
+    size_t max_redirects_,
+    const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider,
+    const Aws::Client::ClientConfiguration & client_configuration,
+    Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads,
+    bool use_virtual_addressing)
+{
+    verifyClientConfiguration(client_configuration);
+    return std::unique_ptr<Client>(
+        new Client(max_redirects_, credentials_provider, client_configuration, sign_payloads, use_virtual_addressing));
+}
+
+std::unique_ptr<Client> Client::create(const Client & other)
+{
+    return std::unique_ptr<Client>(new Client(other));
+}
+
+Client::Client(
+    size_t max_redirects_,
+    const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider,
+    const Aws::Client::ClientConfiguration & client_configuration,
+    Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads,
+    bool use_virtual_addressing)
+    : Aws::S3::S3Client(credentials_provider, client_configuration, std::move(sign_payloads), use_virtual_addressing)
+    , max_redirects(max_redirects_)
+    , log(&Poco::Logger::get("S3Client"))
+{
+    auto * endpoint_provider = dynamic_cast<Aws::S3::Endpoint::S3DefaultEpProviderBase *>(accessEndpointProvider().get());
+    endpoint_provider->GetBuiltInParameters().GetParameter("Region").GetString(explicit_region);
+    std::string endpoint;
+    endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(endpoint);
+    detect_region = explicit_region == Aws::Region::AWS_GLOBAL && endpoint.find(".amazonaws.com") != std::string::npos;
+
+    cache = std::make_shared<ClientCache>();
+    ClientCacheRegistry::instance().registerClient(cache);
+}
+
+Client::Client(const Client & other)
+    : Aws::S3::S3Client(other)
+    , explicit_region(other.explicit_region)
+    , detect_region(other.detect_region)
+    , max_redirects(other.max_redirects)
+    , log(&Poco::Logger::get("S3Client"))
+{
+    cache = std::make_shared<ClientCache>(*other.cache);
+    ClientCacheRegistry::instance().registerClient(cache);
+}
+
 bool Client::checkIfWrongRegionDefined(const std::string & bucket, const Aws::S3::S3Error & error, std::string & region) const
 {
     if (detect_region)
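The Client.cpp side above shows the new construction path: a static factory validates the configuration once and only then invokes a private constructor, replacing the template create/verifyArgument pair removed from the header below. A standalone sketch of the pattern, with hypothetical names and no AWS SDK dependency:

#include <memory>
#include <stdexcept>

struct Config
{
    bool has_retry_strategy = false;
};

class Client
{
public:
    /// The factory checks invariants up front, so a misconfigured client
    /// can never be constructed.
    static std::unique_ptr<Client> create(const Config & config)
    {
        if (!config.has_retry_strategy)
            throw std::runtime_error("a retry strategy is required");
        return std::unique_ptr<Client>(new Client(config));
    }

private:
    /// Private constructor: std::make_unique cannot reach it, which is why
    /// create() calls new directly, same as in the patch above.
    explicit Client(const Config & config) : config_(config) {}

    Config config_;
};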
diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h
index 13f26c214f2..8356b0b25fa 100644
--- a/src/IO/S3/Client.h
+++ b/src/IO/S3/Client.h
@@ -22,7 +22,6 @@ namespace DB
 
 namespace ErrorCodes
 {
-    extern const int LOGICAL_ERROR;
     extern const int TOO_MANY_REDIRECTS;
 }
 
@@ -78,15 +77,25 @@ private:
 /// - automatically detect endpoint and regions for each bucket and cache them
 ///
 /// For this client to work correctly both Client::RetryStrategy and Requests defined in <IO/S3/Requests.h> should be used.
+///
+/// To add support for a new type of request
+///   - ExtendedRequest should be defined inside IO/S3/Requests.h
+///   - a new method accepting that request should be defined in this Client (check other requests for reference)
+///   - the method handling the request from Aws::S3::S3Client should be left private so we don't use it by accident
-class Client : public Aws::S3::S3Client
+class Client : private Aws::S3::S3Client
 {
 public:
-    template <typename... Args>
-    static std::unique_ptr<Client> create(Args &&... args)
-    {
-        (verifyArgument(args), ...);
-        return std::unique_ptr<Client>(new Client(std::forward<Args>(args)...));
-    }
+    /// we use a factory method to verify arguments before creating a client because
+    /// there are certain requirements on arguments for it to work correctly
+    /// e.g. Client::RetryStrategy should be used
+    static std::unique_ptr<Client> create(
+        size_t max_redirects_,
+        const std::shared_ptr<Aws::Auth::AWSCredentialsProvider> & credentials_provider,
+        const Aws::Client::ClientConfiguration & client_configuration,
+        Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads,
+        bool use_virtual_addressing);
+
+    static std::unique_ptr<Client> create(const Client & other);
 
     Client & operator=(const Client &) = delete;
@@ -106,7 +115,12 @@ public:
         }
     }
 
-    /// Decorator for RetryStrategy needed for this client to work correctly
+    /// Decorator for RetryStrategy needed for this client to work correctly.
+    /// We want to manually handle permanent moves (status code 301) because:
+    /// - the redirect location is written in XML format inside the response body, something that doesn't exist for HEAD
+    ///   requests, so we need to manually find the correct location
+    /// - we want to cache the new location to decrease the number of roundtrips for future requests
+    /// This decorator doesn't retry if 301 is detected and falls back to the inner retry strategy otherwise.
class RetryStrategy : public Aws::Client::RetryStrategy { public: @@ -147,35 +161,19 @@ public: Model::DeleteObjectOutcome DeleteObject(const DeleteObjectRequest & request) const; Model::DeleteObjectsOutcome DeleteObjects(const DeleteObjectsRequest & request) const; + using Aws::S3::S3Client::EnableRequestProcessing; + using Aws::S3::S3Client::DisableRequestProcessing; private: - template - explicit Client(size_t max_redirects_, Args &&... args) - : Aws::S3::S3Client(std::forward(args)...) - , max_redirects(max_redirects_) - , log(&Poco::Logger::get("S3Client")) - { - auto * endpoint_provider = dynamic_cast(accessEndpointProvider().get()); - endpoint_provider->GetBuiltInParameters().GetParameter("Region").GetString(explicit_region); - std::string endpoint; - endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(endpoint); - detect_region = explicit_region == Aws::Region::AWS_GLOBAL && endpoint.find(".amazonaws.com") != std::string::npos; + Client(size_t max_redirects_, + const std::shared_ptr& credentials_provider, + const Aws::Client::ClientConfiguration& client_configuration, + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy sign_payloads, + bool use_virtual_addressing); - cache = std::make_shared(); - ClientCacheRegistry::instance().registerClient(cache); - } + Client(const Client & other); - Client(const Client & other) - : Aws::S3::S3Client(other) - , explicit_region(other.explicit_region) - , detect_region(other.detect_region) - , max_redirects(other.max_redirects) - , log(&Poco::Logger::get("S3Client")) - { - cache = std::make_shared(*other.cache); - ClientCacheRegistry::instance().registerClient(cache); - } - - /// Make regular functions private + /// Leave regular functions private so we don't accidentally use them + /// otherwise region and endpoint redirection won't work using Aws::S3::S3Client::HeadObject; using Aws::S3::S3Client::ListObjectsV2; using Aws::S3::S3Client::ListObjects; @@ -279,19 +277,6 @@ private: bool checkIfWrongRegionDefined(const std::string & bucket, const Aws::S3::S3Error & error, std::string & region) const; void insertRegionOverride(const std::string & bucket, const std::string & region) const; - template - static void verifyArgument(const T & /*arg*/) - {} - - template T> - static void verifyArgument(const T & client_config) - { - if (!client_config.retryStrategy) - throw Exception(ErrorCodes::LOGICAL_ERROR, "The S3 client can only be used with Client::RetryStrategy, define it in the client configuration"); - - assert_cast(*client_config.retryStrategy); - } - std::string explicit_region; mutable bool detect_region = true; diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index ee0b1fd88bf..ee704b3e750 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -166,7 +166,6 @@ S3Settings::RequestSettings::RequestSettings(const NamedCollection & collection) max_single_read_retries = collection.getOrDefault("max_single_read_retries", max_single_read_retries); max_connections = collection.getOrDefault("max_connections", max_connections); list_object_keys_size = collection.getOrDefault("list_object_keys_size", list_object_keys_size); - allow_head_object_request = collection.getOrDefault("allow_head_object_request", allow_head_object_request); } S3Settings::RequestSettings::RequestSettings( @@ -181,7 +180,6 @@ S3Settings::RequestSettings::RequestSettings( max_connections = config.getUInt64(key + "max_connections", settings.s3_max_connections); 
check_objects_after_upload = config.getBool(key + "check_objects_after_upload", settings.s3_check_objects_after_upload); list_object_keys_size = config.getUInt64(key + "list_object_keys_size", settings.s3_list_object_keys_size); - allow_head_object_request = config.getBool(key + "allow_head_object_request", allow_head_object_request); /// NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, /// which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 61da0a37f62..bce772859f0 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -67,16 +67,6 @@ struct S3Settings ThrottlerPtr get_request_throttler; ThrottlerPtr put_request_throttler; - /// If this is set to false then `S3::getObjectSize()` will use `GetObjectAttributes` request instead of `HeadObject`. - /// Details: `HeadObject` request never returns a response body (even if there is an error) however - /// if the request was sent without specifying a region in the endpoint (i.e. for example "https://test.s3.amazonaws.com/mydata.csv" - /// instead of "https://test.s3-us-west-2.amazonaws.com/mydata.csv") then that response body is one of the main ways to determine - /// the correct region and try to repeat the request again with the correct region. - /// For any other request type (`GetObject`, `ListObjects`, etc.) AWS SDK does that because they have response bodies, but for `HeadObject` - /// there is no response body so this way doesn't work. That's why it's better to use `GetObjectAttributes` requests sometimes. - /// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more information. 
- bool allow_head_object_request = true; - const PartUploadSettings & getUploadSettings() const { return upload_settings; } RequestSettings() = default; From 28d1fd6c8345263fd34fc2482d976322b87bf930 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 8 Feb 2023 09:48:35 +0000 Subject: [PATCH 12/69] More fixes --- src/IO/S3/Client.cpp | 79 ++++++++++++++++++++++++++++++++++++- src/IO/S3/Client.h | 75 +---------------------------------- src/IO/S3/getObjectInfo.cpp | 3 +- 3 files changed, 80 insertions(+), 77 deletions(-) diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 8a3ff366dd9..b6ae7abc98f 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -192,7 +192,7 @@ Model::HeadObjectOutcome Client::HeadObject(const HeadObjectRequest & request) c if (checkIfWrongRegionDefined(bucket, error, new_region)) { request.overrideRegion(new_region); - return HeadObject(request); + return Aws::S3::S3Client::HeadObject(request); } if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY) @@ -305,6 +305,83 @@ Model::DeleteObjectsOutcome Client::DeleteObjects(const DeleteObjectsRequest & r return doRequest(request, [this](const Model::DeleteObjectsRequest & req) { return Aws::S3::S3Client::DeleteObjects(req); }); } +template +std::invoke_result_t +Client::doRequest(const RequestType & request, RequestFn request_fn) const +{ + const auto & bucket = request.GetBucket(); + + if (auto region = getRegionForBucket(bucket); !region.empty()) + { + if (!detect_region) + LOG_INFO(log, "Using region override {} for bucket {}", region, bucket); + + request.overrideRegion(std::move(region)); + } + + if (auto uri = getURIForBucket(bucket); uri.has_value()) + request.overrideURI(std::move(*uri)); + + + bool found_new_endpoint = false; + // if we found correct endpoint after 301 responses, update the cache for future requests + SCOPE_EXIT( + if (found_new_endpoint) + { + auto uri_override = request.getURIOverride(); + assert(uri_override.has_value()); + updateURIForBucket(bucket, std::move(*uri_override)); + } + ); + + for (size_t attempt = 0; attempt <= max_redirects; ++attempt) + { + auto result = request_fn(request); + if (result.IsSuccess()) + return result; + + const auto & error = result.GetError(); + + std::string new_region; + if (checkIfWrongRegionDefined(bucket, error, new_region)) + { + request.overrideRegion(new_region); + continue; + } + + if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY) + return result; + + // maybe we detect a correct region + if (!detect_region) + { + if (auto region = GetErrorMarshaller()->ExtractRegion(error); !region.empty() && region != explicit_region) + { + request.overrideRegion(region); + insertRegionOverride(bucket, region); + } + } + + // we possibly got new location, need to try with that one + auto new_uri = getURIFromError(error); + if (!new_uri) + return result; + + const auto & current_uri_override = request.getURIOverride(); + /// we already tried with this URI + if (current_uri_override && current_uri_override->uri == new_uri->uri) + { + LOG_INFO(log, "Getting redirected to the same invalid location {}", new_uri->uri.toString()); + return result; + } + + found_new_endpoint = true; + request.overrideURI(*new_uri); + } + + throw Exception(ErrorCodes::TOO_MANY_REDIRECTS, "Too many redirects"); +} + std::string Client::getRegionForBucket(const std::string & bucket, bool force_detect) const { std::lock_guard lock(cache->region_cache_mutex); diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 
8356b0b25fa..c3086499167 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -192,80 +192,7 @@ private: template std::invoke_result_t - doRequest(const RequestType & request, RequestFn request_fn) const - { - const auto & bucket = request.GetBucket(); - - if (auto region = getRegionForBucket(bucket); !region.empty()) - { - if (!detect_region) - LOG_INFO(log, "Using region override {} for bucket {}", region, bucket); - - request.overrideRegion(std::move(region)); - } - - if (auto uri = getURIForBucket(bucket); uri.has_value()) - request.overrideURI(std::move(*uri)); - - - bool found_new_endpoint = false; - // if we found correct endpoint after 301 responses, update the cache for future requests - SCOPE_EXIT( - if (found_new_endpoint) - { - auto uri_override = request.getURIOverride(); - assert(uri_override.has_value()); - updateURIForBucket(bucket, std::move(*uri_override)); - } - ); - - for (size_t attempt = 0; attempt <= max_redirects; ++attempt) - { - auto result = request_fn(request); - if (result.IsSuccess()) - return result; - - const auto & error = result.GetError(); - - std::string new_region; - if (checkIfWrongRegionDefined(bucket, error, new_region)) - { - request.overrideRegion(new_region); - continue; - } - - if (error.GetResponseCode() != Aws::Http::HttpResponseCode::MOVED_PERMANENTLY) - return result; - - // maybe we detect a correct region - if (!detect_region) - { - if (auto region = GetErrorMarshaller()->ExtractRegion(error); !region.empty() && region != explicit_region) - { - request.overrideRegion(region); - insertRegionOverride(bucket, region); - } - } - - // we possibly got new location, need to try with that one - auto new_uri = getURIFromError(error); - if (!new_uri) - return result; - - const auto & current_uri_override = request.getURIOverride(); - /// we already tried with this URI - if (current_uri_override && current_uri_override->uri == new_uri->uri) - { - LOG_INFO(log, "Getting redirected to the same invalid location {}", new_uri->uri.toString()); - return result; - } - - found_new_endpoint = true; - request.overrideURI(*new_uri); - } - - throw Exception(ErrorCodes::TOO_MANY_REDIRECTS, "Too many redirects"); - } + doRequest(const RequestType & request, RequestFn request_fn) const; void updateURIForBucket(const std::string & bucket, S3::URI new_uri) const; std::optional getURIFromError(const Aws::S3::S3Error & error) const; diff --git a/src/IO/S3/getObjectInfo.cpp b/src/IO/S3/getObjectInfo.cpp index 20d5e74d6d4..c652f16ab20 100644 --- a/src/IO/S3/getObjectInfo.cpp +++ b/src/IO/S3/getObjectInfo.cpp @@ -42,7 +42,6 @@ namespace } /// Performs a request to get the size and last modification time of an object. - /// The function performs either HeadObject or GetObjectAttributes request depending on the endpoint. std::pair, Aws::S3::S3Error> tryGetObjectInfo( const S3::Client & client, const String & bucket, const String & key, const String & version_id, const S3Settings::RequestSettings & /*request_settings*/, bool with_metadata, bool for_disk_s3) @@ -87,7 +86,7 @@ ObjectInfo getObjectInfo( else if (throw_on_error) { throw DB::Exception(ErrorCodes::S3_ERROR, - "Failed to get object attributes: {}. HTTP response code: {}", + "Failed to get object info: {}. 
HTTP response code: {}", error.GetMessage(), static_cast(error.GetResponseCode())); } return {}; From 3418a8e955f14318fdcdac299fa2d144f84c26d9 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 8 Feb 2023 10:07:30 +0000 Subject: [PATCH 13/69] use master for aws --- contrib/aws | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/aws b/contrib/aws index 06a6610e6fb..ecccfc026a4 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit 06a6610e6fb3385e22ad85014a67aa307825ffb1 +Subproject commit ecccfc026a42b30023289410a67024d561f4bf3e From 2d26c9cffc8b69df1cc78e8f1ab944b63dcdef18 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 8 Feb 2023 10:39:32 +0000 Subject: [PATCH 14/69] Fix error codes --- src/IO/S3/Client.cpp | 1 + src/IO/S3/Client.h | 12 +----------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index b6ae7abc98f..197b405da6a 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -23,6 +23,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int TOO_MANY_REDIRECTS; } namespace S3 diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index c3086499167..47f7cdd2ed1 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -17,15 +17,7 @@ #include #include -namespace DB -{ - -namespace ErrorCodes -{ - extern const int TOO_MANY_REDIRECTS; -} - -namespace S3 +namespace DB::S3 { namespace Model = Aws::S3::Model; @@ -216,6 +208,4 @@ private: } -} - #endif From cf762347d5ca89e7c8f9fc94793c6d12e09e5c70 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 3 Feb 2023 12:01:35 +0000 Subject: [PATCH 15/69] Fix read in order optimization for DESC sorting with FINAL --- .../QueryPlan/ReadFromMergeTree.cpp | 21 +++++++++++++------ src/Processors/QueryPlan/ReadFromMergeTree.h | 1 + .../25336_read_in_order_final_desc.reference | 5 +++++ .../25336_read_in_order_final_desc.sql | 21 +++++++++++++++++++ 4 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/25336_read_in_order_final_desc.reference create mode 100644 tests/queries/0_stateless/25336_read_in_order_final_desc.sql diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index cca8e5297ee..748a182111f 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -820,6 +820,7 @@ static void addMergingFinal( Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( RangesInDataParts && parts_with_ranges, const Names & column_names, + const InputOrderInfoPtr & input_order_info, ActionsDAGPtr & out_projection) { const auto & settings = context->getSettingsRef(); @@ -894,15 +895,19 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( if (new_parts.empty()) continue; + auto read_type = ReadFromMergeTree::ReadType::InOrder; + if (input_order_info && input_order_info->direction == -1) + read_type = ReadType::InReverseOrder; + if (num_streams > 1 && metadata_for_reading->hasPrimaryKey()) { // Let's split parts into layers to ensure data parallelism of FINAL. 
- auto reading_step_getter = [this, &column_names, &info](auto parts) + auto reading_step_getter = [this, &column_names, &info, read_type](auto parts) { return this->read( std::move(parts), column_names, - ReadFromMergeTree::ReadType::InOrder, + read_type, 1 /* num_streams */, 0 /* min_marks_for_concurrent_read */, info.use_uncompressed_cache); @@ -913,7 +918,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( else { pipes.emplace_back(read( - std::move(new_parts), column_names, ReadFromMergeTree::ReadType::InOrder, num_streams, 0, info.use_uncompressed_cache)); + std::move(new_parts), column_names, read_type, num_streams, 0, info.use_uncompressed_cache)); } /// Drop temporary columns, added by 'sorting_key_expr' @@ -928,11 +933,14 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( pipe.addSimpleTransform([sorting_expr](const Block & header) { return std::make_shared(header, sorting_expr); }); - /// If do_not_merge_across_partitions_select_final is true and there is only one part in partition - /// with level > 0 then we won't postprocess this part + /// If do_not_merge_across_partitions_select_final is true + /// and there is only one part in partition with level > 0 + /// then we won't postprocess this part + bool should_read_in_order = !input_order_info || input_order_info->direction == 0; if (settings.do_not_merge_across_partitions_select_final && std::distance(parts_to_merge_ranges[range_index], parts_to_merge_ranges[range_index + 1]) == 1 && - parts_to_merge_ranges[range_index]->data_part->info.level > 0) + parts_to_merge_ranges[range_index]->data_part->info.level > 0 && + !should_read_in_order) { partition_pipes.emplace_back(Pipe::unitePipes(std::move(pipes))); continue; @@ -1397,6 +1405,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons pipe = spreadMarkRangesAmongStreamsFinal( std::move(result.parts_with_ranges), column_names_to_read, + input_order_info, result_projection); } else if (input_order_info) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 8b2eca5e08e..ba430fbadef 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -237,6 +237,7 @@ private: Pipe spreadMarkRangesAmongStreamsFinal( RangesInDataParts && parts, const Names & column_names, + const InputOrderInfoPtr & input_order_info, ActionsDAGPtr & out_projection); MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(MergeTreeData::DataPartsVector parts) const; diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.reference b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference new file mode 100644 index 00000000000..83c7162f120 --- /dev/null +++ b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference @@ -0,0 +1,5 @@ +1900000050000 1 +1900000040000 0.05 +1900000030000 0 +1900000020000 -0.0002 +1900000010000 -1 diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.sql b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql new file mode 100644 index 00000000000..ee59badf094 --- /dev/null +++ b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql @@ -0,0 +1,21 @@ +SET optimize_read_in_order = 1; +DROP TABLE IF EXISTS mytable; + +CREATE TABLE mytable +( + timestamp UInt64, + insert_timestamp UInt64, + key UInt64, + value Float64 +) ENGINE = ReplacingMergeTree(insert_timestamp) + PRIMARY KEY (key, timestamp) + ORDER BY (key, timestamp); + +INSERT INTO mytable (timestamp, 
insert_timestamp, key, value) VALUES (1900000010000, 1675159000000, 5, 555), (1900000010000, 1675159770000, 5, -1), (1900000020000, 1675159770000, 5, -0.0002), (1900000030000, 1675159770000, 5, 0), (1900000020000, 1675159700000, 5, 555), (1900000040000, 1675159770000, 5, 0.05), (1900000050000, 1675159770000, 5, 1); + +SELECT timestamp, value +FROM mytable FINAL +WHERE key = 5 +ORDER BY timestamp DESC; + +DROP TABLE IF EXISTS mytable; From a179528ea416f05da298ade32c3a07612f9c48ae Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 9 Feb 2023 10:47:45 +0000 Subject: [PATCH 16/69] Revert "Fix read in order optimization for DESC sorting with FINAL" This reverts commit cf762347d5ca89e7c8f9fc94793c6d12e09e5c70. --- .../QueryPlan/ReadFromMergeTree.cpp | 21 ++++++------------- src/Processors/QueryPlan/ReadFromMergeTree.h | 1 - .../25336_read_in_order_final_desc.reference | 5 ----- .../25336_read_in_order_final_desc.sql | 21 ------------------- 4 files changed, 6 insertions(+), 42 deletions(-) delete mode 100644 tests/queries/0_stateless/25336_read_in_order_final_desc.reference delete mode 100644 tests/queries/0_stateless/25336_read_in_order_final_desc.sql diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 748a182111f..cca8e5297ee 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -820,7 +820,6 @@ static void addMergingFinal( Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( RangesInDataParts && parts_with_ranges, const Names & column_names, - const InputOrderInfoPtr & input_order_info, ActionsDAGPtr & out_projection) { const auto & settings = context->getSettingsRef(); @@ -895,19 +894,15 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( if (new_parts.empty()) continue; - auto read_type = ReadFromMergeTree::ReadType::InOrder; - if (input_order_info && input_order_info->direction == -1) - read_type = ReadType::InReverseOrder; - if (num_streams > 1 && metadata_for_reading->hasPrimaryKey()) { // Let's split parts into layers to ensure data parallelism of FINAL. 
- auto reading_step_getter = [this, &column_names, &info, read_type](auto parts) + auto reading_step_getter = [this, &column_names, &info](auto parts) { return this->read( std::move(parts), column_names, - read_type, + ReadFromMergeTree::ReadType::InOrder, 1 /* num_streams */, 0 /* min_marks_for_concurrent_read */, info.use_uncompressed_cache); @@ -918,7 +913,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( else { pipes.emplace_back(read( - std::move(new_parts), column_names, read_type, num_streams, 0, info.use_uncompressed_cache)); + std::move(new_parts), column_names, ReadFromMergeTree::ReadType::InOrder, num_streams, 0, info.use_uncompressed_cache)); } /// Drop temporary columns, added by 'sorting_key_expr' @@ -933,14 +928,11 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( pipe.addSimpleTransform([sorting_expr](const Block & header) { return std::make_shared(header, sorting_expr); }); - /// If do_not_merge_across_partitions_select_final is true - /// and there is only one part in partition with level > 0 - /// then we won't postprocess this part - bool should_read_in_order = !input_order_info || input_order_info->direction == 0; + /// If do_not_merge_across_partitions_select_final is true and there is only one part in partition + /// with level > 0 then we won't postprocess this part if (settings.do_not_merge_across_partitions_select_final && std::distance(parts_to_merge_ranges[range_index], parts_to_merge_ranges[range_index + 1]) == 1 && - parts_to_merge_ranges[range_index]->data_part->info.level > 0 && - !should_read_in_order) + parts_to_merge_ranges[range_index]->data_part->info.level > 0) { partition_pipes.emplace_back(Pipe::unitePipes(std::move(pipes))); continue; @@ -1405,7 +1397,6 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons pipe = spreadMarkRangesAmongStreamsFinal( std::move(result.parts_with_ranges), column_names_to_read, - input_order_info, result_projection); } else if (input_order_info) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index ba430fbadef..8b2eca5e08e 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -237,7 +237,6 @@ private: Pipe spreadMarkRangesAmongStreamsFinal( RangesInDataParts && parts, const Names & column_names, - const InputOrderInfoPtr & input_order_info, ActionsDAGPtr & out_projection); MergeTreeDataSelectAnalysisResultPtr selectRangesToRead(MergeTreeData::DataPartsVector parts) const; diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.reference b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference deleted file mode 100644 index 83c7162f120..00000000000 --- a/tests/queries/0_stateless/25336_read_in_order_final_desc.reference +++ /dev/null @@ -1,5 +0,0 @@ -1900000050000 1 -1900000040000 0.05 -1900000030000 0 -1900000020000 -0.0002 -1900000010000 -1 diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.sql b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql deleted file mode 100644 index ee59badf094..00000000000 --- a/tests/queries/0_stateless/25336_read_in_order_final_desc.sql +++ /dev/null @@ -1,21 +0,0 @@ -SET optimize_read_in_order = 1; -DROP TABLE IF EXISTS mytable; - -CREATE TABLE mytable -( - timestamp UInt64, - insert_timestamp UInt64, - key UInt64, - value Float64 -) ENGINE = ReplacingMergeTree(insert_timestamp) - PRIMARY KEY (key, timestamp) - ORDER BY (key, timestamp); - -INSERT INTO mytable 
(timestamp, insert_timestamp, key, value) VALUES (1900000010000, 1675159000000, 5, 555), (1900000010000, 1675159770000, 5, -1), (1900000020000, 1675159770000, 5, -0.0002), (1900000030000, 1675159770000, 5, 0), (1900000020000, 1675159700000, 5, 555), (1900000040000, 1675159770000, 5, 0.05), (1900000050000, 1675159770000, 5, 1); - -SELECT timestamp, value -FROM mytable FINAL -WHERE key = 5 -ORDER BY timestamp DESC; - -DROP TABLE IF EXISTS mytable; From 45f7ef6c8fe943a9f25d510a5135271bb1f80e1b Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 9 Feb 2023 11:00:22 +0000 Subject: [PATCH 17/69] Do not apply read in reverse order for queries with final --- .../Optimizations/distinctReadInOrder.cpp | 4 ++- .../Optimizations/optimizeReadInOrder.cpp | 30 ++++++++++++++---- .../QueryPlan/ReadFromMergeTree.cpp | 31 ++++++++++--------- src/Processors/QueryPlan/ReadFromMergeTree.h | 5 ++- src/Storages/StorageMerge.cpp | 24 +++++++++++--- src/Storages/StorageMerge.h | 4 ++- 6 files changed, 70 insertions(+), 28 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/distinctReadInOrder.cpp b/src/Processors/QueryPlan/Optimizations/distinctReadInOrder.cpp index 3677a1581c4..d584a27f16e 100644 --- a/src/Processors/QueryPlan/Optimizations/distinctReadInOrder.cpp +++ b/src/Processors/QueryPlan/Optimizations/distinctReadInOrder.cpp @@ -86,7 +86,9 @@ size_t tryDistinctReadInOrder(QueryPlan::Node * parent_node) /// update input order info in read_from_merge_tree step const int direction = 0; /// for DISTINCT direction doesn't matter, ReadFromMergeTree will choose proper one - read_from_merge_tree->requestReadingInOrder(number_of_sorted_distinct_columns, direction, pre_distinct->getLimitHint()); + bool can_read = read_from_merge_tree->requestReadingInOrder(number_of_sorted_distinct_columns, direction, pre_distinct->getLimitHint()); + if (!can_read) + return 0; /// update data stream's sorting properties for found transforms const DataStream * input_stream = &read_from_merge_tree->getOutputStream(); diff --git a/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp b/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp index 301c3bca571..f11ecb84e05 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeReadInOrder.cpp @@ -884,7 +884,7 @@ AggregationInputOrder buildInputOrderInfo( } InputOrderInfoPtr buildInputOrderInfo( - ReadFromMergeTree * reading, + const ReadFromMergeTree * reading, const FixedColumns & fixed_columns, const ActionsDAGPtr & dag, const SortDescription & description, @@ -1012,7 +1012,11 @@ InputOrderInfoPtr buildInputOrderInfo(SortingStep & sorting, QueryPlan::Node & n limit); if (order_info) - reading->requestReadingInOrder(order_info->used_prefix_of_sorting_key_size, order_info->direction, order_info->limit); + { + bool can_read = reading->requestReadingInOrder(order_info->used_prefix_of_sorting_key_size, order_info->direction, order_info->limit); + if (!can_read) + return nullptr; + } return order_info; } @@ -1025,7 +1029,11 @@ InputOrderInfoPtr buildInputOrderInfo(SortingStep & sorting, QueryPlan::Node & n limit); if (order_info) - merge->requestReadingInOrder(order_info); + { + bool can_read = merge->requestReadingInOrder(order_info); + if (!can_read) + return nullptr; + } return order_info; } @@ -1057,10 +1065,14 @@ AggregationInputOrder buildInputOrderInfo(AggregatingStep & aggregating, QueryPl dag, keys); if (order_info.input_order) - reading->requestReadingInOrder( + { + bool can_read = 
reading->requestReadingInOrder(
            order_info.input_order->used_prefix_of_sorting_key_size,
            order_info.input_order->direction,
            order_info.input_order->limit);
+        if (!can_read)
+            return {};
+    }

     return order_info;
 }
@@ -1072,7 +1084,11 @@ AggregationInputOrder buildInputOrderInfo(AggregatingStep & aggregating, QueryPl
         dag, keys);

     if (order_info.input_order)
-        merge->requestReadingInOrder(order_info.input_order);
+    {
+        bool can_read = merge->requestReadingInOrder(order_info.input_order);
+        if (!can_read)
+            return {};
+    }

     return order_info;
 }
@@ -1261,7 +1277,9 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node,

     if (order_info)
     {
-        read_from_merge_tree->requestReadingInOrder(order_info->used_prefix_of_sorting_key_size, order_info->direction, order_info->limit);
+        bool can_read = read_from_merge_tree->requestReadingInOrder(order_info->used_prefix_of_sorting_key_size, order_info->direction, order_info->limit);
+        if (!can_read)
+            return 0;

         sorting->convertToFinishSorting(order_info->sort_description_for_merging);
     }
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index cca8e5297ee..652cc5d5b12 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -1165,8 +1165,6 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
     if (key_condition->alwaysFalse())
         return std::make_shared(MergeTreeDataSelectAnalysisResult{.result = std::move(result)});

-    const auto & select = query_info.query->as();
-
     size_t total_marks_pk = 0;
     size_t parts_before_pk = 0;
     try
@@ -1203,11 +1201,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
     auto reader_settings = getMergeTreeReaderSettings(context, query_info);

     bool use_skip_indexes = settings.use_skip_indexes;
-    bool final = false;
-    if (query_info.table_expression_modifiers)
-        final = query_info.table_expression_modifiers->hasFinal();
-    else
-        final = select.final();
+    bool final = isFinal(query_info);

     if (final && !settings.use_skip_indexes_if_final)
         use_skip_indexes = false;
@@ -1262,12 +1256,15 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToReadImpl(
     return std::make_shared(MergeTreeDataSelectAnalysisResult{.result = std::move(result)});
 }

-void ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction, size_t limit)
+bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction, size_t limit)
 {
     /// if direction is not set, use current one
     if (!direction)
         direction = getSortDirection();

+    if (direction != 1 && isFinal(query_info))
+        return false;
+
     auto order_info = std::make_shared(SortDescription{}, prefix_size, direction, limit);
     if (query_info.projection)
         query_info.projection->input_order_info = order_info;
@@ -1301,6 +1298,8 @@ void ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction,
         output_stream->sort_description = std::move(sort_description);
         output_stream->sort_scope = DataStream::SortScope::Stream;
     }
+
+    return true;
 }

 ReadFromMergeTree::AnalysisResult ReadFromMergeTree::getAnalysisResult() const
@@ -1347,12 +1346,7 @@ void ReadFromMergeTree::initializePipeline(QueryPipelineBuilder & pipeline, cons
     ActionsDAGPtr result_projection;

     Names column_names_to_read = std::move(result.column_names_to_read);
-    const auto & select = query_info.query->as();
-    bool final = false;
-    if (query_info.table_expression_modifiers)
-        final = 
query_info.table_expression_modifiers->hasFinal(); - else - final = select.final(); + bool final = isFinal(query_info); if (!final && result.sampling.use_sampling) { @@ -1661,6 +1655,15 @@ void ReadFromMergeTree::describeIndexes(JSONBuilder::JSONMap & map) const } } +bool ReadFromMergeTree::isFinal(const SelectQueryInfo & query_info) +{ + if (query_info.table_expression_modifiers) + return query_info.table_expression_modifiers->hasFinal(); + + const auto & select = query_info.query->as(); + return select.final(); +} + bool MergeTreeDataSelectAnalysisResult::error() const { return std::holds_alternative(result); diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h index 8b2eca5e08e..751509c74cf 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.h +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -158,7 +158,10 @@ public: StorageMetadataPtr getStorageMetadata() const { return metadata_for_reading; } const PrewhereInfo * getPrewhereInfo() const { return prewhere_info.get(); } - void requestReadingInOrder(size_t prefix_size, int direction, size_t limit); + /// Returns `false` if requested reading cannot be performed. + bool requestReadingInOrder(size_t prefix_size, int direction, size_t limit); + + static bool isFinal(const SelectQueryInfo & query_info); private: static MergeTreeDataSelectAnalysisResultPtr selectRangesToReadImpl( diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 86dd1773496..43383ac0660 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -566,11 +566,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( SelectQueryOptions(processed_stage).analyze()).buildQueryPipeline()); } - bool final = false; - if (modified_query_info.table_expression_modifiers) - final = modified_query_info.table_expression_modifiers->hasFinal(); - else - final = modified_select.final(); + bool final = isFinal(modified_query_info); if (!final && storage->needRewriteQueryWithFinal(real_column_names)) { @@ -912,6 +908,23 @@ void ReadFromMerge::convertingSourceStream( } } +bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_) +{ + if (order_info_->direction != 1 && isFinal(query_info)) + return false; + + order_info = order_info_; + return true; +} + +bool ReadFromMerge::isFinal(const SelectQueryInfo & query_info) +{ + if (query_info.table_expression_modifiers) + return query_info.table_expression_modifiers->hasFinal(); + const auto & select_query = query_info.query->as(); + return select_query.final(); +} + IStorage::ColumnSizeByName StorageMerge::getColumnSizes() const { ColumnSizeByName column_sizes; @@ -993,4 +1006,5 @@ NamesAndTypesList StorageMerge::getVirtuals() const return virtuals; } + } diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index e0c17dedc63..3e0a5d1456f 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -148,7 +148,9 @@ public: const StorageListWithLocks & getSelectedTables() const { return selected_tables; } - void requestReadingInOrder(InputOrderInfoPtr order_info_) { order_info = order_info_; } + /// Returns `false` if requested reading cannot be performed. 
+    bool requestReadingInOrder(InputOrderInfoPtr order_info_);
+    static bool isFinal(const SelectQueryInfo & query_info);

 private:
     const size_t required_max_block_size;

From 63b94a39e7d6a0f7e3adc3cc2a4f76731f89ddd6 Mon Sep 17 00:00:00 2001
From: vdimir 
Date: Thu, 9 Feb 2023 11:06:26 +0000
Subject: [PATCH 18/69] Restore 25336_read_in_order_final_desc

---
 .../25336_read_in_order_final_desc.reference  |  5 +++++
 .../25336_read_in_order_final_desc.sql        | 21 +++++++++++++++++++
 2 files changed, 26 insertions(+)
 create mode 100644 tests/queries/0_stateless/25336_read_in_order_final_desc.reference
 create mode 100644 tests/queries/0_stateless/25336_read_in_order_final_desc.sql

diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.reference b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference
new file mode 100644
index 00000000000..83c7162f120
--- /dev/null
+++ b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference
@@ -0,0 +1,5 @@
+1900000050000 1
+1900000040000 0.05
+1900000030000 0
+1900000020000 -0.0002
+1900000010000 -1

diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.sql b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql
new file mode 100644
index 00000000000..ee59badf094
--- /dev/null
+++ b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql
@@ -0,0 +1,21 @@
+SET optimize_read_in_order = 1;
+DROP TABLE IF EXISTS mytable;
+
+CREATE TABLE mytable
+(
+    timestamp UInt64,
+    insert_timestamp UInt64,
+    key UInt64,
+    value Float64
+) ENGINE = ReplacingMergeTree(insert_timestamp)
+      PRIMARY KEY (key, timestamp)
+      ORDER BY (key, timestamp);
+
+INSERT INTO mytable (timestamp, insert_timestamp, key, value) VALUES (1900000010000, 1675159000000, 5, 555), (1900000010000, 1675159770000, 5, -1), (1900000020000, 1675159770000, 5, -0.0002), (1900000030000, 1675159770000, 5, 0), (1900000020000, 1675159700000, 5, 555), (1900000040000, 1675159770000, 5, 0.05), (1900000050000, 1675159770000, 5, 1);
+
+SELECT timestamp, value
+FROM mytable FINAL
+WHERE key = 5
+ORDER BY timestamp DESC;
+
+DROP TABLE IF EXISTS mytable;

From cb5a178dac007fee63db2e1c6bb24676e66ee13b Mon Sep 17 00:00:00 2001
From: vdimir 
Date: Thu, 9 Feb 2023 11:11:29 +0000
Subject: [PATCH 19/69] Check explain result in 25336_read_in_order_final_desc

---
 .../25336_read_in_order_final_desc.reference  |  2 ++
 .../25336_read_in_order_final_desc.sql        | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.reference b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference
index 83c7162f120..88a926d0d58 100644
--- a/tests/queries/0_stateless/25336_read_in_order_final_desc.reference
+++ b/tests/queries/0_stateless/25336_read_in_order_final_desc.reference
@@ -3,3 +3,5 @@
 1900000030000 0
 1900000020000 -0.0002
 1900000010000 -1
+Ok
+Ok

diff --git a/tests/queries/0_stateless/25336_read_in_order_final_desc.sql b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql
index ee59badf094..627fd834101 100644
--- a/tests/queries/0_stateless/25336_read_in_order_final_desc.sql
+++ b/tests/queries/0_stateless/25336_read_in_order_final_desc.sql
@@ -18,4 +18,23 @@ FROM mytable FINAL
 WHERE key = 5
 ORDER BY timestamp DESC;

+
+SELECT if(explain like '%ReadType: InOrder%', 'Ok', 'Error: ' || explain) FROM (
+    EXPLAIN PLAN actions = 1
+    SELECT timestamp, value
+    FROM mytable FINAL
+    WHERE key = 5
+    ORDER BY timestamp
+) WHERE explain like '%ReadType%';
+
+
+SELECT if(explain like '%ReadType: Default%', 'Ok', 
'Error: ' || explain) FROM (
+    EXPLAIN PLAN actions = 1
+    SELECT timestamp, value
+    FROM mytable FINAL
+    WHERE key = 5
+    ORDER BY timestamp DESC
+) WHERE explain like '%ReadType%';
+
+
 DROP TABLE IF EXISTS mytable;

From f61f95fcec41c3800abd49722c34cb4554c495d5 Mon Sep 17 00:00:00 2001
From: serxa 
Date: Thu, 9 Feb 2023 18:54:15 +0000
Subject: [PATCH 20/69] Replace background executor scheduling of merges and
 mutations with round-robin

---
 docker/test/stress/run.sh                     |   2 +-
 src/Interpreters/Context.h                    |   8 +-
 .../MergeTree/MergeTreeBackgroundExecutor.cpp |   5 +-
 .../MergeTree/MergeTreeBackgroundExecutor.h   | 123 +++++++++++++++---
 .../MergeTree/tests/gtest_executor.cpp        |  77 ++++++++++-
 5 files changed, 184 insertions(+), 31 deletions(-)

diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh
index 314be8ae0fd..c99a34a3911 100644
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@@ -642,7 +642,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
             -e "} TCPHandler: Code:" \
             -e "} executeQuery: Code:" \
             -e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \
-            -e "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part" \
+            -e "[Queue = DB::RoundRobinRuntimeQueue]: Code: 235. DB::Exception: Part" \
             -e "The set of parts restored in place of" \
             -e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \
             -e "Code: 269. DB::Exception: Destination table is myself" \
diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h
index f40f8608092..000752542c7 100644
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@@ -129,11 +129,11 @@ class StoragePolicySelector;
 using StoragePolicySelectorPtr = std::shared_ptr;
 template class MergeTreeBackgroundExecutor;
-class MergeMutateRuntimeQueue;
-class OrdinaryRuntimeQueue;
-using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor;
+class RoundRobinRuntimeQueue;
+class FifoRuntimeQueue;
+using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor;
 using MergeMutateBackgroundExecutorPtr = std::shared_ptr;
-using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor;
+using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor;
 using OrdinaryBackgroundExecutorPtr = std::shared_ptr;
 struct PartUUIDs;
 using PartUUIDsPtr = std::shared_ptr;
diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp
index 5bc3fda88bb..ded48dc69ea 100644
--- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp
@@ -268,7 +268,8 @@ void MergeTreeBackgroundExecutor::threadFunction()
 }

-template class MergeTreeBackgroundExecutor;
-template class MergeTreeBackgroundExecutor;
+template class MergeTreeBackgroundExecutor;
+template class MergeTreeBackgroundExecutor;
+template class MergeTreeBackgroundExecutor;

 }
diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h
index 5c1178a1bc1..3030c28ef1d 100644
--- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h
+++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h
@@ -67,8 +67,8 @@ struct TaskRuntimeData
     }
 };

-
-class OrdinaryRuntimeQueue
+/// Simplest First-in-First-out queue, ignores priority. 
+class FifoRuntimeQueue { public: TaskRuntimeDataPtr pop() @@ -83,7 +83,7 @@ public: void remove(StorageID id) { auto it = std::remove_if(queue.begin(), queue.end(), - [&] (auto item) -> bool { return item->task->getStorageID() == id; }); + [&] (auto && item) -> bool { return item->task->getStorageID() == id; }); queue.erase(it, queue.end()); } @@ -94,8 +94,8 @@ private: boost::circular_buffer queue{0}; }; -/// Uses a heap to pop a task with minimal priority -class MergeMutateRuntimeQueue +/// Uses a heap to pop a task with minimal priority. +class PriorityRuntimeQueue { public: TaskRuntimeDataPtr pop() @@ -115,10 +115,7 @@ public: void remove(StorageID id) { - auto it = std::remove_if(buffer.begin(), buffer.end(), - [&] (auto item) -> bool { return item->task->getStorageID() == id; }); - buffer.erase(it, buffer.end()); - + std::erase_if(buffer, [&] (auto && item) -> bool { return item->task->getStorageID() == id; }); std::make_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority); } @@ -126,7 +123,85 @@ public: bool empty() { return buffer.empty(); } private: - std::vector buffer{}; + std::vector buffer; +}; + +/// Round-robin queue, ignores priority. +class RoundRobinRuntimeQueue +{ +public: + TaskRuntimeDataPtr pop() + { + assert(buffer.size() != unused); + while (buffer[current] == nullptr) + current = (current + 1) % buffer.size(); + auto result = std::move(buffer[current]); + buffer[current] = nullptr; // mark as unused + unused++; + if (buffer.size() == unused) + { + buffer.clear(); + unused = current = 0; + } + else + current = (current + 1) % buffer.size(); + return result; + } + + // Inserts element just before round-robin pointer. + // This way inserted element will be pop()-ed last. It guarantees fairness to avoid starvation. + void push(TaskRuntimeDataPtr item) + { + if (unused == 0) + { + buffer.insert(buffer.begin() + current, std::move(item)); + current = (current + 1) % buffer.size(); + } + else // Optimization to avoid O(N) complexity -- reuse unused elements + { + assert(!buffer.empty()); + size_t pos = (current + buffer.size() - 1) % buffer.size(); + while (buffer[pos] != nullptr) + { + std::swap(item, buffer[pos]); + pos = (pos + buffer.size() - 1) % buffer.size(); + } + buffer[pos] = std::move(item); + unused--; + } + } + + void remove(StorageID id) + { + // This is similar to `std::erase_if()`, but we also track movement of `current` element + size_t saved = 0; + size_t new_current = 0; + for (size_t i = 0; i < buffer.size(); i++) + { + if (buffer[i] != nullptr && buffer[i]->task->getStorageID() != id) // erase unused and matching elements + { + if (i < current) + new_current++; + if (i != saved) + buffer[saved] = std::move(buffer[i]); + saved++; + } + } + buffer.erase(buffer.begin() + saved, buffer.end()); + current = new_current; + unused = 0; + } + + void setCapacity(size_t count) { buffer.reserve(count); } + bool empty() { return buffer.empty(); } + +private: + // Buffer can contain unused elements (nullptrs) + // Unused elements are created by pop() and reused by push() + std::vector buffer; + + size_t current = 0; // round-robin pointer + size_t unused = 0; // number of nullptr elements }; /** @@ -149,13 +224,21 @@ private: * |s| * * Each task is simply a sequence of steps. Heavier tasks have longer sequences. - * When a step of a task is executed, we move tasks to pending queue. And take another from the queue's head. - * With these architecture all small merges / mutations will be executed faster, than bigger ones. 
+ * When a step of a task is executed, we move tasks to pending queue. And take the next task from pending queue.
+ * Next task is chosen from pending tasks using one of the scheduling policies (class Queue):
+ * 1) FifoRuntimeQueue. The simplest First-in-First-out queue. Guarantees tasks are executed in order.
+ * 2) PriorityRuntimeQueue. Uses heap to select task with smallest priority value.
+ *    With this architecture all small merges / mutations will be executed faster, than bigger ones.
+ *    WARNING: Starvation is possible in case of overload.
+ * 3) RoundRobinRuntimeQueue. Next task is selected using a round-robin pointer, which iterates over all tasks in rounds.
+ *    When a task is added to the pending queue, it is placed just before the round-robin pointer
+ *    to give every other task an opportunity to execute one step.
+ *    With this architecture all merges / mutations are fairly scheduled and never starved.
+ *    All decisions regarding priorities are left to components creating tasks (e.g. SimpleMergeSelector).
  *
- * We use boost::circular_buffer as a container for queues not to do any allocations.
- *
- * Another nuisance that we faces with is than background operations always interact with an associated Storage.
- * So, when a Storage want to shutdown, it must wait until all its background operations are finished.
+ * We use boost::circular_buffer as a container for active queue to avoid allocations.
+ * Another nuisance that we face is that background operations always interact with an associated Storage.
+ * So, when a Storage wants to shutdown, it must wait until all its background operations are finished.
  */
 template 
 class MergeTreeBackgroundExecutor final : boost::noncopyable
@@ -225,10 +308,8 @@ private:
     Poco::Logger * log = &Poco::Logger::get("MergeTreeBackgroundExecutor");
 };

-extern template class MergeTreeBackgroundExecutor;
-extern template class MergeTreeBackgroundExecutor;
-
-using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor;
-using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor;
+extern template class MergeTreeBackgroundExecutor;
+extern template class MergeTreeBackgroundExecutor;
+extern template class MergeTreeBackgroundExecutor;

 }
diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp
index b89692869fd..19f6d345aa6 100644
--- a/src/Storages/MergeTree/tests/gtest_executor.cpp
+++ b/src/Storages/MergeTree/tests/gtest_executor.cpp
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -55,16 +56,86 @@ private:
     std::uniform_int_distribution<> distribution;

     String name;
-    std::function on_completed;
 };

+using StepFunc = std::function;
+
+class LambdaExecutableTask : public IExecutableTask
+{
+public:
+    explicit LambdaExecutableTask(const String & name_, size_t step_count_, StepFunc step_func_ = {})
+        : name(name_)
+        , step_count(step_count_)
+        , step_func(step_func_)
+    {}
+
+    bool executeStep() override
+    {
+        if (step_func)
+            step_func(name, step_count);
+        return --step_count;
+    }
+
+    StorageID getStorageID() override
+    {
+        return {"test", name};
+    }
+
+    void onCompleted() override {}
+    UInt64 getPriority() override { return 0; }
+
+private:
+    String name;
+    size_t step_count;
+    StepFunc step_func;
+};
+
+
+TEST(Executor, RoundRobin)
+{
+    auto executor = std::make_shared>
+    (
+        "GTest",
+        1, // threads
+        100, // max_tasks
+        CurrentMetrics::BackgroundMergesAndMutationsPoolTask
+    );
+
+    String schedule; // mutex is not required because we have a single worker
+    String 
expected_schedule = "ABCDEABCDABCDBCDCDD"; + std::barrier barrier(2); + auto task = [&] (const String & name, size_t) + { + schedule += name; + if (schedule.size() == expected_schedule.size()) + barrier.arrive_and_wait(); + }; + + // Schedule tasks from this `init_task` to guarantee atomicity. + // Worker will see pending queue when we push all tasks. + // This is required to check scheduling properties of round-robin in deterministic way. + auto init_task = [&] (const String &, size_t) + { + executor->trySchedule(std::make_shared("A", 3, task)); + executor->trySchedule(std::make_shared("B", 4, task)); + executor->trySchedule(std::make_shared("C", 5, task)); + executor->trySchedule(std::make_shared("D", 6, task)); + executor->trySchedule(std::make_shared("E", 1, task)); + }; + + executor->trySchedule(std::make_shared("init_task", 1, init_task)); + barrier.arrive_and_wait(); // Do not finish until tasks are done + executor->wait(); + ASSERT_EQ(schedule, expected_schedule); +} + TEST(Executor, RemoveTasks) { const size_t tasks_kinds = 25; const size_t batch = 100; - auto executor = std::make_shared + auto executor = std::make_shared> ( "GTest", tasks_kinds, @@ -105,7 +176,7 @@ TEST(Executor, RemoveTasksStress) const size_t schedulers_count = 5; const size_t removers_count = 5; - auto executor = std::make_shared + auto executor = std::make_shared> ( "GTest", tasks_kinds, From cc3c76944b9db2f826331e6b05e792a715484358 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 9 Feb 2023 19:31:54 +0000 Subject: [PATCH 21/69] remove redundant code --- src/Interpreters/Context.h | 3 +- .../MergeTree/MergeTreeBackgroundExecutor.cpp | 3 +- .../MergeTree/MergeTreeBackgroundExecutor.h | 92 +------------------ .../MergeTree/tests/gtest_executor.cpp | 6 +- 4 files changed, 10 insertions(+), 94 deletions(-) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 000752542c7..abd2818520c 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -130,10 +130,9 @@ using StoragePolicySelectorPtr = std::shared_ptr; template class MergeTreeBackgroundExecutor; class RoundRobinRuntimeQueue; -class FifoRuntimeQueue; using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor; using MergeMutateBackgroundExecutorPtr = std::shared_ptr; -using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor; +using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor; using OrdinaryBackgroundExecutorPtr = std::shared_ptr; struct PartUUIDs; using PartUUIDsPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index ded48dc69ea..0fe2634eeba 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -268,8 +268,7 @@ void MergeTreeBackgroundExecutor::threadFunction() } -template class MergeTreeBackgroundExecutor; -template class MergeTreeBackgroundExecutor; template class MergeTreeBackgroundExecutor; +template class MergeTreeBackgroundExecutor; } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 3030c28ef1d..5fa31edef89 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -68,7 +68,7 @@ struct TaskRuntimeData }; /// Simplest First-in-First-out queue, ignores priority. 
-class FifoRuntimeQueue +class RoundRobinRuntimeQueue { public: TaskRuntimeDataPtr pop() @@ -126,84 +126,6 @@ private: std::vector buffer; }; -/// Round-robin queue, ignores priority. -class RoundRobinRuntimeQueue -{ -public: - TaskRuntimeDataPtr pop() - { - assert(buffer.size() != unused); - while (buffer[current] == nullptr) - current = (current + 1) % buffer.size(); - auto result = std::move(buffer[current]); - buffer[current] = nullptr; // mark as unused - unused++; - if (buffer.size() == unused) - { - buffer.clear(); - unused = current = 0; - } - else - current = (current + 1) % buffer.size(); - return result; - } - - // Inserts element just before round-robin pointer. - // This way inserted element will be pop()-ed last. It guarantees fairness to avoid starvation. - void push(TaskRuntimeDataPtr item) - { - if (unused == 0) - { - buffer.insert(buffer.begin() + current, std::move(item)); - current = (current + 1) % buffer.size(); - } - else // Optimization to avoid O(N) complexity -- reuse unused elements - { - assert(!buffer.empty()); - size_t pos = (current + buffer.size() - 1) % buffer.size(); - while (buffer[pos] != nullptr) - { - std::swap(item, buffer[pos]); - pos = (pos + buffer.size() - 1) % buffer.size(); - } - buffer[pos] = std::move(item); - unused--; - } - } - - void remove(StorageID id) - { - // This is similar to `std::erase_if()`, but we also track movement of `current` element - size_t saved = 0; - size_t new_current = 0; - for (size_t i = 0; i < buffer.size(); i++) - { - if (buffer[i] != nullptr && buffer[i]->task->getStorageID() != id) // erase unused and matching elements - { - if (i < current) - new_current++; - if (i != saved) - buffer[saved] = std::move(buffer[i]); - saved++; - } - } - buffer.erase(buffer.begin() + saved, buffer.end()); - current = new_current; - unused = 0; - } - - void setCapacity(size_t count) { buffer.reserve(count); } - bool empty() { return buffer.empty(); } - -private: - // Buffer can contain unused elements (nullptrs) - // Unused elements are created by pop() and reused by push() - std::vector buffer; - - size_t current = 0; // round-robin pointer - size_t unused = 0; // number of nullptr elements -}; - /** * Executor for a background MergeTree related operations such as merges, mutations, fetches and so on. * It can execute only successors of ExecutableTask interface. @@ -226,15 +148,12 @@ private: * Each task is simply a sequence of steps. Heavier tasks have longer sequences. * When a step of a task is executed, we move tasks to pending queue. And take the next task from pending queue. * Next task is choosen from pending tasks using one of the scheduling policies (class Queue): - * 1) FifoRuntimeQueue. The simplest First-in-First-out queue. Guaranties tasks are executed in order. + * 1) RoundRobinRuntimeQueue. Uses boost::circular_buffer as FIFO-queue. Next task is taken from queue's head and after one step + * enqueued into queue's tail. With this architecture all merges / mutations are fairly scheduled and never starved. + * All decisions regarding priorities are left to components creating tasks (e.g. SimpleMergeSelector). * 2) PriorityRuntimeQueue. Uses heap to select task with smallest priority value. * With this architecture all small merges / mutations will be executed faster, than bigger ones. * WARNING: Starvation is possible in case of overload. - * 3) RoundRobinRuntimeQueue. Next task is selected, using round-robin pointer, which iterates over all tasks in rounds. 
- * When task is added to pending queue, it is placed just before round-robin pointer - * to given every other task an opportunity to execute one step. - * With this architecture all merges / mutations are fairly scheduled and never starved. - * All decisions regarding priorities are left to components creating tasks (e.g. SimpleMergeSelector). * * We use boost::circular_buffer as a container for active queue to avoid allocations. * Another nuisance that we face is that background operations always interact with an associated Storage. @@ -308,8 +227,7 @@ private: Poco::Logger * log = &Poco::Logger::get("MergeTreeBackgroundExecutor"); }; -extern template class MergeTreeBackgroundExecutor; -extern template class MergeTreeBackgroundExecutor; extern template class MergeTreeBackgroundExecutor; +extern template class MergeTreeBackgroundExecutor; } diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 19f6d345aa6..ecf22269b0b 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -91,7 +91,7 @@ private: }; -TEST(Executor, RoundRobin) +TEST(Executor, Simple) { auto executor = std::make_shared> ( @@ -135,7 +135,7 @@ TEST(Executor, RemoveTasks) const size_t tasks_kinds = 25; const size_t batch = 100; - auto executor = std::make_shared> + auto executor = std::make_shared> ( "GTest", tasks_kinds, @@ -176,7 +176,7 @@ TEST(Executor, RemoveTasksStress) const size_t schedulers_count = 5; const size_t removers_count = 5; - auto executor = std::make_shared> + auto executor = std::make_shared> ( "GTest", tasks_kinds, From a4006ec5a1c471ce41945cc5e8c36ea318d749e8 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 9 Feb 2023 19:49:11 +0000 Subject: [PATCH 22/69] add explanation --- src/Interpreters/Context.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index abd2818520c..85a842f904f 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -129,9 +129,14 @@ class StoragePolicySelector; using StoragePolicySelectorPtr = std::shared_ptr; template class MergeTreeBackgroundExecutor; + +/// Concurrent merges are scheduled using `RoundRobinRuntimeQueue` to ensure fair and starvation-free operation. +/// Previously in heavily overloaded shards big merges could possibly be starved by smaller +/// merges due to the use of strict priority scheduling `PriorityRuntimeQueue`. 
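+/// For example (illustrative, not a recorded incident): with a constant inflow of small
+/// merges, a min-priority heap always picks a small task ahead of one large pending merge,
+/// so the large merge can wait indefinitely; under round-robin every queued task is
+/// guaranteed one step per full cycle.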
class RoundRobinRuntimeQueue;
 using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor;
 using MergeMutateBackgroundExecutorPtr = std::shared_ptr;
+
 using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor;
 using OrdinaryBackgroundExecutorPtr = std::shared_ptr;
 struct PartUUIDs;

From ea670acb7270c1bcefa636107f11cc717a1c28a0 Mon Sep 17 00:00:00 2001
From: Vladimir C 
Date: Fri, 10 Feb 2023 20:05:18 +0100
Subject: [PATCH 23/69] Add comment to requestReadingInOrder

---
 src/Processors/QueryPlan/ReadFromMergeTree.cpp | 2 ++
 src/Storages/StorageMerge.cpp                  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 652cc5d5b12..1ad8ab940da 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -1262,6 +1262,8 @@ bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int direction, size_t limit)
     if (!direction)
         direction = getSortDirection();

+    /// Disable read-in-order optimization for reverse order with final.
+    /// Otherwise, it can lead to incorrect final behavior because the implementation may rely on the reading in direct order.
     if (direction != 1 && isFinal(query_info))
         return false;

diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp
index 43383ac0660..da5472340f1 100644
--- a/src/Storages/StorageMerge.cpp
+++ b/src/Storages/StorageMerge.cpp
@@ -910,6 +910,8 @@ void ReadFromMerge::convertingSourceStream(

 bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_)
 {
+    /// Disable read-in-order optimization for reverse order with final.
+    /// Otherwise, it can lead to incorrect final behavior because the implementation may rely on the reading in direct order.
     if (order_info_->direction != 1 && isFinal(query_info))
         return false;


From c58b165b0fe17340a53f3f520886dbec3312eed2 Mon Sep 17 00:00:00 2001
From: serxa 
Date: Sat, 11 Feb 2023 16:18:42 +0000
Subject: [PATCH 24/69] add config option to select scheduling policy

---
 .../settings.md                               |  18 ++++
 programs/server/Server.cpp                    |   2 +
 programs/server/config.xml                    |   1 +
 src/Interpreters/Context.cpp                  |  11 ++-
 src/Interpreters/Context.h                    |  10 +-
 .../MergeTree/MergeTreeBackgroundExecutor.cpp |   1 +
 .../MergeTree/MergeTreeBackgroundExecutor.h   |  92 ++++++++++++++++++-
 .../MergeTree/tests/gtest_executor.cpp        |  61 ++++++++++--
 8 files changed, 181 insertions(+), 15 deletions(-)

diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index 7e7422f9045..f31db0f2f4d 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -1010,6 +1010,24 @@ Default value: 2.
 <background_merges_mutations_concurrency_ratio>3</background_merges_mutations_concurrency_ratio>
 ```

+## background_merges_mutations_scheduling_policy {#background_merges_mutations_scheduling_policy}
+
+Algorithm used to select the next merge or mutation to be executed by the background thread pool. The policy may be changed at runtime without a server restart.
+Could be applied from the `default` profile for backward compatibility.
+
+Possible values:
+
+- "round_robin" — Every concurrent merge and mutation is executed in round-robin order to ensure starvation-free operation. Smaller merges are completed faster than bigger ones just because they have fewer blocks to merge.
+- "shortest_task_first" — Always execute the smallest merge or mutation. Merges and mutations are assigned priorities based on their resulting size. 
Merges with smaller sizes are strictly preferred over bigger ones. This policy ensures the fastest possible merge of small parts but can lead to indefinite starvation of big merges in partitions heavily overloaded by INSERTs.
+
+Default value: "round_robin".
+
+**Example**
+
+```xml
+<background_merges_mutations_scheduling_policy>shortest_task_first</background_merges_mutations_scheduling_policy>
+```
+
 ## background_move_pool_size {#background_move_pool_size}

 Sets the number of threads performing background moves for tables with MergeTree engines. Could be increased at runtime and could be applied at server startup from the `default` profile for backward compatibility.
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index e0288415a2d..b97b48d9c68 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -1282,6 +1282,8 @@ try
             auto new_pool_size = config->getUInt64("background_pool_size", 16);
             auto new_ratio = config->getUInt64("background_merges_mutations_concurrency_ratio", 2);
             global_context->getMergeMutateExecutor()->increaseThreadsAndMaxTasksCount(new_pool_size, new_pool_size * new_ratio);
+            auto new_scheduling_policy = config->getString("background_merges_mutations_scheduling_policy", "round_robin");
+            global_context->getMergeMutateExecutor()->updateSchedulingPolicy(new_scheduling_policy);
         }

         if (global_context->areBackgroundExecutorsInitialized() && config->has("background_move_pool_size"))
diff --git a/programs/server/config.xml b/programs/server/config.xml
index b1a1514edc8..85cb299e188 100644
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@@ -339,6 +339,7 @@
     <background_buffer_flush_schedule_pool_size>16</background_buffer_flush_schedule_pool_size>
     <background_pool_size>16</background_pool_size>
     <background_merges_mutations_concurrency_ratio>2</background_merges_mutations_concurrency_ratio>
+    <background_merges_mutations_scheduling_policy>round_robin</background_merges_mutations_scheduling_policy>
     <background_move_pool_size>8</background_move_pool_size>
     <background_fetches_pool_size>8</background_fetches_pool_size>
     <background_common_pool_size>8</background_common_pool_size>
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index 36c88abb2ee..822020c567f 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -3746,6 +3746,12 @@ void Context::initializeBackgroundExecutorsIfNeeded()
     else if (config.has("profiles.default.background_merges_mutations_concurrency_ratio"))
         background_merges_mutations_concurrency_ratio = config.getUInt64("profiles.default.background_merges_mutations_concurrency_ratio");

+    String background_merges_mutations_scheduling_policy = "round_robin";
+    if (config.has("background_merges_mutations_scheduling_policy"))
+        background_merges_mutations_scheduling_policy = config.getString("background_merges_mutations_scheduling_policy");
+    else if (config.has("profiles.default.background_merges_mutations_scheduling_policy"))
+        background_merges_mutations_scheduling_policy = config.getString("profiles.default.background_merges_mutations_scheduling_policy");
+
     size_t background_move_pool_size = 8;
     if (config.has("background_move_pool_size"))
         background_move_pool_size = config.getUInt64("background_move_pool_size");
@@ -3772,8 +3778,9 @@ void Context::initializeBackgroundExecutorsIfNeeded()
         /*max_tasks_count*/background_pool_size * background_merges_mutations_concurrency_ratio,
         CurrentMetrics::BackgroundMergesAndMutationsPoolTask
     );
-    LOG_INFO(shared->log, "Initialized background executor for merges and mutations with num_threads={}, num_tasks={}",
-        background_pool_size, background_pool_size * background_merges_mutations_concurrency_ratio);
+    shared->merge_mutate_executor->updateSchedulingPolicy(background_merges_mutations_scheduling_policy);
+    LOG_INFO(shared->log, "Initialized background executor for merges and mutations with num_threads={}, num_tasks={}, scheduling_policy={}",
+        background_pool_size, background_pool_size * background_merges_mutations_concurrency_ratio, background_merges_mutations_scheduling_policy);

     shared->moves_executor = 
std::make_shared ( diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 1105d2a8315..8f0bbc986e5 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -130,13 +130,15 @@ using StoragePolicySelectorPtr = std::shared_ptr; template class MergeTreeBackgroundExecutor; -/// Concurrent merges are scheduled using `RoundRobinRuntimeQueue` to ensure fair and starvation-free operation. +/// Scheduling policy can be changed using `background_merges_mutations_scheduling_policy` config option. +/// By default concurrent merges are scheduled using "round_robin" to ensure fair and starvation-free operation. /// Previously in heavily overloaded shards big merges could possibly be starved by smaller -/// merges due to the use of strict priority scheduling `PriorityRuntimeQueue`. -class RoundRobinRuntimeQueue; -using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor; +/// merges due to the use of strict priority scheduling "shortest_task_first". +class DynamicRuntimeQueue; +using MergeMutateBackgroundExecutor = MergeTreeBackgroundExecutor; using MergeMutateBackgroundExecutorPtr = std::shared_ptr; +class RoundRobinRuntimeQueue; using OrdinaryBackgroundExecutor = MergeTreeBackgroundExecutor; using OrdinaryBackgroundExecutorPtr = std::shared_ptr; struct PartUUIDs; diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index 0fe2634eeba..d0469c35cef 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -270,5 +270,6 @@ void MergeTreeBackgroundExecutor::threadFunction() template class MergeTreeBackgroundExecutor; template class MergeTreeBackgroundExecutor; +template class MergeTreeBackgroundExecutor; } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 5fa31edef89..c556a18ca54 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -6,7 +6,9 @@ #include #include #include -#include +#include +#include + #include #include @@ -78,7 +80,10 @@ public: return result; } - void push(TaskRuntimeDataPtr item) { queue.push_back(std::move(item));} + void push(TaskRuntimeDataPtr item) + { + queue.push_back(std::move(item)); + } void remove(StorageID id) { @@ -90,6 +95,8 @@ public: void setCapacity(size_t count) { queue.set_capacity(count); } bool empty() { return queue.empty(); } + static constexpr std::string_view name = "round_robin"; + private: boost::circular_buffer queue{0}; }; @@ -122,10 +129,82 @@ public: void setCapacity(size_t count) { buffer.reserve(count); } bool empty() { return buffer.empty(); } + static constexpr std::string_view name = "shortest_task_first"; + private: std::vector buffer; }; +/// Queue that can dynamically change scheduling policy +template +class DynamicRuntimeQueueImpl +{ +public: + TaskRuntimeDataPtr pop() + { + return std::visit([&] (auto && queue) { return queue.pop(); }, impl); + } + + void push(TaskRuntimeDataPtr item) + { + std::visit([&] (auto && queue) { queue.push(std::move(item)); }, impl); + } + + void remove(StorageID id) + { + std::visit([&] (auto && queue) { queue.remove(id); }, impl); + } + + void setCapacity(size_t count) + { + capacity = count; + std::visit([&] (auto && queue) { queue.setCapacity(count); }, impl); + } + + bool empty() + { + return std::visit([&] (auto && queue) { return queue.empty(); }, impl); + } + + // Change 
policy. It does nothing if new policy is unknown or equals current policy. + void updatePolicy(std::string_view name) + { + // We use this double lambda trick to generate code for all possible pairs of types of old and new queue. + // If types are different it moves tasks from old queue to new one using corresponding pop() and push() + resolve(name, [&] (std::in_place_type_t) + { + std::visit([&] (auto && queue) + { + if constexpr (std::is_same_v) + return; // The same policy + NewQueue new_queue; + new_queue.setCapacity(capacity); + while (!queue.empty()) + new_queue.push(queue.pop()); + impl = std::move(new_queue); + }, impl); + }); + } + +private: + // Find policy with specified `name` and call `func()` if found. + // Tag `std::in_place_type_t` used to help templated lambda to deduce type T w/o creating its instance + template + void resolve(std::string_view name, Func && func) + { + if (T::name == name) + return func(std::in_place_type); + if constexpr (sizeof...(Ts)) + return resolve(name, std::forward(func)); + } + + std::variant impl; + size_t capacity; +}; + +// Avoid typedef and alias to facilitate forward declaration +class DynamicRuntimeQueue : public DynamicRuntimeQueueImpl {}; + /** * Executor for a background MergeTree related operations such as merges, mutations, fetches and so on. * It can execute only successors of ExecutableTask interface. @@ -206,6 +285,14 @@ public: void removeTasksCorrespondingToStorage(StorageID id); void wait(); + /// Update + void updateSchedulingPolicy(std::string_view new_policy) + requires requires(Queue queue) { queue.updatePolicy(new_policy); } // Because we use explicit template instantiation + { + std::lock_guard lock(mutex); + pending.updatePolicy(new_policy); + } + private: String name; size_t threads_count TSA_GUARDED_BY(mutex) = 0; @@ -229,5 +316,6 @@ private: extern template class MergeTreeBackgroundExecutor; extern template class MergeTreeBackgroundExecutor; +extern template class MergeTreeBackgroundExecutor; } diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index ecf22269b0b..6a73be28d8c 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -9,6 +9,7 @@ #include #include + using namespace DB; namespace CurrentMetrics @@ -63,10 +64,11 @@ using StepFunc = std::function; class LambdaExecutableTask : public IExecutableTask { public: - explicit LambdaExecutableTask(const String & name_, size_t step_count_, StepFunc step_func_ = {}) + explicit LambdaExecutableTask(const String & name_, size_t step_count_, StepFunc step_func_ = {}, UInt64 priority_ = 0) : name(name_) , step_count(step_count_) , step_func(step_func_) + , priority(priority_) {} bool executeStep() override @@ -82,12 +84,14 @@ public: } void onCompleted() override {} - UInt64 getPriority() override { return 0; } + + UInt64 getPriority() override { return priority; } private: String name; size_t step_count; StepFunc step_func; + UInt64 priority; }; @@ -116,11 +120,11 @@ TEST(Executor, Simple) // This is required to check scheduling properties of round-robin in deterministic way. 
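    // (For reference, the expected schedule "ABCDEABCDABCDBCDCDD" can be derived by hand:
    // with step counts A=3, B=4, C=5, D=6, E=1, round-robin runs one step per task per
    // cycle and drops tasks as they finish: ABCDE, ABCD, ABCD, BCD, CD, D.)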
auto init_task = [&] (const String &, size_t) { - executor->trySchedule(std::make_shared("A", 3, task)); - executor->trySchedule(std::make_shared("B", 4, task)); - executor->trySchedule(std::make_shared("C", 5, task)); - executor->trySchedule(std::make_shared("D", 6, task)); - executor->trySchedule(std::make_shared("E", 1, task)); + executor->trySchedule(std::make_shared("A", 3, task)); + executor->trySchedule(std::make_shared("B", 4, task)); + executor->trySchedule(std::make_shared("C", 5, task)); + executor->trySchedule(std::make_shared("D", 6, task)); + executor->trySchedule(std::make_shared("E", 1, task)); }; executor->trySchedule(std::make_shared("init_task", 1, init_task)); @@ -222,3 +226,46 @@ TEST(Executor, RemoveTasksStress) ASSERT_EQ(CurrentMetrics::values[CurrentMetrics::BackgroundMergesAndMutationsPoolTask], 0); } + + +TEST(Executor, UpdatePolicy) +{ + auto executor = std::make_shared> + ( + "GTest", + 1, // threads + 100, // max_tasks + CurrentMetrics::BackgroundMergesAndMutationsPoolTask + ); + + String schedule; // mutex is not required because we have a single worker + String expected_schedule = "ABCDEDDDDDCCBACBACB"; + std::barrier barrier(2); + auto task = [&] (const String & name, size_t) + { + schedule += name; + if (schedule.size() == 5) + executor->updateSchedulingPolicy(PriorityRuntimeQueue::name); + if (schedule.size() == 12) + executor->updateSchedulingPolicy(RoundRobinRuntimeQueue::name); + if (schedule.size() == expected_schedule.size()) + barrier.arrive_and_wait(); + }; + + // Schedule tasks from this `init_task` to guarantee atomicity. + // Worker will see pending queue when we push all tasks. + // This is required to check scheduling properties in a deterministic way. + auto init_task = [&] (const String &, size_t) + { + executor->trySchedule(std::make_shared("A", 3, task, 5)); + executor->trySchedule(std::make_shared("B", 4, task, 4)); + executor->trySchedule(std::make_shared("C", 5, task, 3)); + executor->trySchedule(std::make_shared("D", 6, task, 2)); + executor->trySchedule(std::make_shared("E", 1, task, 1)); + }; + + executor->trySchedule(std::make_shared("init_task", 1, init_task)); + barrier.arrive_and_wait(); // Do not finish until tasks are done + executor->wait(); + ASSERT_EQ(schedule, expected_schedule); +} From d2f8377c479b5f3024871e13b42fd457ef1a7002 Mon Sep 17 00:00:00 2001 From: serxa Date: Sat, 11 Feb 2023 16:45:45 +0000 Subject: [PATCH 25/69] fix check --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index c556a18ca54..2e0a24259ad 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -19,6 +19,7 @@ #include #include + namespace DB { namespace ErrorCodes @@ -175,7 +176,7 @@ public: { std::visit([&] (auto && queue) { - if constexpr (std::is_same_v) + if constexpr (std::is_same_v, NewQueue>) return; // The same policy NewQueue new_queue; new_queue.setCapacity(capacity); From c17dc8e284c55e72b0be0648e419dc2d8f4e0640 Mon Sep 17 00:00:00 2001 From: serxa Date: Sat, 11 Feb 2023 16:48:47 +0000 Subject: [PATCH 26/69] fix --- docker/test/stress/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 78d27dddb65..41ff17bbe10 100644 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -642,7 +642,7 @@ if [ 
"$DISABLE_BC_CHECK" -ne "1" ]; then -e "} TCPHandler: Code:" \ -e "} executeQuery: Code:" \ -e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \ - -e "[Queue = DB::RoundRobinRuntimeQueue]: Code: 235. DB::Exception: Part" \ + -e "[Queue = DB::DynamicRuntimeQueue]: Code: 235. DB::Exception: Part" \ -e "The set of parts restored in place of" \ -e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \ -e "Code: 269. DB::Exception: Destination table is myself" \ From 9a7f70f595ef3fa77871d0c5de9386be8f9b5b33 Mon Sep 17 00:00:00 2001 From: serxa Date: Sat, 11 Feb 2023 16:54:57 +0000 Subject: [PATCH 27/69] fix comment --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 2e0a24259ad..3e2f5525097 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -286,7 +286,7 @@ public: void removeTasksCorrespondingToStorage(StorageID id); void wait(); - /// Update + /// Update scheduling policy for pending tasks. It does nothing if `new_policy` is the same or unknown. void updateSchedulingPolicy(std::string_view new_policy) requires requires(Queue queue) { queue.updatePolicy(new_policy); } // Because we use explicit template instantiation { From df6b4c43721de6e5a85ad8ac6c45b687a450f830 Mon Sep 17 00:00:00 2001 From: serxa Date: Sat, 11 Feb 2023 20:18:19 +0000 Subject: [PATCH 28/69] fix typo --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 3e2f5525097..7745e5de334 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -227,7 +227,7 @@ class DynamicRuntimeQueue : public DynamicRuntimeQueueImpl Date: Sun, 12 Feb 2023 02:11:46 +0000 Subject: [PATCH 29/69] propagate storage limits to subquery --- src/Planner/PlannerJoinTree.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 95269f70bcc..6cac504557d 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -336,6 +336,8 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(const QueryTreeNodePtr & tabl { auto subquery_options = select_query_options.subquery(); Planner subquery_planner(table_expression, subquery_options, planner_context->getGlobalPlannerContext()); + /// Propagate storage limits to subquery + subquery_planner.addStorageLimits(*select_query_info.storage_limits); subquery_planner.buildQueryPlanIfNeeded(); query_plan = std::move(subquery_planner).extractQueryPlan(); } From 89cb55aa1f9f5dafb92ac9bbdead46b7205a47ee Mon Sep 17 00:00:00 2001 From: Vladimir C Date: Mon, 13 Feb 2023 12:19:31 +0100 Subject: [PATCH 30/69] whitespace --- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 4 ++-- src/Storages/StorageMerge.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1ad8ab940da..0ee288168a8 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1262,8 +1262,8 @@ bool ReadFromMergeTree::requestReadingInOrder(size_t prefix_size, int 
direction,
     if (!direction)
         direction = getSortDirection();
 
-    /// Disable read-in-order optimization for reverse order with final.
-    /// Otherwise, it can lead to incorrect final behavior because the implementation may rely on the reading in direct order.
+    /// Disable read-in-order optimization for reverse order with final.
+    /// Otherwise, it can lead to incorrect final behavior because the implementation may rely on the reading in direct order.
     if (direction != 1 && isFinal(query_info))
         return false;
 
diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp
index da5472340f1..a30841d2975 100644
--- a/src/Storages/StorageMerge.cpp
+++ b/src/Storages/StorageMerge.cpp
@@ -910,8 +910,8 @@ void ReadFromMerge::convertingSourceStream(
 
 bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_)
 {
-    /// Disable read-in-order optimization for reverse order with final.
-    /// Otherwise, it can lead to incorrect final behavior because the implementation may rely on the reading in direct order.
+    /// Disable read-in-order optimization for reverse order with final.
+    /// Otherwise, it can lead to incorrect final behavior because the implementation may rely on the reading in direct order.
     if (order_info_->direction != 1 && isFinal(query_info))
         return false;
 

From df68017fff50c5c8d280c7f7e65625954bccafdd Mon Sep 17 00:00:00 2001
From: SadiHassan
Date: Mon, 13 Feb 2023 08:18:49 -0800
Subject: [PATCH 31/69] update unixodbc to v2.3.11 to mitigate CVE-2011-1145

---
 contrib/unixodbc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/unixodbc b/contrib/unixodbc
index a2cd5395e8c..6c8071b1bef 160000
--- a/contrib/unixodbc
+++ b/contrib/unixodbc
@@ -1 +1 @@
-Subproject commit a2cd5395e8c7f7390025ec93af5bfebef3fb5fcd
+Subproject commit 6c8071b1bef4e4991e7b3023a1c1c712168a818e

From 9a7f70f595ef3fa77871d0c5de9386be8f9b5b33 Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Fri, 10 Feb 2023 13:08:34 +0800
Subject: [PATCH 32/69] add new window function, ntile

---
 src/Processors/Transforms/WindowTransform.cpp | 123 +++++++++++++++++-
 1 file changed, 122 insertions(+), 1 deletion(-)

diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp
index cb9ab95fba4..2802d7d2687 100644
--- a/src/Processors/Transforms/WindowTransform.cpp
+++ b/src/Processors/Transforms/WindowTransform.cpp
@@ -18,7 +18,6 @@
 #include
 #include
-
 namespace DB
 {
 
@@ -1970,6 +1969,121 @@ struct WindowFunctionRowNumber final : public WindowFunction
     }
 };
 
+// Usage: ntile(n). n is the number of buckets.
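+//
+// Editorial sketch (not part of the original patch): a self-contained model of the
+// bucket arithmetic the struct below implements; the helper name is illustrative only.
+//
+//     // Returns the 1-based ntile bucket of a 0-based row within its partition.
+//     static UInt64 ntileBucketOf(UInt64 row, UInt64 rows, UInt64 buckets)
+//     {
+//         UInt64 base = rows / buckets;       // every bucket holds at least `base` rows
+//         UInt64 extra = rows % buckets;      // the first `extra` buckets hold one more
+//         UInt64 big_rows = extra * (base + 1);
+//         return row < big_rows
+//             ? row / (base + 1) + 1
+//             : extra + (row - big_rows) / base + 1; // base > 0 here, since row >= big_rows
+//     }
+//
+// For rows = 10, buckets = 3 this gives bucket sizes {4, 3, 3}, matching the
+// capacity/diff loop in windowInsertResultInto below and the expected test output.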
+struct WindowFunctionNtile final : public WindowFunction +{ + WindowFunctionNtile(const std::string & name_, + const DataTypes & argument_types_, const Array & parameters_) + : WindowFunction(name_, argument_types_, parameters_, std::make_shared()) + { + if (argument_types.size() != 1) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} takes exactly one parameter", name_); + } + } + + bool allocatesMemoryInArena() const override { return false; } + + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override + { + if (!buckets) [[unlikely]] + { + checkWindowFrameType(transform); + const auto & current_block = transform->blockAt(transform->current_row); + const auto & workspace = transform->workspaces[function_index]; + auto n = (*current_block.input_columns[ + workspace.argument_column_indices[0]])[ + transform->current_row.row].get(); + if (n <= 0) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's argument must > 0"); + } + buckets = static_cast(n); + } + // new partition + if (transform->current_row_number == 1) [[unlikely]] + { + current_partition_rows = 0; + start_row = transform->current_row; + } + current_partition_rows++; + + // Only do the action when we meet the last row in this partition. + if (!transform->partition_ended) + return; + else + { + auto current_row = transform->current_row; + current_row.row++; + const auto & end_row = transform->partition_end; + if (current_row != end_row) + { + if (!transform->input_is_finished || end_row.block != current_row.block + 1 || end_row.row) + { + return; + } + // else, current_row is the last input row. + } + } + + auto bucket_capacity = current_partition_rows / buckets; + auto capacity_diff = current_partition_rows - bucket_capacity * buckets; + + // bucket number starts from 1. + UInt64 bucket_num = 1; + for (UInt64 r = 0; r < current_partition_rows;) + { + auto current_bucket_capacity = bucket_capacity; + if (capacity_diff > 0) + { + current_bucket_capacity += 1; + capacity_diff--; + } + Int64 left_rows = static_cast(current_bucket_capacity); + while (left_rows > 0) + { + auto available_block_rows = transform->blockRowsNumber(start_row) - start_row.row; + IColumn & to = *transform->blockAt(start_row).output_columns[function_index]; + auto & pod_array = assert_cast(to).getData(); + if (left_rows < static_cast(available_block_rows)) + { + pod_array.resize_fill(pod_array.size() + static_cast(left_rows), bucket_num); + start_row.row += static_cast(left_rows); + left_rows = 0; + } + else + { + pod_array.resize_fill(pod_array.size() + available_block_rows, bucket_num); + left_rows -= static_cast(available_block_rows); + start_row.block++; + start_row.row = 0; + } + } + + r += current_bucket_capacity; + bucket_num += 1; + } + } +private: + UInt64 buckets = 0; + RowNumber start_row; + UInt64 current_partition_rows = 0; + + void checkWindowFrameType(const WindowTransform * transform) + { + if (transform->window_description.frame.begin_type != WindowFrame::BoundaryType::Unbounded) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame start type must be Unbounded"); + } + + if (transform->window_description.frame.end_type != WindowFrame::BoundaryType::Current) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame end type must be Current"); + } + } +}; + // ClickHouse-specific variant of lag/lead that respects the window frame. 
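// Editorial note: per the documentation table touched later in this series, plain lag/lead
// are not supported; lagInFrame/leadInFrame only look at rows inside the window frame and
// return the default value otherwise. As a usage sketch (not from this patch), standard
// lag(value, 1) can be emulated with
// lagInFrame(value, 1) over (order by t rows between unbounded preceding and unbounded following).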
template struct WindowFunctionLagLeadInFrame final : public WindowFunction @@ -2338,6 +2452,13 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) parameters); }, properties}, AggregateFunctionFactory::CaseInsensitive); + factory.registerFunction("ntile", {[](const std::string & name, + const DataTypes & argument_types, const Array & parameters, const Settings *) + { + return std::make_shared(name, argument_types, + parameters); + }, properties}, AggregateFunctionFactory::CaseInsensitive); + factory.registerFunction("nth_value", {[](const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *) { From 80a713bca0ded0c8e10f186d62aef55acd23cc6b Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Fri, 10 Feb 2023 14:47:59 +0800 Subject: [PATCH 33/69] update doc and add tests --- .../sql-reference/window-functions/index.md | 1 + .../01591_window_functions.reference | 43 +++++++++++++++++++ .../0_stateless/01591_window_functions.sql | 4 ++ 3 files changed, 48 insertions(+) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index f8107e3310e..e42a28f916f 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -21,6 +21,7 @@ ClickHouse supports the standard grammar for defining windows and window functio | `lag/lead(value, offset)` | Not supported. Workarounds: | | | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead` | | | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | +| ntile(buckets) | supported | ## ClickHouse-specific Window Functions diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index aaa88d66ca0..e9cd5ff8160 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -1277,3 +1277,46 @@ select count() over (w) from numbers(1) window w as (); 1 -- nonexistent parent window select count() over (w2 rows unbounded preceding); -- { serverError 36 } +-- ntile +select a, b, ntile(3) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +0 0 1 +0 1 1 +0 2 1 +0 3 1 +0 4 2 +0 5 2 +0 6 2 +0 7 3 +0 8 3 +0 9 3 +1 0 1 +1 1 1 +1 2 1 +1 3 1 +1 4 2 +1 5 2 +1 6 2 +1 7 3 +1 8 3 +1 9 3 +select a, b, ntile(2) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +0 0 1 +0 1 1 +0 2 1 +0 3 1 +0 4 1 +0 5 2 +0 6 2 +0 7 2 +0 8 2 +0 9 2 +1 0 1 +1 1 1 +1 2 1 +1 3 1 +1 4 1 +1 5 2 +1 6 2 +1 7 2 +1 8 2 +1 9 2 \ No newline at end of file diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 3f4a028eac2..2e77cee5375 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -545,3 +545,7 @@ select count() over (w) from numbers(1) window w as (); -- nonexistent parent window select count() over (w2 rows unbounded preceding); -- { serverError 36 } + +-- ntile +select a, b, ntile(3) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(2) over (partition by a order by b) from(select 
intDiv(number,10) as a, number%10 as b from numbers(20));

From 82637637d03a4c810f2b5300fdca6015c8b354cb Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Fri, 10 Feb 2023 16:55:30 +0800
Subject: [PATCH 34/69] fixed test case

---
 tests/queries/0_stateless/01591_window_functions.reference | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference
index e9cd5ff8160..8e647437dcf 100644
--- a/tests/queries/0_stateless/01591_window_functions.reference
+++ b/tests/queries/0_stateless/01591_window_functions.reference
@@ -1319,4 +1319,5 @@ select a, b, ntile(2) over (partition by a order by b) from(select intDiv(number
 1 6 2
 1 7 2
 1 8 2
-1 9 2
\ No newline at end of file
+1 9 2
+

From 0b32a655a8a9053a88fed1176eeebcf7712849d5 Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Mon, 13 Feb 2023 12:18:28 +0800
Subject: [PATCH 35/69] fixed

---
 src/Processors/Transforms/WindowTransform.cpp |  42 ++++--
 .../01591_window_functions.reference          |  44 --------
 .../0_stateless/01591_window_functions.sql    |   4 -
 .../0_stateless/02661_window_ntile.reference  | 100 ++++++++++++++++++
 .../0_stateless/02661_window_ntile.sql        |  22 ++++
 5 files changed, 153 insertions(+), 59 deletions(-)
 create mode 100644 tests/queries/0_stateless/02661_window_ntile.reference
 create mode 100644 tests/queries/0_stateless/02661_window_ntile.sql

diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp
index 2802d7d2687..2e9f1568a5e 100644
--- a/src/Processors/Transforms/WindowTransform.cpp
+++ b/src/Processors/Transforms/WindowTransform.cpp
@@ -1980,6 +1980,11 @@ struct WindowFunctionNtile final : public WindowFunction
         {
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} takes exactly one parameter", name_);
         }
+        auto type_id = argument_types[0]->getTypeId();
+        if (type_id != TypeIndex::UInt8 && type_id != TypeIndex::UInt16 && type_id != TypeIndex::UInt32 && type_id != TypeIndex::UInt64)
+        {
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's argument type must be an unsigned integer (not larger than 64-bit), but got {}", argument_types[0]->getName());
+        }
     }
 
     bool allocatesMemoryInArena() const override { return false; }
@@ -1992,14 +1997,23 @@ struct WindowFunctionNtile final : public WindowFunction
             checkWindowFrameType(transform);
             const auto & current_block = transform->blockAt(transform->current_row);
             const auto & workspace = transform->workspaces[function_index];
-            auto n = (*current_block.input_columns[
-                workspace.argument_column_indices[0]])[
-                transform->current_row.row].get();
-            if (n <= 0)
+            auto & arg_col = *current_block.original_input_columns[workspace.argument_column_indices[0]];
+            if (!isColumnConst(arg_col))
+                throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's argument must be a constant");
+            auto type_id = argument_types[0]->getTypeId();
+            if (type_id == TypeIndex::UInt8)
+                buckets = arg_col[transform->current_row.row].get();
+            else if (type_id == TypeIndex::UInt16)
+                buckets = arg_col[transform->current_row.row].get();
+            else if (type_id == TypeIndex::UInt32)
+                buckets = arg_col[transform->current_row.row].get();
+            else if (type_id == TypeIndex::UInt64)
+                buckets = arg_col[transform->current_row.row].get();
+
+            if (!buckets)
             {
                 throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's argument must > 0");
             }
-            buckets = static_cast(n);
         }
         // new partition
         if (transform->current_row_number == 1) [[unlikely]]
         {
             current_partition_rows = 0;
@@ -2040,22
+2054,22 @@ struct WindowFunctionNtile final : public WindowFunction current_bucket_capacity += 1; capacity_diff--; } - Int64 left_rows = static_cast(current_bucket_capacity); - while (left_rows > 0) + auto left_rows = current_bucket_capacity; + while (left_rows) { auto available_block_rows = transform->blockRowsNumber(start_row) - start_row.row; IColumn & to = *transform->blockAt(start_row).output_columns[function_index]; auto & pod_array = assert_cast(to).getData(); - if (left_rows < static_cast(available_block_rows)) + if (left_rows < available_block_rows) { - pod_array.resize_fill(pod_array.size() + static_cast(left_rows), bucket_num); - start_row.row += static_cast(left_rows); + pod_array.resize_fill(pod_array.size() + left_rows, bucket_num); + start_row.row += left_rows; left_rows = 0; } else { pod_array.resize_fill(pod_array.size() + available_block_rows, bucket_num); - left_rows -= static_cast(available_block_rows); + left_rows -= available_block_rows; start_row.block++; start_row.row = 0; } @@ -2072,6 +2086,12 @@ private: void checkWindowFrameType(const WindowTransform * transform) { + if (transform->order_by_indices.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's window frame must have order by clause"); + if (transform->window_description.frame.type != WindowFrame::FrameType::ROWS) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame type must be rows"); + } if (transform->window_description.frame.begin_type != WindowFrame::BoundaryType::Unbounded) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame start type must be Unbounded"); diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 8e647437dcf..aaa88d66ca0 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -1277,47 +1277,3 @@ select count() over (w) from numbers(1) window w as (); 1 -- nonexistent parent window select count() over (w2 rows unbounded preceding); -- { serverError 36 } --- ntile -select a, b, ntile(3) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -0 0 1 -0 1 1 -0 2 1 -0 3 1 -0 4 2 -0 5 2 -0 6 2 -0 7 3 -0 8 3 -0 9 3 -1 0 1 -1 1 1 -1 2 1 -1 3 1 -1 4 2 -1 5 2 -1 6 2 -1 7 3 -1 8 3 -1 9 3 -select a, b, ntile(2) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -0 0 1 -0 1 1 -0 2 1 -0 3 1 -0 4 1 -0 5 2 -0 6 2 -0 7 2 -0 8 2 -0 9 2 -1 0 1 -1 1 1 -1 2 1 -1 3 1 -1 4 1 -1 5 2 -1 6 2 -1 7 2 -1 8 2 -1 9 2 - diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 2e77cee5375..3f4a028eac2 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -545,7 +545,3 @@ select count() over (w) from numbers(1) window w as (); -- nonexistent parent window select count() over (w2 rows unbounded preceding); -- { serverError 36 } - --- ntile -select a, b, ntile(3) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -select a, b, ntile(2) over (partition by a order by b) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); diff --git a/tests/queries/0_stateless/02661_window_ntile.reference b/tests/queries/0_stateless/02661_window_ntile.reference new file mode 100644 index 00000000000..2a0676faef4 --- /dev/null +++ 
b/tests/queries/0_stateless/02661_window_ntile.reference @@ -0,0 +1,100 @@ +-- { echo } + +-- Normal cases +select a, b, ntile(3) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +0 0 1 +0 1 1 +0 2 1 +0 3 1 +0 4 2 +0 5 2 +0 6 2 +0 7 3 +0 8 3 +0 9 3 +1 0 1 +1 1 1 +1 2 1 +1 3 1 +1 4 2 +1 5 2 +1 6 2 +1 7 3 +1 8 3 +1 9 3 +select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +0 0 1 +0 1 1 +0 2 1 +0 3 1 +0 4 1 +0 5 2 +0 6 2 +0 7 2 +0 8 2 +0 9 2 +1 0 1 +1 1 1 +1 2 1 +1 3 1 +1 4 1 +1 5 2 +1 6 2 +1 7 2 +1 8 2 +1 9 2 +select a, b, ntile(1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +0 0 1 +0 1 1 +0 2 1 +0 3 1 +0 4 1 +0 5 1 +0 6 1 +0 7 1 +0 8 1 +0 9 1 +1 0 1 +1 1 1 +1 2 1 +1 3 1 +1 4 1 +1 5 1 +1 6 1 +1 7 1 +1 8 1 +1 9 1 +select a, b, ntile(100) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +0 0 1 +0 1 2 +0 2 3 +0 3 4 +0 4 5 +0 5 6 +0 6 7 +0 7 8 +0 8 9 +0 9 10 +1 0 1 +1 1 2 +1 2 3 +1 3 4 +1 4 5 +1 5 6 +1 6 7 +1 7 8 +1 8 9 +1 9 10 +-- Bad arguments +select a, b, ntile(3.0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile('2') over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(-2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(b + 1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +-- Bad window type +select a, b, ntile(2) over (partition by a) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between current row and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b range unbounded preceding) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } diff --git a/tests/queries/0_stateless/02661_window_ntile.sql 
b/tests/queries/0_stateless/02661_window_ntile.sql new file mode 100644 index 00000000000..80faab7be12 --- /dev/null +++ b/tests/queries/0_stateless/02661_window_ntile.sql @@ -0,0 +1,22 @@ +-- { echo } + +-- Normal cases +select a, b, ntile(3) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(100) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); + +-- Bad arguments +select a, b, ntile(3.0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile('2') over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(-2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(b + 1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } + +-- Bad window type +select a, b, ntile(2) over (partition by a) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between current row and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b range unbounded preceding) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } From 74d6ccad4f28d0130718bc94c2dfda10857c7198 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Tue, 14 Feb 2023 14:10:17 +0800 Subject: [PATCH 36/69] fixed change window specification as (partition by a order by b rows between unbounded preceding and unbounded following) --- .../sql-reference/window-functions/index.md | 2 +- src/Processors/Transforms/WindowTransform.cpp | 26 ++-- .../0_stateless/02661_window_ntile.reference | 123 ++++++++++++++++-- .../0_stateless/02661_window_ntile.sql | 23 ++-- 4 files changed, 
141 insertions(+), 33 deletions(-)

diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md
index e42a28f916f..a8186c20026 100644
--- a/docs/en/sql-reference/window-functions/index.md
+++ b/docs/en/sql-reference/window-functions/index.md
@@ -21,7 +21,7 @@ ClickHouse supports the standard grammar for defining windows and window functio
 | `lag/lead(value, offset)` | Not supported. Workarounds: |
 | | 1) replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead` |
 | | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` |
-| ntile(buckets) | supported |
+| ntile(buckets) | Supported. Specify a window like `(partition by x order by y rows between unbounded preceding and unbounded following)`. |
 
 ## ClickHouse-specific Window Functions
 
diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp
index 2e9f1568a5e..e6b72c77db4 100644
--- a/src/Processors/Transforms/WindowTransform.cpp
+++ b/src/Processors/Transforms/WindowTransform.cpp
@@ -18,6 +18,7 @@
 #include
 #include
+
 namespace DB
 {
 
@@ -1414,7 +1415,6 @@ void WindowTransform::work()
     assert(prev_frame_start <= frame_start);
     const auto first_used_block = std::min(next_output_block_number,
         std::min(prev_frame_start.block, current_row.block));
-
     if (first_block_number < first_used_block)
     {
 //        fmt::print(stderr, "will drop blocks from {} to {}\n", first_block_number,
@@ -2019,6 +2019,7 @@ struct WindowFunctionNtile final : public WindowFunction
         if (transform->current_row_number == 1) [[unlikely]]
         {
             current_partition_rows = 0;
+            current_partition_inserted_row = 0;
             start_row = transform->current_row;
         }
         current_partition_rows++;
@@ -2033,20 +2034,22 @@ struct WindowFunctionNtile final : public WindowFunction
             const auto & end_row = transform->partition_end;
             if (current_row != end_row)
             {
-                if (!transform->input_is_finished || end_row.block != current_row.block + 1 || end_row.row)
+
+                if (current_row.row < transform->blockRowsNumber(current_row))
+                    return;
+                if (end_row.block != current_row.block + 1 || end_row.row)
                 {
                     return;
                 }
                 // else, current_row is the last input row.
             }
         }
-
         auto bucket_capacity = current_partition_rows / buckets;
         auto capacity_diff = current_partition_rows - bucket_capacity * buckets;
 
         // bucket number starts from 1.
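        // Editorial sketch: with current_partition_rows = 10 and buckets = 3, bucket_capacity = 3
        // and capacity_diff = 1, so the loop below emits one bucket of 4 rows followed by two
        // buckets of 3 rows, advancing start_row across block boundaries as it fills the output.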
        UInt64 bucket_num = 1;
-        for (UInt64 r = 0; r < current_partition_rows;)
+        while (current_partition_inserted_row < current_partition_rows)
         {
             auto current_bucket_capacity = bucket_capacity;
             if (capacity_diff > 0)
@@ -2074,8 +2077,7 @@ struct WindowFunctionNtile final : public WindowFunction
                     start_row.row = 0;
                 }
             }
-
-            r += current_bucket_capacity;
+            current_partition_inserted_row += current_bucket_capacity;
             bucket_num += 1;
         }
     }
@@ -2083,6 +2085,7 @@ private:
     UInt64 buckets = 0;
     RowNumber start_row;
     UInt64 current_partition_rows = 0;
+    UInt64 current_partition_inserted_row = 0;
 
     void checkWindowFrameType(const WindowTransform * transform)
     {
@@ -2090,16 +2093,19 @@ private:
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's window frame must have order by clause");
         if (transform->window_description.frame.type != WindowFrame::FrameType::ROWS)
         {
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame type must be rows");
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame type must be ROWS");
         }
         if (transform->window_description.frame.begin_type != WindowFrame::BoundaryType::Unbounded)
         {
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame start type must be Unbounded");
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame start type must be UNBOUNDED PRECEDING");
         }
 
-        if (transform->window_description.frame.end_type != WindowFrame::BoundaryType::Current)
+        if (transform->window_description.frame.end_type != WindowFrame::BoundaryType::Unbounded)
         {
-            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame end type must be Current");
+            // We must wait for the end of the partition to know the total number of rows
+            // in this partition. Therefore, no block can be dropped out before the end of
+            // this partition.
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's frame end type must be UNBOUNDED FOLLOWING");
         }
     }
 };
 
diff --git a/tests/queries/0_stateless/02661_window_ntile.reference b/tests/queries/0_stateless/02661_window_ntile.reference
index 2a0676faef4..cae0586fa8c 100644
--- a/tests/queries/0_stateless/02661_window_ntile.reference
+++ b/tests/queries/0_stateless/02661_window_ntile.reference
@@ -1,7 +1,7 @@
 -- { echo }
 
 -- Normal cases
-select a, b, ntile(3) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20));
+select a, b, ntile(3) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));
 0 0 1
 0 1 1
0 2 1 @@ -64,7 +64,7 @@ select a, b, ntile(1) over (partition by a order by b rows between unbounded pre 1 7 1 1 8 1 1 9 1 -select a, b, ntile(100) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(100) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); 0 0 1 0 1 2 0 2 3 @@ -85,16 +85,117 @@ select a, b, ntile(100) over (partition by a order by b rows between unbounded p 1 7 8 1 8 9 1 9 10 +select a, b, ntile(65535) over (partition by a order by b rows between unbounded preceding and unbounded following) from (select 1 as a, number as b from numbers(65535)) limit 100; +1 0 1 +1 1 2 +1 2 3 +1 3 4 +1 4 5 +1 5 6 +1 6 7 +1 7 8 +1 8 9 +1 9 10 +1 10 11 +1 11 12 +1 12 13 +1 13 14 +1 14 15 +1 15 16 +1 16 17 +1 17 18 +1 18 19 +1 19 20 +1 20 21 +1 21 22 +1 22 23 +1 23 24 +1 24 25 +1 25 26 +1 26 27 +1 27 28 +1 28 29 +1 29 30 +1 30 31 +1 31 32 +1 32 33 +1 33 34 +1 34 35 +1 35 36 +1 36 37 +1 37 38 +1 38 39 +1 39 40 +1 40 41 +1 41 42 +1 42 43 +1 43 44 +1 44 45 +1 45 46 +1 46 47 +1 47 48 +1 48 49 +1 49 50 +1 50 51 +1 51 52 +1 52 53 +1 53 54 +1 54 55 +1 55 56 +1 56 57 +1 57 58 +1 58 59 +1 59 60 +1 60 61 +1 61 62 +1 62 63 +1 63 64 +1 64 65 +1 65 66 +1 66 67 +1 67 68 +1 68 69 +1 69 70 +1 70 71 +1 71 72 +1 72 73 +1 73 74 +1 74 75 +1 75 76 +1 76 77 +1 77 78 +1 78 79 +1 79 80 +1 80 81 +1 81 82 +1 82 83 +1 83 84 +1 84 85 +1 85 86 +1 86 87 +1 87 88 +1 88 89 +1 89 90 +1 90 91 +1 91 92 +1 92 93 +1 93 94 +1 94 95 +1 95 96 +1 96 97 +1 97 98 +1 98 99 +1 99 100 -- Bad arguments -select a, b, ntile(3.0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile('2') over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(-2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(b + 1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(3.0) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile('2') over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(0) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(-2) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(b + 1) 
over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -- Bad window type select a, b, ntile(2) over (partition by a) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } -select a, b, ntile(2) over (partition by a order by b rows between current row and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between current row and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } select a, b, ntile(2) over (partition by a order by b range unbounded preceding) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } diff --git a/tests/queries/0_stateless/02661_window_ntile.sql b/tests/queries/0_stateless/02661_window_ntile.sql index 80faab7be12..4c25ecf4dd2 100644 --- a/tests/queries/0_stateless/02661_window_ntile.sql +++ b/tests/queries/0_stateless/02661_window_ntile.sql @@ -1,22 +1,23 @@ -- { echo } -- Normal cases -select a, b, ntile(3) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -select a, b, ntile(1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -select a, b, ntile(100) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(3) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(1) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(100) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); +select a, b, ntile(65535) over (partition by a order by b rows between unbounded preceding and unbounded following) from (select 1 as a, number as b 
from numbers(65535)) limit 100; -- Bad arguments -select a, b, ntile(3.0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile('2') over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(0) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(-2) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(b + 1) over (partition by a order by b rows between unbounded preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(3.0) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile('2') over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(0) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(-2) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(b + 1) over (partition by a order by b rows between unbounded preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -- Bad window type select a, b, ntile(2) over (partition by a) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } -select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and current row) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and unbounded following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } select a, b, ntile(2) over (partition by a order by b rows between unbounded preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20)); -- { serverError 36 } select a, b, ntile(2) over (partition by a order by b rows between 4 preceding and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } -select a, b, ntile(2) over (partition by a order by b rows between current row and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } +select a, b, ntile(2) over (partition by a order by b rows between current row and 4 following) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 } select a, b, ntile(2) over (partition by a order by b range unbounded preceding) from(select intDiv(number,10) as a, number%10 as b from numbers(20));; -- { serverError 36 
} From bead024ecb2e6ae7b5c13d29c4e424c242c505bf Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 Feb 2023 12:42:58 +0100 Subject: [PATCH 37/69] Fix test --- src/Storages/RabbitMQ/RabbitMQHandler.cpp | 5 +++-- src/Storages/RabbitMQ/RabbitMQHandler.h | 2 +- src/Storages/RabbitMQ/RabbitMQProducer.cpp | 12 ++++++++++-- src/Storages/RabbitMQ/RabbitMQProducer.h | 2 +- tests/integration/test_storage_rabbitmq/test.py | 1 - 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Storages/RabbitMQ/RabbitMQHandler.cpp b/src/Storages/RabbitMQ/RabbitMQHandler.cpp index cbbcd87f79e..5258b9fa7dc 100644 --- a/src/Storages/RabbitMQ/RabbitMQHandler.cpp +++ b/src/Storages/RabbitMQ/RabbitMQHandler.cpp @@ -46,11 +46,12 @@ void RabbitMQHandler::startLoop() loop_running.store(false); } -void RabbitMQHandler::iterateLoop() +bool RabbitMQHandler::iterateLoop() { std::unique_lock lock(startup_mutex, std::defer_lock); if (lock.try_lock()) - uv_run(loop, UV_RUN_NOWAIT); + return uv_run(loop, UV_RUN_NOWAIT); + return 0; } /// Do not need synchronization as in iterateLoop(), because this method is used only for diff --git a/src/Storages/RabbitMQ/RabbitMQHandler.h b/src/Storages/RabbitMQ/RabbitMQHandler.h index 25b32a29f58..4a7c3fc7f78 100644 --- a/src/Storages/RabbitMQ/RabbitMQHandler.h +++ b/src/Storages/RabbitMQ/RabbitMQHandler.h @@ -34,7 +34,7 @@ public: /// Loop to wait for small tasks in a non-blocking mode. /// Adds synchronization with main background loop. - void iterateLoop(); + bool iterateLoop(); /// Loop to wait for small tasks in a blocking mode. /// No synchronization is done with the main loop thread. diff --git a/src/Storages/RabbitMQ/RabbitMQProducer.cpp b/src/Storages/RabbitMQ/RabbitMQProducer.cpp index 2d0719a3293..fc49f74b545 100644 --- a/src/Storages/RabbitMQ/RabbitMQProducer.cpp +++ b/src/Storages/RabbitMQ/RabbitMQProducer.cpp @@ -15,6 +15,7 @@ namespace DB static const auto BATCH = 1000; static const auto RETURNED_LIMIT = 50000; +static const auto FINISH_PRODUCER_NUM_TRIES = 50; namespace ErrorCodes { @@ -254,13 +255,20 @@ void RabbitMQProducer::startProducingTaskLoop() } } + int res = 0; + size_t try_num = 0; + while (++try_num <= FINISH_PRODUCER_NUM_TRIES && (res = iterateEventLoop())) + { + LOG_TEST(log, "Waiting for pending callbacks to finish (count: {}, try: {})", res, try_num); + } + LOG_DEBUG(log, "Producer on channel {} completed", channel_id); } -void RabbitMQProducer::iterateEventLoop() +bool RabbitMQProducer::iterateEventLoop() { - connection.getHandler().iterateLoop(); + return connection.getHandler().iterateLoop(); } } diff --git a/src/Storages/RabbitMQ/RabbitMQProducer.h b/src/Storages/RabbitMQ/RabbitMQProducer.h index 77dadd346ed..e3691c005ee 100644 --- a/src/Storages/RabbitMQ/RabbitMQProducer.h +++ b/src/Storages/RabbitMQ/RabbitMQProducer.h @@ -43,7 +43,7 @@ private: void stopProducingTask() override; void finishImpl() override; - void iterateEventLoop(); + bool iterateEventLoop(); void startProducingTaskLoop() override; void setupChannel(); void removeRecord(UInt64 received_delivery_tag, bool multiple, bool republish); diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index fa3f7f6102d..030d9507d4f 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -1019,7 +1019,6 @@ def test_rabbitmq_many_inserts(rabbitmq_cluster): ), "ClickHouse lost some messages: {}".format(result) -@pytest.mark.skip(reason="Flaky") def 
test_rabbitmq_overloaded_insert(rabbitmq_cluster): instance.query( """ From 5bb2144e12c05751e78677049ec8d3b2dcaac9ab Mon Sep 17 00:00:00 2001 From: SadiHassan Date: Tue, 14 Feb 2023 05:25:50 -0800 Subject: [PATCH 38/69] updated files --- contrib/unixodbc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/unixodbc b/contrib/unixodbc index 6c8071b1bef..18e0ebe2a1f 160000 --- a/contrib/unixodbc +++ b/contrib/unixodbc @@ -1 +1 @@ -Subproject commit 6c8071b1bef4e4991e7b3023a1c1c712168a818e +Subproject commit 18e0ebe2a1fb53b9072ff60a558f6bd6ad2a0551 From 043b394ff09457ee90d228380fdbdf2f1ddfba2f Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 14 Feb 2023 13:36:34 +0000 Subject: [PATCH 39/69] Remove unneeded move --- src/IO/S3/Client.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 6b87ca691be..5c0539ee486 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -597,8 +597,8 @@ std::unique_ptr ClientFactory::create( // NOLINT client_configuration.retryStrategy = std::make_shared(std::move(client_configuration.retryStrategy)); return Client::create( client_configuration.s3_max_redirects, - std::move(credentials_provider), - std::move(client_configuration), // Client configuration. + credentials_provider, + client_configuration, // Client configuration. Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, is_virtual_hosted_style || client_configuration.endpointOverride.empty() /// Use virtual addressing if endpoint is not specified. ); From 5bda358fb72ff6e0bcea14144fd91dccee01e844 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 Feb 2023 15:14:11 +0100 Subject: [PATCH 40/69] Follow-up to #46168 --- .../QueryPlan/ReadFromMergeTree.cpp | 61 ++++- src/Storages/MergeTree/IMergeTreeReadPool.h | 29 +++ .../MergeTree/MergeTreePrefetchedReadPool.cpp | 15 +- .../MergeTree/MergeTreePrefetchedReadPool.h | 8 + src/Storages/MergeTree/MergeTreeReadPool.cpp | 71 +++--- src/Storages/MergeTree/MergeTreeReadPool.h | 223 ++++++++---------- .../MergeTree/MergeTreeReaderCompact.cpp | 32 ++- .../MergeTree/MergeTreeReaderCompact.h | 8 + .../MergeTreeThreadSelectProcessor.cpp | 5 +- 9 files changed, 271 insertions(+), 181 deletions(-) create mode 100644 src/Storages/MergeTree/IMergeTreeReadPool.h diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index ff0c5002e09..a54134a1152 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -283,7 +283,6 @@ Pipe ReadFromMergeTree::readFromPool( total_rows = query_info.limit; const auto & settings = context->getSettingsRef(); - MergeTreeReadPool::BackoffSettings backoff_settings(settings); /// round min_marks_to_read up to nearest multiple of block_size expressed in marks /// If granularity is adaptive it doesn't make sense @@ -295,18 +294,54 @@ Pipe ReadFromMergeTree::readFromPool( / max_block_size * max_block_size / fixed_index_granularity; } - auto pool = std::make_shared( - max_streams, - sum_marks, - min_marks_for_concurrent_read, - std::move(parts_with_range), - storage_snapshot, - prewhere_info, - required_columns, - virt_column_names, - backoff_settings, - settings.preferred_block_size_bytes, - false); + bool all_parts_are_remote = true; + bool all_parts_are_local = true; + for (const auto & part : parts_with_range) + { + const bool is_remote = part.data_part->isStoredOnRemoteDisk(); + all_parts_are_local &= !is_remote; + 
+        all_parts_are_remote &= is_remote;
+    }
+
+    MergeTreeReadPoolPtr pool;
+
+    if ((all_parts_are_remote
+         && settings.allow_prefetched_read_pool_for_remote_filesystem
+         && MergeTreePrefetchedReadPool::checkReadMethodAllowed(reader_settings.read_settings.remote_fs_method))
+        || (all_parts_are_local
+            && settings.allow_prefetched_read_pool_for_local_filesystem
+            && MergeTreePrefetchedReadPool::checkReadMethodAllowed(reader_settings.read_settings.local_fs_method)))
+    {
+        pool = std::make_shared<MergeTreePrefetchedReadPool>(
+            max_streams,
+            sum_marks,
+            min_marks_for_concurrent_read,
+            std::move(parts_with_range),
+            storage_snapshot,
+            prewhere_info,
+            required_columns,
+            virt_column_names,
+            settings.preferred_block_size_bytes,
+            reader_settings,
+            context,
+            use_uncompressed_cache,
+            all_parts_are_remote,
+            *data.getSettings());
+    }
+    else
+    {
+        pool = std::make_shared<MergeTreeReadPool>(
+            max_streams,
+            sum_marks,
+            min_marks_for_concurrent_read,
+            std::move(parts_with_range),
+            storage_snapshot,
+            prewhere_info,
+            required_columns,
+            virt_column_names,
+            context,
+            false);
+    }
 
     auto * logger = &Poco::Logger::get(data.getLogName() + " (SelectExecutor)");
     LOG_DEBUG(logger, "Reading approx. {} rows with {} streams", total_rows, max_streams);
 
diff --git a/src/Storages/MergeTree/IMergeTreeReadPool.h b/src/Storages/MergeTree/IMergeTreeReadPool.h
new file mode 100644
index 00000000000..efdfca51c0a
--- /dev/null
+++ b/src/Storages/MergeTree/IMergeTreeReadPool.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+struct MergeTreeReadTask;
+using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
+
+
+class IMergeTreeReadPool : private boost::noncopyable
+{
+public:
+    virtual ~IMergeTreeReadPool() = default;
+
+    virtual Block getHeader() const = 0;
+
+    virtual MergeTreeReadTaskPtr getTask(size_t thread) = 0;
+
+    virtual void profileFeedback(ReadBufferFromFileBase::ProfileInfo info) = 0;
+};
+
+using MergeTreeReadPoolPtr = std::shared_ptr<IMergeTreeReadPool>;
+
+}

diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp
index d7cbf18c115..7ef1e436ec8 100644
--- a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp
+++ b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.cpp
@@ -39,16 +39,7 @@ MergeTreePrefetchedReadPool::MergeTreePrefetchedReadPool(
     bool use_uncompressed_cache_,
     bool is_remote_read_,
     const MergeTreeSettings & storage_settings_)
-    : IMergeTreeReadPool(
-        storage_snapshot_,
-        column_names_,
-        virtual_column_names_,
-        min_marks_for_concurrent_read_,
-        prewhere_info_,
-        parts_,
-        (preferred_block_size_bytes_ > 0),
-        /*do_not_steal_tasks_*/false)
-    , WithContext(context_)
+    : WithContext(context_)
     , log(&Poco::Logger::get("MergeTreePrefetchedReadPool(" + (parts_.empty() ?
"" : parts_.front().data_part->storage.getStorageID().getNameForLogs()) + ")")) , header(storage_snapshot_->getSampleBlockForColumns(column_names_)) , mark_cache(context_->getGlobalContext()->getMarkCache().get()) @@ -57,6 +48,10 @@ MergeTreePrefetchedReadPool::MergeTreePrefetchedReadPool( , profile_callback([this](ReadBufferFromFileBase::ProfileInfo info_) { profileFeedback(info_); }) , index_granularity_bytes(storage_settings_.index_granularity_bytes) , fixed_index_granularity(storage_settings_.index_granularity) + , storage_snapshot(storage_snapshot_) + , column_names(column_names_) + , virtual_column_names(virtual_column_names_) + , prewhere_info(prewhere_info_) , is_remote_read(is_remote_read_) , prefetch_threadpool(getContext()->getPrefetchThreadpool()) { diff --git a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.h b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.h index ca12739ef6b..8f46593a3c6 100644 --- a/src/Storages/MergeTree/MergeTreePrefetchedReadPool.h +++ b/src/Storages/MergeTree/MergeTreePrefetchedReadPool.h @@ -84,12 +84,20 @@ private: ReadBufferFromFileBase::ProfileCallback profile_callback; size_t index_granularity_bytes; size_t fixed_index_granularity; + + StorageSnapshotPtr storage_snapshot; + const Names column_names; + const Names virtual_column_names; + PrewhereInfoPtr prewhere_info; + RangesInDataParts parts_ranges; + [[ maybe_unused ]] const bool is_remote_read; ThreadPool & prefetch_threadpool; PartsInfos parts_infos; ThreadsTasks threads_tasks; + std::mutex mutex; struct TaskHolder { diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index ebb87be0c41..dd551879fa3 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -20,7 +20,47 @@ namespace ErrorCodes namespace DB { -std::vector IMergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & parts) +MergeTreeReadPool::MergeTreeReadPool( + size_t threads_, + size_t sum_marks_, + size_t min_marks_for_concurrent_read_, + RangesInDataParts && parts_, + const StorageSnapshotPtr & storage_snapshot_, + const PrewhereInfoPtr & prewhere_info_, + const Names & column_names_, + const Names & virtual_column_names_, + ContextPtr context_, + bool do_not_steal_tasks_) + : storage_snapshot(storage_snapshot_) + , column_names(column_names_) + , virtual_column_names(virtual_column_names_) + , min_marks_for_concurrent_read(min_marks_for_concurrent_read_) + , prewhere_info(prewhere_info_) + , parts_ranges(std::move(parts_)) + , predict_block_size_bytes(context_->getSettingsRef().preferred_block_size_bytes > 0) + , do_not_steal_tasks(do_not_steal_tasks_) + , backoff_settings{context_->getSettingsRef()} + , backoff_state{threads_} +{ + /// parts don't contain duplicate MergeTreeDataPart's. 
+ const auto per_part_sum_marks = fillPerPartInfo( + parts_ranges, storage_snapshot, is_part_on_remote_disk, + do_not_steal_tasks, predict_block_size_bytes, + column_names, virtual_column_names, prewhere_info, per_part_params); + + fillPerThreadInfo(threads_, sum_marks_, per_part_sum_marks, parts_ranges); +} + +std::vector MergeTreeReadPool::fillPerPartInfo( + const RangesInDataParts & parts, + const StorageSnapshotPtr & storage_snapshot, + std::vector & is_part_on_remote_disk, + bool & do_not_steal_tasks, + bool & predict_block_size_bytes, + const Names & column_names, + const Names & virtual_column_names, + const PrewhereInfoPtr & prewhere_info, + std::vector & per_part_params) { std::vector per_part_sum_marks; Block sample_block = storage_snapshot->metadata->getSampleBlock(); @@ -65,35 +105,6 @@ std::vector IMergeTreeReadPool::fillPerPartInfo(const RangesInDataParts return per_part_sum_marks; } -MergeTreeReadPool::MergeTreeReadPool( - size_t threads_, - size_t sum_marks_, - size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, - const StorageSnapshotPtr & storage_snapshot_, - const PrewhereInfoPtr & prewhere_info_, - const Names & column_names_, - const Names & virtual_column_names_, - const BackoffSettings & backoff_settings_, - size_t preferred_block_size_bytes_, - bool do_not_steal_tasks_) - : IMergeTreeReadPool( - storage_snapshot_, - column_names_, - virtual_column_names_, - min_marks_for_concurrent_read_, - prewhere_info_, - std::move(parts_), - (preferred_block_size_bytes_ > 0), - do_not_steal_tasks_) - , backoff_settings{backoff_settings_} - , backoff_state{threads_} -{ - /// parts don't contain duplicate MergeTreeDataPart's. - const auto per_part_sum_marks = fillPerPartInfo(parts_ranges); - fillPerThreadInfo(threads_, sum_marks_, per_part_sum_marks, parts_ranges); -} - MergeTreeReadTaskPtr MergeTreeReadPool::getTask(size_t thread) { diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index ad9d8b2e225..83b2587568a 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -14,77 +15,42 @@ namespace DB { - -using MergeTreeReadTaskPtr = std::unique_ptr; - - -class IMergeTreeReadPool -{ -public: - IMergeTreeReadPool( - StorageSnapshotPtr storage_snapshot_, - Names column_names_, - Names virtual_column_names_, - size_t min_marks_for_concurrent_read_, - PrewhereInfoPtr prewhere_info_, - RangesInDataParts parts_ranges_, - bool predict_block_size_bytes_, - bool do_not_steal_tasks_) - : storage_snapshot(storage_snapshot_) - , column_names(column_names_) - , virtual_column_names(virtual_column_names_) - , min_marks_for_concurrent_read(min_marks_for_concurrent_read_) - , prewhere_info(prewhere_info_) - , parts_ranges(parts_ranges_) - , predict_block_size_bytes(predict_block_size_bytes_) - , do_not_steal_tasks(do_not_steal_tasks_) - {} - - virtual MergeTreeReadTaskPtr getTask(size_t thread) = 0; - virtual Block getHeader() const = 0; - virtual void profileFeedback(ReadBufferFromFileBase::ProfileInfo info) = 0; - virtual ~IMergeTreeReadPool() = default; - -protected: - - std::vector fillPerPartInfo(const RangesInDataParts & parts); - - /// Initialized in constructor - StorageSnapshotPtr storage_snapshot; - const Names column_names; - const Names virtual_column_names; - size_t min_marks_for_concurrent_read{0}; - PrewhereInfoPtr prewhere_info; - RangesInDataParts parts_ranges; - bool predict_block_size_bytes; 
- bool do_not_steal_tasks; - - struct PerPartParams - { - MergeTreeReadTaskColumns task_columns; - NameSet column_name_set; - MergeTreeBlockSizePredictorPtr size_predictor; - RangesInDataPart data_part; - }; - - std::vector per_part_params; - std::vector is_part_on_remote_disk; - - mutable std::mutex mutex; -}; - -using IMergeTreeReadPoolPtr = std::shared_ptr; - -/** Provides read tasks for MergeTreeThreadSelectProcessor`s in fine-grained batches, allowing for more - * uniform distribution of work amongst multiple threads. All parts and their ranges are divided into `threads` - * workloads with at most `sum_marks / threads` marks. Then, threads are performing reads from these workloads - * in "sequential" manner, requesting work in small batches. As soon as some thread has exhausted - * it's workload, it either is signaled that no more work is available (`do_not_steal_tasks == false`) or - * continues taking small batches from other threads' workloads (`do_not_steal_tasks == true`). +/** Provides read tasks for MergeTreeThreadSelectProcessor`s in fine-grained batches, allowing for more + * uniform distribution of work amongst multiple threads. All parts and their ranges are divided into `threads` + * workloads with at most `sum_marks / threads` marks. Then, threads are performing reads from these workloads + * in "sequential" manner, requesting work in small batches. As soon as some thread has exhausted + * it's workload, it either is signaled that no more work is available (`do_not_steal_tasks == false`) or + * continues taking small batches from other threads' workloads (`do_not_steal_tasks == true`). */ -class MergeTreeReadPool final: public IMergeTreeReadPool, private boost::noncopyable +class MergeTreeReadPool : public IMergeTreeReadPool { public: + struct BackoffSettings; + + MergeTreeReadPool( + size_t threads_, + size_t sum_marks_, + size_t min_marks_for_concurrent_read_, + RangesInDataParts && parts_, + const StorageSnapshotPtr & storage_snapshot_, + const PrewhereInfoPtr & prewhere_info_, + const Names & column_names_, + const Names & virtual_column_names_, + ContextPtr context_, + bool do_not_steal_tasks_ = false); + + ~MergeTreeReadPool() override = default; + + MergeTreeReadTaskPtr getTask(size_t thread) override; + + /** Each worker could call this method and pass information about read performance. + * If read performance is too low, pool could decide to lower number of threads: do not assign more tasks to several threads. + * This allows to overcome excessive load to disk subsystem, when reads are not from page cache. + */ + void profileFeedback(ReadBufferFromFileBase::ProfileInfo info) override; + + Block getHeader() const override; + /** Pull could dynamically lower (backoff) number of threads, if read operation are too slow. * Settings for that backoff. 
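 *
 * (Editorial note, inferred from the constructor below and the Settings field names: once
 * at least `min_events` reads slower than `min_read_latency_ms` and below `max_throughput`
 * bytes/sec are observed, spaced by at least `min_interval_between_events_ms`, the pool
 * stops handing new tasks to one more thread, never dropping below `min_concurrency`
 * concurrent threads.)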
      */
@@ -107,46 +73,51 @@ public:
             max_throughput(settings.read_backoff_max_throughput),
             min_interval_between_events_ms(settings.read_backoff_min_interval_between_events_ms.totalMilliseconds()),
             min_events(settings.read_backoff_min_events),
-            min_concurrency(settings.read_backoff_min_concurrency)
-        {
-        }
+            min_concurrency(settings.read_backoff_min_concurrency) {}
 
         BackoffSettings() : min_read_latency_ms(0) {}
     };
 
-    BackoffSettings backoff_settings;
+    struct PerPartParams
+    {
+        MergeTreeReadTaskColumns task_columns;
+        NameSet column_name_set;
+        MergeTreeBlockSizePredictorPtr size_predictor;
+        RangesInDataPart data_part;
+    };
 
-    MergeTreeReadPool(
-        size_t threads_,
-        size_t sum_marks_,
-        size_t min_marks_for_concurrent_read_,
-        RangesInDataParts && parts_,
-        const StorageSnapshotPtr & storage_snapshot_,
-        const PrewhereInfoPtr & prewhere_info_,
-        const Names & column_names_,
-        const Names & virtual_column_names_,
-        const BackoffSettings & backoff_settings_,
-        size_t preferred_block_size_bytes_,
-        bool do_not_steal_tasks_ = false);
-
-    ~MergeTreeReadPool() override = default;
-
-    MergeTreeReadTaskPtr getTask(size_t thread) override;
-
-    /** Each worker could call this method and pass information about read performance.
-     * If read performance is too low, pool could decide to lower number of threads: do not assign more tasks to several threads.
-     * This allows to overcome excessive load to disk subsystem, when reads are not from page cache.
-     */
-    void profileFeedback(ReadBufferFromFileBase::ProfileInfo info) override;
-
-    Block getHeader() const override;
+    static std::vector<size_t> fillPerPartInfo(
+        const RangesInDataParts & parts,
+        const StorageSnapshotPtr & storage_snapshot,
+        std::vector<bool> & is_part_on_remote_disk,
+        bool & do_not_steal_tasks,
+        bool & predict_block_size_bytes,
+        const Names & column_names,
+        const Names & virtual_column_names,
+        const PrewhereInfoPtr & prewhere_info,
+        std::vector<PerPartParams> & per_part_params);
 
 private:
-
     void fillPerThreadInfo(
         size_t threads, size_t sum_marks, std::vector<size_t> per_part_sum_marks,
         const RangesInDataParts & parts);
 
+    /// Initialized in constructor
+    StorageSnapshotPtr storage_snapshot;
+    const Names column_names;
+    const Names virtual_column_names;
+    size_t min_marks_for_concurrent_read{0};
+    PrewhereInfoPtr prewhere_info;
+    RangesInDataParts parts_ranges;
+    bool predict_block_size_bytes;
+    bool do_not_steal_tasks;
+
+    std::vector<PerPartParams> per_part_params;
+    std::vector<bool> is_part_on_remote_disk;
+
+    BackoffSettings backoff_settings;
+
+    mutable std::mutex mutex;
 
     /// State to track numbers of slow reads.
     struct BackoffState
     {
@@ -156,7 +127,6 @@ private:
 
         explicit BackoffState(size_t threads) : current_threads(threads) {}
     };
-
     BackoffState backoff_state;
 
     struct Part
@@ -185,9 +155,7 @@ private:
 };
 
-using MergeTreeReadPoolPtr = std::shared_ptr<MergeTreeReadPool>;
-
-class MergeTreeReadPoolParallelReplicas : public IMergeTreeReadPool, private boost::noncopyable
+class MergeTreeReadPoolParallelReplicas : public IMergeTreeReadPool
 {
 public:
 
@@ -199,21 +167,19 @@ public:
         const PrewhereInfoPtr & prewhere_info_,
         const Names & column_names_,
         const Names & virtual_column_names_,
-        size_t min_marks_for_concurrent_read_
-    )
-    : IMergeTreeReadPool(
-        storage_snapshot_,
-        column_names_,
-        virtual_column_names_,
-        min_marks_for_concurrent_read_,
-        prewhere_info_,
-        parts_,
-        /*predict_block_size*/false,
-        /*do_not_steal_tasks*/false)
-    , extension(extension_)
-    , threads(threads_)
+        size_t min_marks_for_concurrent_read_)
+        : extension(extension_)
+        , threads(threads_)
+        , prewhere_info(prewhere_info_)
+        , storage_snapshot(storage_snapshot_)
+        , min_marks_for_concurrent_read(min_marks_for_concurrent_read_)
+        , column_names(column_names_)
+        , virtual_column_names(virtual_column_names_)
+        , parts_ranges(std::move(parts_))
     {
-        fillPerPartInfo(parts_ranges);
+        MergeTreeReadPool::fillPerPartInfo(
+            parts_ranges, storage_snapshot, is_part_on_remote_disk, do_not_steal_tasks,
+            predict_block_size_bytes, column_names, virtual_column_names, prewhere_info, per_part_params);
 
         extension.all_callback({
             .description = parts_ranges.getDescriptions(),
@@ -223,8 +189,10 @@ public:
 
     ~MergeTreeReadPoolParallelReplicas() override;
 
-    MergeTreeReadTaskPtr getTask(size_t thread) override;
     Block getHeader() const override;
+
+    MergeTreeReadTaskPtr getTask(size_t thread) override;
+
     void profileFeedback(ReadBufferFromFileBase::ProfileInfo) override {}
 
 private:
@@ -234,6 +202,20 @@ private:
     size_t threads;
     bool no_more_tasks_available{false};
     Poco::Logger * log = &Poco::Logger::get("MergeTreeReadPoolParallelReplicas");
+
+    std::mutex mutex;
+
+    PrewhereInfoPtr prewhere_info;
+    StorageSnapshotPtr storage_snapshot;
+    size_t min_marks_for_concurrent_read;
+    const Names column_names;
+    const Names virtual_column_names;
+    RangesInDataParts parts_ranges;
+
+    bool do_not_steal_tasks = false;
+    bool predict_block_size_bytes = false;
+    std::vector<bool> is_part_on_remote_disk;
+    std::vector<PerPartParams> per_part_params;
 };
 
 using MergeTreeReadPoolParallelReplicasPtr = std::shared_ptr<MergeTreeReadPoolParallelReplicas>;
 
@@ -247,10 +229,10 @@ public:
         ParallelReadingExtension extension_,
         CoordinationMode mode_,
         size_t min_marks_for_concurrent_read_)
-    : parts_ranges(parts_)
-    , extension(extension_)
-    , mode(mode_)
-    , min_marks_for_concurrent_read(min_marks_for_concurrent_read_)
+        : parts_ranges(parts_)
+        , extension(extension_)
+        , mode(mode_)
+        , min_marks_for_concurrent_read(min_marks_for_concurrent_read_)
     {
         for (const auto & part : parts_ranges)
             request.push_back({part.data_part->info, MarkRanges{}});
@@ -266,6 +248,7 @@ public:
 
     MarkRanges getNewTask(RangesInDataPartDescription description);
 
+    RangesInDataParts parts_ranges;
     ParallelReadingExtension extension;
     CoordinationMode mode;
 
diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
index d1f69ce6cea..d1796dac6cc 100644
--- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
@@ -45,6 +45,12 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
         settings.read_settings,
         load_marks_threadpool_,
         data_part_info_for_read_->getColumns().size())
+    , 
profile_callback(profile_callback_) + , clock_type(clock_type_) +{ +} + +void MergeTreeReaderCompact::initialize() { try { @@ -75,8 +81,8 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( uncompressed_cache, /* allow_different_codecs = */ true); - if (profile_callback_) - buffer->setProfileCallback(profile_callback_, clock_type_); + if (profile_callback) + buffer->setProfileCallback(profile_callback, clock_type); if (!settings.checksum_on_read) buffer->disableChecksumming(); @@ -95,8 +101,8 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( std::nullopt, std::nullopt), /* allow_different_codecs = */ true); - if (profile_callback_) - buffer->setProfileCallback(profile_callback_, clock_type_); + if (profile_callback) + buffer->setProfileCallback(profile_callback, clock_type); if (!settings.checksum_on_read) buffer->disableChecksumming(); @@ -157,6 +163,12 @@ void MergeTreeReaderCompact::fillColumnPositions() size_t MergeTreeReaderCompact::readRows( size_t from_mark, size_t current_task_last_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) { + if (!initialized) + { + initialize(); + initialized = true; + } + if (continue_reading) from_mark = next_mark; @@ -302,6 +314,18 @@ void MergeTreeReaderCompact::readData( last_read_granule.emplace(from_mark, column_position); } +void MergeTreeReaderCompact::prefetchBeginOfRange(int64_t priority) +{ + if (!initialized) + { + initialize(); + initialized = true; + } + + adjustUpperBound(all_mark_ranges.back().end); + seekToMark(all_mark_ranges.front().begin, 0); + data_buffer->prefetch(priority); +} void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index) { diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index 8dcd00ad733..a994e72d3ff 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -38,9 +38,12 @@ public: bool canReadIncompleteGranules() const override { return false; } + void prefetchBeginOfRange(int64_t priority) override; + private: bool isContinuousReading(size_t mark, size_t column_position); void fillColumnPositions(); + void initialize(); ReadBuffer * data_buffer; CompressedReadBufferBase * compressed_data_buffer; @@ -78,6 +81,11 @@ private: /// For asynchronous reading from remote fs. 
void adjustUpperBound(size_t last_mark); + + ReadBufferFromFileBase::ProfileCallback profile_callback; + clockid_t clock_type; + + bool initialized = false; }; } diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp index 0c7f90b2349..cae9080d644 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectProcessor.cpp @@ -21,8 +21,7 @@ MergeTreeThreadSelectAlgorithm::MergeTreeThreadSelectAlgorithm( ExpressionActionsSettings actions_settings, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_) - : - IMergeTreeSelectAlgorithm{ + : IMergeTreeSelectAlgorithm{ pool_->getHeader(), storage_, storage_snapshot_, prewhere_info_, std::move(actions_settings), max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, @@ -59,8 +58,6 @@ void MergeTreeThreadSelectAlgorithm::finalizeNewTask() const bool init_new_readers = !reader || task->reader.valid() || part_name != last_read_part_name; if (init_new_readers) { - initializeMergeTreeReadersForPart( - task->data_part, task->task_columns, metadata_snapshot, task->mark_ranges, value_size_map, profile_callback); initializeMergeTreeReadersForCurrentTask(metadata_snapshot, value_size_map, profile_callback); } From 94d52d2b28fe8ee49d81e4bb0538f59b9f6486d2 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 14 Feb 2023 16:45:46 +0000 Subject: [PATCH 41/69] Fix include --- .gitmodules | 2 +- contrib/ulid-c | 2 +- contrib/ulid-c-cmake/CMakeLists.txt | 3 +-- src/Functions/generateULID.cpp | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitmodules b/.gitmodules index a3e23a1529e..ca55281e643 100644 --- a/.gitmodules +++ b/.gitmodules @@ -298,7 +298,7 @@ url = https://github.com/ridiculousfish/libdivide [submodule "contrib/ulid-c"] path = contrib/ulid-c - url = https://github.com/evillique/ulid-c.git + url = https://github.com/ClickHouse/ulid-c.git [submodule "contrib/aws-crt-cpp"] path = contrib/aws-crt-cpp url = https://github.com/ClickHouse/aws-crt-cpp diff --git a/contrib/ulid-c b/contrib/ulid-c index 8b8bc280bc3..c433b6783cf 160000 --- a/contrib/ulid-c +++ b/contrib/ulid-c @@ -1 +1 @@ -Subproject commit 8b8bc280bc305f0d3dbdca49131677f2a208c48c +Subproject commit c433b6783cf918b8f996dacd014cb2b68c7de419 diff --git a/contrib/ulid-c-cmake/CMakeLists.txt b/contrib/ulid-c-cmake/CMakeLists.txt index 9a68e09faa2..3afbd965bac 100644 --- a/contrib/ulid-c-cmake/CMakeLists.txt +++ b/contrib/ulid-c-cmake/CMakeLists.txt @@ -8,8 +8,7 @@ endif() set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/ulid-c") set (SRCS - "${LIBRARY_DIR}/include/ulid.h" - "${LIBRARY_DIR}/include/ulid.c" + "${LIBRARY_DIR}/src/ulid.c" ) add_library(_ulid ${SRCS}) diff --git a/src/Functions/generateULID.cpp b/src/Functions/generateULID.cpp index 46b8d190496..648754ddaa2 100644 --- a/src/Functions/generateULID.cpp +++ b/src/Functions/generateULID.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include namespace DB From 04b3c9da2742407c683f3a20a6379515eacbef50 Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 14 Feb 2023 16:58:29 +0000 Subject: [PATCH 42/69] Move docs to separate file --- .../sql-reference/functions/ulid-functions.md | 53 +++++++++++++++++++ .../sql-reference/functions/uuid-functions.md | 45 ---------------- 2 files changed, 53 insertions(+), 45 deletions(-) create mode 100644 
docs/en/sql-reference/functions/ulid-functions.md

diff --git a/docs/en/sql-reference/functions/ulid-functions.md b/docs/en/sql-reference/functions/ulid-functions.md
new file mode 100644
index 00000000000..94167945f76
--- /dev/null
+++ b/docs/en/sql-reference/functions/ulid-functions.md
@@ -0,0 +1,53 @@
+---
+slug: /en/sql-reference/functions/ulid-functions
+sidebar_position: 54
+sidebar_label: ULID
+---
+
+# Functions for Working with ULID
+
+## generateULID
+
+Generates the [ULID](https://github.com/ulid/spec).
+
+**Syntax**
+
+``` sql
+generateULID([x])
+```
+
+**Arguments**
+
+- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself is used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter.
+
+**Returned value**
+
+The [FixedString](../data-types/fixedstring.md) type value.
+
+**Usage example**
+
+``` sql
+SELECT generateULID()
+```
+
+``` text
+┌─generateULID()─────────────┐
+│ 01GNB2S2FGN2P93QPXDNB4EN2R │
+└────────────────────────────┘
+```
+
+**Usage example if it is needed to generate multiple values in one row**
+
+```sql
+SELECT generateULID(1), generateULID(2)
+```
+
+``` text
+┌─generateULID(1)────────────┬─generateULID(2)────────────┐
+│ 01GNB2SGG4RHKVNT9ZGA4FFMNP │ 01GNB2SGG4V0HMQVH4VBVPSSRB │
+└────────────────────────────┴────────────────────────────┘
+```
+
+## See Also
+
+- [UUID](../../sql-reference/functions/uuid-functions.md)
diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md
index 1a1e065180b..474e3248d1f 100644
--- a/docs/en/sql-reference/functions/uuid-functions.md
+++ b/docs/en/sql-reference/functions/uuid-functions.md
@@ -359,51 +359,6 @@ serverUUID()
 
 Type: [UUID](../data-types/uuid.md).
 
-
-# Functions for Working with ULID
-
-## generateULID
-
-Generates the [ULID](https://github.com/ulid/spec).
-
-**Syntax**
-
-``` sql
-generateULID([x])
-```
-
-**Arguments**
-
-- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself if used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter.
-
-**Returned value**
-
-The [FixedString](../data-types/fixedstring.md) type value.
-
-**Usage example**
-
-``` sql
-SELECT generateULID()
-```
-
-``` text
-┌─generateULID()─────────────┐
-│ 01GNB2S2FGN2P93QPXDNB4EN2R │
-└────────────────────────────┘
-```
-
-**Usage example if it is needed to generate multiple values in one row**
-
-```sql
-SELECT generateULID(1), generateULID(2)
-```
-
-``` text
-┌─generateULID(1)────────────┬─generateULID(2)────────────┐
-│ 01GNB2SGG4RHKVNT9ZGA4FFMNP │ 01GNB2SGG4V0HMQVH4VBVPSSRB │
-└────────────────────────────┴────────────────────────────┘
-```
-
 ## See Also
 
 - [dictGetUUID](../../sql-reference/functions/ext-dict-functions.md#ext_dict_functions-other)

From e07c723032f75c351fcdf4ab98711be8bd876370 Mon Sep 17 00:00:00 2001
From: Denny Crane
Date: Tue, 14 Feb 2023 13:03:44 -0400
Subject: [PATCH 43/69] Update prewhere.md

---
 .../statements/select/prewhere.md | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/docs/en/sql-reference/statements/select/prewhere.md b/docs/en/sql-reference/statements/select/prewhere.md
index d0248790bfd..5fb83f60dc6 100644
--- a/docs/en/sql-reference/statements/select/prewhere.md
+++ b/docs/en/sql-reference/statements/select/prewhere.md
@@ -26,3 +26,44 @@ The `PREWHERE` section is executed before `FINAL`, so the results of `FROM ... F
 ## Limitations
 
 `PREWHERE` is only supported by tables from the [*MergeTree](../../../engines/table-engines/mergetree-family/index.md) family.
+
+## Example
+
+```sql
+CREATE TABLE mydata
+(
+    `A` Int64,
+    `B` Int8,
+    `C` String
+)
+ENGINE = MergeTree
+ORDER BY A AS
+SELECT
+    number,
+    0,
+    if(number between 1000 and 2000, 'x', toString(number))
+FROM numbers(10000000);
+
+SELECT count()
+FROM mydata
+WHERE (B = 0) AND (C = 'x');
+
+1 row in set. Elapsed: 0.074 sec. Processed 10.00 million rows, 168.89 MB (134.98 million rows/s., 2.28 GB/s.)
+
+-- let's enable tracing to see which predicates are moved to PREWHERE
+set send_logs_level='debug';
+
+MergeTreeWhereOptimizer: condition "B = 0" moved to PREWHERE
+-- ClickHouse automatically moves `B = 0` to PREWHERE, but it makes little sense here because B is always 0.
+
+-- Let's manually move the other predicate `C = 'x'`
+
+SELECT count()
+FROM mydata
+PREWHERE C = 'x'
+WHERE B = 0;
+
+1 row in set. Elapsed: 0.069 sec. Processed 10.00 million rows, 158.89 MB (144.90 million rows/s., 2.30 GB/s.)
+
+-- This query with manual `PREWHERE` processes slightly less data 158.89 MB VS 168.89 MB
+```
From 6d1b3dcd789446906340ef2d47a980f14f88b2a0 Mon Sep 17 00:00:00 2001
From: Denny Crane
Date: Tue, 14 Feb 2023 13:07:01 -0400
Subject: [PATCH 44/69] Update prewhere.md

---
 docs/en/sql-reference/statements/select/prewhere.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/sql-reference/statements/select/prewhere.md b/docs/en/sql-reference/statements/select/prewhere.md
index 5fb83f60dc6..374113fbd67 100644
--- a/docs/en/sql-reference/statements/select/prewhere.md
+++ b/docs/en/sql-reference/statements/select/prewhere.md
@@ -65,5 +65,5 @@ WHERE B = 0;
 
 1 row in set. Elapsed: 0.069 sec. Processed 10.00 million rows, 158.89 MB (144.90 million rows/s., 2.30 GB/s.)
 
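+One hedged way to double-check which predicate actually ran in PREWHERE — a sketch only, assuming the `mydata` table from the example above and the default `query_log` configuration:
+
+```sql
+SYSTEM FLUSH LOGS;
+
+-- Compare bytes read by the automatic and the manual variant:
+SELECT query, formatReadableSize(read_bytes) AS read
+FROM system.query_log
+WHERE type = 'QueryFinish' AND query ILIKE '%FROM mydata%'
+ORDER BY event_time DESC
+LIMIT 2;
+```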
--- This query with manual `PREWHERE` processes slightly less data 158.89 MB VS 168.89 MB
+-- This query with manual `PREWHERE` processes slightly less data: 158.89 MB VS 168.89 MB
 ```

From 86fda9bd22faa508076082ae6a03d409517d5bd6 Mon Sep 17 00:00:00 2001
From: Han Fei
Date: Tue, 14 Feb 2023 18:52:09 +0100
Subject: [PATCH 45/69] some clean up

---
 src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
index 8bdbdb5a2e6..064bb6874d5 100644
--- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
+++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp
@@ -42,10 +42,6 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
     , log{log_}
     , column_sizes{std::move(column_sizes_)}
 {
-    const auto & primary_key = metadata_snapshot->getPrimaryKey();
-    if (!primary_key.column_names.empty())
-        first_primary_key_column = primary_key.column_names[0];
-
     for (const auto & name : queried_columns)
     {
         auto it = column_sizes.find(name);
@@ -193,7 +189,8 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node,
         /// Condition depend on some column. Constant expressions are not moved.
         !cond.identifiers.empty()
         && !cannotBeMoved(node, is_final)
-        /// when use final, do not take into consideration the conditions consisting of column that not belong to the sorting key columns
+        /// When FINAL is used, do not take into consideration conditions on columns that are not part of the sorting key. A FINAL select
+        /// needs all sorting key columns, so filtering on other columns before the final merge would cause correctness issues.
         && (!is_final || isExpressionOverSortingKey(node))
         /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded.
&& isSubsetOfTableColumns(cond.identifiers) From 937fade98250c2a178d0f59e19374217996a4dee Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 14 Feb 2023 18:54:19 +0100 Subject: [PATCH 46/69] clean up --- src/Storages/MergeTree/MergeTreeWhereOptimizer.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 277994cbd28..d477bca0a48 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -103,7 +103,6 @@ private: using StringSet = std::unordered_set; - String first_primary_key_column; const StringSet table_columns; const Names queried_columns; const NameSet sorting_key_names; From a7bbf02baca0d6b98c7b7b15fe23f7d9cd55a1ab Mon Sep 17 00:00:00 2001 From: serxa Date: Tue, 14 Feb 2023 19:37:09 +0000 Subject: [PATCH 47/69] fix possible deadlock --- src/Interpreters/Context.cpp | 4 ++-- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index bae6264e105..726bb99ed09 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -3825,9 +3825,9 @@ void Context::initializeBackgroundExecutorsIfNeeded() "MergeMutate", /*max_threads_count*/background_pool_size, /*max_tasks_count*/background_pool_size * background_merges_mutations_concurrency_ratio, - CurrentMetrics::BackgroundMergesAndMutationsPoolTask + CurrentMetrics::BackgroundMergesAndMutationsPoolTask, + background_merges_mutations_scheduling_policy ); - shared->merge_mutate_executor->updateSchedulingPolicy(background_merges_mutations_scheduling_policy); LOG_INFO(shared->log, "Initialized background executor for merges and mutations with num_threads={}, num_tasks={}, scheduling_policy={}", background_pool_size, background_pool_size * background_merges_mutations_concurrency_ratio, background_merges_mutations_scheduling_policy); diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 7745e5de334..9305f36feb5 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -267,6 +267,18 @@ public: pool.scheduleOrThrowOnError([this] { threadFunction(); }); } + MergeTreeBackgroundExecutor( + String name_, + size_t threads_count_, + size_t max_tasks_count_, + CurrentMetrics::Metric metric_, + std::string_view policy) + requires requires(Queue queue) { queue.updatePolicy(policy); } // Because we use explicit template instantiation + : MergeTreeBackgroundExecutor(name_, threads_count_, max_tasks_count_, metric_) + { + pending.updatePolicy(policy); + } + ~MergeTreeBackgroundExecutor() { wait(); From 6e4b660033566bfb1a4ca82ca220bc717905d2eb Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 14 Feb 2023 22:35:10 +0000 Subject: [PATCH 48/69] Move MongoDB and PostgreSQL sources to Sources folder --- src/Dictionaries/MongoDBDictionarySource.h | 2 +- src/Dictionaries/PostgreSQLDictionarySource.cpp | 2 +- src/Processors/{Transforms => Sources}/MongoDBSource.cpp | 0 src/Processors/{Transforms => Sources}/MongoDBSource.h | 0 src/Processors/{Transforms => Sources}/PostgreSQLSource.cpp | 0 src/Processors/{Transforms => Sources}/PostgreSQLSource.h | 0 src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp | 2 +- src/Storages/StorageMongoDB.cpp | 2 +- src/Storages/StoragePostgreSQL.cpp | 2 +- 9 files 
changed, 5 insertions(+), 5 deletions(-) rename src/Processors/{Transforms => Sources}/MongoDBSource.cpp (100%) rename src/Processors/{Transforms => Sources}/MongoDBSource.h (100%) rename src/Processors/{Transforms => Sources}/PostgreSQLSource.cpp (100%) rename src/Processors/{Transforms => Sources}/PostgreSQLSource.h (100%) diff --git a/src/Dictionaries/MongoDBDictionarySource.h b/src/Dictionaries/MongoDBDictionarySource.h index ac5f19816d2..4c7ae649f09 100644 --- a/src/Dictionaries/MongoDBDictionarySource.h +++ b/src/Dictionaries/MongoDBDictionarySource.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include "DictionaryStructure.h" diff --git a/src/Dictionaries/PostgreSQLDictionarySource.cpp b/src/Dictionaries/PostgreSQLDictionarySource.cpp index 9153b9cb05d..95bfb1a37d8 100644 --- a/src/Dictionaries/PostgreSQLDictionarySource.cpp +++ b/src/Dictionaries/PostgreSQLDictionarySource.cpp @@ -8,7 +8,7 @@ #if USE_LIBPQXX #include #include -#include +#include #include "readInvalidateQuery.h" #include #include diff --git a/src/Processors/Transforms/MongoDBSource.cpp b/src/Processors/Sources/MongoDBSource.cpp similarity index 100% rename from src/Processors/Transforms/MongoDBSource.cpp rename to src/Processors/Sources/MongoDBSource.cpp diff --git a/src/Processors/Transforms/MongoDBSource.h b/src/Processors/Sources/MongoDBSource.h similarity index 100% rename from src/Processors/Transforms/MongoDBSource.h rename to src/Processors/Sources/MongoDBSource.h diff --git a/src/Processors/Transforms/PostgreSQLSource.cpp b/src/Processors/Sources/PostgreSQLSource.cpp similarity index 100% rename from src/Processors/Transforms/PostgreSQLSource.cpp rename to src/Processors/Sources/PostgreSQLSource.cpp diff --git a/src/Processors/Transforms/PostgreSQLSource.h b/src/Processors/Sources/PostgreSQLSource.h similarity index 100% rename from src/Processors/Transforms/PostgreSQLSource.h rename to src/Processors/Sources/PostgreSQLSource.h diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index f450604fded..99f6fded669 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index 25a303620d6..2cb85878000 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include namespace DB diff --git a/src/Storages/StoragePostgreSQL.cpp b/src/Storages/StoragePostgreSQL.cpp index 729ffdf0dde..400430b9ea2 100644 --- a/src/Storages/StoragePostgreSQL.cpp +++ b/src/Storages/StoragePostgreSQL.cpp @@ -1,7 +1,7 @@ #include "StoragePostgreSQL.h" #if USE_LIBPQXX -#include +#include #include #include From 3feb164c75dc3d2168c38c07505a11c40988497f Mon Sep 17 00:00:00 2001 From: Nikolay Degterinsky Date: Tue, 14 Feb 2023 22:54:30 +0000 Subject: [PATCH 49/69] Add doc --- src/Functions/generateULID.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/Functions/generateULID.cpp b/src/Functions/generateULID.cpp index 648754ddaa2..a8a2a2174cb 100644 --- a/src/Functions/generateULID.cpp +++ b/src/Functions/generateULID.cpp @@ -74,7 +74,19 @@ public: REGISTER_FUNCTION(GenerateULID) { - factory.registerFunction(); + factory.registerFunction( + { + R"( +Generates a Universally Unique Lexicographically 
Sortable Identifier (ULID). +This function takes an optional argument, the value of which is discarded to generate different values in case the function is called multiple times. +The function returns a value of type FixedString(26). +)", + Documentation::Examples{ + {"ulid", "SELECT generateULID()"}, + {"multiple", "SELECT generateULID(1), generateULID(2)"}}, + Documentation::Categories{"ULID"} + }, + FunctionFactory::CaseSensitive); } } From bfbd8804758a394534d5de1de052a6122262e156 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Wed, 15 Feb 2023 08:44:10 +0800 Subject: [PATCH 50/69] fixed binary_tidy build failure --- src/Processors/Transforms/WindowTransform.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index e6b72c77db4..c2293c3097d 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1997,7 +1997,7 @@ struct WindowFunctionNtile final : public WindowFunction checkWindowFrameType(transform); const auto & current_block = transform->blockAt(transform->current_row); const auto & workspace = transform->workspaces[function_index]; - auto & arg_col = *current_block.original_input_columns[workspace.argument_column_indices[0]]; + const auto & arg_col = *current_block.original_input_columns[workspace.argument_column_indices[0]]; if (!isColumnConst(arg_col)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's argument must be a constant"); auto type_id = argument_types[0]->getTypeId(); @@ -2087,7 +2087,7 @@ private: UInt64 current_partition_rows = 0; UInt64 current_partition_inserted_row = 0; - void checkWindowFrameType(const WindowTransform * transform) + static void checkWindowFrameType(const WindowTransform * transform) { if (transform->order_by_indices.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's window frame must have order by clause"); From df6eaaa2785254cb176002e5db5435a45f508e4c Mon Sep 17 00:00:00 2001 From: zhangnew Date: Wed, 15 Feb 2023 17:06:34 +0800 Subject: [PATCH 51/69] Docs: view.md fix typo --- docs/zh/sql-reference/statements/create/view.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index 5b475e177c0..a000c69f1ef 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -61,7 +61,7 @@ ClickHouse 中的物化视图更像是插入触发器。 如果视图查询中 请注意,物化视图受[optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert)设置的影响。 在插入视图之前合并数据。 -视图看起来与普通表相同。 例如,它们列在1SHOW TABLES1查询的结果中。 +视图看起来与普通表相同。 例如,它们列在`SHOW TABLES`查询的结果中。 删除视图,使用[DROP VIEW](../../../sql-reference/statements/drop#drop-view). 
`DROP TABLE`也适用于视图。

From f75d69a954f76a3b3f2f391110d187841dffebc4 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Wed, 15 Feb 2023 11:14:50 +0100
Subject: [PATCH 52/69] Fix

---
 src/Storages/RabbitMQ/RabbitMQHandler.cpp | 4 ++--
 src/Storages/RabbitMQ/RabbitMQHandler.h | 2 +-
 src/Storages/RabbitMQ/RabbitMQProducer.cpp | 2 +-
 src/Storages/RabbitMQ/RabbitMQProducer.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Storages/RabbitMQ/RabbitMQHandler.cpp b/src/Storages/RabbitMQ/RabbitMQHandler.cpp
index 5258b9fa7dc..934753257b4 100644
--- a/src/Storages/RabbitMQ/RabbitMQHandler.cpp
+++ b/src/Storages/RabbitMQ/RabbitMQHandler.cpp
@@ -46,12 +46,12 @@ void RabbitMQHandler::startLoop()
     loop_running.store(false);
 }
 
-bool RabbitMQHandler::iterateLoop()
+int RabbitMQHandler::iterateLoop()
 {
     std::unique_lock lock(startup_mutex, std::defer_lock);
     if (lock.try_lock())
         return uv_run(loop, UV_RUN_NOWAIT);
-    return 0;
+    return 1; /// We cannot know the actual value here.
 }
 
 /// Do not need synchronization as in iterateLoop(), because this method is used only for
diff --git a/src/Storages/RabbitMQ/RabbitMQHandler.h b/src/Storages/RabbitMQ/RabbitMQHandler.h
index 4a7c3fc7f78..948d56416fd 100644
--- a/src/Storages/RabbitMQ/RabbitMQHandler.h
+++ b/src/Storages/RabbitMQ/RabbitMQHandler.h
@@ -34,7 +34,7 @@ public:
 
     /// Loop to wait for small tasks in a non-blocking mode.
     /// Adds synchronization with main background loop.
-    bool iterateLoop();
+    int iterateLoop();
 
     /// Loop to wait for small tasks in a blocking mode.
     /// No synchronization is done with the main loop thread.
diff --git a/src/Storages/RabbitMQ/RabbitMQProducer.cpp b/src/Storages/RabbitMQ/RabbitMQProducer.cpp
index fc49f74b545..5d639b77f53 100644
--- a/src/Storages/RabbitMQ/RabbitMQProducer.cpp
+++ b/src/Storages/RabbitMQ/RabbitMQProducer.cpp
@@ -266,7 +266,7 @@ void RabbitMQProducer::startProducingTaskLoop()
 }
 
-bool RabbitMQProducer::iterateEventLoop()
+int RabbitMQProducer::iterateEventLoop()
 {
     return connection.getHandler().iterateLoop();
 }
diff --git a/src/Storages/RabbitMQ/RabbitMQProducer.h b/src/Storages/RabbitMQ/RabbitMQProducer.h
index e3691c005ee..70afbbb9b90 100644
--- a/src/Storages/RabbitMQ/RabbitMQProducer.h
+++ b/src/Storages/RabbitMQ/RabbitMQProducer.h
@@ -43,7 +43,7 @@ private:
     void stopProducingTask() override;
     void finishImpl() override;
-    bool iterateEventLoop();
+    int iterateEventLoop();
     void startProducingTaskLoop() override;
     void setupChannel();
     void removeRecord(UInt64 received_delivery_tag, bool multiple, bool republish);

From 051f551b6e02d0e60f274660da2252a63c323726 Mon Sep 17 00:00:00 2001
From: Han Fei
Date: Wed, 15 Feb 2023 11:42:15 +0100
Subject: [PATCH 53/69] change tests

---
 ...00940_order_by_read_in_order_query_plan.reference | 11 ++++++++---
 .../00940_order_by_read_in_order_query_plan.sql | 12 ++++++++++++
 ..._distinct_in_order_optimization_explain.reference | 1 -
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.reference b/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.reference
index f08c4cfd3e5..c827d208387 100644
--- a/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.reference
+++ b/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.reference
@@ -293,8 +293,8 @@ select * from (select * from tab where (a + b) * c = 8 union all select * from t
 select * from (explain plan actions = 1 select * from (select * from tab where (a 
+ b) * c = 18) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; Prefix sort description: sin(divide(a, b)) ASC Result sort description: sin(divide(a, b)) ASC - ReadType: InOrder - ReadType: InOrder + ReadType: InOrder + ReadType: InOrder select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b); 2 2 2 2 2 2 2 2 @@ -311,7 +311,7 @@ select * from (select * from tab where (a + b) * c = 8 union all select * from t select * from (explain plan actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%'; Prefix sort description: sin(divide(a, b)) ASC Result sort description: sin(divide(a, b)) ASC - ReadType: InOrder + ReadType: InOrder ReadType: InOrder select * from (select * from tab union all select * from tab5) order by (a + b) * c; 0 0 0 0 @@ -403,3 +403,8 @@ select * from (explain plan actions = 1 select * from (select * from tab union a Sort description: multiply(plus(a, b), c) ASC, sin(divide(a, b)) ASC, d ASC Limit 3 ReadType: Default +drop table if exists tab; +drop table if exists tab2; +drop table if exists tab3; +drop table if exists tab4; +drop table if exists tab5; diff --git a/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql b/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql index e694ccf84ee..8e59e5af254 100644 --- a/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql +++ b/tests/queries/0_stateless/00940_order_by_read_in_order_query_plan.sql @@ -1,5 +1,11 @@ SET optimize_read_in_order = 1, query_plan_read_in_order=1; +drop table if exists tab; +drop table if exists tab2; +drop table if exists tab3; +drop table if exists tab4; +drop table if exists tab5; + create table tab (a UInt32, b UInt32, c UInt32, d UInt32) engine = MergeTree order by ((a + b) * c, sin(a / b)); insert into tab select number, number, number, number from numbers(5); insert into tab select number, number, number, number from numbers(5); @@ -142,3 +148,9 @@ select * from (explain plan actions = 1 select * from (select * from tab union a -- In case of tab4, we do full sorting by ((a + b) * c, sin(a / b), d) with LIMIT. We can replace it to sorting by ((a + b) * c, sin(a / b)) and LIMIT WITH TIES, when sorting alog support it. 
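 -- A hypothetical sketch of that refinement (assuming the sorting step supported LIMIT WITH TIES), for illustration only:
 -- select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b) limit 3 with ties;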
select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3; select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3) where explain ilike '%sort description%' or explain like '%ReadType%' or explain like '%Limit%'; + +drop table if exists tab; +drop table if exists tab2; +drop table if exists tab3; +drop table if exists tab4; +drop table if exists tab5; diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index da24918a208..0a123a2a50f 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -55,7 +55,6 @@ MergeTreeThread Sorting (Stream): a ASC, b ASC Sorting (Stream): a ASC, b ASC Sorting (Stream): a ASC, b ASC -Sorting (Stream): a ASC, b ASC -- check that reading in order optimization for ORDER BY and DISTINCT applied correctly in the same query -- disabled, check that sorting description for ReadFromMergeTree match ORDER BY columns Sorting (Stream): a ASC From d7bfe494b2e8063ab7cff7ad502dfeba638909c1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 15 Feb 2023 12:02:24 +0100 Subject: [PATCH 54/69] Fix test --- .../0_stateless/01605_adaptive_granularity_block_borders.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01605_adaptive_granularity_block_borders.sql b/tests/queries/0_stateless/01605_adaptive_granularity_block_borders.sql index 85ad7636201..ca7d0f3c950 100644 --- a/tests/queries/0_stateless/01605_adaptive_granularity_block_borders.sql +++ b/tests/queries/0_stateless/01605_adaptive_granularity_block_borders.sql @@ -1,6 +1,7 @@ -- Tags: no-random-merge-tree-settings SET use_uncompressed_cache = 0; +SET allow_prefetched_read_pool_for_remote_filesystem=0; DROP TABLE IF EXISTS adaptive_table; From 8cdafcc2d099f3b488b5448f3a7908256b4d8e99 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 15 Feb 2023 11:45:57 +0000 Subject: [PATCH 55/69] Correctly check if there were failures --- programs/copier/ClusterCopier.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/copier/ClusterCopier.cpp b/programs/copier/ClusterCopier.cpp index dfeabc6620c..48a3578dd7b 100644 --- a/programs/copier/ClusterCopier.cpp +++ b/programs/copier/ClusterCopier.cpp @@ -1225,8 +1225,8 @@ TaskStatus ClusterCopier::iterateThroughAllPiecesInPartition(const ConnectionTim std::this_thread::sleep_for(retry_delay_ms); } - was_active_pieces = (res == TaskStatus::Active); - was_failed_pieces = (res == TaskStatus::Error); + was_active_pieces |= (res == TaskStatus::Active); + was_failed_pieces |= (res == TaskStatus::Error); } if (was_failed_pieces) From f6c894eb6e906d03d897d159e65ea59bb64db98b Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 15 Feb 2023 13:03:30 +0000 Subject: [PATCH 56/69] Update version_date.tsv and changelogs after v22.3.18.37-lts --- docs/changelogs/v22.3.18.37-lts.md | 33 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 2 files changed, 34 insertions(+) create mode 100644 docs/changelogs/v22.3.18.37-lts.md diff --git a/docs/changelogs/v22.3.18.37-lts.md b/docs/changelogs/v22.3.18.37-lts.md new file mode 100644 index 00000000000..ff6378f09ad --- /dev/null +++ 
b/docs/changelogs/v22.3.18.37-lts.md @@ -0,0 +1,33 @@ +--- +sidebar_position: 1 +sidebar_label: 2023 +--- + +# 2023 Changelog + +### ClickHouse release v22.3.18.37-lts (fe512717551) FIXME as compared to v22.3.17.13-lts (fcc4de7e805) + +#### Performance Improvement +* Backported in [#46372](https://github.com/ClickHouse/ClickHouse/issues/46372): Fix too big memory usage for vertical merges on non-remote disk. Respect `max_insert_delayed_streams_for_parallel_write` for the remote disk. [#46275](https://github.com/ClickHouse/ClickHouse/pull/46275) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#46357](https://github.com/ClickHouse/ClickHouse/issues/46357): Allow using Vertical merge algorithm with parts in Compact format. This will allow ClickHouse server to use much less memory for background operations. This closes [#46084](https://github.com/ClickHouse/ClickHouse/issues/46084). [#46282](https://github.com/ClickHouse/ClickHouse/pull/46282) ([Anton Popov](https://github.com/CurtizJ)). + +#### Build/Testing/Packaging Improvement +* Backported in [#45856](https://github.com/ClickHouse/ClickHouse/issues/45856): Fix zookeeper downloading, update the version, and optimize the image size. [#44853](https://github.com/ClickHouse/ClickHouse/pull/44853) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Backported in [#45620](https://github.com/ClickHouse/ClickHouse/issues/45620): Another fix for `Cannot read all data` error which could happen while reading `LowCardinality` dictionary from remote fs. Fixes [#44709](https://github.com/ClickHouse/ClickHouse/issues/44709). [#44875](https://github.com/ClickHouse/ClickHouse/pull/44875) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#45549](https://github.com/ClickHouse/ClickHouse/issues/45549): Fix `SELECT ... FROM system.dictionaries` exception when there is a dictionary with a bad structure (e.g. incorrect type in xml config). [#45399](https://github.com/ClickHouse/ClickHouse/pull/45399) ([Aleksei Filatov](https://github.com/aalexfvk)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Automatically merge green backport PRs and green approved PRs [#41110](https://github.com/ClickHouse/ClickHouse/pull/41110) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Fix wrong approved_at, simplify conditions [#45302](https://github.com/ClickHouse/ClickHouse/pull/45302) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Get rid of artifactory in favor of r2 + ch-repos-manager [#45421](https://github.com/ClickHouse/ClickHouse/pull/45421) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Another attempt to fix automerge, or at least to have debug footprint [#45476](https://github.com/ClickHouse/ClickHouse/pull/45476) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Trim refs/tags/ from GITHUB_TAG in release workflow [#45636](https://github.com/ClickHouse/ClickHouse/pull/45636) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add check for running workflows to merge_pr.py [#45803](https://github.com/ClickHouse/ClickHouse/pull/45803) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Get rid of progress timestamps in release publishing [#45818](https://github.com/ClickHouse/ClickHouse/pull/45818) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add helping logging to auto-merge script [#46080](https://github.com/ClickHouse/ClickHouse/pull/46080) ([Mikhail f. 
Shiryaev](https://github.com/Felixoid)).
+* Fix write buffer destruction order for vertical merge. [#46205](https://github.com/ClickHouse/ClickHouse/pull/46205) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+
diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index e09a39ff463..d313c4bfb78 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -61,6 +61,7 @@ v22.4.5.9-stable 2022-05-06
 v22.4.4.7-stable 2022-04-29
 v22.4.3.3-stable 2022-04-26
 v22.4.2.1-stable 2022-04-22
+v22.3.18.37-lts 2023-02-15
 v22.3.17.13-lts 2023-01-12
 v22.3.16.1190-lts 2023-01-09
 v22.3.15.33-lts 2022-12-02

From da4389a763a7f154d1b280e26d3ae6a9d5053154 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Wed, 15 Feb 2023 15:29:11 +0100
Subject: [PATCH 57/69] Add docs for KeeperMap

---
 docs/en/engines/table-engines/index.md | 1 +
 .../integrations/embedded-rocksdb.md | 36 ++++++
 .../table-engines/special/keepermap.md | 111 ++++++++++++++++++
 3 files changed, 148 insertions(+)
 create mode 100644 docs/en/engines/table-engines/special/keepermap.md

diff --git a/docs/en/engines/table-engines/index.md b/docs/en/engines/table-engines/index.md
index ebf6826221d..31563e2e727 100644
--- a/docs/en/engines/table-engines/index.md
+++ b/docs/en/engines/table-engines/index.md
@@ -76,6 +76,7 @@ Engines in the family:
 - [View](../../engines/table-engines/special/view.md#table_engines-view)
 - [Memory](../../engines/table-engines/special/memory.md#memory)
 - [Buffer](../../engines/table-engines/special/buffer.md#buffer)
+- [KeeperMap](../../engines/table-engines/special/keepermap.md)
 
 ## Virtual Columns {#table_engines-virtual_columns}
 
diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
index f846f785c5a..35c59b8181a 100644
--- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
+++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
@@ -84,3 +84,39 @@ You can also change any [rocksdb options](https://github.com/facebook/rocksdb/wi
 ```
 
+
+## Supported operations {#table_engine-EmbeddedRocksDB-supported-operations}
+
+### Inserts
+
+When new rows are inserted into `EmbeddedRocksDB`, if the key already exists, the value will be updated, otherwise new key is created.
+
+Example:
+
+```sql
+INSERT INTO test VALUES ('some key', 1, 'value', 3.2);
+```
+
+### Deletes
+
+Rows can be deleted using `DELETE` query or `TRUNCATE`.
+
+```sql
+DELETE FROM test WHERE key LIKE 'some%' AND v1 > 1;
+```
+
+```sql
+ALTER TABLE test DELETE WHERE key LIKE 'some%' AND v1 > 1;
+```
+
+```sql
+TRUNCATE TABLE test;
+```
+
+### Updates
+
+Values can be updated using `ALTER TABLE` query. Primary key cannot be updated.
+
+```sql
+ALTER TABLE test UPDATE v1 = v1 * 10 + 2 WHERE key LIKE 'some%' AND v3 > 3.1;
+```
diff --git a/docs/en/engines/table-engines/special/keepermap.md b/docs/en/engines/table-engines/special/keepermap.md
new file mode 100644
index 00000000000..680413039e7
--- /dev/null
+++ b/docs/en/engines/table-engines/special/keepermap.md
@@ -0,0 +1,111 @@
+---
+slug: /en/engines/table-engines/special/keeper-map
+sidebar_position: 150
+sidebar_label: KeeperMap
+---
+
+# KeeperMap {#keepermap}
+
+This engine allows you to use a Keeper/ZooKeeper cluster as a consistent key-value store with linearizable writes and sequentially consistent reads.
+
+To enable the KeeperMap storage engine, you need to define a ZooKeeper path where the tables will be stored using the `<keeper_map_path_prefix>` config.
+
+For example:
+
+```xml
+<clickhouse>
+    <keeper_map_path_prefix>/keeper_map_tables</keeper_map_path_prefix>
+</clickhouse>
+```
+
+where the path can be any other valid ZooKeeper path.
+
+## Creating a Table {#table_engine-KeeperMap-creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = KeeperMap(root_path, [keys_limit]) PRIMARY KEY(primary_key_name)
+```
+
+Engine parameters:
+
+- `root_path` - the ZooKeeper path where the `table_name` will be stored.
+This path should not contain the prefix defined by the `<keeper_map_path_prefix>` config, because that prefix is appended to the `root_path` automatically.
+Additionally, the format `auxiliary_zookeeper_cluster_name:/some/path` is also supported, where `auxiliary_zookeeper_cluster` is a ZooKeeper cluster defined inside the `<auxiliary_zookeepers>` config.
+By default, the ZooKeeper cluster defined inside the `<zookeeper>` config is used.
+- `keys_limit` - the number of keys allowed inside the table.
+This is a soft limit, and in some edge cases more keys may end up in the table.
+- `primary_key_name` – any column name in the column list.
+- `primary key` must be specified; only one column in the primary key is supported. The primary key will be serialized in binary as a `node name` inside ZooKeeper.
+- columns other than the primary key will be serialized to binary in the corresponding order and stored as the value of the resulting node defined by the serialized key.
+- queries with `equals` or `in` filtering on the key will be optimized to a multi-key lookup from `Keeper`; otherwise all values will be fetched.
+
+Example:
+
+``` sql
+CREATE TABLE keeper_map_table
+(
+    `key` String,
+    `v1` UInt32,
+    `v2` String,
+    `v3` Float32
+)
+ENGINE = KeeperMap('/keeper_map_table', 4)
+PRIMARY KEY key
+```
+
+with
+
+```xml
+<clickhouse>
+    <keeper_map_path_prefix>/keeper_map_tables</keeper_map_path_prefix>
+</clickhouse>
+```
+
+Each value, which is the binary serialization of `(v1, v2, v3)`, will be stored inside `/keeper_map_tables/keeper_map_table/data/serialized_key` in `Keeper`.
+Additionally, the number of keys has a soft limit of 4.
+
+If multiple tables are created on the same ZooKeeper path, the values are persisted until at least one table using the path exists.
+As a result, it is possible to use the `ON CLUSTER` clause when creating the table and to share the data across multiple ClickHouse instances.
+Of course, it is also possible to manually run `CREATE TABLE` with the same path on unrelated ClickHouse instances to get the same data-sharing effect.
+
+## Supported operations {#table_engine-KeeperMap-supported-operations}
+
+### Inserts
+
+When new rows are inserted into `KeeperMap`, if the key already exists, the value will be updated, otherwise a new key is created.
+
+Example:
+
+```sql
+INSERT INTO keeper_map_table VALUES ('some key', 1, 'value', 3.2);
+```
+
+### Deletes
+
+Rows can be deleted using the `DELETE` query or `TRUNCATE`.
+
+```sql
+DELETE FROM keeper_map_table WHERE key LIKE 'some%' AND v1 > 1;
+```
+
+```sql
+ALTER TABLE keeper_map_table DELETE WHERE key LIKE 'some%' AND v1 > 1;
+```
+
+```sql
+TRUNCATE TABLE keeper_map_table;
+```
+
+### Updates
+
+Values can be updated using the `ALTER TABLE` query. The primary key cannot be updated.
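+A minimal sketch of the key-filtering optimization described above, assuming the `keeper_map_table` defined earlier (the `ALTER TABLE` example continues below):
+
+```sql
+-- Equality/IN filters on the primary key are resolved as a direct
+-- multi-key lookup in Keeper instead of fetching all values:
+SELECT key, v1, v2, v3
+FROM keeper_map_table
+WHERE key IN ('some key', 'another key');
+```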
+ +```sql +ALTER TABLE keeper_map_table UPDATE v1 = v1 * 10 + 2 WHERE key LIKE 'some%' AND v3 > 3.1; +``` From 4f380370a942b52819eb57c9fef82854940b4fd8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 15 Feb 2023 15:30:43 +0100 Subject: [PATCH 58/69] Fix s3Cluster schema inference in parallel distributed insert select (#46381) * Fix s3Cluster schema inference in parallel distributed insert select * Try fix flaky test * Try SYSTEM SYNC REPLICA to avoid test flakiness --- src/Storages/StorageDeltaLake.cpp | 4 +-- src/Storages/StorageHudi.cpp | 4 +-- src/Storages/StorageS3.cpp | 7 ++-- src/Storages/StorageS3.h | 2 -- src/Storages/StorageS3Cluster.cpp | 4 +-- src/TableFunctions/TableFunctionS3.cpp | 2 +- src/TableFunctions/TableFunctionS3Cluster.cpp | 2 +- tests/integration/test_s3_cluster/test.py | 36 +++++++++++++++++++ 8 files changed, 46 insertions(+), 15 deletions(-) diff --git a/src/Storages/StorageDeltaLake.cpp b/src/Storages/StorageDeltaLake.cpp index 9f86eaf70d0..bec6f99ce59 100644 --- a/src/Storages/StorageDeltaLake.cpp +++ b/src/Storages/StorageDeltaLake.cpp @@ -230,7 +230,7 @@ StorageDeltaLake::StorageDeltaLake( if (columns_.empty()) { columns_ = StorageS3::getTableStructureFromData( - new_configuration, /*distributed processing*/ false, format_settings_, context_, nullptr); + new_configuration, format_settings_, context_, nullptr); storage_metadata.setColumns(columns_); } else @@ -272,7 +272,7 @@ ColumnsDescription StorageDeltaLake::getTableStructureFromData( { StorageS3::updateS3Configuration(ctx, configuration); auto new_configuration = getAdjustedS3Configuration(ctx, configuration, &Poco::Logger::get("StorageDeltaLake")); - return StorageS3::getTableStructureFromData(new_configuration, /*distributed processing*/ false, format_settings, ctx, /*object_infos*/ nullptr); + return StorageS3::getTableStructureFromData(new_configuration, format_settings, ctx, /*object_infos*/ nullptr); } void registerStorageDeltaLake(StorageFactory & factory) diff --git a/src/Storages/StorageHudi.cpp b/src/Storages/StorageHudi.cpp index b50759d1c2f..87a1014492a 100644 --- a/src/Storages/StorageHudi.cpp +++ b/src/Storages/StorageHudi.cpp @@ -163,7 +163,7 @@ StorageHudi::StorageHudi( if (columns_.empty()) { columns_ = StorageS3::getTableStructureFromData( - new_configuration, /*distributed processing*/ false, format_settings_, context_, nullptr); + new_configuration, format_settings_, context_, nullptr); storage_metadata.setColumns(columns_); } else @@ -203,7 +203,7 @@ ColumnsDescription StorageHudi::getTableStructureFromData( { StorageS3::updateS3Configuration(ctx, configuration); auto new_configuration = getAdjustedS3Configuration(configuration, &Poco::Logger::get("StorageDeltaLake")); - return StorageS3::getTableStructureFromData(new_configuration, /*distributed processing*/ false, format_settings, ctx, /*object_infos*/ nullptr); + return StorageS3::getTableStructureFromData(new_configuration, format_settings, ctx, /*object_infos*/ nullptr); } void registerStorageHudi(StorageFactory & factory) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 59f49b56262..974d6b23fcf 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -961,7 +961,6 @@ StorageS3::StorageS3( format_name, s3_configuration, compression_method, - distributed_processing_, is_key_with_globs, format_settings, context_, @@ -1369,14 +1368,13 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context ColumnsDescription 
StorageS3::getTableStructureFromData(
     StorageS3::Configuration & configuration,
-    bool distributed_processing,
     const std::optional<FormatSettings> & format_settings,
     ContextPtr ctx,
     ObjectInfos * object_infos)
 {
     updateS3Configuration(ctx, configuration);
     return getTableStructureFromDataImpl(
-        configuration.format, configuration, configuration.compression_method, distributed_processing,
+        configuration.format, configuration, configuration.compression_method,
         configuration.url.key.find_first_of("*?{") != std::string::npos, format_settings, ctx, object_infos);
 }
 
@@ -1384,7 +1382,6 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl(
     const String & format,
     const Configuration & s3_configuration,
     const String & compression_method,
-    bool distributed_processing,
     bool is_key_with_globs,
     const std::optional<FormatSettings> & format_settings,
     ContextPtr ctx,
@@ -1396,7 +1393,7 @@ ColumnsDescription StorageS3::getTableStructureFromDataImpl(
         s3_configuration,
         {s3_configuration.url.key},
         is_key_with_globs,
-        distributed_processing,
+        false,
         ctx,
         nullptr,
         {}, object_infos, &read_keys);
diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h
index 0531a3c08d3..b01c8e90a41 100644
--- a/src/Storages/StorageS3.h
+++ b/src/Storages/StorageS3.h
@@ -291,7 +291,6 @@ public:
 
     static ColumnsDescription getTableStructureFromData(
         StorageS3::Configuration & configuration,
-        bool distributed_processing,
         const std::optional<FormatSettings> & format_settings,
         ContextPtr ctx,
         ObjectInfos * object_infos = nullptr);
@@ -338,7 +337,6 @@ private:
         const String & format,
         const Configuration & s3_configuration,
         const String & compression_method,
-        bool distributed_processing,
         bool is_key_with_globs,
         const std::optional<FormatSettings> & format_settings,
         ContextPtr ctx,
diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp
index 75aca759103..6be0bb1fbd3 100644
--- a/src/Storages/StorageS3Cluster.cpp
+++ b/src/Storages/StorageS3Cluster.cpp
@@ -66,8 +66,8 @@ StorageS3Cluster::StorageS3Cluster(
         /// `distributed_processing` is set to false, because this code is executed on the initiator, so there is no callback set
         /// for asking for the next tasks. 
/// `format_settings` is set to std::nullopt, because StorageS3Cluster is used only as table function - auto columns = StorageS3::getTableStructureFromDataImpl(format_name, s3_configuration, compression_method, - /*distributed_processing_*/false, is_key_with_globs, /*format_settings=*/std::nullopt, context_); + auto columns = StorageS3::getTableStructureFromDataImpl( + format_name, s3_configuration, compression_method, is_key_with_globs, /*format_settings=*/std::nullopt, context_); storage_metadata.setColumns(columns); } else diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 49254ca469d..b7c04601222 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -139,7 +139,7 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) if (configuration.structure == "auto") { context->checkAccess(getSourceAccessType()); - return StorageS3::getTableStructureFromData(configuration, false, std::nullopt, context); + return StorageS3::getTableStructureFromData(configuration, std::nullopt, context); } return parseColumnsListFromString(configuration.structure, context); diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index 6efe686ca5d..997789086b6 100644 --- a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -82,7 +82,7 @@ ColumnsDescription TableFunctionS3Cluster::getActualTableStructure(ContextPtr co context->checkAccess(getSourceAccessType()); if (configuration.structure == "auto") - return StorageS3::getTableStructureFromData(configuration, false, std::nullopt, context); + return StorageS3::getTableStructureFromData(configuration, std::nullopt, context); return parseColumnsListFromString(configuration.structure, context); } diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 1f46b9b2603..139a6e65237 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -326,3 +326,39 @@ def test_distributed_insert_select_with_replicated(started_cluster): first_replica_first_shard.query( """DROP TABLE IF EXISTS insert_select_replicated_local ON CLUSTER 'first_shard' SYNC;""" ) + + +def test_parallel_distributed_insert_select_with_schema_inference(started_cluster): + node = started_cluster.instances["s0_0_0"] + + node.query( + """DROP TABLE IF EXISTS parallel_insert_select ON CLUSTER 'first_shard' SYNC;""" + ) + + node.query( + """ + CREATE TABLE parallel_insert_select ON CLUSTER 'first_shard' (a String, b UInt64) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/{shard}/insert_select_with_replicated', '{replica}') + ORDER BY (a, b); + """ + ) + + node.query( + """ + INSERT INTO parallel_insert_select SELECT * FROM s3Cluster( + 'first_shard', + 'http://minio1:9001/root/data/generated/*.csv', 'minio', 'minio123', 'CSV' + ) SETTINGS parallel_distributed_insert_select=1, use_structure_from_insertion_table_in_table_functions=0; + """ + ) + + node.query("SYSTEM SYNC REPLICA parallel_insert_select") + + actual_count = int( + node.query( + "SELECT count() FROM s3('http://minio1:9001/root/data/generated/*.csv', 'minio', 'minio123', 'CSV','a String, b UInt64')" + ) + ) + + count = int(node.query("SELECT count() FROM parallel_insert_select")) + assert count == actual_count From 4a3ad7da85b88f78082a8e9f529d9398ecb3c15c Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 15 Feb 2023 16:04:52 +0100 
Subject: [PATCH 59/69] Fix test test_backup_restore_new/test.py::test_async_backups_to_same_destination[http]. --- .../test_backup_restore_new/test.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index eea4abd719a..a903821071b 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -632,7 +632,8 @@ def test_async_backups_to_same_destination(interface): f"BACKUP TABLE test.table TO {backup_name} ASYNC" ) - # The second backup to the same destination is expected to fail. It can either fail immediately or after a while. + # One of those two backups to the same destination is expected to fail. + # If the second backup is going to fail, it can fail either immediately or after a while. # If it fails immediately we won't even get its ID. id2 = None if err else res.split("\t")[0] @@ -647,17 +648,25 @@ def test_async_backups_to_same_destination(interface): "", ) - # The first backup should succeed. - assert instance.query( - f"SELECT status, error FROM system.backups WHERE id='{id1}'" - ) == TSV([["BACKUP_CREATED", ""]]) - - if id2: - # The second backup should fail. - assert ( - instance.query(f"SELECT status FROM system.backups WHERE id='{id2}'") - == "BACKUP_FAILED\n" + ids_succeeded = ( + instance.query( + f"SELECT id FROM system.backups WHERE id IN {ids_for_query} AND status == 'BACKUP_CREATED'" ) + .rstrip("\n") + .split("\n") + ) + + ids_failed = ( + instance.query( + f"SELECT id FROM system.backups WHERE id IN {ids_for_query} AND status == 'BACKUP_FAILED'" + ) + .rstrip("\n") + .split("\n") + ) + + assert len(ids_succeeded) == 1 + assert len(ids_failed) <= 1 + assert set(ids_succeeded + ids_failed) == set(ids) # Check that the first backup is all right. instance.query("DROP TABLE test.table") From 0496b550034d23d3cf7b00a057e783e61e726970 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Wed, 15 Feb 2023 13:29:26 -0500 Subject: [PATCH 60/69] Update docs/en/engines/table-engines/integrations/embedded-rocksdb.md --- docs/en/engines/table-engines/integrations/embedded-rocksdb.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 35c59b8181a..4a662bd8ec9 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -89,7 +89,7 @@ You can also change any [rocksdb options](https://github.com/facebook/rocksdb/wi ### Inserts -When new rows are inserted into `EmbeddedRocksDB`, if the key already exists, the value will be updated, otherwise new key is created. +When new rows are inserted into `EmbeddedRocksDB`, if the key already exists, the value will be updated; otherwise, a new key is created.
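A minimal sketch of this upsert behavior (the schema here is an assumption for illustration, in the spirit of the engine's examples):

```sql
-- assumed table: CREATE TABLE test (key String, v1 UInt32) ENGINE = EmbeddedRocksDB PRIMARY KEY(key)
INSERT INTO test VALUES ('k1', 1); -- 'k1' does not exist yet: a new key is created
INSERT INTO test VALUES ('k1', 2); -- 'k1' already exists: its value is overwritten
SELECT v1 FROM test WHERE key = 'k1'; -- returns 2, the latest value
```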
Example: From 10c8f318113aaf01f62d8ae9cfab985b3c79bcc9 Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Wed, 15 Feb 2023 13:29:33 -0500 Subject: [PATCH 61/69] Update docs/en/engines/table-engines/integrations/embedded-rocksdb.md --- docs/en/engines/table-engines/integrations/embedded-rocksdb.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 4a662bd8ec9..a3604b3c332 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -115,7 +115,7 @@ TRUNCATE TABLE test; ### Updates -Values can be updated using `ALTER TABLE` query. Primary key cannot be updated. +Values can be updated using the `ALTER TABLE` query. The primary key cannot be updated. ```sql ALTER TABLE test UPDATE v1 = v1 * 10 + 2 WHERE key LIKE 'some%' AND v3 > 3.1; From c8d7b72e5d40f2a6a3a2e33d884e2dc48bf0990c Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 15 Feb 2023 15:09:36 -0500 Subject: [PATCH 62/69] move database credential inputs to the center on initial load --- programs/server/dashboard.html | 93 ++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 10 deletions(-) diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 74bd9746a4d..a91040b2701 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -92,7 +92,52 @@ .chart div { position: absolute; } - .inputs { font-size: 14pt; } + .inputs { + height: auto; + width: 100%; + + font-size: 14pt; + + display: flex; + flex-flow: column nowrap; + justify-content: center; + } + + .inputs.unconnected { + height: 100vh; + } + .unconnected #params { + display: flex; + flex-flow: column nowrap; + justify-content: center; + align-items: center; + } + .unconnected #connection-params { + width: 50%; + + display: flex; + flex-flow: row wrap; + } + .unconnected #url { + width: 100%; + } + .unconnected #user { + width: 50%; + } + .unconnected #password { + width: 49.5%; + } + .unconnected input { + margin-bottom: 5px; + } + + .inputs #chart-params { + display: block; + } + + .inputs.unconnected #chart-params { + display: none; + } #connection-params { margin-bottom: 0.5rem; @@ -223,6 +268,10 @@ color: var(--chart-button-hover-color); } + .disabled { + opacity: 0.5; + } + .query-editor { display: none; grid-template-columns: auto fit-content(10%); @@ -286,7 +335,7 @@ -
+
@@ -294,8 +343,8 @@
- - + + 🌚🌞
@@ -845,7 +894,7 @@ async function draw(idx, chart, url_params, query) { error_div.firstChild.data = error; title_div.style.display = 'none'; error_div.style.display = 'block'; - return; + return false; } else { error_div.firstChild.data = ''; error_div.style.display = 'none'; @@ -886,6 +935,7 @@ async function draw(idx, chart, url_params, query) { /// Set title const title = queries[idx] && queries[idx].title ? queries[idx].title.replaceAll(/\{(\w+)\}/g, (_, name) => params[name] ) : ''; chart.querySelector('.title').firstChild.data = title; + return true; } function showAuthError(message) { @@ -902,8 +952,6 @@ function showAuthError(message) { function hideAuthError() { const charts = document.querySelector('#charts'); charts.style.display = 'flex'; - const add = document.querySelector('#add'); - add.style.display = 'block'; const authError = document.querySelector('#auth-error'); authError.textContent = ''; @@ -924,9 +972,20 @@ async function drawAll() { if (!firstLoad) { showAuthError(e.message); } + return false; }); - })).then(() => { - firstLoad = false; + })).then((results) => { + if (firstLoad) { + firstLoad = false; + } else { + enableReloadButton(); + } + if (!results.includes(false)) { + const element = document.querySelector('.inputs'); + element.classList.remove('unconnected'); + const add = document.querySelector('#add'); + add.style.display = 'block'; + } }) } @@ -941,11 +1000,25 @@ function resize() { new ResizeObserver(resize).observe(document.body); +function disableReloadButton() { + const reloadButton = document.getElementById('reload'); + reloadButton.value = 'Reloading...'; + reloadButton.disabled = true; + reloadButton.classList.add('disabled'); +} + +function enableReloadButton() { + const reloadButton = document.getElementById('reload'); + reloadButton.value = 'Reload'; + reloadButton.disabled = false; + reloadButton.classList.remove('disabled'); +} + function reloadAll() { updateParams(); drawAll(); saveState(); - document.getElementById('reload').style.display = 'none'; + disableReloadButton(); } document.getElementById('params').onsubmit = function(event) { From 922d7f2c2a828d65a81a77eaf088a84d6a08565d Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Wed, 15 Feb 2023 13:26:58 -0700 Subject: [PATCH 63/69] Update s3Cluster.md The explanation of how the cluster is used in a query seemed like it was worded poorly. (It made it sound like you were querying data on a cluster of CH nodes.) I reworded it.
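For reference, a sketch of the contrast (reusing the MinIO endpoint, credentials, and structure from the docs' examples): the plain `s3` table function reads the same objects from a single server, while `s3Cluster` only distributes the processing of those objects across the cluster nodes; the data itself still lives in S3.

```sql
-- single-node variant of the same read: no cluster fan-out
SELECT count(*)
FROM s3(
    'http://minio1:9001/root/data/{clickhouse,database}/*',
    'minio',
    'minio123',
    'CSV',
    'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))');
```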
--- .../en/sql-reference/table-functions/s3Cluster.md | 26 ++++++++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/table-functions/s3Cluster.md b/docs/en/sql-reference/table-functions/s3Cluster.md index b81fc51fd18..e77806a3665 100644 --- a/docs/en/sql-reference/table-functions/s3Cluster.md +++ b/docs/en/sql-reference/table-functions/s3Cluster.md @@ -27,18 +27,32 @@ A table with the specified structure for reading or writing data in the specifie **Examples** -Select the data from all files in the cluster `cluster_simple`: +Select the data from all the files in the `/root/data/clickhouse` and `/root/data/database/` folders, using all the nodes in the `cluster_simple` cluster: ``` sql -SELECT * FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon); +SELECT * FROM s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/{clickhouse,database}/*', + 'minio', + 'minio123', + 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))' +) ORDER BY (name, value, polygon); ``` Count the total amount of rows in all files in the cluster `cluster_simple`: -``` sql -SELECT count(*) FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))'); -``` - +``` sql +SELECT count(*) FROM s3Cluster( + 'cluster_simple', + 'http://minio1:9001/root/data/{clickhouse,database}/*', + 'minio', + 'minio123', + 'CSV', + 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))' +); +``` + :::warning If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. ::: From 0c5648a5a648d0b93e2401935c726839d4433064 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 15 Feb 2023 17:09:33 +0100 Subject: [PATCH 64/69] Use OK and FAIL for tests status --- tests/ci/install_check.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/ci/install_check.py b/tests/ci/install_check.py index 1444759cea0..c6ecd092aa7 100644 --- a/tests/ci/install_check.py +++ b/tests/ci/install_check.py @@ -36,6 +36,8 @@ DEB_IMAGE = "clickhouse/install-deb-test" TEMP_PATH = Path(TEMP) SUCCESS = "success" FAILURE = "failure" +OK = "OK" +FAIL = "FAIL" def prepare_test_scripts(): @@ -156,9 +158,9 @@ def test_install(image: DockerImage, tests: Dict[str, str]) -> TestResults: with TeePopen(install_command, log_file) as process: retcode = process.wait() if retcode == 0: - status = SUCCESS + status = OK else: - status = FAILURE + status = FAIL subprocess.check_call(f"docker kill -s 9 {container_id}", shell=True) test_results.append( @@ -266,9 +268,11 @@ def main(): test_results.extend(test_install_tgz(docker_images[RPM_IMAGE])) state = SUCCESS + test_status = OK description = "Packages installed successfully" - if FAILURE in (result.status for result in test_results): + if FAIL in (result.status for result in test_results): state = FAILURE + test_status = FAIL description = "Failed to install packages: " + ", ".join( result.name for result in test_results ) @@ -298,7 +302,7 @@ def main(): prepared_events = prepare_tests_results_for_clickhouse( pr_info, test_results, - state, + test_status, stopwatch.duration_seconds, stopwatch.start_time_str, report_url, From 3bcd049d189226b2f9719bec117123d9352931e8 Mon Sep 17 00:00:00 2001 From: "Mikhail f.
Shiryaev" Date: Wed, 15 Feb 2023 17:42:28 +0100 Subject: [PATCH 65/69] Improve download filter, group keeper mntr command --- tests/ci/install_check.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/tests/ci/install_check.py b/tests/ci/install_check.py index c6ecd092aa7..40f8c82132b 100644 --- a/tests/ci/install_check.py +++ b/tests/ci/install_check.py @@ -52,9 +52,11 @@ for i in {1..20}; do done for i in {1..5}; do echo wait for clickhouse-keeper to answer on mntr request - exec 13<>/dev/tcp/127.0.0.1/9181 - echo mntr >&13 - cat <&13 | grep zk_version && break || sleep 1 + { + exec 13<>/dev/tcp/127.0.0.1/9181 + echo mntr >&13 + cat <&13 | grep zk_version + } && break || sleep 1 exec 13>&- done exec 13>&-""" @@ -72,9 +74,11 @@ for i in {1..20}; do done for i in {1..5}; do echo wait for clickhouse-keeper to answer on mntr request - exec 13<>/dev/tcp/127.0.0.1/9181 - echo mntr >&13 - cat <&13 | grep zk_version && break || sleep 1 + { + exec 13<>/dev/tcp/127.0.0.1/9181 + echo mntr >&13 + cat <&13 | grep zk_version + } && break || sleep 1 exec 13>&- done exec 13>&-""" @@ -247,12 +251,16 @@ def main(): if args.download: def filter_artifacts(path: str) -> bool: - return ( - path.endswith(".deb") - or path.endswith(".rpm") - or path.endswith(".tgz") - or path.endswith("/clickhouse") - ) + is_match = False + if args.deb or args.rpm: + is_match = is_match or path.endswith("/clickhouse") + if args.deb: + is_match = is_match or path.endswith(".deb") + if args.rpm: + is_match = is_match or path.endswith(".rpm") + if args.tgz: + is_match = is_match or path.endswith(".tgz") + return is_match download_builds_filter( args.check_name, REPORTS_PATH, TEMP_PATH, filter_artifacts From f39d9b09b01ebcaf5ed527e59fe3ae7d685f76f2 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 15 Feb 2023 18:52:38 +0100 Subject: [PATCH 66/69] Preserve clickhouse binary to not redownload it --- tests/ci/install_check.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/ci/install_check.py b/tests/ci/install_check.py index 40f8c82132b..ce0d8454bc6 100644 --- a/tests/ci/install_check.py +++ b/tests/ci/install_check.py @@ -7,7 +7,7 @@ import logging import sys import subprocess from pathlib import Path - +from shutil import copy2 from typing import Dict from github import Github @@ -61,8 +61,7 @@ for i in {1..5}; do done exec 13>&-""" binary_test = r"""#!/bin/bash -chmod +x /packages/clickhouse -/packages/clickhouse install +/packages/clickhouse.copy install clickhouse-server start --daemon for i in {1..5}; do clickhouse-client -q 'SELECT version()' && break || sleep 1 @@ -267,6 +266,14 @@ def main(): ) test_results = [] # type: TestResults + ch_binary = Path(TEMP_PATH) / "clickhouse" + if ch_binary.exists(): + # make a copy of clickhouse to avoid redownload of exctracted binary + ch_binary.chmod(0o755) + ch_copy = ch_binary.parent / "clickhouse.copy" + copy2(ch_binary, ch_binary.parent / "clickhouse.copy") + subprocess.check_output(f"{ch_copy.absolute()} local -q 'SELECT 1'", shell=True) + if args.deb: test_results.extend(test_install_deb(docker_images[DEB_IMAGE])) if args.rpm: From 9dba4e7cc91960a21badba522e7742ee11cb37b8 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 15 Feb 2023 22:37:21 +0100 Subject: [PATCH 67/69] Replace format with f-strings --- tests/ci/compress_files.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/ci/compress_files.py b/tests/ci/compress_files.py index ced8879d6d9..8d52d030b84 100644 --- a/tests/ci/compress_files.py +++ b/tests/ci/compress_files.py @@ -6,11 +6,11 @@ import os def compress_file_fast(path, archive_path): if archive_path.endswith(".zst"): - subprocess.check_call("zstd < {} > {}".format(path, archive_path), shell=True) + subprocess.check_call(f"zstd < {path} > {archive_path}", shell=True) elif os.path.exists("/usr/bin/pigz"): - subprocess.check_call("pigz < {} > {}".format(path, archive_path), shell=True) + subprocess.check_call(f"pigz < {path} > {archive_path}", shell=True) else: - subprocess.check_call("gzip < {} > {}".format(path, archive_path), shell=True) + subprocess.check_call(f"gzip < {path} > {archive_path}", shell=True) def compress_fast(path, archive_path, exclude=None): @@ -28,9 +28,9 @@ def compress_fast(path, archive_path, exclude=None): if exclude is None: exclude_part = "" elif isinstance(exclude, list): - exclude_part = " ".join(["--exclude {}".format(x) for x in exclude]) + exclude_part = " ".join([f"--exclude {x}" for x in exclude]) else: - exclude_part = "--exclude {}".format(str(exclude)) + exclude_part = f"--exclude {exclude}" fname = os.path.basename(path) if os.path.isfile(path): @@ -38,9 +38,7 @@ def compress_fast(path, archive_path, exclude=None): else: path += "/.." - cmd = "tar {} {} -cf {} -C {} {}".format( - program_part, exclude_part, archive_path, path, fname - ) + cmd = f"tar {program_part} {exclude_part} -cf {archive_path} -C {path} {fname}" logging.debug("compress_fast cmd: %s", cmd) subprocess.check_call(cmd, shell=True) @@ -70,11 +68,9 @@ def decompress_fast(archive_path, result_path=None): ) if result_path is None: - subprocess.check_call( - "tar {} -xf {}".format(program_part, archive_path), shell=True - ) + subprocess.check_call(f"tar {program_part} -xf {archive_path}", shell=True) else: subprocess.check_call( - "tar {} -xf {} -C {}".format(program_part, archive_path, result_path), + f"tar {program_part} -xf {archive_path} -C {result_path}", shell=True, ) From 346c3e3c49159d9f091d0bcd60ed7681542d0a03 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 15 Feb 2023 22:47:31 +0100 Subject: [PATCH 68/69] Preserve logs for failed installation tests, retry 3 times --- tests/ci/install_check.py | 62 +++++++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/tests/ci/install_check.py b/tests/ci/install_check.py index ce0d8454bc6..48d0da195ce 100644 --- a/tests/ci/install_check.py +++ b/tests/ci/install_check.py @@ -19,6 +19,7 @@ from clickhouse_helper import ( prepare_tests_results_for_clickhouse, ) from commit_status_helper import post_commit_status, update_mergeable_check +from compress_files import compress_fast from docker_pull_helper import get_image_with_version, DockerImage from env_helper import CI, TEMP_PATH as TEMP, REPORTS_PATH from get_robot_token import get_best_robot_token @@ -34,6 +35,7 @@ from upload_result_helper import upload_results RPM_IMAGE = "clickhouse/install-rpm-test" DEB_IMAGE = "clickhouse/install-deb-test" TEMP_PATH = Path(TEMP) +LOGS_PATH = TEMP_PATH / "tests_logs" SUCCESS = "success" FAILURE = "failure" OK = "OK" @@ -42,9 +44,13 @@ FAIL = "FAIL" def prepare_test_scripts(): server_test = r"""#!/bin/bash +set -e +trap "bash -ex /packages/preserve_logs.sh" ERR systemctl start clickhouse-server clickhouse-client -q 'SELECT version()'""" keeper_test = r"""#!/bin/bash +set -e +trap "bash -ex /packages/preserve_logs.sh" ERR systemctl start clickhouse-keeper for i in {1..20}; do echo wait for clickhouse-keeper to being up @@ -61,6 +67,8 @@ for i in {1..5}; do done exec 13>&-""" binary_test = r"""#!/bin/bash +set -e +trap "bash -ex /packages/preserve_logs.sh" ERR /packages/clickhouse.copy install clickhouse-server start --daemon for i in {1..5}; do @@ -81,9 +89,18 @@ for i in {1..5}; do exec 13>&- done exec 13>&-""" + preserve_logs = r"""#!/bin/bash +journalctl -u clickhouse-server > /tests_logs/clickhouse-server.service || : +journalctl -u clickhouse-keeper > /tests_logs/clickhouse-keeper.service || : +cp /var/log/clickhouse-server/clickhouse-server.* /tests_logs/ || : +cp /var/log/clickhouse-keeper/clickhouse-keeper.* /tests_logs/ || : +chmod a+rw -R /tests_logs +exit 1 +""" (TEMP_PATH / "server_test.sh").write_text(server_test, encoding="utf-8") (TEMP_PATH / "keeper_test.sh").write_text(keeper_test, encoding="utf-8") (TEMP_PATH / "binary_test.sh").write_text(binary_test, encoding="utf-8") + (TEMP_PATH / "preserve_logs.sh").write_text(preserve_logs, encoding="utf-8") def test_install_deb(image: DockerImage) -> TestResults: @@ -148,27 +165,41 @@ def test_install(image: DockerImage, tests: Dict[str, str]) -> TestResults: stopwatch = Stopwatch() container_name = name.lower().replace(" ", "_").replace("/", "_") log_file = TEMP_PATH / f"{container_name}.log" + logs = [log_file] run_command = ( f"docker run --rm --privileged --detach --cap-add=SYS_PTRACE " - f"--volume={TEMP_PATH}:/packages {image}" + f"--volume={LOGS_PATH}:/tests_logs --volume={TEMP_PATH}:/packages {image}" ) - logging.info("Running docker container: `%s`", run_command) - container_id = subprocess.check_output( - run_command, shell=True, encoding="utf-8" - ).strip() - (TEMP_PATH / "install.sh").write_text(command) - install_command = f"docker exec {container_id} bash -ex /packages/install.sh" - with TeePopen(install_command, log_file) as process: - retcode = process.wait() - if retcode == 0: - status = OK - else: + + for retry in range(1, 4): + for file in LOGS_PATH.glob("*"): + file.unlink() + + logging.info("Running docker container: `%s`", run_command) + container_id = subprocess.check_output( 
+ run_command, shell=True, encoding="utf-8" + ).strip() + (TEMP_PATH / "install.sh").write_text(command) + install_command = ( + f"docker exec {container_id} bash -ex /packages/install.sh" + ) + with TeePopen(install_command, log_file) as process: + retcode = process.wait() + if retcode == 0: + status = OK + break + status = FAIL + copy2(log_file, LOGS_PATH) + archive_path = TEMP_PATH / f"{container_name}-{retry}.tar.gz" + compress_fast( + LOGS_PATH.as_posix(), + archive_path.as_posix(), + ) + logs.append(archive_path) subprocess.check_call(f"docker kill -s 9 {container_id}", shell=True) - test_results.append( - TestResult(name, status, stopwatch.duration_seconds, [log_file]) - ) + test_results.append(TestResult(name, status, stopwatch.duration_seconds, logs)) return test_results @@ -227,6 +258,7 @@ def main(): args = parse_args() TEMP_PATH.mkdir(parents=True, exist_ok=True) + LOGS_PATH.mkdir(parents=True, exist_ok=True) pr_info = PRInfo() From 53e32b19e651d5cfcf457397a0d0a294cecd7d9f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 16 Feb 2023 02:13:18 +0300 Subject: [PATCH 69/69] Update AggregatingTransform.cpp --- src/Processors/Transforms/AggregatingTransform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Transforms/AggregatingTransform.cpp b/src/Processors/Transforms/AggregatingTransform.cpp index 836458ef792..755af34ad01 100644 --- a/src/Processors/Transforms/AggregatingTransform.cpp +++ b/src/Processors/Transforms/AggregatingTransform.cpp @@ -555,7 +555,7 @@ void AggregatingTransform::initGenerate() double elapsed_seconds = watch.elapsedSeconds(); size_t rows = variants.sizeWithoutOverflowRow(); - LOG_DEBUG(log, "Aggregated. {} to {} rows (from {}) in {} sec. ({:.3f} rows/sec., {}/sec.)", + LOG_TRACE(log, "Aggregated. {} to {} rows (from {}) in {} sec. ({:.3f} rows/sec., {}/sec.)", src_rows, rows, ReadableSize(src_bytes), elapsed_seconds, src_rows / elapsed_seconds, ReadableSize(src_bytes / elapsed_seconds));
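With the summary line demoted from debug to trace, it disappears from typical server logs; a sketch of one way to still inspect it per query, assuming the standard `send_logs_level` session setting:

```sql
-- stream server-side logs at trace level back to the client for this session
SET send_logs_level = 'trace';
-- any aggregation now reports the 'Aggregated. ... rows/sec.' line to the client
SELECT number % 10 AS k, count() FROM numbers(1000000) GROUP BY k FORMAT Null;
```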