diff --git a/src/Core/Settings.h b/src/Core/Settings.h index adc804c3a28..0693f53db9f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -409,6 +409,7 @@ struct Settings : public SettingsCollection \ M(SettingDateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \ \ + M(SettingBool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \ M(SettingBool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \ M(SettingBool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \ diff --git a/src/Interpreters/GroupByFunctionKeysVisitor.h b/src/Interpreters/GroupByFunctionKeysVisitor.h new file mode 100644 index 00000000000..afcf5a14118 --- /dev/null +++ b/src/Interpreters/GroupByFunctionKeysVisitor.h @@ -0,0 +1,115 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + + +/// Helper for optimizeGroupByFunctionKeys: recursively walks the arguments of a GROUP BY function key and sets keep_key when the key must stay (a function without arguments, an identifier whose short name is not in key_names_to_keep, or an unexpected node kind); functions whose column name is already in key_names_to_keep are not descended into. +struct KeepFunctionMatcher +{ + struct Data + { + std::unordered_set & key_names_to_keep; + bool & keep_key; + }; + + using Visitor = InDepthNodeVisitor; + + static bool needChildVisit(const ASTPtr & node, const ASTPtr &) + { + return !(node->as()); + } + + static void visit(ASTFunction * 
function_node, Data & data) + { + if ((function_node->arguments->children).empty()) + { + data.keep_key = true; + return; + } + + if (!data.key_names_to_keep.count(function_node->getColumnName())) + { + Visitor(data).visit(function_node->arguments); + } + } + + static void visit(ASTIdentifier * ident, Data & data) + { + if (!data.key_names_to_keep.count(ident->shortName())) + { + /// If an identifier used by the function is not among the GROUP BY keys, the function key must be kept. + data.keep_key = true; + return; + } + } + + static void visit(const ASTPtr & ast, Data & data) + { + if (data.keep_key) + return; + + if (auto * function_node = ast->as()) + { + visit(function_node, data); + } + else if (auto * ident = ast->as()) + { + visit(ident, data); + } + else if (!ast->as()) + { + data.keep_key = true; + } + } +}; + +using KeepFunctionVisitor = InDepthNodeVisitor; + +class GroupByFunctionKeysMatcher +{ +public: + struct Data + { + std::unordered_set & key_names_to_keep; + }; + + static bool needChildVisit(const ASTPtr & node, const ASTPtr &) + { + return !(node->as()); + } + + static void visit(ASTFunction * function_node, Data & data) + { + bool keep_key = false; + KeepFunctionVisitor::Data keep_data{data.key_names_to_keep, keep_key}; + KeepFunctionVisitor(keep_data).visit(function_node->arguments); + + if (!keep_key) + (data.key_names_to_keep).erase(function_node->getColumnName()); + } + + static void visit(const ASTPtr & ast, Data & data) + { + if (auto * function_node = ast->as()) + { + if (!(function_node->arguments->children.empty())) + visit(function_node, data); + } + } +}; + +using GroupByFunctionKeysVisitor = InDepthNodeVisitor; + +} diff --git a/src/Interpreters/SyntaxAnalyzer.cpp b/src/Interpreters/SyntaxAnalyzer.cpp index 4bfae18f9a5..2dc2943d36d 100644 --- a/src/Interpreters/SyntaxAnalyzer.cpp +++ b/src/Interpreters/SyntaxAnalyzer.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -346,6 +347,89 @@ void 
optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_colum appendUnusedGroupByColumn(select_query, source_columns); } +/// Eliminate GROUP BY keys that are functions of other GROUP BY keys (does nothing when the setting flag is off or the query has no GROUP BY). +void optimizeGroupByFunctionKeys(ASTSelectQuery * select_query, bool optimize_group_by_function_keys) +{ + if (!optimize_group_by_function_keys) + return; + + if (!select_query->groupBy()) + return; + + auto grp_by = select_query->groupBy(); + auto & group_keys = grp_by->children; + + ASTs modified; /// GROUP BY keys that remain after the optimization + std::unordered_set key_names_to_keep; /// names of the keys to keep: short names for identifiers, column names for all other keys + + /// Decide whether the optimization is needed while building the set. + bool need_optimization = false; + /// Fill the set with the names of the keys. + for (auto & group_key : group_keys) + { + if (!need_optimization && group_key->as()) + need_optimization = true; + + if (auto * group_key_ident = group_key->as()) + { + if (key_names_to_keep.count(group_key_ident->shortName())) + { + /// Two identifier keys share the same short name, e.g. columns of different tables. + /// Since such conflicts cannot be tracked yet, + /// it is better to disable the optimization than to eliminate necessary keys. 
+ need_optimization = false; + break; + } + + key_names_to_keep.insert(group_key_ident->shortName()); + continue; + } + if (auto * group_key_func = group_key->as()) + { + key_names_to_keep.insert(group_key_func->getColumnName()); + continue; + } + else + { + key_names_to_keep.insert(group_key->getColumnName()); + } + } + if (!need_optimization) + return; + + GroupByFunctionKeysVisitor::Data visitor_data{key_names_to_keep}; + GroupByFunctionKeysVisitor(visitor_data).visit(grp_by); + + modified.reserve(group_keys.size()); + + /// Collect the keys whose names are still in the set, preserving their original order. + for (auto & group_key : group_keys) + { + if (auto * group_key_func = group_key->as()) + { + if (key_names_to_keep.count(group_key_func->getColumnName())) + modified.push_back(group_key); + + continue; + } + if (auto * group_key_ident = group_key->as()) + { + if (key_names_to_keep.count(group_key_ident->shortName())) + modified.push_back(group_key); + + continue; + } + else + { + if (key_names_to_keep.count(group_key->getColumnName())) + modified.push_back(group_key); + } + } + + /// Replace the GROUP BY keys of the query with the filtered list. + grp_by->children = modified; +} + /// Remove duplicate items from ORDER BY. void optimizeOrderBy(const ASTSelectQuery * select_query) { @@ -843,6 +927,9 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyzeSelect( /// GROUP BY injective function elimination. optimizeGroupBy(select_query, source_columns_set, context); + /// GROUP BY functions of other keys elimination. + optimizeGroupByFunctionKeys(select_query, settings.optimize_group_by_function_keys); + /// Remove duplicate items from ORDER BY. 
optimizeOrderBy(select_query); diff --git a/tests/performance/removing_group_by_keys.xml b/tests/performance/removing_group_by_keys.xml new file mode 100644 index 00000000000..6db641966ec --- /dev/null +++ b/tests/performance/removing_group_by_keys.xml @@ -0,0 +1,9 @@ + + + hits_10m_single + hits_100m_single + + + SELECT avg(length(URL)) FROM hits_10m_single GROUP BY WatchID, CounterID FORMAT Null + SELECT avg(length(URL)) FROM hits_10m_single GROUP BY WatchID, CounterID, WatchID - CounterID FORMAT Null + diff --git a/tests/queries/0_stateless/01300_group_by_other_keys.reference b/tests/queries/0_stateless/01300_group_by_other_keys.reference new file mode 100644 index 00000000000..52a5e7f0002 --- /dev/null +++ b/tests/queries/0_stateless/01300_group_by_other_keys.reference @@ -0,0 +1,58 @@ +6931467.646716369 +6931468.33986355 +6931469.0330107305 +6931469.726157911 +6931470.419305092 +6931471.112452272 +3465734.169931768 +3465734.8630789486 +3465735.5562261306 +3465736.24937331 +3465736.94252049 +3465735.209652544 +3465735.209652544 +3465735.5562261483 +3465735.9027997246 +3465735.902799725 +3465734.516505364 +3465735.209652544 +3465735.209652544 +3465735.9027997246 +3465735.902799725 +3465736.595946905 +3465735.2096525617 +3465735.9027997428 +SELECT max(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 2,\n number % 3,\n ((number % 2) + (number % 3)) % 2\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY number % 5\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 3,\n number % 2\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) % 3,\n number % 2\nORDER BY k ASC +6931467.646716369 +6931468.33986355 +6931469.0330107305 +6931469.726157911 +6931470.419305092 +6931471.112452272 +3465734.169931768 
+3465734.8630789486 +3465735.5562261306 +3465736.24937331 +3465736.94252049 +3465735.209652544 +3465735.209652544 +3465735.5562261483 +3465735.9027997246 +3465735.902799725 +3465734.516505364 +3465735.209652544 +3465735.209652544 +3465735.9027997246 +3465735.902799725 +3465736.595946905 +3465735.2096525617 +3465735.9027997428 +SELECT max(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 2,\n number % 3,\n ((number % 2) + (number % 3)) % 2\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3,\n number % 2\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) % 3,\n number % 2\nORDER BY k ASC diff --git a/tests/queries/0_stateless/01300_group_by_other_keys.sql b/tests/queries/0_stateless/01300_group_by_other_keys.sql new file mode 100644 index 00000000000..d28f3707f7a --- /dev/null +++ b/tests/queries/0_stateless/01300_group_by_other_keys.sql @@ -0,0 +1,30 @@ +set optimize_group_by_function_keys = 1; +set enable_debug_queries = 1; + +SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k; + + +analyze SELECT max(log(2) * number) AS k 
FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k; + +set optimize_group_by_function_keys = 0; + +SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k; + +analyze SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k; +-- TODO - test with similar variables of different tables (collision) diff --git 
a/tests/queries/0_stateless/01300_group_by_other_keys_having.reference b/tests/queries/0_stateless/01300_group_by_other_keys_having.reference new file mode 100644 index 00000000000..d7c04e64df5 --- /dev/null +++ b/tests/queries/0_stateless/01300_group_by_other_keys_having.reference @@ -0,0 +1,24 @@ +3465735.9027997246 +3465735.902799725 +3465736.595946905 +3465734.169931768 +3465734.8630789486 +3465735.5562261306 +0 +1 +4 +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 3,\n number % 2\nHAVING avg(log(2) * number) > 3465735.3\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY number % 5\nORDER BY k ASC +SELECT (number % 5) * (number % 5) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY number % 5\nORDER BY k ASC +3465735.9027997246 +3465735.902799725 +3465736.595946905 +3465734.169931768 +3465734.8630789486 +3465735.5562261306 +0 +1 +4 +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3,\n number % 2\nHAVING avg(log(2) * number) > 3465735.3\nORDER BY k ASC +SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC +SELECT (number % 5) * (number % 5) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC diff --git a/tests/queries/0_stateless/01300_group_by_other_keys_having.sql b/tests/queries/0_stateless/01300_group_by_other_keys_having.sql new file mode 100644 index 00000000000..b359c074c44 --- /dev/null +++ b/tests/queries/0_stateless/01300_group_by_other_keys_having.sql @@ -0,0 +1,21 @@ +set optimize_group_by_function_keys = 1; +set enable_debug_queries = 1; + +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 
2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; +SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; + + +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; +analyze SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; + +set optimize_group_by_function_keys = 0; + +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k; +SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; +SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; + +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k; +analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k; +analyze SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;