From 2489b6af96ec3c3ad1a7772a28bb640034401cc5 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 15 Apr 2021 19:40:49 +0300 Subject: [PATCH] cleanup --- docker/test/performance-comparison/perf.py | 7 ++- src/Core/Settings.h | 2 +- src/Interpreters/TreeRewriter.cpp | 65 ++++++++++++---------- tests/performance/fuse_sumcount.xml | 32 +++++++---- 4 files changed, 66 insertions(+), 40 deletions(-) diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index 4727f485943..2588b9f4213 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -66,7 +66,12 @@ reportStageEnd('parse') subst_elems = root.findall('substitutions/substitution') available_parameters = {} # { 'table': ['hits_10m', 'hits_100m'], ... } for e in subst_elems: - available_parameters[e.find('name').text] = [v.text for v in e.findall('values/value')] + name = e.find('name').text + values = [v.text for v in e.findall('values/value')] + if not values: + raise Exception(f'No values given for substitution {{{name}}}') + + available_parameters[name] = values # Takes parallel lists of templates, substitutes them with all combos of # parameters. The set of parameters is determined based on the first list. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 830d4e67737..775e70ffed0 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -426,7 +426,7 @@ class IColumn; M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \ M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \ M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \ - M(Bool, optimize_fuse_sum_count_avg, false, "If enabled, Fuse aggregate functions when exists at least two: sum, avg, count functions with identical argument to sumCount", 0) \ + M(Bool, optimize_fuse_sum_count_avg, false, "Fuse aggregate functions sum(), avg(), count() with identical arguments into one sumCount() call, if the query has at least two different functions", 0) \ M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \ M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \ M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \ diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index be8ed3dd78b..7b2fa11e724 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -182,24 +182,28 @@ struct CustomizeAggregateFunctionsMoveSuffixData } }; -struct FuseFunctions +struct FuseSumCountAggregates { std::vector sums {}; std::vector counts {}; std::vector avgs {}; - void addFuncNode(ASTFunction & func) + void addFuncNode(ASTFunction * func) { - if (func.name == "sum") - sums.push_back(&func); - else if (func.name == "count") - counts.push_back(&func); - else if (func.name == "avg") - avgs.push_back(&func); + if (func->name == "sum") + sums.push_back(func); + else if (func->name == "count") + counts.push_back(func); + else + { + assert(func->name == "avg"); + avgs.push_back(func); + } } bool canBeFused() const { + // Need at least two different kinds of functions to fuse. if (sums.empty() && counts.empty()) return false; if (sums.empty() && avgs.empty()) @@ -210,11 +214,11 @@ struct FuseFunctions } }; -struct CustomizeFuseAggregateFunctionsData +struct FuseSumCountAggregatesVisitorData { using TypeToVisit = ASTFunction; - std::unordered_map fuse_map; + std::unordered_map fuse_map; void visit(ASTFunction & func, ASTPtr &) { @@ -223,19 +227,19 @@ struct CustomizeFuseAggregateFunctionsData if (func.arguments->children.empty()) return; - ASTIdentifier * ident = func.arguments->children.at(0)->as(); - if (!ident) - return; - auto it = fuse_map.find(ident->name()); + // Probably we can extend it to match count() for non-nullable arugment + // to sum/avg with any other argument. Now we require strict match. + const auto argument = func.arguments->children.at(0)->getColumnName(); + auto it = fuse_map.find(argument); if (it != fuse_map.end()) { - it->second.addFuncNode(func); + it->second.addFuncNode(&func); } else { - DB::FuseFunctions funcs{}; - funcs.addFuncNode(func); - fuse_map[ident->name()] = funcs; + FuseSumCountAggregates funcs{}; + funcs.addFuncNode(&func); + fuse_map[argument] = funcs; } } } @@ -243,7 +247,7 @@ struct CustomizeFuseAggregateFunctionsData using CustomizeAggregateFunctionsOrNullVisitor = InDepthNodeVisitor, true>; using CustomizeAggregateFunctionsMoveOrNullVisitor = InDepthNodeVisitor, true>; -using CustomizeFuseAggregateFunctionsVisitor = InDepthNodeVisitor, true>; +using FuseSumCountAggregatesVisitor = InDepthNodeVisitor, true>; /// Translate qualified names such as db.table.column, table.column, table_alias.column to names' normal form. /// Expand asterisks and qualified asterisks with column names. @@ -261,7 +265,9 @@ void translateQualifiedNames(ASTPtr & query, const ASTSelectQuery & select_query throw Exception("Empty list of columns in SELECT query", ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED); } -void rewriterFusedFunction(String column_name, ASTFunction & func) +// Replaces one avg/sum/count function with an appropriate expression with +// sumCount(). +void replaceWithSumCount(String column_name, ASTFunction & func) { auto func_base = makeASTFunction("sumCount", std::make_shared(column_name)); auto exp_list = std::make_shared(); @@ -286,18 +292,18 @@ void rewriterFusedFunction(String column_name, ASTFunction & func) func.children.push_back(func.arguments); } -void fuseCandidates(std::unordered_map &fuse_map) +void fuseSumCountAggregates(std::unordered_map & fuse_map) { for (auto & it : fuse_map) { if (it.second.canBeFused()) { for (auto & func: it.second.sums) - rewriterFusedFunction(it.first, *func); + replaceWithSumCount(it.first, *func); for (auto & func: it.second.avgs) - rewriterFusedFunction(it.first, *func); + replaceWithSumCount(it.first, *func); for (auto & func: it.second.counts) - rewriterFusedFunction(it.first, *func); + replaceWithSumCount(it.first, *func); } } } @@ -1012,12 +1018,15 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const NameSet & CustomizeGlobalNotInVisitor(data_global_not_null_in).visit(query); } - /// Try to fuse sum/avg/count with identical column(at least two functions exist) to sumCount() + // Try to fuse sum/avg/count with identical arguments to one sumCount call, + // if we have at least two different functions. E.g. we will replace sum(x) + // and count(x) with sumCount(x).1 and sumCount(x).2, and sumCount() will + // be calculated only once because of CSE. if (settings.optimize_fuse_sum_count_avg) { - CustomizeFuseAggregateFunctionsVisitor::Data data; - CustomizeFuseAggregateFunctionsVisitor(data).visit(query); - fuseCandidates(data.fuse_map); + FuseSumCountAggregatesVisitor::Data data; + FuseSumCountAggregatesVisitor(data).visit(query); + fuseSumCountAggregates(data.fuse_map); } /// Rewrite all aggregate functions to add -OrNull suffix to them diff --git a/tests/performance/fuse_sumcount.xml b/tests/performance/fuse_sumcount.xml index 30048ee82fc..0fa159aa659 100644 --- a/tests/performance/fuse_sumcount.xml +++ b/tests/performance/fuse_sumcount.xml @@ -1,15 +1,27 @@ - - true - - SELECT sum(number), avg(number) FROM numbers(1000000000) - SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) - SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) settings optimize_fuse_sum_count_avg = 0 - SELECT sum(number), count(number) FROM numbers(1000000000) - SELECT sum(number), count(number) FROM numbers(1000000000) settings optimize_fuse_sum_count_avg = 0 - SELECT sum(number) FROM numbers(1000000000) + but when we add count() or avg(), the sumCount() should win. + Also test GROUP BY with and without keys, because they might have different + optimizations. --> + + 1 + + + + + key + + 1 + intHash32(number) % 1000 + + + + + SELECT sum(number) FROM numbers(1000000000) GROUP BY {key} FORMAT Null + SELECT sum(number), count(number) FROM numbers(1000000000) GROUP BY {key} FORMAT Null + SELECT sum(number), count(number) FROM numbers(1000000000) GROUP BY {key} SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null + SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) GROUP BY {key} FORMAT Null + SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) GROUP BY {key} SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null