This commit is contained in:
Alexander Kuzmenkov 2021-04-15 19:40:49 +03:00
parent 3b95b637a5
commit 2489b6af96
4 changed files with 66 additions and 40 deletions

View File

@ -66,7 +66,12 @@ reportStageEnd('parse')
subst_elems = root.findall('substitutions/substitution')
available_parameters = {} # { 'table': ['hits_10m', 'hits_100m'], ... }
for e in subst_elems:
available_parameters[e.find('name').text] = [v.text for v in e.findall('values/value')]
name = e.find('name').text
values = [v.text for v in e.findall('values/value')]
if not values:
raise Exception(f'No values given for substitution {{{name}}}')
available_parameters[name] = values
# Takes parallel lists of templates, substitutes them with all combos of
# parameters. The set of parameters is determined based on the first list.

View File

@ -426,7 +426,7 @@ class IColumn;
M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \
M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \
M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \
M(Bool, optimize_fuse_sum_count_avg, false, "If enabled, Fuse aggregate functions when exists at least two: sum, avg, count functions with identical argument to sumCount", 0) \
M(Bool, optimize_fuse_sum_count_avg, false, "Fuse aggregate functions sum(), avg(), count() with identical arguments into one sumCount() call, if the query has at least two different functions", 0) \
M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \
M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \
M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \

View File

@ -182,24 +182,28 @@ struct CustomizeAggregateFunctionsMoveSuffixData
}
};
struct FuseFunctions
struct FuseSumCountAggregates
{
std::vector<ASTFunction *> sums {};
std::vector<ASTFunction *> counts {};
std::vector<ASTFunction *> avgs {};
void addFuncNode(ASTFunction & func)
void addFuncNode(ASTFunction * func)
{
if (func.name == "sum")
sums.push_back(&func);
else if (func.name == "count")
counts.push_back(&func);
else if (func.name == "avg")
avgs.push_back(&func);
if (func->name == "sum")
sums.push_back(func);
else if (func->name == "count")
counts.push_back(func);
else
{
assert(func->name == "avg");
avgs.push_back(func);
}
}
bool canBeFused() const
{
// Need at least two different kinds of functions to fuse.
if (sums.empty() && counts.empty())
return false;
if (sums.empty() && avgs.empty())
@ -210,11 +214,11 @@ struct FuseFunctions
}
};
struct CustomizeFuseAggregateFunctionsData
struct FuseSumCountAggregatesVisitorData
{
using TypeToVisit = ASTFunction;
std::unordered_map<String, DB::FuseFunctions> fuse_map;
std::unordered_map<String, FuseSumCountAggregates> fuse_map;
void visit(ASTFunction & func, ASTPtr &)
{
@ -223,19 +227,19 @@ struct CustomizeFuseAggregateFunctionsData
if (func.arguments->children.empty())
return;
ASTIdentifier * ident = func.arguments->children.at(0)->as<ASTIdentifier>();
if (!ident)
return;
auto it = fuse_map.find(ident->name());
// Probably we can extend it to match count() for a non-nullable argument
// to sum/avg with any other argument. For now we require a strict match.
const auto argument = func.arguments->children.at(0)->getColumnName();
auto it = fuse_map.find(argument);
if (it != fuse_map.end())
{
it->second.addFuncNode(func);
it->second.addFuncNode(&func);
}
else
{
DB::FuseFunctions funcs{};
funcs.addFuncNode(func);
fuse_map[ident->name()] = funcs;
FuseSumCountAggregates funcs{};
funcs.addFuncNode(&func);
fuse_map[argument] = funcs;
}
}
}
@ -243,7 +247,7 @@ struct CustomizeFuseAggregateFunctionsData
using CustomizeAggregateFunctionsOrNullVisitor = InDepthNodeVisitor<OneTypeMatcher<CustomizeAggregateFunctionsSuffixData>, true>;
using CustomizeAggregateFunctionsMoveOrNullVisitor = InDepthNodeVisitor<OneTypeMatcher<CustomizeAggregateFunctionsMoveSuffixData>, true>;
using CustomizeFuseAggregateFunctionsVisitor = InDepthNodeVisitor<OneTypeMatcher<CustomizeFuseAggregateFunctionsData>, true>;
using FuseSumCountAggregatesVisitor = InDepthNodeVisitor<OneTypeMatcher<FuseSumCountAggregatesVisitorData>, true>;
/// Translate qualified names such as db.table.column, table.column, table_alias.column to names' normal form.
/// Expand asterisks and qualified asterisks with column names.
@ -261,7 +265,9 @@ void translateQualifiedNames(ASTPtr & query, const ASTSelectQuery & select_query
throw Exception("Empty list of columns in SELECT query", ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED);
}
void rewriterFusedFunction(String column_name, ASTFunction & func)
// Replaces one avg/sum/count function with an appropriate expression with
// sumCount().
void replaceWithSumCount(String column_name, ASTFunction & func)
{
auto func_base = makeASTFunction("sumCount", std::make_shared<ASTIdentifier>(column_name));
auto exp_list = std::make_shared<ASTExpressionList>();
@ -286,18 +292,18 @@ void rewriterFusedFunction(String column_name, ASTFunction & func)
func.children.push_back(func.arguments);
}
void fuseCandidates(std::unordered_map<String, DB::FuseFunctions> &fuse_map)
void fuseSumCountAggregates(std::unordered_map<String, FuseSumCountAggregates> & fuse_map)
{
for (auto & it : fuse_map)
{
if (it.second.canBeFused())
{
for (auto & func: it.second.sums)
rewriterFusedFunction(it.first, *func);
replaceWithSumCount(it.first, *func);
for (auto & func: it.second.avgs)
rewriterFusedFunction(it.first, *func);
replaceWithSumCount(it.first, *func);
for (auto & func: it.second.counts)
rewriterFusedFunction(it.first, *func);
replaceWithSumCount(it.first, *func);
}
}
}
@ -1012,12 +1018,15 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const NameSet &
CustomizeGlobalNotInVisitor(data_global_not_null_in).visit(query);
}
/// Try to fuse sum/avg/count with identical column(at least two functions exist) to sumCount()
// Try to fuse sum/avg/count with identical arguments to one sumCount call,
// if we have at least two different functions. E.g. we will replace sum(x)
// and count(x) with sumCount(x).1 and sumCount(x).2, and sumCount() will
// be calculated only once because of CSE.
if (settings.optimize_fuse_sum_count_avg)
{
CustomizeFuseAggregateFunctionsVisitor::Data data;
CustomizeFuseAggregateFunctionsVisitor(data).visit(query);
fuseCandidates(data.fuse_map);
FuseSumCountAggregatesVisitor::Data data;
FuseSumCountAggregatesVisitor(data).visit(query);
fuseSumCountAggregates(data.fuse_map);
}
/// Rewrite all aggregate functions to add -OrNull suffix to them

View File

@ -1,15 +1,27 @@
<test>
<settings>
<optimize_fuse_sum_count_avg>true</optimize_fuse_sum_count_avg>
</settings>
<!-- We test rewriting sum(), avg(), count() to a single call of sumCount() here.
As a reference, we use the same queries with the optimization disabled.
sum() has a highly optimized algorithm, so alone it will be faster than sumCount(),
but when we add count() or avg(), the sumCount() should win.-->
<query>SELECT sum(number), avg(number) FROM numbers(1000000000)</query>
<query>SELECT sum(number), avg(number), count(number) FROM numbers(1000000000)</query>
<query>SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) settings optimize_fuse_sum_count_avg = 0</query>
<query>SELECT sum(number), count(number) FROM numbers(1000000000)</query>
<query>SELECT sum(number), count(number) FROM numbers(1000000000) settings optimize_fuse_sum_count_avg = 0</query>
<query>SELECT sum(number) FROM numbers(1000000000)</query>
but when we add count() or avg(), the sumCount() should win.
Also test GROUP BY with and without keys, because they might have different
optimizations. -->
<settings>
<optimize_fuse_sum_count_avg>1</optimize_fuse_sum_count_avg>
</settings>
<substitutions>
<substitution>
<name>key</name>
<values>
<value>1</value>
<value>intHash32(number) % 1000</value>
</values>
</substitution>
</substitutions>
<query>SELECT sum(number) FROM numbers(1000000000) GROUP BY {key} FORMAT Null</query>
<query>SELECT sum(number), count(number) FROM numbers(1000000000) GROUP BY {key} FORMAT Null</query>
<query>SELECT sum(number), count(number) FROM numbers(1000000000) GROUP BY {key} SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null</query>
<query>SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) GROUP BY {key} FORMAT Null</query>
<query>SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) GROUP BY {key} SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null</query>
</test>