Merge pull request #16951 from CurtizJ/fix-groupby-optimization

Fix optimization with 'optimize_aggregators_of_group_by_keys' and joins
This commit is contained in:
alexey-milovidov 2020-11-29 20:47:16 +03:00 committed by GitHub
commit 038a56d243
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 36 additions and 77 deletions

View File

@ -20,8 +20,8 @@ struct KeepAggregateFunctionMatcher
{
struct Data
{
std::unordered_set<String> & group_by_keys;
bool & keep_aggregator;
const NameSet & group_by_keys;
bool keep_aggregator;
};
using Visitor = InDepthNodeVisitor<KeepAggregateFunctionMatcher, true>;
@ -33,7 +33,7 @@ struct KeepAggregateFunctionMatcher
static void visit(ASTFunction & function_node, Data & data)
{
if ((function_node.arguments->children).empty())
if (function_node.arguments->children.empty())
{
data.keep_aggregator = true;
return;
@ -46,13 +46,10 @@ struct KeepAggregateFunctionMatcher
}
static void visit(ASTIdentifier & ident, Data & data)
{
if (!data.group_by_keys.count(ident.shortName()))
{
/// if variable of a function is not in GROUP BY keys, this function should not be deleted
if (!data.group_by_keys.count(ident.getColumnName()))
data.keep_aggregator = true;
return;
}
}
static void visit(const ASTPtr & ast, Data & data)
@ -75,21 +72,21 @@ struct KeepAggregateFunctionMatcher
}
};
using KeepAggregateFunctionVisitor = InDepthNodeVisitor<KeepAggregateFunctionMatcher, true>;
using KeepAggregateFunctionVisitor = KeepAggregateFunctionMatcher::Visitor;
class SelectAggregateFunctionOfGroupByKeysMatcher
{
public:
struct Data
{
std::unordered_set<String> & group_by_keys;
const NameSet & group_by_keys;
};
static bool needChildVisit(const ASTPtr & node, const ASTPtr &)
{
/// Don't descent into table functions and subqueries and special case for ArrayJoin.
return !node->as<ASTSubquery>() &&
!(node->as<ASTTableExpression>() || node->as<ASTSelectWithUnionQuery>() || node->as<ASTArrayJoin>());
return !node->as<ASTSubquery>() && !node->as<ASTTableExpression>()
&& !node->as<ASTSelectWithUnionQuery>() && !node->as<ASTArrayJoin>();
}
static void visit(ASTPtr & ast, Data & data)
@ -99,12 +96,11 @@ public:
if (function_node && (function_node->name == "min" || function_node->name == "max" ||
function_node->name == "any" || function_node->name == "anyLast"))
{
bool keep_aggregator = false;
KeepAggregateFunctionVisitor::Data keep_data{data.group_by_keys, keep_aggregator};
KeepAggregateFunctionVisitor::Data keep_data{data.group_by_keys, false};
KeepAggregateFunctionVisitor(keep_data).visit(function_node->arguments);
/// Place argument of an aggregate function instead of function
if (!keep_aggregator && !function_node->arguments->children.empty())
if (!keep_data.keep_aggregator && !function_node->arguments->children.empty())
{
String alias = function_node->alias;
ast = (function_node->arguments->children[0])->clone();

View File

@ -178,44 +178,22 @@ void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_colum
struct GroupByKeysInfo
{
std::unordered_set<String> key_names; ///set of keys' short names
bool has_identifier = false;
NameSet key_names; ///set of keys' short names
bool has_function = false;
bool has_possible_collision = false;
};
GroupByKeysInfo getGroupByKeysInfo(ASTs & group_keys)
GroupByKeysInfo getGroupByKeysInfo(const ASTs & group_by_keys)
{
GroupByKeysInfo data;
///filling set with short names of keys
for (auto & group_key : group_keys)
/// filling set with short names of keys
for (const auto & group_key : group_by_keys)
{
if (group_key->as<ASTFunction>())
data.has_function = true;
if (auto * group_key_ident = group_key->as<ASTIdentifier>())
{
data.has_identifier = true;
if (data.key_names.count(group_key_ident->shortName()))
{
///There may be a collision between different tables having similar variables.
///Due to the fact that we can't track these conflicts yet,
///it's better to disable some optimizations to avoid elimination necessary keys.
data.has_possible_collision = true;
}
data.key_names.insert(group_key_ident->shortName());
}
else if (auto * group_key_func = group_key->as<ASTFunction>())
{
data.key_names.insert(group_key_func->getColumnName());
}
else
{
data.key_names.insert(group_key->getColumnName());
}
}
return data;
}
@ -226,47 +204,28 @@ void optimizeGroupByFunctionKeys(ASTSelectQuery * select_query)
if (!select_query->groupBy())
return;
auto grp_by = select_query->groupBy();
auto & group_keys = grp_by->children;
auto group_by = select_query->groupBy();
const auto & group_by_keys = group_by->children;
ASTs modified; ///result
GroupByKeysInfo group_by_keys_data = getGroupByKeysInfo(group_keys);
GroupByKeysInfo group_by_keys_data = getGroupByKeysInfo(group_by_keys);
if (!group_by_keys_data.has_function || group_by_keys_data.has_possible_collision)
if (!group_by_keys_data.has_function)
return;
GroupByFunctionKeysVisitor::Data visitor_data{group_by_keys_data.key_names};
GroupByFunctionKeysVisitor(visitor_data).visit(grp_by);
GroupByFunctionKeysVisitor(visitor_data).visit(group_by);
modified.reserve(group_keys.size());
modified.reserve(group_by_keys.size());
///filling the result
for (auto & group_key : group_keys)
{
if (auto * group_key_func = group_key->as<ASTFunction>())
{
if (group_by_keys_data.key_names.count(group_key_func->getColumnName()))
modified.push_back(group_key);
continue;
}
if (auto * group_key_ident = group_key->as<ASTIdentifier>())
{
if (group_by_keys_data.key_names.count(group_key_ident->shortName()))
modified.push_back(group_key);
continue;
}
else
{
/// filling the result
for (const auto & group_key : group_by_keys)
if (group_by_keys_data.key_names.count(group_key->getColumnName()))
modified.push_back(group_key);
}
}
///modifying the input
grp_by->children = modified;
/// modifying the input
group_by->children = modified;
}
/// Eliminates min/max/any-aggregators of functions of GROUP BY keys
@ -275,10 +234,8 @@ void optimizeAggregateFunctionsOfGroupByKeys(ASTSelectQuery * select_query, ASTP
if (!select_query->groupBy())
return;
auto grp_by = select_query->groupBy();
auto & group_keys = grp_by->children;
GroupByKeysInfo group_by_keys_data = getGroupByKeysInfo(group_keys);
const auto & group_by_keys = select_query->groupBy()->children;
GroupByKeysInfo group_by_keys_data = getGroupByKeysInfo(group_by_keys);
SelectAggregateFunctionOfGroupByKeysVisitor::Data visitor_data{group_by_keys_data.key_names};
SelectAggregateFunctionOfGroupByKeysVisitor(visitor_data).visit(node);

View File

@ -0,0 +1,5 @@
SET optimize_aggregators_of_group_by_keys = 1;
SELECT source.key, max(target.key) FROM (SELECT 1 key, 'x' name) source
INNER JOIN (SELECT 2 key, 'x' name) target
ON source.name = target.name
GROUP BY source.key;