Remove functions of other keys in GROUP BY section (#10051)

This commit is contained in:
xPoSx 2020-06-16 12:14:25 +03:00 committed by GitHub
parent c5229301fc
commit 9fc37e1f75
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 345 additions and 0 deletions

View File

@ -409,6 +409,7 @@ struct Settings : public SettingsCollection<Settings>
\
M(SettingDateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
\
M(SettingBool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \
M(SettingBool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \
M(SettingBool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \

View File

@ -0,0 +1,115 @@
#pragma once
#include <Functions/FunctionFactory.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/IAST.h>
#include <Common/typeid_cast.h>
namespace DB
{
/// recursive traversal and check for optimizeGroupByFunctionKeys
struct KeepFunctionMatcher
{
struct Data
{
std::unordered_set<String> & key_names_to_keep;
bool & keep_key;
};
using Visitor = InDepthNodeVisitor<KeepFunctionMatcher, true>;
static bool needChildVisit(const ASTPtr & node, const ASTPtr &)
{
return !(node->as<ASTFunction>());
}
static void visit(ASTFunction * function_node, Data & data)
{
if ((function_node->arguments->children).empty())
{
data.keep_key = true;
return;
}
if (!data.key_names_to_keep.count(function_node->getColumnName()))
{
Visitor(data).visit(function_node->arguments);
}
}
static void visit(ASTIdentifier * ident, Data & data)
{
if (!data.key_names_to_keep.count(ident->shortName()))
{
/// if variable of a function is not in GROUP BY keys, this function should not be deleted
data.keep_key = true;
return;
}
}
static void visit(const ASTPtr & ast, Data & data)
{
if (data.keep_key)
return;
if (auto * function_node = ast->as<ASTFunction>())
{
visit(function_node, data);
}
else if (auto * ident = ast->as<ASTIdentifier>())
{
visit(ident, data);
}
else if (!ast->as<ASTExpressionList>())
{
data.keep_key = true;
}
}
};
using KeepFunctionVisitor = InDepthNodeVisitor<KeepFunctionMatcher, true>;
class GroupByFunctionKeysMatcher
{
public:
struct Data
{
std::unordered_set<String> & key_names_to_keep;
};
static bool needChildVisit(const ASTPtr & node, const ASTPtr &)
{
return !(node->as<ASTFunction>());
}
static void visit(ASTFunction * function_node, Data & data)
{
bool keep_key = false;
KeepFunctionVisitor::Data keep_data{data.key_names_to_keep, keep_key};
KeepFunctionVisitor(keep_data).visit(function_node->arguments);
if (!keep_key)
(data.key_names_to_keep).erase(function_node->getColumnName());
}
static void visit(const ASTPtr & ast, Data & data)
{
if (auto * function_node = ast->as<ASTFunction>())
{
if (!(function_node->arguments->children.empty()))
visit(function_node, data);
}
}
};
using GroupByFunctionKeysVisitor = InDepthNodeVisitor<GroupByFunctionKeysMatcher, true>;
}

View File

@ -25,6 +25,7 @@
#include <Interpreters/ArithmeticOperationsInAgrFuncOptimize.h>
#include <Interpreters/DuplicateDistinctVisitor.h>
#include <Interpreters/DuplicateOrderByVisitor.h>
#include <Interpreters/GroupByFunctionKeysVisitor.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
@ -346,6 +347,89 @@ void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_colum
appendUnusedGroupByColumn(select_query, source_columns);
}
/// Eliminates GROUP BY keys that are functions of other GROUP BY keys,
/// e.g. `GROUP BY x, f(x)` is rewritten to `GROUP BY x`.
///
/// The GROUP BY expression list of `select_query` is modified in place.
/// Nothing is changed when:
///  - `optimize_group_by_function_keys` is false or the query has no GROUP BY;
///  - the query uses WITH ROLLUP or WITH CUBE;
///  - none of the keys is a function;
///  - two identifier keys share the same short name.
void optimizeGroupByFunctionKeys(ASTSelectQuery * select_query, bool optimize_group_by_function_keys)
{
    if (!optimize_group_by_function_keys)
        return;

    if (!select_query->groupBy())
        return;

    /// With ROLLUP / CUBE the set of keys defines which subtotal rows are produced,
    /// so removing even a functionally dependent key would change the query result.
    if (select_query->group_by_with_rollup || select_query->group_by_with_cube)
        return;

    auto grp_by = select_query->groupBy();
    auto & group_keys = grp_by->children;

    /// Name under which a key is tracked: short name for identifiers, column name for everything else.
    const auto key_name = [](const ASTPtr & key) -> String
    {
        if (auto * ident = key->as<ASTIdentifier>())
            return ident->shortName();
        return key->getColumnName();
    };

    std::unordered_set<String> key_names_to_keep; /// names of the keys that must stay in GROUP BY
    bool has_function_key = false;                /// without a function key there is nothing to eliminate

    for (const auto & group_key : group_keys)
    {
        if (group_key->as<ASTFunction>())
            has_function_key = true;

        if (auto * group_key_ident = group_key->as<ASTIdentifier>())
        {
            /// There may be a collision between columns of different tables that have the same short name.
            /// Such conflicts cannot be tracked yet, so the optimization is disabled
            /// to avoid eliminating necessary keys.
            if (!key_names_to_keep.insert(group_key_ident->shortName()).second)
                return;
            continue;
        }

        key_names_to_keep.insert(group_key->getColumnName());
    }

    if (!has_function_key)
        return;

    /// Erases from the set the names of functions that depend only on other keys.
    GroupByFunctionKeysVisitor::Data visitor_data{key_names_to_keep};
    GroupByFunctionKeysVisitor(visitor_data).visit(grp_by);

    /// Rebuild the key list from the keys whose names survived, preserving their order.
    ASTs modified;
    modified.reserve(group_keys.size());
    for (const auto & group_key : group_keys)
    {
        if (key_names_to_keep.count(key_name(group_key)))
            modified.push_back(group_key);
    }

    /// Replace the keys without copying the vector once more.
    grp_by->children.swap(modified);
}
/// Remove duplicate items from ORDER BY.
void optimizeOrderBy(const ASTSelectQuery * select_query)
{
@ -843,6 +927,9 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyzeSelect(
/// GROUP BY injective function elimination.
optimizeGroupBy(select_query, source_columns_set, context);
/// GROUP BY functions of other keys elimination.
optimizeGroupByFunctionKeys(select_query, settings.optimize_group_by_function_keys);
/// Remove duplicate items from ORDER BY.
optimizeOrderBy(select_query);

View File

@ -0,0 +1,9 @@
<test>
<!-- Runs the same aggregation twice: grouped by two columns, and grouped by the same
     two columns plus an expression computed from them (WatchID - CounterID). -->
<!-- NOTE(review): hits_100m_single is listed as a precondition, but no query below reads it. -->
<preconditions>
<table_exists>hits_10m_single</table_exists>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<query>SELECT avg(length(URL)) FROM hits_10m_single GROUP BY WatchID, CounterID FORMAT Null</query>
<query>SELECT avg(length(URL)) FROM hits_10m_single GROUP BY WatchID, CounterID, WatchID - CounterID FORMAT Null</query>
</test>

View File

@ -0,0 +1,58 @@
6931467.646716369
6931468.33986355
6931469.0330107305
6931469.726157911
6931470.419305092
6931471.112452272
3465734.169931768
3465734.8630789486
3465735.5562261306
3465736.24937331
3465736.94252049
3465735.209652544
3465735.209652544
3465735.5562261483
3465735.9027997246
3465735.902799725
3465734.516505364
3465735.209652544
3465735.209652544
3465735.9027997246
3465735.902799725
3465736.595946905
3465735.2096525617
3465735.9027997428
SELECT max(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 2,\n number % 3,\n ((number % 2) + (number % 3)) % 2\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY number % 5\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 3,\n number % 2\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) % 3,\n number % 2\nORDER BY k ASC
6931467.646716369
6931468.33986355
6931469.0330107305
6931469.726157911
6931470.419305092
6931471.112452272
3465734.169931768
3465734.8630789486
3465735.5562261306
3465736.24937331
3465736.94252049
3465735.209652544
3465735.209652544
3465735.5562261483
3465735.9027997246
3465735.902799725
3465734.516505364
3465735.209652544
3465735.209652544
3465735.9027997246
3465735.902799725
3465736.595946905
3465735.2096525617
3465735.9027997428
SELECT max(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 2,\n number % 3,\n ((number % 2) + (number % 3)) % 2\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3,\n number % 2\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) % 3,\n number % 2\nORDER BY k ASC

View File

@ -0,0 +1,30 @@
-- GROUP BY keys that are functions of other GROUP BY keys.
-- The same five queries are executed and then passed to `analyze`,
-- first with optimize_group_by_function_keys enabled, then with it disabled.

-- Optimization enabled: run the queries.
set optimize_group_by_function_keys = 1;
set enable_debug_queries = 1;
SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
-- Optimization enabled: analyze the same queries.
analyze SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
-- Optimization disabled: run the queries.
set optimize_group_by_function_keys = 0;
SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
-- Optimization disabled: analyze the same queries.
analyze SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
-- TODO: add a test with identically named columns of different tables (name collision).

View File

@ -0,0 +1,24 @@
3465735.9027997246
3465735.902799725
3465736.595946905
3465734.169931768
3465734.8630789486
3465735.5562261306
0
1
4
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 3,\n number % 2\nHAVING avg(log(2) * number) > 3465735.3\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY number % 5\nORDER BY k ASC
SELECT (number % 5) * (number % 5) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY number % 5\nORDER BY k ASC
3465735.9027997246
3465735.902799725
3465736.595946905
3465734.169931768
3465734.8630789486
3465735.5562261306
0
1
4
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3,\n number % 2\nHAVING avg(log(2) * number) > 3465735.3\nORDER BY k ASC
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC
SELECT (number % 5) * (number % 5) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC

View File

@ -0,0 +1,21 @@
-- GROUP BY keys that are functions of other GROUP BY keys, combined with HAVING.
-- The same three queries are executed and then passed to `analyze`,
-- first with optimize_group_by_function_keys enabled, then with it disabled.

-- Optimization enabled: run the queries.
set optimize_group_by_function_keys = 1;
set enable_debug_queries = 1;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
-- Optimization enabled: analyze the same queries.
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
analyze SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
-- Optimization disabled: run the queries.
set optimize_group_by_function_keys = 0;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
-- Optimization disabled: analyze the same queries.
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
analyze SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;