mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-28 02:21:59 +00:00
Remove functions of other keys in GROUP BY section (#10051)
This commit is contained in:
parent
c5229301fc
commit
9fc37e1f75
@ -409,6 +409,7 @@ struct Settings : public SettingsCollection<Settings>
|
||||
\
|
||||
M(SettingDateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic' and 'best_effort'.", 0) \
|
||||
\
|
||||
M(SettingBool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
|
||||
M(SettingBool, input_format_values_interpret_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser and try to interpret it as SQL expression.", 0) \
|
||||
M(SettingBool, input_format_values_deduce_templates_of_expressions, true, "For Values format: if the field could not be parsed by streaming parser, run SQL parser, deduce template of the SQL expression, try to parse all rows using template and then interpret expression for all rows.", 0) \
|
||||
M(SettingBool, input_format_values_accurate_types_of_literals, true, "For Values format: when parsing and interpreting expressions using template, check actual type of literal to avoid possible overflow and precision issues.", 0) \
|
||||
|
115
src/Interpreters/GroupByFunctionKeysVisitor.h
Normal file
115
src/Interpreters/GroupByFunctionKeysVisitor.h
Normal file
@ -0,0 +1,115 @@
|
||||
#pragma once
|
||||
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Interpreters/InDepthNodeVisitor.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Parsers/ASTSelectQuery.h>
|
||||
#include <Parsers/ASTSetQuery.h>
|
||||
#include <Parsers/ASTTablesInSelectQuery.h>
|
||||
#include <Parsers/IAST.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
|
||||
/// recursive traversal and check for optimizeGroupByFunctionKeys
|
||||
struct KeepFunctionMatcher
|
||||
{
|
||||
struct Data
|
||||
{
|
||||
std::unordered_set<String> & key_names_to_keep;
|
||||
bool & keep_key;
|
||||
};
|
||||
|
||||
using Visitor = InDepthNodeVisitor<KeepFunctionMatcher, true>;
|
||||
|
||||
static bool needChildVisit(const ASTPtr & node, const ASTPtr &)
|
||||
{
|
||||
return !(node->as<ASTFunction>());
|
||||
}
|
||||
|
||||
static void visit(ASTFunction * function_node, Data & data)
|
||||
{
|
||||
if ((function_node->arguments->children).empty())
|
||||
{
|
||||
data.keep_key = true;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!data.key_names_to_keep.count(function_node->getColumnName()))
|
||||
{
|
||||
Visitor(data).visit(function_node->arguments);
|
||||
}
|
||||
}
|
||||
|
||||
static void visit(ASTIdentifier * ident, Data & data)
|
||||
{
|
||||
if (!data.key_names_to_keep.count(ident->shortName()))
|
||||
{
|
||||
/// if variable of a function is not in GROUP BY keys, this function should not be deleted
|
||||
data.keep_key = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static void visit(const ASTPtr & ast, Data & data)
|
||||
{
|
||||
if (data.keep_key)
|
||||
return;
|
||||
|
||||
if (auto * function_node = ast->as<ASTFunction>())
|
||||
{
|
||||
visit(function_node, data);
|
||||
}
|
||||
else if (auto * ident = ast->as<ASTIdentifier>())
|
||||
{
|
||||
visit(ident, data);
|
||||
}
|
||||
else if (!ast->as<ASTExpressionList>())
|
||||
{
|
||||
data.keep_key = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Visitor type that applies KeepFunctionMatcher through InDepthNodeVisitor.
using KeepFunctionVisitor = InDepthNodeVisitor<KeepFunctionMatcher, true>;
|
||||
|
||||
class GroupByFunctionKeysMatcher
|
||||
{
|
||||
public:
|
||||
struct Data
|
||||
{
|
||||
std::unordered_set<String> & key_names_to_keep;
|
||||
};
|
||||
|
||||
static bool needChildVisit(const ASTPtr & node, const ASTPtr &)
|
||||
{
|
||||
return !(node->as<ASTFunction>());
|
||||
}
|
||||
|
||||
static void visit(ASTFunction * function_node, Data & data)
|
||||
{
|
||||
bool keep_key = false;
|
||||
KeepFunctionVisitor::Data keep_data{data.key_names_to_keep, keep_key};
|
||||
KeepFunctionVisitor(keep_data).visit(function_node->arguments);
|
||||
|
||||
if (!keep_key)
|
||||
(data.key_names_to_keep).erase(function_node->getColumnName());
|
||||
}
|
||||
|
||||
static void visit(const ASTPtr & ast, Data & data)
|
||||
{
|
||||
if (auto * function_node = ast->as<ASTFunction>())
|
||||
{
|
||||
if (!(function_node->arguments->children.empty()))
|
||||
visit(function_node, data);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/// Visitor type that applies GroupByFunctionKeysMatcher through InDepthNodeVisitor.
using GroupByFunctionKeysVisitor = InDepthNodeVisitor<GroupByFunctionKeysMatcher, true>;
|
||||
|
||||
}
|
@ -25,6 +25,7 @@
|
||||
#include <Interpreters/ArithmeticOperationsInAgrFuncOptimize.h>
|
||||
#include <Interpreters/DuplicateDistinctVisitor.h>
|
||||
#include <Interpreters/DuplicateOrderByVisitor.h>
|
||||
#include <Interpreters/GroupByFunctionKeysVisitor.h>
|
||||
|
||||
#include <Parsers/ASTExpressionList.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
@ -346,6 +347,89 @@ void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_colum
|
||||
appendUnusedGroupByColumn(select_query, source_columns);
|
||||
}
|
||||
|
||||
///eliminate functions of other GROUP BY keys
|
||||
void optimizeGroupByFunctionKeys(ASTSelectQuery * select_query, bool optimize_group_by_function_keys)
|
||||
{
|
||||
if (!optimize_group_by_function_keys)
|
||||
return;
|
||||
|
||||
if (!select_query->groupBy())
|
||||
return;
|
||||
|
||||
auto grp_by = select_query->groupBy();
|
||||
auto & group_keys = grp_by->children;
|
||||
|
||||
ASTs modified; ///result
|
||||
std::unordered_set<String> key_names_to_keep; ///set of keys' short names
|
||||
|
||||
///check if optimization is needed while building set
|
||||
bool need_optimization = false;
|
||||
///filling set with short names of keys
|
||||
for (auto & group_key : group_keys)
|
||||
{
|
||||
if (!need_optimization && group_key->as<ASTFunction>())
|
||||
need_optimization = true;
|
||||
|
||||
if (auto * group_key_ident = group_key->as<ASTIdentifier>())
|
||||
{
|
||||
if (key_names_to_keep.count(group_key_ident->shortName()))
|
||||
{
|
||||
///There may be a collision between different tables having similar variables.
|
||||
///Due to the fact that we can't track these conflicts yet,
|
||||
///it's better to disable optimization to avoid elimination necessary keys.
|
||||
need_optimization = false;
|
||||
break;
|
||||
}
|
||||
|
||||
key_names_to_keep.insert(group_key_ident->shortName());
|
||||
continue;
|
||||
}
|
||||
if (auto * group_key_func = group_key->as<ASTFunction>())
|
||||
{
|
||||
key_names_to_keep.insert(group_key_func->getColumnName());
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
key_names_to_keep.insert(group_key->getColumnName());
|
||||
}
|
||||
}
|
||||
if (!need_optimization)
|
||||
return;
|
||||
|
||||
GroupByFunctionKeysVisitor::Data visitor_data{key_names_to_keep};
|
||||
GroupByFunctionKeysVisitor(visitor_data).visit(grp_by);
|
||||
|
||||
modified.reserve(group_keys.size());
|
||||
|
||||
///filling the result
|
||||
for (auto & group_key : group_keys)
|
||||
{
|
||||
if (auto * group_key_func = group_key->as<ASTFunction>())
|
||||
{
|
||||
if (key_names_to_keep.count(group_key_func->getColumnName()))
|
||||
modified.push_back(group_key);
|
||||
|
||||
continue;
|
||||
}
|
||||
if (auto * group_key_ident = group_key->as<ASTIdentifier>())
|
||||
{
|
||||
if (key_names_to_keep.count(group_key_ident->shortName()))
|
||||
modified.push_back(group_key);
|
||||
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (key_names_to_keep.count(group_key->getColumnName()))
|
||||
modified.push_back(group_key);
|
||||
}
|
||||
}
|
||||
|
||||
///modifying the input
|
||||
grp_by->children = modified;
|
||||
}
|
||||
|
||||
/// Remove duplicate items from ORDER BY.
|
||||
void optimizeOrderBy(const ASTSelectQuery * select_query)
|
||||
{
|
||||
@ -843,6 +927,9 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyzeSelect(
|
||||
/// GROUP BY injective function elimination.
|
||||
optimizeGroupBy(select_query, source_columns_set, context);
|
||||
|
||||
/// GROUP BY functions of other keys elimination.
|
||||
optimizeGroupByFunctionKeys(select_query, settings.optimize_group_by_function_keys);
|
||||
|
||||
/// Remove duplicate items from ORDER BY.
|
||||
optimizeOrderBy(select_query);
|
||||
|
||||
|
9
tests/performance/removing_group_by_keys.xml
Normal file
9
tests/performance/removing_group_by_keys.xml
Normal file
@ -0,0 +1,9 @@
|
||||
<test>
|
||||
<preconditions>
|
||||
<table_exists>hits_10m_single</table_exists>
|
||||
<table_exists>hits_100m_single</table_exists>
|
||||
</preconditions>
|
||||
|
||||
<!-- First query groups by two columns; the second adds a key that is a function of those two columns. -->
<query>SELECT avg(length(URL)) FROM hits_10m_single GROUP BY WatchID, CounterID FORMAT Null</query>
|
||||
<query>SELECT avg(length(URL)) FROM hits_10m_single GROUP BY WatchID, CounterID, WatchID - CounterID FORMAT Null</query>
|
||||
</test>
|
@ -0,0 +1,58 @@
|
||||
6931467.646716369
|
||||
6931468.33986355
|
||||
6931469.0330107305
|
||||
6931469.726157911
|
||||
6931470.419305092
|
||||
6931471.112452272
|
||||
3465734.169931768
|
||||
3465734.8630789486
|
||||
3465735.5562261306
|
||||
3465736.24937331
|
||||
3465736.94252049
|
||||
3465735.209652544
|
||||
3465735.209652544
|
||||
3465735.5562261483
|
||||
3465735.9027997246
|
||||
3465735.902799725
|
||||
3465734.516505364
|
||||
3465735.209652544
|
||||
3465735.209652544
|
||||
3465735.9027997246
|
||||
3465735.902799725
|
||||
3465736.595946905
|
||||
3465735.2096525617
|
||||
3465735.9027997428
|
||||
SELECT max(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 2,\n number % 3,\n ((number % 2) + (number % 3)) % 2\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY number % 5\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 3,\n number % 2\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) % 3,\n number % 2\nORDER BY k ASC
|
||||
6931467.646716369
|
||||
6931468.33986355
|
||||
6931469.0330107305
|
||||
6931469.726157911
|
||||
6931470.419305092
|
||||
6931471.112452272
|
||||
3465734.169931768
|
||||
3465734.8630789486
|
||||
3465735.5562261306
|
||||
3465736.24937331
|
||||
3465736.94252049
|
||||
3465735.209652544
|
||||
3465735.209652544
|
||||
3465735.5562261483
|
||||
3465735.9027997246
|
||||
3465735.902799725
|
||||
3465734.516505364
|
||||
3465735.209652544
|
||||
3465735.209652544
|
||||
3465735.9027997246
|
||||
3465735.902799725
|
||||
3465736.595946905
|
||||
3465735.2096525617
|
||||
3465735.9027997428
|
||||
SELECT max(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 2,\n number % 3,\n ((number % 2) + (number % 3)) % 2\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3,\n number % 2\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) % 3,\n number % 2\nORDER BY k ASC
|
30
tests/queries/0_stateless/01300_group_by_other_keys.sql
Normal file
30
tests/queries/0_stateless/01300_group_by_other_keys.sql
Normal file
@ -0,0 +1,30 @@
|
||||
set optimize_group_by_function_keys = 1;
|
||||
set enable_debug_queries = 1;
|
||||
|
||||
SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
|
||||
|
||||
|
||||
analyze SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
|
||||
|
||||
set optimize_group_by_function_keys = 0;
|
||||
|
||||
SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
|
||||
|
||||
analyze SELECT max(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 2, number % 3, (number % 2 + number % 3) % 2 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) % 3, number % 2 ORDER BY k;
|
||||
-- TODO: add a test where different tables have columns with the same name (short-name collision)
|
@ -0,0 +1,24 @@
|
||||
3465735.9027997246
|
||||
3465735.902799725
|
||||
3465736.595946905
|
||||
3465734.169931768
|
||||
3465734.8630789486
|
||||
3465735.5562261306
|
||||
0
|
||||
1
|
||||
4
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n number % 3,\n number % 2\nHAVING avg(log(2) * number) > 3465735.3\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY number % 5\nORDER BY k ASC
|
||||
SELECT (number % 5) * (number % 5) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY number % 5\nORDER BY k ASC
|
||||
3465735.9027997246
|
||||
3465735.902799725
|
||||
3465736.595946905
|
||||
3465734.169931768
|
||||
3465734.8630789486
|
||||
3465735.5562261306
|
||||
0
|
||||
1
|
||||
4
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nGROUP BY \n (number % 2) * (number % 3),\n number % 3,\n number % 2\nHAVING avg(log(2) * number) > 3465735.3\nORDER BY k ASC
|
||||
SELECT avg(log(2) * number) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC
|
||||
SELECT (number % 5) * (number % 5) AS k\nFROM numbers(10000000)\nWHERE ((number % 5) * (number % 5)) < 5\nGROUP BY \n number % 5,\n (number % 5) * (number % 5)\nORDER BY k ASC
|
@ -0,0 +1,21 @@
|
||||
set optimize_group_by_function_keys = 1;
|
||||
set enable_debug_queries = 1;
|
||||
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
|
||||
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
analyze SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
|
||||
set optimize_group_by_function_keys = 0;
|
||||
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
|
||||
SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY (number % 2) * (number % 3), number % 3, number % 2 HAVING avg(log(2) * number) > 3465735.3 ORDER BY k;
|
||||
analyze SELECT avg(log(2) * number) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
||||
analyze SELECT (number % 5) * (number % 5) AS k FROM numbers(10000000) GROUP BY number % 5, ((number % 5) * (number % 5)) HAVING ((number % 5) * (number % 5)) < 5 ORDER BY k;
|
Loading…
Reference in New Issue
Block a user