Merge pull request #38950 from hexiaoting/dev-prewhere

Support moving `where` conditions over sorting key expressions to `prewhere` for queries with `final`
This commit is contained in:
Han Fei 2023-02-15 18:18:19 +01:00 committed by GitHub
commit b1524196c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 149 additions and 91 deletions

View File

@ -77,8 +77,17 @@ public:
return std::make_shared<DataTypeUInt8>();
}
/// Dry-run variant: forwards to execute() with dry_run = true.
/// In the visible part of execute(), the actual sleep and the ProfileEvents updates are skipped when dry_run is set.
ColumnPtr executeImplDryRun(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
{
return execute(arguments, result_type, true);
}
/// Regular execution: forwards to execute() with dry_run = false, so the sleep is really performed
/// and the SleepFunctionCalls / SleepFunctionMicroseconds profile events are incremented.
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
{
return execute(arguments, result_type, false);
}
ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, bool dry_run) const
{
const IColumn * col = arguments[0].column.get();
@ -99,11 +108,14 @@ public:
if (seconds > 3.0) /// The choice is arbitrary
throw Exception(ErrorCodes::TOO_SLOW, "The maximum sleep time is 3 seconds. Requested: {}", toString(seconds));
UInt64 count = (variant == FunctionSleepVariant::PerBlock ? 1 : size);
UInt64 microseconds = static_cast<UInt64>(seconds * count * 1e6);
sleepForMicroseconds(microseconds);
ProfileEvents::increment(ProfileEvents::SleepFunctionCalls, count);
ProfileEvents::increment(ProfileEvents::SleepFunctionMicroseconds, microseconds);
if (!dry_run)
{
UInt64 count = (variant == FunctionSleepVariant::PerBlock ? 1 : size);
UInt64 microseconds = static_cast<UInt64>(seconds * count * 1e6);
sleepForMicroseconds(microseconds);
ProfileEvents::increment(ProfileEvents::SleepFunctionCalls, count);
ProfileEvents::increment(ProfileEvents::SleepFunctionMicroseconds, microseconds);
}
}
/// convertToFullColumn needed, because otherwise (constant expression case) function will not get called on each columns.

View File

@ -42,10 +42,6 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
, log{log_}
, column_sizes{std::move(column_sizes_)}
{
const auto & primary_key = metadata_snapshot->getPrimaryKey();
if (!primary_key.column_names.empty())
first_primary_key_column = primary_key.column_names[0];
for (const auto & name : queried_columns)
{
auto it = column_sizes.find(name);
@ -193,8 +189,9 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node,
/// Condition depend on some column. Constant expressions are not moved.
!cond.identifiers.empty()
&& !cannotBeMoved(node, is_final)
/// Do not take into consideration the conditions consisting only of the first primary key column
&& !hasPrimaryKeyAtoms(node)
/// With FINAL, only consider conditions that are expressions over the sorting key. Because a FINAL select
/// needs to use all sorting keys, filtering on other columns before the final merge would cause correctness issues.
&& (!is_final || isExpressionOverSortingKey(node))
/// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded.
&& isSubsetOfTableColumns(cond.identifiers)
/// Do not move conditions involving all queried columns.
@ -320,48 +317,22 @@ UInt64 MergeTreeWhereOptimizer::getIdentifiersColumnSize(const NameSet & identif
return size;
}
bool MergeTreeWhereOptimizer::hasPrimaryKeyAtoms(const ASTPtr & ast) const
bool MergeTreeWhereOptimizer::isExpressionOverSortingKey(const ASTPtr & ast) const
{
if (const auto * func = ast->as<ASTFunction>())
{
const auto & args = func->arguments->children;
if ((func->name == "not" && 1 == args.size()) || func->name == "and" || func->name == "or")
for (const auto & arg : args)
{
for (const auto & arg : args)
if (hasPrimaryKeyAtoms(arg))
return true;
return false;
if (isConstant(ast) || sorting_key_names.contains(arg->getColumnName()))
continue;
if (!isExpressionOverSortingKey(arg))
return false;
}
return true;
}
return isPrimaryKeyAtom(ast);
}
bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const
{
if (const auto * func = ast->as<ASTFunction>())
{
if (!KeyCondition::atom_map.contains(func->name))
return false;
const auto & args = func->arguments->children;
if (args.size() != 2)
return false;
const auto & first_arg_name = args.front()->getColumnName();
const auto & second_arg_name = args.back()->getColumnName();
if ((first_primary_key_column == first_arg_name && isConstant(args[1]))
|| (first_primary_key_column == second_arg_name && isConstant(args[0]))
|| (first_primary_key_column == first_arg_name && functionIsInOrGlobalInOperator(func->name)))
return true;
}
return false;
return isConstant(ast) || sorting_key_names.contains(ast->getColumnName());
}

View File

@ -83,9 +83,7 @@ private:
UInt64 getIdentifiersColumnSize(const NameSet & identifiers) const;
bool hasPrimaryKeyAtoms(const ASTPtr & ast) const;
bool isPrimaryKeyAtom(const ASTPtr & ast) const;
bool isExpressionOverSortingKey(const ASTPtr & ast) const;
bool isSortingKey(const String & column_name) const;
@ -105,7 +103,6 @@ private:
using StringSet = std::unordered_set<std::string>;
String first_primary_key_column;
const StringSet table_columns;
const Names queried_columns;
const NameSet sorting_key_names;

View File

@ -179,7 +179,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
WHERE id = 1
2000-01-01 1 test string 1 1
@ -203,7 +203,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
WHERE id = 1
)
@ -229,7 +229,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
) AS b
WHERE id = 1
)
@ -248,7 +248,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
WHERE id = 1
2000-01-01 1 test string 1 1
@ -272,7 +272,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
WHERE id = 1
)
@ -291,7 +291,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
) AS b
WHERE id = 1
2000-01-01 1 test string 1 1
@ -315,7 +315,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
) AS a
WHERE id = 1
) AS b
@ -332,7 +332,7 @@ FROM
date,
min(value) AS value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
GROUP BY
id,
date
@ -352,7 +352,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
UNION ALL
SELECT
date,
@ -360,7 +360,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
WHERE id = 1
2000-01-01 1 test string 1 1
@ -381,7 +381,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
ANY LEFT JOIN
(
@ -441,7 +441,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
ANY LEFT JOIN
(
@ -532,7 +532,7 @@ FROM
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
) AS a
ANY LEFT JOIN
(
@ -579,7 +579,7 @@ SEMI LEFT JOIN
name,
value
FROM test_00597
WHERE id = 1
PREWHERE id = 1
)
WHERE id = 1
) AS r USING (id)

View File

@ -24,6 +24,6 @@ FROM
n,
finalizeAggregation(s)
FROM test_00808_push_down_with_finalizeAggregation
WHERE (n <= 5) AND (n >= 2)
PREWHERE (n <= 5) AND (n >= 2)
)
WHERE (n >= 2) AND (n <= 5)

View File

@ -293,8 +293,8 @@ select * from (select * from tab where (a + b) * c = 8 union all select * from t
select * from (explain plan actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab3 where (a + b) * c = 18) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%';
Prefix sort description: sin(divide(a, b)) ASC
Result sort description: sin(divide(a, b)) ASC
ReadType: InOrder
ReadType: InOrder
ReadType: InOrder
ReadType: InOrder
select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b);
2 2 2 2
2 2 2 2
@ -311,7 +311,7 @@ select * from (select * from tab where (a + b) * c = 8 union all select * from t
select * from (explain plan actions = 1 select * from (select * from tab where (a + b) * c = 8 union all select * from tab4) order by sin(a / b)) where explain like '%sort description%' or explain like '%ReadType%';
Prefix sort description: sin(divide(a, b)) ASC
Result sort description: sin(divide(a, b)) ASC
ReadType: InOrder
ReadType: InOrder
ReadType: InOrder
select * from (select * from tab union all select * from tab5) order by (a + b) * c;
0 0 0 0
@ -403,3 +403,8 @@ select * from (explain plan actions = 1 select * from (select * from tab union a
Sort description: multiply(plus(a, b), c) ASC, sin(divide(a, b)) ASC, d ASC
Limit 3
ReadType: Default
drop table if exists tab;
drop table if exists tab2;
drop table if exists tab3;
drop table if exists tab4;
drop table if exists tab5;

View File

@ -1,5 +1,11 @@
SET optimize_read_in_order = 1, query_plan_read_in_order=1;
drop table if exists tab;
drop table if exists tab2;
drop table if exists tab3;
drop table if exists tab4;
drop table if exists tab5;
create table tab (a UInt32, b UInt32, c UInt32, d UInt32) engine = MergeTree order by ((a + b) * c, sin(a / b));
insert into tab select number, number, number, number from numbers(5);
insert into tab select number, number, number, number from numbers(5);
@ -142,3 +148,9 @@ select * from (explain plan actions = 1 select * from (select * from tab union a
-- In case of tab4, we do full sorting by ((a + b) * c, sin(a / b), d) with LIMIT. We can replace it to sorting by ((a + b) * c, sin(a / b)) and LIMIT WITH TIES, when sorting alog support it.
select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3;
select * from (explain plan actions = 1 select * from (select * from tab union all select * from tab5 union all select * from tab4) order by (a + b) * c, sin(a / b), d limit 3) where explain ilike '%sort description%' or explain like '%ReadType%' or explain like '%Limit%';
drop table if exists tab;
drop table if exists tab2;
drop table if exists tab3;
drop table if exists tab4;
drop table if exists tab5;

View File

@ -1,5 +1,20 @@
optimize_move_to_prewhere_if_final = 1
SELECT
x,
y,
z
FROM prewhere_move_select_final
PREWHERE x > 100
SELECT
x,
y,
z
FROM prewhere_move_select_final
FINAL
PREWHERE x > 100
SELECT
x,
y,
@ -15,6 +30,21 @@ FROM prewhere_move_select_final
FINAL
PREWHERE y > 100
SELECT
x,
y,
z
FROM prewhere_move_select_final
PREWHERE (x + y) > 100
SELECT
x,
y,
z
FROM prewhere_move_select_final
FINAL
PREWHERE (x + y) > 100
SELECT
x,
y,
@ -32,6 +62,24 @@ FINAL
PREWHERE y > 100
WHERE (y > 100) AND (z > 400)
SELECT
x,
y,
z
FROM prewhere_move_select_final
FINAL
PREWHERE x > 50
WHERE (x > 50) AND (z > 400)
SELECT
x,
y,
z
FROM prewhere_move_select_final
FINAL
PREWHERE (x + y) > 50
WHERE ((x + y) > 50) AND (z > 400)
optimize_move_to_prewhere_if_final = 0
SELECT

View File

@ -11,17 +11,29 @@ SET optimize_move_to_prewhere_if_final = 1;
-- order key can be pushed down with final
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE x > 100;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x > 100;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE y > 100;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE x + y > 100;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x + y > 100;
-- can not be pushed down
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE z > 400;
-- only y can be pushed down
-- only condition with x/y can be pushed down
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100 and z > 400;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x > 50 and z > 400;
select '';
EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE x + y > 50 and z > 400;
select '';
select 'optimize_move_to_prewhere_if_final = 0';

View File

@ -97,12 +97,12 @@
ReadType: InReverseOrder
Parts: 1
Granules: 3
ReadFromMergeTree (default.idx)
Indexes:
PrimaryKey
Keys:
x
plus(x, y)
Condition: or((x in 2-element set), (plus(plus(x, y), 1) in (-Inf, 2]))
Parts: 1/1
Granules: 1/1
ReadFromMergeTree (default.idx)
Indexes:
PrimaryKey
Keys:
x
plus(x, y)
Condition: or((x in 2-element set), (plus(plus(x, y), 1) in (-Inf, 2]))
Parts: 1/1
Granules: 1/1

View File

@ -64,10 +64,8 @@ ExpressionTransform
(Sorting)
(Expression)
ExpressionTransform
(Filter)
FilterTransform
(ReadFromMergeTree)
MergeTreeInOrder 0 → 1
(ReadFromMergeTree)
MergeTreeInOrder 0 → 1
2020-10-11 0 0
2020-10-11 0 10
2020-10-11 0 20
@ -82,15 +80,20 @@ ExpressionTransform
PartialSortingTransform
(Expression)
ExpressionTransform
(Filter)
FilterTransform
(ReadFromMergeTree)
MergeTreeInOrder 0 → 1
(ReadFromMergeTree)
MergeTreeInOrder 0 → 1
2020-10-12 0
2020-10-12 1
2020-10-12 2
2020-10-12 3
2020-10-12 4
SELECT
date,
i
FROM t_read_in_order
PREWHERE date = \'2020-10-12\'
ORDER BY i DESC
LIMIT 5
(Expression)
ExpressionTransform
(Limit)
@ -98,11 +101,9 @@ ExpressionTransform
(Sorting)
(Expression)
ExpressionTransform
(Filter)
FilterTransform
(ReadFromMergeTree)
ReverseTransform
MergeTreeReverse 0 → 1
(ReadFromMergeTree)
ReverseTransform
MergeTreeReverse 0 → 1
2020-10-12 99999
2020-10-12 99998
2020-10-12 99997

View File

@ -30,6 +30,7 @@ INSERT INTO t_read_in_order SELECT '2020-10-12', number, number FROM numbers(100
SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i LIMIT 5;
EXPLAIN SYNTAX SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i DESC LIMIT 5;
EXPLAIN PIPELINE SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i DESC LIMIT 5;
SELECT date, i FROM t_read_in_order WHERE date = '2020-10-12' ORDER BY i DESC LIMIT 5;

View File

@ -55,7 +55,6 @@ MergeTreeThread
Sorting (Stream): a ASC, b ASC
Sorting (Stream): a ASC, b ASC
Sorting (Stream): a ASC, b ASC
Sorting (Stream): a ASC, b ASC
-- check that reading in order optimization for ORDER BY and DISTINCT applied correctly in the same query
-- disabled, check that sorting description for ReadFromMergeTree match ORDER BY columns
Sorting (Stream): a ASC