Merge pull request #47135 from ClickHouse/remove-perf-test-duplicate-order-by-and-distinct

Remove duplicate_order_by_and_distinct optimization
This commit is contained in:
Igor Nikonov 2023-07-25 17:26:22 +02:00 committed by GitHub
commit 9b4357723f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 71 additions and 461 deletions

View File

@ -534,7 +534,6 @@ class IColumn;
M(Bool, convert_query_to_cnf, false, "Convert SELECT query to CNF", 0) \
M(Bool, optimize_or_like_chain, false, "Optimize multiple OR LIKE into multiMatchAny. This optimization should not be enabled by default, because it defies index analysis in some cases.", 0) \
M(Bool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \
M(Bool, optimize_duplicate_order_by_and_distinct, false, "Remove duplicate ORDER BY and DISTINCT if it's possible", 0) \
M(Bool, optimize_redundant_functions_in_order_by, true, "Remove functions from ORDER BY if its argument is also in ORDER BY", 0) \
M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \
M(Bool, optimize_multiif_to_if, true, "Replace 'multiIf' with only one condition to 'if'.", 0) \
@ -832,6 +831,7 @@ class IColumn;
MAKE_OBSOLETE(M, Seconds, drain_timeout, 3) \
MAKE_OBSOLETE(M, UInt64, backup_threads, 16) \
MAKE_OBSOLETE(M, UInt64, restore_threads, 16) \
MAKE_OBSOLETE(M, Bool, optimize_duplicate_order_by_and_distinct, false) \
/** The section above is for obsolete settings. Do not add anything there. */

View File

@ -289,13 +289,6 @@ void optimizeDuplicatesInOrderBy(const ASTSelectQuery * select_query)
elems = std::move(unique_elems);
}
/// Runs DuplicateOrderByVisitor over the whole query tree.
/// The visitor's state is built from the supplied context; whatever rewriting happens
/// is performed by the visitor itself on `query`.
void optimizeDuplicateOrderBy(ASTPtr & query, ContextPtr context)
{
    DuplicateOrderByVisitor::Data visitor_data{context};
    DuplicateOrderByVisitor visitor(visitor_data);
    visitor.visit(query);
}
/// Return simple subselect (without UNIONs or JOINs or SETTINGS) if any
const ASTSelectQuery * getSimpleSubselect(const ASTSelectQuery & select)
{
@ -379,41 +372,6 @@ std::unordered_set<String> getDistinctNames(const ASTSelectQuery & select)
return names;
}
/// Drop DISTINCT from the outer SELECT when its select list consists solely of plain
/// identifiers that together are exactly the set of names reported as DISTINCT
/// by the simple subselect it reads from.
void optimizeDuplicateDistinct(ASTSelectQuery & select)
{
    const auto select_list = select.select();
    if (!select_list || select_list->children.empty())
        return;

    const ASTSelectQuery * inner_select = getSimpleSubselect(select);
    if (inner_select == nullptr)
        return;

    std::unordered_set<String> inner_distinct_names = getDistinctNames(*inner_select);
    std::unordered_set<std::string_view> outer_names;

    /// Only short column names are compared: aliases and table qualifiers are ignored.
    for (const auto & expression : select_list->children)
    {
        const auto * column = expression->as<ASTIdentifier>();

        /// Anything that is not a bare identifier stops the optimization.
        if (column == nullptr)
            return;

        const String & short_name = column->shortName();

        /// The column is not among the subquery's DISTINCT names, so DISTINCT must stay.
        if (inner_distinct_names.count(short_name) == 0)
            return;

        outer_names.emplace(short_name);
    }

    /// The outer list must cover every DISTINCT name of the subquery, e.g.
    /// SELECT DISTINCT a FROM (SELECT DISTINCT a, b FROM ...) -- DISTINCT cannot be removed
    if (outer_names.size() == inner_distinct_names.size())
        select.distinct = false;
}
/// Replace monotonous functions in ORDER BY if they don't participate in the GROUP BY expression,
/// have a single argument, and are not aggregate functions.
void optimizeMonotonousFunctionsInOrderBy(ASTSelectQuery * select_query, ContextPtr context,
@ -830,17 +788,6 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result,
&& !select_query->group_by_with_cube)
optimizeAggregateFunctionsOfGroupByKeys(select_query, query);
/// Remove duplicate ORDER BY and DISTINCT from subqueries.
if (settings.optimize_duplicate_order_by_and_distinct)
{
optimizeDuplicateOrderBy(query, context);
/// DISTINCT has special meaning in Distributed query with enabled distributed_group_by_no_merge
/// TODO: disable Distributed/remote() tables only
if (!settings.distributed_group_by_no_merge)
optimizeDuplicateDistinct(*select_query);
}
/// Remove functions from ORDER BY if its argument is also in ORDER BY
if (settings.optimize_redundant_functions_in_order_by)
optimizeRedundantFunctionsInOrderBy(select_query, context);

View File

@ -1,8 +0,0 @@
<test>
<settings><max_threads>1</max_threads></settings>
<!-- FIXME this should have been an EXPLAIN test, no point in measuring performance to deduce that the query was rewritten -->
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY EventDate, CounterID FORMAT Null</query>
<query>SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single) FORMAT Null</query>
<query>SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY toStartOfWeek(EventDate) FORMAT Null</query>
</test>

View File

@ -1,58 +0,0 @@
SELECT number
FROM
(
SELECT number
FROM
(
SELECT DISTINCT number
FROM numbers(3)
)
)
ORDER BY number ASC
0
1
2
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number ASC
)
ORDER BY number ASC
0
1
2
SELECT number
FROM
(
SELECT number
FROM
(
SELECT DISTINCT number % 2 AS number
FROM numbers(3)
)
)
ORDER BY number ASC
0
1
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number % 2 AS number
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number ASC
)
ORDER BY number ASC
0
1

View File

@ -1,123 +0,0 @@
set optimize_duplicate_order_by_and_distinct = 1;
EXPLAIN SYNTAX SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
set optimize_duplicate_order_by_and_distinct = 0;
EXPLAIN SYNTAX SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
set optimize_duplicate_order_by_and_distinct = 1;
EXPLAIN SYNTAX SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
set optimize_duplicate_order_by_and_distinct = 0;
EXPLAIN SYNTAX SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;

View File

@ -1,46 +0,0 @@
-- Tags: distributed
set query_plan_remove_redundant_distinct = 1;
set optimize_duplicate_order_by_and_distinct = 0;
SET distributed_group_by_no_merge = 0;
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM remote('127.0.0.{1,2}', system.numbers)
LIMIT 1
SETTINGS distributed_group_by_no_merge = 1
);
SET distributed_group_by_no_merge = 1;
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM remote('127.0.0.{1,2}', system.numbers)
LIMIT 1
);
set optimize_duplicate_order_by_and_distinct = 0;
SET distributed_group_by_no_merge = 0;
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM remote('127.0.0.{1,2}', system.numbers)
LIMIT 1
SETTINGS distributed_group_by_no_merge = 1
);
SET distributed_group_by_no_merge = 1;
set optimize_duplicate_order_by_and_distinct = 0;
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM remote('127.0.0.{1,2}', system.numbers)
LIMIT 1
);

View File

@ -1,136 +0,0 @@
SELECT DISTINCT number
FROM numbers(1)
SELECT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
)
SELECT DISTINCT number * 2
FROM
(
SELECT DISTINCT
number * 2,
number
FROM numbers(1)
)
SELECT number
FROM
(
SELECT DISTINCT number * 2 AS number
FROM numbers(1)
)
SELECT
b,
a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
SELECT DISTINCT a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
SELECT a
FROM
(
SELECT DISTINCT a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT DISTINCT a
FROM
(
SELECT
a,
b
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT
a,
b
FROM
(
SELECT
b,
a
FROM
(
SELECT DISTINCT
number AS a,
number AS b
FROM numbers(1)
)
)
SELECT
a,
b
FROM
(
SELECT
b,
a,
a + b
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT DISTINCT a
FROM
(
SELECT a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
) AS t1
CROSS JOIN numbers(2) AS t2
SELECT number
FROM
(
SELECT DISTINCT number
FROM numbers(1) AS t1
CROSS JOIN numbers(2) AS t2
)
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
UNION ALL
SELECT DISTINCT number
FROM numbers(2)
)
0
1

View File

@ -1,32 +0,0 @@
SET optimize_duplicate_order_by_and_distinct = 1;
EXPLAIN SYNTAX SELECT DISTINCT number FROM numbers(1);
EXPLAIN SYNTAX SELECT DISTINCT number FROM (SELECT DISTINCT number FROM numbers(1));
EXPLAIN SYNTAX SELECT DISTINCT number * 2 FROM (SELECT DISTINCT number * 2, number FROM numbers(1));
EXPLAIN SYNTAX SELECT DISTINCT number FROM (SELECT DISTINCT number * 2 AS number FROM numbers(1));
EXPLAIN SYNTAX SELECT DISTINCT b, a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100));
EXPLAIN SYNTAX SELECT DISTINCT a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100));
EXPLAIN SYNTAX SELECT DISTINCT a FROM (SELECT DISTINCT a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
EXPLAIN SYNTAX SELECT DISTINCT a FROM (SELECT DISTINCT a, b FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
EXPLAIN SYNTAX SELECT DISTINCT a, b FROM (SELECT DISTINCT b, a FROM (SELECT DISTINCT number a, number b FROM numbers(1)));
EXPLAIN SYNTAX SELECT DISTINCT a, b FROM (SELECT b, a, a + b FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
EXPLAIN SYNTAX SELECT DISTINCT a FROM (SELECT a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
EXPLAIN SYNTAX SELECT DISTINCT number FROM (SELECT DISTINCT number FROM numbers(1)) t1 CROSS JOIN numbers(2) t2;
EXPLAIN SYNTAX SELECT DISTINCT number FROM (SELECT DISTINCT number FROM numbers(1) t1 CROSS JOIN numbers(2) t2);
EXPLAIN SYNTAX SELECT DISTINCT number FROM
(
(SELECT DISTINCT number FROM numbers(1))
UNION ALL
(SELECT DISTINCT number FROM numbers(2))
);
--
SELECT DISTINCT number FROM
(
(SELECT DISTINCT number FROM numbers(1))
UNION ALL
(SELECT DISTINCT number FROM numbers(2))
)
ORDER BY number;

View File

@ -477,3 +477,32 @@ Expression (Projection)
ReadFromStorage (SystemNumbers)
-- execute
1
-- UNION ALL with DISTINCT => do _not_ remove DISTINCT
-- query
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
UNION ALL
SELECT DISTINCT number
FROM numbers(2)
)
-- explain
Expression (Projection)
Distinct
Distinct (Preliminary DISTINCT)
Union
Expression ((Before ORDER BY + Projection))
Distinct
Distinct (Preliminary DISTINCT)
Expression (Before ORDER BY)
ReadFromStorage (SystemNumbers)
Expression (( + Projection))
Distinct
Distinct (Preliminary DISTINCT)
Expression (Before ORDER BY)
ReadFromStorage (SystemNumbers)
-- execute
0
1

View File

@ -264,3 +264,15 @@ run_query "$query"
echo "-- DISTINCT COUNT() with GROUP BY => do _not_ remove DISTINCT"
query="select distinct count() from numbers(10) group by number"
run_query "$query"
echo "-- UNION ALL with DISTINCT => do _not_ remove DISTINCT"
query="SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
UNION ALL
SELECT DISTINCT number
FROM numbers(2)
)"
run_query "$query"

View File

@ -479,3 +479,32 @@ Expression (Project names)
ReadFromStorage (SystemNumbers)
-- execute
1
-- UNION ALL with DISTINCT => do _not_ remove DISTINCT
-- query
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
UNION ALL
SELECT DISTINCT number
FROM numbers(2)
)
-- explain
Expression (Project names)
Distinct (DISTINCT)
Distinct (Preliminary DISTINCT)
Union
Expression ((Projection + (Change column names to column identifiers + Project names)))
Distinct (DISTINCT)
Distinct (Preliminary DISTINCT)
Expression ((Projection + Change column names to column identifiers))
ReadFromStorage (SystemNumbers)
Expression (( + ( + Project names)))
Distinct (DISTINCT)
Distinct (Preliminary DISTINCT)
Expression ((Projection + Change column names to column identifiers))
ReadFromStorage (SystemNumbers)
-- execute
0
1