Fix: correct update of data stream sorting properties after removing

sorting
This commit is contained in:
Igor Nikonov 2023-01-19 00:11:58 +00:00
parent 1866f990de
commit 57d2fd300a
3 changed files with 501 additions and 8 deletions

View File

@ -90,6 +90,20 @@ private:
const Derived & getDerived() const { return *static_cast<Derived *>(this); }
std::unordered_map<const IQueryPlanStep*, std::string> address2name;
std::unordered_map<std::string, UInt32> name_gen;
std::string getStepId(const IQueryPlanStep* step)
{
const auto step_name = step->getName();
auto it = address2name.find(step);
if (it != address2name.end())
return it->second;
const auto seq_num = name_gen[step_name]++;
return address2name.insert({step, fmt::format("{}{}", step_name, seq_num)}).first->second;
}
protected:
void logStep(const char * prefix, const QueryPlan::Node * node)
{
@ -100,13 +114,13 @@ protected:
&Poco::Logger::get("QueryPlanVisitor"),
"{}: {}: {}",
prefix,
current_step->getName(),
getStepId(current_step),
reinterpret_cast<const void *>(current_step));
}
}
};
constexpr bool debug_logging_enabled = false;
constexpr bool debug_logging_enabled = true;
class RemoveRedundantSorting : public QueryPlanVisitor<RemoveRedundantSorting, debug_logging_enabled>
{
@ -182,9 +196,13 @@ private:
for (StackWithParent::const_reverse_iterator it = stack.rbegin() + 1; it != stack.rend(); ++it)
{
const QueryPlan::Node * node = it->node;
/// skip removed sorting steps
auto * step = node->step.get();
if (dynamic_cast<const SortingStep *>(step) && node != nodes_affect_order.back())
continue;
logStep("update sorting traits", node);
auto * step = node->step.get();
auto * trans = dynamic_cast<ITransformingStep *>(step);
if (!trans)
{
@ -262,15 +280,16 @@ private:
for (StackWithParent::const_reverse_iterator it = stack.rbegin() + 1; it != stack.rend(); ++it)
{
const QueryPlan::Node * node = it->node;
logStep("checking for stateful function", node);
/// walking though stack until reach node which affects order
if (node == node_affect_order)
break;
const auto * step = node->step.get();
/// skip removed sorting steps
if (dynamic_cast<const SortingStep*>(step))
continue;
logStep("checking for stateful function", node);
if (const auto * expr = typeid_cast<const ExpressionStep *>(step); expr)
{
if (expr->getExpression()->hasStatefulFunctions())
@ -296,13 +315,16 @@ private:
for (StackWithParent::const_reverse_iterator it = stack.rbegin() + 1; it != stack.rend(); ++it)
{
const QueryPlan::Node * node = it->node;
logStep("checking path from current sorting", node);
/// walking though stack until reach node which affects order
if (node == node_affect_order)
break;
const auto * step = node->step.get();
/// skip removed sorting steps
if (dynamic_cast<const SortingStep *>(step))
continue;
logStep("checking path from current sorting", node);
/// (2) for window function we do ORDER BY in 2 Sorting steps,
/// so do not delete Sorting if window function step is on top

View File

@ -0,0 +1,208 @@
-- Disabled query_plan_remove_redundant_sorting
-- ORDER BY clauses in subqueries are untouched
Expression (Projection)
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + Projection))
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + Projection))
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression (Before ORDER BY)
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- Enabled query_plan_remove_redundant_sorting
-- ORDER BY removes ORDER BY clauses in subqueries
-- query
SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
)
ORDER BY number ASC
-- explain
Expression (Projection)
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + (Projection + (Before ORDER BY + (Projection + Before ORDER BY)))))
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- execute
0
1
2
-- ORDER BY cannot remove ORDER BY in subquery WITH FILL
-- query
SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC WITH FILL STEP 1
)
ORDER BY number ASC
-- explain
Expression (Projection)
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + Projection))
Header: number UInt64
Filling
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + (Projection + Before ORDER BY)))
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- execute
0
1
2
-- ORDER BY cannot remove ORDER BY in subquery with LIMIT BY
-- query
SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC
LIMIT 1 BY number
)
ORDER BY number ASC
-- explain
Expression (Projection)
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + Projection))
Header: number UInt64
LimitBy
Header: number UInt64
Expression (Before LIMIT BY)
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + (Projection + Before ORDER BY)))
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- execute
0
1
2
-- CROSS JOIN with subqueries, nor ORDER BY nor GROUP BY in main query -> only ORDER BY clauses in most inner subqueries will be removed
-- query
SELECT *
FROM
(
SELECT number
FROM
(
SELECT number
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC
) AS t1,
(
SELECT number
FROM
(
SELECT number
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
) AS t2
-- explain
Expression ((Projection + Before ORDER BY))
Header: number UInt64
t2.number UInt64
Join (JOIN FillRightFirst)
Header: number UInt64
t2.number UInt64
Expression ((Before JOIN + Projection))
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + (Projection + Before ORDER BY)))
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
Expression ((Joined actions + (Rename joined columns + Projection)))
Header: t2.number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + (Projection + Before ORDER BY)))
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- execute
0 2
0 1
0 0
1 2
1 1
1 0
2 2
2 1
2 0
-- GROUP BY in most inner query makes execution parallelized, and removing inner sorting steps will keep it that way. But need to correctly update data streams sorting properties after removing sorting steps
-- query
SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
GROUP BY number
ORDER BY number ASC
)
ORDER BY number ASC
)
ORDER BY number ASC
-- explain
Expression (Projection)
Header: number UInt64
Sorting (Sorting for ORDER BY)
Header: number UInt64
Expression ((Before ORDER BY + (Projection + (Before ORDER BY + (Projection + Before ORDER BY)))))
Header: number UInt64
Aggregating
Header: number UInt64
Expression (Before GROUP BY)
Header: number UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- execute
0
1
2

View File

@ -0,0 +1,263 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
DISABLE_OPTIMIZATION="SET query_plan_remove_redundant_sorting=0;SET optimize_duplicate_order_by_and_distinct=0"
ENABLE_OPTIMIZATION="SET query_plan_remove_redundant_sorting=1;SET optimize_duplicate_order_by_and_distinct=0"
echo "-- Disabled query_plan_remove_redundant_sorting"
echo "-- ORDER BY clauses in subqueries are untouched"
query="SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
)
ORDER BY number ASC"
$CLICKHOUSE_CLIENT -nq "$DISABLE_OPTIMIZATION;EXPLAIN header=1 $query"
function explain_and_execute {
echo "-- query"
echo "$1"
echo "-- explain"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;EXPLAIN header=1 $1"
echo "-- execute"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;$1"
}
echo "-- Enabled query_plan_remove_redundant_sorting"
echo "-- ORDER BY removes ORDER BY clauses in subqueries"
explain_and_execute "$query"
echo "-- ORDER BY cannot remove ORDER BY in subquery WITH FILL"
query="SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC WITH FILL STEP 1
)
ORDER BY number ASC"
explain_and_execute "$query"
echo "-- ORDER BY cannot remove ORDER BY in subquery with LIMIT BY"
query="SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC
LIMIT 1 BY number
)
ORDER BY number ASC"
explain_and_execute "$query"
echo "-- CROSS JOIN with subqueries, nor ORDER BY nor GROUP BY in main query -> only ORDER BY clauses in most inner subqueries will be removed"
query="SELECT *
FROM
(
SELECT number
FROM
(
SELECT number
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC
) AS t1,
(
SELECT number
FROM
(
SELECT number
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
) AS t2"
explain_and_execute "$query"
# SELECT '-- CROSS JOIN with subqueries, ORDER BY in main query -> all ORDER BY clauses will be removed in subqueries';
# EXPLAIN header=1
# SELECT *
# FROM
# (
# SELECT number
# FROM
# (
# SELECT number
# FROM numbers(3)
# ORDER BY number DESC
# )
# ORDER BY number ASC
# ) AS t1,
# (
# SELECT number
# FROM
# (
# SELECT number
# FROM numbers(3)
# ORDER BY number ASC
# )
# ORDER BY number DESC
# ) AS t2
# ORDER BY t1.number ASC;
# SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY(s) in _all_ subqueries';
# EXPLAIN header=1
# SELECT sum(number)
# FROM
# (
# SELECT *
# FROM
# (
# SELECT *
# FROM numbers(3)
# ORDER BY number ASC
# )
# ORDER BY number DESC
# )
# GROUP BY number;
# SELECT '-- GROUP BY with aggregation function which depends on order -> keep ORDER BY in first subquery, and eliminate in second subquery';
# EXPLAIN header=1
# SELECT any(number)
# FROM
# (
# SELECT *
# FROM
# (
# SELECT *
# FROM numbers(3)
# ORDER BY number ASC
# )
# ORDER BY number DESC
# )
# GROUP BY number;
# SELECT '-- query with aggregation function but w/o GROUP BY -> remove sorting';
# EXPLAIN
# SELECT sum(number)
# FROM
# (
# SELECT *
# FROM numbers(10)
# ORDER BY number DESC
# );
# SELECT '-- check that optimization is applied recursively to subqueries as well';
# SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY in most inner subquery here';
# EXPLAIN header=1
# SELECT a
# FROM
# (
# SELECT sum(number) AS a
# FROM
# (
# SELECT *
# FROM numbers(3)
# ORDER BY number ASC
# )
# GROUP BY number
# )
# ORDER BY a ASC;
# SELECT '-- GROUP BY with aggregation function which depends on order -> ORDER BY in subquery is kept due to the aggregation function';
# EXPLAIN header=1
# SELECT a
# FROM
# (
# SELECT any(number) AS a
# FROM
# (
# SELECT *
# FROM numbers(3)
# ORDER BY number ASC
# )
# GROUP BY number
# )
# ORDER BY a ASC;
# SELECT '-- Check that optimization works for subqueries as well, - main query have neither ORDER BY nor GROUP BY';
# EXPLAIN header=1
# SELECT a
# FROM
# (
# SELECT any(number) AS a
# FROM
# (
# SELECT *
# FROM
# (
# SELECT *
# FROM numbers(3)
# ORDER BY number DESC
# )
# ORDER BY number ASC
# )
# GROUP BY number
# )
# WHERE a > 0;
# SELECT '-- disable common optimization to avoid functions to be lifted up (liftUpFunctions optimization), needed for testing with stateful function';
# SET query_plan_enable_optimizations = 0;
# SELECT '-- neighbor() as stateful function prevents removing inner ORDER BY since its result depends on order';
# EXPLAIN
# SELECT
# number,
# neighbor(number, 2)
# FROM
# (
# SELECT *
# FROM numbers(10)
# ORDER BY number DESC
# )
# ORDER BY number ASC;
# SELECT '-- non-stateful function does _not_ prevent removing inner ORDER BY';
# EXPLAIN
# SELECT
# number,
# plus(number, 2)
# FROM
# (
# SELECT *
# FROM numbers(10)
# ORDER BY number DESC
# )
echo "-- GROUP BY in most inner query makes execution parallelized, and removing inner sorting steps will keep it that way. But need to correctly update data streams sorting properties after removing sorting steps"
query="SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
GROUP BY number
ORDER BY number ASC
)
ORDER BY number ASC
)
ORDER BY number ASC"
explain_and_execute "$query"