From 57d2fd300a18c0cf4fa5ff039ab324c1c5b75c5a Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 19 Jan 2023 00:11:58 +0000 Subject: [PATCH] Fix: correct update of data stream sorting properties after removing sorting --- .../Optimizations/removeRedundantSorting.cpp | 38 ++- .../02496_remove_redundant_sorting.reference | 208 ++++++++++++++ .../02496_remove_redundant_sorting.sh | 263 ++++++++++++++++++ 3 files changed, 501 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/02496_remove_redundant_sorting.reference create mode 100755 tests/queries/0_stateless/02496_remove_redundant_sorting.sh diff --git a/src/Processors/QueryPlan/Optimizations/removeRedundantSorting.cpp b/src/Processors/QueryPlan/Optimizations/removeRedundantSorting.cpp index 37423ac19fd..11e642ed11f 100644 --- a/src/Processors/QueryPlan/Optimizations/removeRedundantSorting.cpp +++ b/src/Processors/QueryPlan/Optimizations/removeRedundantSorting.cpp @@ -90,6 +90,20 @@ private: const Derived & getDerived() const { return *static_cast(this); } + std::unordered_map address2name; + std::unordered_map name_gen; + + std::string getStepId(const IQueryPlanStep* step) + { + const auto step_name = step->getName(); + auto it = address2name.find(step); + if (it != address2name.end()) + return it->second; + + const auto seq_num = name_gen[step_name]++; + return address2name.insert({step, fmt::format("{}{}", step_name, seq_num)}).first->second; + } + protected: void logStep(const char * prefix, const QueryPlan::Node * node) { @@ -100,13 +114,13 @@ protected: &Poco::Logger::get("QueryPlanVisitor"), "{}: {}: {}", prefix, - current_step->getName(), + getStepId(current_step), reinterpret_cast(current_step)); } } }; -constexpr bool debug_logging_enabled = false; +constexpr bool debug_logging_enabled = true; class RemoveRedundantSorting : public QueryPlanVisitor { @@ -182,9 +196,13 @@ private: for (StackWithParent::const_reverse_iterator it = stack.rbegin() + 1; it != stack.rend(); ++it) { const QueryPlan::Node * node = it->node; + /// skip removed sorting steps + auto * step = node->step.get(); + if (dynamic_cast(step) && node != nodes_affect_order.back()) + continue; + logStep("update sorting traits", node); - auto * step = node->step.get(); auto * trans = dynamic_cast(step); if (!trans) { @@ -262,15 +280,16 @@ private: for (StackWithParent::const_reverse_iterator it = stack.rbegin() + 1; it != stack.rend(); ++it) { const QueryPlan::Node * node = it->node; - logStep("checking for stateful function", node); - /// walking though stack until reach node which affects order if (node == node_affect_order) break; const auto * step = node->step.get(); + /// skip removed sorting steps + if (dynamic_cast(step)) + continue; - + logStep("checking for stateful function", node); if (const auto * expr = typeid_cast(step); expr) { if (expr->getExpression()->hasStatefulFunctions()) @@ -296,13 +315,16 @@ private: for (StackWithParent::const_reverse_iterator it = stack.rbegin() + 1; it != stack.rend(); ++it) { const QueryPlan::Node * node = it->node; - logStep("checking path from current sorting", node); - /// walking though stack until reach node which affects order if (node == node_affect_order) break; const auto * step = node->step.get(); + /// skip removed sorting steps + if (dynamic_cast(step)) + continue; + + logStep("checking path from current sorting", node); /// (2) for window function we do ORDER BY in 2 Sorting steps, /// so do not delete Sorting if window function step is on top diff --git a/tests/queries/0_stateless/02496_remove_redundant_sorting.reference b/tests/queries/0_stateless/02496_remove_redundant_sorting.reference new file mode 100644 index 00000000000..14f4593f6b4 --- /dev/null +++ b/tests/queries/0_stateless/02496_remove_redundant_sorting.reference @@ -0,0 +1,208 @@ +-- Disabled query_plan_remove_redundant_sorting +-- ORDER BY clauses in subqueries are untouched +Expression (Projection) +Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + Projection)) + Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + Projection)) + Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression (Before ORDER BY) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 +-- Enabled query_plan_remove_redundant_sorting +-- ORDER BY removes ORDER BY clauses in subqueries +-- query +SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + ORDER BY number ASC + ) + ORDER BY number DESC +) +ORDER BY number ASC +-- explain +Expression (Projection) +Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + (Projection + (Before ORDER BY + (Projection + Before ORDER BY))))) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 +-- execute +0 +1 +2 +-- ORDER BY cannot remove ORDER BY in subquery WITH FILL +-- query +SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + ORDER BY number DESC + ) + ORDER BY number ASC WITH FILL STEP 1 +) +ORDER BY number ASC +-- explain +Expression (Projection) +Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + Projection)) + Header: number UInt64 + Filling + Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + (Projection + Before ORDER BY))) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 +-- execute +0 +1 +2 +-- ORDER BY cannot remove ORDER BY in subquery with LIMIT BY +-- query +SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + ORDER BY number DESC + ) + ORDER BY number ASC + LIMIT 1 BY number +) +ORDER BY number ASC +-- explain +Expression (Projection) +Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + Projection)) + Header: number UInt64 + LimitBy + Header: number UInt64 + Expression (Before LIMIT BY) + Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + (Projection + Before ORDER BY))) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 +-- execute +0 +1 +2 +-- CROSS JOIN with subqueries, nor ORDER BY nor GROUP BY in main query -> only ORDER BY clauses in most inner subqueries will be removed +-- query +SELECT * +FROM +( + SELECT number + FROM + ( + SELECT number + FROM numbers(3) + ORDER BY number DESC + ) + ORDER BY number ASC +) AS t1, +( + SELECT number + FROM + ( + SELECT number + FROM numbers(3) + ORDER BY number ASC + ) + ORDER BY number DESC +) AS t2 +-- explain +Expression ((Projection + Before ORDER BY)) +Header: number UInt64 + t2.number UInt64 + Join (JOIN FillRightFirst) + Header: number UInt64 + t2.number UInt64 + Expression ((Before JOIN + Projection)) + Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + (Projection + Before ORDER BY))) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 + Expression ((Joined actions + (Rename joined columns + Projection))) + Header: t2.number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + (Projection + Before ORDER BY))) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 +-- execute +0 2 +0 1 +0 0 +1 2 +1 1 +1 0 +2 2 +2 1 +2 0 +-- GROUP BY in most inner query makes execution parallelized, and removing inner sorting steps will keep it that way. But need to correctly update data streams sorting properties after removing sorting steps +-- query +SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + GROUP BY number + ORDER BY number ASC + ) + ORDER BY number ASC +) +ORDER BY number ASC +-- explain +Expression (Projection) +Header: number UInt64 + Sorting (Sorting for ORDER BY) + Header: number UInt64 + Expression ((Before ORDER BY + (Projection + (Before ORDER BY + (Projection + Before ORDER BY))))) + Header: number UInt64 + Aggregating + Header: number UInt64 + Expression (Before GROUP BY) + Header: number UInt64 + ReadFromStorage (SystemNumbers) + Header: number UInt64 +-- execute +0 +1 +2 diff --git a/tests/queries/0_stateless/02496_remove_redundant_sorting.sh b/tests/queries/0_stateless/02496_remove_redundant_sorting.sh new file mode 100755 index 00000000000..b563609a522 --- /dev/null +++ b/tests/queries/0_stateless/02496_remove_redundant_sorting.sh @@ -0,0 +1,263 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DISABLE_OPTIMIZATION="SET query_plan_remove_redundant_sorting=0;SET optimize_duplicate_order_by_and_distinct=0" +ENABLE_OPTIMIZATION="SET query_plan_remove_redundant_sorting=1;SET optimize_duplicate_order_by_and_distinct=0" + +echo "-- Disabled query_plan_remove_redundant_sorting" +echo "-- ORDER BY clauses in subqueries are untouched" +query="SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + ORDER BY number ASC + ) + ORDER BY number DESC +) +ORDER BY number ASC" +$CLICKHOUSE_CLIENT -nq "$DISABLE_OPTIMIZATION;EXPLAIN header=1 $query" + +function explain_and_execute { + echo "-- query" + echo "$1" + echo "-- explain" + $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;EXPLAIN header=1 $1" + echo "-- execute" + $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;$1" +} + +echo "-- Enabled query_plan_remove_redundant_sorting" +echo "-- ORDER BY removes ORDER BY clauses in subqueries" +explain_and_execute "$query" + +echo "-- ORDER BY cannot remove ORDER BY in subquery WITH FILL" +query="SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + ORDER BY number DESC + ) + ORDER BY number ASC WITH FILL STEP 1 +) +ORDER BY number ASC" +explain_and_execute "$query" + +echo "-- ORDER BY cannot remove ORDER BY in subquery with LIMIT BY" +query="SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + ORDER BY number DESC + ) + ORDER BY number ASC + LIMIT 1 BY number +) +ORDER BY number ASC" +explain_and_execute "$query" + +echo "-- CROSS JOIN with subqueries, nor ORDER BY nor GROUP BY in main query -> only ORDER BY clauses in most inner subqueries will be removed" +query="SELECT * +FROM +( + SELECT number + FROM + ( + SELECT number + FROM numbers(3) + ORDER BY number DESC + ) + ORDER BY number ASC +) AS t1, +( + SELECT number + FROM + ( + SELECT number + FROM numbers(3) + ORDER BY number ASC + ) + ORDER BY number DESC +) AS t2" +explain_and_execute "$query" + +# SELECT '-- CROSS JOIN with subqueries, ORDER BY in main query -> all ORDER BY clauses will be removed in subqueries'; +# EXPLAIN header=1 +# SELECT * +# FROM +# ( +# SELECT number +# FROM +# ( +# SELECT number +# FROM numbers(3) +# ORDER BY number DESC +# ) +# ORDER BY number ASC +# ) AS t1, +# ( +# SELECT number +# FROM +# ( +# SELECT number +# FROM numbers(3) +# ORDER BY number ASC +# ) +# ORDER BY number DESC +# ) AS t2 +# ORDER BY t1.number ASC; + +# SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY(s) in _all_ subqueries'; +# EXPLAIN header=1 +# SELECT sum(number) +# FROM +# ( +# SELECT * +# FROM +# ( +# SELECT * +# FROM numbers(3) +# ORDER BY number ASC +# ) +# ORDER BY number DESC +# ) +# GROUP BY number; + +# SELECT '-- GROUP BY with aggregation function which depends on order -> keep ORDER BY in first subquery, and eliminate in second subquery'; +# EXPLAIN header=1 +# SELECT any(number) +# FROM +# ( +# SELECT * +# FROM +# ( +# SELECT * +# FROM numbers(3) +# ORDER BY number ASC +# ) +# ORDER BY number DESC +# ) +# GROUP BY number; + +# SELECT '-- query with aggregation function but w/o GROUP BY -> remove sorting'; +# EXPLAIN +# SELECT sum(number) +# FROM +# ( +# SELECT * +# FROM numbers(10) +# ORDER BY number DESC +# ); + +# SELECT '-- check that optimization is applied recursively to subqueries as well'; +# SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY in most inner subquery here'; +# EXPLAIN header=1 +# SELECT a +# FROM +# ( +# SELECT sum(number) AS a +# FROM +# ( +# SELECT * +# FROM numbers(3) +# ORDER BY number ASC +# ) +# GROUP BY number +# ) +# ORDER BY a ASC; + +# SELECT '-- GROUP BY with aggregation function which depends on order -> ORDER BY in subquery is kept due to the aggregation function'; +# EXPLAIN header=1 +# SELECT a +# FROM +# ( +# SELECT any(number) AS a +# FROM +# ( +# SELECT * +# FROM numbers(3) +# ORDER BY number ASC +# ) +# GROUP BY number +# ) +# ORDER BY a ASC; + +# SELECT '-- Check that optimization works for subqueries as well, - main query have neither ORDER BY nor GROUP BY'; +# EXPLAIN header=1 +# SELECT a +# FROM +# ( +# SELECT any(number) AS a +# FROM +# ( +# SELECT * +# FROM +# ( +# SELECT * +# FROM numbers(3) +# ORDER BY number DESC +# ) +# ORDER BY number ASC +# ) +# GROUP BY number +# ) +# WHERE a > 0; + +# SELECT '-- disable common optimization to avoid functions to be lifted up (liftUpFunctions optimization), needed for testing with stateful function'; +# SET query_plan_enable_optimizations = 0; + +# SELECT '-- neighbor() as stateful function prevents removing inner ORDER BY since its result depends on order'; +# EXPLAIN +# SELECT +# number, +# neighbor(number, 2) +# FROM +# ( +# SELECT * +# FROM numbers(10) +# ORDER BY number DESC +# ) +# ORDER BY number ASC; + +# SELECT '-- non-stateful function does _not_ prevent removing inner ORDER BY'; +# EXPLAIN +# SELECT +# number, +# plus(number, 2) +# FROM +# ( +# SELECT * +# FROM numbers(10) +# ORDER BY number DESC +# ) + +echo "-- GROUP BY in most inner query makes execution parallelized, and removing inner sorting steps will keep it that way. But need to correctly update data streams sorting properties after removing sorting steps" +query="SELECT * +FROM +( + SELECT * + FROM + ( + SELECT * + FROM numbers(3) + GROUP BY number + ORDER BY number ASC + ) + ORDER BY number ASC +) +ORDER BY number ASC" +explain_and_execute "$query"