GROUP BY with check if aggr func depends on order

+ tests
This commit is contained in:
Igor Nikonov 2022-12-07 20:16:58 +00:00
parent 5684453f09
commit 49694578ba
4 changed files with 184 additions and 124 deletions

View File

@ -59,7 +59,7 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node,
size_t tryDistinctReadInOrder(QueryPlan::Node * node);
/// Remove redundant ORDER BY clauses in subqueries
void tryRemoveRedundantOrderBy(QueryPlan::Node * parent_node);
void tryRemoveRedundantOrderBy(QueryPlan::Node * root);
/// Put some steps under union, so that plan optimisation could be applied to union parts separately.
/// For example, the plan can be rewritten like:

View File

@ -1,3 +1,4 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <Processors/QueryPlan/AggregatingStep.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/FillingStep.h>
@ -25,7 +26,6 @@ void printStepName(const char * prefix, const QueryPlan::Node * node)
void tryRemoveRedundantOrderBy(QueryPlan::Node * root)
{
// do top down find first order by or group by
struct Frame
{
QueryPlan::Node * node = nullptr;
@ -36,7 +36,7 @@ void tryRemoveRedundantOrderBy(QueryPlan::Node * root)
std::vector<Frame> stack;
stack.push_back({.node = root});
std::vector<IQueryPlanStep *> steps_affect_order;
std::vector<QueryPlan::Node *> nodes_affect_order;
while (!stack.empty())
{
@ -53,25 +53,35 @@ void tryRemoveRedundantOrderBy(QueryPlan::Node * root)
printStepName("visit", current_node);
/// if there is parent node which can affect order and current step is sorting
/// then check if we can remove the sorting step (and corresponding expression step)
if (!steps_affect_order.empty() && typeid_cast<SortingStep *>(current_step))
if (!nodes_affect_order.empty() && typeid_cast<SortingStep *>(current_step))
{
auto try_to_remove_sorting_step = [&]() -> bool
{
QueryPlan::Node * node_affect_order = nodes_affect_order.back();
IQueryPlanStep * step_affect_order = node_affect_order->step.get();
/// if there are LIMITs on top of ORDER BY, the ORDER BY is non-removable
/// if ORDER BY is with FILL WITH, it is non-removable
if (typeid_cast<LimitStep *>(steps_affect_order.back()) || typeid_cast<LimitByStep *>(steps_affect_order.back())
|| typeid_cast<FillingStep *>(steps_affect_order.back()))
if (typeid_cast<LimitStep *>(step_affect_order) || typeid_cast<LimitByStep *>(step_affect_order)
|| typeid_cast<FillingStep *>(step_affect_order))
return false;
bool remove_sorting = false;
/// TODO: (0) check stateful functions
/// (1) aggregation
if (const AggregatingStep * parent_aggr = typeid_cast<AggregatingStep *>(steps_affect_order.back()); parent_aggr)
if (const AggregatingStep * parent_aggr = typeid_cast<AggregatingStep *>(step_affect_order); parent_aggr)
{
/// TODO: check if it contains aggregation functions which depends on order
auto const & aggregates = parent_aggr->getParams().aggregates;
for (const auto & aggregate : aggregates)
{
auto aggregate_function_properties
= AggregateFunctionFactory::instance().tryGetProperties(aggregate.function->getName());
if (aggregate_function_properties && aggregate_function_properties->is_order_dependent)
return false;
}
remove_sorting = true;
}
/// (2) sorting
else if (SortingStep * parent_sorting = typeid_cast<SortingStep *>(steps_affect_order.back()); parent_sorting)
else if (typeid_cast<SortingStep *>(step_affect_order))
{
remove_sorting = true;
}
@ -109,7 +119,7 @@ void tryRemoveRedundantOrderBy(QueryPlan::Node * root)
|| typeid_cast<AggregatingStep *>(current_step)) /// (4) aggregation change order
{
printStepName("steps_affect_order/push", current_node);
steps_affect_order.push_back(current_step);
nodes_affect_order.push_back(current_node);
}
}
@ -124,10 +134,12 @@ void tryRemoveRedundantOrderBy(QueryPlan::Node * root)
}
/// bottom-up visit
if (!steps_affect_order.empty() && steps_affect_order.back() == current_node->step.get())
/// we come here when all children of current_node are visited,
/// so it's a node which affect order, remove it from the corresponding stack
if (!nodes_affect_order.empty() && nodes_affect_order.back() == current_node)
{
printStepName("steps_affect_order/pop", current_node);
steps_affect_order.pop_back();
printStepName("node_affect_order/pop", current_node);
nodes_affect_order.pop_back();
}
printStepName("pop", current_node);

View File

@ -58,15 +58,6 @@ Header: number UInt64
Header: number_1 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- GROUP BY removes ORDER BY in _all_ subqueries
Expression ((Project names + Projection))
Header: number UInt64
Aggregating
Header: number_0 UInt64
Expression ((Before GROUP BY + (Change column names to column identifiers + (Project names + (Projection + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))))))
Header: number_0 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- CROSS JOIN with subqueries, nor ORDER BY nor GROUP BY in main query -> only ORDER BY clauses in most inner subqueries will be removed
Expression ((Project names + (Projection + DROP unused columns after JOIN)))
Header: t1.number UInt64
@ -111,3 +102,76 @@ Header: t1.number UInt64
Header: t2.number_1 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY(s) in _all_ subqueries
Expression ((Project names + Projection))
Header: sum(number) UInt64
Aggregating
Header: number_0 UInt64
sum(number_0) UInt64
Expression ((Before GROUP BY + (Change column names to column identifiers + (Project names + (Projection + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))))))
Header: number_0 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- GROUP BY with aggregation function which depends on order -> keep ORDER BY in first subquery, and eliminate in second subquery
Expression ((Project names + Projection))
Header: any(number) UInt64
Aggregating
Header: number_0 UInt64
any(number_0) UInt64
Expression ((Before GROUP BY + (Change column names to column identifiers + Project names)))
Header: number_0 UInt64
Sorting (Sorting for ORDER BY)
Header: number_1 UInt64
Expression ((Before ORDER BY + (Projection + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))))
Header: number_1 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- check that optimization is applied recursively to subqueries as well
-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY in most inner subquery here
Expression (Project names)
Header: a UInt64
Sorting (Sorting for ORDER BY)
Header: a_0 UInt64
Expression ((Before ORDER BY + (Projection + (Change column names to column identifiers + (Project names + Projection)))))
Header: a_0 UInt64
Aggregating
Header: number_1 UInt64
sum(number_1) UInt64
Expression ((Before GROUP BY + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers)))))
Header: number_1 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- GROUP BY with aggregation function which depends on order -> ORDER BY in subquery is kept due to the aggregation function
Expression (Project names)
Header: a UInt64
Sorting (Sorting for ORDER BY)
Header: a_0 UInt64
Expression ((Before ORDER BY + (Projection + (Change column names to column identifiers + (Project names + Projection)))))
Header: a_0 UInt64
Aggregating
Header: number_1 UInt64
any(number_1) UInt64
Expression ((Before GROUP BY + (Change column names to column identifiers + Project names)))
Header: number_1 UInt64
Sorting (Sorting for ORDER BY)
Header: number_2 UInt64
Expression ((Before ORDER BY + (Projection + Change column names to column identifiers)))
Header: number_2 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64
-- Check that optimization works for subqueries as well, - main query have neither ORDER BY nor GROUP BY
Expression ((Project names + Projection))
Header: a UInt64
Filter ((WHERE + (Change column names to column identifiers + (Project names + Projection))))
Header: a_0 UInt64
Aggregating
Header: number_1 UInt64
any(number_1) UInt64
Expression ((Before GROUP BY + (Change column names to column identifiers + Project names)))
Header: number_1 UInt64
Sorting (Sorting for ORDER BY)
Header: number_2 UInt64
Expression ((Before ORDER BY + (Projection + (Change column names to column identifiers + (Project names + (Projection + Change column names to column identifiers))))))
Header: number_2 UInt64
ReadFromStorage (SystemNumbers)
Header: number UInt64

View File

@ -73,108 +73,6 @@ FROM
)
ORDER BY number ASC;
SELECT '-- GROUP BY removes ORDER BY in _all_ subqueries';
EXPLAIN header=1
SELECT *
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
)
GROUP BY number;
-- SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY(s) in _all_ subqueries';
-- EXPLAIN
-- SELECT sum(number)
-- FROM
-- (
-- SELECT *
-- FROM
-- (
-- SELECT *
-- FROM numbers(3)
-- ORDER BY number ASC
-- )
-- ORDER BY number DESC
-- )
-- GROUP BY number;
-- SELECT '-- GROUP BY with aggregation function which depends on order -> keep ORDER BY in first subquery, and eliminate in second subquery';
-- EXPLAIN
-- SELECT any(number)
-- FROM
-- (
-- SELECT *
-- FROM
-- (
-- SELECT *
-- FROM numbers(3)
-- ORDER BY number ASC
-- )
-- ORDER BY number DESC
-- )
-- GROUP BY number;
-- SELECT '-- check that optimization is applied recursively to subqueries as well';
-- SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY in most inner subquery here';
-- EXPLAIN
-- SELECT a
-- FROM
-- (
-- SELECT sum(number) AS a
-- FROM
-- (
-- SELECT *
-- FROM numbers(3)
-- ORDER BY number ASC
-- )
-- GROUP BY number
-- )
-- ORDER BY a ASC;
-- SELECT '-- GROUP BY with aggregation function which depends on order -> ORDER BY in subquery is kept due to the aggregation function';
-- EXPLAIN
-- SELECT a
-- FROM
-- (
-- SELECT any(number) AS a
-- FROM
-- (
-- SELECT *
-- FROM numbers(3)
-- ORDER BY number ASC
-- )
-- GROUP BY number
-- )
-- ORDER BY a ASC;
-- SELECT '-- Check that optimization works for subqueries as well, - main query have nor ORDER BY nor GROUP BY';
-- EXPLAIN
-- SELECT a
-- FROM
-- (
-- SELECT any(number) AS a
-- FROM
-- (
-- SELECT *
-- FROM
-- (
-- SELECT *
-- FROM numbers(3)
-- ORDER BY number DESC
-- )
-- ORDER BY number ASC
-- )
-- GROUP BY number
-- )
-- WHERE a > 0;
SELECT '-- CROSS JOIN with subqueries, nor ORDER BY nor GROUP BY in main query -> only ORDER BY clauses in most inner subqueries will be removed';
EXPLAIN header=1
SELECT *
@ -225,3 +123,89 @@ FROM
ORDER BY number DESC
) AS t2
ORDER BY t1.number ASC;
SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY(s) in _all_ subqueries';
EXPLAIN header=1
SELECT sum(number)
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
)
GROUP BY number;
SELECT '-- GROUP BY with aggregation function which depends on order -> keep ORDER BY in first subquery, and eliminate in second subquery';
EXPLAIN header=1
SELECT any(number)
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
ORDER BY number DESC
)
GROUP BY number;
SELECT '-- check that optimization is applied recursively to subqueries as well';
SELECT '-- GROUP BY with aggregation function which does NOT depend on order -> eliminate ORDER BY in most inner subquery here';
EXPLAIN header=1
SELECT a
FROM
(
SELECT sum(number) AS a
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
GROUP BY number
)
ORDER BY a ASC;
SELECT '-- GROUP BY with aggregation function which depends on order -> ORDER BY in subquery is kept due to the aggregation function';
EXPLAIN header=1
SELECT a
FROM
(
SELECT any(number) AS a
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number ASC
)
GROUP BY number
)
ORDER BY a ASC;
SELECT '-- Check that optimization works for subqueries as well, - main query have neither ORDER BY nor GROUP BY';
EXPLAIN header=1
SELECT a
FROM
(
SELECT any(number) AS a
FROM
(
SELECT *
FROM
(
SELECT *
FROM numbers(3)
ORDER BY number DESC
)
ORDER BY number ASC
)
GROUP BY number
)
WHERE a > 0;