fix DISTINCT and ORDER BY

2024-11-24 08:32:02 +00:00 · 2020-12-25 06:13:30 +03:00 · 2020-12-25 06:13:30 +03:00 · 5bd025a180
commit 5bd025a180
parent 8c72fa063d
5 changed files with 87 additions and 40 deletions
--- a/src/Interpreters/ExpressionAnalyzer.cpp
+++ b/src/Interpreters/ExpressionAnalyzer.cpp
@ -970,7 +970,9 @@ void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments(
    ExpressionActionsChain::Step & step = chain.lastStep(aggregated_columns);

    // 1) Add actions for window functions and their arguments;
-    // 2) Mark the columns that are really required.
+    // 2) Mark the columns that are really required. We have to mark them as
+    //    required because we finish the expression chain before processing the
+    //    window functions.
    for (const auto & [_, w] : window_descriptions)
    {
        for (const auto & f : w.window_functions)
@ -1425,49 +1427,47 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
        /// If there is aggregation, we execute expressions in SELECT and ORDER BY on the initiating server, otherwise on the source servers.
        query_analyzer.appendSelect(chain, only_types || (need_aggregate ? !second_stage : !first_stage));

-        has_order_by = query.orderBy() != nullptr;
-        // FIXME selected columns was set here. Should split order by and select
-        // and insert window functions in between.
-        // Will fix:
-        // 1) window function in DISTINCT
-        // 2) totally broken DISTINCT
-        // 3) window functions in order by
-        before_order_and_select = query_analyzer.appendOrderBy(
-                chain,
-                only_types || (need_aggregate ? !second_stage : !first_stage),
-                optimize_read_in_order,
-                order_by_elements_actions);
-
+        // Window functions are processed in a separate expression chain after
+        // the main SELECT, similar to what we do for aggregate functions.
        if (has_window)
        {
            query_analyzer.appendWindowFunctionsArguments(chain, only_types || !first_stage);

-            auto select_required_columns = chain.getLastStep().required_output;

            for (const auto & x : chain.getLastActions()->getNamesAndTypesList())
            {
                query_analyzer.columns_after_window.push_back(x);
            }

+            before_window = chain.getLastActions();
            finalize_chain(chain);

            auto & step = chain.lastStep(query_analyzer.columns_after_window);
-            step.required_output = select_required_columns;
+
+            // The output of this expression chain is the result of
+            // SELECT (before "final projection" i.e. renaming the columns), so
+            // we have to mark the expressions that are required in the output,
+            // again. We did it for the previous expression chain ("select w/o
+            // window functions") earlier, in appendSelect(). But that chain also
+            // produced the expressions required to calculate window functions.
+            // They are not needed in the final SELECT result. Knowing the correct
+            // list of columns is important when we apply SELECT DISTINCT later.
            const auto * select_query = query_analyzer.getSelectQuery();
            for (const auto & child : select_query->select()->children)
            {
-                if (const auto * function
-                        = typeid_cast<const ASTFunction *>(child.get());
-                    function && function->is_window_function)
-                {
-                    // See its counterpart in appendSelect()
-                    step.required_output.push_back(child->getColumnName());
-                }
+                step.required_output.push_back(child->getColumnName());
            }
        }

        selected_columns = chain.getLastStep().required_output;

+        has_order_by = query.orderBy() != nullptr;
+        before_order_by = query_analyzer.appendOrderBy(
+                chain,
+                only_types || (need_aggregate ? !second_stage : !first_stage),
+                optimize_read_in_order,
+                order_by_elements_actions);
+
        if (query_analyzer.appendLimitBy(chain, only_types || !second_stage))
        {
            before_limit_by = chain.getLastActions();
@ -1479,7 +1479,6 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
        finalize_chain(chain);
    }

-
    /// Before executing WHERE and HAVING, remove the extra columns from the block (mostly the aggregation keys).
    removeExtraColumns();

@ -1610,9 +1609,9 @@ std::string ExpressionAnalysisResult::dump() const
        ss << "before_window " << before_window->dumpDAG() << "\n";
    }

-    if (before_order_and_select)
+    if (before_order_by)
    {
-        ss << "before_order_and_select " << before_order_and_select->dumpDAG() << "\n";
+        ss << "before_order_by " << before_order_by->dumpDAG() << "\n";
    }

    if (before_limit_by)
@ -1625,6 +1624,20 @@ std::string ExpressionAnalysisResult::dump() const
        ss << "final_projection " << final_projection->dumpDAG() << "\n";
    }

+    if (!selected_columns.empty())
+    {
+        ss << "selected_columns ";
+        for (size_t i = 0; i < selected_columns.size(); i++)
+        {
+            if (i > 0)
+            {
+                ss << ", ";
+            }
+            ss << backQuote(selected_columns[i]);
+        }
+        ss << "\n";
+    }
+
    return ss.str();
 }

--- a/src/Interpreters/ExpressionAnalyzer.h
+++ b/src/Interpreters/ExpressionAnalyzer.h
@ -204,11 +204,12 @@ struct ExpressionAnalysisResult
    ActionsDAGPtr before_aggregation;
    ActionsDAGPtr before_having;
    ActionsDAGPtr before_window;
-    ActionsDAGPtr before_order_and_select;
+    ActionsDAGPtr before_order_by;
    ActionsDAGPtr before_limit_by;
    ActionsDAGPtr final_projection;

-    /// Columns from the SELECT list, before renaming them to aliases.
+    /// Columns from the SELECT list, before renaming them to aliases. Used to
+    /// perform SELECT DISTINCT.
    Names selected_columns;

    /// Columns will be removed after prewhere actions execution.
--- a/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/src/Interpreters/InterpreterSelectQuery.cpp
@ -538,7 +538,10 @@ Block InterpreterSelectQuery::getSampleBlockImpl()
    if (options.to_stage == QueryProcessingStage::Enum::WithMergeableState)
    {
        if (!analysis_result.need_aggregate)
-            return analysis_result.before_order_and_select->getResultColumns();
+        {
+            // What's the difference with selected_columns?
+            return analysis_result.before_order_by->getResultColumns();
+        }

        Block header = analysis_result.before_aggregation->getResultColumns();

@ -564,7 +567,8 @@ Block InterpreterSelectQuery::getSampleBlockImpl()

    if (options.to_stage == QueryProcessingStage::Enum::WithMergeableStateAfterAggregation)
    {
-        return analysis_result.before_order_and_select->getResultColumns();
+        // What's the difference with selected_columns?
+        return analysis_result.before_order_by->getResultColumns();
    }

    return analysis_result.final_projection->getResultColumns();
@ -958,8 +962,9 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
            }
            else
            {
-                executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
+                executeExpression(query_plan, expressions.before_window, "Before window functions");
                executeWindow(query_plan);
+                executeExpression(query_plan, expressions.before_order_by, "Before ORDER BY");
                executeDistinct(query_plan, true, expressions.selected_columns, true);
            }

@ -1005,8 +1010,10 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
                else if (expressions.hasHaving())
                    executeHaving(query_plan, expressions.before_having);

-                executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
+                executeExpression(query_plan, expressions.before_window,
+                    "Before window functions");
                executeWindow(query_plan);
+                executeExpression(query_plan, expressions.before_order_by, "Before ORDER BY");
                executeDistinct(query_plan, true, expressions.selected_columns, true);

            }
@ -1745,6 +1752,11 @@ void InterpreterSelectQuery::executeRollupOrCube(QueryPlan & query_plan, Modific

 void InterpreterSelectQuery::executeExpression(QueryPlan & query_plan, const ActionsDAGPtr & expression, const std::string & description)
 {
+    if (!expression)
+    {
+        return;
+    }
+
    auto expression_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), expression);

    expression_step->setStepDescription(description);
--- a/tests/queries/0_stateless/01591_window_functions.reference
+++ b/tests/queries/0_stateless/01591_window_functions.reference
@ -79,12 +79,22 @@ select number, quantileExact(number) over (partition by intDiv(number, 3)) q fro
 9	9
 select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 }

-- should work in ORDER BY though
-- doesn't work now
-- select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
+-- should work in ORDER BY

-- at least it works in ORDER BY if you wrap it in a subquery
+select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;

+-- also works in ORDER BY if you wrap it in a subquery
+
+9	9
+6	8
+7	8
+8	8
+3	5
+4	5
+5	5
+0	2
+1	2
+2	2
 select * from (select count(*) over () c from numbers(3)) order by c;

 -- must work in WHERE if you wrap it in a subquery
@ -168,4 +178,12 @@ select groupArray(number) over () from numbers(3);
 [0,1,2]
 select count(1) over (), max(number + 1) over () from numbers(3);

+-- Should work in DISTINCT
+
 1	3
+select distinct sum(0) over () from numbers(2);
+
+0
+select distinct any(number) over () from numbers(2);
+
+0
--- a/tests/queries/0_stateless/01591_window_functions.sql
+++ b/tests/queries/0_stateless/01591_window_functions.sql
@ -24,11 +24,10 @@ select number, quantileExact(number) over (partition by intDiv(number, 3)) q fro
 -- last stage of select, after all other functions.
 select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 }

-- should work in ORDER BY though
-- doesn't work now
-- select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
+-- should work in ORDER BY
+select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;

-- at least it works in ORDER BY if you wrap it in a subquery
+-- also works in ORDER BY if you wrap it in a subquery
 select * from (select count(*) over () c from numbers(3)) order by c;

 -- must work in WHERE if you wrap it in a subquery
@ -57,3 +56,7 @@ select groupArray(number) over () from numbers(3);
 -- This one tests we properly process the window  function arguments.
 -- Seen errors like 'column `1` not found' from count(1).
 select count(1) over (), max(number + 1) over () from numbers(3);
+
+-- Should work in DISTINCT
+select distinct sum(0) over () from numbers(2);
+select distinct any(number) over () from numbers(2);