fix DISTINCT and ORDER BY

This commit is contained in:
Alexander Kuzmenkov 2020-12-25 06:13:30 +03:00
parent 8c72fa063d
commit 5bd025a180
5 changed files with 87 additions and 40 deletions

View File

@ -970,7 +970,9 @@ void SelectQueryExpressionAnalyzer::appendWindowFunctionsArguments(
ExpressionActionsChain::Step & step = chain.lastStep(aggregated_columns);
// 1) Add actions for window functions and their arguments;
// 2) Mark the columns that are really required.
// 2) Mark the columns that are really required. We have to mark them as
// required because we finish the expression chain before processing the
// window functions.
for (const auto & [_, w] : window_descriptions)
{
for (const auto & f : w.window_functions)
@ -1425,49 +1427,47 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
/// If there is aggregation, we execute expressions in SELECT and ORDER BY on the initiating server, otherwise on the source servers.
query_analyzer.appendSelect(chain, only_types || (need_aggregate ? !second_stage : !first_stage));
has_order_by = query.orderBy() != nullptr;
// FIXME selected columns was set here. Should split order by and select
// and insert window functions in between.
// Will fix:
// 1) window function in DISTINCT
// 2) totally broken DISTINCT
// 3) window functions in order by
before_order_and_select = query_analyzer.appendOrderBy(
chain,
only_types || (need_aggregate ? !second_stage : !first_stage),
optimize_read_in_order,
order_by_elements_actions);
// Window functions are processed in a separate expression chain after
// the main SELECT, similar to what we do for aggregate functions.
if (has_window)
{
query_analyzer.appendWindowFunctionsArguments(chain, only_types || !first_stage);
auto select_required_columns = chain.getLastStep().required_output;
for (const auto & x : chain.getLastActions()->getNamesAndTypesList())
{
query_analyzer.columns_after_window.push_back(x);
}
before_window = chain.getLastActions();
finalize_chain(chain);
auto & step = chain.lastStep(query_analyzer.columns_after_window);
step.required_output = select_required_columns;
// The output of this expression chain is the result of
// SELECT (before "final projection" i.e. renaming the columns), so
// we have to mark the expressions that are required in the output,
// again. We did it for the previous expression chain ("select w/o
// window functions") earlier, in appendSelect(). But that chain also
// produced the expressions required to calculate window functions.
// They are not needed in the final SELECT result. Knowing the correct
// list of columns is important when we apply SELECT DISTINCT later.
const auto * select_query = query_analyzer.getSelectQuery();
for (const auto & child : select_query->select()->children)
{
if (const auto * function
= typeid_cast<const ASTFunction *>(child.get());
function && function->is_window_function)
{
// See its counterpart in appendSelect()
step.required_output.push_back(child->getColumnName());
}
step.required_output.push_back(child->getColumnName());
}
}
selected_columns = chain.getLastStep().required_output;
has_order_by = query.orderBy() != nullptr;
before_order_by = query_analyzer.appendOrderBy(
chain,
only_types || (need_aggregate ? !second_stage : !first_stage),
optimize_read_in_order,
order_by_elements_actions);
if (query_analyzer.appendLimitBy(chain, only_types || !second_stage))
{
before_limit_by = chain.getLastActions();
@ -1479,7 +1479,6 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
finalize_chain(chain);
}
/// Before executing WHERE and HAVING, remove the extra columns from the block (mostly the aggregation keys).
removeExtraColumns();
@ -1610,9 +1609,9 @@ std::string ExpressionAnalysisResult::dump() const
ss << "before_window " << before_window->dumpDAG() << "\n";
}
if (before_order_and_select)
if (before_order_by)
{
ss << "before_order_and_select " << before_order_and_select->dumpDAG() << "\n";
ss << "before_order_by " << before_order_by->dumpDAG() << "\n";
}
if (before_limit_by)
@ -1625,6 +1624,20 @@ std::string ExpressionAnalysisResult::dump() const
ss << "final_projection " << final_projection->dumpDAG() << "\n";
}
if (!selected_columns.empty())
{
ss << "selected_columns ";
for (size_t i = 0; i < selected_columns.size(); i++)
{
if (i > 0)
{
ss << ", ";
}
ss << backQuote(selected_columns[i]);
}
ss << "\n";
}
return ss.str();
}

View File

@ -204,11 +204,12 @@ struct ExpressionAnalysisResult
ActionsDAGPtr before_aggregation;
ActionsDAGPtr before_having;
ActionsDAGPtr before_window;
ActionsDAGPtr before_order_and_select;
ActionsDAGPtr before_order_by;
ActionsDAGPtr before_limit_by;
ActionsDAGPtr final_projection;
/// Columns from the SELECT list, before renaming them to aliases.
/// Columns from the SELECT list, before renaming them to aliases. Used to
/// perform SELECT DISTINCT.
Names selected_columns;
/// Columns will be removed after prewhere actions execution.

View File

@ -538,7 +538,10 @@ Block InterpreterSelectQuery::getSampleBlockImpl()
if (options.to_stage == QueryProcessingStage::Enum::WithMergeableState)
{
if (!analysis_result.need_aggregate)
return analysis_result.before_order_and_select->getResultColumns();
{
// What's the difference with selected_columns?
return analysis_result.before_order_by->getResultColumns();
}
Block header = analysis_result.before_aggregation->getResultColumns();
@ -564,7 +567,8 @@ Block InterpreterSelectQuery::getSampleBlockImpl()
if (options.to_stage == QueryProcessingStage::Enum::WithMergeableStateAfterAggregation)
{
return analysis_result.before_order_and_select->getResultColumns();
// What's the difference with selected_columns?
return analysis_result.before_order_by->getResultColumns();
}
return analysis_result.final_projection->getResultColumns();
@ -958,8 +962,9 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
}
else
{
executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
executeExpression(query_plan, expressions.before_window, "Before window functions");
executeWindow(query_plan);
executeExpression(query_plan, expressions.before_order_by, "Before ORDER BY");
executeDistinct(query_plan, true, expressions.selected_columns, true);
}
@ -1005,8 +1010,10 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
else if (expressions.hasHaving())
executeHaving(query_plan, expressions.before_having);
executeExpression(query_plan, expressions.before_order_and_select, "Before ORDER BY and SELECT");
executeExpression(query_plan, expressions.before_window,
"Before window functions");
executeWindow(query_plan);
executeExpression(query_plan, expressions.before_order_by, "Before ORDER BY");
executeDistinct(query_plan, true, expressions.selected_columns, true);
}
@ -1745,6 +1752,11 @@ void InterpreterSelectQuery::executeRollupOrCube(QueryPlan & query_plan, Modific
void InterpreterSelectQuery::executeExpression(QueryPlan & query_plan, const ActionsDAGPtr & expression, const std::string & description)
{
if (!expression)
{
return;
}
auto expression_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), expression);
expression_step->setStepDescription(description);

View File

@ -79,12 +79,22 @@ select number, quantileExact(number) over (partition by intDiv(number, 3)) q fro
9 9
select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 }
-- should work in ORDER BY though
-- doesn't work now
-- select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
-- should work in ORDER BY
-- at least it works in ORDER BY if you wrap it in a subquery
select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
-- also works in ORDER BY if you wrap it in a subquery
9 9
6 8
7 8
8 8
3 5
4 5
5 5
0 2
1 2
2 2
select * from (select count(*) over () c from numbers(3)) order by c;
-- must work in WHERE if you wrap it in a subquery
@ -168,4 +178,12 @@ select groupArray(number) over () from numbers(3);
[0,1,2]
select count(1) over (), max(number + 1) over () from numbers(3);
-- Should work in DISTINCT
1 3
select distinct sum(0) over () from numbers(2);
0
select distinct any(number) over () from numbers(2);
0

View File

@ -24,11 +24,10 @@ select number, quantileExact(number) over (partition by intDiv(number, 3)) q fro
-- last stage of select, after all other functions.
select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 }
-- should work in ORDER BY though
-- doesn't work now
-- select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
-- should work in ORDER BY
select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
-- at least it works in ORDER BY if you wrap it in a subquery
-- also works in ORDER BY if you wrap it in a subquery
select * from (select count(*) over () c from numbers(3)) order by c;
-- must work in WHERE if you wrap it in a subquery
@ -57,3 +56,7 @@ select groupArray(number) over () from numbers(3);
-- This one tests we properly process the window function arguments.
-- Seen errors like 'column `1` not found' from count(1).
select count(1) over (), max(number + 1) over () from numbers(3);
-- Should work in DISTINCT
select distinct sum(0) over () from numbers(2);
select distinct any(number) over () from numbers(2);