From be2ec93a3f3a649d62e214e83f8d17053facd356 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 20 Feb 2024 16:35:30 +0100
Subject: [PATCH 1/2] Remove commented code in ExpressionActions::execute()

This information will be added to the exception anyway.

Signed-off-by: Azat Khuzhin
---
 src/Interpreters/ExpressionActions.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp
index 1bd1e2c318f..f86febb481d 100644
--- a/src/Interpreters/ExpressionActions.cpp
+++ b/src/Interpreters/ExpressionActions.cpp
@@ -736,10 +736,6 @@ void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run)
         {
             executeAction(action, execution_context, dry_run);
             checkLimits(execution_context.columns);
-
-            //std::cerr << "Action: " << action.toString() << std::endl;
-            //for (const auto & col : execution_context.columns)
-            //    std::cerr << col.dumpStructure() << std::endl;
         }
         catch (Exception & e)
         {

From 0cb6f205d7fa8a4b790879eae8d0ce4ae03cabc4 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 20 Feb 2024 16:16:46 +0100
Subject: [PATCH 2/2] Fix actions execution during preliminary filtering (PK,
 partition pruning)

Previously, preliminary filtering worked incorrectly if an input column was
used multiple times, because:
- the input position was initialized only for the first user
- the column was moved out of the block, so the second user got a nullptr

And this is exactly what happens when the same virtual column is required
multiple times. Consider the following example:

    select * from data final prewhere indexHint(_partition_id = 'all') or indexHint(_partition_id = 'all')

The actions for this PREWHERE are the following:

    (lldb) p actions.__ptr_->dumpActions()
    (std::string) $1 = "input:
    _partition_id LowCardinality(String)
    _partition_id LowCardinality(String)

    actions:
    INPUT : 0 -> _partition_id LowCardinality(String) : 0
    COLUMN Const(String) -> 'all' String : 1
    COLUMN Const(String) -> 'UInt8'_String String : 2
    INPUT : 1 -> _partition_id LowCardinality(String) : 3
    COLUMN Const(String) -> 'all' String : 4
    COLUMN Const(String) -> 'UInt8'_String String : 5
    FUNCTION equals(_partition_id :: 0, 'all' :: 1) -> equals(_partition_id, 'all') LowCardinality(UInt8) : 6
    FUNCTION equals(_partition_id :: 3, 'all' :: 4) -> equals(_partition_id, 'all') LowCardinality(UInt8) : 1
    FUNCTION _CAST(equals(_partition_id, 'all') :: 6, 'UInt8'_String :: 2) -> _CAST(equals(_partition_id, 'all'), 'UInt8'_String) UInt8 : 4
    FUNCTION _CAST(equals(_partition_id, 'all') :: 1, 'UInt8'_String :: 5) -> _CAST(equals(_partition_id, 'all'), 'UInt8'_String) UInt8 : 2
    FUNCTION or(_CAST(equals(_partition_id, 'all'), 'UInt8'_String) :: 4, _CAST(equals(_partition_id, 'all'), 'UInt8'_String) :: 2) -> or(indexHint(), indexHint())"...

Note that the _partition_id column appears in the input multiple times.

Fix this by adding a flag (`allow_duplicates_in_input`): if it is set, the
input columns are fully initialized (copied instead of moved).
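
For illustration only, a minimal standalone C++ sketch (not ClickHouse code;
the names below are made up for this example) of why moving a shared input
slot breaks the second consumer, and why copying fixes it:

    // Two "INPUT" consumers resolved to the same source slot; std::move()
    // empties the slot for the second consumer.
    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    int main()
    {
        using ColumnPtr = std::shared_ptr<std::string>; // stand-in for a block column

        std::vector<ColumnPtr> inputs{std::make_shared<std::string>("_partition_id")};
        std::vector<ColumnPtr> columns(2);

        // Both consumers were resolved to position 0, because the position
        // search stopped after the first hit.
        const std::vector<size_t> positions{0, 0};

        const bool allow_duplicates_in_input = true; // false reproduces the bug

        for (size_t result = 0; result < positions.size(); ++result)
        {
            const size_t pos = positions[result];
            if (allow_duplicates_in_input)
                columns[result] = inputs[pos];            // share the column
            else
                columns[result] = std::move(inputs[pos]); // 2nd iteration reads a moved-from slot
        }

        // Prints "_partition_id" with the flag set; "<null>" without it.
        std::cout << (columns[1] ? *columns[1] : "<null>") << '\n';
    }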
v2: fix "Position out of bound in Block::erase()" Signed-off-by: Azat Khuzhin --- src/Interpreters/ExpressionActions.cpp | 26 ++++++++++++++----- src/Interpreters/ExpressionActions.h | 10 +++++-- src/Storages/VirtualColumnUtils.cpp | 6 ++--- ...inary_filters_duplicated_columns.reference | 2 ++ ...preliminary_filters_duplicated_columns.sql | 6 +++++ ..._columns_SimpleAggregateFunction.reference | 1 + ...icated_columns_SimpleAggregateFunction.sql | 5 ++++ 7 files changed, 44 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.reference create mode 100644 tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.sql create mode 100644 tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.reference create mode 100644 tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.sql diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index f86febb481d..383bbbae0dd 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -563,7 +563,7 @@ namespace }; } -static void executeAction(const ExpressionActions::Action & action, ExecutionContext & execution_context, bool dry_run) +static void executeAction(const ExpressionActions::Action & action, ExecutionContext & execution_context, bool dry_run, bool allow_duplicates_in_input) { auto & inputs = execution_context.inputs; auto & columns = execution_context.columns; @@ -694,14 +694,19 @@ static void executeAction(const ExpressionActions::Action & action, ExecutionCon action.node->result_name); } else - columns[action.result_position] = std::move(inputs[pos]); + { + if (allow_duplicates_in_input) + columns[action.result_position] = inputs[pos]; + else + columns[action.result_position] = std::move(inputs[pos]); + } break; } } } -void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run) const +void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run, bool allow_duplicates_in_input) const { ExecutionContext execution_context { @@ -722,7 +727,8 @@ void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run) if (execution_context.inputs_pos[input_pos] < 0) { execution_context.inputs_pos[input_pos] = pos; - break; + if (!allow_duplicates_in_input) + break; } } } @@ -734,7 +740,7 @@ void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run) { try { - executeAction(action, execution_context, dry_run); + executeAction(action, execution_context, dry_run, allow_duplicates_in_input); checkLimits(execution_context.columns); } catch (Exception & e) @@ -748,6 +754,12 @@ void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run) { block.clear(); } + else if (allow_duplicates_in_input) + { + /// This case is the same as when the input is projected + /// since we do not need any input columns. 
+        block.clear();
+    }
     else
     {
         ::sort(execution_context.inputs_pos.rbegin(), execution_context.inputs_pos.rend());
@@ -770,11 +782,11 @@ void ExpressionActions::execute(Block & block, size_t & num_rows, bool dry_run)
     num_rows = execution_context.num_rows;
 }
 
-void ExpressionActions::execute(Block & block, bool dry_run) const
+void ExpressionActions::execute(Block & block, bool dry_run, bool allow_duplicates_in_input) const
 {
     size_t num_rows = block.rows();
 
-    execute(block, num_rows, dry_run);
+    execute(block, num_rows, dry_run, allow_duplicates_in_input);
 
     if (!block)
         block.insert({DataTypeUInt8().createColumnConst(num_rows, 0), std::make_shared<DataTypeUInt8>(), "_dummy"});
diff --git a/src/Interpreters/ExpressionActions.h b/src/Interpreters/ExpressionActions.h
index db6670c50b9..cb467004d29 100644
--- a/src/Interpreters/ExpressionActions.h
+++ b/src/Interpreters/ExpressionActions.h
@@ -98,9 +98,15 @@ public:
     const NamesAndTypesList & getRequiredColumnsWithTypes() const { return required_columns; }
 
     /// Execute the expression on the block. The block must contain all the columns returned by getRequiredColumns.
-    void execute(Block & block, size_t & num_rows, bool dry_run = false) const;
+    ///
+    /// @param allow_duplicates_in_input - actions are allowed to have
+    /// duplicated input (that will refer into the block). This is needed for
+    /// preliminary query filtering (filterBlockWithDAG()), because they just
+    /// pass available virtual columns, which cannot be moved in case they are
+    /// used multiple times.
+    void execute(Block & block, size_t & num_rows, bool dry_run = false, bool allow_duplicates_in_input = false) const;
     /// The same, but without `num_rows`. If result block is empty, adds `_dummy` column to keep block size.
-    void execute(Block & block, bool dry_run = false) const;
+    void execute(Block & block, bool dry_run = false, bool allow_duplicates_in_input = false) const;
 
     bool hasArrayJoin() const;
     void assertDeterministic() const;
diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp
index 33ff6e7104f..07ac61c110d 100644
--- a/src/Storages/VirtualColumnUtils.cpp
+++ b/src/Storages/VirtualColumnUtils.cpp
@@ -223,7 +223,7 @@ bool prepareFilterBlockWithQuery(const ASTPtr & query, ContextPtr context, Block
 
         auto expression_actions = std::make_shared<ExpressionActions>(actions);
         auto block_with_constants = block;
-        expression_actions->execute(block_with_constants);
+        expression_actions->execute(block_with_constants, /*dry_run=*/ false, /*allow_duplicates_in_input=*/ true);
         return block_with_constants.has(expr_column_name)
             && isColumnConst(*block_with_constants.getByName(expr_column_name).column);
     };
@@ -266,7 +266,7 @@ void filterBlockWithDAG(ActionsDAGPtr dag, Block & block, ContextPtr context)
     auto actions = std::make_shared<ExpressionActions>(dag);
     makeSets(actions, context);
     Block block_with_filter = block;
-    actions->execute(block_with_filter);
+    actions->execute(block_with_filter, /*dry_run=*/ false, /*allow_duplicates_in_input=*/ true);
 
     /// Filter the block.
     String filter_column_name = dag->getOutputs().at(0)->result_name;
@@ -313,7 +313,7 @@ void filterBlockWithQuery(const ASTPtr & query, Block & block, ContextPtr contex
     makeSets(actions, context);
     Block block_with_filter = block;
 
-    actions->execute(block_with_filter);
+    actions->execute(block_with_filter, /*dry_run=*/ false, /*allow_duplicates_in_input=*/ true);
 
     /// Filter the block.
     String filter_column_name = expression_ast->getColumnName();
diff --git a/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.reference b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.reference
new file mode 100644
index 00000000000..aa47d0d46d4
--- /dev/null
+++ b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.reference
@@ -0,0 +1,2 @@
+0
+0
diff --git a/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.sql b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.sql
new file mode 100644
index 00000000000..060f16f8945
--- /dev/null
+++ b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns.sql
@@ -0,0 +1,6 @@
+-- It is special because actions cannot be reused for SimpleAggregateFunction (see https://github.com/ClickHouse/ClickHouse/pull/54436)
+drop table if exists data;
+create table data (key Int) engine=AggregatingMergeTree() order by tuple();
+insert into data values (0);
+select * from data final prewhere indexHint(_partition_id = 'all') or indexHint(_partition_id = 'all');
+select * from data final prewhere indexHint(_partition_id = 'all') or indexHint(_partition_id = 'all') or indexHint(_partition_id = 'all');
diff --git a/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.reference b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.reference
new file mode 100644
index 00000000000..573541ac970
--- /dev/null
+++ b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.reference
@@ -0,0 +1 @@
+0
diff --git a/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.sql b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.sql
new file mode 100644
index 00000000000..97df883fa48
--- /dev/null
+++ b/tests/queries/0_stateless/02995_preliminary_filters_duplicated_columns_SimpleAggregateFunction.sql
@@ -0,0 +1,5 @@
+-- It is special because actions cannot be reused for SimpleAggregateFunction (see https://github.com/ClickHouse/ClickHouse/pull/54436)
+drop table if exists data;
+create table data (key SimpleAggregateFunction(max, Int)) engine=AggregatingMergeTree() order by tuple();
+insert into data values (0);
+select * from data final prewhere indexHint(_partition_id = 'all') and key >= -1 where key >= 0;