From 6fa6c0261b41ce63091b9ef353de14823469c5a8 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Thu, 25 Apr 2024 16:20:27 +0800 Subject: [PATCH] apply 27a2b19 --- .../sql-reference/statements/select/join.md | 23 ++- src/Interpreters/ExpressionActions.cpp | 11 +- src/Interpreters/HashJoin.cpp | 164 +++++++----------- src/Planner/PlannerJoins.cpp | 14 +- src/Planner/PlannerJoins.h | 4 - ...006_join_on_inequal_expression_fast.sql.j2 | 2 +- 6 files changed, 96 insertions(+), 122 deletions(-) diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index 141bb80ceb4..4ef407a4d13 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -164,12 +164,26 @@ Result: │ 4 │ -4 │ 4 │ └───┴────┴─────┘ ``` -## Join with inequality conditions -Clickhouse currently supports inner, left, right and full join with inequality conditions, including with `OR` operator. You need to set `allow_experimental_analyzer = 1` and select `hash` or `grace_hash` join algorithm. + +## [experimental] Join with inequality conditions + +:::note +This feature is experimental. To use it, set `allow_experimental_join_condition` to 1 in your configuration files or by using the `SET` command: + +```sql +SET allow_experimental_join_condition=1 +``` + +Otherwise, you'll get `INVALID_JOIN_ON_EXPRESSION`. + +::: + +Clickhouse currently supports `ALL INNER/LEFT/RIGHT/FULL JOIN` with inequality conditions in addition to equality conditions. The inequality conditions are supported only for `hash` and `grace_hash` join algorithms. The inequality conditions are not supported with `join_use_nulls`. **Example** Table `t1`: + ``` ┌─key──┬─attr─┬─a─┬─b─┬─c─┐ │ key1 │ a │ 1 │ 1 │ 2 │ @@ -183,6 +197,7 @@ Table `t1`: ``` Table `t2` + ``` ┌─key──┬─attr─┬─a─┬─b─┬─c─┐ │ key1 │ A │ 1 │ 2 │ 1 │ @@ -193,9 +208,11 @@ Table `t2` │ key4 │ F │ 1 │ 1 │ 1 │ └──────┴──────┴───┴───┴───┘ ``` + ```sql SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and (t1.a < t2.a) ORDER BY (t1.key, t1.attr, t2.key, t2.attr); ``` + ``` key1 a 1 1 2 key1 B 2 1 2 key1 a 1 1 2 key1 C 3 4 5 @@ -318,7 +335,7 @@ For example, consider the following tables: ## PASTE JOIN Usage The result of `PASTE JOIN` is a table that contains all columns from left subquery followed by all columns from the right subquery. -The rows are matched based on their positions in the original tables (the order of rows should be defined). +The rows are matched based on their positions in the original tables (the order of rows should be defined). If the subqueries return a different number of rows, extra rows will be cut. Example: diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index aa2b677531a..04f29f35c3c 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -18,7 +18,6 @@ #include #include #include -#include "ExpressionActions.h" #include @@ -617,18 +616,14 @@ static void executeAction(const ExpressionActions::Action & action, ExecutionCon res_column.column = action.node->function->execute(arguments, res_column.type, num_rows, dry_run); if (res_column.column->getDataType() != res_column.type->getColumnType()) { - WriteBufferFromOwnString out; - for (const auto & arg : arguments) - out << arg.dumpStructure() << ","; - throw Exception( ErrorCodes::LOGICAL_ERROR, "Unexpected return type from {}. Expected {}. Got {}. Action:\n{},\ninput block structure:{}", action.node->function->getName(), - res_column.type->getName(), //res_column.type->getColumnType(), - res_column.column->getName(), //res_column.column->getDataType(), + res_column.type->getName(), + res_column.column->getName(), action.toString(), - out.str()); + Block(arguments).dumpStructure()); } } break; diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 938f53c47e8..56955066191 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -1573,67 +1573,58 @@ ColumnPtr buildAdditionalFilter( } if (!executed_block) { - WriteBufferFromOwnString buf; - for (const auto & col : required_cols) - { - buf << col.name << ", "; - } throw Exception( ErrorCodes::LOGICAL_ERROR, - "required columns: {}. but not found any in left/right table. right table: {}, left table: {}", - buf.str(), + "required columns: [{}], but not found any in left/right table. right table: {}, left table: {}", + required_cols.toString(), sample_right_block.dumpNames(), added_columns.left_block.dumpNames()); } - // Debug + for (const auto & col : executed_block.getColumnsWithTypeAndName()) { if (!col.column || !col.type) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Null column in input block. {}", executed_block.dumpStructure()); - } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Illegal nullptr column in input block: {}", executed_block.dumpStructure()); } + added_columns.additional_filter_expression->execute(executed_block); - return executed_block.getByPosition(0).column; + + ColumnPtr result_column = executed_block.getByPosition(0).column->convertToFullColumnIfConst(); + executed_block.clear(); + + if (result_column->isNullable()) + { + /// Convert Nullable(UInt8) to UInt8 ensuring that nulls are zeros + /// Trying to avoid copying data, since we are the only owner of the column. + ColumnPtr mask_column = assert_cast(*result_column).getNullMapColumnPtr(); + + MutableColumnPtr mutable_column; + { + ColumnPtr nested_column = assert_cast(*result_column).getNestedColumnPtr(); + result_column.reset(); + mutable_column = IColumn::mutate(std::move(nested_column)); + } + + auto & column_data = assert_cast(*mutable_column).getData(); + const auto & mask_column_data = assert_cast(*mask_column).getData(); + for (size_t i = 0; i < column_data.size(); ++i) + { + if (mask_column_data[i]) + column_data[i] = 0; + } + return mutable_column; + } + return result_column; } -template -void addFoundRowRefAll( - const RowRefList & row_list, - std::vector & selected_rows, - IColumn::Offset & current_offset, - KnownRowsHolder & known_rows [[maybe_unused]]) +/// Adapter class to pass into addFoundRowAll +/// In joinRightColumnsWithAdditionalFilter we don't want to add rows directly into AddedColumns, +/// because they need to be filtered by additional_filter_expression. +class PreSelectedRows : public std::vector { - if constexpr (flag_per_row) - { - std::unique_ptr::Type>> new_known_rows_ptr; - for (auto it = row_list.begin(); it.ok(); ++it) - { - auto row_ref = std::make_pair(it->block, it->row_num); - if (!known_rows.isKnown(row_ref)) - { - selected_rows.emplace_back(row_ref.first, row_ref.second); - ++current_offset; - if (!new_known_rows_ptr) - { - new_known_rows_ptr = std::make_unique::Type>>(); - } - new_known_rows_ptr->push_back(row_ref); - } - } - - if (new_known_rows_ptr) - known_rows.add(std::cbegin(*new_known_rows_ptr), std::cend(*new_known_rows_ptr)); - } - else - { - for (auto it = row_list.begin(); it.ok(); ++it) - { - selected_rows.emplace_back(it->block, it->row_num); - ++current_offset; - } - } -} +public: + void appendFromBlock(const Block & block, size_t row_num, bool /* has_default */) { this->emplace_back(&block, row_num); } +}; /// First to collect all matched rows refs by join keys, then filter out rows which are not true in additional filter expression. template < @@ -1666,7 +1657,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter( using FindResult = typename KeyGetter::FindResult; size_t max_joined_block_rows = added_columns.max_joined_block_rows; size_t left_row_iter = 0; - std::vector selected_rows; + PreSelectedRows selected_rows; selected_rows.reserve(left_block_rows); std::vector find_results; find_results.reserve(left_block_rows); @@ -1709,9 +1700,9 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter( auto & mapped = find_result.getMapped(); find_results.push_back(find_result); if (flag_per_row) - addFoundRowRefAll(mapped, selected_rows, current_added_rows, all_flag_known_rows); + addFoundRowAll(mapped, selected_rows, current_added_rows, all_flag_known_rows, nullptr); else - addFoundRowRefAll(mapped, selected_rows, current_added_rows, single_flag_know_rows); + addFoundRowAll(mapped, selected_rows, current_added_rows, single_flag_know_rows, nullptr); } } row_replicate_offset.push_back(current_added_rows); @@ -1720,17 +1711,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter( auto copy_final_matched_rows = [&](size_t left_start_row, ColumnPtr filter_col) { - const PaddedPODArray * filter_flags = nullptr; - filter_col = filter_col->convertToFullIfNeeded(); - if (filter_col->isNullable()) - { - auto nested_col = typeid_cast(*filter_col).getNestedColumnPtr(); - filter_flags = &(dynamic_cast(*nested_col).getData()); - } - else - { - filter_flags = &(dynamic_cast(*filter_col).getData()); - } + const PaddedPODArray & filter_flags = assert_cast(*filter_col).getData(); size_t prev_replicated_row = 0; auto selected_right_row_it = selected_rows.begin(); @@ -1743,7 +1724,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter( { for (size_t replicated_row = prev_replicated_row; replicated_row < row_replicate_offset[i]; ++replicated_row) { - if ((*filter_flags)[replicated_row]) + if (filter_flags[replicated_row]) { any_matched = true; added_columns.appendFromBlock(*selected_right_row_it->block, selected_right_row_it->row_num, add_missing); @@ -1758,7 +1739,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter( { for (size_t replicated_row = prev_replicated_row; replicated_row < row_replicate_offset[i]; ++replicated_row) { - if ((*filter_flags)[replicated_row]) + if (filter_flags[replicated_row]) { any_matched = true; added_columns.appendFromBlock(*selected_right_row_it->block, selected_right_row_it->row_num, add_missing); @@ -1979,48 +1960,30 @@ size_t joinRightColumnsSwitchMultipleDisjuncts( AddedColumns & added_columns, JoinStuff::JoinUsedFlags & used_flags [[maybe_unused]]) { - auto join_without_additional_filter = [&]() - { - return mapv.size() > 1 ? joinRightColumns( - std::forward>(key_getter_vector), mapv, added_columns, used_flags) - : joinRightColumns( - std::forward>(key_getter_vector), mapv, added_columns, used_flags); - }; - constexpr JoinFeatures join_features; if constexpr (join_features.is_all_join) { if (added_columns.additional_filter_expression) { - constexpr bool mark_per_row_used = join_features.right || join_features.full; - return mapv.size() > 1 ? joinRightColumnsWithAddtitionalFilter( - std::forward>(key_getter_vector), - mapv, - added_columns, - used_flags, - need_filter, - join_features.need_flags, - join_features.add_missing, - true) - : joinRightColumnsWithAddtitionalFilter( - std::forward>(key_getter_vector), - mapv, - added_columns, - used_flags, - need_filter, - join_features.need_flags, - join_features.add_missing, - mark_per_row_used); - } - else - { - return join_without_additional_filter(); + bool mark_per_row_used = join_features.right || join_features.full || mapv.size() > 1; + return joinRightColumnsWithAddtitionalFilter( + std::forward>(key_getter_vector), + mapv, + added_columns, + used_flags, + need_filter, + join_features.need_flags, + join_features.add_missing, + mark_per_row_used); } } - else - { - return join_without_additional_filter(); - } + + if (added_columns.additional_filter_expression) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Additional filter expression is not supported for this JOIN"); + + return mapv.size() > 1 + ? joinRightColumns(std::forward>(key_getter_vector), mapv, added_columns, used_flags) + : joinRightColumns(std::forward>(key_getter_vector), mapv, added_columns, used_flags); } template @@ -2796,6 +2759,7 @@ void HashJoin::validateAdditionalFilterExpression(ExpressionActionsPtr additiona { if (!additional_filter_expression) return; + Block expression_sample_block = additional_filter_expression->getSampleBlock(); if (expression_sample_block.columns() != 1) @@ -2818,7 +2782,7 @@ void HashJoin::validateAdditionalFilterExpression(ExpressionActionsPtr additiona if (!is_supported) { throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, - "Non equi condition '{}' from JOIN ON section is supported only for ALL INNER/LEFT/FULL/RIGHT JOINs.", + "Non equi condition '{}' from JOIN ON section is supported only for ALL INNER/LEFT/FULL/RIGHT JOINs", expression_sample_block.getByPosition(0).name); } } diff --git a/src/Planner/PlannerJoins.cpp b/src/Planner/PlannerJoins.cpp index 930881ec7d1..1fdf51f399f 100644 --- a/src/Planner/PlannerJoins.cpp +++ b/src/Planner/PlannerJoins.cpp @@ -459,7 +459,7 @@ JoinClausesAndActions buildJoinClausesAndActions( join_right_table_expressions, join_node, result.join_clauses.back()); - is_inequal_join |= result.join_clauses.back().hasMixedFilterCondition(); + is_inequal_join |= !result.join_clauses.back().getMixedFilterConditionNodes().empty(); } } else @@ -476,7 +476,7 @@ JoinClausesAndActions buildJoinClausesAndActions( join_right_table_expressions, join_node, result.join_clauses.back()); - is_inequal_join |= result.join_clauses.back().hasMixedFilterCondition(); + is_inequal_join |= !result.join_clauses.back().getMixedFilterConditionNodes().empty(); } auto and_function = FunctionFactory::instance().get("and", planner_context->getQueryContext()); @@ -595,9 +595,10 @@ JoinClausesAndActions buildJoinClausesAndActions( result.right_join_tmp_expression_actions = std::move(right_join_actions); result.right_join_expressions_actions->removeUnusedActions(join_right_actions_names); - /// If there is any inequal join condition, we need to build full join expressions actions. if (is_inequal_join) { + /// In case of multiple disjuncts and any inequal join condition, we need to build full join on expression actions. + /// So, for each column, we recalculate the value of the whole expression from JOIN ON to check if rows should be joined. if (result.join_clauses.size() > 1) { auto mixed_join_expressions_actions = std::make_shared(mixed_table_expression_columns); @@ -622,7 +623,7 @@ JoinClausesAndActions buildJoinClausesAndActions( auto outputs = result.mixed_join_expressions_actions->getOutputs(); if (outputs.size() != 1) { - throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, "Only one output is expected. but got:\n{}", result.mixed_join_expressions_actions->dumpDAG()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Only one output is expected, got: {}", result.mixed_join_expressions_actions->dumpDAG()); } auto output_type = removeNullable(outputs[0]->result_type); WhichDataType which_type(output_type); @@ -846,11 +847,12 @@ std::shared_ptr chooseJoinAlgorithm(std::shared_ptr & table_jo const Block & right_table_expression_header, const PlannerContextPtr & planner_context) { - if (table_join->getMixedJoinExpression() && !table_join->isEnabledAlgorithm(JoinAlgorithm::HASH) + if (table_join->getMixedJoinExpression() + && !table_join->isEnabledAlgorithm(JoinAlgorithm::HASH) && !table_join->isEnabledAlgorithm(JoinAlgorithm::GRACE_HASH)) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "JOIN with mixed conditions supports only hash join or grace hash join with one disjunct."); + "JOIN with mixed conditions supports only hash join or grace hash join"); } trySetStorageInTableJoin(right_table_expression, table_join); diff --git a/src/Planner/PlannerJoins.h b/src/Planner/PlannerJoins.h index c96941a3c16..8adf6edd7ea 100644 --- a/src/Planner/PlannerJoins.h +++ b/src/Planner/PlannerJoins.h @@ -155,10 +155,6 @@ public: return mixed_filter_condition_nodes; } - bool hasMixedFilterCondition() const - { - return !mixed_filter_condition_nodes.empty(); - } /// Dump clause into buffer void dump(WriteBuffer & buffer) const; diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 index 89df825b32b..d3aa74f5c38 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 @@ -7,7 +7,7 @@ CREATE TABLE t2 (key String, attr String, a UInt64, b UInt64, c Nullable(UInt64) INSERT INTO t2 VALUES ('key1', 'A', 1, 2, 1), ('key1', 'B', 2, 1, 2), ('key1', 'C', 3, 4, 5), ('key1', 'D', 4, 1, 6), ('key3', 'a3', 1, 1, 1), ('key4', 'F', 1,1,1); SET allow_experimental_analyzer=1; -SET allow_mixed_join_condition=1; +SET allow_experimental_join_condition=1; SET join_use_nulls=0; -- { echoOn } {% for algorithm in ['hash', 'grace_hash'] -%}