apply 27a2b19

This commit is contained in:
lgbo-ustc 2024-04-25 16:20:27 +08:00
parent 7ee720ffb0
commit 6fa6c0261b
6 changed files with 96 additions and 122 deletions

View File

@ -164,12 +164,26 @@ Result:
│ 4 │ -4 │ 4 │
└───┴────┴─────┘
```
## Join with inequality conditions
Clickhouse currently supports inner, left, right and full join with inequality conditions, including with `OR` operator. You need to set `allow_experimental_analyzer = 1` and select `hash` or `grace_hash` join algorithm.
## [experimental] Join with inequality conditions
:::note
This feature is experimental. To use it, set `allow_experimental_join_condition` to 1, either in your configuration files or with the `SET` command:
```sql
SET allow_experimental_join_condition=1
```
Otherwise, you'll get `INVALID_JOIN_ON_EXPRESSION`.
:::
ClickHouse currently supports `ALL INNER/LEFT/RIGHT/FULL JOIN` with inequality conditions in addition to equality conditions. Inequality conditions are supported only for the `hash` and `grace_hash` join algorithms, and they are not supported together with `join_use_nulls`.
**Example**
Table `t1`:
```
┌─key──┬─attr─┬─a─┬─b─┬─c─┐
│ key1 │ a │ 1 │ 1 │ 2 │
@ -183,6 +197,7 @@ Table `t1`:
```
Table `t2`:
```
┌─key──┬─attr─┬─a─┬─b─┬─c─┐
│ key1 │ A │ 1 │ 2 │ 1 │
@ -193,9 +208,11 @@ Table `t2`
│ key4 │ F │ 1 │ 1 │ 1 │
└──────┴──────┴───┴───┴───┘
```
```sql
SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and (t1.a < t2.a) ORDER BY (t1.key, t1.attr, t2.key, t2.attr);
```
```
key1 a 1 1 2 key1 B 2 1 2
key1 a 1 1 2 key1 C 3 4 5
@ -318,7 +335,7 @@ For example, consider the following tables:
## PASTE JOIN Usage
The result of `PASTE JOIN` is a table that contains all columns from left subquery followed by all columns from the right subquery.
The rows are matched based on their positions in the original tables (the order of rows should be defined).
The rows are matched based on their positions in the original tables (the order of rows should be defined).
If the subqueries return a different number of rows, extra rows will be cut.
Example:

View File

@ -18,7 +18,6 @@
#include <stack>
#include <base/sort.h>
#include <Common/JSONBuilder.h>
#include "ExpressionActions.h"
#include <Core/SettingsEnums.h>
@ -617,18 +616,14 @@ static void executeAction(const ExpressionActions::Action & action, ExecutionCon
res_column.column = action.node->function->execute(arguments, res_column.type, num_rows, dry_run);
if (res_column.column->getDataType() != res_column.type->getColumnType())
{
WriteBufferFromOwnString out;
for (const auto & arg : arguments)
out << arg.dumpStructure() << ",";
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Unexpected return type from {}. Expected {}. Got {}. Action:\n{},\ninput block structure:{}",
action.node->function->getName(),
res_column.type->getName(), //res_column.type->getColumnType(),
res_column.column->getName(), //res_column.column->getDataType(),
res_column.type->getName(),
res_column.column->getName(),
action.toString(),
out.str());
Block(arguments).dumpStructure());
}
}
break;

View File

@ -1573,67 +1573,58 @@ ColumnPtr buildAdditionalFilter(
}
if (!executed_block)
{
WriteBufferFromOwnString buf;
for (const auto & col : required_cols)
{
buf << col.name << ", ";
}
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"required columns: {}. but not found any in left/right table. right table: {}, left table: {}",
buf.str(),
"required columns: [{}], but not found any in left/right table. right table: {}, left table: {}",
required_cols.toString(),
sample_right_block.dumpNames(),
added_columns.left_block.dumpNames());
}
// Debug
for (const auto & col : executed_block.getColumnsWithTypeAndName())
{
if (!col.column || !col.type)
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Null column in input block. {}", executed_block.dumpStructure());
}
throw Exception(ErrorCodes::LOGICAL_ERROR, "Illegal nullptr column in input block: {}", executed_block.dumpStructure());
}
added_columns.additional_filter_expression->execute(executed_block);
return executed_block.getByPosition(0).column;
ColumnPtr result_column = executed_block.getByPosition(0).column->convertToFullColumnIfConst();
executed_block.clear();
if (result_column->isNullable())
{
/// Convert Nullable(UInt8) to UInt8 ensuring that nulls are zeros
/// Trying to avoid copying data, since we are the only owner of the column.
ColumnPtr mask_column = assert_cast<const ColumnNullable &>(*result_column).getNullMapColumnPtr();
MutableColumnPtr mutable_column;
{
ColumnPtr nested_column = assert_cast<const ColumnNullable &>(*result_column).getNestedColumnPtr();
result_column.reset();
mutable_column = IColumn::mutate(std::move(nested_column));
}
auto & column_data = assert_cast<ColumnUInt8 &>(*mutable_column).getData();
const auto & mask_column_data = assert_cast<const ColumnUInt8 &>(*mask_column).getData();
for (size_t i = 0; i < column_data.size(); ++i)
{
if (mask_column_data[i])
column_data[i] = 0;
}
return mutable_column;
}
return result_column;
}
template <bool flag_per_row>
void addFoundRowRefAll(
const RowRefList & row_list,
std::vector<RowRef> & selected_rows,
IColumn::Offset & current_offset,
KnownRowsHolder<flag_per_row> & known_rows [[maybe_unused]])
/// Adapter class to pass into addFoundRowAll
/// In joinRightColumnsWithAdditionalFilter we don't want to add rows directly into AddedColumns,
/// because they need to be filtered by additional_filter_expression.
class PreSelectedRows : public std::vector<RowRef>
{
if constexpr (flag_per_row)
{
std::unique_ptr<std::vector<KnownRowsHolder<true>::Type>> new_known_rows_ptr;
for (auto it = row_list.begin(); it.ok(); ++it)
{
auto row_ref = std::make_pair(it->block, it->row_num);
if (!known_rows.isKnown(row_ref))
{
selected_rows.emplace_back(row_ref.first, row_ref.second);
++current_offset;
if (!new_known_rows_ptr)
{
new_known_rows_ptr = std::make_unique<std::vector<KnownRowsHolder<true>::Type>>();
}
new_known_rows_ptr->push_back(row_ref);
}
}
if (new_known_rows_ptr)
known_rows.add(std::cbegin(*new_known_rows_ptr), std::cend(*new_known_rows_ptr));
}
else
{
for (auto it = row_list.begin(); it.ok(); ++it)
{
selected_rows.emplace_back(it->block, it->row_num);
++current_offset;
}
}
}
public:
void appendFromBlock(const Block & block, size_t row_num, bool /* has_default */) { this->emplace_back(&block, row_num); }
};
/// First collect all matched row refs by join keys, then filter out the rows for which the additional filter expression is not true.
template <
@ -1666,7 +1657,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter(
using FindResult = typename KeyGetter::FindResult;
size_t max_joined_block_rows = added_columns.max_joined_block_rows;
size_t left_row_iter = 0;
std::vector<RowRef> selected_rows;
PreSelectedRows selected_rows;
selected_rows.reserve(left_block_rows);
std::vector<FindResult> find_results;
find_results.reserve(left_block_rows);
@ -1709,9 +1700,9 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter(
auto & mapped = find_result.getMapped();
find_results.push_back(find_result);
if (flag_per_row)
addFoundRowRefAll<true>(mapped, selected_rows, current_added_rows, all_flag_known_rows);
addFoundRowAll<Map, false, true>(mapped, selected_rows, current_added_rows, all_flag_known_rows, nullptr);
else
addFoundRowRefAll<false>(mapped, selected_rows, current_added_rows, single_flag_know_rows);
addFoundRowAll<Map, false, false>(mapped, selected_rows, current_added_rows, single_flag_know_rows, nullptr);
}
}
row_replicate_offset.push_back(current_added_rows);
@ -1720,17 +1711,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter(
auto copy_final_matched_rows = [&](size_t left_start_row, ColumnPtr filter_col)
{
const PaddedPODArray<UInt8> * filter_flags = nullptr;
filter_col = filter_col->convertToFullIfNeeded();
if (filter_col->isNullable())
{
auto nested_col = typeid_cast<const ColumnNullable &>(*filter_col).getNestedColumnPtr();
filter_flags = &(dynamic_cast<const ColumnUInt8 &>(*nested_col).getData());
}
else
{
filter_flags = &(dynamic_cast<const ColumnUInt8 &>(*filter_col).getData());
}
const PaddedPODArray<UInt8> & filter_flags = assert_cast<const ColumnUInt8 &>(*filter_col).getData();
size_t prev_replicated_row = 0;
auto selected_right_row_it = selected_rows.begin();
@ -1743,7 +1724,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter(
{
for (size_t replicated_row = prev_replicated_row; replicated_row < row_replicate_offset[i]; ++replicated_row)
{
if ((*filter_flags)[replicated_row])
if (filter_flags[replicated_row])
{
any_matched = true;
added_columns.appendFromBlock(*selected_right_row_it->block, selected_right_row_it->row_num, add_missing);
@ -1758,7 +1739,7 @@ NO_INLINE size_t joinRightColumnsWithAddtitionalFilter(
{
for (size_t replicated_row = prev_replicated_row; replicated_row < row_replicate_offset[i]; ++replicated_row)
{
if ((*filter_flags)[replicated_row])
if (filter_flags[replicated_row])
{
any_matched = true;
added_columns.appendFromBlock(*selected_right_row_it->block, selected_right_row_it->row_num, add_missing);
@ -1979,48 +1960,30 @@ size_t joinRightColumnsSwitchMultipleDisjuncts(
AddedColumns & added_columns,
JoinStuff::JoinUsedFlags & used_flags [[maybe_unused]])
{
auto join_without_additional_filter = [&]()
{
return mapv.size() > 1 ? joinRightColumns<KIND, STRICTNESS, KeyGetter, Map, need_filter, true>(
std::forward<std::vector<KeyGetter>>(key_getter_vector), mapv, added_columns, used_flags)
: joinRightColumns<KIND, STRICTNESS, KeyGetter, Map, need_filter, false>(
std::forward<std::vector<KeyGetter>>(key_getter_vector), mapv, added_columns, used_flags);
};
constexpr JoinFeatures<KIND, STRICTNESS> join_features;
if constexpr (join_features.is_all_join)
{
if (added_columns.additional_filter_expression)
{
constexpr bool mark_per_row_used = join_features.right || join_features.full;
return mapv.size() > 1 ? joinRightColumnsWithAddtitionalFilter<KeyGetter, Map, join_features.need_replication>(
std::forward<std::vector<KeyGetter>>(key_getter_vector),
mapv,
added_columns,
used_flags,
need_filter,
join_features.need_flags,
join_features.add_missing,
true)
: joinRightColumnsWithAddtitionalFilter<KeyGetter, Map, join_features.need_replication>(
std::forward<std::vector<KeyGetter>>(key_getter_vector),
mapv,
added_columns,
used_flags,
need_filter,
join_features.need_flags,
join_features.add_missing,
mark_per_row_used);
}
else
{
return join_without_additional_filter();
bool mark_per_row_used = join_features.right || join_features.full || mapv.size() > 1;
return joinRightColumnsWithAddtitionalFilter<KeyGetter, Map, join_features.need_replication>(
std::forward<std::vector<KeyGetter>>(key_getter_vector),
mapv,
added_columns,
used_flags,
need_filter,
join_features.need_flags,
join_features.add_missing,
mark_per_row_used);
}
}
else
{
return join_without_additional_filter();
}
if (added_columns.additional_filter_expression)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Additional filter expression is not supported for this JOIN");
return mapv.size() > 1
? joinRightColumns<KIND, STRICTNESS, KeyGetter, Map, need_filter, true>(std::forward<std::vector<KeyGetter>>(key_getter_vector), mapv, added_columns, used_flags)
: joinRightColumns<KIND, STRICTNESS, KeyGetter, Map, need_filter, false>(std::forward<std::vector<KeyGetter>>(key_getter_vector), mapv, added_columns, used_flags);
}
template <JoinKind KIND, JoinStrictness STRICTNESS, typename KeyGetter, typename Map, typename AddedColumns>
@ -2796,6 +2759,7 @@ void HashJoin::validateAdditionalFilterExpression(ExpressionActionsPtr additiona
{
if (!additional_filter_expression)
return;
Block expression_sample_block = additional_filter_expression->getSampleBlock();
if (expression_sample_block.columns() != 1)
@ -2818,7 +2782,7 @@ void HashJoin::validateAdditionalFilterExpression(ExpressionActionsPtr additiona
if (!is_supported)
{
throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION,
"Non equi condition '{}' from JOIN ON section is supported only for ALL INNER/LEFT/FULL/RIGHT JOINs.",
"Non equi condition '{}' from JOIN ON section is supported only for ALL INNER/LEFT/FULL/RIGHT JOINs",
expression_sample_block.getByPosition(0).name);
}
}

View File

@ -459,7 +459,7 @@ JoinClausesAndActions buildJoinClausesAndActions(
join_right_table_expressions,
join_node,
result.join_clauses.back());
is_inequal_join |= result.join_clauses.back().hasMixedFilterCondition();
is_inequal_join |= !result.join_clauses.back().getMixedFilterConditionNodes().empty();
}
}
else
@ -476,7 +476,7 @@ JoinClausesAndActions buildJoinClausesAndActions(
join_right_table_expressions,
join_node,
result.join_clauses.back());
is_inequal_join |= result.join_clauses.back().hasMixedFilterCondition();
is_inequal_join |= !result.join_clauses.back().getMixedFilterConditionNodes().empty();
}
auto and_function = FunctionFactory::instance().get("and", planner_context->getQueryContext());
@ -595,9 +595,10 @@ JoinClausesAndActions buildJoinClausesAndActions(
result.right_join_tmp_expression_actions = std::move(right_join_actions);
result.right_join_expressions_actions->removeUnusedActions(join_right_actions_names);
/// If there is any inequal join condition, we need to build full join expressions actions.
if (is_inequal_join)
{
/// In case of multiple disjuncts and any inequal join condition, we need to build full join on expression actions.
/// So, for each column, we recalculate the value of the whole expression from JOIN ON to check if rows should be joined.
if (result.join_clauses.size() > 1)
{
auto mixed_join_expressions_actions = std::make_shared<ActionsDAG>(mixed_table_expression_columns);
@ -622,7 +623,7 @@ JoinClausesAndActions buildJoinClausesAndActions(
auto outputs = result.mixed_join_expressions_actions->getOutputs();
if (outputs.size() != 1)
{
throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, "Only one output is expected. but got:\n{}", result.mixed_join_expressions_actions->dumpDAG());
throw Exception(ErrorCodes::LOGICAL_ERROR, "Only one output is expected, got: {}", result.mixed_join_expressions_actions->dumpDAG());
}
auto output_type = removeNullable(outputs[0]->result_type);
WhichDataType which_type(output_type);
@ -846,11 +847,12 @@ std::shared_ptr<IJoin> chooseJoinAlgorithm(std::shared_ptr<TableJoin> & table_jo
const Block & right_table_expression_header,
const PlannerContextPtr & planner_context)
{
if (table_join->getMixedJoinExpression() && !table_join->isEnabledAlgorithm(JoinAlgorithm::HASH)
if (table_join->getMixedJoinExpression()
&& !table_join->isEnabledAlgorithm(JoinAlgorithm::HASH)
&& !table_join->isEnabledAlgorithm(JoinAlgorithm::GRACE_HASH))
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"JOIN with mixed conditions supports only hash join or grace hash join with one disjunct.");
"JOIN with mixed conditions supports only hash join or grace hash join");
}
trySetStorageInTableJoin(right_table_expression, table_join);

View File

@ -155,10 +155,6 @@ public:
return mixed_filter_condition_nodes;
}
bool hasMixedFilterCondition() const
{
return !mixed_filter_condition_nodes.empty();
}
/// Dump clause into buffer
void dump(WriteBuffer & buffer) const;

View File

@ -7,7 +7,7 @@ CREATE TABLE t2 (key String, attr String, a UInt64, b UInt64, c Nullable(UInt64)
INSERT INTO t2 VALUES ('key1', 'A', 1, 2, 1), ('key1', 'B', 2, 1, 2), ('key1', 'C', 3, 4, 5), ('key1', 'D', 4, 1, 6), ('key3', 'a3', 1, 1, 1), ('key4', 'F', 1,1,1);
SET allow_experimental_analyzer=1;
SET allow_mixed_join_condition=1;
SET allow_experimental_join_condition=1;
SET join_use_nulls=0;
-- { echoOn }
{% for algorithm in ['hash', 'grace_hash'] -%}