ClickHouse/dbms/src/Interpreters/MergeJoin.cpp

550 lines
18 KiB
C++
Raw Normal View History

2019-09-09 19:43:37 +00:00
#include <Core/NamesAndTypes.h>
2019-09-13 16:17:37 +00:00
#include <Core/SortCursor.h>
#include <Columns/ColumnNullable.h>
2019-09-09 19:43:37 +00:00
#include <Interpreters/MergeJoin.h>
#include <Interpreters/AnalyzedJoin.h>
2019-09-12 18:06:25 +00:00
#include <Interpreters/sortBlock.h>
#include <Interpreters/join_common.h>
2019-09-09 19:43:37 +00:00
#include <DataStreams/materializeBlock.h>
2019-09-12 18:06:25 +00:00
#include <DataStreams/MergeSortingBlockInputStream.h>
2019-09-09 19:43:37 +00:00
namespace DB
{
2019-09-10 14:51:28 +00:00
namespace ErrorCodes
{
extern const int SET_SIZE_LIMIT_EXCEEDED;
2019-09-13 16:17:37 +00:00
extern const int NOT_IMPLEMENTED;
2019-09-23 19:36:47 +00:00
extern const int PARAMETER_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
2019-09-13 16:17:37 +00:00
}
2019-09-19 17:09:59 +00:00
namespace
{
/// Compares row `lhs_pos` of `left_column` with row `rhs_pos` of `right_column` as join keys.
/// Returns a negative value, zero or a positive value, in the manner of IColumn::compareAt().
///
/// With has_nulls == false the columns are compared as is.
/// With has_nulls == true each side may independently be a ColumnNullable:
///  - both nullable: the columns are compared directly, but an "equal" result where the left value is NULL
///    is reported as `null_direction_hint` (NULL never matches NULL);
///  - only left nullable: a NULL on the left yields `null_direction_hint`, otherwise the nested
///    column is compared with the right column;
///  - only right nullable: a NULL on the right yields `-null_direction_hint`, otherwise the left column
///    is compared with the nested column.
template <bool has_nulls>
int nullableCompareAt(const IColumn & left_column, const IColumn & right_column, size_t lhs_pos, size_t rhs_pos)
{
    static constexpr int null_direction_hint = 1;

    if constexpr (has_nulls)
    {
        const auto * left_nullable = checkAndGetColumn<ColumnNullable>(left_column);
        const auto * right_nullable = checkAndGetColumn<ColumnNullable>(right_column);

        if (left_nullable && right_nullable)
        {
            int res = left_column.compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint);
            if (res)
                return res;

            /// NULL != NULL case
            if (left_column.isNullAt(lhs_pos))
                return null_direction_hint;

            /// Equal and not NULL. Return here instead of falling through to the final compareAt(),
            /// which would repeat exactly the same comparison.
            return 0;
        }

        if (left_nullable && !right_nullable)
        {
            if (left_column.isNullAt(lhs_pos))
                return null_direction_hint;
            return left_nullable->getNestedColumn().compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint);
        }

        if (!left_nullable && right_nullable)
        {
            if (right_column.isNullAt(rhs_pos))
                return -null_direction_hint;
            return left_column.compareAt(lhs_pos, rhs_pos, right_nullable->getNestedColumn(), null_direction_hint);
        }
    }

    /// !left_nullable && !right_nullable
    return left_column.compareAt(lhs_pos, rhs_pos, right_column, null_direction_hint);
}
2019-09-23 19:36:47 +00:00
/// Builds a two-row block with the structure of `keys`: for each key column, row 0 is taken from
/// the first row of `block` and row 1 from its last row. Columns are looked up in `block` by name.
/// The "min/max" naming presumes `block` is already sorted by these keys.
/// Throws LOGICAL_ERROR if `block` has no rows.
Block extractMinMax(const Block & block, const Block & keys)
{
    const size_t num_rows = block.rows();
    if (!num_rows)
        throw Exception("Unexpected empty block", ErrorCodes::LOGICAL_ERROR);

    const size_t first_row = 0;
    const size_t last_row = num_rows - 1;

    Block min_max = keys.cloneEmpty();
    MutableColumns result_columns = min_max.mutateColumns();

    for (size_t pos = 0; pos != result_columns.size(); ++pos)
    {
        const auto & key_name = keys.getByPosition(pos).name;
        const auto & source = block.getByName(key_name);

        auto & destination = result_columns[pos];
        destination->insertFrom(*source.column, first_row);
        destination->insertFrom(*source.column, last_row);
    }

    min_max.setColumns(std::move(result_columns));
    return min_max;
}
2019-09-19 17:09:59 +00:00
}
2019-09-13 16:17:37 +00:00
/// A pair of row ranges, one in the left block and one in the right block,
/// each described by its start position and its length.
struct MergeJoinEqualRange
{
    size_t left_start = 0;
    size_t right_start = 0;
    size_t left_length = 0;
    size_t right_length = 0;

    /// True only when both ranges have zero length.
    bool empty() const { return left_length == 0 && right_length == 0; }
};

using Range = MergeJoinEqualRange;
class MergeJoinCursor
{
public:
MergeJoinCursor(const Block & block, const SortDescription & desc_)
: impl(SortCursorImpl(block, desc_))
{}
size_t position() const { return impl.pos; }
size_t end() const { return impl.rows; }
2019-09-13 16:17:37 +00:00
bool atEnd() const { return impl.pos >= impl.rows; }
void nextN(size_t num) { impl.pos += num; }
2019-09-19 17:09:59 +00:00
void setCompareNullability(const MergeJoinCursor & rhs)
2019-09-13 16:17:37 +00:00
{
2019-09-19 17:09:59 +00:00
has_nullable_columns = false;
2019-09-13 16:17:37 +00:00
for (size_t i = 0; i < impl.sort_columns_size; ++i)
{
2019-09-19 17:09:59 +00:00
bool is_left_nullable = isColumnNullable(*impl.sort_columns[i]);
bool is_right_nullable = isColumnNullable(*rhs.impl.sort_columns[i]);
if (is_left_nullable || is_right_nullable)
{
has_nullable_columns = true;
2019-09-13 16:17:37 +00:00
break;
2019-09-19 17:09:59 +00:00
}
2019-09-13 16:17:37 +00:00
}
}
2019-09-19 17:09:59 +00:00
Range getNextEqualRange(MergeJoinCursor & rhs)
2019-09-13 16:17:37 +00:00
{
2019-09-19 17:09:59 +00:00
if (has_nullable_columns)
return getNextEqualRangeImpl<true>(rhs);
return getNextEqualRangeImpl<false>(rhs);
2019-09-13 16:17:37 +00:00
}
2019-09-24 13:45:59 +00:00
int intersect(const Block & right_block, const Block & right_table_keys, const Names & key_names)
2019-09-23 19:36:47 +00:00
{
const Block min_max = extractMinMax(right_block, right_table_keys);
if (end() == 0 || min_max.rows() != 2)
throw Exception("Unexpected block size", ErrorCodes::LOGICAL_ERROR);
size_t last_position = end() - 1;
int first_vs_max = 0;
int last_vs_min = 0;
for (size_t i = 0; i < impl.sort_columns.size(); ++i)
{
auto & left_column = *impl.sort_columns[i];
2019-09-24 13:45:59 +00:00
auto & right_column = *min_max.getByName(key_names[i]).column; /// cannot get by position cause of possible duplicates
2019-09-23 19:36:47 +00:00
if (!first_vs_max)
first_vs_max = nullableCompareAt<true>(left_column, right_column, position(), 1);
if (!last_vs_min)
last_vs_min = nullableCompareAt<true>(left_column, right_column, last_position, 0);
}
if (first_vs_max > 0)
return 1;
if (last_vs_min < 0)
return -1;
return 0;
}
2019-09-19 17:09:59 +00:00
private:
SortCursorImpl impl;
bool has_nullable_columns = false;
2019-09-13 16:17:37 +00:00
2019-09-19 17:09:59 +00:00
template <bool has_nulls>
Range getNextEqualRangeImpl(MergeJoinCursor & rhs)
2019-09-13 16:17:37 +00:00
{
while (!atEnd() && !rhs.atEnd())
{
2019-09-19 17:09:59 +00:00
int cmp = compareAt<has_nulls>(rhs, impl.pos, rhs.impl.pos);
2019-09-13 16:17:37 +00:00
if (cmp < 0)
impl.next();
if (cmp > 0)
rhs.impl.next();
if (!cmp)
{
Range range{impl.pos, rhs.impl.pos, 0, 0};
range.left_length = getEqualLength();
range.right_length = rhs.getEqualLength();
return range;
}
}
return Range{impl.pos, rhs.impl.pos, 0, 0};
}
2019-09-19 17:09:59 +00:00
template <bool has_nulls>
int compareAt(const MergeJoinCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
{
int res = 0;
for (size_t i = 0; i < impl.sort_columns_size; ++i)
{
auto * left_column = impl.sort_columns[i];
auto * right_column = rhs.impl.sort_columns[i];
res = nullableCompareAt<has_nulls>(*left_column, *right_column, lhs_pos, rhs_pos);
if (res)
break;
}
return res;
}
size_t getEqualLength()
{
if (atEnd())
return 0;
size_t pos = impl.pos;
while (sameNext(pos))
++pos;
return pos - impl.pos + 1;
}
bool sameNext(size_t lhs_pos) const
{
if (lhs_pos + 1 >= impl.rows)
return false;
for (size_t i = 0; i < impl.sort_columns_size; ++i)
if (impl.sort_columns[i]->compareAt(lhs_pos, lhs_pos + 1, *(impl.sort_columns[i]), 1) != 0)
return false;
return true;
}
2019-09-13 16:17:37 +00:00
};
namespace
{
/// Creates one empty mutable column per column of `block` (same column kinds, no rows)
/// and asks each of them to reserve space for `rows_to_reserve` rows.
MutableColumns makeMutableColumns(const Block & block, size_t rows_to_reserve = 0)
{
    MutableColumns result;
    result.reserve(block.columns());

    for (const auto & elem : block)
    {
        auto empty_column = elem.column->cloneEmpty();
        empty_column->reserve(rows_to_reserve);
        result.push_back(std::move(empty_column));
    }

    return result;
}
void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDescription & merge)
2019-09-13 16:17:37 +00:00
{
NameSet unique_keys;
for (auto & key_name : keys)
{
merge.emplace_back(SortColumnDescription(key_name, 1, 1));
if (!unique_keys.count(key_name))
{
unique_keys.insert(key_name);
sort.emplace_back(SortColumnDescription(key_name, 1, 1));
}
}
2019-09-10 14:51:28 +00:00
}
/// Appends rows [start, start + rows_to_add) of every column of `block` to the column
/// at the same position in `columns`.
void copyLeftRange(const Block & block, MutableColumns & columns, size_t start, size_t rows_to_add)
{
    const size_t num_columns = block.columns();
    for (size_t pos = 0; pos != num_columns; ++pos)
    {
        const auto & source = block.getByPosition(pos).column;
        columns[pos]->insertRangeFrom(*source, start, rows_to_add);
    }
}
/// For every column listed in `right_columns_to_add`, takes the single row `row_position` of the same-named
/// column in `right_block` and appends it `rows_to_add` times to the destination column at the same index.
/// If the destination is Nullable while the source is not, the Nullable-specific insert is used.
void copyRightRange(const Block & right_block, const Block & right_columns_to_add, MutableColumns & columns,
                    size_t row_position, size_t rows_to_add)
{
    const size_t num_columns = right_columns_to_add.columns();
    for (size_t pos = 0; pos != num_columns; ++pos)
    {
        const auto & name = right_columns_to_add.getByPosition(pos).name;
        const auto & source = right_block.getByName(name).column;
        auto & destination = columns[pos];

        auto * nullable_destination = typeid_cast<ColumnNullable *>(destination.get());
        const bool wrap_into_nullable = nullable_destination && !isColumnNullable(*source);

        if (wrap_into_nullable)
            nullable_destination->insertManyFromNotNullable(*source, row_position, rows_to_add);
        else
            destination->insertManyFrom(*source, row_position, rows_to_add);
    }
}
/// ANY LEFT case: only right columns are produced. The first right row of the equal range
/// is repeated once for every left row of the range.
void joinEqualsAnyLeft(const Block & right_block, const Block & right_columns_to_add, MutableColumns & right_columns, const Range & range)
{
    const size_t first_right_row = range.right_start;
    const size_t copies = range.left_length;
    copyRightRange(right_block, right_columns_to_add, right_columns, first_right_row, copies);
}
/// Emits the joined rows for one equal range. For each right row taken (all rows of the range when
/// `is_all`, otherwise only the first one) the whole left range is appended to `left_columns` and
/// that right row is repeated the same number of times in `right_columns`.
void joinEquals(const Block & left_block, const Block & right_block, const Block & right_columns_to_add,
                MutableColumns & left_columns, MutableColumns & right_columns, const Range & range, bool is_all)
{
    const size_t left_rows = range.left_length;
    const size_t right_rows = is_all ? range.right_length : 1;

    for (size_t offset = 0; offset < right_rows; ++offset)
    {
        const size_t right_row = range.right_start + offset;
        copyLeftRange(left_block, left_columns, range.left_start, left_rows);
        copyRightRange(right_block, right_columns_to_add, right_columns, right_row, left_rows);
    }
}
/// Appends `rows_to_add` default values to every column of `right_columns`.
void appendNulls(MutableColumns & right_columns, size_t rows_to_add)
{
    for (size_t pos = 0; pos != right_columns.size(); ++pos)
        right_columns[pos]->insertManyDefaults(rows_to_add);
}
/// Emits left rows [start, end) that found no match: defaults are appended to the right columns and,
/// when `copy_left` is set, the left rows themselves are appended to `left_columns`.
/// Does nothing if the interval is empty or inverted.
void joinInequalsLeft(const Block & left_block, MutableColumns & left_columns, MutableColumns & right_columns,
                      size_t start, size_t end, bool copy_left)
{
    if (start < end)
    {
        const size_t unmatched_rows = end - start;
        if (copy_left)
            copyLeftRange(left_block, left_columns, start, unmatched_rows);
        appendNulls(right_columns, unmatched_rows);
    }
}
}
2019-09-10 14:51:28 +00:00
/// Captures the join settings from `table_join_`, prepares the sample of right columns that will be
/// appended to joined blocks and builds sort/merge descriptions for both sides.
/// Throws NOT_IMPLEMENTED for any join kind other than LEFT or INNER.
MergeJoin::MergeJoin(std::shared_ptr<AnalyzedJoin> table_join_, const Block & right_sample_block)
    : table_join(table_join_)
    , nullable_right_side(table_join->forceNullableRight())
    , is_all(table_join->strictness() == ASTTableJoin::Strictness::All)
    , is_inner(isInner(table_join->kind()))
    , is_left(isLeft(table_join->kind()))
    , skip_not_intersected(table_join->enablePartialMergeJoinOptimizations())
{
    const bool supported_kind = is_left || is_inner;
    if (!supported_kind)
        throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED);

    const auto & right_key_names = table_join->keyNamesRight();
    JoinCommon::extractKeysForJoin(right_key_names, right_sample_block, right_table_keys, right_columns_to_add);

    /// Right key columns that are requested in the result are added to the output sample as well.
    const NameSet required_right_keys = table_join->requiredRightKeys();
    for (const auto & key_column : right_table_keys)
    {
        if (required_right_keys.count(key_column.name) != 0)
            right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, key_column.type, key_column.name});
    }

    JoinCommon::removeLowCardinalityInplace(right_columns_to_add);
    JoinCommon::createMissedColumns(right_columns_to_add);

    if (nullable_right_side)
        JoinCommon::convertColumnsToNullable(right_columns_to_add);

    makeSortAndMerge(table_join->keyNamesLeft(), left_sort_description, left_merge_description);
    makeSortAndMerge(right_key_names, right_sort_description, right_merge_description);
}
void MergeJoin::setTotals(const Block & totals_block)
{
totals = totals_block;
mergeRightBlocks();
2019-09-09 19:43:37 +00:00
}
2019-09-19 14:53:03 +00:00
/// Delegates to JoinCommon::joinTotals() with the stored totals, the sample of right columns
/// and the right key names.
void MergeJoin::joinTotals(Block & block) const
{
    const auto & right_key_names = table_join->keyNamesRight();
    JoinCommon::joinTotals(totals, right_columns_to_add, right_key_names, block);
}
2019-09-13 16:17:37 +00:00
void MergeJoin::mergeRightBlocks()
{
if (right_blocks.empty())
return;
2019-09-13 16:17:37 +00:00
Blocks unsorted_blocks;
unsorted_blocks.reserve(right_blocks.size());
for (const auto & block : right_blocks)
unsorted_blocks.push_back(block);
2019-09-23 19:36:47 +00:00
size_t max_rows_in_block = table_join->maxRowsInRightBlock();
if (!max_rows_in_block)
throw Exception("partial_merge_join_rows_in_right_blocks cannot be zero", ErrorCodes::PARAMETER_OUT_OF_BOUND);
2019-09-13 16:17:37 +00:00
/// TODO: there should be no splitted keys by blocks for RIGHT|FULL JOIN
2019-09-23 19:36:47 +00:00
MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_rows_in_block);
2019-09-13 16:17:37 +00:00
right_blocks.clear();
while (Block block = stream.read())
2019-09-23 19:36:47 +00:00
right_blocks.emplace_back(std::move(block));
2019-09-13 16:17:37 +00:00
}
2019-09-12 18:06:25 +00:00
bool MergeJoin::addJoinedBlock(const Block & src_block)
2019-09-09 19:43:37 +00:00
{
Block block = materializeBlock(src_block);
JoinCommon::removeLowCardinalityInplace(block);
2019-09-12 18:06:25 +00:00
sortBlock(block, right_sort_description);
2019-09-10 14:51:28 +00:00
std::unique_lock lock(rwlock);
right_blocks.push_back(block);
right_blocks_row_count += block.rows();
right_blocks_bytes += block.bytes();
return table_join->sizeLimits().check(right_blocks_row_count, right_blocks_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
2019-09-09 19:43:37 +00:00
}
2019-09-10 14:51:28 +00:00
/// Joins one left block in place against all stored right blocks.
/// The block is checked for key types, materialized, stripped of LowCardinality and sorted by the left
/// sort description; the result is then built under a shared lock by walking the right blocks in order.
/// Afterwards the left columns are replaced (see changeLeftColumns()) and the right columns are appended.
void MergeJoin::joinBlock(Block & block)
{
    JoinCommon::checkTypesOfKeys(block, table_join->keyNamesLeft(), right_table_keys, table_join->keyNamesRight());
    materializeBlockInplace(block);
    JoinCommon::removeLowCardinalityInplace(block);
    sortBlock(block, left_sort_description);

    std::shared_lock lock(rwlock);

    const size_t rows_to_reserve = is_left ? block.rows() : 0;
    const size_t left_rows_to_reserve = is_all ? rows_to_reserve : 0;

    MutableColumns left_columns = makeMutableColumns(block, left_rows_to_reserve);
    MutableColumns right_columns = makeMutableColumns(right_columns_to_add, rows_to_reserve);

    MergeJoinCursor left_cursor(block, left_merge_description);
    size_t left_key_tail = 0;

    /// Only LEFT and INNER are handled; for anything else the block is left as prepared above.
    if (!is_left && !is_inner)
        return;

    for (const Block & right_block : right_blocks)
    {
        if (left_cursor.atEnd())
            break;

        if (skip_not_intersected)
        {
            const int intersection = left_cursor.intersect(right_block, right_table_keys, table_join->keyNamesRight());
            if (intersection < 0)
                break; /// (left) ... (right)
            if (intersection > 0)
                continue; /// (right) ... (left)
        }

        if (is_left)
            leftJoin(left_cursor, block, right_block, left_columns, right_columns, left_key_tail);
        else
            innerJoin(left_cursor, block, right_block, left_columns, right_columns, left_key_tail);
    }

    /// Step over the left rows that were already joined with the end of the last processed right block.
    left_cursor.nextN(left_key_tail);

    /// LEFT JOIN: whatever remains on the left has no match.
    if (is_left)
        joinInequalsLeft(block, left_columns, right_columns, left_cursor.position(), left_cursor.end(), is_all);

    changeLeftColumns(block, std::move(left_columns));
    addRightColumns(block, std::move(right_columns));
}
/// LEFT JOIN of the rest of the left block (from `left_cursor`) with one right block.
/// Unmatched left rows get defaults on the right side; matched ranges are expanded by joinEquals()
/// (ALL) or joinEqualsAnyLeft() (otherwise).
/// `left_key_tail` is both input and output: on entry it is the number of left rows at the cursor that
/// were already joined by the previous call; on exit it is non-zero if the last equal range ended together
/// with the right block in ALL mode, in which case the left cursor is left standing on that range.
void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block,
                         MutableColumns & left_columns, MutableColumns & right_columns, size_t & left_key_tail)
{
    MergeJoinCursor right_cursor(right_block, right_merge_description);
    left_cursor.setCompareNullability(right_cursor);

    for (;;)
    {
        if (left_cursor.atEnd() || right_cursor.atEnd())
            break;

        /// Rows counted by a non-zero left_key_tail already matched the end of the previous right block.
        /// They must not be emitted as unmatched: they are joined again only if they are equal
        /// to the first key of this right block, and skipped otherwise.
        const size_t unmatched_begin = left_cursor.position() + left_key_tail;
        left_key_tail = 0;

        const Range range = left_cursor.getNextEqualRange(right_cursor);

        /// Everything between the previous match and the new one has no pair.
        joinInequalsLeft(left_block, left_columns, right_columns, unmatched_begin, range.left_start, is_all);

        if (range.empty())
            break;

        if (is_all)
            joinEquals(left_block, right_block, right_columns_to_add, left_columns, right_columns, range, is_all);
        else
            joinEqualsAnyLeft(right_block, right_columns_to_add, right_columns, range);

        right_cursor.nextN(range.right_length);

        /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block)
        const bool keep_left_range = is_all && right_cursor.atEnd();
        if (keep_left_range)
        {
            left_key_tail = range.left_length;
            break;
        }

        left_cursor.nextN(range.left_length);
    }
}
2019-09-09 19:43:37 +00:00
2019-09-13 17:23:32 +00:00
/// INNER JOIN of the rest of the left block (from `left_cursor`) with one right block: only equal
/// ranges produce output. `left_key_tail` is set to the length of the last left range when, in ALL mode,
/// that range ended together with the right block; the left cursor is then left standing on it.
void MergeJoin::innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block,
                          MutableColumns & left_columns, MutableColumns & right_columns, size_t & left_key_tail)
{
    MergeJoinCursor right_cursor(right_block, right_merge_description);
    left_cursor.setCompareNullability(right_cursor);

    for (;;)
    {
        if (left_cursor.atEnd() || right_cursor.atEnd())
            break;

        const Range range = left_cursor.getNextEqualRange(right_cursor);
        if (range.empty())
            break;

        joinEquals(left_block, right_block, right_columns_to_add, left_columns, right_columns, range, is_all);
        right_cursor.nextN(range.right_length);

        /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block)
        const bool keep_left_range = is_all && right_cursor.atEnd();
        if (keep_left_range)
        {
            left_key_tail = range.left_length;
            break;
        }

        left_cursor.nextN(range.left_length);
    }
}
/// Replaces the columns of `block` with `columns`, except for a LEFT join that is not ALL:
/// in that case the block keeps its original columns and `columns` is dropped.
void MergeJoin::changeLeftColumns(Block & block, MutableColumns && columns)
{
    const bool keep_original_columns = is_left && !is_all;
    if (!keep_original_columns)
        block.setColumns(std::move(columns));
}
2019-09-12 18:06:25 +00:00
/// Appends the built right columns to `block`, taking the type and the name of each one
/// from the column at the same position in `right_columns_to_add`.
void MergeJoin::addRightColumns(Block & block, MutableColumns && right_columns)
{
    const size_t num_columns = right_columns_to_add.columns();
    for (size_t pos = 0; pos != num_columns; ++pos)
    {
        const auto & sample = right_columns_to_add.getByPosition(pos);
        block.insert(ColumnWithTypeAndName{std::move(right_columns[pos]), sample.type, sample.name});
    }
}
2019-09-12 18:06:25 +00:00
2019-09-09 19:43:37 +00:00
}