2019-09-09 19:43:37 +00:00
|
|
|
#include <Core/NamesAndTypes.h>
|
2019-09-13 16:17:37 +00:00
|
|
|
#include <Core/SortCursor.h>
|
2019-09-09 19:43:37 +00:00
|
|
|
#include <Interpreters/MergeJoin.h>
|
|
|
|
#include <Interpreters/AnalyzedJoin.h>
|
2019-09-12 18:06:25 +00:00
|
|
|
#include <Interpreters/sortBlock.h>
|
2019-09-09 19:43:37 +00:00
|
|
|
#include <DataStreams/materializeBlock.h>
|
2019-09-12 18:06:25 +00:00
|
|
|
#include <DataStreams/MergeSortingBlockInputStream.h>
|
2019-09-09 19:43:37 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2019-09-10 14:51:28 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int SET_SIZE_LIMIT_EXCEEDED;
|
2019-09-13 16:17:37 +00:00
|
|
|
extern const int NOT_IMPLEMENTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// A pair of row ranges, one in the left block and one in the right block,
/// described by their starting positions and lengths.
struct MergeJoinEqualRange
{
    size_t left_start{0};
    size_t right_start{0};
    size_t left_length{0};
    size_t right_length{0};

    /// True when neither side contributes any rows.
    bool empty() const { return left_length == 0 && right_length == 0; }
};

using Range = MergeJoinEqualRange;
|
|
|
|
|
|
|
|
|
|
|
|
class MergeJoinCursor
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
MergeJoinCursor(const Block & block, const SortDescription & desc_)
|
|
|
|
: impl(SortCursorImpl(block, desc_))
|
|
|
|
{}
|
|
|
|
|
|
|
|
size_t position() const { return impl.pos; }
|
2019-09-16 19:31:22 +00:00
|
|
|
size_t end() const { return impl.rows; }
|
2019-09-13 16:17:37 +00:00
|
|
|
bool atEnd() const { return impl.pos >= impl.rows; }
|
|
|
|
void nextN(size_t num) { impl.pos += num; }
|
|
|
|
|
|
|
|
int compareAt(const MergeJoinCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
|
|
|
|
{
|
|
|
|
int res = 0;
|
|
|
|
for (size_t i = 0; i < impl.sort_columns_size; ++i)
|
|
|
|
{
|
|
|
|
res = impl.sort_columns[i]->compareAt(lhs_pos, rhs_pos, *(rhs.impl.sort_columns[i]), 1);
|
|
|
|
if (res)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool sameNext(size_t lhs_pos) const
|
|
|
|
{
|
|
|
|
if (impl.isLast())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < impl.sort_columns_size; ++i)
|
|
|
|
if (impl.sort_columns[i]->compareAt(lhs_pos, lhs_pos + 1, *(impl.sort_columns[i]), 1) != 0)
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t getEqualLength()
|
|
|
|
{
|
|
|
|
if (atEnd())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
size_t pos = impl.pos;
|
|
|
|
for (; pos < impl.rows; ++pos)
|
|
|
|
if (!sameNext(pos))
|
|
|
|
break;
|
|
|
|
|
|
|
|
return pos - impl.pos + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Range getNextEqualRange(MergeJoinCursor & rhs)
|
|
|
|
{
|
|
|
|
while (!atEnd() && !rhs.atEnd())
|
|
|
|
{
|
|
|
|
int cmp = compareAt(rhs, impl.pos, rhs.impl.pos);
|
|
|
|
if (cmp < 0)
|
|
|
|
impl.next();
|
|
|
|
if (cmp > 0)
|
|
|
|
rhs.impl.next();
|
|
|
|
if (!cmp)
|
|
|
|
{
|
|
|
|
Range range{impl.pos, rhs.impl.pos, 0, 0};
|
|
|
|
range.left_length = getEqualLength();
|
|
|
|
range.right_length = rhs.getEqualLength();
|
|
|
|
return range;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Range{impl.pos, rhs.impl.pos, 0, 0};
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
SortCursorImpl impl;
|
|
|
|
};
|
|
|
|
|
2019-09-16 19:31:22 +00:00
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Build one empty mutable column per column of `block`, obtained via cloneEmpty()
/// on each source column and returned in the same order.
MutableColumns makeMutableColumns(const Block & block)
{
    const size_t num_columns = block.columns();

    MutableColumns result;
    result.reserve(num_columns);

    for (size_t pos = 0; pos < num_columns; ++pos)
        result.push_back(block.getByPosition(pos).column->cloneEmpty());

    return result;
}
|
|
|
|
|
|
|
|
void makeSortAndMerge(const Names & keys, SortDescription & sort, SortDescription & merge)
|
2019-09-13 16:17:37 +00:00
|
|
|
{
|
|
|
|
NameSet unique_keys;
|
|
|
|
for (auto & key_name : keys)
|
|
|
|
{
|
|
|
|
merge.emplace_back(SortColumnDescription(key_name, 1, 1));
|
|
|
|
|
|
|
|
if (!unique_keys.count(key_name))
|
|
|
|
{
|
|
|
|
unique_keys.insert(key_name);
|
|
|
|
sort.emplace_back(SortColumnDescription(key_name, 1, 1));
|
|
|
|
}
|
|
|
|
}
|
2019-09-10 14:51:28 +00:00
|
|
|
}
|
|
|
|
|
2019-09-16 19:31:22 +00:00
|
|
|
void copyLeftRange(const Block & block, MutableColumns & columns, size_t start, size_t rows_to_add)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < block.columns(); ++i)
|
|
|
|
{
|
|
|
|
const auto & src_column = block.getByPosition(i);
|
|
|
|
auto & dst_column = columns[i];
|
|
|
|
|
|
|
|
size_t row_pos = start;
|
|
|
|
for (size_t row = 0; row < rows_to_add; ++row, ++row_pos)
|
|
|
|
dst_column->insertFrom(*src_column.column, row_pos);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void copyRightRange(const Block & right_block, const Block & right_columns_to_add, MutableColumns & columns,
|
|
|
|
size_t row_position, size_t rows_to_add)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < right_columns_to_add.columns(); ++i)
|
|
|
|
{
|
|
|
|
const auto & src_column = right_block.getByName(right_columns_to_add.getByPosition(i).name);
|
|
|
|
auto & dst_column = columns[i];
|
|
|
|
|
|
|
|
for (size_t row = 0; row < rows_to_add; ++row)
|
|
|
|
dst_column->insertFrom(*src_column.column, row_position);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void joinEqualsAnyLeft(const Block & right_block, const Block & right_columns_to_add, MutableColumns & right_columns, const Range & range)
|
|
|
|
{
|
|
|
|
copyRightRange(right_block, right_columns_to_add, right_columns, range.right_start, range.left_length);
|
|
|
|
}
|
|
|
|
|
|
|
|
void joinEquals(const Block & left_block, const Block & right_block, const Block & right_columns_to_add,
|
|
|
|
MutableColumns & left_columns, MutableColumns & right_columns, const Range & range, bool is_all)
|
|
|
|
{
|
|
|
|
size_t left_rows_to_add = range.left_length;
|
|
|
|
size_t right_rows_to_add = is_all ? range.right_length : 1;
|
|
|
|
|
|
|
|
size_t row_position = range.right_start;
|
|
|
|
for (size_t right_row = 0; right_row < right_rows_to_add; ++right_row, ++row_position)
|
|
|
|
{
|
|
|
|
copyLeftRange(left_block, left_columns, range.left_start, left_rows_to_add);
|
|
|
|
copyRightRange(right_block, right_columns_to_add, right_columns, row_position, left_rows_to_add);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Append `rows_to_add` default values to every column of `right_columns`.
/// NOTE(review): the name suggests these defaults are NULLs, which presumably holds only
/// when the right columns are Nullable - confirm against the callers.
void appendNulls(MutableColumns & right_columns, size_t rows_to_add)
{
    for (auto & column : right_columns)
    {
        for (size_t remaining = rows_to_add; remaining != 0; --remaining)
            column->insertDefault();
    }
}
|
|
|
|
|
|
|
|
void joinInequalsLeft(const Block & left_block, MutableColumns & left_columns, MutableColumns & right_columns,
|
|
|
|
size_t start, size_t end, bool copy_left)
|
|
|
|
{
|
|
|
|
if (end <= start)
|
|
|
|
return;
|
|
|
|
|
|
|
|
size_t rows_to_add = end - start;
|
|
|
|
if (copy_left)
|
|
|
|
copyLeftRange(left_block, left_columns, start, rows_to_add);
|
|
|
|
appendNulls(right_columns, rows_to_add);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2019-09-10 14:51:28 +00:00
|
|
|
|
2019-09-16 12:37:46 +00:00
|
|
|
/// Prepare the join: validate the join kind, split the right sample block into key columns and
/// columns to add, and build the sort/merge descriptions for both sides.
/// Throws NOT_IMPLEMENTED for any kind other than LEFT or INNER.
MergeJoin::MergeJoin(std::shared_ptr<AnalyzedJoin> table_join_, const Block & right_sample_block)
    : table_join(table_join_)
    , nullable_right_side(table_join->forceNullabelRight())
    , is_all(table_join->strictness() == ASTTableJoin::Strictness::All)
    , is_inner(isInner(table_join->kind()))
    , is_left(isLeft(table_join->kind()))
{
    const auto kind = table_join->kind();
    if (!(isLeft(kind) || isInner(kind)))
        throw Exception("Partial merge supported for LEFT and INNER JOINs only", ErrorCodes::NOT_IMPLEMENTED);

    JoinCommon::extractKeysForJoin(table_join->keyNamesRight(), right_sample_block, right_table_keys, right_columns_to_add);

    /// Right key columns that are required in the result are added back to the output set.
    const NameSet required_right_keys = table_join->requiredRightKeys();
    for (const auto & key_column : right_table_keys)
    {
        if (required_right_keys.count(key_column.name))
            right_columns_to_add.insert(ColumnWithTypeAndName{nullptr, key_column.type, key_column.name});
    }

    JoinCommon::createMissedColumns(right_columns_to_add);

    if (nullable_right_side)
        JoinCommon::convertColumnsToNullable(right_columns_to_add);

    makeSortAndMerge(table_join->keyNamesLeft(), left_sort_description, left_merge_description);
    makeSortAndMerge(table_join->keyNamesRight(), right_sort_description, right_merge_description);
}
|
|
|
|
|
|
|
|
void MergeJoin::setTotals(const Block & totals_block)
|
|
|
|
{
|
|
|
|
totals = totals_block;
|
|
|
|
mergeRightBlocks();
|
2019-09-09 19:43:37 +00:00
|
|
|
}
|
|
|
|
|
2019-09-13 16:17:37 +00:00
|
|
|
void MergeJoin::mergeRightBlocks()
|
|
|
|
{
|
|
|
|
const size_t max_merged_block_size = 128 * 1024 * 1024;
|
|
|
|
|
2019-09-16 19:31:22 +00:00
|
|
|
if (right_blocks.empty())
|
|
|
|
return;
|
|
|
|
|
2019-09-13 16:17:37 +00:00
|
|
|
Blocks unsorted_blocks;
|
|
|
|
unsorted_blocks.reserve(right_blocks.size());
|
|
|
|
for (const auto & block : right_blocks)
|
|
|
|
unsorted_blocks.push_back(block);
|
|
|
|
|
|
|
|
/// TODO: there should be no splitted keys by blocks for RIGHT|FULL JOIN
|
|
|
|
MergeSortingBlocksBlockInputStream stream(unsorted_blocks, right_sort_description, max_merged_block_size);
|
|
|
|
|
|
|
|
right_blocks.clear();
|
|
|
|
while (Block block = stream.read())
|
|
|
|
right_blocks.push_back(block);
|
|
|
|
}
|
|
|
|
|
2019-09-12 18:06:25 +00:00
|
|
|
bool MergeJoin::addJoinedBlock(const Block & src_block)
|
2019-09-09 19:43:37 +00:00
|
|
|
{
|
2019-09-12 18:06:25 +00:00
|
|
|
Block block = src_block;
|
|
|
|
sortBlock(block, right_sort_description);
|
|
|
|
|
2019-09-10 14:51:28 +00:00
|
|
|
std::unique_lock lock(rwlock);
|
|
|
|
|
|
|
|
right_blocks.push_back(block);
|
|
|
|
right_blocks_row_count += block.rows();
|
|
|
|
right_blocks_bytes += block.bytes();
|
|
|
|
|
2019-09-16 12:37:46 +00:00
|
|
|
return table_join->sizeLimits().check(right_blocks_row_count, right_blocks_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
|
2019-09-09 19:43:37 +00:00
|
|
|
}
|
|
|
|
|
2019-09-10 14:51:28 +00:00
|
|
|
/// Join the left `block` in place against all stored right blocks.
/// The block is first sorted by the left sort description; the stored right blocks are then
/// visited in order with a single left cursor that keeps its position between right blocks.
/// For LEFT joins the left rows remaining after the last right block are emitted as unmatched.
void MergeJoin::joinBlock(Block & block)
{
    JoinCommon::checkTypesOfKeys(block, table_join->keyNamesLeft(), right_table_keys, table_join->keyNamesRight());
    sortBlock(block, left_sort_description);

    std::shared_lock lock(rwlock);

    MutableColumns left_columns = makeMutableColumns(block);
    MutableColumns right_columns = makeMutableColumns(right_columns_to_add);
    MergeJoinCursor left_cursor(block, left_merge_description);

    /// Only LEFT and INNER are handled; for anything else the block is left as it is.
    if (!is_left && !is_inner)
        return;

    for (const auto & right_block : right_blocks)
    {
        if (left_cursor.atEnd())
            break;

        if (is_left)
            leftJoin(left_cursor, block, right_block, left_columns, right_columns);
        else
            innerJoin(left_cursor, block, right_block, left_columns, right_columns);
    }

    /// Left rows that were not consumed by any right block have no match.
    if (is_left)
        joinInequalsLeft(block, left_columns, right_columns, left_cursor.position(), left_cursor.end(), is_all);

    changeLeftColumns(block, std::move(left_columns));
    addRightColumns(block, std::move(right_columns));
}
|
|
|
|
|
2019-09-16 19:31:22 +00:00
|
|
|
/// LEFT JOIN of the rows under `left_cursor` against one right block.
/// Left rows skipped while searching for the next equal range are emitted as unmatched;
/// equal ranges are emitted via joinEquals (ALL) or joinEqualsAnyLeft (otherwise).
/// The left cursor is advanced in place so that the caller can continue with the next right block.
void MergeJoin::leftJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block,
                         MutableColumns & left_columns, MutableColumns & right_columns)
{
    MergeJoinCursor right_cursor(right_block, right_merge_description);

    while (!left_cursor.atEnd() && !right_cursor.atEnd())
    {
        /// Remember where the search started: rows in [unmatched_begin, range.left_start) have no match.
        const size_t unmatched_begin = left_cursor.position();
        const Range range = left_cursor.getNextEqualRange(right_cursor);

        joinInequalsLeft(left_block, left_columns, right_columns, unmatched_begin, range.left_start, is_all);

        if (range.empty())
            break;

        if (!is_all)
            joinEqualsAnyLeft(right_block, right_columns_to_add, right_columns, range);
        else
            joinEquals(left_block, right_block, right_columns_to_add, left_columns, right_columns, range, is_all);

        right_cursor.nextN(range.right_length);

        /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block)
        /// NOTE(review): since the left cursor stays on the already joined range here, it looks like
        /// those rows could be emitted again as unmatched if the next right block does not contain
        /// the same key - confirm.
        if (is_all && right_cursor.atEnd())
            break;

        left_cursor.nextN(range.left_length);
    }
}
|
2019-09-09 19:43:37 +00:00
|
|
|
|
2019-09-13 17:23:32 +00:00
|
|
|
/// INNER JOIN of the rows under `left_cursor` against one right block.
/// Only equal ranges produce output; rows skipped on either side are dropped.
/// The left cursor is advanced in place so that the caller can continue with the next right block.
void MergeJoin::innerJoin(MergeJoinCursor & left_cursor, const Block & left_block, const Block & right_block,
                          MutableColumns & left_columns, MutableColumns & right_columns)
{
    MergeJoinCursor right_cursor(right_block, right_merge_description);

    while (!left_cursor.atEnd() && !right_cursor.atEnd())
    {
        const Range equal_range = left_cursor.getNextEqualRange(right_cursor);
        if (equal_range.empty())
            break;

        joinEquals(left_block, right_block, right_columns_to_add, left_columns, right_columns, equal_range, is_all);

        right_cursor.nextN(equal_range.right_length);

        /// Do not run over last left keys for ALL JOIN (cause of possible duplicates in next right block)
        const bool right_exhausted = right_cursor.atEnd();
        if (is_all && right_exhausted)
            break;

        left_cursor.nextN(equal_range.left_length);
    }
}
|
|
|
|
|
2019-09-16 19:31:22 +00:00
|
|
|
/// Replace the columns of `block` with the rebuilt left columns.
/// For a LEFT join that is not ALL the block keeps its original columns and `columns` is ignored.
void MergeJoin::changeLeftColumns(Block & block, MutableColumns && columns)
{
    const bool keep_original_columns = is_left && !is_all;

    if (!keep_original_columns)
        block.setColumns(std::move(columns));
}
|
2019-09-12 18:06:25 +00:00
|
|
|
|
2019-09-16 19:31:22 +00:00
|
|
|
/// Insert the collected right columns into `block`, taking the type and name of each one
/// from the entry at the same position in `right_columns_to_add`.
void MergeJoin::addRightColumns(Block & block, MutableColumns && right_columns)
{
    const size_t num_columns = right_columns_to_add.columns();

    for (size_t pos = 0; pos < num_columns; ++pos)
    {
        const auto & sample = right_columns_to_add.getByPosition(pos);
        block.insert(ColumnWithTypeAndName{std::move(right_columns[pos]), sample.type, sample.name});
    }
}
|
2019-09-12 18:06:25 +00:00
|
|
|
|
2019-09-09 19:43:37 +00:00
|
|
|
}
|