#include <DataStreams/DistinctSortedBlockInputStream.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int SET_SIZE_LIMIT_EXCEEDED;
}

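/** Implementation note: a clearable hash set of DISTINCT keys is kept in `data`.
  * Because the input stream is sorted, the set can be cleared whenever the common
  * prefix of the sort key and the DISTINCT columns (the "clearing hint" columns,
  * see getClearingColumns) changes, so the set only has to hold the distinct keys
  * of the current prefix rather than of the whole stream.
  */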
DistinctSortedBlockInputStream::DistinctSortedBlockInputStream(
    const BlockInputStreamPtr & input, const SizeLimits & set_size_limits, UInt64 limit_hint_, const Names & columns)
    : description(input->getSortDescription())
    , columns_names(columns)
    , limit_hint(limit_hint_)
    , set_size_limits(set_size_limits)
{
    children.push_back(input);
}

Block DistinctSortedBlockInputStream::readImpl()
{
    /// Execute until the end of the stream or until
    /// a block with some new records is obtained.
    for (;;)
    {
        /// Stop reading if we have already reached the limit.
        if (limit_hint && data.getTotalRowCount() >= limit_hint)
            return Block();

        Block block = children.back()->read();
        if (!block)
            return Block();

        const ColumnRawPtrs column_ptrs(getKeyColumns(block));
        if (column_ptrs.empty())
            return block;

        const ColumnRawPtrs clearing_hint_columns(getClearingColumns(block, column_ptrs));

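        /// The concrete set representation is chosen lazily, from the key columns of the
        /// first block that reaches this point (chooseMethod also fills key_sizes).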
        if (data.type == ClearableSetVariants::Type::EMPTY)
            data.init(ClearableSetVariants::chooseMethod(column_ptrs, key_sizes));

        const size_t rows = block.rows();
        IColumn::Filter filter(rows);

        bool has_new_data = false;
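        /// APPLY_FOR_SET_VARIANTS expands M(NAME) once per variant of ClearableSetVariants,
        /// dispatching to the buildFilter instantiation that matches the chosen key representation.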
        switch (data.type)
        {
            case ClearableSetVariants::Type::EMPTY:
                break;
    #define M(NAME) \
            case ClearableSetVariants::Type::NAME: \
                has_new_data = buildFilter(*data.NAME, column_ptrs, clearing_hint_columns, filter, rows, data); \
                break;
        APPLY_FOR_SET_VARIANTS(M)
    #undef M
        }

        /// Just go to the next block if there isn't any new record in the current one.
        if (!has_new_data)
            continue;

        if (!set_size_limits.check(data.getTotalRowCount(), data.getTotalByteCount(), "DISTINCT", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED))
            return {};

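        /// Remember this block and its clearing hint columns: on the next call, buildFilter
        /// compares the first row of the next block against the last row of this one
        /// to decide whether the set can be cleared.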
        prev_block.block = block;
        prev_block.clearing_hint_columns = std::move(clearing_hint_columns);

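        /// Keep only the rows whose key was newly inserted into the set.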
        size_t all_columns = block.columns();
        for (size_t i = 0; i < all_columns; ++i)
            block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->filter(filter, -1);

        return block;
    }
}

template <typename Method>
bool DistinctSortedBlockInputStream::buildFilter(
    Method & method,
    const ColumnRawPtrs & columns,
    const ColumnRawPtrs & clearing_hint_columns,
    IColumn::Filter & filter,
    size_t rows,
    ClearableSetVariants & variants) const
{
    typename Method::State state(columns, key_sizes, nullptr);

    /// Compare the last row of the previous block with the first row of the current block.
    /// If the rows are not equal, we can clear the HashSet.
    /// If clearing_hint_columns is empty, we CANNOT clear the HashSet.
    if (!clearing_hint_columns.empty() && !prev_block.clearing_hint_columns.empty()
        && !rowsEqual(clearing_hint_columns, 0, prev_block.clearing_hint_columns, prev_block.block.rows() - 1))
    {
        method.data.clear();
    }

    bool has_new_data = false;
    for (size_t i = 0; i < rows; ++i)
    {
        /// Compare the i-th row with the (i-1)-th row.
        /// If the rows are not equal, we can clear the HashSet.
        /// If clearing_hint_columns is empty, we CANNOT clear the HashSet.
        if (i > 0 && !clearing_hint_columns.empty() && !rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1))
            method.data.clear();

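        /// Insert the key assembled from row i into the clearable set; emplace_result
        /// reports whether this key is seen for the first time since the set was last cleared.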
        auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);

        if (emplace_result.isInserted())
            has_new_data = true;

        /// Emit the record if there is no such key in the current set yet.
        /// Skip it otherwise.
        filter[i] = emplace_result.isInserted();
    }
    return has_new_data;
}

ColumnRawPtrs DistinctSortedBlockInputStream::getKeyColumns(const Block & block) const
{
    size_t columns = columns_names.empty() ? block.columns() : columns_names.size();

    ColumnRawPtrs column_ptrs;
    column_ptrs.reserve(columns);

    for (size_t i = 0; i < columns; ++i)
    {
        auto & column = columns_names.empty()
            ? block.safeGetByPosition(i).column
            : block.getByName(columns_names[i]).column;

        /// Ignore all constant columns.
        if (!isColumnConst(*column))
            column_ptrs.emplace_back(column.get());
    }

    return column_ptrs;
}

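/// Returns the prefix of the sort columns that are also DISTINCT key columns. The stream is
/// ordered by them, so once their values change, no previously seen key can occur again
/// and the set may be cleared.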
ColumnRawPtrs DistinctSortedBlockInputStream::getClearingColumns(const Block & block, const ColumnRawPtrs & key_columns) const
{
    ColumnRawPtrs clearing_hint_columns;
    clearing_hint_columns.reserve(description.size());
    for (const auto & sort_column_description : description)
    {
        const auto sort_column_ptr = block.safeGetByPosition(sort_column_description.column_number).column.get();
        const auto it = std::find(key_columns.cbegin(), key_columns.cend(), sort_column_ptr);
        if (it != key_columns.cend()) /// if found in key_columns
            clearing_hint_columns.emplace_back(sort_column_ptr);
        else
            return clearing_hint_columns; /// We use the common prefix of the sort description and the requested DISTINCT key.
    }
    return clearing_hint_columns;
}

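/// Compares row n of lhs with row m of rhs across all columns (used on the clearing hint columns).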
bool DistinctSortedBlockInputStream::rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m)
{
    for (size_t column_index = 0, num_columns = lhs.size(); column_index < num_columns; ++column_index)
    {
        const auto & lhs_column = *lhs[column_index];
        const auto & rhs_column = *rhs[column_index];
        if (lhs_column.compareAt(n, m, rhs_column, 0) != 0) /// not equal
            return false;
    }
    return true;
}

}