2019-03-25 16:58:59 +00:00
|
|
|
#include <Processors/Transforms/DistinctTransform.h>
|
2019-03-25 16:37:27 +00:00
|
|
|
|
2019-03-25 16:58:59 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes referenced in this file. The constant is only declared here (extern);
/// its definition is expected to live in another translation unit.
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int SET_SIZE_LIMIT_EXCEEDED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Creates a transform that keeps only rows with previously unseen key values.
///
/// header_          - block structure, passed as both arguments of the ISimpleTransform base.
/// set_size_limits_ - limits checked against the size of the set of seen keys.
/// limit_hint_      - when non-zero, reading is stopped once the set holds at least this many keys.
/// columns_         - names of the key columns; an empty list selects every column of the header.
DistinctTransform::DistinctTransform(
    const Block & header_,
    const SizeLimits & set_size_limits_,
    const UInt64 limit_hint_,
    const Names & columns_)
    : ISimpleTransform(header_, header_, true)
    , limit_hint(limit_hint_)
    , set_size_limits(set_size_limits_)
{
    const bool use_whole_header = columns_.empty();
    const size_t candidate_count = use_whole_header ? header_.columns() : columns_.size();

    /// Upper bound only: some candidates may be rejected below.
    key_columns_pos.reserve(candidate_count);

    for (size_t idx = 0; idx != candidate_count; ++idx)
    {
        const auto position = use_whole_header ? idx : header_.getPositionByName(columns_[idx]);
        const auto & header_column = header_.getByPosition(position).column;

        /// Entries without a column object and constant columns do not become part of the key.
        if (!header_column || isColumnConst(*header_column))
            continue;

        key_columns_pos.emplace_back(position);
    }
}
|
|
|
|
|
|
|
|
template <typename Method>
|
|
|
|
void DistinctTransform::buildFilter(
|
|
|
|
Method & method,
|
|
|
|
const ColumnRawPtrs & columns,
|
|
|
|
IColumn::Filter & filter,
|
2022-07-30 20:25:56 +00:00
|
|
|
const size_t rows,
|
2019-03-25 16:58:59 +00:00
|
|
|
SetVariants & variants) const
|
|
|
|
{
|
|
|
|
typename Method::State state(columns, key_sizes, nullptr);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < rows; ++i)
|
|
|
|
{
|
|
|
|
auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);
|
|
|
|
|
|
|
|
/// Emit the record if there is no such key in the current set yet.
|
|
|
|
/// Skip it otherwise.
|
|
|
|
filter[i] = emplace_result.isInserted();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Filters `chunk` in place so that it only carries rows whose key has not been seen before.
///
/// The columns are detached from the chunk first and are set back only when there is
/// something to emit.
/// NOTE(review): the early returns after detachColumns() leave the chunk without its
/// columns - presumably an empty chunk that is dropped downstream; confirm.
void DistinctTransform::transform(Chunk & chunk)
{
    if (unlikely(!chunk.hasRows()))
        return;

    /// SetVariant is not implemented for sparse columns, so materialize them first.
    convertToFullIfSparse(chunk);

    const auto num_rows = chunk.getNumRows();
    auto columns = chunk.detachColumns();

    /// Special case: there are no key columns (only constants), so a single row is the whole result.
    if (unlikely(key_columns_pos.empty()))
    {
        for (auto & column : columns)
            column = column->cut(0, 1);

        chunk.setColumns(std::move(columns), 1);
        stopReading();
        return;
    }

    /// Raw pointers to the key columns, in the order of key_columns_pos.
    ColumnRawPtrs column_ptrs;
    column_ptrs.reserve(key_columns_pos.size());
    for (auto pos : key_columns_pos)
        column_ptrs.emplace_back(columns[pos].get());

    /// The set variant is chosen on first use, based on the key columns of that chunk.
    if (data.empty())
        data.init(SetVariants::chooseMethod(column_ptrs, key_sizes));

    const auto old_set_size = data.getTotalRowCount();
    IColumn::Filter filter(num_rows);

    switch (data.type)
    {
        case SetVariants::Type::EMPTY:
            break;
#define M(NAME) \
        case SetVariants::Type::NAME: \
            buildFilter(*data.NAME, column_ptrs, filter, num_rows, data); \
            break;
        APPLY_FOR_SET_VARIANTS(M)
#undef M
    }

    /// Nothing modifies the set past this point, so its size is read once and reused.
    const auto new_set_size = data.getTotalRowCount();

    /// No new keys in this chunk: just go to the next one.
    if (new_set_size == old_set_size)
        return;

    if (!set_size_limits.check(new_set_size, data.getTotalByteCount(), "DISTINCT", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED))
        return;

    for (auto & column : columns)
        column = column->filter(filter, -1);

    /// The number of emitted rows equals the number of keys added by this chunk.
    chunk.setColumns(std::move(columns), new_set_size - old_set_size);

    /// Stop reading once the limit hint has been reached.
    if (limit_hint && new_set_size >= limit_hint)
        stopReading();
}
|
|
|
|
|
|
|
|
}
|