Merge pull request #39528 from ClickHouse/distinct_sorted_simplify

Use DistinctSorted only when applicable
This commit is contained in:
Anton Popov 2022-08-12 17:13:17 +02:00 committed by GitHub
commit 1f9d247299
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 140 additions and 81 deletions

View File

@ -114,15 +114,19 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
if (distinct_sort_desc.size() < columns.size())
{
pipeline.addSimpleTransform(
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
{
if (stream_type != QueryPipelineBuilder::StreamType::Main)
return nullptr;
if (DistinctSortedTransform::isApplicable(pipeline.getHeader(), distinct_sort_desc, columns))
{
pipeline.addSimpleTransform(
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
{
if (stream_type != QueryPipelineBuilder::StreamType::Main)
return nullptr;
return std::make_shared<DistinctSortedTransform>(
header, distinct_sort_desc, set_size_limits, limit_hint, columns);
});
return std::make_shared<DistinctSortedTransform>(
header, distinct_sort_desc, set_size_limits, limit_hint, columns);
});
return;
}
}
else
{
@ -135,8 +139,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
return std::make_shared<DistinctSortedChunkTransform>(
header, set_size_limits, limit_hint, distinct_sort_desc, columns, true);
});
return;
}
return;
}
}
}

View File

@ -6,49 +6,41 @@ namespace DB
namespace ErrorCodes
{
extern const int SET_SIZE_LIMIT_EXCEEDED;
extern const int LOGICAL_ERROR;
}
static void handleAllColumnsConst(Chunk & chunk)
/// calculate column positions to use during chunk transformation
static void calcColumnPositionsInHeader(const Block& header, const Names & column_names, ColumnNumbers& column_positions, ColumnNumbers& const_column_positions)
{
const size_t rows = chunk.getNumRows();
IColumn::Filter filter(rows);
Chunk res_chunk;
std::fill(filter.begin(), filter.end(), 0);
filter[0] = 1;
for (const auto & column : chunk.getColumns())
res_chunk.addColumn(column->filter(filter, -1));
chunk = std::move(res_chunk);
}
DistinctSortedTransform::DistinctSortedTransform(
Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns)
: ISimpleTransform(header_, header_, true)
, header(std::move(header_))
, description(std::move(sort_description))
, column_names(columns)
, limit_hint(limit_hint_)
, set_size_limits(set_size_limits_)
{
/// pre-calculate column positions to use during chunk transformation
const size_t num_columns = column_names.empty() ? header.columns() : column_names.size();
column_positions.clear();
column_positions.reserve(num_columns);
const_column_positions.clear();
const_column_positions.reserve(num_columns);
for (size_t i = 0; i < num_columns; ++i)
{
auto pos = column_names.empty() ? i : header.getPositionByName(column_names[i]);
const auto & column = header.getByPosition(pos).column;
if (column && !isColumnConst(*column))
if (column)
{
column_positions.emplace_back(pos);
all_columns_const = false;
if (isColumnConst(*column))
const_column_positions.emplace_back(pos);
else
column_positions.emplace_back(pos);
}
}
column_ptrs.reserve(column_positions.size());
}
/// pre-calculate DISTINCT column positions which form sort prefix of sort description
sort_prefix_positions.reserve(description.size());
for (const auto & column_sort_descr : description)
/// calculate DISTINCT column positions which form sort prefix of sort description
static void calcSortPrefixPositionsInHeader(
const Block & header,
const SortDescription & sort_description,
const ColumnNumbers & column_positions,
const ColumnNumbers & const_column_positions,
ColumnNumbers & sort_prefix_positions)
{
sort_prefix_positions.reserve(sort_description.size());
for (const auto & column_sort_descr : sort_description)
{
/// check if there is such column in header
if (!header.has(column_sort_descr.column_name))
@ -57,10 +49,62 @@ DistinctSortedTransform::DistinctSortedTransform(
/// check if sorted column position matches any DISTINCT column
const auto pos = header.getPositionByName(column_sort_descr.column_name);
if (std::find(begin(column_positions), end(column_positions), pos) == column_positions.end())
{
/// if sorted column found in const columns then we can skip it
if (std::find(begin(const_column_positions), end(const_column_positions), pos) != const_column_positions.end())
continue;
break;
}
sort_prefix_positions.emplace_back(pos);
}
}
/// check if distinct sorted is applicable for provided header, sort description and distinct columns
bool DistinctSortedTransform::isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names)
{
if (sort_description.empty())
return false;
ColumnNumbers column_positions;
ColumnNumbers const_column_positions;
calcColumnPositionsInHeader(header, column_names, column_positions, const_column_positions);
if (column_positions.empty())
return false;
/// check if sorted columns matches DISTINCT columns
ColumnNumbers sort_prefix_positions;
calcSortPrefixPositionsInHeader(header, sort_description, column_positions, const_column_positions, sort_prefix_positions);
return !sort_prefix_positions.empty();
}
/// Constructs the transform; input and output headers are the same `header`.
///
/// The arguments are validated in the same order as in isApplicable(), and a LOGICAL_ERROR
/// exception is thrown when:
///  - `sort_description` is empty;
///  - no DISTINCT column position is collected into `column_positions`
///    (the message says that all columns are const);
///  - no sort prefix position is found for the provided sort description.
/// `column_names` selects the DISTINCT columns; `limit_hint_` and `set_size_limits_` are stored as is.
DistinctSortedTransform::DistinctSortedTransform(
const Block & header,
const SortDescription & sort_description,
const SizeLimits & set_size_limits_,
const UInt64 limit_hint_,
const Names & column_names)
: ISimpleTransform(header, header, true)
, limit_hint(limit_hint_)
, set_size_limits(set_size_limits_)
{
/// an empty sort description gives no sort prefix to work with
if (sort_description.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: sort description can't be empty");
/// pre-calculate column positions to use during chunk transformation
/// (const column positions are only needed locally, to compute the sort prefix below)
ColumnNumbers const_column_positions;
calcColumnPositionsInHeader(header, column_names, column_positions, const_column_positions);
if (column_positions.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: all columns can't be const. DistinctTransform should be used instead");
/// pre-calculate DISTINCT column positions which form sort prefix of sort description
calcSortPrefixPositionsInHeader(header, sort_description, column_positions, const_column_positions, sort_prefix_positions);
if (sort_prefix_positions.empty())
throw Exception(
ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description");
/// reserve capacity up front, sized by the number of positions computed above
column_ptrs.reserve(column_positions.size());
sort_prefix_columns.reserve(sort_prefix_positions.size());
}
@ -69,14 +113,6 @@ void DistinctSortedTransform::transform(Chunk & chunk)
if (unlikely(!chunk.hasRows()))
return;
/// special case - all column constant
if (unlikely(all_columns_const))
{
handleAllColumnsConst(chunk);
stopReading();
return;
}
/// get DISTINCT columns from chunk
column_ptrs.clear();
for (const auto pos : column_positions)
@ -136,7 +172,7 @@ void DistinctSortedTransform::transform(Chunk & chunk)
prev_chunk.chunk = std::move(chunk);
prev_chunk.clearing_hint_columns = std::move(sort_prefix_columns);
size_t all_columns = prev_chunk.chunk.getNumColumns();
const size_t all_columns = prev_chunk.chunk.getNumColumns();
Chunk res_chunk;
for (size_t i = 0; i < all_columns; ++i)
res_chunk.addColumn(prev_chunk.chunk.getColumns().at(i)->filter(filter, -1));
@ -144,38 +180,40 @@ void DistinctSortedTransform::transform(Chunk & chunk)
chunk = std::move(res_chunk);
}
template <typename Method>
bool DistinctSortedTransform::buildFilter(
Method & method,
const ColumnRawPtrs & columns,
const ColumnRawPtrs & clearing_hint_columns,
IColumn::Filter & filter,
size_t rows,
const size_t rows,
ClearableSetVariants & variants) const
{
typename Method::State state(columns, key_sizes, nullptr);
/// Compare last row of previous block and first row of current block,
/// If rows not equal, we can clear HashSet,
/// If clearing_hint_columns is empty, we CAN'T clear HashSet.
if (!clearing_hint_columns.empty() && !prev_chunk.clearing_hint_columns.empty()
&& !rowsEqual(clearing_hint_columns, 0, prev_chunk.clearing_hint_columns, prev_chunk.chunk.getNumRows() - 1))
/// If rows are NOT equal, we can clear HashSet
if (!prev_chunk.clearing_hint_columns.empty()) /// it's not first chunk in stream
{
method.data.clear();
if (!rowsEqual(clearing_hint_columns, 0, prev_chunk.clearing_hint_columns, prev_chunk.chunk.getNumRows() - 1))
method.data.clear();
}
bool has_new_data = false;
for (size_t i = 0; i < rows; ++i)
{ /// handle 0-indexed row to avoid index check in loop below
const auto emplace_result = state.emplaceKey(method.data, 0, variants.string_pool);
if (emplace_result.isInserted())
has_new_data = true;
filter[0] = emplace_result.isInserted();
}
for (size_t i = 1; i < rows; ++i)
{
/// Compare i-th row and i-1-th row,
/// If rows are not equal, we can clear HashSet,
/// If clearing_hint_columns is empty, we CAN'T clear HashSet.
if (i > 0 && !clearing_hint_columns.empty() && !rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1))
/// If rows are not equal, we can clear HashSet
if (!rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1))
method.data.clear();
auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);
const auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);
if (emplace_result.isInserted())
has_new_data = true;
@ -192,7 +230,7 @@ bool DistinctSortedTransform::rowsEqual(const ColumnRawPtrs & lhs, size_t n, con
{
const auto & lhs_column = *lhs[column_index];
const auto & rhs_column = *rhs[column_index];
if (lhs_column.compareAt(n, m, rhs_column, 0) != 0) /// not equal
if (lhs_column.compareAt(n, m, rhs_column, -1) != 0) /// not equal
return false;
}
return true;

View File

@ -12,6 +12,9 @@ namespace DB
/** This class is intended for implementation of SELECT DISTINCT clause and
* leaves only unique rows in the stream.
*
* DistinctSortedTransform::isApplicable() must be used to check whether DistinctSortedTransform can be constructed with particular arguments;
* otherwise the constructor may throw a LOGICAL_ERROR exception.
*
* Implementation for case, when input stream has rows for same DISTINCT key or at least its prefix,
* grouped together (going consecutively).
*
@ -24,10 +27,16 @@ class DistinctSortedTransform : public ISimpleTransform
public:
/// Empty column_names means all columns.
DistinctSortedTransform(
Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns);
const Block & header,
const SortDescription & sort_description,
const SizeLimits & set_size_limits_,
UInt64 limit_hint_,
const Names & column_names);
String getName() const override { return "DistinctSortedTransform"; }
static bool isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names);
protected:
void transform(Chunk & chunk) override;
@ -44,9 +53,6 @@ private:
size_t rows,
ClearableSetVariants & variants) const;
Block header;
SortDescription description;
struct PreviousChunk
{
Chunk chunk;
@ -54,11 +60,10 @@ private:
};
PreviousChunk prev_chunk;
Names column_names;
ColumnNumbers column_positions; /// DISTINCT columns positions in header
ColumnNumbers sort_prefix_positions; /// DISTINCT columns positions which form sort prefix of sort description
ColumnRawPtrs column_ptrs; /// DISTINCT columns from chunk
ColumnRawPtrs sort_prefix_columns; /// DISTINCT columns from chunk which form sort prefix of sort description
ColumnRawPtrs sort_prefix_columns; /// DISTINCT columns from chunk which form sort prefix of sort description
ClearableSetVariants data;
Sizes key_sizes;
@ -66,7 +71,6 @@ private:
/// Restrictions on the maximum size of the output data.
SizeLimits set_size_limits;
bool all_columns_const = true;
};
}

View File

@ -28,6 +28,7 @@
#include <Processors/Transforms/TTLTransform.h>
#include <Processors/Transforms/TTLCalcTransform.h>
#include <Processors/Transforms/DistinctSortedTransform.h>
#include <Processors/Transforms/DistinctTransform.h>
namespace DB
{
@ -913,8 +914,14 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
res_pipe.addTransform(std::move(merged_transform));
if (global_ctx->deduplicate)
res_pipe.addTransform(std::make_shared<DistinctSortedTransform>(
res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
{
if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns))
res_pipe.addTransform(std::make_shared<DistinctSortedTransform>(
res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
else
res_pipe.addTransform(std::make_shared<DistinctTransform>(
res_pipe.getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
}
if (ctx->need_remove_expired_values)
res_pipe.addTransform(std::make_shared<TTLTransform>(

View File

@ -9,24 +9,27 @@ DistinctSortedChunkTransform
-- distinct with primary key prefix -> pre-distinct optimization only
DistinctTransform
DistinctSortedChunkTransform
-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization
-- distinct with primary key prefix and order by column in distinct -> pre-distinct and final distinct optimization
DistinctSortedTransform
DistinctSortedChunkTransform
-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization
-- distinct with primary key prefix and order by the same columns -> pre-distinct and final distinct optimization
DistinctSortedStreamTransform
DistinctSortedChunkTransform
-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization
DistinctSortedTransform
DistinctSortedChunkTransform
-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only
-- distinct with primary key prefix and order by column _not_ in distinct -> pre-distinct optimization only
DistinctTransform
DistinctSortedChunkTransform
-- distinct with non-primary key prefix -> ordinary distinct
DistinctTransform
DistinctTransform
-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only
-- distinct with non-primary key prefix and order by column in distinct -> final distinct optimization only
DistinctSortedTransform
DistinctTransform
-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct
-- distinct with non-primary key prefix and order by column _not_ in distinct -> ordinary distinct
DistinctTransform
DistinctTransform
-- distinct with non-primary key prefix and order by _const_ column in distinct -> ordinary distinct
DistinctTransform
DistinctTransform

View File

@ -27,25 +27,28 @@ $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct *
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix -> pre-distinct optimization only'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization'"
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct -> pre-distinct and final distinct optimization'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization'"
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by the same columns -> pre-distinct and final distinct optimization'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'"
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column _not_ in distinct -> pre-distinct optimization only'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix -> ordinary distinct'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only'"
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by column in distinct -> final distinct optimization only'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct'"
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by column _not_ in distinct -> ordinary distinct'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by a" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by _const_ column in distinct -> ordinary distinct'"
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, 1 as x from distinct_in_order_explain order by x" | eval $FIND_DISTINCT
$CLICKHOUSE_CLIENT -q "drop table if exists distinct_in_order_explain sync"