mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Merge pull request #39528 from ClickHouse/distinct_sorted_simplify
Use DistinctSorted only when applicable
This commit is contained in:
commit
1f9d247299
@ -114,15 +114,19 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
|
||||
|
||||
if (distinct_sort_desc.size() < columns.size())
|
||||
{
|
||||
pipeline.addSimpleTransform(
|
||||
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
|
||||
{
|
||||
if (stream_type != QueryPipelineBuilder::StreamType::Main)
|
||||
return nullptr;
|
||||
if (DistinctSortedTransform::isApplicable(pipeline.getHeader(), distinct_sort_desc, columns))
|
||||
{
|
||||
pipeline.addSimpleTransform(
|
||||
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
|
||||
{
|
||||
if (stream_type != QueryPipelineBuilder::StreamType::Main)
|
||||
return nullptr;
|
||||
|
||||
return std::make_shared<DistinctSortedTransform>(
|
||||
header, distinct_sort_desc, set_size_limits, limit_hint, columns);
|
||||
});
|
||||
return std::make_shared<DistinctSortedTransform>(
|
||||
header, distinct_sort_desc, set_size_limits, limit_hint, columns);
|
||||
});
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -135,8 +139,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
|
||||
return std::make_shared<DistinctSortedChunkTransform>(
|
||||
header, set_size_limits, limit_hint, distinct_sort_desc, columns, true);
|
||||
});
|
||||
return;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,49 +6,41 @@ namespace DB
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int SET_SIZE_LIMIT_EXCEEDED;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
static void handleAllColumnsConst(Chunk & chunk)
|
||||
/// calculate column positions to use during chunk transformation
|
||||
static void calcColumnPositionsInHeader(const Block& header, const Names & column_names, ColumnNumbers& column_positions, ColumnNumbers& const_column_positions)
|
||||
{
|
||||
const size_t rows = chunk.getNumRows();
|
||||
IColumn::Filter filter(rows);
|
||||
|
||||
Chunk res_chunk;
|
||||
std::fill(filter.begin(), filter.end(), 0);
|
||||
filter[0] = 1;
|
||||
for (const auto & column : chunk.getColumns())
|
||||
res_chunk.addColumn(column->filter(filter, -1));
|
||||
|
||||
chunk = std::move(res_chunk);
|
||||
}
|
||||
|
||||
DistinctSortedTransform::DistinctSortedTransform(
|
||||
Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns)
|
||||
: ISimpleTransform(header_, header_, true)
|
||||
, header(std::move(header_))
|
||||
, description(std::move(sort_description))
|
||||
, column_names(columns)
|
||||
, limit_hint(limit_hint_)
|
||||
, set_size_limits(set_size_limits_)
|
||||
{
|
||||
/// pre-calculate column positions to use during chunk transformation
|
||||
const size_t num_columns = column_names.empty() ? header.columns() : column_names.size();
|
||||
column_positions.clear();
|
||||
column_positions.reserve(num_columns);
|
||||
const_column_positions.clear();
|
||||
const_column_positions.reserve(num_columns);
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
{
|
||||
auto pos = column_names.empty() ? i : header.getPositionByName(column_names[i]);
|
||||
const auto & column = header.getByPosition(pos).column;
|
||||
if (column && !isColumnConst(*column))
|
||||
if (column)
|
||||
{
|
||||
column_positions.emplace_back(pos);
|
||||
all_columns_const = false;
|
||||
if (isColumnConst(*column))
|
||||
const_column_positions.emplace_back(pos);
|
||||
else
|
||||
column_positions.emplace_back(pos);
|
||||
}
|
||||
}
|
||||
column_ptrs.reserve(column_positions.size());
|
||||
}
|
||||
|
||||
/// pre-calculate DISTINCT column positions which form sort prefix of sort description
|
||||
sort_prefix_positions.reserve(description.size());
|
||||
for (const auto & column_sort_descr : description)
|
||||
/// calculate DISTINCT column positions which form sort prefix of sort description
|
||||
static void calcSortPrefixPositionsInHeader(
|
||||
const Block & header,
|
||||
const SortDescription & sort_description,
|
||||
const ColumnNumbers & column_positions,
|
||||
const ColumnNumbers & const_column_positions,
|
||||
ColumnNumbers & sort_prefix_positions)
|
||||
{
|
||||
sort_prefix_positions.reserve(sort_description.size());
|
||||
for (const auto & column_sort_descr : sort_description)
|
||||
{
|
||||
/// check if there is such column in header
|
||||
if (!header.has(column_sort_descr.column_name))
|
||||
@ -57,10 +49,62 @@ DistinctSortedTransform::DistinctSortedTransform(
|
||||
/// check if sorted column position matches any DISTINCT column
|
||||
const auto pos = header.getPositionByName(column_sort_descr.column_name);
|
||||
if (std::find(begin(column_positions), end(column_positions), pos) == column_positions.end())
|
||||
{
|
||||
/// if sorted column found in const columns then we can skip it
|
||||
if (std::find(begin(const_column_positions), end(const_column_positions), pos) != const_column_positions.end())
|
||||
continue;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
sort_prefix_positions.emplace_back(pos);
|
||||
}
|
||||
}
|
||||
|
||||
/// check if distinct sorted is applicable for provided header, sort description and distinct columns
|
||||
bool DistinctSortedTransform::isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names)
|
||||
{
|
||||
if (sort_description.empty())
|
||||
return false;
|
||||
|
||||
ColumnNumbers column_positions;
|
||||
ColumnNumbers const_column_positions;
|
||||
calcColumnPositionsInHeader(header, column_names, column_positions, const_column_positions);
|
||||
if (column_positions.empty())
|
||||
return false;
|
||||
|
||||
/// check if sorted columns matches DISTINCT columns
|
||||
ColumnNumbers sort_prefix_positions;
|
||||
calcSortPrefixPositionsInHeader(header, sort_description, column_positions, const_column_positions, sort_prefix_positions);
|
||||
return !sort_prefix_positions.empty();
|
||||
}
|
||||
|
||||
/// Builds the transform for a stream described by `header` and `sort_description`.
/// `column_names` lists the DISTINCT columns.
/// Throws LOGICAL_ERROR if the sort description is empty, if no non-const DISTINCT
/// column positions are found, or if no sort prefix positions are found.
DistinctSortedTransform::DistinctSortedTransform(
    const Block & header,
    const SortDescription & sort_description,
    const SizeLimits & set_size_limits_,
    const UInt64 limit_hint_,
    const Names & column_names)
    : ISimpleTransform(header, header, true)
    , limit_hint(limit_hint_)
    , set_size_limits(set_size_limits_)
{
    if (sort_description.empty())
    {
        throw Exception(ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: sort description can't be empty");
    }

    /// Positions of DISTINCT columns are resolved once here and reused for every chunk.
    /// Const column positions are only needed during construction, so they stay local.
    ColumnNumbers const_positions;
    calcColumnPositionsInHeader(header, column_names, column_positions, const_positions);
    if (column_positions.empty())
    {
        throw Exception(ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: all columns can't be const. DistinctTransform should be used instead");
    }

    /// Positions of DISTINCT columns which form a sort prefix of the sort description.
    calcSortPrefixPositionsInHeader(header, sort_description, column_positions, const_positions, sort_prefix_positions);
    if (sort_prefix_positions.empty())
    {
        throw Exception(
            ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description");
    }

    /// Reserve space for the per-chunk column pointer vectors up front.
    column_ptrs.reserve(column_positions.size());
    sort_prefix_columns.reserve(sort_prefix_positions.size());
}
|
||||
|
||||
@ -69,14 +113,6 @@ void DistinctSortedTransform::transform(Chunk & chunk)
|
||||
if (unlikely(!chunk.hasRows()))
|
||||
return;
|
||||
|
||||
/// special case - all column constant
|
||||
if (unlikely(all_columns_const))
|
||||
{
|
||||
handleAllColumnsConst(chunk);
|
||||
stopReading();
|
||||
return;
|
||||
}
|
||||
|
||||
/// get DISTINCT columns from chunk
|
||||
column_ptrs.clear();
|
||||
for (const auto pos : column_positions)
|
||||
@ -136,7 +172,7 @@ void DistinctSortedTransform::transform(Chunk & chunk)
|
||||
prev_chunk.chunk = std::move(chunk);
|
||||
prev_chunk.clearing_hint_columns = std::move(sort_prefix_columns);
|
||||
|
||||
size_t all_columns = prev_chunk.chunk.getNumColumns();
|
||||
const size_t all_columns = prev_chunk.chunk.getNumColumns();
|
||||
Chunk res_chunk;
|
||||
for (size_t i = 0; i < all_columns; ++i)
|
||||
res_chunk.addColumn(prev_chunk.chunk.getColumns().at(i)->filter(filter, -1));
|
||||
@ -144,38 +180,40 @@ void DistinctSortedTransform::transform(Chunk & chunk)
|
||||
chunk = std::move(res_chunk);
|
||||
}
|
||||
|
||||
|
||||
template <typename Method>
|
||||
bool DistinctSortedTransform::buildFilter(
|
||||
Method & method,
|
||||
const ColumnRawPtrs & columns,
|
||||
const ColumnRawPtrs & clearing_hint_columns,
|
||||
IColumn::Filter & filter,
|
||||
size_t rows,
|
||||
const size_t rows,
|
||||
ClearableSetVariants & variants) const
|
||||
{
|
||||
typename Method::State state(columns, key_sizes, nullptr);
|
||||
|
||||
/// Compare last row of previous block and first row of current block,
|
||||
/// If rows not equal, we can clear HashSet,
|
||||
/// If clearing_hint_columns is empty, we CAN'T clear HashSet.
|
||||
if (!clearing_hint_columns.empty() && !prev_chunk.clearing_hint_columns.empty()
|
||||
&& !rowsEqual(clearing_hint_columns, 0, prev_chunk.clearing_hint_columns, prev_chunk.chunk.getNumRows() - 1))
|
||||
/// If rows are NOT equal, we can clear HashSet
|
||||
if (!prev_chunk.clearing_hint_columns.empty()) /// it's not first chunk in stream
|
||||
{
|
||||
method.data.clear();
|
||||
if (!rowsEqual(clearing_hint_columns, 0, prev_chunk.clearing_hint_columns, prev_chunk.chunk.getNumRows() - 1))
|
||||
method.data.clear();
|
||||
}
|
||||
|
||||
bool has_new_data = false;
|
||||
for (size_t i = 0; i < rows; ++i)
|
||||
{ /// handle 0-indexed row to avoid index check in loop below
|
||||
const auto emplace_result = state.emplaceKey(method.data, 0, variants.string_pool);
|
||||
if (emplace_result.isInserted())
|
||||
has_new_data = true;
|
||||
filter[0] = emplace_result.isInserted();
|
||||
}
|
||||
for (size_t i = 1; i < rows; ++i)
|
||||
{
|
||||
/// Compare i-th row and i-1-th row,
|
||||
/// If rows are not equal, we can clear HashSet,
|
||||
/// If clearing_hint_columns is empty, we CAN'T clear HashSet.
|
||||
if (i > 0 && !clearing_hint_columns.empty() && !rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1))
|
||||
/// If rows are not equal, we can clear HashSet
|
||||
if (!rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1))
|
||||
method.data.clear();
|
||||
|
||||
auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);
|
||||
|
||||
const auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool);
|
||||
if (emplace_result.isInserted())
|
||||
has_new_data = true;
|
||||
|
||||
@ -192,7 +230,7 @@ bool DistinctSortedTransform::rowsEqual(const ColumnRawPtrs & lhs, size_t n, con
|
||||
{
|
||||
const auto & lhs_column = *lhs[column_index];
|
||||
const auto & rhs_column = *rhs[column_index];
|
||||
if (lhs_column.compareAt(n, m, rhs_column, 0) != 0) /// not equal
|
||||
if (lhs_column.compareAt(n, m, rhs_column, -1) != 0) /// not equal
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -12,6 +12,9 @@ namespace DB
|
||||
/** This class is intended for implementation of SELECT DISTINCT clause and
|
||||
* leaves only unique rows in the stream.
|
||||
*
|
||||
* DistinctSortedTransform::isApplicable() has to be used to check if DistinctSortedTransform can be constructed with particular arguments,
|
||||
* otherwise the constructor can throw a LOGICAL_ERROR exception
|
||||
*
|
||||
* Implementation for case, when input stream has rows for same DISTINCT key or at least its prefix,
|
||||
* grouped together (going consecutively).
|
||||
*
|
||||
@ -24,10 +27,16 @@ class DistinctSortedTransform : public ISimpleTransform
|
||||
public:
|
||||
/// Empty columns_ means all columns.
|
||||
DistinctSortedTransform(
|
||||
Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns);
|
||||
const Block & header,
|
||||
const SortDescription & sort_description,
|
||||
const SizeLimits & set_size_limits_,
|
||||
UInt64 limit_hint_,
|
||||
const Names & column_names);
|
||||
|
||||
String getName() const override { return "DistinctSortedTransform"; }
|
||||
|
||||
static bool isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names);
|
||||
|
||||
protected:
|
||||
void transform(Chunk & chunk) override;
|
||||
|
||||
@ -44,9 +53,6 @@ private:
|
||||
size_t rows,
|
||||
ClearableSetVariants & variants) const;
|
||||
|
||||
Block header;
|
||||
SortDescription description;
|
||||
|
||||
struct PreviousChunk
|
||||
{
|
||||
Chunk chunk;
|
||||
@ -54,11 +60,10 @@ private:
|
||||
};
|
||||
PreviousChunk prev_chunk;
|
||||
|
||||
Names column_names;
|
||||
ColumnNumbers column_positions; /// DISTINCT columns positions in header
|
||||
ColumnNumbers sort_prefix_positions; /// DISTINCT columns positions which form sort prefix of sort description
|
||||
ColumnRawPtrs column_ptrs; /// DISTINCT columns from chunk
|
||||
ColumnRawPtrs sort_prefix_columns; /// DISTINCT columns from chunk which form sort prefix of sort description
|
||||
ColumnRawPtrs sort_prefix_columns; /// DISTINCT columns from chunk which form sort prefix of sort description
|
||||
|
||||
ClearableSetVariants data;
|
||||
Sizes key_sizes;
|
||||
@ -66,7 +71,6 @@ private:
|
||||
|
||||
/// Restrictions on the maximum size of the output data.
|
||||
SizeLimits set_size_limits;
|
||||
bool all_columns_const = true;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include <Processors/Transforms/TTLTransform.h>
|
||||
#include <Processors/Transforms/TTLCalcTransform.h>
|
||||
#include <Processors/Transforms/DistinctSortedTransform.h>
|
||||
#include <Processors/Transforms/DistinctTransform.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -913,8 +914,14 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
|
||||
res_pipe.addTransform(std::move(merged_transform));
|
||||
|
||||
if (global_ctx->deduplicate)
|
||||
res_pipe.addTransform(std::make_shared<DistinctSortedTransform>(
|
||||
res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
|
||||
{
|
||||
if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns))
|
||||
res_pipe.addTransform(std::make_shared<DistinctSortedTransform>(
|
||||
res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
|
||||
else
|
||||
res_pipe.addTransform(std::make_shared<DistinctTransform>(
|
||||
res_pipe.getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
|
||||
}
|
||||
|
||||
if (ctx->need_remove_expired_values)
|
||||
res_pipe.addTransform(std::make_shared<TTLTransform>(
|
||||
|
@ -9,24 +9,27 @@ DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix -> pre-distinct optimization only
|
||||
DistinctTransform
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization
|
||||
-- distinct with primary key prefix and order by column in distinct -> pre-distinct and final distinct optimization
|
||||
DistinctSortedTransform
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization
|
||||
-- distinct with primary key prefix and order by the same columns -> pre-distinct and final distinct optimization
|
||||
DistinctSortedStreamTransform
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization
|
||||
DistinctSortedTransform
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only
|
||||
-- distinct with primary key prefix and order by column _not_ in distinct -> pre-distinct optimization only
|
||||
DistinctTransform
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with non-primary key prefix -> ordinary distinct
|
||||
DistinctTransform
|
||||
DistinctTransform
|
||||
-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only
|
||||
-- distinct with non-primary key prefix and order by column in distinct -> final distinct optimization only
|
||||
DistinctSortedTransform
|
||||
DistinctTransform
|
||||
-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct
|
||||
-- distinct with non-primary key prefix and order by column _not_ in distinct -> ordinary distinct
|
||||
DistinctTransform
|
||||
DistinctTransform
|
||||
-- distinct with non-primary key prefix and order by _const_ column in distinct -> ordinary distinct
|
||||
DistinctTransform
|
||||
DistinctTransform
|
||||
|
@ -27,25 +27,28 @@ $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct *
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix -> pre-distinct optimization only'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization'"
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct -> pre-distinct and final distinct optimization'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization'"
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by the same columns -> pre-distinct and final distinct optimization'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'"
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column _not_ in distinct -> pre-distinct optimization only'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix -> ordinary distinct'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only'"
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by column in distinct -> final distinct optimization only'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct'"
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by column _not_ in distinct -> ordinary distinct'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by a" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by _const_ column in distinct -> ordinary distinct'"
|
||||
$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, 1 as x from distinct_in_order_explain order by x" | eval $FIND_DISTINCT
|
||||
|
||||
$CLICKHOUSE_CLIENT -q "drop table if exists distinct_in_order_explain sync"
|
||||
|
Loading…
Reference in New Issue
Block a user