From eacc57b6a49f714bf12d160967c4098260747549 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sun, 24 Jul 2022 14:26:25 +0000 Subject: [PATCH 1/6] Simplify DistinctSorted + use ordinary Distinct for deduplication in MergeTree when no sorting provided --- src/Storages/MergeTree/MergeTask.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index dc468174dfa..424d1a20a0c 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace DB { @@ -895,8 +896,14 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() res_pipe.addTransform(std::move(merged_transform)); if (global_ctx->deduplicate) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + { + if (!sort_description.empty()) + res_pipe.addTransform(std::make_shared( + res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + else + res_pipe.addTransform(std::make_shared( + res_pipe.getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + } if (ctx->need_remove_expired_values) res_pipe.addTransform(std::make_shared( From 7b0b38e997e424f95e306c34684549415a02d3ac Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 29 Jul 2022 19:42:22 +0000 Subject: [PATCH 2/6] DistinctSortedTransform works only if columns contains sort prefix of sort description --- .../Transforms/DistinctSortedTransform.cpp | 84 +++++++++++++------ .../Transforms/DistinctSortedTransform.h | 12 ++- src/Storages/MergeTree/MergeTask.cpp | 2 +- 3 files changed, 70 insertions(+), 28 deletions(-) diff --git a/src/Processors/Transforms/DistinctSortedTransform.cpp b/src/Processors/Transforms/DistinctSortedTransform.cpp index 3762504fda5..8227b6d581a 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedTransform.cpp @@ -6,19 +6,14 @@ namespace DB namespace ErrorCodes { extern const int SET_SIZE_LIMIT_EXCEEDED; + extern const int LOGICAL_ERROR; } -DistinctSortedTransform::DistinctSortedTransform( - Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns) - : ISimpleTransform(header_, header_, true) - , header(std::move(header_)) - , description(std::move(sort_description)) - , column_names(columns) - , limit_hint(limit_hint_) - , set_size_limits(set_size_limits_) +static void calcColumnPositionsInHeader(const Block& header, const Names & column_names, ColumnNumbers& column_positions) { /// pre-calculate column positions to use during chunk transformation const size_t num_columns = column_names.empty() ? header.columns() : column_names.size(); + column_positions.clear(); column_positions.reserve(num_columns); for (size_t i = 0; i < num_columns; ++i) { @@ -27,11 +22,46 @@ DistinctSortedTransform::DistinctSortedTransform( if (col && !isColumnConst(*col)) column_positions.emplace_back(pos); } +} + +bool DistinctSortedTransform::isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names) +{ + if (sort_description.empty()) + return false; + + /// check if first sorted column in header + const SortColumnDescription & column_sort_descr = sort_description.front(); + if (!header.has(column_sort_descr.column_name)) + return false; + + ColumnNumbers column_positions; + calcColumnPositionsInHeader(header, column_names, column_positions); + + /// check if sorted column matches any DISTINCT column + const auto pos = header.getPositionByName(column_sort_descr.column_name); + return std::find(begin(column_positions), end(column_positions), pos) != column_positions.end(); +} + +DistinctSortedTransform::DistinctSortedTransform( + const Block & header_, + const SortDescription & sort_description_, + const SizeLimits & set_size_limits_, + UInt64 limit_hint_, + const Names & columns) + : ISimpleTransform(header_, header_, true) + , header(header_) + , sort_description(sort_description_) + , column_names(columns) + , limit_hint(limit_hint_) + , set_size_limits(set_size_limits_) +{ + /// pre-calculate column positions to use during chunk transformation + calcColumnPositionsInHeader(header, column_names, column_positions); column_ptrs.reserve(column_positions.size()); /// pre-calculate DISTINCT column positions which form sort prefix of sort description - sort_prefix_positions.reserve(description.size()); - for (const auto & column_sort_descr : description) + sort_prefix_positions.reserve(sort_description.size()); + for (const auto & column_sort_descr : sort_description) { /// check if there is such column in header if (!header.has(column_sort_descr.column_name)) @@ -44,6 +74,10 @@ DistinctSortedTransform::DistinctSortedTransform( sort_prefix_positions.emplace_back(pos); } + if (sort_prefix_positions.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description"); + sort_prefix_columns.reserve(sort_prefix_positions.size()); } @@ -111,7 +145,7 @@ void DistinctSortedTransform::transform(Chunk & chunk) prev_chunk.chunk = std::move(chunk); prev_chunk.clearing_hint_columns = std::move(sort_prefix_columns); - size_t all_columns = prev_chunk.chunk.getNumColumns(); + const size_t all_columns = prev_chunk.chunk.getNumColumns(); Chunk res_chunk; for (size_t i = 0; i < all_columns; ++i) res_chunk.addColumn(prev_chunk.chunk.getColumns().at(i)->filter(filter, -1)); @@ -119,38 +153,40 @@ void DistinctSortedTransform::transform(Chunk & chunk) chunk = std::move(res_chunk); } - template bool DistinctSortedTransform::buildFilter( Method & method, const ColumnRawPtrs & columns, const ColumnRawPtrs & clearing_hint_columns, IColumn::Filter & filter, - size_t rows, + const size_t rows, ClearableSetVariants & variants) const { typename Method::State state(columns, key_sizes, nullptr); /// Compare last row of previous block and first row of current block, - /// If rows not equal, we can clear HashSet, - /// If clearing_hint_columns is empty, we CAN'T clear HashSet. - if (!clearing_hint_columns.empty() && !prev_chunk.clearing_hint_columns.empty() - && !rowsEqual(clearing_hint_columns, 0, prev_chunk.clearing_hint_columns, prev_chunk.chunk.getNumRows() - 1)) + /// If rows are NOT equal, we can clear HashSet + if (!prev_chunk.clearing_hint_columns.empty()) /// it's not first chunk in stream { - method.data.clear(); + if (!rowsEqual(clearing_hint_columns, 0, prev_chunk.clearing_hint_columns, prev_chunk.chunk.getNumRows() - 1)) + method.data.clear(); } bool has_new_data = false; - for (size_t i = 0; i < rows; ++i) + { /// handle 0-indexed row to avoid index check in loop below + const auto emplace_result = state.emplaceKey(method.data, 0, variants.string_pool); + if (emplace_result.isInserted()) + has_new_data = true; + filter[0] = emplace_result.isInserted(); + } + for (size_t i = 1; i < rows; ++i) { /// Compare i-th row and i-1-th row, - /// If rows are not equal, we can clear HashSet, - /// If clearing_hint_columns is empty, we CAN'T clear HashSet. - if (i > 0 && !clearing_hint_columns.empty() && !rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1)) + /// If rows are not equal, we can clear HashSet + if (!rowsEqual(clearing_hint_columns, i, clearing_hint_columns, i - 1)) method.data.clear(); - auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool); - + const auto emplace_result = state.emplaceKey(method.data, i, variants.string_pool); if (emplace_result.isInserted()) has_new_data = true; diff --git a/src/Processors/Transforms/DistinctSortedTransform.h b/src/Processors/Transforms/DistinctSortedTransform.h index 2fe40408683..86e4f6f6ed7 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.h +++ b/src/Processors/Transforms/DistinctSortedTransform.h @@ -24,10 +24,16 @@ class DistinctSortedTransform : public ISimpleTransform public: /// Empty columns_ means all columns. DistinctSortedTransform( - Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns); + const Block & header_, + const SortDescription & sort_description, + const SizeLimits & set_size_limits_, + UInt64 limit_hint_, + const Names & columns); String getName() const override { return "DistinctSortedTransform"; } + static bool isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names); + protected: void transform(Chunk & chunk) override; @@ -45,7 +51,7 @@ private: ClearableSetVariants & variants) const; Block header; - SortDescription description; + SortDescription sort_description; struct PreviousChunk { @@ -58,7 +64,7 @@ private: ColumnNumbers column_positions; /// DISTINCT columns positions in header ColumnNumbers sort_prefix_positions; /// DISTINCT columns positions which form sort prefix of sort description ColumnRawPtrs column_ptrs; /// DISTINCT columns from chunk - ColumnRawPtrs sort_prefix_columns; /// DISTINCT columns from chunk which form sort prefix of sort description + ColumnRawPtrs sort_prefix_columns; /// DISTINCT columns from chunk which form sort prefix of sort description ClearableSetVariants data; Sizes key_sizes; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 8c6d47378a7..04945e20b65 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -916,7 +916,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() if (global_ctx->deduplicate) { - if (!sort_description.empty()) + if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns)) res_pipe.addTransform(std::make_shared( res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); else From 13dc1697fb045fd88ebd8ba690585598ef152d2d Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 29 Jul 2022 20:34:23 +0000 Subject: [PATCH 3/6] Remove unnecessary initialization --- src/Processors/Transforms/DistinctSortedTransform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Transforms/DistinctSortedTransform.h b/src/Processors/Transforms/DistinctSortedTransform.h index 05beae647bd..b415657d238 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.h +++ b/src/Processors/Transforms/DistinctSortedTransform.h @@ -72,7 +72,7 @@ private: /// Restrictions on the maximum size of the output data. SizeLimits set_size_limits; - bool all_columns_const = true; + bool all_columns_const; }; } From d951154ef4e948f46f81e1ac93c93809591defa9 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 29 Jul 2022 22:12:03 +0000 Subject: [PATCH 4/6] Proved NULLs direction when compare rows --- .../Transforms/DistinctSortedTransform.cpp | 56 ++++++++++--------- .../Transforms/DistinctSortedTransform.h | 8 +-- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/src/Processors/Transforms/DistinctSortedTransform.cpp b/src/Processors/Transforms/DistinctSortedTransform.cpp index 8558573873e..61f6e1b329e 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedTransform.cpp @@ -43,7 +43,7 @@ bool DistinctSortedTransform::isApplicable(const Block & header, const SortDescr if (sort_description.empty()) return false; - /// check if first sorted column in header + /// check if at least first sorted column in header const SortColumnDescription & column_sort_descr = sort_description.front(); if (!header.has(column_sort_descr.column_name)) return false; @@ -51,21 +51,18 @@ bool DistinctSortedTransform::isApplicable(const Block & header, const SortDescr ColumnNumbers column_positions; calcColumnPositionsInHeader(header, column_names, column_positions); - /// check if sorted column matches any DISTINCT column + /// check if the sorted column matches any DISTINCT column const auto pos = header.getPositionByName(column_sort_descr.column_name); return std::find(begin(column_positions), end(column_positions), pos) != column_positions.end(); } DistinctSortedTransform::DistinctSortedTransform( - const Block & header_, - const SortDescription & sort_description_, + const Block & header, + const SortDescription & sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, - const Names & columns) - : ISimpleTransform(header_, header_, true) - , header(header_) - , sort_description(sort_description_) - , column_names(columns) + const Names & column_names) + : ISimpleTransform(header, header, true) , limit_hint(limit_hint_) , set_size_limits(set_size_limits_) { @@ -74,26 +71,31 @@ DistinctSortedTransform::DistinctSortedTransform( column_ptrs.reserve(column_positions.size()); all_columns_const = column_positions.empty(); - /// pre-calculate DISTINCT column positions which form sort prefix of sort description - sort_prefix_positions.reserve(sort_description.size()); - for (const auto & column_sort_descr : sort_description) + if (!all_columns_const) { - /// check if there is such column in header - if (!header.has(column_sort_descr.column_name)) - break; + /// pre-calculate DISTINCT column positions which form sort prefix of sort description + sort_prefix_positions.reserve(sort_description.size()); + sorted_columns_nulls_direction.reserve(sort_description.size()); + for (const auto & column_sort_descr : sort_description) + { + /// check if there is such column in header + if (!header.has(column_sort_descr.column_name)) + break; - /// check if sorted column position matches any DISTINCT column - const auto pos = header.getPositionByName(column_sort_descr.column_name); - if (std::find(begin(column_positions), end(column_positions), pos) == column_positions.end()) - break; + /// check if sorted column position matches any DISTINCT column + const auto pos = header.getPositionByName(column_sort_descr.column_name); + if (std::find(begin(column_positions), end(column_positions), pos) == column_positions.end()) + break; - sort_prefix_positions.emplace_back(pos); + sort_prefix_positions.emplace_back(pos); + sorted_columns_nulls_direction.emplace_back(column_sort_descr.nulls_direction); + } + if (sort_prefix_positions.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description"); + + sort_prefix_columns.reserve(sort_prefix_positions.size()); } - if (sort_prefix_positions.empty()) - throw Exception( - ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description"); - - sort_prefix_columns.reserve(sort_prefix_positions.size()); } void DistinctSortedTransform::transform(Chunk & chunk) @@ -220,13 +222,13 @@ bool DistinctSortedTransform::buildFilter( return has_new_data; } -bool DistinctSortedTransform::rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m) +bool DistinctSortedTransform::rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m) const { for (size_t column_index = 0, num_columns = lhs.size(); column_index < num_columns; ++column_index) { const auto & lhs_column = *lhs[column_index]; const auto & rhs_column = *rhs[column_index]; - if (lhs_column.compareAt(n, m, rhs_column, 0) != 0) /// not equal + if (lhs_column.compareAt(n, m, rhs_column, sorted_columns_nulls_direction[column_index]) != 0) /// not equal return false; } return true; diff --git a/src/Processors/Transforms/DistinctSortedTransform.h b/src/Processors/Transforms/DistinctSortedTransform.h index b415657d238..7bd4a9d825e 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.h +++ b/src/Processors/Transforms/DistinctSortedTransform.h @@ -28,7 +28,7 @@ public: const SortDescription & sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, - const Names & columns); + const Names & column_names); String getName() const override { return "DistinctSortedTransform"; } @@ -38,7 +38,7 @@ protected: void transform(Chunk & chunk) override; private: - static bool rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m); + bool rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m) const; /// return true if has new data template @@ -50,8 +50,7 @@ private: size_t rows, ClearableSetVariants & variants) const; - Block header; - SortDescription sort_description; + std::vector sorted_columns_nulls_direction; struct PreviousChunk { @@ -60,7 +59,6 @@ private: }; PreviousChunk prev_chunk; - Names column_names; ColumnNumbers column_positions; /// DISTINCT columns positions in header ColumnNumbers sort_prefix_positions; /// DISTINCT columns positions which form sort prefix of sort description ColumnRawPtrs column_ptrs; /// DISTINCT columns from chunk From 3be51a6dea2defd97c9d0e5aade5d99c25e05320 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Sat, 30 Jul 2022 19:22:28 +0000 Subject: [PATCH 5/6] Construct DistinctSortedTransform only when it makes sense otherwise fallback to DistinctTransform (i.e. ordinary distinct) --- src/Processors/QueryPlan/DistinctStep.cpp | 22 +-- .../Transforms/DistinctSortedTransform.cpp | 135 +++++++++--------- .../Transforms/DistinctSortedTransform.h | 10 +- 3 files changed, 86 insertions(+), 81 deletions(-) diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index c268cb44267..f4b474f83a3 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -140,15 +140,19 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil if (distinct_sort_desc.size() < columns.size()) { - pipeline.addSimpleTransform( - [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr - { - if (stream_type != QueryPipelineBuilder::StreamType::Main) - return nullptr; + if (DistinctSortedTransform::isApplicable(pipeline.getHeader(), distinct_sort_desc, columns)) + { + pipeline.addSimpleTransform( + [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr + { + if (stream_type != QueryPipelineBuilder::StreamType::Main) + return nullptr; - return std::make_shared( - header, distinct_sort_desc, set_size_limits, limit_hint, columns); - }); + return std::make_shared( + header, distinct_sort_desc, set_size_limits, limit_hint, columns); + }); + return; + } } else { @@ -161,8 +165,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil return std::make_shared( header, set_size_limits, limit_hint, distinct_sort_desc, columns, true); }); + return; } - return; } } } diff --git a/src/Processors/Transforms/DistinctSortedTransform.cpp b/src/Processors/Transforms/DistinctSortedTransform.cpp index 61f6e1b329e..12f8da67025 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedTransform.cpp @@ -9,93 +9,103 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -static void handleAllColumnsConst(Chunk & chunk) +/// calculate column positions to use during chunk transformation +static void calcColumnPositionsInHeader(const Block& header, const Names & column_names, ColumnNumbers& column_positions, ColumnNumbers& const_column_positions) { - const size_t rows = chunk.getNumRows(); - IColumn::Filter filter(rows); - - Chunk res_chunk; - std::fill(filter.begin(), filter.end(), 0); - filter[0] = 1; - for (const auto & column : chunk.getColumns()) - res_chunk.addColumn(column->filter(filter, -1)); - - chunk = std::move(res_chunk); -} - -static void calcColumnPositionsInHeader(const Block& header, const Names & column_names, ColumnNumbers& column_positions) -{ - /// pre-calculate column positions to use during chunk transformation const size_t num_columns = column_names.empty() ? header.columns() : column_names.size(); column_positions.clear(); column_positions.reserve(num_columns); + const_column_positions.clear(); + const_column_positions.reserve(num_columns); for (size_t i = 0; i < num_columns; ++i) { auto pos = column_names.empty() ? i : header.getPositionByName(column_names[i]); const auto & column = header.getByPosition(pos).column; - if (column && !isColumnConst(*column)) - column_positions.emplace_back(pos); + if (column) + { + if (isColumnConst(*column)) + const_column_positions.emplace_back(pos); + else + column_positions.emplace_back(pos); + } } } +/// calculate DISTINCT column positions which form sort prefix of sort description +static void calcSortPrefixPositionsInHeader( + const Block & header, + const SortDescription & sort_description, + const ColumnNumbers & column_positions, + const ColumnNumbers & const_column_positions, + ColumnNumbers & sort_prefix_positions) +{ + sort_prefix_positions.reserve(sort_description.size()); + for (const auto & column_sort_descr : sort_description) + { + /// check if there is such column in header + if (!header.has(column_sort_descr.column_name)) + break; + + /// check if sorted column position matches any DISTINCT column + const auto pos = header.getPositionByName(column_sort_descr.column_name); + if (std::find(begin(column_positions), end(column_positions), pos) == column_positions.end()) + { + /// if sorted column found in const columns then we can skip it + if (std::find(begin(const_column_positions), end(const_column_positions), pos) != const_column_positions.end()) + continue; + + break; + } + + sort_prefix_positions.emplace_back(pos); + } +} + +/// check if distinct sorted is applicable for provided header, sort description and distinct columns bool DistinctSortedTransform::isApplicable(const Block & header, const SortDescription & sort_description, const Names & column_names) { if (sort_description.empty()) return false; - /// check if at least first sorted column in header - const SortColumnDescription & column_sort_descr = sort_description.front(); - if (!header.has(column_sort_descr.column_name)) + ColumnNumbers column_positions; + ColumnNumbers const_column_positions; + calcColumnPositionsInHeader(header, column_names, column_positions, const_column_positions); + if (column_positions.empty()) return false; - ColumnNumbers column_positions; - calcColumnPositionsInHeader(header, column_names, column_positions); - - /// check if the sorted column matches any DISTINCT column - const auto pos = header.getPositionByName(column_sort_descr.column_name); - return std::find(begin(column_positions), end(column_positions), pos) != column_positions.end(); + /// check if sorted columns matches DISTINCT columns + ColumnNumbers sort_prefix_positions; + calcSortPrefixPositionsInHeader(header, sort_description, column_positions, const_column_positions, sort_prefix_positions); + return !sort_prefix_positions.empty(); } DistinctSortedTransform::DistinctSortedTransform( const Block & header, const SortDescription & sort_description, const SizeLimits & set_size_limits_, - UInt64 limit_hint_, + const UInt64 limit_hint_, const Names & column_names) : ISimpleTransform(header, header, true) , limit_hint(limit_hint_) , set_size_limits(set_size_limits_) { + if (sort_description.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: sort description can't be empty"); + /// pre-calculate column positions to use during chunk transformation - calcColumnPositionsInHeader(header, column_names, column_positions); + ColumnNumbers const_column_positions; + calcColumnPositionsInHeader(header, column_names, column_positions, const_column_positions); + if (column_positions.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: all columns can't be const. DistinctTransform should be used instead"); + + /// pre-calculate DISTINCT column positions which form sort prefix of sort description + calcSortPrefixPositionsInHeader(header, sort_description, column_positions, const_column_positions, sort_prefix_positions); + if (sort_prefix_positions.empty()) + throw Exception( + ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description"); + column_ptrs.reserve(column_positions.size()); - all_columns_const = column_positions.empty(); - - if (!all_columns_const) - { - /// pre-calculate DISTINCT column positions which form sort prefix of sort description - sort_prefix_positions.reserve(sort_description.size()); - sorted_columns_nulls_direction.reserve(sort_description.size()); - for (const auto & column_sort_descr : sort_description) - { - /// check if there is such column in header - if (!header.has(column_sort_descr.column_name)) - break; - - /// check if sorted column position matches any DISTINCT column - const auto pos = header.getPositionByName(column_sort_descr.column_name); - if (std::find(begin(column_positions), end(column_positions), pos) == column_positions.end()) - break; - - sort_prefix_positions.emplace_back(pos); - sorted_columns_nulls_direction.emplace_back(column_sort_descr.nulls_direction); - } - if (sort_prefix_positions.empty()) - throw Exception( - ErrorCodes::LOGICAL_ERROR, "DistinctSortedTransform: columns have to form a sort prefix for provided sort description"); - - sort_prefix_columns.reserve(sort_prefix_positions.size()); - } + sort_prefix_columns.reserve(sort_prefix_positions.size()); } void DistinctSortedTransform::transform(Chunk & chunk) @@ -103,14 +113,6 @@ void DistinctSortedTransform::transform(Chunk & chunk) if (unlikely(!chunk.hasRows())) return; - /// special case - all column constant - if (unlikely(all_columns_const)) - { - handleAllColumnsConst(chunk); - stopReading(); - return; - } - /// get DISTINCT columns from chunk column_ptrs.clear(); for (const auto pos : column_positions) @@ -177,7 +179,6 @@ void DistinctSortedTransform::transform(Chunk & chunk) chunk = std::move(res_chunk); } - template bool DistinctSortedTransform::buildFilter( Method & method, @@ -222,13 +223,13 @@ bool DistinctSortedTransform::buildFilter( return has_new_data; } -bool DistinctSortedTransform::rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m) const +bool DistinctSortedTransform::rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m) { for (size_t column_index = 0, num_columns = lhs.size(); column_index < num_columns; ++column_index) { const auto & lhs_column = *lhs[column_index]; const auto & rhs_column = *rhs[column_index]; - if (lhs_column.compareAt(n, m, rhs_column, sorted_columns_nulls_direction[column_index]) != 0) /// not equal + if (lhs_column.compareAt(n, m, rhs_column, -1) != 0) /// not equal return false; } return true; diff --git a/src/Processors/Transforms/DistinctSortedTransform.h b/src/Processors/Transforms/DistinctSortedTransform.h index 7bd4a9d825e..86c5c968e4b 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.h +++ b/src/Processors/Transforms/DistinctSortedTransform.h @@ -12,6 +12,9 @@ namespace DB /** This class is intended for implementation of SELECT DISTINCT clause and * leaves only unique rows in the stream. * + * DistinctSortedTransform::isApplicable() have to be used to check if DistinctSortedTransform can be constructed with particular arguments, + * otherwise the constructor can throw LOGICAL_ERROR exception + * * Implementation for case, when input stream has rows for same DISTINCT key or at least its prefix, * grouped together (going consecutively). * @@ -24,7 +27,7 @@ class DistinctSortedTransform : public ISimpleTransform public: /// Empty columns_ means all columns. DistinctSortedTransform( - const Block & header_, + const Block & header, const SortDescription & sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, @@ -38,7 +41,7 @@ protected: void transform(Chunk & chunk) override; private: - bool rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m) const; + static bool rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m); /// return true if has new data template @@ -50,8 +53,6 @@ private: size_t rows, ClearableSetVariants & variants) const; - std::vector sorted_columns_nulls_direction; - struct PreviousChunk { Chunk chunk; @@ -70,7 +71,6 @@ private: /// Restrictions on the maximum size of the output data. SizeLimits set_size_limits; - bool all_columns_const; }; } From 30782a2b0580400440c977c9d97a5287cd3ca714 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 2 Aug 2022 17:44:43 +0000 Subject: [PATCH 6/6] Test: distinct sorted is not used on const column --- .../Transforms/DistinctSortedTransform.cpp | 1 + ...distinct_in_order_optimization_explain.reference | 13 ++++++++----- .../02317_distinct_in_order_optimization_explain.sh | 13 ++++++++----- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/Processors/Transforms/DistinctSortedTransform.cpp b/src/Processors/Transforms/DistinctSortedTransform.cpp index 12f8da67025..4c6ca03950c 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedTransform.cpp @@ -179,6 +179,7 @@ void DistinctSortedTransform::transform(Chunk & chunk) chunk = std::move(res_chunk); } + template bool DistinctSortedTransform::buildFilter( Method & method, diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index 3e57d4de586..01e0c397dc7 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -9,24 +9,27 @@ DistinctSortedChunkTransform -- distinct with primary key prefix -> pre-distinct optimization only DistinctTransform DistinctSortedChunkTransform --- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization +-- distinct with primary key prefix and order by column in distinct -> pre-distinct and final distinct optimization DistinctSortedTransform DistinctSortedChunkTransform --- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization +-- distinct with primary key prefix and order by the same columns -> pre-distinct and final distinct optimization DistinctSortedStreamTransform DistinctSortedChunkTransform -- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization DistinctSortedTransform DistinctSortedChunkTransform --- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only +-- distinct with primary key prefix and order by column _not_ in distinct -> pre-distinct optimization only DistinctTransform DistinctSortedChunkTransform -- distinct with non-primary key prefix -> ordinary distinct DistinctTransform DistinctTransform --- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only +-- distinct with non-primary key prefix and order by column in distinct -> final distinct optimization only DistinctSortedTransform DistinctTransform --- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct +-- distinct with non-primary key prefix and order by column _not_ in distinct -> ordinary distinct +DistinctTransform +DistinctTransform +-- distinct with non-primary key prefix and order by _const_ column in distinct -> ordinary distinct DistinctTransform DistinctTransform diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index 903f3bb9e11..fcffb974433 100755 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -27,25 +27,28 @@ $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct * $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix -> pre-distinct optimization only'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on the same columns -> pre-distinct and final distinct optimization'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by the same columns -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b from distinct_in_order_explain order by a, b" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column in distinct but non-primary key prefix -> pre-distinct and final distinct optimization'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, b, c from distinct_in_order_explain order by c" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with primary key prefix and order by column _not_ in distinct -> pre-distinct optimization only'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct a, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT $CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix -> ordinary distinct'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by column in distinct -> final distinct optimization only'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by b" | eval $FIND_DISTINCT -$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by on column _not_ in distinct -> ordinary distinct'" +$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by column _not_ in distinct -> ordinary distinct'" $CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, c from distinct_in_order_explain order by a" | eval $FIND_DISTINCT +$CLICKHOUSE_CLIENT -q "select '-- distinct with non-primary key prefix and order by _const_ column in distinct -> ordinary distinct'" +$CLICKHOUSE_CLIENT -nq "$ENABLE_OPTIMIZATION;explain pipeline select distinct b, 1 as x from distinct_in_order_explain order by x" | eval $FIND_DISTINCT + $CLICKHOUSE_CLIENT -q "drop table if exists distinct_in_order_explain sync"