diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d8080fc427d..332b7364605 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -607,7 +607,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \ M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \ M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \ - M(UInt64, distinct_in_order_range_search_step, 0, "Setting for DISTINCT in order optimization. TBD", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index c7149cd30dc..28438a86e47 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2578,8 +2578,7 @@ void InterpreterSelectQuery::executeDistinct(QueryPlan & query_plan, bool before limit_for_distinct, columns, pre_distinct, - settings.optimize_distinct_in_order, - settings.distinct_in_order_range_search_step); + settings.optimize_distinct_in_order); if (pre_distinct) distinct_step->setStepDescription("Preliminary DISTINCT"); diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 6a5e5658ccf..9f87a47fced 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -335,8 +335,7 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan) 0, result_header.getNames(), false, - settings.optimize_distinct_in_order, - settings.distinct_in_order_range_search_step); + settings.optimize_distinct_in_order); query_plan.addStep(std::move(distinct_step)); } diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index c182d336b35..946af9ca4d6 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -59,8 +59,7 @@ DistinctStep::DistinctStep( UInt64 limit_hint_, const Names & columns_, bool pre_distinct_, - bool optimize_distinct_in_order_, - UInt64 distinct_in_order_range_search_step_) + bool optimize_distinct_in_order_) : ITransformingStep( input_stream_, input_stream_.header, @@ -68,7 +67,6 @@ DistinctStep::DistinctStep( , set_size_limits(set_size_limits_) , limit_hint(limit_hint_) , columns(columns_) - , distinct_in_order_range_search_step(distinct_in_order_range_search_step_) , pre_distinct(pre_distinct_) , optimize_distinct_in_order(optimize_distinct_in_order_) { @@ -96,9 +94,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil SortDescription distinct_sort_desc = getSortDescription(input_stream.sort_description, columns); if (!distinct_sort_desc.empty()) { - /// pre-distinct for sorted chunks or - /// final distinct for sorted stream (sorting inside and among chunks) - if (pre_distinct || input_stream.has_single_port) + /// pre-distinct for sorted chunks + if (pre_distinct) { pipeline.addSimpleTransform( [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr @@ -107,7 +104,20 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil return nullptr; return std::make_shared( - header, set_size_limits, limit_hint, distinct_sort_desc, columns, distinct_in_order_range_search_step); + header, set_size_limits, limit_hint, distinct_sort_desc, columns); + }); + return; + } + /// final distinct for sorted stream (sorting inside and among chunks) + if (input_stream.has_single_port) + { + pipeline.addSimpleTransform( + [&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr + { + if (stream_type != QueryPipelineBuilder::StreamType::Main) + return nullptr; + + return std::make_shared(header, distinct_sort_desc, set_size_limits, limit_hint, columns); }); return; } diff --git a/src/Processors/QueryPlan/DistinctStep.h b/src/Processors/QueryPlan/DistinctStep.h index 1d678ac3144..dc734a58704 100644 --- a/src/Processors/QueryPlan/DistinctStep.h +++ b/src/Processors/QueryPlan/DistinctStep.h @@ -15,8 +15,7 @@ public: UInt64 limit_hint_, const Names & columns_, bool pre_distinct_, /// If is enabled, execute distinct for separate streams. Otherwise, merge streams. - bool optimize_distinct_in_order_, - UInt64 distinct_in_order_range_search_step); + bool optimize_distinct_in_order_); String getName() const override { return "Distinct"; } @@ -31,7 +30,6 @@ private: SizeLimits set_size_limits; UInt64 limit_hint; Names columns; - UInt64 distinct_in_order_range_search_step = 0; bool pre_distinct; bool optimize_distinct_in_order; }; diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp index 89fa675dbc7..064c827a8cc 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.cpp @@ -13,13 +13,11 @@ DistinctSortedChunkTransform::DistinctSortedChunkTransform( const SizeLimits & output_size_limits_, UInt64 limit_hint_, const SortDescription & sorted_columns_descr_, - const Names & source_columns, - size_t range_search_step_) + const Names & source_columns) : ISimpleTransform(header_, header_, true) , limit_hint(limit_hint_) , output_size_limits(output_size_limits_) , sorted_columns_descr(sorted_columns_descr_) - , range_search_step(range_search_step_) { /// calculate sorted columns positions sorted_columns_pos.reserve(sorted_columns_descr.size()); @@ -124,57 +122,35 @@ bool DistinctSortedChunkTransform::isCurrentKey(const size_t row_pos) const return true; } -size_t DistinctSortedChunkTransform::getRangeEnd(size_t range_begin, size_t range_end) const +size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const { - assert(range_begin < range_end); + assert(begin < end); - // probe latest row - if (isCurrentKey(range_end-1)) { - return range_end; - } + const size_t linear_probe_threadhold = 16; + size_t linear_probe_end = begin + linear_probe_threadhold; + if (linear_probe_end > end) + linear_probe_end = end; - auto find_range_end = [this](size_t begin, size_t end) -> size_t + for (size_t pos = begin; pos < linear_probe_end; ++pos) { - const size_t linear_probe_threadhold = 32; - size_t linear_probe_end = begin + linear_probe_threadhold; - if (linear_probe_end > end) - linear_probe_end = end; - - for(size_t pos=begin; pos < linear_probe_end; ++pos) - { - if (!isCurrentKey(pos)) - return pos; - } - - size_t low = linear_probe_end; - size_t high = end - 1; - while (low <= high) - { - size_t mid = low + (high - low) / 2; - if (isCurrentKey(mid)) - low = mid + 1; - else - { - high = mid - 1; - end = mid; - } - } - return end; - }; - - const size_t step = range_search_step; - if (!step) - return find_range_end(range_begin, range_end); - - size_t begin = range_begin; - while (begin + step <= range_end) - { - const size_t pos = find_range_end(begin, begin + step); - if (pos < begin + step) + if (!isCurrentKey(pos)) return pos; - begin += step; } - return find_range_end(begin, range_end); + + size_t low = linear_probe_end; + size_t high = end - 1; + while (low <= high) + { + size_t mid = low + (high - low) / 2; + if (isCurrentKey(mid)) + low = mid + 1; + else + { + high = mid - 1; + end = mid; + } + } + return end; } std::pair DistinctSortedChunkTransform::continueWithPrevRange(const size_t chunk_rows, IColumn::Filter & filter) diff --git a/src/Processors/Transforms/DistinctSortedChunkTransform.h b/src/Processors/Transforms/DistinctSortedChunkTransform.h index 983b976c49f..2e21c36f7dc 100644 --- a/src/Processors/Transforms/DistinctSortedChunkTransform.h +++ b/src/Processors/Transforms/DistinctSortedChunkTransform.h @@ -32,8 +32,7 @@ public: const SizeLimits & output_size_limits_, UInt64 limit_hint_, const SortDescription & sorted_columns_descr_, - const Names & source_columns_, - size_t range_search_step); + const Names & source_columns_); String getName() const override { return "DistinctSortedChunkTransform"; } @@ -68,7 +67,6 @@ private: ColumnRawPtrs other_columns; // used during processing MutableColumns current_key; - const size_t range_search_step = 0; }; } diff --git a/tests/performance/distinct_in_order.xml b/tests/performance/distinct_in_order.xml index ea76fedcc34..c4d09aa825b 100644 --- a/tests/performance/distinct_in_order.xml +++ b/tests/performance/distinct_in_order.xml @@ -1,33 +1,33 @@ - DROP TABLE IF EXISTS distinct_cardinality_high - CREATE TABLE distinct_cardinality_high (high UInt64, medium UInt64, low UInt64) ENGINE MergeTree() ORDER BY (high, medium, low) - INSERT INTO distinct_cardinality_high SELECT number % 10000, number % 1000, number % 100 from numbers(1000000) + CREATE TABLE distinct_cardinality_high (high UInt64, medium UInt64, low UInt64) ENGINE MergeTree() ORDER BY (high, medium) + INSERT INTO distinct_cardinality_high SELECT number % 10000, number % 1000, number % 100 FROM numbers_mt(1e8) - select distinct high from distinct_cardinality_high - select distinct high, low from distinct_cardinality_high - select distinct high, medium from distinct_cardinality_high - select distinct high, medium, low from distinct_cardinality_high + SELECT DISTINCT high FROM distinct_cardinality_high FORMAT Null + SELECT DISTINCT high, low FROM distinct_cardinality_high FORMAT Null + SELECT DISTINCT high, medium FROM distinct_cardinality_high FORMAT Null + SELECT DISTINCT high, medium, low FROM distinct_cardinality_high FORMAT Null - select distinct high, medium from distinct_cardinality_high order by medium - select distinct high, low from distinct_cardinality_high order by low - select distinct high, medium, low from distinct_cardinality_high order by high + SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY medium FORMAT Null + SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY high FORMAT Null + SELECT DISTINCT high, low FROM distinct_cardinality_high ORDER BY low FORMAT Null + SELECT DISTINCT high, medium, low FROM distinct_cardinality_high ORDER BY low FORMAT Null DROP TABLE IF EXISTS distinct_cardinality_high - DROP TABLE IF EXISTS distinct_cardinality_low - CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium, high) - INSERT INTO distinct_cardinality_low SELECT number % 100, number % 1000, number % 10000 from numbers(1000000) + CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium) + INSERT INTO distinct_cardinality_low SELECT number % 100, number % 1000, number % 10000 FROM numbers_mt(1e8) - select distinct low from distinct_cardinality_low - select distinct low, medium from distinct_cardinality_low - select distinct low, high from distinct_cardinality_low - select distinct low, medium, high from distinct_cardinality_low + SELECT DISTINCT low FROM distinct_cardinality_low FORMAT Null + SELECT DISTINCT low, medium FROM distinct_cardinality_low FORMAT Null + SELECT DISTINCT low, high FROM distinct_cardinality_low FORMAT Null + SELECT DISTINCT low, medium, high FROM distinct_cardinality_low FORMAT Null - select distinct low, medium from distinct_cardinality_low order by medium - select distinct low, high from distinct_cardinality_low order by high - select distinct low, medium, high from distinct_cardinality_low order by low + SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null + SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY low FORMAT Null + SELECT DISTINCT low, high FROM distinct_cardinality_low ORDER BY high FORMAT Null + SELECT DISTINCT low, medium, high FROM distinct_cardinality_low ORDER BY high FORMAT Null DROP TABLE IF EXISTS distinct_cardinality_low diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference index d3951859b1b..2dac69edc41 100644 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.reference @@ -7,13 +7,13 @@ DistinctSortedChunkTransform -- distinct with primary key prefix -> pre-distinct optimization only DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization -DistinctSortedChunkTransform +DistinctSortedTransform DistinctSortedChunkTransform -- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only DistinctSortedChunkTransform -- distinct with non-primary key prefix -> no optimizations No optimizations -- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only -DistinctSortedChunkTransform +DistinctSortedTransform -- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations No optimizations diff --git a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh index 33fb6f12110..21f50a147ac 100755 --- a/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh +++ b/tests/queries/0_stateless/02317_distinct_in_order_optimization_explain.sh @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) DISABLE_OPTIMIZATION="set optimize_distinct_in_order=0" ENABLE_OPTIMIZATION="set optimize_distinct_in_order=1" -GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform'" +GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform\|DistinctSortedTransform'" TRIM_LEADING_SPACES="sed -e 's/^[ \t]*//'" FIND_OPTIMIZATIONS="$GREP_OPTIMIZATIONS | $TRIM_LEADING_SPACES"