+ use DistinctSortedTransform for the final distinct step

+ fix performance tests
This commit is contained in:
Igor Nikonov 2022-06-30 13:03:39 +00:00
parent 4cbbfb431d
commit 488ee75fc4
10 changed files with 68 additions and 89 deletions

View File

@ -607,7 +607,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \
M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \
M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
M(UInt64, distinct_in_order_range_search_step, 0, "Setting for DISTINCT in order optimization. TBD", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

View File

@ -2578,8 +2578,7 @@ void InterpreterSelectQuery::executeDistinct(QueryPlan & query_plan, bool before
limit_for_distinct,
columns,
pre_distinct,
settings.optimize_distinct_in_order,
settings.distinct_in_order_range_search_step);
settings.optimize_distinct_in_order);
if (pre_distinct)
distinct_step->setStepDescription("Preliminary DISTINCT");

View File

@ -335,8 +335,7 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan)
0,
result_header.getNames(),
false,
settings.optimize_distinct_in_order,
settings.distinct_in_order_range_search_step);
settings.optimize_distinct_in_order);
query_plan.addStep(std::move(distinct_step));
}

View File

@ -59,8 +59,7 @@ DistinctStep::DistinctStep(
UInt64 limit_hint_,
const Names & columns_,
bool pre_distinct_,
bool optimize_distinct_in_order_,
UInt64 distinct_in_order_range_search_step_)
bool optimize_distinct_in_order_)
: ITransformingStep(
input_stream_,
input_stream_.header,
@ -68,7 +67,6 @@ DistinctStep::DistinctStep(
, set_size_limits(set_size_limits_)
, limit_hint(limit_hint_)
, columns(columns_)
, distinct_in_order_range_search_step(distinct_in_order_range_search_step_)
, pre_distinct(pre_distinct_)
, optimize_distinct_in_order(optimize_distinct_in_order_)
{
@ -96,9 +94,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
SortDescription distinct_sort_desc = getSortDescription(input_stream.sort_description, columns);
if (!distinct_sort_desc.empty())
{
/// pre-distinct for sorted chunks or
/// final distinct for sorted stream (sorting inside and among chunks)
if (pre_distinct || input_stream.has_single_port)
/// pre-distinct for sorted chunks
if (pre_distinct)
{
pipeline.addSimpleTransform(
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
@ -107,7 +104,20 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
return nullptr;
return std::make_shared<DistinctSortedChunkTransform>(
header, set_size_limits, limit_hint, distinct_sort_desc, columns, distinct_in_order_range_search_step);
header, set_size_limits, limit_hint, distinct_sort_desc, columns);
});
return;
}
/// final distinct for sorted stream (sorting inside and among chunks)
if (input_stream.has_single_port)
{
pipeline.addSimpleTransform(
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
{
if (stream_type != QueryPipelineBuilder::StreamType::Main)
return nullptr;
return std::make_shared<DistinctSortedTransform>(header, distinct_sort_desc, set_size_limits, limit_hint, columns);
});
return;
}

View File

@ -15,8 +15,7 @@ public:
UInt64 limit_hint_,
const Names & columns_,
bool pre_distinct_, /// If is enabled, execute distinct for separate streams. Otherwise, merge streams.
bool optimize_distinct_in_order_,
UInt64 distinct_in_order_range_search_step);
bool optimize_distinct_in_order_);
String getName() const override { return "Distinct"; }
@ -31,7 +30,6 @@ private:
SizeLimits set_size_limits;
UInt64 limit_hint;
Names columns;
UInt64 distinct_in_order_range_search_step = 0;
bool pre_distinct;
bool optimize_distinct_in_order;
};

View File

@ -13,13 +13,11 @@ DistinctSortedChunkTransform::DistinctSortedChunkTransform(
const SizeLimits & output_size_limits_,
UInt64 limit_hint_,
const SortDescription & sorted_columns_descr_,
const Names & source_columns,
size_t range_search_step_)
const Names & source_columns)
: ISimpleTransform(header_, header_, true)
, limit_hint(limit_hint_)
, output_size_limits(output_size_limits_)
, sorted_columns_descr(sorted_columns_descr_)
, range_search_step(range_search_step_)
{
/// calculate sorted columns positions
sorted_columns_pos.reserve(sorted_columns_descr.size());
@ -124,57 +122,35 @@ bool DistinctSortedChunkTransform::isCurrentKey(const size_t row_pos) const
return true;
}
size_t DistinctSortedChunkTransform::getRangeEnd(size_t range_begin, size_t range_end) const
size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const
{
assert(range_begin < range_end);
assert(begin < end);
// probe latest row
if (isCurrentKey(range_end-1)) {
return range_end;
}
const size_t linear_probe_threadhold = 16;
size_t linear_probe_end = begin + linear_probe_threadhold;
if (linear_probe_end > end)
linear_probe_end = end;
auto find_range_end = [this](size_t begin, size_t end) -> size_t
for (size_t pos = begin; pos < linear_probe_end; ++pos)
{
const size_t linear_probe_threadhold = 32;
size_t linear_probe_end = begin + linear_probe_threadhold;
if (linear_probe_end > end)
linear_probe_end = end;
for(size_t pos=begin; pos < linear_probe_end; ++pos)
{
if (!isCurrentKey(pos))
return pos;
}
size_t low = linear_probe_end;
size_t high = end - 1;
while (low <= high)
{
size_t mid = low + (high - low) / 2;
if (isCurrentKey(mid))
low = mid + 1;
else
{
high = mid - 1;
end = mid;
}
}
return end;
};
const size_t step = range_search_step;
if (!step)
return find_range_end(range_begin, range_end);
size_t begin = range_begin;
while (begin + step <= range_end)
{
const size_t pos = find_range_end(begin, begin + step);
if (pos < begin + step)
if (!isCurrentKey(pos))
return pos;
begin += step;
}
return find_range_end(begin, range_end);
size_t low = linear_probe_end;
size_t high = end - 1;
while (low <= high)
{
size_t mid = low + (high - low) / 2;
if (isCurrentKey(mid))
low = mid + 1;
else
{
high = mid - 1;
end = mid;
}
}
return end;
}
std::pair<size_t, size_t> DistinctSortedChunkTransform::continueWithPrevRange(const size_t chunk_rows, IColumn::Filter & filter)

View File

@ -32,8 +32,7 @@ public:
const SizeLimits & output_size_limits_,
UInt64 limit_hint_,
const SortDescription & sorted_columns_descr_,
const Names & source_columns_,
size_t range_search_step);
const Names & source_columns_);
String getName() const override { return "DistinctSortedChunkTransform"; }
@ -68,7 +67,6 @@ private:
ColumnRawPtrs other_columns; // used during processing
MutableColumns current_key;
const size_t range_search_step = 0;
};
}

View File

@ -1,33 +1,33 @@
<test>
<!-- high cardinality -->
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_high</drop_query>
<create_query>CREATE TABLE distinct_cardinality_high (high UInt64, medium UInt64, low UInt64) ENGINE MergeTree() ORDER BY (high, medium, low)</create_query>
<fill_query>INSERT INTO distinct_cardinality_high SELECT number % 10000, number % 1000, number % 100 from numbers(1000000)</fill_query>
<create_query>CREATE TABLE distinct_cardinality_high (high UInt64, medium UInt64, low UInt64) ENGINE MergeTree() ORDER BY (high, medium)</create_query>
<fill_query>INSERT INTO distinct_cardinality_high SELECT number % 10000, number % 1000, number % 100 FROM numbers_mt(1e8)</fill_query>
<query>select distinct high from distinct_cardinality_high</query>
<query>select distinct high, low from distinct_cardinality_high</query>
<query>select distinct high, medium from distinct_cardinality_high</query>
<query>select distinct high, medium, low from distinct_cardinality_high</query>
<query>SELECT DISTINCT high FROM distinct_cardinality_high FORMAT Null</query>
<query>SELECT DISTINCT high, low FROM distinct_cardinality_high FORMAT Null</query>
<query>SELECT DISTINCT high, medium FROM distinct_cardinality_high FORMAT Null</query>
<query>SELECT DISTINCT high, medium, low FROM distinct_cardinality_high FORMAT Null</query>
<query>select distinct high, medium from distinct_cardinality_high order by medium</query>
<query>select distinct high, low from distinct_cardinality_high order by low</query>
<query>select distinct high, medium, low from distinct_cardinality_high order by high</query>
<query>SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY medium FORMAT Null</query>
<query>SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY high FORMAT Null</query>
<query>SELECT DISTINCT high, low FROM distinct_cardinality_high ORDER BY low FORMAT Null</query>
<query>SELECT DISTINCT high, medium, low FROM distinct_cardinality_high ORDER BY low FORMAT Null</query>
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_high</drop_query>
<!-- low cardinality -->
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_low</drop_query>
<create_query>CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium, high)</create_query>
<fill_query>INSERT INTO distinct_cardinality_low SELECT number % 100, number % 1000, number % 10000 from numbers(1000000)</fill_query>
<create_query>CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium)</create_query>
<fill_query>INSERT INTO distinct_cardinality_low SELECT number % 100, number % 1000, number % 10000 FROM numbers_mt(1e8)</fill_query>
<query>select distinct low from distinct_cardinality_low</query>
<query>select distinct low, medium from distinct_cardinality_low</query>
<query>select distinct low, high from distinct_cardinality_low</query>
<query>select distinct low, medium, high from distinct_cardinality_low</query>
<query>SELECT DISTINCT low FROM distinct_cardinality_low FORMAT Null</query>
<query>SELECT DISTINCT low, medium FROM distinct_cardinality_low FORMAT Null</query>
<query>SELECT DISTINCT low, high FROM distinct_cardinality_low FORMAT Null</query>
<query>SELECT DISTINCT low, medium, high FROM distinct_cardinality_low FORMAT Null</query>
<query>select distinct low, medium from distinct_cardinality_low order by medium</query>
<query>select distinct low, high from distinct_cardinality_low order by high</query>
<query>select distinct low, medium, high from distinct_cardinality_low order by low</query>
<query>SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null</query>
<query>SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY low FORMAT Null</query>
<query>SELECT DISTINCT low, high FROM distinct_cardinality_low ORDER BY high FORMAT Null</query>
<query>SELECT DISTINCT low, medium, high FROM distinct_cardinality_low ORDER BY high FORMAT Null</query>
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_low</drop_query>
</test>

View File

@ -7,13 +7,13 @@ DistinctSortedChunkTransform
-- distinct with primary key prefix -> pre-distinct optimization only
DistinctSortedChunkTransform
-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization
DistinctSortedChunkTransform
DistinctSortedTransform
DistinctSortedChunkTransform
-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only
DistinctSortedChunkTransform
-- distinct with non-primary key prefix -> no optimizations
No optimizations
-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only
DistinctSortedChunkTransform
DistinctSortedTransform
-- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations
No optimizations

View File

@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
DISABLE_OPTIMIZATION="set optimize_distinct_in_order=0"
ENABLE_OPTIMIZATION="set optimize_distinct_in_order=1"
GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform'"
GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform\|DistinctSortedTransform'"
TRIM_LEADING_SPACES="sed -e 's/^[ \t]*//'"
FIND_OPTIMIZATIONS="$GREP_OPTIMIZATIONS | $TRIM_LEADING_SPACES"