mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
+ use DistinctSorted for final distinct step
+ fix performance tests
This commit is contained in:
parent
4cbbfb431d
commit
488ee75fc4
@ -607,7 +607,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
|
||||
M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \
|
||||
M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \
|
||||
M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
|
||||
M(UInt64, distinct_in_order_range_search_step, 0, "Setting for DISTINCT in order optimization. TBD", 0) \
|
||||
// End of COMMON_SETTINGS
|
||||
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.
|
||||
|
||||
|
@ -2578,8 +2578,7 @@ void InterpreterSelectQuery::executeDistinct(QueryPlan & query_plan, bool before
|
||||
limit_for_distinct,
|
||||
columns,
|
||||
pre_distinct,
|
||||
settings.optimize_distinct_in_order,
|
||||
settings.distinct_in_order_range_search_step);
|
||||
settings.optimize_distinct_in_order);
|
||||
|
||||
if (pre_distinct)
|
||||
distinct_step->setStepDescription("Preliminary DISTINCT");
|
||||
|
@ -335,8 +335,7 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan)
|
||||
0,
|
||||
result_header.getNames(),
|
||||
false,
|
||||
settings.optimize_distinct_in_order,
|
||||
settings.distinct_in_order_range_search_step);
|
||||
settings.optimize_distinct_in_order);
|
||||
|
||||
query_plan.addStep(std::move(distinct_step));
|
||||
}
|
||||
|
@ -59,8 +59,7 @@ DistinctStep::DistinctStep(
|
||||
UInt64 limit_hint_,
|
||||
const Names & columns_,
|
||||
bool pre_distinct_,
|
||||
bool optimize_distinct_in_order_,
|
||||
UInt64 distinct_in_order_range_search_step_)
|
||||
bool optimize_distinct_in_order_)
|
||||
: ITransformingStep(
|
||||
input_stream_,
|
||||
input_stream_.header,
|
||||
@ -68,7 +67,6 @@ DistinctStep::DistinctStep(
|
||||
, set_size_limits(set_size_limits_)
|
||||
, limit_hint(limit_hint_)
|
||||
, columns(columns_)
|
||||
, distinct_in_order_range_search_step(distinct_in_order_range_search_step_)
|
||||
, pre_distinct(pre_distinct_)
|
||||
, optimize_distinct_in_order(optimize_distinct_in_order_)
|
||||
{
|
||||
@ -96,9 +94,8 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
|
||||
SortDescription distinct_sort_desc = getSortDescription(input_stream.sort_description, columns);
|
||||
if (!distinct_sort_desc.empty())
|
||||
{
|
||||
/// pre-distinct for sorted chunks or
|
||||
/// final distinct for sorted stream (sorting inside and among chunks)
|
||||
if (pre_distinct || input_stream.has_single_port)
|
||||
/// pre-distinct for sorted chunks
|
||||
if (pre_distinct)
|
||||
{
|
||||
pipeline.addSimpleTransform(
|
||||
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
|
||||
@ -107,7 +104,20 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil
|
||||
return nullptr;
|
||||
|
||||
return std::make_shared<DistinctSortedChunkTransform>(
|
||||
header, set_size_limits, limit_hint, distinct_sort_desc, columns, distinct_in_order_range_search_step);
|
||||
header, set_size_limits, limit_hint, distinct_sort_desc, columns);
|
||||
});
|
||||
return;
|
||||
}
|
||||
/// final distinct for sorted stream (sorting inside and among chunks)
|
||||
if (input_stream.has_single_port)
|
||||
{
|
||||
pipeline.addSimpleTransform(
|
||||
[&](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
|
||||
{
|
||||
if (stream_type != QueryPipelineBuilder::StreamType::Main)
|
||||
return nullptr;
|
||||
|
||||
return std::make_shared<DistinctSortedTransform>(header, distinct_sort_desc, set_size_limits, limit_hint, columns);
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
@ -15,8 +15,7 @@ public:
|
||||
UInt64 limit_hint_,
|
||||
const Names & columns_,
|
||||
bool pre_distinct_, /// If is enabled, execute distinct for separate streams. Otherwise, merge streams.
|
||||
bool optimize_distinct_in_order_,
|
||||
UInt64 distinct_in_order_range_search_step);
|
||||
bool optimize_distinct_in_order_);
|
||||
|
||||
String getName() const override { return "Distinct"; }
|
||||
|
||||
@ -31,7 +30,6 @@ private:
|
||||
SizeLimits set_size_limits;
|
||||
UInt64 limit_hint;
|
||||
Names columns;
|
||||
UInt64 distinct_in_order_range_search_step = 0;
|
||||
bool pre_distinct;
|
||||
bool optimize_distinct_in_order;
|
||||
};
|
||||
|
@ -13,13 +13,11 @@ DistinctSortedChunkTransform::DistinctSortedChunkTransform(
|
||||
const SizeLimits & output_size_limits_,
|
||||
UInt64 limit_hint_,
|
||||
const SortDescription & sorted_columns_descr_,
|
||||
const Names & source_columns,
|
||||
size_t range_search_step_)
|
||||
const Names & source_columns)
|
||||
: ISimpleTransform(header_, header_, true)
|
||||
, limit_hint(limit_hint_)
|
||||
, output_size_limits(output_size_limits_)
|
||||
, sorted_columns_descr(sorted_columns_descr_)
|
||||
, range_search_step(range_search_step_)
|
||||
{
|
||||
/// calculate sorted columns positions
|
||||
sorted_columns_pos.reserve(sorted_columns_descr.size());
|
||||
@ -124,57 +122,35 @@ bool DistinctSortedChunkTransform::isCurrentKey(const size_t row_pos) const
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t DistinctSortedChunkTransform::getRangeEnd(size_t range_begin, size_t range_end) const
|
||||
size_t DistinctSortedChunkTransform::getRangeEnd(size_t begin, size_t end) const
|
||||
{
|
||||
assert(range_begin < range_end);
|
||||
assert(begin < end);
|
||||
|
||||
// probe latest row
|
||||
if (isCurrentKey(range_end-1)) {
|
||||
return range_end;
|
||||
}
|
||||
const size_t linear_probe_threadhold = 16;
|
||||
size_t linear_probe_end = begin + linear_probe_threadhold;
|
||||
if (linear_probe_end > end)
|
||||
linear_probe_end = end;
|
||||
|
||||
auto find_range_end = [this](size_t begin, size_t end) -> size_t
|
||||
for (size_t pos = begin; pos < linear_probe_end; ++pos)
|
||||
{
|
||||
const size_t linear_probe_threadhold = 32;
|
||||
size_t linear_probe_end = begin + linear_probe_threadhold;
|
||||
if (linear_probe_end > end)
|
||||
linear_probe_end = end;
|
||||
|
||||
for(size_t pos=begin; pos < linear_probe_end; ++pos)
|
||||
{
|
||||
if (!isCurrentKey(pos))
|
||||
return pos;
|
||||
}
|
||||
|
||||
size_t low = linear_probe_end;
|
||||
size_t high = end - 1;
|
||||
while (low <= high)
|
||||
{
|
||||
size_t mid = low + (high - low) / 2;
|
||||
if (isCurrentKey(mid))
|
||||
low = mid + 1;
|
||||
else
|
||||
{
|
||||
high = mid - 1;
|
||||
end = mid;
|
||||
}
|
||||
}
|
||||
return end;
|
||||
};
|
||||
|
||||
const size_t step = range_search_step;
|
||||
if (!step)
|
||||
return find_range_end(range_begin, range_end);
|
||||
|
||||
size_t begin = range_begin;
|
||||
while (begin + step <= range_end)
|
||||
{
|
||||
const size_t pos = find_range_end(begin, begin + step);
|
||||
if (pos < begin + step)
|
||||
if (!isCurrentKey(pos))
|
||||
return pos;
|
||||
begin += step;
|
||||
}
|
||||
return find_range_end(begin, range_end);
|
||||
|
||||
size_t low = linear_probe_end;
|
||||
size_t high = end - 1;
|
||||
while (low <= high)
|
||||
{
|
||||
size_t mid = low + (high - low) / 2;
|
||||
if (isCurrentKey(mid))
|
||||
low = mid + 1;
|
||||
else
|
||||
{
|
||||
high = mid - 1;
|
||||
end = mid;
|
||||
}
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
std::pair<size_t, size_t> DistinctSortedChunkTransform::continueWithPrevRange(const size_t chunk_rows, IColumn::Filter & filter)
|
||||
|
@ -32,8 +32,7 @@ public:
|
||||
const SizeLimits & output_size_limits_,
|
||||
UInt64 limit_hint_,
|
||||
const SortDescription & sorted_columns_descr_,
|
||||
const Names & source_columns_,
|
||||
size_t range_search_step);
|
||||
const Names & source_columns_);
|
||||
|
||||
String getName() const override { return "DistinctSortedChunkTransform"; }
|
||||
|
||||
@ -68,7 +67,6 @@ private:
|
||||
ColumnRawPtrs other_columns; // used during processing
|
||||
|
||||
MutableColumns current_key;
|
||||
const size_t range_search_step = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -1,33 +1,33 @@
|
||||
<test>
|
||||
<!-- high cardinality -->
|
||||
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_high</drop_query>
|
||||
<create_query>CREATE TABLE distinct_cardinality_high (high UInt64, medium UInt64, low UInt64) ENGINE MergeTree() ORDER BY (high, medium, low)</create_query>
|
||||
<fill_query>INSERT INTO distinct_cardinality_high SELECT number % 10000, number % 1000, number % 100 from numbers(1000000)</fill_query>
|
||||
<create_query>CREATE TABLE distinct_cardinality_high (high UInt64, medium UInt64, low UInt64) ENGINE MergeTree() ORDER BY (high, medium)</create_query>
|
||||
<fill_query>INSERT INTO distinct_cardinality_high SELECT number % 10000, number % 1000, number % 100 FROM numbers_mt(1e8)</fill_query>
|
||||
|
||||
<query>select distinct high from distinct_cardinality_high</query>
|
||||
<query>select distinct high, low from distinct_cardinality_high</query>
|
||||
<query>select distinct high, medium from distinct_cardinality_high</query>
|
||||
<query>select distinct high, medium, low from distinct_cardinality_high</query>
|
||||
<query>SELECT DISTINCT high FROM distinct_cardinality_high FORMAT Null</query>
|
||||
<query>SELECT DISTINCT high, low FROM distinct_cardinality_high FORMAT Null</query>
|
||||
<query>SELECT DISTINCT high, medium FROM distinct_cardinality_high FORMAT Null</query>
|
||||
<query>SELECT DISTINCT high, medium, low FROM distinct_cardinality_high FORMAT Null</query>
|
||||
|
||||
<query>select distinct high, medium from distinct_cardinality_high order by medium</query>
|
||||
<query>select distinct high, low from distinct_cardinality_high order by low</query>
|
||||
<query>select distinct high, medium, low from distinct_cardinality_high order by high</query>
|
||||
<query>SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY medium FORMAT Null</query>
|
||||
<query>SELECT DISTINCT high, medium FROM distinct_cardinality_high ORDER BY high FORMAT Null</query>
|
||||
<query>SELECT DISTINCT high, low FROM distinct_cardinality_high ORDER BY low FORMAT Null</query>
|
||||
<query>SELECT DISTINCT high, medium, low FROM distinct_cardinality_high ORDER BY low FORMAT Null</query>
|
||||
|
||||
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_high</drop_query>
|
||||
|
||||
<!-- low cardinality -->
|
||||
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_low</drop_query>
|
||||
<create_query>CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium, high)</create_query>
|
||||
<fill_query>INSERT INTO distinct_cardinality_low SELECT number % 100, number % 1000, number % 10000 from numbers(1000000)</fill_query>
|
||||
<create_query>CREATE TABLE distinct_cardinality_low (low UInt64, medium UInt64, high UInt64) ENGINE MergeTree() ORDER BY (low, medium)</create_query>
|
||||
<fill_query>INSERT INTO distinct_cardinality_low SELECT number % 100, number % 1000, number % 10000 FROM numbers_mt(1e8)</fill_query>
|
||||
|
||||
<query>select distinct low from distinct_cardinality_low</query>
|
||||
<query>select distinct low, medium from distinct_cardinality_low</query>
|
||||
<query>select distinct low, high from distinct_cardinality_low</query>
|
||||
<query>select distinct low, medium, high from distinct_cardinality_low</query>
|
||||
<query>SELECT DISTINCT low FROM distinct_cardinality_low FORMAT Null</query>
|
||||
<query>SELECT DISTINCT low, medium FROM distinct_cardinality_low FORMAT Null</query>
|
||||
<query>SELECT DISTINCT low, high FROM distinct_cardinality_low FORMAT Null</query>
|
||||
<query>SELECT DISTINCT low, medium, high FROM distinct_cardinality_low FORMAT Null</query>
|
||||
|
||||
<query>select distinct low, medium from distinct_cardinality_low order by medium</query>
|
||||
<query>select distinct low, high from distinct_cardinality_low order by high</query>
|
||||
<query>select distinct low, medium, high from distinct_cardinality_low order by low</query>
|
||||
<query>SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY medium FORMAT Null</query>
|
||||
<query>SELECT DISTINCT low, medium FROM distinct_cardinality_low ORDER BY low FORMAT Null</query>
|
||||
<query>SELECT DISTINCT low, high FROM distinct_cardinality_low ORDER BY high FORMAT Null</query>
|
||||
<query>SELECT DISTINCT low, medium, high FROM distinct_cardinality_low ORDER BY high FORMAT Null</query>
|
||||
|
||||
<drop_query>DROP TABLE IF EXISTS distinct_cardinality_low</drop_query>
|
||||
</test>
|
||||
|
@ -7,13 +7,13 @@ DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix -> pre-distinct optimization only
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix and order by on column in distinct -> pre-distinct and final distinct optimization
|
||||
DistinctSortedChunkTransform
|
||||
DistinctSortedTransform
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with primary key prefix and order by on column _not_ in distinct -> pre-distinct optimization only
|
||||
DistinctSortedChunkTransform
|
||||
-- distinct with non-primary key prefix -> no optimizations
|
||||
No optimizations
|
||||
-- distinct with non-primary key prefix and order by on column in distinct -> final distinct optimization only
|
||||
DistinctSortedChunkTransform
|
||||
DistinctSortedTransform
|
||||
-- distinct with non-primary key prefix and order by on column _not_ in distinct -> no optimizations
|
||||
No optimizations
|
||||
|
@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
|
||||
DISABLE_OPTIMIZATION="set optimize_distinct_in_order=0"
|
||||
ENABLE_OPTIMIZATION="set optimize_distinct_in_order=1"
|
||||
GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform'"
|
||||
GREP_OPTIMIZATIONS="grep 'DistinctSortedChunkTransform\|DistinctSortedTransform'"
|
||||
TRIM_LEADING_SPACES="sed -e 's/^[ \t]*//'"
|
||||
FIND_OPTIMIZATIONS="$GREP_OPTIMIZATIONS | $TRIM_LEADING_SPACES"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user