Merge branch 'ClickHouse:master' into footer_column__names

This commit is contained in:
Shaun Struwig 2024-06-11 15:14:00 +02:00 committed by GitHub
commit cf17a0aa48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 174 additions and 52 deletions

2
contrib/googletest vendored

@ -1 +1 @@
Subproject commit e47544ad31cb3ceecd04cc13e8fe556f8df9fe0b
Subproject commit a7f443b80b105f940225332ed3c31f2790092f47

View File

@ -421,6 +421,9 @@ struct AggregateProjectionCandidates
/// This flag means that DAG for projection candidate should be used in FilterStep.
bool has_filter = false;
/// If not empty, try to find exact ranges from parts to speed up trivial count queries.
String only_count_column;
};
AggregateProjectionCandidates getAggregateProjectionCandidates(
@ -502,6 +505,12 @@ AggregateProjectionCandidates getAggregateProjectionCandidates(
candidates.minmax_projection.emplace(std::move(minmax));
}
}
else
{
/// Trivial count optimization only applies after @can_use_minmax_projection.
if (keys.empty() && aggregates.size() == 1 && typeid_cast<const AggregateFunctionCount *>(aggregates[0].function.get()))
candidates.only_count_column = aggregates[0].column_name;
}
}
if (!candidates.minmax_projection)
@ -584,13 +593,21 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
ContextPtr context = reading->getContext();
MergeTreeDataSelectExecutor reader(reading->getMergeTreeData());
AggregateProjectionCandidate * best_candidate = nullptr;
/// Stores row count from exact ranges of parts.
size_t exact_count = 0;
if (candidates.minmax_projection)
{
best_candidate = &candidates.minmax_projection->candidate;
}
else if (!candidates.real.empty())
else if (!candidates.real.empty() || !candidates.only_count_column.empty())
{
auto ordinary_reading_select_result = reading->selectRangesToRead();
auto ordinary_reading_select_result = reading->getAnalyzedResult();
bool find_exact_ranges = !candidates.only_count_column.empty();
if (!ordinary_reading_select_result || (!ordinary_reading_select_result->has_exact_ranges && find_exact_ranges))
ordinary_reading_select_result = reading->selectRangesToRead(find_exact_ranges);
size_t ordinary_reading_marks = ordinary_reading_select_result->selected_marks;
/// Nothing to read. Ignore projections.
@ -600,7 +617,49 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
return {};
}
const auto & parts_with_ranges = ordinary_reading_select_result->parts_with_ranges;
auto & parts_with_ranges = ordinary_reading_select_result->parts_with_ranges;
if (!candidates.only_count_column.empty())
{
for (auto & part_with_ranges : parts_with_ranges)
{
MarkRanges new_ranges;
auto & ranges = part_with_ranges.ranges;
const auto & exact_ranges = part_with_ranges.exact_ranges;
if (exact_ranges.empty())
continue;
size_t i = 0;
size_t len = exact_ranges.size();
for (auto & range : ranges)
{
while (i < len && exact_ranges[i].begin < range.end)
{
chassert(exact_ranges[i].begin >= range.begin);
chassert(exact_ranges[i].end <= range.end);
/// Found some marks which are not exact
if (range.begin < exact_ranges[i].begin)
new_ranges.emplace_back(range.begin, exact_ranges[i].begin);
range.begin = exact_ranges[i].end;
ordinary_reading_marks -= exact_ranges[i].end - exact_ranges[i].begin;
exact_count += part_with_ranges.data_part->index_granularity.getRowsCountInRange(exact_ranges[i]);
++i;
}
/// Current range still contains some marks which are not exact
if (range.begin < range.end)
new_ranges.emplace_back(range);
}
chassert(i == len);
part_with_ranges.ranges = std::move(new_ranges);
}
std::erase_if(parts_with_ranges, [&](const auto & part_with_ranges) { return part_with_ranges.ranges.empty(); });
if (parts_with_ranges.empty())
chassert(ordinary_reading_marks == 0);
}
/// Selecting best candidate.
for (auto & candidate : candidates.real)
@ -630,8 +689,20 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
if (!best_candidate)
{
reading->setAnalyzedResult(std::move(ordinary_reading_select_result));
return {};
if (exact_count > 0)
{
if (ordinary_reading_marks > 0)
{
ordinary_reading_select_result->selected_marks = ordinary_reading_marks;
ordinary_reading_select_result->selected_rows -= exact_count;
reading->setAnalyzedResult(std::move(ordinary_reading_select_result));
}
}
else
{
reading->setAnalyzedResult(std::move(ordinary_reading_select_result));
return {};
}
}
}
else
@ -639,10 +710,11 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
return {};
}
chassert(best_candidate != nullptr);
QueryPlanStepPtr projection_reading;
bool has_ordinary_parts;
String selected_projection_name;
if (best_candidate)
selected_projection_name = best_candidate->projection->name;
/// Add reading from projection step.
if (candidates.minmax_projection)
@ -654,6 +726,32 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
has_ordinary_parts = false;
}
else if (best_candidate == nullptr)
{
chassert(exact_count > 0);
auto agg_count = std::make_shared<AggregateFunctionCount>(DataTypes{});
std::vector<char> state(agg_count->sizeOfData());
AggregateDataPtr place = state.data();
agg_count->create(place);
SCOPE_EXIT_MEMORY_SAFE(agg_count->destroy(place));
agg_count->set(place, exact_count);
auto column = ColumnAggregateFunction::create(agg_count);
column->insertFrom(place);
Block block_with_count{
{std::move(column),
std::make_shared<DataTypeAggregateFunction>(agg_count, DataTypes{}, Array{}),
candidates.only_count_column}};
Pipe pipe(std::make_shared<SourceFromSingleChunk>(std::move(block_with_count)));
projection_reading = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
selected_projection_name = "Optimized trivial count";
has_ordinary_parts = reading->getAnalyzedResult() != nullptr;
}
else
{
auto storage_snapshot = reading->getStorageSnapshot();
@ -694,46 +792,54 @@ std::optional<String> optimizeUseAggregateProjections(QueryPlan::Node & node, Qu
context->getQueryContext()->addQueryAccessInfo(Context::QualifiedProjectionName
{
.storage_id = reading->getMergeTreeData().getStorageID(),
.projection_name = best_candidate->projection->name,
.projection_name = selected_projection_name,
});
}
// LOG_TRACE(getLogger("optimizeUseProjections"), "Projection reading header {}",
// projection_reading->getOutputStream().header.dumpStructure());
projection_reading->setStepDescription(best_candidate->projection->name);
projection_reading->setStepDescription(selected_projection_name);
auto & projection_reading_node = nodes.emplace_back(QueryPlan::Node{.step = std::move(projection_reading)});
auto & expr_or_filter_node = nodes.emplace_back();
if (candidates.has_filter)
/// Root node of optimized child plan using @projection_name
QueryPlan::Node * aggregate_projection_node = nullptr;
if (best_candidate)
{
expr_or_filter_node.step = std::make_unique<FilterStep>(
projection_reading_node.step->getOutputStream(),
best_candidate->dag,
best_candidate->dag->getOutputs().front()->result_name,
true);
}
else
expr_or_filter_node.step = std::make_unique<ExpressionStep>(
projection_reading_node.step->getOutputStream(),
best_candidate->dag);
aggregate_projection_node = &nodes.emplace_back();
if (candidates.has_filter)
{
aggregate_projection_node->step = std::make_unique<FilterStep>(
projection_reading_node.step->getOutputStream(),
best_candidate->dag,
best_candidate->dag->getOutputs().front()->result_name,
true);
}
else
aggregate_projection_node->step
= std::make_unique<ExpressionStep>(projection_reading_node.step->getOutputStream(), best_candidate->dag);
expr_or_filter_node.children.push_back(&projection_reading_node);
aggregate_projection_node->children.push_back(&projection_reading_node);
}
else /// trivial count optimization
{
aggregate_projection_node = &projection_reading_node;
}
if (!has_ordinary_parts)
{
/// All parts are taken from projection
aggregating->requestOnlyMergeForAggregateProjection(expr_or_filter_node.step->getOutputStream());
node.children.front() = &expr_or_filter_node;
aggregating->requestOnlyMergeForAggregateProjection(aggregate_projection_node->step->getOutputStream());
node.children.front() = aggregate_projection_node;
}
else
{
node.step = aggregating->convertToAggregatingProjection(expr_or_filter_node.step->getOutputStream());
node.children.push_back(&expr_or_filter_node);
node.step = aggregating->convertToAggregatingProjection(aggregate_projection_node->step->getOutputStream());
node.children.push_back(aggregate_projection_node);
}
return best_candidate->projection->name;
return selected_projection_name;
}
}

View File

@ -139,7 +139,9 @@ std::optional<String> optimizeUseNormalProjections(Stack & stack, QueryPlan::Nod
const auto & query_info = reading->getQueryInfo();
MergeTreeDataSelectExecutor reader(reading->getMergeTreeData());
auto ordinary_reading_select_result = reading->selectRangesToRead();
auto ordinary_reading_select_result = reading->getAnalyzedResult();
if (!ordinary_reading_select_result)
ordinary_reading_select_result = reading->selectRangesToRead();
size_t ordinary_reading_marks = ordinary_reading_select_result->selected_marks;
/// Nothing to read. Ignore projections.

View File

@ -25,8 +25,7 @@ namespace QueryPlanOptimizations
bool canUseProjectionForReadingStep(ReadFromMergeTree * reading)
{
/// A projection has probably been applied already.
if (reading->hasAnalyzedResult())
if (reading->getAnalyzedResult() && reading->getAnalyzedResult()->readFromProjection())
return false;
if (reading->isQueryWithFinal())

View File

@ -1358,9 +1358,9 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal(
return merging_pipes.empty() ? Pipe::unitePipes(std::move(no_merging_pipes)) : Pipe::unitePipes(std::move(merging_pipes));
}
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead() const
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(bool find_exact_ranges) const
{
return selectRangesToRead(prepared_parts, alter_conversions_for_parts, false /* find_exact_ranges */);
return selectRangesToRead(prepared_parts, alter_conversions_for_parts, find_exact_ranges);
}
ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
@ -1664,6 +1664,7 @@ ReadFromMergeTree::AnalysisResultPtr ReadFromMergeTree::selectRangesToRead(
result.selected_marks_pk = sum_marks_pk;
result.total_marks_pk = total_marks_pk;
result.selected_rows = sum_rows;
result.has_exact_ranges = result.selected_parts == 0 || find_exact_ranges;
if (query_info_.input_order_info)
result.read_type = (query_info_.input_order_info->direction > 0)

View File

@ -100,7 +100,9 @@ public:
UInt64 selected_marks_pk = 0;
UInt64 total_marks_pk = 0;
UInt64 selected_rows = 0;
bool has_exact_ranges = false;
/// Reports whether this analysis result reads from a projection:
/// true only if at least one part was selected and the first selected part is a projection part.
bool readFromProjection() const
{
    if (parts_with_ranges.empty())
        return false;

    return parts_with_ranges.front().data_part->isProjectionPart();
}
void checkLimits(const Settings & settings, const SelectQueryInfo & query_info_) const;
};
@ -167,7 +169,7 @@ public:
AnalysisResultPtr selectRangesToRead(
MergeTreeData::DataPartsVector parts, std::vector<AlterConversionsPtr> alter_conversions, bool find_exact_ranges = false) const;
AnalysisResultPtr selectRangesToRead() const;
AnalysisResultPtr selectRangesToRead(bool find_exact_ranges = false) const;
StorageMetadataPtr getStorageMetadata() const { return metadata_for_reading; }
@ -182,7 +184,7 @@ public:
bool requestOutputEachPartitionThroughSeparatePort();
bool willOutputEachPartitionThroughSeparatePort() const { return output_each_partition_through_separate_port; }
bool hasAnalyzedResult() const { return analyzed_result_ptr != nullptr; }
AnalysisResultPtr getAnalyzedResult() const { return analyzed_result_ptr; }
void setAnalyzedResult(AnalysisResultPtr analyzed_result_ptr_) { analyzed_result_ptr = std::move(analyzed_result_ptr_); }
const MergeTreeData::DataPartsVector & getParts() const { return prepared_parts; }

View File

@ -64,7 +64,6 @@ ProjectionDescription ProjectionDescription::clone() const
other.sample_block_for_keys = sample_block_for_keys;
other.metadata = metadata;
other.key_size = key_size;
other.is_minmax_count_projection = is_minmax_count_projection;
other.primary_key_max_column_name = primary_key_max_column_name;
other.partition_value_indices = partition_value_indices;
@ -195,7 +194,6 @@ ProjectionDescription ProjectionDescription::getMinMaxCountProjection(
ContextPtr query_context)
{
ProjectionDescription result;
result.is_minmax_count_projection = true;
auto select_query = std::make_shared<ASTProjectionSelectQuery>();
ASTPtr select_expression_list = std::make_shared<ASTExpressionList>();
@ -282,13 +280,11 @@ ProjectionDescription ProjectionDescription::getMinMaxCountProjection(
return result;
}
/// Rebuilds this projection description in place.
/// Calls getProjectionFromAST with the stored definition_ast, the supplied `new_columns`
/// and `query_context`, and assigns the returned description to *this.
void ProjectionDescription::recalculateWithNewColumns(const ColumnsDescription & new_columns, ContextPtr query_context)
{
*this = getProjectionFromAST(definition_ast, new_columns, query_context);
}
Block ProjectionDescription::calculate(const Block & block, ContextPtr context) const
{
auto mut_context = Context::createCopy(context);

View File

@ -56,8 +56,6 @@ struct ProjectionDescription
size_t key_size = 0;
bool is_minmax_count_projection = false;
/// If a primary key expression is used in the minmax_count projection, store the name of max expression.
String primary_key_max_column_name;

View File

@ -11,9 +11,11 @@ ${CLICKHOUSE_CLIENT} --query="CREATE TABLE single_col_partition_key(x UInt32) EN
${CLICKHOUSE_CLIENT} --query="INSERT INTO single_col_partition_key VALUES (1), (2), (3), (4), (11), (12), (20)"
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x < 3 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x >= 11 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x = 20 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
DISABLE_COUNT_OPTIMIZATION="SETTINGS optimize_trivial_count_query = 0, optimize_use_implicit_projections = 0"
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x < 3 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x >= 11 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM single_col_partition_key WHERE x = 20 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="DROP TABLE single_col_partition_key"
@ -31,14 +33,14 @@ ${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \
${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \
(301, 20, 3), (302, 21, 3), (303, 22, 3)"
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML $DISABLE_COUNT_OPTIMIZATION" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="DROP TABLE composite_partition_key"

View File

@ -45,6 +45,7 @@ ${CLICKHOUSE_CLIENT} --query="INSERT INTO enum_test_table VALUES ('hello'), ('wo
${CLICKHOUSE_CLIENT} --query="INSERT INTO date_test_table VALUES (1), (2), (2), (256), (257), (257);"
CLICKHOUSE_CLIENT=$(echo ${CLICKHOUSE_CLIENT} | sed 's/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/--send_logs_level=debug/g')
CLICKHOUSE_CLIENT="${CLICKHOUSE_CLIENT} --optimize_use_implicit_projections 0"
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM string_test_table WHERE toUInt64(val) == 0;" 2>&1 |grep -q "3 marks to read from 1 ranges" && echo "no monotonic int case: String -> UInt64"
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM fixed_string_test_table WHERE toUInt64(val) == 0;" 2>&1 |grep -q "3 marks to read from 1 ranges" && echo "no monotonic int case: FixedString -> UInt64"

View File

@ -0,0 +1,3 @@
ReadFromMergeTree (default.x)
ReadFromPreparedSource (Optimized trivial count)
5

View File

@ -0,0 +1,8 @@
-- Checks count() with a filter on the sorting key of a small MergeTree table.
drop table if exists x;
-- Small granules (index_granularity = 3) for the 10 rows inserted below.
create table x (i int) engine MergeTree order by i settings index_granularity = 3;
insert into x select * from numbers(10);
-- Keep only the plan lines that mention ReadFromPreparedSource or ReadFromMergeTree.
select * from (explain select count() from x where (i >= 3 and i <= 6) or i = 7) where explain like '%ReadFromPreparedSource%' or explain like '%ReadFromMergeTree%';
-- Values 3, 4, 5, 6 and 7 satisfy the predicate, so the expected count is 5.
select count() from x where (i >= 3 and i <= 6) or i = 7;
drop table x;

View File

@ -1,6 +1,8 @@
-- Tags: no-replicated-database
-- Tag no-replicated-database: Requires investigation
SET optimize_use_implicit_projections = 0;
EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID = 29103473;
EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID != 29103473;
EXPLAIN ESTIMATE SELECT count() FROM test.hits WHERE CounterID > 29103473;

View File

@ -1,3 +1,5 @@
SET optimize_use_implicit_projections = 0;
-- the work for scalar subquery is properly accounted:
SET max_rows_to_read = 1000000;
SELECT 1 = (SELECT count() FROM test.hits WHERE NOT ignore(AdvEngineID)); -- { serverError TOO_MANY_ROWS }