mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Extend trivial count optimization.
This commit is contained in:
parent
412a54e356
commit
867216103f
@ -1115,6 +1115,7 @@ void InterpreterSelectQuery::executeFetchColumns(
|
||||
/// Optimization for trivial query like SELECT count() FROM table.
|
||||
bool optimize_trivial_count =
|
||||
syntax_analyzer_result->optimize_trivial_count
|
||||
&& (settings.max_parallel_replicas <= 1)
|
||||
&& storage
|
||||
&& !filter_info
|
||||
&& processing_stage == QueryProcessingStage::FetchColumns
|
||||
@ -1126,7 +1127,17 @@ void InterpreterSelectQuery::executeFetchColumns(
|
||||
{
|
||||
const auto & desc = query_analyzer->aggregates()[0];
|
||||
const auto & func = desc.function;
|
||||
std::optional<UInt64> num_rows = storage->totalRows();
|
||||
std::optional<UInt64> num_rows{};
|
||||
if (!query.prewhere() && !query.where())
|
||||
num_rows = storage->totalRows();
|
||||
else // It's possible to optimize count() given only partition predicates
|
||||
{
|
||||
SelectQueryInfo temp_query_info;
|
||||
temp_query_info.query = query_ptr;
|
||||
temp_query_info.syntax_analyzer_result = syntax_analyzer_result;
|
||||
temp_query_info.sets = query_analyzer->getPreparedSets();
|
||||
num_rows = storage->totalRowsByPartitionPredicate(temp_query_info, *context);
|
||||
}
|
||||
if (num_rows)
|
||||
{
|
||||
AggregateFunctionCount & agg_count = static_cast<AggregateFunctionCount &>(*func);
|
||||
|
@ -478,6 +478,24 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
|
||||
/// If we have no information about columns sizes, choose a column of minimum size of its data type.
|
||||
required.insert(ExpressionActions::getSmallestColumn(source_columns));
|
||||
}
|
||||
else if (is_select && metadata_snapshot)
|
||||
{
|
||||
const auto & partition_desc = metadata_snapshot->getPartitionKey();
|
||||
if (partition_desc.expression)
|
||||
{
|
||||
const auto & partition_source_columns = partition_desc.expression->getRequiredColumns();
|
||||
optimize_trivial_count = true;
|
||||
for (const auto & required_column : required)
|
||||
{
|
||||
if (std::find(partition_source_columns.begin(), partition_source_columns.end(), required_column)
|
||||
== partition_source_columns.end())
|
||||
{
|
||||
optimize_trivial_count = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NameSet unknown_required_source_columns = required;
|
||||
|
||||
@ -620,7 +638,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
|
||||
|
||||
if (result.optimize_trivial_count)
|
||||
result.optimize_trivial_count = settings.optimize_trivial_count_query &&
|
||||
!select_query->where() && !select_query->prewhere() && !select_query->groupBy() && !select_query->having() &&
|
||||
!select_query->groupBy() && !select_query->having() &&
|
||||
!select_query->sampleSize() && !select_query->sampleOffset() && !select_query->final() &&
|
||||
(tables_with_columns.size() < 2 || isLeft(result.analyzed_join->kind()));
|
||||
|
||||
|
@ -463,6 +463,9 @@ public:
|
||||
/// Does take underlying Storage (if any) into account.
|
||||
virtual std::optional<UInt64> totalRows() const { return {}; }
|
||||
|
||||
/// Same as above but also take partition predicate into account.
|
||||
virtual std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, const Context &) const { return {}; }
|
||||
|
||||
/// If it is possible to quickly determine exact number of bytes for the table on storage:
|
||||
/// - memory (approximated, resident)
|
||||
/// - disk (compressed)
|
||||
|
@ -368,8 +368,10 @@ KeyCondition::KeyCondition(
|
||||
const SelectQueryInfo & query_info,
|
||||
const Context & context,
|
||||
const Names & key_column_names,
|
||||
const ExpressionActionsPtr & key_expr_)
|
||||
: key_expr(key_expr_), prepared_sets(query_info.sets)
|
||||
const ExpressionActionsPtr & key_expr_,
|
||||
bool single_point_,
|
||||
bool strict_)
|
||||
: key_expr(key_expr_), prepared_sets(query_info.sets), single_point(single_point_), strict(strict_)
|
||||
{
|
||||
for (size_t i = 0, size = key_column_names.size(); i < size; ++i)
|
||||
{
|
||||
@ -551,6 +553,18 @@ bool KeyCondition::canConstantBeWrappedByMonotonicFunctions(
|
||||
Field & out_value,
|
||||
DataTypePtr & out_type)
|
||||
{
|
||||
/// We don't look for inverse key transformations when strict is true, which is required for trivial count().
|
||||
/// Consider the following test case:
|
||||
///
|
||||
/// create table test1(p DateTime, k int) engine MergeTree partition by toDate(p) order by k;
|
||||
/// insert into test1 values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
|
||||
/// select count() from test1 where p > toDateTime('2020-09-01 10:00:00');
|
||||
///
|
||||
/// toDate(DateTime) is always monotonic, but we cannot relax the predicate to be
|
||||
/// >= toDate(toDateTime('2020-09-01 10:00:00')), which returns 3 instead of the right count: 2.
|
||||
if (strict)
|
||||
return false;
|
||||
|
||||
String expr_name = node->getColumnName();
|
||||
const auto & sample_block = key_expr->getSampleBlock();
|
||||
if (!sample_block.has(expr_name))
|
||||
@ -734,7 +748,8 @@ bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctions(
|
||||
arguments.push_back({ nullptr, key_column_type, "" });
|
||||
auto func = func_builder->build(arguments);
|
||||
|
||||
if (!func || !func->hasInformationAboutMonotonicity())
|
||||
/// If we know the given range only contains one value, then we treat all functions as positive monotonic.
|
||||
if (!func || (!single_point && !func->hasInformationAboutMonotonicity()))
|
||||
return false;
|
||||
|
||||
key_column_type = func->getReturnType();
|
||||
@ -1163,13 +1178,16 @@ BoolMask KeyCondition::checkInRange(
|
||||
std::optional<Range> KeyCondition::applyMonotonicFunctionsChainToRange(
|
||||
Range key_range,
|
||||
const MonotonicFunctionsChain & functions,
|
||||
DataTypePtr current_type)
|
||||
DataTypePtr current_type,
|
||||
bool single_point)
|
||||
{
|
||||
for (const auto & func : functions)
|
||||
{
|
||||
/// We check the monotonicity of each function on a specific range.
|
||||
IFunction::Monotonicity monotonicity = func->getMonotonicityForRange(
|
||||
*current_type.get(), key_range.left, key_range.right);
|
||||
/// If we know the given range only contains one value, then we treat all functions as positive monotonic.
|
||||
IFunction::Monotonicity monotonicity = single_point
|
||||
? IFunction::Monotonicity{true}
|
||||
: func->getMonotonicityForRange(*current_type.get(), key_range.left, key_range.right);
|
||||
|
||||
if (!monotonicity.is_monotonic)
|
||||
{
|
||||
@ -1299,7 +1317,8 @@ BoolMask KeyCondition::checkInHyperrectangle(
|
||||
std::optional<Range> new_range = applyMonotonicFunctionsChainToRange(
|
||||
*key_range,
|
||||
element.monotonic_functions_chain,
|
||||
data_types[element.key_column]
|
||||
data_types[element.key_column],
|
||||
single_point
|
||||
);
|
||||
|
||||
if (!new_range)
|
||||
|
@ -232,7 +232,9 @@ public:
|
||||
const SelectQueryInfo & query_info,
|
||||
const Context & context,
|
||||
const Names & key_column_names,
|
||||
const ExpressionActionsPtr & key_expr);
|
||||
const ExpressionActionsPtr & key_expr,
|
||||
bool single_point_ = false,
|
||||
bool strict_ = false);
|
||||
|
||||
/// Whether the condition and its negation are feasible in the direct product of single column ranges specified by `hyperrectangle`.
|
||||
BoolMask checkInHyperrectangle(
|
||||
@ -307,7 +309,8 @@ public:
|
||||
static std::optional<Range> applyMonotonicFunctionsChainToRange(
|
||||
Range key_range,
|
||||
const MonotonicFunctionsChain & functions,
|
||||
DataTypePtr current_type);
|
||||
DataTypePtr current_type,
|
||||
bool single_point = false);
|
||||
|
||||
bool matchesExactContinuousRange() const;
|
||||
|
||||
@ -413,6 +416,11 @@ private:
|
||||
ColumnIndices key_columns;
|
||||
ExpressionActionsPtr key_expr;
|
||||
PreparedSets prepared_sets;
|
||||
|
||||
// If true, always allow key_expr to be wrapped by a function
|
||||
bool single_point;
|
||||
// If true, do not use always_monotonic information to transform constants
|
||||
bool strict;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -189,6 +189,39 @@ std::optional<UInt64> StorageMergeTree::totalRows() const
|
||||
return getTotalActiveSizeInRows();
|
||||
}
|
||||
|
||||
std::optional<UInt64> StorageMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const
|
||||
{
|
||||
auto metadata_snapshot = getInMemoryMetadataPtr();
|
||||
const auto & partition_key = metadata_snapshot->getPartitionKey();
|
||||
Names partition_key_columns = partition_key.column_names;
|
||||
KeyCondition key_condition(
|
||||
query_info, context, partition_key_columns, partition_key.expression, true /* single_point */, true /* strict */);
|
||||
if (key_condition.alwaysUnknownOrTrue())
|
||||
return {};
|
||||
std::unordered_map<String, bool> partition_filter_map;
|
||||
size_t res = 0;
|
||||
auto lock = lockParts();
|
||||
for (const auto & part : getDataPartsStateRange(DataPartState::Committed))
|
||||
{
|
||||
if (part->isEmpty())
|
||||
continue;
|
||||
const auto & partition_id = part->info.partition_id;
|
||||
bool is_valid;
|
||||
if (auto it = partition_filter_map.find(partition_id); it != partition_filter_map.end())
|
||||
is_valid = it->second;
|
||||
else
|
||||
{
|
||||
const auto & partition_value = part->partition.value;
|
||||
std::vector<FieldRef> index_value(partition_value.begin(), partition_value.end());
|
||||
is_valid = key_condition.mayBeTrueInRange(partition_value.size(), index_value.data(), index_value.data(), partition_key.data_types);
|
||||
partition_filter_map.emplace(partition_id, is_valid);
|
||||
}
|
||||
if (is_valid)
|
||||
res += part->rows_count;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
std::optional<UInt64> StorageMergeTree::totalBytes() const
|
||||
{
|
||||
return getTotalActiveSizeInBytes();
|
||||
|
@ -47,6 +47,7 @@ public:
|
||||
unsigned num_streams) override;
|
||||
|
||||
std::optional<UInt64> totalRows() const override;
|
||||
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, const Context &) const override;
|
||||
std::optional<UInt64> totalBytes() const override;
|
||||
|
||||
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;
|
||||
|
@ -3563,6 +3563,36 @@ std::optional<UInt64> StorageReplicatedMergeTree::totalRows() const
|
||||
return res;
|
||||
}
|
||||
|
||||
std::optional<UInt64> StorageReplicatedMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const
|
||||
{
|
||||
auto metadata_snapshot = getInMemoryMetadataPtr();
|
||||
const auto & partition_key = metadata_snapshot->getPartitionKey();
|
||||
Names partition_key_columns = partition_key.column_names;
|
||||
KeyCondition key_condition(
|
||||
query_info, context, partition_key_columns, partition_key.expression, true /* single_point */, true /* strict */);
|
||||
if (key_condition.alwaysUnknownOrTrue())
|
||||
return {};
|
||||
std::unordered_map<String, bool> partition_filter_map;
|
||||
size_t res = 0;
|
||||
foreachCommittedParts([&](auto & part)
|
||||
{
|
||||
const auto & partition_id = part->info.partition_id;
|
||||
bool is_valid;
|
||||
if (auto it = partition_filter_map.find(partition_id); it != partition_filter_map.end())
|
||||
is_valid = it->second;
|
||||
else
|
||||
{
|
||||
const auto & partition_value = part->partition.value;
|
||||
std::vector<FieldRef> index_value(partition_value.begin(), partition_value.end());
|
||||
is_valid = key_condition.mayBeTrueInRange(partition_value.size(), index_value.data(), index_value.data(), partition_key.data_types);
|
||||
partition_filter_map.emplace(partition_id, is_valid);
|
||||
}
|
||||
if (is_valid)
|
||||
res += part->rows_count;
|
||||
});
|
||||
return res;
|
||||
}
|
||||
|
||||
std::optional<UInt64> StorageReplicatedMergeTree::totalBytes() const
|
||||
{
|
||||
UInt64 res = 0;
|
||||
|
@ -97,6 +97,7 @@ public:
|
||||
unsigned num_streams) override;
|
||||
|
||||
std::optional<UInt64> totalRows() const override;
|
||||
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const override;
|
||||
std::optional<UInt64> totalBytes() const override;
|
||||
|
||||
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;
|
||||
|
@ -30,14 +30,14 @@ ${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \
|
||||
${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \
|
||||
(301, 20, 3), (302, 21, 3), (303, 22, 3)"
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
|
||||
|
||||
${CLICKHOUSE_CLIENT} --query="DROP TABLE composite_partition_key"
|
||||
|
@ -0,0 +1,9 @@
|
||||
0
|
||||
0
|
||||
2
|
||||
1
|
||||
1
|
||||
0
|
||||
2
|
||||
0
|
||||
3
|
@ -0,0 +1,45 @@
|
||||
-- Checks which `select count()` queries can be answered from partition metadata alone.
-- With max_rows_to_read = 1, a query marked "non-optimized" is expected to fail with
-- server error 158, while an "optimized" query is expected to return its count.

-- Drop leftovers of a previously aborted run so that none of the CREATE statements below can fail.
drop table if exists test1;
drop table if exists test_tuple;
drop table if exists test_two_args;

create table test1(p DateTime, k int) engine MergeTree partition by toDate(p) order by k;
insert into test1 values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);

set max_rows_to_read = 1;
-- non-optimized
select count() from test1 settings max_parallel_replicas = 3; -- { serverError 158; }
-- optimized (toYear is monotonic and we provide the partition expr as is)
select count() from test1 where toYear(toDate(p)) = 1999;
-- non-optimized (toDate(DateTime) is always monotonic, but we cannot relax the predicate to do trivial count())
select count() from test1 where p > toDateTime('2020-09-01 10:00:00'); -- { serverError 158; }
-- optimized (partition expr wrapped with non-monotonic functions)
select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(toDate(p))) % 2 = 1;
select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(toDate(p))) % 2 = 0;
-- non-optimized (some predicate depends on non-partition_expr columns)
select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; }
-- optimized
select count() from test1 where toDate(p) > '2020-09-01';

create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;

insert into test_tuple values ('2020-09-01 00:01:02', 1, 2), ('2020-09-01 00:01:03', 2, 3), ('2020-09-02 00:01:03', 3, 4);

-- optimized
select count() from test_tuple where toDate(p) > '2020-09-01';
-- optimized
select count() from test_tuple where toDate(p) > '2020-09-01' and i = 1;
-- optimized
select count() from test_tuple where i > 1;
-- optimized
select count() from test_tuple where i < 1;

create table test_two_args(i int, j int, k int) engine MergeTree partition by i + j order by k;

insert into test_two_args values (1, 2, 3), (2, 1, 3), (0, 3, 4);

-- optimized
select count() from test_two_args where i + j = 3;
-- non-optimized
select count() from test_two_args where i = 1; -- { serverError 158; }

drop table test1;
drop table test_tuple;
drop table test_two_args;
|
Loading…
Reference in New Issue
Block a user