Merge pull request #15074 from amosbird/btc

Extend trivial count optimization.
This commit is contained in:
alexey-milovidov 2020-10-22 02:50:57 +03:00 committed by GitHub
commit adeba6bdd8
12 changed files with 196 additions and 18 deletions
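In short: SELECT count() queries whose WHERE/PREWHERE predicates only touch partition-key columns can now be answered from per-part row counts instead of reading data. A minimal sketch of the intended behavior, using the table from the test added in this PR:

create table test1(p DateTime, k int) engine MergeTree partition by toDate(p) order by k;
insert into test1 values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
-- with this change, the count is taken from part-level metadata (no rows are read):
select count() from test1 where toDate(p) > '2020-09-01';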

View File

@ -1116,6 +1116,7 @@ void InterpreterSelectQuery::executeFetchColumns(
/// Optimization for trivial query like SELECT count() FROM table.
bool optimize_trivial_count =
syntax_analyzer_result->optimize_trivial_count
&& (settings.max_parallel_replicas <= 1)
&& storage
&& storage->getName() != "MaterializeMySQL"
&& !filter_info
@ -1128,7 +1129,17 @@ void InterpreterSelectQuery::executeFetchColumns(
{
const auto & desc = query_analyzer->aggregates()[0];
const auto & func = desc.function;
- std::optional<UInt64> num_rows = storage->totalRows();
+ std::optional<UInt64> num_rows{};
if (!query.prewhere() && !query.where())
num_rows = storage->totalRows();
else // It's possible to optimize count() given only partition predicates
{
SelectQueryInfo temp_query_info;
temp_query_info.query = query_ptr;
temp_query_info.syntax_analyzer_result = syntax_analyzer_result;
temp_query_info.sets = query_analyzer->getPreparedSets();
num_rows = storage->totalRowsByPartitionPredicate(temp_query_info, *context);
}
if (num_rows)
{
AggregateFunctionCount & agg_count = static_cast<AggregateFunctionCount &>(*func);

View File

@ -478,6 +478,24 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
/// If we have no information about column sizes, choose the column whose data type has the smallest size.
required.insert(ExpressionActions::getSmallestColumn(source_columns));
}
else if (is_select && metadata_snapshot)
{
const auto & partition_desc = metadata_snapshot->getPartitionKey();
if (partition_desc.expression)
{
const auto & partition_source_columns = partition_desc.expression->getRequiredColumns();
optimize_trivial_count = true;
for (const auto & required_column : required)
{
if (std::find(partition_source_columns.begin(), partition_source_columns.end(), required_column)
== partition_source_columns.end())
{
optimize_trivial_count = false;
break;
}
}
}
}
NameSet unknown_required_source_columns = required;
@ -620,7 +638,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
if (result.optimize_trivial_count)
result.optimize_trivial_count = settings.optimize_trivial_count_query &&
- !select_query->where() && !select_query->prewhere() && !select_query->groupBy() && !select_query->having() &&
+ !select_query->groupBy() && !select_query->having() &&
!select_query->sampleSize() && !select_query->sampleOffset() && !select_query->final() &&
(tables_with_columns.size() < 2 || isLeft(result.analyzed_join->kind()));
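To illustrate the new rule above (examples drawn from the test added below): a WHERE or PREWHERE clause no longer disables the trivial-count flag by itself; the flag now survives when every column the query requires is a source column of the partition key expression.

-- flag kept: the predicate only uses the partition source column p
select count() from test1 where toDate(p) > '2020-09-01';
-- flag dropped: k is not part of the partition key expression
select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2;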

View File

@ -463,6 +463,9 @@ public:
/// Takes the underlying Storage (if any) into account.
virtual std::optional<UInt64> totalRows() const { return {}; }
/// Same as above, but also takes the partition predicate into account.
virtual std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, const Context &) const { return {}; }
/// If it is possible to quickly determine exact number of bytes for the table on storage:
/// - memory (approximated, resident)
/// - disk (compressed)

View File

@ -368,8 +368,10 @@ KeyCondition::KeyCondition(
const SelectQueryInfo & query_info,
const Context & context,
const Names & key_column_names,
- const ExpressionActionsPtr & key_expr_)
- : key_expr(key_expr_), prepared_sets(query_info.sets)
+ const ExpressionActionsPtr & key_expr_,
+ bool single_point_,
+ bool strict_)
+ : key_expr(key_expr_), prepared_sets(query_info.sets), single_point(single_point_), strict(strict_)
{
for (size_t i = 0, size = key_column_names.size(); i < size; ++i)
{
@ -549,6 +551,18 @@ bool KeyCondition::canConstantBeWrappedByMonotonicFunctions(
Field & out_value,
DataTypePtr & out_type)
{
/// We don't look for inverted key transformations when strict is true, which is required for trivial count().
/// Consider the following test case:
///
/// create table test1(p DateTime, k int) engine MergeTree partition by toDate(p) order by k;
/// insert into test1 values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
/// select count() from test1 where p > toDateTime('2020-09-01 10:00:00');
///
/// toDate(DateTime) is always monotonic, but we cannot relax the predicate to
/// >= toDate(toDateTime('2020-09-01 10:00:00')), which would return 3 instead of the correct count of 2.
if (strict)
return false;
String expr_name = node->getColumnName();
const auto & sample_block = key_expr->getSampleBlock();
if (!sample_block.has(expr_name))
@ -732,7 +746,8 @@ bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctions(
arguments.push_back({ nullptr, key_column_type, "" });
auto func = func_builder->build(arguments);
- if (!func || !func->hasInformationAboutMonotonicity())
+ /// If we know the given range only contains one value, then we treat all functions as positive monotonic.
+ if (!func || (!single_point && !func->hasInformationAboutMonotonicity()))
return false;
key_column_type = func->getResultType();
@ -1161,13 +1176,16 @@ BoolMask KeyCondition::checkInRange(
std::optional<Range> KeyCondition::applyMonotonicFunctionsChainToRange(
Range key_range,
const MonotonicFunctionsChain & functions,
- DataTypePtr current_type)
+ DataTypePtr current_type,
+ bool single_point)
{
for (const auto & func : functions)
{
/// We check the monotonicity of each function on a specific range.
- IFunction::Monotonicity monotonicity = func->getMonotonicityForRange(
-     *current_type.get(), key_range.left, key_range.right);
+ /// If we know the given range only contains one value, then we treat all functions as positive monotonic.
+ IFunction::Monotonicity monotonicity = single_point
+     ? IFunction::Monotonicity{true}
+     : func->getMonotonicityForRange(*current_type.get(), key_range.left, key_range.right);
if (!monotonicity.is_monotonic)
{
@ -1297,7 +1315,8 @@ BoolMask KeyCondition::checkInHyperrectangle(
std::optional<Range> new_range = applyMonotonicFunctionsChainToRange(
*key_range,
element.monotonic_functions_chain,
- data_types[element.key_column]
+ data_types[element.key_column],
+ single_point
);
if (!new_range)
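A short note on single_point (editor's sketch, not part of the commit): when the checked range degenerates to a single partition value, any deterministic function maps that one point to exactly one point, so the transformed range stays exact even for functions that report no monotonicity information. This is what allows partition values to be checked against predicates like the following one from the new test, even though sipHash64 is not monotonic:

select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(toDate(p))) % 2 = 1;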

View File

@ -232,7 +232,9 @@ public:
const SelectQueryInfo & query_info,
const Context & context,
const Names & key_column_names,
- const ExpressionActionsPtr & key_expr);
+ const ExpressionActionsPtr & key_expr,
+ bool single_point_ = false,
+ bool strict_ = false);
/// Whether the condition and its negation are feasible in the direct product of single column ranges specified by `hyperrectangle`.
BoolMask checkInHyperrectangle(
@ -307,7 +309,8 @@ public:
static std::optional<Range> applyMonotonicFunctionsChainToRange(
Range key_range,
const MonotonicFunctionsChain & functions,
- DataTypePtr current_type);
+ DataTypePtr current_type,
+ bool single_point = false);
bool matchesExactContinuousRange() const;
@ -413,6 +416,11 @@ private:
ColumnIndices key_columns;
ExpressionActionsPtr key_expr;
PreparedSets prepared_sets;
// If true, always allow key_expr to be wrapped by a function
bool single_point;
// If true, do not use always_monotonic information to transform constants
bool strict;
};
}

View File

@ -190,6 +190,39 @@ std::optional<UInt64> StorageMergeTree::totalRows() const
return getTotalActiveSizeInRows();
}
std::optional<UInt64> StorageMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const
{
auto metadata_snapshot = getInMemoryMetadataPtr();
const auto & partition_key = metadata_snapshot->getPartitionKey();
Names partition_key_columns = partition_key.column_names;
KeyCondition key_condition(
query_info, context, partition_key_columns, partition_key.expression, true /* single_point */, true /* strict */);
if (key_condition.alwaysUnknownOrTrue())
return {};
std::unordered_map<String, bool> partition_filter_map;
size_t res = 0;
auto lock = lockParts();
for (const auto & part : getDataPartsStateRange(DataPartState::Committed))
{
if (part->isEmpty())
continue;
const auto & partition_id = part->info.partition_id;
bool is_valid;
if (auto it = partition_filter_map.find(partition_id); it != partition_filter_map.end())
is_valid = it->second;
else
{
const auto & partition_value = part->partition.value;
std::vector<FieldRef> index_value(partition_value.begin(), partition_value.end());
is_valid = key_condition.mayBeTrueInRange(partition_value.size(), index_value.data(), index_value.data(), partition_key.data_types);
partition_filter_map.emplace(partition_id, is_valid);
}
if (is_valid)
res += part->rows_count;
}
return res;
}
std::optional<UInt64> StorageMergeTree::totalBytes() const
{
return getTotalActiveSizeInBytes();
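A consequence of counting whole parts (editor's note, illustrated by the test added below): the predicate must bound the partition key expression itself. If it only constrains one source column of a composite partition expression, the KeyCondition remains alwaysUnknownOrTrue(), totalRowsByPartitionPredicate() returns an empty optional, and the query falls back to reading data.

create table test_two_args(i int, j int, k int) engine MergeTree partition by i + j order by k;
-- answered from part metadata: i + j is exactly the partition expression
select count() from test_two_args where i + j = 3;
-- not answered from metadata: i alone does not bound i + j
select count() from test_two_args where i = 1;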

View File

@ -47,6 +47,7 @@ public:
unsigned num_streams) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, const Context &) const override;
std::optional<UInt64> totalBytes() const override;
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;

View File

@ -3681,6 +3681,36 @@ std::optional<UInt64> StorageReplicatedMergeTree::totalRows() const
return res;
}
std::optional<UInt64> StorageReplicatedMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const
{
auto metadata_snapshot = getInMemoryMetadataPtr();
const auto & partition_key = metadata_snapshot->getPartitionKey();
Names partition_key_columns = partition_key.column_names;
KeyCondition key_condition(
query_info, context, partition_key_columns, partition_key.expression, true /* single_point */, true /* strict */);
if (key_condition.alwaysUnknownOrTrue())
return {};
std::unordered_map<String, bool> partition_filter_map;
size_t res = 0;
foreachCommittedParts([&](auto & part)
{
const auto & partition_id = part->info.partition_id;
bool is_valid;
if (auto it = partition_filter_map.find(partition_id); it != partition_filter_map.end())
is_valid = it->second;
else
{
const auto & partition_value = part->partition.value;
std::vector<FieldRef> index_value(partition_value.begin(), partition_value.end());
is_valid = key_condition.mayBeTrueInRange(partition_value.size(), index_value.data(), index_value.data(), partition_key.data_types);
partition_filter_map.emplace(partition_id, is_valid);
}
if (is_valid)
res += part->rows_count;
});
return res;
}
std::optional<UInt64> StorageReplicatedMergeTree::totalBytes() const
{
UInt64 res = 0;

View File

@ -97,6 +97,7 @@ public:
unsigned num_streams) override;
std::optional<UInt64> totalRows() const override;
std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, const Context & context) const override;
std::optional<UInt64> totalBytes() const override;
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;

View File

@ -30,14 +30,14 @@ ${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \
${CLICKHOUSE_CLIENT} --query="INSERT INTO composite_partition_key VALUES \
(301, 20, 3), (302, 21, 3), (303, 22, 3)"
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a > 400 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b = 11 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE c = 4 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a < 200 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b < 20 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE b >= 12 AND c = 2 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
- ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML" | grep -F rows_read | sed 's/^[ \t]*//g'
+ ${CLICKHOUSE_CLIENT} --query="SELECT count() FROM composite_partition_key WHERE a = 301 AND b = 21 AND c = 3 FORMAT XML SETTINGS optimize_trivial_count_query = 0" | grep -F rows_read | sed 's/^[ \t]*//g'
${CLICKHOUSE_CLIENT} --query="DROP TABLE composite_partition_key"

View File

@ -0,0 +1,9 @@
0
0
2
1
1
0
2
0
3

View File

@ -0,0 +1,45 @@
drop table if exists test1;
create table test1(p DateTime, k int) engine MergeTree partition by toDate(p) order by k;
insert into test1 values ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
set max_rows_to_read = 1;
-- non-optimized
select count() from test1 settings max_parallel_replicas = 3; -- { serverError 158; }
-- optimized (toYear is monotonic and we provide the partition expr as is)
select count() from test1 where toYear(toDate(p)) = 1999;
-- non-optimized (toDate(DateTime) is always monotonic, but we cannot relax the predicate to do trivial count())
select count() from test1 where p > toDateTime('2020-09-01 10:00:00'); -- { serverError 158; }
-- optimized (partition expr wrapped with non-monotonic functions)
select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(toDate(p))) % 2 = 1;
select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(toDate(p))) % 2 = 0;
-- non-optimized (some predicate depends on non-partition_expr columns)
select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; }
-- optimized
select count() from test1 where toDate(p) > '2020-09-01';
create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;
insert into test_tuple values ('2020-09-01 00:01:02', 1, 2), ('2020-09-01 00:01:03', 2, 3), ('2020-09-02 00:01:03', 3, 4);
-- optimized
select count() from test_tuple where toDate(p) > '2020-09-01';
-- optimized
select count() from test_tuple where toDate(p) > '2020-09-01' and i = 1;
-- optimized
select count() from test_tuple where i > 1;
-- optimized
select count() from test_tuple where i < 1;
create table test_two_args(i int, j int, k int) engine MergeTree partition by i + j order by k;
insert into test_two_args values (1, 2, 3), (2, 1, 3), (0, 3, 4);
-- optimized
select count() from test_two_args where i + j = 3;
-- non-optimized
select count() from test_two_args where i = 1; -- { serverError 158; }
drop table test1;
drop table test_tuple;
drop table test_two_args;