From 0b47f4a9e9573df9203eb3c7a4d1de30225c7d25 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 6 Nov 2020 21:14:36 +0300 Subject: [PATCH] Fix optimize_trivial_count_query with partition predicate Consider the following example: CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k; INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3); - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND) and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN. since there is no post processing and return count() based on only the first predicate is wrong. Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong. And two examples above just to show the difference, the behaviour hadn't been changed with this patch: - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND) so it will prune everything out and nothing will be read. - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0 In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND) and all, two, partitions will be scanned, but due to filtering later none of rows will be matched. --- src/Storages/MergeTree/KeyCondition.cpp | 22 ++++++++++++-- src/Storages/MergeTree/KeyCondition.h | 30 ++++++++++++++++++- src/Storages/MergeTree/PartitionPruner.h | 2 +- ...l_count_with_partition_predicate.reference | 1 + ...trivial_count_with_partition_predicate.sql | 2 ++ 5 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index f6ff13dc9c6..ff538b40617 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1674,13 +1674,29 @@ String KeyCondition::RPNElement::toString() const bool KeyCondition::alwaysUnknownOrTrue() const +{ + return unknownOrAlwaysTrue(false); +} +bool KeyCondition::anyUnknownOrAlwaysTrue() const +{ + return unknownOrAlwaysTrue(true); +} +bool KeyCondition::unknownOrAlwaysTrue(bool unknown_any) const { std::vector rpn_stack; for (const auto & element : rpn) { - if (element.function == RPNElement::FUNCTION_UNKNOWN - || element.function == RPNElement::ALWAYS_TRUE) + if (element.function == RPNElement::FUNCTION_UNKNOWN) + { + /// If unknown_any is true, return instantly, + /// to avoid processing it with FUNCTION_AND, and change the outcome. + if (unknown_any) + return true; + /// Otherwise, it may be AND'ed via FUNCTION_AND + rpn_stack.push_back(true); + } + else if (element.function == RPNElement::ALWAYS_TRUE) { rpn_stack.push_back(true); } @@ -1718,7 +1734,7 @@ bool KeyCondition::alwaysUnknownOrTrue() const } if (rpn_stack.size() != 1) - throw Exception("Unexpected stack size in KeyCondition::alwaysUnknownOrTrue", ErrorCodes::LOGICAL_ERROR); + throw Exception("Unexpected stack size in KeyCondition::unknownOrAlwaysTrue", ErrorCodes::LOGICAL_ERROR); return rpn_stack[0]; } diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index aa8a49226ba..b8167f406bd 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -275,8 +275,12 @@ public: const FieldRef * left_key, const DataTypes & data_types) const; - /// Checks that the index can not be used. + /// Checks that the index can not be used + /// FUNCTION_UNKNOWN will be AND'ed (if any). bool alwaysUnknownOrTrue() const; + /// Checks that the index can not be used + /// Does not allow any FUNCTION_UNKNOWN (will instantly return true). + bool anyUnknownOrAlwaysTrue() const; /// Get the maximum number of the key element used in the condition. size_t getMaxKeyColumn() const; @@ -413,6 +417,30 @@ private: RPNElement & out, size_t & out_key_column_num); + /// Checks that the index can not be used. + /// + /// If unknown_any is false (used by alwaysUnknownOrTrue()), then FUNCTION_UNKNOWN can be AND'ed, + /// otherwise (anyUnknownOrAlwaysTrue()) first FUNCTION_UNKNOWN will return true (index cannot be used). + /// + /// Consider the following example: + /// + /// CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k; + /// INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3); + /// + /// - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' + /// In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND) + /// and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN. + /// since there is no post processing and return count() based on only the first predicate is wrong. + /// + /// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' + /// In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND) + /// so it will prune everything out and nothing will be read. + /// + /// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0 + /// In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND) + /// and all, two, partitions will be scanned, but due to filtering later none of rows will be matched. + bool unknownOrAlwaysTrue(bool unknown_any) const; + RPN rpn; ColumnIndices key_columns; diff --git a/src/Storages/MergeTree/PartitionPruner.h b/src/Storages/MergeTree/PartitionPruner.h index 74b02d671bb..3cb7552c427 100644 --- a/src/Storages/MergeTree/PartitionPruner.h +++ b/src/Storages/MergeTree/PartitionPruner.h @@ -25,7 +25,7 @@ public: : partition_key(partition_key_) , partition_condition( query_info, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict) - , useless(partition_condition.alwaysUnknownOrTrue()) + , useless(strict ? partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue()) { } diff --git a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference index 4fe8dbf8cfb..9db37bb5f81 100644 --- a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference +++ b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference @@ -2,6 +2,7 @@ 0 2 1 +0 1 0 2 diff --git a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql index 41be36cd573..47ad0128130 100644 --- a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql +++ b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql @@ -19,6 +19,8 @@ select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString( select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; } -- optimized select count() from test1 where toDate(p) > '2020-09-01'; +-- non-optimized +select count() from test1 where toDate(p) >= '2020-09-01' and p <= '2020-09-01 00:00:00'; create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;