Fix optimize_trivial_count_query with partition predicate

Consider the following example: CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k; INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3); - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND) and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN. since there is no post processing and return count() based on only the first predicate is wrong. Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong. And two examples above just to show the difference, the behaviour hadn't been changed with this patch: - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND) so it will prune everything out and nothing will be read. - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0 In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND) and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
2024-12-03 13:02:00 +00:00 · 2020-11-06 21:14:36 +03:00 · 2020-11-06 21:14:36 +03:00 · 0b47f4a9e9
commit 0b47f4a9e9
parent 32f69dc76f
5 changed files with 52 additions and 5 deletions
--- a/src/Storages/MergeTree/KeyCondition.cpp
+++ b/src/Storages/MergeTree/KeyCondition.cpp
@ -1674,13 +1674,29 @@ String KeyCondition::RPNElement::toString() const


 bool KeyCondition::alwaysUnknownOrTrue() const
+{
+    return unknownOrAlwaysTrue(false);
+}
+bool KeyCondition::anyUnknownOrAlwaysTrue() const
+{
+    return unknownOrAlwaysTrue(true);
+}
+bool KeyCondition::unknownOrAlwaysTrue(bool unknown_any) const
 {
    std::vector<UInt8> rpn_stack;

    for (const auto & element : rpn)
    {
-        if (element.function == RPNElement::FUNCTION_UNKNOWN
-            || element.function == RPNElement::ALWAYS_TRUE)
+        if (element.function == RPNElement::FUNCTION_UNKNOWN)
+        {
+            /// If unknown_any is true, return instantly,
+            /// to avoid processing it with FUNCTION_AND, and change the outcome.
+            if (unknown_any)
+                return true;
+            /// Otherwise, it may be AND'ed via FUNCTION_AND
+            rpn_stack.push_back(true);
+        }
+        else if (element.function == RPNElement::ALWAYS_TRUE)
        {
            rpn_stack.push_back(true);
        }
@ -1718,7 +1734,7 @@ bool KeyCondition::alwaysUnknownOrTrue() const
    }

    if (rpn_stack.size() != 1)
-        throw Exception("Unexpected stack size in KeyCondition::alwaysUnknownOrTrue", ErrorCodes::LOGICAL_ERROR);
+        throw Exception("Unexpected stack size in KeyCondition::unknownOrAlwaysTrue", ErrorCodes::LOGICAL_ERROR);

    return rpn_stack[0];
 }
--- a/src/Storages/MergeTree/KeyCondition.h
+++ b/src/Storages/MergeTree/KeyCondition.h
@ -275,8 +275,12 @@ public:
        const FieldRef * left_key,
        const DataTypes & data_types) const;

-    /// Checks that the index can not be used.
+    /// Checks that the index can not be used
+    /// FUNCTION_UNKNOWN will be AND'ed (if any).
    bool alwaysUnknownOrTrue() const;
+    /// Checks that the index can not be used
+    /// Does not allow any FUNCTION_UNKNOWN (will instantly return true).
+    bool anyUnknownOrAlwaysTrue() const;

    /// Get the maximum number of the key element used in the condition.
    size_t getMaxKeyColumn() const;
@ -413,6 +417,30 @@ private:
        RPNElement & out,
        size_t & out_key_column_num);

+    /// Checks that the index can not be used.
+    ///
+    /// If unknown_any is false (used by alwaysUnknownOrTrue()), then FUNCTION_UNKNOWN can be AND'ed,
+    /// otherwise (anyUnknownOrAlwaysTrue()) first FUNCTION_UNKNOWN will return true (index cannot be used).
+    ///
+    /// Consider the following example:
+    ///
+    ///     CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
+    ///     INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
+    ///
+    /// - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
+    ///   In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
+    ///   and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
+    ///   since there is no post processing and return count() based on only the first predicate is wrong.
+    ///
+    /// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
+    ///   In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
+    ///   so it will prune everything out and nothing will be read.
+    ///
+    /// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
+    ///   In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
+    ///   and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
+    bool unknownOrAlwaysTrue(bool unknown_any) const;
+
    RPN rpn;

    ColumnIndices key_columns;
--- a/src/Storages/MergeTree/PartitionPruner.h
+++ b/src/Storages/MergeTree/PartitionPruner.h
@ -25,7 +25,7 @@ public:
        : partition_key(partition_key_)
        , partition_condition(
              query_info, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict)
-        , useless(partition_condition.alwaysUnknownOrTrue())
+        , useless(strict ? partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue())
    {
    }

--- a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference
+++ b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference
@ -2,6 +2,7 @@
 0
 2
 1
+0
 1
 0
 2
--- a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql
+++ b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql
@ -19,6 +19,8 @@ select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(
 select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; }
 -- optimized
 select count() from test1 where toDate(p) > '2020-09-01';
+-- non-optimized
+select count() from test1 where toDate(p) >= '2020-09-01' and p <= '2020-09-01 00:00:00';

 create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;