From 0b47f4a9e9573df9203eb3c7a4d1de30225c7d25 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Fri, 6 Nov 2020 21:14:36 +0300
Subject: [PATCH] Fix optimize_trivial_count_query with partition predicate

Consider the following example:

    CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
    INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);

- SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
  In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
  and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
  since there is no post processing and return count() based on only the first predicate is wrong.

  Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong.

And two examples above just to show the difference, the behaviour hadn't been changed with this patch:

- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
  In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
  so it will prune everything out and nothing will be read.

- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
  In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
  and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
---
 src/Storages/MergeTree/KeyCondition.cpp       | 22 ++++++++++++--
 src/Storages/MergeTree/KeyCondition.h         | 30 ++++++++++++++++++-
 src/Storages/MergeTree/PartitionPruner.h      |  2 +-
 ...l_count_with_partition_predicate.reference |  1 +
 ...trivial_count_with_partition_predicate.sql |  2 ++
 5 files changed, 52 insertions(+), 5 deletions(-)
diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp
index f6ff13dc9c6..ff538b40617 100644
--- a/src/Storages/MergeTree/KeyCondition.cpp
+++ b/src/Storages/MergeTree/KeyCondition.cpp
@@ -1674,13 +1674,29 @@ String KeyCondition::RPNElement::toString() const
 
 
 bool KeyCondition::alwaysUnknownOrTrue() const
+{
+    return unknownOrAlwaysTrue(false);
+}
+bool KeyCondition::anyUnknownOrAlwaysTrue() const
+{
+    return unknownOrAlwaysTrue(true);
+}
+bool KeyCondition::unknownOrAlwaysTrue(bool unknown_any) const
 {
     std::vector<UInt8> rpn_stack;
 
     for (const auto & element : rpn)
     {
-        if (element.function == RPNElement::FUNCTION_UNKNOWN
-            || element.function == RPNElement::ALWAYS_TRUE)
+        if (element.function == RPNElement::FUNCTION_UNKNOWN)
+        {
+            /// If unknown_any is true, return instantly,
+            /// to avoid processing it with FUNCTION_AND, and change the outcome.
+            if (unknown_any)
+                return true;
+            /// Otherwise, it may be AND'ed via FUNCTION_AND
+            rpn_stack.push_back(true);
+        }
+        else if (element.function == RPNElement::ALWAYS_TRUE)
         {
             rpn_stack.push_back(true);
         }
@@ -1718,7 +1734,7 @@ bool KeyCondition::alwaysUnknownOrTrue() const
     }
 
     if (rpn_stack.size() != 1)
-        throw Exception("Unexpected stack size in KeyCondition::alwaysUnknownOrTrue", ErrorCodes::LOGICAL_ERROR);
+        throw Exception("Unexpected stack size in KeyCondition::unknownOrAlwaysTrue", ErrorCodes::LOGICAL_ERROR);
 
     return rpn_stack[0];
 }
diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h
index aa8a49226ba..b8167f406bd 100644
--- a/src/Storages/MergeTree/KeyCondition.h
+++ b/src/Storages/MergeTree/KeyCondition.h
@@ -275,8 +275,12 @@ public:
         const FieldRef * left_key,
         const DataTypes & data_types) const;
 
-    /// Checks that the index can not be used.
+    /// Checks that the index can not be used
+    /// FUNCTION_UNKNOWN will be AND'ed (if any).
     bool alwaysUnknownOrTrue() const;
+    /// Checks that the index can not be used
+    /// Does not allow any FUNCTION_UNKNOWN (will instantly return true).
+    bool anyUnknownOrAlwaysTrue() const;
 
     /// Get the maximum number of the key element used in the condition.
     size_t getMaxKeyColumn() const;
@@ -413,6 +417,30 @@ private:
         RPNElement & out,
         size_t & out_key_column_num);
 
+    /// Checks that the index can not be used.
+    ///
+    /// If unknown_any is false (used by alwaysUnknownOrTrue()), then FUNCTION_UNKNOWN can be AND'ed,
+    /// otherwise (anyUnknownOrAlwaysTrue()) first FUNCTION_UNKNOWN will return true (index cannot be used).
+    ///
+    /// Consider the following example:
+    ///
+    ///     CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
+    ///     INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
+    ///
+    /// - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
+    ///   In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
+    ///   and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
+    ///   since there is no post processing and return count() based on only the first predicate is wrong.
+    ///
+    /// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
+    ///   In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
+    ///   so it will prune everything out and nothing will be read.
+    ///
+    /// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
+    ///   In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
+    ///   and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
+    bool unknownOrAlwaysTrue(bool unknown_any) const;
+
     RPN rpn;
 
     ColumnIndices key_columns;
diff --git a/src/Storages/MergeTree/PartitionPruner.h b/src/Storages/MergeTree/PartitionPruner.h
index 74b02d671bb..3cb7552c427 100644
--- a/src/Storages/MergeTree/PartitionPruner.h
+++ b/src/Storages/MergeTree/PartitionPruner.h
@@ -25,7 +25,7 @@ public:
         : partition_key(partition_key_)
         , partition_condition(
               query_info, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict)
-        , useless(partition_condition.alwaysUnknownOrTrue())
+        , useless(strict ? partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue())
     {
     }
 
diff --git a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference
index 4fe8dbf8cfb..9db37bb5f81 100644
--- a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference
+++ b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.reference
@@ -2,6 +2,7 @@
 0
 2
 1
+0
 1
 0
 2
diff --git a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql
index 41be36cd573..47ad0128130 100644
--- a/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql
+++ b/tests/queries/0_stateless/01505_trivial_count_with_partition_predicate.sql
@@ -19,6 +19,8 @@ select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(
 select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; }
 -- optimized
 select count() from test1 where toDate(p) > '2020-09-01';
+-- non-optimized
+select count() from test1 where toDate(p) >= '2020-09-01' and p <= '2020-09-01 00:00:00';
 
 create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;