Fix optimize_trivial_count_query with partition predicate

Consider the following example:

    CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
    INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);

- SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
  In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
  and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
  since there is no post processing and return count() based on only the first predicate is wrong.

  Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong.

And two examples above just to show the difference, the behaviour hadn't been changed with this patch:

- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
  In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
  so it will prune everything out and nothing will be read.

- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
  In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
  and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
This commit is contained in:
Azat Khuzhin 2020-11-06 21:14:36 +03:00
parent 32f69dc76f
commit 0b47f4a9e9
5 changed files with 52 additions and 5 deletions

View File

@ -1674,13 +1674,29 @@ String KeyCondition::RPNElement::toString() const
bool KeyCondition::alwaysUnknownOrTrue() const
{
return unknownOrAlwaysTrue(false);
}
bool KeyCondition::anyUnknownOrAlwaysTrue() const
{
return unknownOrAlwaysTrue(true);
}
bool KeyCondition::unknownOrAlwaysTrue(bool unknown_any) const
{
std::vector<UInt8> rpn_stack;
for (const auto & element : rpn)
{
if (element.function == RPNElement::FUNCTION_UNKNOWN
|| element.function == RPNElement::ALWAYS_TRUE)
if (element.function == RPNElement::FUNCTION_UNKNOWN)
{
/// If unknown_any is true, return instantly,
/// to avoid processing it with FUNCTION_AND, and change the outcome.
if (unknown_any)
return true;
/// Otherwise, it may be AND'ed via FUNCTION_AND
rpn_stack.push_back(true);
}
else if (element.function == RPNElement::ALWAYS_TRUE)
{
rpn_stack.push_back(true);
}
@ -1718,7 +1734,7 @@ bool KeyCondition::alwaysUnknownOrTrue() const
}
if (rpn_stack.size() != 1)
throw Exception("Unexpected stack size in KeyCondition::alwaysUnknownOrTrue", ErrorCodes::LOGICAL_ERROR);
throw Exception("Unexpected stack size in KeyCondition::unknownOrAlwaysTrue", ErrorCodes::LOGICAL_ERROR);
return rpn_stack[0];
}

View File

@ -275,8 +275,12 @@ public:
const FieldRef * left_key,
const DataTypes & data_types) const;
/// Checks that the index can not be used.
/// Checks that the index can not be used
/// FUNCTION_UNKNOWN will be AND'ed (if any).
bool alwaysUnknownOrTrue() const;
/// Checks that the index can not be used
/// Does not allow any FUNCTION_UNKNOWN (will instantly return true).
bool anyUnknownOrAlwaysTrue() const;
/// Get the maximum number of the key element used in the condition.
size_t getMaxKeyColumn() const;
@ -413,6 +417,30 @@ private:
RPNElement & out,
size_t & out_key_column_num);
/// Checks that the index can not be used.
///
/// If unknown_any is false (used by alwaysUnknownOrTrue()), then FUNCTION_UNKNOWN can be AND'ed,
/// otherwise (anyUnknownOrAlwaysTrue()) first FUNCTION_UNKNOWN will return true (index cannot be used).
///
/// Consider the following example:
///
/// CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
/// INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
///
/// - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
/// In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
/// and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
/// since there is no post processing and return count() based on only the first predicate is wrong.
///
/// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
/// In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
/// so it will prune everything out and nothing will be read.
///
/// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
/// In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
/// and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
bool unknownOrAlwaysTrue(bool unknown_any) const;
RPN rpn;
ColumnIndices key_columns;

View File

@ -25,7 +25,7 @@ public:
: partition_key(partition_key_)
, partition_condition(
query_info, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict)
, useless(partition_condition.alwaysUnknownOrTrue())
, useless(strict ? partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue())
{
}

View File

@ -19,6 +19,8 @@ select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(
select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; }
-- optimized
select count() from test1 where toDate(p) > '2020-09-01';
-- non-optimized
select count() from test1 where toDate(p) >= '2020-09-01' and p <= '2020-09-01 00:00:00';
create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;