mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-03 13:02:00 +00:00
Fix optimize_trivial_count_query with partition predicate
Consider the following example: CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k; INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3); - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND) and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN. since there is no post processing and return count() based on only the first predicate is wrong. Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong. And two examples above just to show the difference, the behaviour hadn't been changed with this patch: - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00' In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND) so it will prune everything out and nothing will be read. - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0 In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND) and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
This commit is contained in:
parent
32f69dc76f
commit
0b47f4a9e9
@ -1674,13 +1674,29 @@ String KeyCondition::RPNElement::toString() const
|
||||
|
||||
|
||||
bool KeyCondition::alwaysUnknownOrTrue() const
|
||||
{
|
||||
return unknownOrAlwaysTrue(false);
|
||||
}
|
||||
bool KeyCondition::anyUnknownOrAlwaysTrue() const
|
||||
{
|
||||
return unknownOrAlwaysTrue(true);
|
||||
}
|
||||
bool KeyCondition::unknownOrAlwaysTrue(bool unknown_any) const
|
||||
{
|
||||
std::vector<UInt8> rpn_stack;
|
||||
|
||||
for (const auto & element : rpn)
|
||||
{
|
||||
if (element.function == RPNElement::FUNCTION_UNKNOWN
|
||||
|| element.function == RPNElement::ALWAYS_TRUE)
|
||||
if (element.function == RPNElement::FUNCTION_UNKNOWN)
|
||||
{
|
||||
/// If unknown_any is true, return instantly,
|
||||
/// to avoid processing it with FUNCTION_AND, and change the outcome.
|
||||
if (unknown_any)
|
||||
return true;
|
||||
/// Otherwise, it may be AND'ed via FUNCTION_AND
|
||||
rpn_stack.push_back(true);
|
||||
}
|
||||
else if (element.function == RPNElement::ALWAYS_TRUE)
|
||||
{
|
||||
rpn_stack.push_back(true);
|
||||
}
|
||||
@ -1718,7 +1734,7 @@ bool KeyCondition::alwaysUnknownOrTrue() const
|
||||
}
|
||||
|
||||
if (rpn_stack.size() != 1)
|
||||
throw Exception("Unexpected stack size in KeyCondition::alwaysUnknownOrTrue", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("Unexpected stack size in KeyCondition::unknownOrAlwaysTrue", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
return rpn_stack[0];
|
||||
}
|
||||
|
@ -275,8 +275,12 @@ public:
|
||||
const FieldRef * left_key,
|
||||
const DataTypes & data_types) const;
|
||||
|
||||
/// Checks that the index can not be used.
|
||||
/// Checks that the index can not be used
|
||||
/// FUNCTION_UNKNOWN will be AND'ed (if any).
|
||||
bool alwaysUnknownOrTrue() const;
|
||||
/// Checks that the index can not be used
|
||||
/// Does not allow any FUNCTION_UNKNOWN (will instantly return true).
|
||||
bool anyUnknownOrAlwaysTrue() const;
|
||||
|
||||
/// Get the maximum number of the key element used in the condition.
|
||||
size_t getMaxKeyColumn() const;
|
||||
@ -413,6 +417,30 @@ private:
|
||||
RPNElement & out,
|
||||
size_t & out_key_column_num);
|
||||
|
||||
/// Checks that the index can not be used.
|
||||
///
|
||||
/// If unknown_any is false (used by alwaysUnknownOrTrue()), then FUNCTION_UNKNOWN can be AND'ed,
|
||||
/// otherwise (anyUnknownOrAlwaysTrue()) first FUNCTION_UNKNOWN will return true (index cannot be used).
|
||||
///
|
||||
/// Consider the following example:
|
||||
///
|
||||
/// CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
|
||||
/// INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
|
||||
///
|
||||
/// - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
|
||||
/// In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
|
||||
/// and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
|
||||
/// since there is no post processing and return count() based on only the first predicate is wrong.
|
||||
///
|
||||
/// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
|
||||
/// In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
|
||||
/// so it will prune everything out and nothing will be read.
|
||||
///
|
||||
/// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
|
||||
/// In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
|
||||
/// and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
|
||||
bool unknownOrAlwaysTrue(bool unknown_any) const;
|
||||
|
||||
RPN rpn;
|
||||
|
||||
ColumnIndices key_columns;
|
||||
|
@ -25,7 +25,7 @@ public:
|
||||
: partition_key(partition_key_)
|
||||
, partition_condition(
|
||||
query_info, context, partition_key.column_names, partition_key.expression, true /* single_point */, strict)
|
||||
, useless(partition_condition.alwaysUnknownOrTrue())
|
||||
, useless(strict ? partition_condition.anyUnknownOrAlwaysTrue() : partition_condition.alwaysUnknownOrTrue())
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -2,6 +2,7 @@
|
||||
0
|
||||
2
|
||||
1
|
||||
0
|
||||
1
|
||||
0
|
||||
2
|
||||
|
@ -19,6 +19,8 @@ select count() FROM test1 where toDate(p) = '2020-09-01' and sipHash64(toString(
|
||||
select count() FROM test1 where toDate(p) = '2020-09-01' and k = 2; -- { serverError 158; }
|
||||
-- optimized
|
||||
select count() from test1 where toDate(p) > '2020-09-01';
|
||||
-- non-optimized
|
||||
select count() from test1 where toDate(p) >= '2020-09-01' and p <= '2020-09-01 00:00:00';
|
||||
|
||||
create table test_tuple(p DateTime, i int, j int) engine MergeTree partition by (toDate(p), i) order by j;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user