2012-12-05 12:44:55 +00:00
|
|
|
#pragma once
|
|
|
|
|
2018-02-08 15:25:46 +00:00
|
|
|
#include <optional>
|
2012-12-05 12:44:55 +00:00
|
|
|
|
2017-07-14 00:33:37 +00:00
|
|
|
#include <Interpreters/Set.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/SortDescription.h>
|
|
|
|
#include <Parsers/ASTExpressionList.h>
|
2017-07-15 03:48:36 +00:00
|
|
|
#include <Storages/SelectQueryInfo.h>
|
2012-12-05 12:44:55 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
2013-12-12 00:50:54 +00:00
|
|
|
|
2021-11-26 17:21:54 +00:00
|
|
|
class ASTFunction;
|
2020-05-20 20:16:32 +00:00
|
|
|
class Context;
|
2017-01-14 09:00:19 +00:00
|
|
|
class IFunction;
|
2018-02-02 08:33:36 +00:00
|
|
|
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
|
2020-02-10 15:50:12 +00:00
|
|
|
class ExpressionActions;
|
|
|
|
using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;
|
|
|
|
|
2020-07-05 15:57:59 +00:00
|
|
|
/** A field, that can be stored in two representations:
|
2020-04-02 17:27:07 +00:00
|
|
|
* - A standalone field.
|
2020-04-08 02:56:25 +00:00
|
|
|
* - A field with reference to its position in a block.
|
|
|
|
* It's needed for execution of functions on ranges during
|
2020-04-02 17:27:07 +00:00
|
|
|
* index analysis. If function was executed once for field,
|
2020-04-08 02:56:25 +00:00
|
|
|
* its result would be cached for whole block for which field's reference points to.
|
2020-04-02 17:27:07 +00:00
|
|
|
*/
|
|
|
|
struct FieldRef : public Field
|
|
|
|
{
|
|
|
|
FieldRef() = default;
|
|
|
|
|
|
|
|
/// Create as explicit field without block.
|
|
|
|
template <typename T>
|
2022-03-13 12:23:51 +00:00
|
|
|
FieldRef(T && value) : Field(std::forward<T>(value)) {} /// NOLINT
|
2020-04-02 17:27:07 +00:00
|
|
|
|
|
|
|
/// Create as reference to field in block.
|
2020-10-09 07:41:28 +00:00
|
|
|
FieldRef(ColumnsWithTypeAndName * columns_, size_t row_idx_, size_t column_idx_)
|
|
|
|
: Field((*(*columns_)[column_idx_].column)[row_idx_]),
|
|
|
|
columns(columns_), row_idx(row_idx_), column_idx(column_idx_) {}
|
2020-04-02 17:27:07 +00:00
|
|
|
|
2020-10-09 07:41:28 +00:00
|
|
|
bool isExplicit() const { return columns == nullptr; }
|
2020-04-02 17:27:07 +00:00
|
|
|
|
2020-10-09 07:41:28 +00:00
|
|
|
ColumnsWithTypeAndName * columns = nullptr;
|
2020-04-06 13:35:11 +00:00
|
|
|
size_t row_idx = 0;
|
|
|
|
size_t column_idx = 0;
|
2020-04-02 17:27:07 +00:00
|
|
|
};
|
|
|
|
|
2017-08-29 18:51:48 +00:00
|
|
|
/** Range with open or closed ends; possibly unbounded.
|
2013-12-12 00:50:54 +00:00
|
|
|
*/
|
2012-12-05 12:44:55 +00:00
|
|
|
struct Range
|
|
|
|
{
|
2013-12-12 00:50:54 +00:00
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
static bool equals(const Field & lhs, const Field & rhs);
|
|
|
|
static bool less(const Field & lhs, const Field & rhs);
|
2015-03-27 03:06:06 +00:00
|
|
|
|
2013-12-12 00:50:54 +00:00
|
|
|
public:
|
2021-08-27 14:09:15 +00:00
|
|
|
FieldRef left = NEGATIVE_INFINITY; /// the left border
|
|
|
|
FieldRef right = POSITIVE_INFINITY; /// the right border
|
2021-01-02 09:47:38 +00:00
|
|
|
bool left_included = false; /// includes the left border
|
|
|
|
bool right_included = false; /// includes the right border
|
|
|
|
|
|
|
|
/// The whole universe (not null).
|
2022-03-13 12:23:51 +00:00
|
|
|
Range() {} /// NOLINT
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// One point.
|
2022-03-13 12:23:51 +00:00
|
|
|
Range(const FieldRef & point) /// NOLINT
|
2021-01-02 09:47:38 +00:00
|
|
|
: left(point), right(point), left_included(true), right_included(true) {}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// A bounded two-sided range.
|
2020-04-02 17:27:07 +00:00
|
|
|
Range(const FieldRef & left_, bool left_included_, const FieldRef & right_, bool right_included_)
|
2021-01-02 09:47:38 +00:00
|
|
|
: left(left_)
|
|
|
|
, right(right_)
|
|
|
|
, left_included(left_included_)
|
|
|
|
, right_included(right_included_)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
shrinkToIncludedIfPossible();
|
|
|
|
}
|
|
|
|
|
2020-04-02 17:27:07 +00:00
|
|
|
static Range createRightBounded(const FieldRef & right_point, bool right_included)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
Range r;
|
|
|
|
r.right = right_point;
|
|
|
|
r.right_included = right_included;
|
|
|
|
r.shrinkToIncludedIfPossible();
|
2021-01-02 09:47:38 +00:00
|
|
|
// Special case for [-Inf, -Inf]
|
|
|
|
if (r.right.isNegativeInfinity() && right_included)
|
|
|
|
r.left_included = true;
|
2017-04-01 07:20:54 +00:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2020-04-02 17:27:07 +00:00
|
|
|
static Range createLeftBounded(const FieldRef & left_point, bool left_included)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
Range r;
|
|
|
|
r.left = left_point;
|
|
|
|
r.left_included = left_included;
|
|
|
|
r.shrinkToIncludedIfPossible();
|
2021-01-02 09:47:38 +00:00
|
|
|
// Special case for [+Inf, +Inf]
|
|
|
|
if (r.left.isPositiveInfinity() && left_included)
|
|
|
|
r.right_included = true;
|
2017-04-01 07:20:54 +00:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Optimize the range. If it has an open boundary and the Field type is "loose"
|
|
|
|
* - then convert it to closed, narrowing by one.
|
|
|
|
* That is, for example, turn (0,2) into [1].
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void shrinkToIncludedIfPossible()
|
|
|
|
{
|
2021-01-02 09:47:38 +00:00
|
|
|
if (left.isExplicit() && !left_included)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (left.getType() == Field::Types::UInt64 && left.get<UInt64>() != std::numeric_limits<UInt64>::max())
|
|
|
|
{
|
|
|
|
++left.get<UInt64 &>();
|
|
|
|
left_included = true;
|
|
|
|
}
|
|
|
|
if (left.getType() == Field::Types::Int64 && left.get<Int64>() != std::numeric_limits<Int64>::max())
|
|
|
|
{
|
|
|
|
++left.get<Int64 &>();
|
|
|
|
left_included = true;
|
|
|
|
}
|
|
|
|
}
|
2021-01-02 09:47:38 +00:00
|
|
|
if (right.isExplicit() && !right_included)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (right.getType() == Field::Types::UInt64 && right.get<UInt64>() != std::numeric_limits<UInt64>::min())
|
|
|
|
{
|
|
|
|
--right.get<UInt64 &>();
|
|
|
|
right_included = true;
|
|
|
|
}
|
|
|
|
if (right.getType() == Field::Types::Int64 && right.get<Int64>() != std::numeric_limits<Int64>::min())
|
|
|
|
{
|
|
|
|
--right.get<Int64 &>();
|
|
|
|
right_included = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-02 09:47:38 +00:00
|
|
|
bool empty() const { return less(right, left) || ((!left_included || !right_included) && !less(left, right)); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// x contained in the range
|
2020-04-02 17:27:07 +00:00
|
|
|
bool contains(const FieldRef & x) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
return !leftThan(x) && !rightThan(x);
|
|
|
|
}
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// x is to the left
|
2020-04-02 17:27:07 +00:00
|
|
|
bool rightThan(const FieldRef & x) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2021-01-02 09:47:38 +00:00
|
|
|
return less(left, x) || (left_included && equals(x, left));
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// x is to the right
|
2020-04-02 17:27:07 +00:00
|
|
|
bool leftThan(const FieldRef & x) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2021-01-02 09:47:38 +00:00
|
|
|
return less(x, right) || (right_included && equals(x, right));
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool intersectsRange(const Range & r) const
|
|
|
|
{
|
2017-04-16 15:00:33 +00:00
|
|
|
/// r to the left of me.
|
2021-01-02 09:47:38 +00:00
|
|
|
if (less(r.right, left) || ((!left_included || !r.right_included) && equals(r.right, left)))
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// r to the right of me.
|
2021-01-02 09:47:38 +00:00
|
|
|
if (less(right, r.left) || ((!right_included || !r.left_included) && equals(r.left, right)))
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool containsRange(const Range & r) const
|
|
|
|
{
|
2017-04-16 15:00:33 +00:00
|
|
|
/// r starts to the left of me.
|
2021-01-02 09:47:38 +00:00
|
|
|
if (less(r.left, left) || (r.left_included && !left_included && equals(r.left, left)))
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// r ends right of me.
|
2021-01-02 09:47:38 +00:00
|
|
|
if (less(right, r.right) || (r.right_included && !right_included && equals(r.right, right)))
|
2017-04-01 07:20:54 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-01-02 09:47:38 +00:00
|
|
|
void invert()
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
std::swap(left, right);
|
2021-01-02 09:47:38 +00:00
|
|
|
if (left.isPositiveInfinity())
|
2021-08-27 14:09:15 +00:00
|
|
|
left = NEGATIVE_INFINITY;
|
2021-01-02 09:47:38 +00:00
|
|
|
if (right.isNegativeInfinity())
|
2021-08-27 14:09:15 +00:00
|
|
|
right = POSITIVE_INFINITY;
|
2017-04-01 07:20:54 +00:00
|
|
|
std::swap(left_included, right_included);
|
|
|
|
}
|
|
|
|
|
|
|
|
String toString() const;
|
2012-12-05 12:44:55 +00:00
|
|
|
};
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Condition on the index.
|
2015-03-27 03:06:06 +00:00
|
|
|
*
|
2017-04-16 15:00:33 +00:00
|
|
|
* Consists of the conditions for the key belonging to all possible ranges or sets,
|
2017-08-29 19:02:07 +00:00
|
|
|
* as well as logical operators AND/OR/NOT above these conditions.
|
2015-03-27 03:06:06 +00:00
|
|
|
*
|
2017-04-16 15:00:33 +00:00
|
|
|
* Constructs a reverse polish notation from these conditions
|
2017-08-29 19:02:07 +00:00
|
|
|
* and can calculate (interpret) its satisfiability over key ranges.
|
2015-03-27 03:06:06 +00:00
|
|
|
*/
|
2018-04-20 00:20:36 +00:00
|
|
|
class KeyCondition
|
2012-12-05 12:44:55 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-08-29 19:02:07 +00:00
|
|
|
/// Does not take into account the SAMPLE section. all_columns - the set of all columns of the table.
|
2018-04-20 00:20:36 +00:00
|
|
|
KeyCondition(
|
2017-07-15 03:48:36 +00:00
|
|
|
const SelectQueryInfo & query_info,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr context,
|
2018-06-30 21:35:01 +00:00
|
|
|
const Names & key_column_names,
|
2020-09-21 10:13:01 +00:00
|
|
|
const ExpressionActionsPtr & key_expr,
|
|
|
|
bool single_point_ = false,
|
|
|
|
bool strict_ = false);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-03-10 14:56:55 +00:00
|
|
|
/// Whether the condition and its negation are feasible in the direct product of single column ranges specified by `hyperrectangle`.
|
|
|
|
BoolMask checkInHyperrectangle(
|
|
|
|
const std::vector<Range> & hyperrectangle,
|
2020-01-29 21:40:22 +00:00
|
|
|
const DataTypes & data_types) const;
|
|
|
|
|
|
|
|
/// Whether the condition and its negation are (independently) feasible in the key range.
|
2018-04-20 00:27:25 +00:00
|
|
|
/// left_key and right_key must contain all fields in the sort_descr in the appropriate order.
|
|
|
|
/// data_types - the types of the key columns.
|
2020-01-29 21:40:22 +00:00
|
|
|
/// Argument initial_mask is used for early exiting the implementation when we do not care about
|
|
|
|
/// one of the resulting mask components (see BoolMask::consider_only_can_be_XXX).
|
|
|
|
BoolMask checkInRange(
|
|
|
|
size_t used_key_size,
|
2021-01-02 09:47:38 +00:00
|
|
|
const FieldRef * left_keys,
|
|
|
|
const FieldRef * right_keys,
|
2020-01-29 21:40:22 +00:00
|
|
|
const DataTypes & data_types,
|
|
|
|
BoolMask initial_mask = BoolMask(false, false)) const;
|
|
|
|
|
|
|
|
/// Same as checkInRange, but calculate only may_be_true component of a result.
|
|
|
|
/// This is more efficient than checkInRange(...).can_be_true.
|
|
|
|
bool mayBeTrueInRange(
|
|
|
|
size_t used_key_size,
|
2021-01-02 09:47:38 +00:00
|
|
|
const FieldRef * left_keys,
|
|
|
|
const FieldRef * right_keys,
|
2020-01-29 21:40:22 +00:00
|
|
|
const DataTypes & data_types) const;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Fix optimize_trivial_count_query with partition predicate
Consider the following example:
CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
- SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
since there is no post processing and return count() based on only the first predicate is wrong.
Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong.
And two examples above just to show the difference, the behaviour hadn't been changed with this patch:
- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
so it will prune everything out and nothing will be read.
- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
2020-11-06 18:14:36 +00:00
|
|
|
/// Checks that the index can not be used
|
|
|
|
/// FUNCTION_UNKNOWN will be AND'ed (if any).
|
2017-04-01 07:20:54 +00:00
|
|
|
bool alwaysUnknownOrTrue() const;
|
Fix optimize_trivial_count_query with partition predicate
Consider the following example:
CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
- SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
since there is no post processing and return count() based on only the first predicate is wrong.
Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong.
And two examples above just to show the difference, the behaviour hadn't been changed with this patch:
- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
so it will prune everything out and nothing will be read.
- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
2020-11-06 18:14:36 +00:00
|
|
|
/// Checks that the index can not be used
|
|
|
|
/// Does not allow any FUNCTION_UNKNOWN (will instantly return true).
|
|
|
|
bool anyUnknownOrAlwaysTrue() const;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-20 00:27:25 +00:00
|
|
|
/// Get the maximum number of the key element used in the condition.
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t getMaxKeyColumn() const;
|
|
|
|
|
2020-04-02 17:27:07 +00:00
|
|
|
bool hasMonotonicFunctionsChain() const;
|
|
|
|
|
2018-12-11 17:43:12 +00:00
|
|
|
/// Impose an additional condition: the value in the column `column` must be in the range `range`.
|
2018-04-20 00:27:25 +00:00
|
|
|
/// Returns whether there is such a column in the key.
|
2017-04-01 07:20:54 +00:00
|
|
|
bool addCondition(const String & column, const Range & range);
|
|
|
|
|
|
|
|
String toString() const;
|
|
|
|
|
2021-04-15 17:30:04 +00:00
|
|
|
/// Condition description for EXPLAIN query.
|
|
|
|
struct Description
|
|
|
|
{
|
2021-04-16 09:42:23 +00:00
|
|
|
/// Which columns from PK were used, in PK order.
|
2021-04-15 17:30:04 +00:00
|
|
|
std::vector<std::string> used_keys;
|
2021-04-16 09:42:23 +00:00
|
|
|
/// Condition which was applied, mostly human-readable.
|
2021-04-15 17:30:04 +00:00
|
|
|
std::string condition;
|
|
|
|
};
|
|
|
|
|
|
|
|
Description getDescription() const;
|
|
|
|
|
2018-06-04 17:01:00 +00:00
|
|
|
/** A chain of possibly monotone functions.
|
|
|
|
* If the key column is wrapped in functions that can be monotonous in some value ranges
|
|
|
|
* (for example: -toFloat64(toDayOfWeek(date))), then here the functions will be located: toDayOfWeek, toFloat64, negate.
|
|
|
|
*/
|
|
|
|
using MonotonicFunctionsChain = std::vector<FunctionBasePtr>;
|
|
|
|
|
2019-02-20 16:24:46 +00:00
|
|
|
/** Computes value of constant expression and its data type.
|
|
|
|
* Returns false, if expression isn't constant.
|
|
|
|
*/
|
|
|
|
static bool getConstant(
|
|
|
|
const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type);
|
2018-06-04 17:01:00 +00:00
|
|
|
|
|
|
|
static Block getBlockWithConstants(
|
2021-04-10 23:33:54 +00:00
|
|
|
const ASTPtr & query, const TreeRewriterResultPtr & syntax_analyzer_result, ContextPtr context);
|
2018-06-04 17:01:00 +00:00
|
|
|
|
|
|
|
static std::optional<Range> applyMonotonicFunctionsChainToRange(
|
|
|
|
Range key_range,
|
2020-07-21 11:02:58 +00:00
|
|
|
const MonotonicFunctionsChain & functions,
|
2020-09-21 10:13:01 +00:00
|
|
|
DataTypePtr current_type,
|
|
|
|
bool single_point = false);
|
2018-06-04 17:01:00 +00:00
|
|
|
|
Optimize PK lookup for queries that match exact PK range
Existing code that looks up marks that match the query has a pathological
case, when most of the part does in fact match the query.
The code works by recursively splitting a part into ranges and then discarding
the ranges that definitely do not match the query, based on primary key.
The problem is that it requires visiting every mark that matches the query,
making the complexity of this sort of look up O(n).
For queries that match exact range on the primary key, we can find
both left and right parts of the range with O(log 2) complexity.
This change implements exactly that.
To engage this optimization, the query must:
* Have a prefix list of the primary key.
* Have only range or single set element constraints for columns.
* Have only AND as a boolean operator.
Consider a table with `(service, timestamp)` as the primary key.
The following conditions will be optimized:
* `service = 'foo'`
* `service = 'foo' and timestamp >= now() - 3600`
* `service in ('foo')`
* `service in ('foo') and timestamp >= now() - 3600 and timestamp <= now`
The following will fall back to previous lookup algorithm:
* `timestamp >= now() - 3600`
* `service in ('foo', 'bar') and timestamp >= now() - 3600`
* `service = 'foo'`
Note that the optimization won't engage when PK has a range expression
followed by a point expression, since in that case the range is not continuous.
Trace query logging provides the following messages types of messages,
each representing a different kind of PK usage for a part:
```
Used optimized inclusion search over index for part 20200711_5710108_5710108_0 with 9 steps
Used generic exclusion search over index for part 20200711_5710118_5710228_5 with 1495 steps
Not using index on part 20200710_5710473_5710473_0
```
Number of steps translates to computational complexity.
Here's a comparison for before and after for a query over 24h of data:
```
Read 4562944 rows, 148.05 MiB in 45.19249672 sec., 100966 rows/sec., 3.28 MiB/sec.
Read 4183040 rows, 135.78 MiB in 0.196279627 sec., 21311636 rows/sec., 691.75 MiB/sec.
```
This is especially useful for queries that read data in order
and terminate early to return "last X things" matching a query.
See #11564 for more thoughts on this.
2020-07-07 18:10:44 +00:00
|
|
|
bool matchesExactContinuousRange() const;
|
|
|
|
|
2018-06-04 17:01:00 +00:00
|
|
|
private:
|
2017-04-16 15:00:33 +00:00
|
|
|
/// The expression is stored as Reverse Polish Notation.
|
2017-04-01 07:20:54 +00:00
|
|
|
struct RPNElement
|
|
|
|
{
|
|
|
|
enum Function
|
|
|
|
{
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Atoms of a Boolean expression.
|
2017-04-01 07:20:54 +00:00
|
|
|
FUNCTION_IN_RANGE,
|
|
|
|
FUNCTION_NOT_IN_RANGE,
|
|
|
|
FUNCTION_IN_SET,
|
|
|
|
FUNCTION_NOT_IN_SET,
|
2021-01-02 09:47:38 +00:00
|
|
|
FUNCTION_IS_NULL,
|
|
|
|
FUNCTION_IS_NOT_NULL,
|
2017-04-16 15:00:33 +00:00
|
|
|
FUNCTION_UNKNOWN, /// Can take any value.
|
|
|
|
/// Operators of the logical expression.
|
2017-04-01 07:20:54 +00:00
|
|
|
FUNCTION_NOT,
|
|
|
|
FUNCTION_AND,
|
|
|
|
FUNCTION_OR,
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Constants
|
2017-04-01 07:20:54 +00:00
|
|
|
ALWAYS_FALSE,
|
|
|
|
ALWAYS_TRUE,
|
|
|
|
};
|
|
|
|
|
2022-03-13 12:23:51 +00:00
|
|
|
RPNElement() = default;
|
|
|
|
RPNElement(Function function_) : function(function_) {} /// NOLINT
|
2017-04-01 07:20:54 +00:00
|
|
|
RPNElement(Function function_, size_t key_column_) : function(function_), key_column(key_column_) {}
|
|
|
|
RPNElement(Function function_, size_t key_column_, const Range & range_)
|
|
|
|
: function(function_), range(range_), key_column(key_column_) {}
|
|
|
|
|
|
|
|
String toString() const;
|
2022-07-14 16:11:35 +00:00
|
|
|
String toString(std::string_view column_name, bool print_constants) const;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
Function function = FUNCTION_UNKNOWN;
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// For FUNCTION_IN_RANGE and FUNCTION_NOT_IN_RANGE.
|
2017-04-01 07:20:54 +00:00
|
|
|
Range range;
|
2018-06-04 17:01:00 +00:00
|
|
|
size_t key_column = 0;
|
2017-04-16 15:00:33 +00:00
|
|
|
/// For FUNCTION_IN_SET, FUNCTION_NOT_IN_SET
|
2020-07-21 11:22:45 +00:00
|
|
|
using MergeTreeSetIndexPtr = std::shared_ptr<const MergeTreeSetIndex>;
|
2018-02-02 13:19:40 +00:00
|
|
|
MergeTreeSetIndexPtr set_index;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2020-07-21 11:02:58 +00:00
|
|
|
MonotonicFunctionsChain monotonic_functions_chain;
|
2017-04-01 07:20:54 +00:00
|
|
|
};
|
|
|
|
|
2018-06-04 17:01:00 +00:00
|
|
|
using RPN = std::vector<RPNElement>;
|
|
|
|
using ColumnIndices = std::map<String, size_t>;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-01-22 12:33:56 +00:00
|
|
|
using AtomMap = std::unordered_map<std::string, bool(*)(RPNElement & out, const Field & value)>;
|
2018-02-02 13:19:40 +00:00
|
|
|
|
2018-06-04 17:01:00 +00:00
|
|
|
public:
|
|
|
|
static const AtomMap atom_map;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-06-30 21:35:01 +00:00
|
|
|
private:
|
2020-01-29 21:40:22 +00:00
|
|
|
BoolMask checkInRange(
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t used_key_size,
|
2020-04-02 17:27:07 +00:00
|
|
|
const FieldRef * left_key,
|
|
|
|
const FieldRef * right_key,
|
2017-04-01 07:20:54 +00:00
|
|
|
const DataTypes & data_types,
|
2020-01-29 21:40:22 +00:00
|
|
|
bool right_bounded,
|
|
|
|
BoolMask initial_mask) const;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2021-04-10 23:33:54 +00:00
|
|
|
void traverseAST(const ASTPtr & node, ContextPtr context, Block & block_with_constants);
|
|
|
|
bool tryParseAtomFromAST(const ASTPtr & node, ContextPtr context, Block & block_with_constants, RPNElement & out);
|
2020-03-18 00:57:00 +00:00
|
|
|
static bool tryParseLogicalOperatorFromAST(const ASTFunction * func, RPNElement & out);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-20 00:27:25 +00:00
|
|
|
/** Is node the key column
|
|
|
|
* or expression in which column of key is wrapped by chain of functions,
|
2019-01-22 19:56:53 +00:00
|
|
|
* that can be monotonic on certain ranges?
|
2018-04-20 00:27:25 +00:00
|
|
|
* If these conditions are true, then returns number of column in key, type of resulting expression
|
2017-04-01 07:20:54 +00:00
|
|
|
* and fills chain of possibly-monotonic functions.
|
|
|
|
*/
|
2018-04-20 00:27:25 +00:00
|
|
|
bool isKeyPossiblyWrappedByMonotonicFunctions(
|
2017-04-01 07:20:54 +00:00
|
|
|
const ASTPtr & node,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr context,
|
2018-04-20 00:27:25 +00:00
|
|
|
size_t & out_key_column_num,
|
|
|
|
DataTypePtr & out_key_res_column_type,
|
2018-06-04 17:01:00 +00:00
|
|
|
MonotonicFunctionsChain & out_functions_chain);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-20 00:27:25 +00:00
|
|
|
bool isKeyPossiblyWrappedByMonotonicFunctionsImpl(
|
2017-04-01 07:20:54 +00:00
|
|
|
const ASTPtr & node,
|
2018-04-20 00:27:25 +00:00
|
|
|
size_t & out_key_column_num,
|
|
|
|
DataTypePtr & out_key_column_type,
|
2017-04-01 07:20:54 +00:00
|
|
|
std::vector<const ASTFunction *> & out_functions_chain);
|
|
|
|
|
2021-08-28 15:33:18 +00:00
|
|
|
bool transformConstantWithValidFunctions(
|
|
|
|
const String & expr_name,
|
|
|
|
size_t & out_key_column_num,
|
|
|
|
DataTypePtr & out_key_column_type,
|
|
|
|
Field & out_value,
|
|
|
|
DataTypePtr & out_type,
|
|
|
|
std::function<bool(IFunctionBase &, const IDataType &)> always_monotonic) const;
|
|
|
|
|
PKCondition: infer index use with pk subexpression
By default only constraints explicitly matching
primary key expression (or expression wrapped in
a monotonic function) are eligible for part and
range selection. So for example, if index is:
(toStartOfHour(dt), UserID)
Then a query such as this resorts to full scan:
SELECT count() FROM t WHERE dt = now()
Intuitively, only parts with toStartOfHour(now())
could be selected, but it is less trivial to prove.
The primary key currently can be wrapped in a chain
of monotonic functions, so following would work:
toStartOfHour(dt) = toStartOfHour(now()) AND dt = now()
It must be however explicitly stated, if we wanted
to infer that we’d have to know the inverse function,
and prove that the inverse function is monotonic
on given interval. This is not practical as
there is no inverse function that for example undos
rounding, it isn’t strictly monotonic.
There are however functions that don’t transform
output range and preserve monotonicity on the
complete input range, such as rounding or casts
to a same or wider numeric type. This eliminates
the need to find inverse function, as no check for monotonicity over arbitrary interval is needed,
and thus makes this optimisation possible.
2017-07-06 05:39:05 +00:00
|
|
|
bool canConstantBeWrappedByMonotonicFunctions(
|
|
|
|
const ASTPtr & node,
|
2018-04-20 00:27:25 +00:00
|
|
|
size_t & out_key_column_num,
|
|
|
|
DataTypePtr & out_key_column_type,
|
PKCondition: infer index use with pk subexpression
By default only constraints explicitly matching
primary key expression (or expression wrapped in
a monotonic function) are eligible for part and
range selection. So for example, if index is:
(toStartOfHour(dt), UserID)
Then a query such as this resorts to full scan:
SELECT count() FROM t WHERE dt = now()
Intuitively, only parts with toStartOfHour(now())
could be selected, but it is less trivial to prove.
The primary key currently can be wrapped in a chain
of monotonic functions, so following would work:
toStartOfHour(dt) = toStartOfHour(now()) AND dt = now()
It must be however explicitly stated, if we wanted
to infer that we’d have to know the inverse function,
and prove that the inverse function is monotonic
on given interval. This is not practical as
there is no inverse function that for example undos
rounding, it isn’t strictly monotonic.
There are however functions that don’t transform
output range and preserve monotonicity on the
complete input range, such as rounding or casts
to a same or wider numeric type. This eliminates
the need to find inverse function, as no check for monotonicity over arbitrary interval is needed,
and thus makes this optimisation possible.
2017-07-06 05:39:05 +00:00
|
|
|
Field & out_value,
|
|
|
|
DataTypePtr & out_type);
|
|
|
|
|
2020-11-06 03:50:58 +00:00
|
|
|
bool canConstantBeWrappedByFunctions(
|
2020-11-09 13:20:56 +00:00
|
|
|
const ASTPtr & ast, size_t & out_key_column_num, DataTypePtr & out_key_column_type, Field & out_value, DataTypePtr & out_type);
|
2020-11-06 03:50:58 +00:00
|
|
|
|
2018-07-02 18:57:14 +00:00
|
|
|
/// If it's possible to make an RPNElement
|
|
|
|
/// that will filter values (possibly tuples) by the content of 'prepared_set',
|
|
|
|
/// do it and return true.
|
|
|
|
bool tryPrepareSetIndex(
|
2019-01-22 12:33:56 +00:00
|
|
|
const ASTs & args,
|
2021-04-10 23:33:54 +00:00
|
|
|
ContextPtr context,
|
2018-02-02 13:19:40 +00:00
|
|
|
RPNElement & out,
|
2018-04-20 00:27:25 +00:00
|
|
|
size_t & out_key_column_num);
|
2018-02-02 13:19:40 +00:00
|
|
|
|
Fix optimize_trivial_count_query with partition predicate
Consider the following example:
CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
- SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
since there is no post processing and return count() based on only the first predicate is wrong.
Before this patch FUNCTION_UNKNOWN was allowed for optimize_trivial_count_query, and the result was wrong.
And two examples above just to show the difference, the behaviour hadn't been changed with this patch:
- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
so it will prune everything out and nothing will be read.
- SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
2020-11-06 18:14:36 +00:00
|
|
|
/// Checks that the index can not be used.
|
|
|
|
///
|
|
|
|
/// If unknown_any is false (used by alwaysUnknownOrTrue()), then FUNCTION_UNKNOWN can be AND'ed,
|
|
|
|
/// otherwise (anyUnknownOrAlwaysTrue()) first FUNCTION_UNKNOWN will return true (index cannot be used).
|
|
|
|
///
|
|
|
|
/// Consider the following example:
|
|
|
|
///
|
|
|
|
/// CREATE TABLE test(p DateTime, k int) ENGINE MergeTree PARTITION BY toDate(p) ORDER BY k;
|
|
|
|
/// INSERT INTO test VALUES ('2020-09-01 00:01:02', 1), ('2020-09-01 20:01:03', 2), ('2020-09-02 00:01:03', 3);
|
|
|
|
///
|
|
|
|
/// - SELECT count() FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
|
|
|
|
/// In this case rpn will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN (due to strict), FUNCTION_AND)
|
|
|
|
/// and for optimize_trivial_count_query we cannot use index if there is at least one FUNCTION_UNKNOWN.
|
|
|
|
/// since there is no post processing and return count() based on only the first predicate is wrong.
|
|
|
|
///
|
|
|
|
/// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND p <= '2020-09-01 00:00:00'
|
|
|
|
/// In this case will be (FUNCTION_IN_RANGE, FUNCTION_IN_RANGE (due to non-strict), FUNCTION_AND)
|
|
|
|
/// so it will prune everything out and nothing will be read.
|
|
|
|
///
|
|
|
|
/// - SELECT * FROM test WHERE toDate(p) >= '2020-09-01' AND toUnixTimestamp(p)%5==0
|
|
|
|
/// In this case will be (FUNCTION_IN_RANGE, FUNCTION_UNKNOWN, FUNCTION_AND)
|
|
|
|
/// and all, two, partitions will be scanned, but due to filtering later none of rows will be matched.
|
|
|
|
bool unknownOrAlwaysTrue(bool unknown_any) const;
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
RPN rpn;
|
|
|
|
|
2018-04-20 00:20:36 +00:00
|
|
|
ColumnIndices key_columns;
|
2021-06-15 13:47:37 +00:00
|
|
|
/// Expression which is used for key condition.
|
|
|
|
const ExpressionActionsPtr key_expr;
|
|
|
|
/// All intermediate columns are used to calculate key_expr.
|
|
|
|
const NameSet key_subexpr_names;
|
2021-06-21 12:34:05 +00:00
|
|
|
|
|
|
|
NameSet array_joined_columns;
|
2017-07-15 03:48:36 +00:00
|
|
|
PreparedSets prepared_sets;
|
2020-09-21 10:13:01 +00:00
|
|
|
|
|
|
|
// If true, always allow key_expr to be wrapped by function
|
|
|
|
bool single_point;
|
|
|
|
// If true, do not use always_monotonic information to transform constants
|
|
|
|
bool strict;
|
2012-12-05 12:44:55 +00:00
|
|
|
};
|
|
|
|
|
2021-12-08 02:40:59 +00:00
|
|
|
/// Declared here; the definition is not part of this header.
/// NOTE(review): judging by the name, presumably returns the literal prefix of `like_pattern`
/// that precedes its first wildcard - confirm against the definition.
String extractFixedPrefixFromLikePattern(const String & like_pattern);
|
|
|
|
|
2012-12-05 12:44:55 +00:00
|
|
|
}
|