Merge pull request #10026 from CurtizJ/speedup-index

Improve performance of index analysis with monotonic functions [2].
This commit is contained in:
alexey-milovidov 2020-04-12 06:37:26 +03:00 committed by GitHub
commit b888c867b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 152 additions and 144 deletions

View File

@ -596,6 +596,14 @@ BoolMask MergeTreeSetIndex::checkInRange(const std::vector<Range> & key_ranges,
};
}
bool MergeTreeSetIndex::hasMonotonicFunctionsChain() const
{
for (const auto & mapping : indexes_mapping)
if (!mapping.functions.empty())
return true;
return false;
}
void ValueWithInfinity::update(const Field & x)
{
/// Keep at most one element in column.
@ -607,8 +615,11 @@ void ValueWithInfinity::update(const Field & x)
/// Returns the stored column by reference.
/// When NDEBUG is not defined, throws LOGICAL_ERROR if the value is not of NORMAL type;
/// when NDEBUG is defined the check is compiled out.
const IColumn & ValueWithInfinity::getColumnIfFinite() const
{
#if !defined(NDEBUG)
    const bool is_finite = (type == NORMAL);
    if (!is_finite)
        throw Exception("Trying to get column of infinite type", ErrorCodes::LOGICAL_ERROR);
#endif

    return *column;
}

View File

@ -231,6 +231,8 @@ public:
size_t size() const { return ordered_set.at(0)->size(); }
bool hasMonotonicFunctionsChain() const;
BoolMask checkInRange(const std::vector<Range> & key_ranges, const DataTypes & data_types);
private:

View File

@ -98,8 +98,8 @@ void IMergeTreeDataPart::MinMaxIndex::update(const Block & block, const Names &
for (size_t i = 0; i < column_names.size(); ++i)
{
Field min_value;
Field max_value;
FieldRef min_value;
FieldRef max_value;
const ColumnWithTypeAndName & column = block.getByName(column_names[i]);
column.column->getExtremes(min_value, max_value);

View File

@ -338,44 +338,6 @@ inline bool Range::equals(const Field & lhs, const Field & rhs) { return applyVi
/// Compares two fields by applying FieldVisitorAccurateLess to (lhs, rhs).
inline bool Range::less(const Field & lhs, const Field & rhs)
{
    return applyVisitor(FieldVisitorAccurateLess(), lhs, rhs);
}
/// Wraps a copy of `field_` as a finite (NORMAL) value.
FieldWithInfinity::FieldWithInfinity(const Field & field_) : field(field_), type(Type::NORMAL) {}
/// Wraps `field_` as a finite (NORMAL) value, moving it into the stored field.
FieldWithInfinity::FieldWithInfinity(Field && field_) : field(std::move(field_)), type(Type::NORMAL) {}
/// Creates a value of the given type; the stored field is default-constructed.
FieldWithInfinity::FieldWithInfinity(const Type type_) : type(type_) {}
/// Factory for the value of type MINUS_INFINITY.
FieldWithInfinity FieldWithInfinity::getMinusInfinity()
{
    FieldWithInfinity minus_infinity(Type::MINUS_INFINITY);
    return minus_infinity;
}
/// Factory for the value of type PLUS_INFINITY.
FieldWithInfinity FieldWithInfinity::getPlusInfinity()
{
    FieldWithInfinity plus_infinity(Type::PLUS_INFINITY);
    return plus_infinity;
}
/// Ordering: values of different types are ordered by type;
/// two NORMAL values are ordered by their fields; two infinities of the same type are not less than each other.
bool FieldWithInfinity::operator<(const FieldWithInfinity & other) const
{
    if (type != other.type)
        return type < other.type;

    /// Same type on both sides: only finite values can be strictly ordered.
    return type == Type::NORMAL && field < other.field;
}
/// Equality: types must match; for NORMAL values the fields must match as well.
bool FieldWithInfinity::operator==(const FieldWithInfinity & other) const
{
    if (type != other.type)
        return false;

    /// Two infinities of the same type are equal regardless of the stored field.
    if (type != Type::NORMAL)
        return true;

    return field == other.field;
}
/** Calculate expressions, that depend only on constants.
* For index to work when something like "WHERE Date = toDate(now())" is written.
*/
@ -480,24 +442,41 @@ bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants
}
static void applyFunction(
static Field applyFunctionForField(
const FunctionBasePtr & func,
const DataTypePtr & arg_type, const Field & arg_value,
DataTypePtr & res_type, Field & res_value)
const DataTypePtr & arg_type,
const Field & arg_value)
{
res_type = func->getReturnType();
Block block
{
{ arg_type->createColumnConst(1, arg_value), arg_type, "x" },
{ nullptr, res_type, "y" }
{ nullptr, func->getReturnType(), "y" }
};
func->execute(block, {0}, 1, 1);
block.safeGetByPosition(1).column->get(0, res_value);
return (*block.safeGetByPosition(1).column)[0];
}
/// Applies `func` to the value designated by `field`.
/// For an explicit field the function is evaluated for that single value.
/// For a field that references a block, the function is executed over the referenced column once,
/// the result is stored in the same block under a derived name, and later calls reuse that column.
/// Returns a reference to the same row in the result column.
static FieldRef applyFunction(FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field)
{
    /// Fallback for fields without block reference.
    if (field.isExplicit())
        return applyFunctionForField(func, current_type, field);

    Block * const source_block = field.block;
    const String cached_column_name = "_" + func->getName() + "_" + toString(field.column_idx);

    /// The result column is already in the block: just point to it.
    if (source_block->has(cached_column_name))
        return {source_block, field.row_idx, source_block->getPositionByName(cached_column_name)};

    /// Otherwise append a new column to the block and fill it by executing the function on all rows.
    const size_t new_column_idx = source_block->columns();
    source_block->insert({nullptr, func->getReturnType(), cached_column_name});
    func->execute(*source_block, {field.column_idx}, new_column_idx, source_block->rows());

    return {source_block, field.row_idx, new_column_idx};
}
void KeyCondition::traverseAST(const ASTPtr & node, const Context & context, Block & block_with_constants)
{
@ -569,12 +548,8 @@ bool KeyCondition::canConstantBeWrappedByMonotonicFunctions(
return false;
// Apply the next transformation step
DataTypePtr new_type;
applyFunction(a.function_base, out_type, out_value, new_type, out_value);
if (!new_type)
return false;
out_type.swap(new_type);
out_value = applyFunctionForField(a.function_base, out_type, out_value);
out_type = a.function_base->getReturnType();
expr_name = a.result_name;
// Transformation results in a key expression, accept
@ -957,8 +932,8 @@ String KeyCondition::toString() const
template <typename F>
static BoolMask forAnyHyperrectangle(
size_t key_size,
const Field * key_left,
const Field * key_right,
const FieldRef * key_left,
const FieldRef * key_right,
bool left_bounded,
bool right_bounded,
std::vector<Range> & hyperrectangle,
@ -1049,8 +1024,8 @@ static BoolMask forAnyHyperrectangle(
BoolMask KeyCondition::checkInRange(
size_t used_key_size,
const Field * left_key,
const Field * right_key,
const FieldRef * left_key,
const FieldRef * right_key,
const DataTypes & data_types,
bool right_bounded,
BoolMask initial_mask) const
@ -1102,19 +1077,12 @@ std::optional<Range> KeyCondition::applyMonotonicFunctionsChainToRange(
return {};
}
/// Apply the function.
DataTypePtr new_type;
if (!key_range.left.isNull())
applyFunction(func, current_type, key_range.left, new_type, key_range.left);
key_range.left = applyFunction(func, current_type, key_range.left);
if (!key_range.right.isNull())
applyFunction(func, current_type, key_range.right, new_type, key_range.right);
key_range.right = applyFunction(func, current_type, key_range.right);
if (!new_type)
{
return {};
}
current_type.swap(new_type);
current_type = func->getReturnType();
if (!monotonicity.is_positive)
key_range.swapLeftAndRight();
@ -1220,8 +1188,8 @@ BoolMask KeyCondition::checkInHyperrectangle(
BoolMask KeyCondition::checkInRange(
size_t used_key_size,
const Field * left_key,
const Field * right_key,
const FieldRef * left_key,
const FieldRef * right_key,
const DataTypes & data_types,
BoolMask initial_mask) const
{
@ -1231,8 +1199,8 @@ BoolMask KeyCondition::checkInRange(
bool KeyCondition::mayBeTrueInRange(
size_t used_key_size,
const Field * left_key,
const Field * right_key,
const FieldRef * left_key,
const FieldRef * right_key,
const DataTypes & data_types) const
{
return checkInRange(used_key_size, left_key, right_key, data_types, true, BoolMask::consider_only_can_be_true).can_be_true;
@ -1241,7 +1209,7 @@ bool KeyCondition::mayBeTrueInRange(
BoolMask KeyCondition::checkAfter(
size_t used_key_size,
const Field * left_key,
const FieldRef * left_key,
const DataTypes & data_types,
BoolMask initial_mask) const
{
@ -1251,7 +1219,7 @@ BoolMask KeyCondition::checkAfter(
bool KeyCondition::mayBeTrueAfter(
size_t used_key_size,
const Field * left_key,
const FieldRef * left_key,
const DataTypes & data_types) const
{
return checkInRange(used_key_size, left_key, nullptr, data_types, false, BoolMask::consider_only_can_be_true).can_be_true;
@ -1382,4 +1350,13 @@ size_t KeyCondition::getMaxKeyColumn() const
return res;
}
/// Returns true if any RPN element has a non-empty monotonic functions chain,
/// either directly or through its set index.
bool KeyCondition::hasMonotonicFunctionsChain() const
{
    for (const auto & rpn_element : rpn)
    {
        const bool has_own_chain = !rpn_element.monotonic_functions_chain.empty();
        if (has_own_chain)
            return true;

        if (rpn_element.set_index && rpn_element.set_index->hasMonotonicFunctionsChain())
            return true;
    }

    return false;
}
}

View File

@ -15,10 +15,6 @@
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_TYPE_OF_FIELD;
}
class IFunction;
using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
@ -26,6 +22,33 @@ using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
class ExpressionActions;
using ExpressionActionsPtr = std::shared_ptr<ExpressionActions>;
/** A field that can be stored in two representations:
  * - A standalone (explicit) field: `block` is nullptr.
  * - A field with a reference to its position (row_idx, column_idx) in a block.
  * It is needed for the execution of functions on ranges during
  * index analysis. If a function was executed once for a field,
  * its result would be cached for the whole block to which the field's reference points.
  */
struct FieldRef : public Field
{
FieldRef() = default;
/// Create as explicit field without block.
/// NOTE(review): this unconstrained forwarding constructor is also a candidate when
/// copy-constructing from a non-const FieldRef lvalue, where it is a better match than the
/// implicit copy constructor; in that case only the Field base would be initialized from the
/// argument and block/row_idx/column_idx would keep their defaults.
/// Confirm that callers only copy from const or rvalue FieldRef objects.
template <typename T>
FieldRef(T && value) : Field(std::forward<T>(value)) {}
/// Create as reference to field in block.
/// The Field base is initialized with the value found at (row_idx_, column_idx_) of the block,
/// so `block_` is dereferenced here and must not be null.
FieldRef(Block * block_, size_t row_idx_, size_t column_idx_)
: Field((*block_->getByPosition(column_idx_).column)[row_idx_]),
block(block_), row_idx(row_idx_), column_idx(column_idx_) {}
/// True if the field was created without a block reference.
bool isExplicit() const { return block == nullptr; }
/// Raw pointer to the referenced block (nullptr for an explicit field); FieldRef never deletes it.
Block * block = nullptr;
/// Row of the referenced value inside the block.
size_t row_idx = 0;
/// Position of the referenced column inside the block.
size_t column_idx = 0;
};
/** Range with open or closed ends; possibly unbounded.
*/
struct Range
@ -35,8 +58,8 @@ private:
static bool less(const Field & lhs, const Field & rhs);
public:
Field left; /// the left border, if any
Field right; /// the right border, if any
FieldRef left; /// the left border, if any
FieldRef right; /// the right border, if any
bool left_bounded = false; /// bounded at the left
bool right_bounded = false; /// bounded at the right
bool left_included = false; /// includes the left border, if any
@ -46,11 +69,11 @@ public:
Range() {}
/// One point.
Range(const Field & point)
Range(const FieldRef & point)
: left(point), right(point), left_bounded(true), right_bounded(true), left_included(true), right_included(true) {}
/// A bounded two-sided range.
Range(const Field & left_, bool left_included_, const Field & right_, bool right_included_)
Range(const FieldRef & left_, bool left_included_, const FieldRef & right_, bool right_included_)
: left(left_), right(right_),
left_bounded(true), right_bounded(true),
left_included(left_included_), right_included(right_included_)
@ -58,7 +81,7 @@ public:
shrinkToIncludedIfPossible();
}
static Range createRightBounded(const Field & right_point, bool right_included)
static Range createRightBounded(const FieldRef & right_point, bool right_included)
{
Range r;
r.right = right_point;
@ -68,7 +91,7 @@ public:
return r;
}
static Range createLeftBounded(const Field & left_point, bool left_included)
static Range createLeftBounded(const FieldRef & left_point, bool left_included)
{
Range r;
r.left = left_point;
@ -84,7 +107,7 @@ public:
*/
void shrinkToIncludedIfPossible()
{
if (left_bounded && !left_included)
if (left.isExplicit() && left_bounded && !left_included)
{
if (left.getType() == Field::Types::UInt64 && left.get<UInt64>() != std::numeric_limits<UInt64>::max())
{
@ -97,7 +120,7 @@ public:
left_included = true;
}
}
if (right_bounded && !right_included)
if (right.isExplicit() && right_bounded && !right_included)
{
if (right.getType() == Field::Types::UInt64 && right.get<UInt64>() != std::numeric_limits<UInt64>::min())
{
@ -120,13 +143,13 @@ public:
}
/// x contained in the range
bool contains(const Field & x) const
bool contains(const FieldRef & x) const
{
return !leftThan(x) && !rightThan(x);
}
/// x is to the left
bool rightThan(const Field & x) const
bool rightThan(const FieldRef & x) const
{
return (left_bounded
? !(less(left, x) || (left_included && equals(x, left)))
@ -134,7 +157,7 @@ public:
}
/// x is to the right
bool leftThan(const Field & x) const
bool leftThan(const FieldRef & x) const
{
return (right_bounded
? !(less(x, right) || (right_included && equals(x, right)))
@ -195,42 +218,6 @@ public:
String toString() const;
};
/// Class that extends arbitrary objects with infinities, like +-inf for floats.
/// A value is either a finite Field (type NORMAL) or one of the two infinities.
class FieldWithInfinity
{
public:
/// Kind of the value. The enumerator values are ordered MINUS_INFINITY < NORMAL < PLUS_INFINITY.
enum Type
{
MINUS_INFINITY = -1,
NORMAL = 0,
PLUS_INFINITY = 1
};
/// Create a finite value from a copy of the field.
explicit FieldWithInfinity(const Field & field_);
/// Create a finite value from an rvalue field.
/// NOTE(review): unlike the overload above, this one is not explicit — confirm that this is intended.
FieldWithInfinity(Field && field_);
/// Factories for the two infinite values.
static FieldWithInfinity getMinusInfinity();
static FieldWithInfinity getPlusInfinity();
bool operator<(const FieldWithInfinity & other) const;
bool operator==(const FieldWithInfinity & other) const;
/// Returns a copy of the stored field.
/// Throws BAD_TYPE_OF_FIELD if the value is not of NORMAL type.
Field getFieldIfFinite() const
{
if (type != NORMAL)
throw Exception("Trying to get field of infinite type", ErrorCodes::BAD_TYPE_OF_FIELD);
return field;
}
private:
/// The stored field.
Field field;
Type type;
/// Creates a value of the given type without setting the field; private.
FieldWithInfinity(const Type type_);
};
/** Condition on the index.
*
* Consists of the conditions for the key belonging to all possible ranges or sets,
@ -261,8 +248,8 @@ public:
/// one of the resulting mask components (see BoolMask::consider_only_can_be_XXX).
BoolMask checkInRange(
size_t used_key_size,
const Field * left_key,
const Field * right_key,
const FieldRef * left_key,
const FieldRef* right_key,
const DataTypes & data_types,
BoolMask initial_mask = BoolMask(false, false)) const;
@ -270,7 +257,7 @@ public:
/// left_key must contain all the fields in the sort_descr in the appropriate order.
BoolMask checkAfter(
size_t used_key_size,
const Field * left_key,
const FieldRef * left_key,
const DataTypes & data_types,
BoolMask initial_mask = BoolMask(false, false)) const;
@ -278,15 +265,15 @@ public:
/// This is more efficient than checkInRange(...).can_be_true.
bool mayBeTrueInRange(
size_t used_key_size,
const Field * left_key,
const Field * right_key,
const FieldRef * left_key,
const FieldRef * right_key,
const DataTypes & data_types) const;
/// Same as checkAfter, but calculate only may_be_true component of a result.
/// This is more efficient than checkAfter(...).can_be_true.
bool mayBeTrueAfter(
size_t used_key_size,
const Field * left_key,
const FieldRef * left_key,
const DataTypes & data_types) const;
/// Checks that the index can not be used.
@ -295,6 +282,8 @@ public:
/// Get the maximum number of the key element used in the condition.
size_t getMaxKeyColumn() const;
bool hasMonotonicFunctionsChain() const;
/// Impose an additional condition: the value in the column `column` must be in the range `range`.
/// Returns whether there is such a column in the key.
bool addCondition(const String & column, const Range & range);
@ -374,8 +363,8 @@ public:
private:
BoolMask checkInRange(
size_t used_key_size,
const Field * left_key,
const Field * right_key,
const FieldRef * left_key,
const FieldRef * right_key,
const DataTypes & data_types,
bool right_bounded,
BoolMask initial_mask) const;

View File

@ -1201,11 +1201,33 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
* If fits, split it into smaller ones and put them on the stack. If not, discard it.
* If the segment is already of one mark length, add it to response and discard it.
*/
std::vector<MarkRange> ranges_stack{ {0, marks_count} };
std::vector<MarkRange> ranges_stack = { {0, marks_count} };
std::function<void(size_t, size_t, FieldRef &)> create_field_ref;
/// If there are no monotonic functions, there is no need to save a block reference.
/// Passing an explicit field to FieldRef allows ranges to be optimized and gives better performance.
if (key_condition.hasMonotonicFunctionsChain())
{
auto index_block = std::make_shared<Block>();
for (size_t i = 0; i < used_key_size; ++i)
index_block->insert({index[i], data.primary_key_data_types[i], data.primary_key_columns[i]});
create_field_ref = [index_block](size_t row, size_t column, FieldRef & field)
{
field = {index_block.get(), row, column};
};
}
else
{
create_field_ref = [&index](size_t row, size_t column, FieldRef & field)
{
index[column]->get(row, field);
};
}
/// NOTE Creating temporary Field objects to pass to KeyCondition.
Row index_left(used_key_size);
Row index_right(used_key_size);
std::vector<FieldRef> index_left(used_key_size);
std::vector<FieldRef> index_right(used_key_size);
while (!ranges_stack.empty())
{
@ -1216,7 +1238,7 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
if (range.end == marks_count && !has_final_mark)
{
for (size_t i = 0; i < used_key_size; ++i)
index[i]->get(range.begin, index_left[i]);
create_field_ref(range.begin, i, index_left[i]);
may_be_true = key_condition.mayBeTrueAfter(
used_key_size, index_left.data(), data.primary_key_data_types);
@ -1228,8 +1250,8 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
for (size_t i = 0; i < used_key_size; ++i)
{
index[i]->get(range.begin, index_left[i]);
index[i]->get(range.end, index_right[i]);
create_field_ref(range.begin, i, index_left[i]);
create_field_ref(range.end, i, index_right[i]);
}
may_be_true = key_condition.mayBeTrueInRange(
@ -1254,9 +1276,9 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange(
size_t end;
for (end = range.end; end > range.begin + step; end -= step)
ranges_stack.push_back(MarkRange(end - step, end));
ranges_stack.emplace_back(end - step, end);
ranges_stack.push_back(MarkRange(range.begin, end));
ranges_stack.emplace_back(range.begin, end);
}
}
}

View File

@ -100,8 +100,8 @@ void MergeTreeIndexAggregatorMinMax::update(const Block & block, size_t * pos, s
size_t rows_read = std::min(limit, block.rows() - *pos);
Field field_min;
Field field_max;
FieldRef field_min;
FieldRef field_max;
for (size_t i = 0; i < index.columns.size(); ++i)
{
const auto & column = block.getByName(index.columns[i]).column;

File diff suppressed because one or more lines are too long