mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-22 01:30:51 +00:00
Better prewhere (incomplete)
This commit is contained in:
parent
a216cba3c4
commit
f30c05ed94
@ -18,22 +18,14 @@
|
|||||||
#include <ext/map.h>
|
#include <ext/map.h>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <map>
|
#include <tuple>
|
||||||
#include <limits>
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
static constexpr auto threshold = 10;
|
static constexpr auto threshold = 2;
|
||||||
/// We decided to remove the restriction due to the absence of a penalty for the transfer in PREWHERE
|
|
||||||
static constexpr auto max_columns_relative_size = 1.0f;
|
|
||||||
static constexpr auto and_function_name = "and";
|
|
||||||
static constexpr auto equals_function_name = "equals";
|
|
||||||
static constexpr auto array_join_function_name = "arrayJoin";
|
|
||||||
static constexpr auto global_in_function_name = "globalIn";
|
|
||||||
static constexpr auto global_not_in_function_name = "globalNotIn";
|
|
||||||
|
|
||||||
|
|
||||||
MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
|
MergeTreeWhereOptimizer::MergeTreeWhereOptimizer(
|
||||||
@ -63,7 +55,7 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const
|
|||||||
return;
|
return;
|
||||||
|
|
||||||
const auto function = typeid_cast<ASTFunction *>(select.where_expression.get());
|
const auto function = typeid_cast<ASTFunction *>(select.where_expression.get());
|
||||||
if (function && function->name == and_function_name)
|
if (function && function->name == "and")
|
||||||
optimizeConjunction(select, function);
|
optimizeConjunction(select, function);
|
||||||
else
|
else
|
||||||
optimizeArbitrary(select);
|
optimizeArbitrary(select);
|
||||||
@ -73,23 +65,36 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const
|
|||||||
void MergeTreeWhereOptimizer::calculateColumnSizes(const MergeTreeData & data, const Names & column_names)
|
void MergeTreeWhereOptimizer::calculateColumnSizes(const MergeTreeData & data, const Names & column_names)
|
||||||
{
|
{
|
||||||
for (const auto & column_name : column_names)
|
for (const auto & column_name : column_names)
|
||||||
{
|
column_sizes[column_name] = data.getColumnCompressedSize(column_name);
|
||||||
const auto column_size = data.getColumnCompressedSize(column_name);
|
}
|
||||||
|
|
||||||
column_sizes[column_name] = column_size;
|
|
||||||
total_column_size += column_size;
|
namespace
|
||||||
|
{
|
||||||
|
/// A condition from the WHERE conjunction that may be moved to PREWHERE,
/// together with the data used to rank it against the other candidates.
struct ConditionCandidate
{
    /// Result of getIdentifiersColumnSize() for the identifiers of the condition.
    size_t columns_size = 0;
    /// Index of the condition among the arguments of the conjunction.
    int64_t position = 0;
    /// Identifiers referenced by the condition.
    IdentifierNameSet identifiers;
    /// Result of isConditionGood() for the condition.
    bool is_good = false;

    /// Sort key: "good" conditions first, then smaller columns_size, then larger position.
    auto tuple() const
    {
        /// We'll move conditions from back to keep "position".
        /// NOTE: the key must be built with make_tuple (a tuple of values).
        /// forward_as_tuple would store references to the temporaries `!is_good` and `-position`,
        /// which are destroyed at the end of the return statement, so the comparison below
        /// would read dangling references.
        return std::make_tuple(!is_good, columns_size, -position);
    }

    /// Orders candidates by tuple(); the best candidate compares lowest.
    bool operator< (const ConditionCandidate & rhs) const
    {
        return tuple() < rhs.tuple();
    }
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void MergeTreeWhereOptimizer::optimizeConjunction(ASTSelectQuery & select, ASTFunction * const fun) const
|
void MergeTreeWhereOptimizer::optimizeConjunction(ASTSelectQuery & select, ASTFunction * const fun) const
|
||||||
{
|
{
|
||||||
/// used as max possible size and indicator that appropriate condition has not been found
|
std::vector<ConditionCandidate> condition_candidates;
|
||||||
const auto no_such_condition = std::numeric_limits<size_t>::max();
|
|
||||||
|
|
||||||
/// { first: condition index, second: summary column size }
|
|
||||||
std::pair<size_t, size_t> lightest_good_condition{no_such_condition, no_such_condition};
|
|
||||||
std::pair<size_t, size_t> lightest_viable_condition{no_such_condition, no_such_condition};
|
|
||||||
|
|
||||||
auto & conditions = fun->arguments->children;
|
auto & conditions = fun->arguments->children;
|
||||||
|
|
||||||
@ -109,7 +114,7 @@ void MergeTreeWhereOptimizer::optimizeConjunction(ASTSelectQuery & select, ASTFu
|
|||||||
/// linearize sub-conjunctions
|
/// linearize sub-conjunctions
|
||||||
if (const auto function = typeid_cast<ASTFunction *>(condition))
|
if (const auto function = typeid_cast<ASTFunction *>(condition))
|
||||||
{
|
{
|
||||||
if (function->name == and_function_name)
|
if (function->name == "and")
|
||||||
{
|
{
|
||||||
for (auto & child : function->arguments->children)
|
for (auto & child : function->arguments->children)
|
||||||
conditions.emplace_back(std::move(child));
|
conditions.emplace_back(std::move(child));
|
||||||
@ -133,27 +138,47 @@ void MergeTreeWhereOptimizer::optimizeConjunction(ASTSelectQuery & select, ASTFu
|
|||||||
/// do not take into consideration the conditions consisting only of the first primary key column
|
/// do not take into consideration the conditions consisting only of the first primary key column
|
||||||
if (!hasPrimaryKeyAtoms(condition) && isSubsetOfTableColumns(identifiers))
|
if (!hasPrimaryKeyAtoms(condition) && isSubsetOfTableColumns(identifiers))
|
||||||
{
|
{
|
||||||
/// calculate size of columns involved in condition
|
ConditionCandidate candidate;
|
||||||
const auto cond_columns_size = getIdentifiersColumnSize(identifiers);
|
candidate.position = idx;
|
||||||
|
candidate.columns_size = getIdentifiersColumnSize(identifiers);
|
||||||
/// place condition either in good or viable conditions set
|
candidate.is_good = isConditionGood(condition);
|
||||||
auto & good_or_viable_condition = isConditionGood(condition) ? lightest_good_condition : lightest_viable_condition;
|
candidate.identifiers = identifiers;
|
||||||
if (good_or_viable_condition.second > cond_columns_size)
|
condition_candidates.emplace_back(std::move(candidate));
|
||||||
{
|
|
||||||
good_or_viable_condition.first = idx;
|
|
||||||
good_or_viable_condition.second = cond_columns_size;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (condition_candidates.empty())
|
||||||
|
return;
|
||||||
|
|
||||||
const auto move_condition_to_prewhere = [&] (const size_t idx)
|
const auto move_condition_to_prewhere = [&] (const size_t idx)
|
||||||
{
|
{
|
||||||
select.prewhere_expression = conditions[idx];
|
if (!select.prewhere_expression)
|
||||||
select.children.push_back(select.prewhere_expression);
|
{
|
||||||
LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition `" << select.prewhere_expression << "` moved to PREWHERE");
|
select.prewhere_expression = conditions[idx];
|
||||||
|
select.children.push_back(select.prewhere_expression);
|
||||||
|
}
|
||||||
|
else if (auto func_and = typeid_cast<ASTFunction *>(select.prewhere_expression.get()); func_and && func_and->name == "and")
|
||||||
|
{
|
||||||
|
/// Add argument to AND chain
|
||||||
|
|
||||||
|
func_and->arguments->children.emplace_back(conditions[idx]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/// Make old_cond AND new_cond
|
||||||
|
|
||||||
|
auto func = std::make_shared<ASTFunction>();
|
||||||
|
func->name = "and";
|
||||||
|
func->arguments->children = {select.prewhere_expression, conditions[idx]};
|
||||||
|
|
||||||
|
select.children.clear();
|
||||||
|
select.prewhere_expression = std::move(func);
|
||||||
|
select.children.push_back(select.prewhere_expression);
|
||||||
|
}
|
||||||
|
|
||||||
/** Replace conjunction with the only remaining argument if only two conditions were present,
|
/** Replace conjunction with the only remaining argument if only two conditions were present,
|
||||||
* remove selected condition from conjunction otherwise. */
|
* remove selected condition from conjunction otherwise.
|
||||||
|
*/
|
||||||
if (conditions.size() == 2)
|
if (conditions.size() == 2)
|
||||||
{
|
{
|
||||||
/// find old where_expression in children of select
|
/// find old where_expression in children of select
|
||||||
@ -167,27 +192,25 @@ void MergeTreeWhereOptimizer::optimizeConjunction(ASTSelectQuery & select, ASTFu
|
|||||||
remove_condition_at_index(idx);
|
remove_condition_at_index(idx);
|
||||||
};
|
};
|
||||||
|
|
||||||
/// if there is a "good" condition - move it to PREWHERE
|
/// Lightest conditions first. NOTE The algorithm is suboptimal, replace with priority_queue if you want.
|
||||||
if (lightest_good_condition.first != no_such_condition)
|
std::sort(condition_candidates.begin(), condition_candidates.end());
|
||||||
{
|
|
||||||
move_condition_to_prewhere(lightest_good_condition.first);
|
|
||||||
}
|
|
||||||
else if (lightest_viable_condition.first != no_such_condition)
|
|
||||||
{
|
|
||||||
/// check that the relative column size is less than max
|
|
||||||
if (total_column_size != 0)
|
|
||||||
{
|
|
||||||
/// calculate relative size of condition's columns
|
|
||||||
const auto cond_columns_size = lightest_viable_condition.second;
|
|
||||||
const auto columns_relative_size = static_cast<float>(cond_columns_size) / total_column_size;
|
|
||||||
|
|
||||||
/// do nothing if it exceeds max relative size
|
/// Pick the best condition and also all other conditions with the same set of columns.
|
||||||
if (columns_relative_size > max_columns_relative_size)
|
/// For example, if we take "EventTime >= '2014-03-20 00:00:00'", we will also take "EventTime < '2014-03-21 00:00:00'".
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
move_condition_to_prewhere(lightest_viable_condition.first);
|
IdentifierNameSet identifiers_of_moved_condition = condition_candidates[0].identifiers;
|
||||||
|
move_condition_to_prewhere(condition_candidates[0].position);
|
||||||
|
|
||||||
|
for (size_t i = 1, size = condition_candidates.size(); i < size; ++i)
|
||||||
|
{
|
||||||
|
if (identifiers_of_moved_condition == condition_candidates[i].identifiers)
|
||||||
|
move_condition_to_prewhere(condition_candidates[i].position);
|
||||||
|
else
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (select.prewhere_expression)
|
||||||
|
LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition \"" << select.prewhere_expression << "\" moved to PREWHERE");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -205,16 +228,6 @@ void MergeTreeWhereOptimizer::optimizeArbitrary(ASTSelectQuery & select) const
|
|||||||
if (hasPrimaryKeyAtoms(condition.get()) || !isSubsetOfTableColumns(identifiers))
|
if (hasPrimaryKeyAtoms(condition.get()) || !isSubsetOfTableColumns(identifiers))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/// if condition is not "good" - check that it can be moved
|
|
||||||
if (!isConditionGood(condition.get()) && total_column_size != 0)
|
|
||||||
{
|
|
||||||
const auto cond_columns_size = getIdentifiersColumnSize(identifiers);
|
|
||||||
const auto columns_relative_size = static_cast<float>(cond_columns_size) / total_column_size;
|
|
||||||
|
|
||||||
if (columns_relative_size > max_columns_relative_size)
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// add the condition to PREWHERE, remove it from WHERE
|
/// add the condition to PREWHERE, remove it from WHERE
|
||||||
std::swap(select.prewhere_expression, condition);
|
std::swap(select.prewhere_expression, condition);
|
||||||
LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition `" << select.prewhere_expression << "` moved to PREWHERE");
|
LOG_DEBUG(log, "MergeTreeWhereOptimizer: condition `" << select.prewhere_expression << "` moved to PREWHERE");
|
||||||
@ -246,7 +259,7 @@ bool MergeTreeWhereOptimizer::isConditionGood(const IAST * condition) const
|
|||||||
|
|
||||||
/** we are only considering conditions of form `equals(one, another)` or `one = another`,
|
/** we are only considering conditions of form `equals(one, another)` or `one = another`,
|
||||||
* especially if either `one` or `another` is ASTIdentifier */
|
* especially if either `one` or `another` is ASTIdentifier */
|
||||||
if (function->name != equals_function_name)
|
if (function->name != "equals")
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
auto left_arg = function->arguments->children.front().get();
|
auto left_arg = function->arguments->children.front().get();
|
||||||
@ -371,12 +384,12 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const
|
|||||||
if (const auto function_ptr = typeid_cast<const ASTFunction *>(ptr.get()))
|
if (const auto function_ptr = typeid_cast<const ASTFunction *>(ptr.get()))
|
||||||
{
|
{
|
||||||
/// disallow arrayJoin expressions to be moved to PREWHERE for now
|
/// disallow arrayJoin expressions to be moved to PREWHERE for now
|
||||||
if (array_join_function_name == function_ptr->name)
|
if ("arrayJoin" == function_ptr->name)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
/// disallow GLOBAL IN, GLOBAL NOT IN
|
/// disallow GLOBAL IN, GLOBAL NOT IN
|
||||||
if (global_in_function_name == function_ptr->name
|
if ("globalIn" == function_ptr->name
|
||||||
|| global_not_in_function_name == function_ptr->name)
|
|| "globalNotIn" == function_ptr->name)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
/// indexHint is a special function that it does not make sense to transfer to PREWHERE
|
/// indexHint is a special function that it does not make sense to transfer to PREWHERE
|
||||||
|
@ -24,10 +24,8 @@ using IdentifierNameSet = std::set<std::string>;
|
|||||||
* sizes of columns used in particular expression and identifying "good" conditions of
|
* sizes of columns used in particular expression and identifying "good" conditions of
|
||||||
* form "column_name = constant", where "constant" is outside some `threshold` specified in advance.
|
* form "column_name = constant", where "constant" is outside some `threshold` specified in advance.
|
||||||
*
|
*
|
||||||
* If there are "good" conditions present in WHERE, the one with minimal summary column size is
|
* If there are "good" conditions present in WHERE, the one with minimal summary column size is transferred to PREWHERE.
|
||||||
* transferred to PREWHERE.
|
* Otherwise any condition with minimal summary column size can be transferred to PREWHERE.
|
||||||
* Otherwise any condition with minimal summary column size can be transferred to PREWHERE, if only
|
|
||||||
* its relative size (summary column size divided by query column size) is less than `max_columns_relative_size`.
|
|
||||||
*/
|
*/
|
||||||
class MergeTreeWhereOptimizer : private boost::noncopyable
|
class MergeTreeWhereOptimizer : private boost::noncopyable
|
||||||
{
|
{
|
||||||
@ -79,7 +77,6 @@ private:
|
|||||||
const Block block_with_constants;
|
const Block block_with_constants;
|
||||||
Poco::Logger * log;
|
Poco::Logger * log;
|
||||||
std::unordered_map<std::string, size_t> column_sizes{};
|
std::unordered_map<std::string, size_t> column_sizes{};
|
||||||
size_t total_column_size{};
|
|
||||||
NameSet array_joined_names;
|
NameSet array_joined_names;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user