ClickHouse/dbms/include/DB/Storages/MergeTree/MergeTreeWhereOptimizer.h

235 lines
6.9 KiB
C
Raw Normal View History

2014-09-17 13:05:18 +00:00
#pragma once
#include <DB/Storages/MergeTree/MergeTreeData.h>
#include <DB/Parsers/ASTSelectQuery.h>
#include <DB/Parsers/ASTFunction.h>
#include <DB/Parsers/ASTIdentifier.h>
#include <DB/Parsers/ASTLiteral.h>
#include <DB/Parsers/ASTExpressionList.h>
#include <DB/Common/escapeForFileName.h>
2014-09-18 11:44:20 +00:00
#include <statdaemons/stdext.h>
2014-09-17 13:05:18 +00:00
#include <unordered_map>
#include <set>
#include <cstddef>
#include <DB/Parsers/formatAST.h>
namespace DB
{
class MergeTreeWhereOptimizer
{
/// numeric literals are compared against this bound when deciding whether an equality condition is "good"
static constexpr int threshold = 10;
2014-09-18 11:44:20 +00:00
/// NOTE(review): not referenced anywhere in the visible code; presumably the largest share of the
/// total size a single column may take to still be considered - confirm before relying on it
static constexpr float max_column_relative_size = 0.25f;
/// name of the function node that represents a conjunction
static constexpr const char * and_function_name = "and";
/// name of the function node that represents an equality comparison
static constexpr const char * equals_function_name = "equals";
2014-09-17 13:05:18 +00:00
public:
/** Gathers everything the optimizer consults later on: the names of the columns required by the
  * primary key expression of `data`, and per-column file sizes accumulated over `parts`. */
MergeTreeWhereOptimizer(const MergeTreeData & data, const MergeTreeData::DataPartsVector & parts)
{
    /// primary key columns first, sizes second - the two steps do not depend on each other
    fillPrimaryKeyColumns(data);
    calculateColumnSizes(parts);
}
/** Entry point of the optimizer.
  *
  * Queries without a WHERE expression are returned untouched. Otherwise the WHERE expression is
  * dispatched either to optimizeAnd (top-level conjunction) or to optimizeArbitrary (anything else),
  * and the resulting query is printed to standard output. */
void optimize(ASTSelectQuery & select)
{
    /// nothing to move when there is no WHERE at all
    if (!select.where_expression)
        return;

    const auto top_level = typeid_cast<ASTFunction *>(select.where_expression.get());
    const bool is_conjunction = top_level && top_level->name == and_function_name;

    /// a top-level conjunction is examined argument by argument, everything else as a whole
    if (is_conjunction)
        optimizeAnd(select, top_level);
    else
        optimizeArbitrary(select);

    std::cout << "(possibly) transformed query is: ";
    formatAST(select, std::cout);
    std::cout << std::endl;
}
private:
/** Remembers the name of every column required by the primary key expression of `data`
  * in `primary_key_columns`. */
void fillPrimaryKeyColumns(const MergeTreeData & data)
{
    /// bind by const reference: only the name is read, so copying each element is wasted work
    for (const auto & column : data.getPrimaryExpression()->getRequiredColumnsWithTypes())
        primary_key_columns.insert(column.name);
}
/** Accumulates file sizes over all `parts`.
  *
  * Every entry of a part's `checksums.files` contributes its `file_size` to `column_sizes`, keyed by
  * the unescaped file name cut at its last '.', and to `total_size`. */
void calculateColumnSizes(const MergeTreeData::DataPartsVector & parts)
{
    for (const auto & part : parts)
        for (const auto & name_and_checksum : part->checksums.files)
        {
            const auto unescaped_name = unescapeForFileName(name_and_checksum.first);
            /// a name without '.' yields npos here, in which case substr keeps the whole name
            const auto extension_pos = unescaped_name.find_last_of('.');
            const auto size = name_and_checksum.second.file_size;

            column_sizes[unescaped_name.substr(0, extension_pos)] += size;
            total_size += size;
        }
}
2014-09-17 13:05:18 +00:00
/** Handles a WHERE expression whose top-level node is a conjunction.
  *
  * Nested conjunctions are flattened into the argument list of `fun`. Among the arguments that
  * isConditionGood accepts, the one whose column has the smallest accumulated size is added to
  * PREWHERE and taken out of WHERE (on equal sizes the first one encountered wins).
  *
  * @param select  query being rewritten; its where_expression and prewhere_expression may change
  * @param fun     the conjunction held by select.where_expression; it may be destroyed by this call
  *
  * NOTE(review): only the where_expression / prewhere_expression members are updated here; if
  * ASTSelectQuery also mirrors them in its `children`, that list is not synchronised - confirm. */
void optimizeAnd(ASTSelectQuery & select, ASTFunction * const fun)
{
    auto & conditions = fun->arguments->children;

    /// remove condition by overwriting it with the last one and calling ::pop_back();
    /// the last element is popped directly so it is never move-assigned onto itself,
    /// and an out-of-range index removes nothing
    const auto remove_condition_at_index = [&conditions] (const size_t idx) {
        if (idx >= conditions.size())
            return;

        if (idx + 1 < conditions.size())
            conditions[idx] = std::move(conditions.back());

        conditions.pop_back();
    };

    /// the "good" condition with the smallest column size seen so far
    bool has_good_condition = false;
    size_t best_idx = 0;
    size_t best_size = 0;

    /// linearize conjunction and look for the best "good" condition
    for (size_t i = 0; i < conditions.size();)
    {
        const auto condition = conditions[i].get();

        /// linearize sub-conjunctions
        if (const auto function = typeid_cast<ASTFunction *>(condition))
        {
            if (function->name == and_function_name)
            {
                for (auto & child : function->arguments->children)
                    conditions.emplace_back(std::move(child));

                /// remove the condition corresponding to conjunction
                remove_condition_at_index(i);

                /// continue iterating without increment to ensure the just added conditions are processed
                continue;
            }
        }

        std::string column_name{};
        if (isConditionGood(condition, column_name))
        {
            /// look the size up without inserting: columns with no recorded size count as 0,
            /// but must not leave a new entry behind in column_sizes
            const auto it = column_sizes.find(column_name);
            const size_t size = it != column_sizes.end() ? it->second : 0;

            if (!has_good_condition || size < best_size)
            {
                has_good_condition = true;
                best_idx = i;
                best_size = size;
            }
        }
        /// @todo implement not-"good" condition transformation

        ++i;
    }

    /// nothing to move; moving the only argument would leave a conjunction without arguments behind
    if (!has_good_condition || conditions.size() < 2)
        return;

    addConditionTo(conditions[best_idx], select.prewhere_expression);

    /** Replace conjunction with the only remaining argument if only two conditions were present,
      * remove selected condition from conjunction otherwise. */
    if (conditions.size() == 2)
    {
        /// take the remaining argument out first: the assignment below may destroy the conjunction
        /// that owns `conditions`
        ASTPtr remaining = std::move(conditions[best_idx == 0 ? 1 : 0]);
        select.where_expression = std::move(remaining);
    }
    else
        remove_condition_at_index(best_idx);
}
/** Handles a WHERE expression whose top-level node is not a conjunction.
  * Currently a stub: no transformation is performed and `select` is left as it was. */
void optimizeArbitrary(ASTSelectQuery & select)
{
    /// no transformation is performed
}
2014-09-18 11:44:20 +00:00
/** Makes `condition` part of the expression `ast`:
  *  - an empty `ast` simply becomes `condition`;
  *  - if `ast` already is a conjunction, `condition` is appended to its arguments;
  *  - otherwise `ast` is replaced by a new conjunction of the old `ast` and `condition`. */
void addConditionTo(ASTPtr condition, ASTPtr & ast)
{
    /// nothing to combine with
    if (!ast)
    {
        ast = std::move(condition);
        return;
    }

    const auto existing = typeid_cast<ASTFunction *>(ast.get());
    const bool is_conjunction = existing && existing->name == and_function_name;

    /// an existing conjunction just receives one more argument
    if (is_conjunction)
    {
        existing->arguments->children.emplace_back(std::move(condition));
        return;
    }

    /// build a fresh conjunction hosting the old expression followed by the new condition
    auto combined = stdext::make_unique<ASTFunction>();
    combined->name = and_function_name;
    combined->arguments = stdext::make_unique<ASTExpressionList>().release();
    combined->children.push_back(combined->arguments);

    auto & operands = combined->arguments->children;
    operands.emplace_back(std::move(ast));
    operands.emplace_back(std::move(condition));

    ast = combined.release();
}
2014-09-17 13:05:18 +00:00
bool isConditionGood(const IAST * condition, std::string & column_name)
2014-09-17 13:05:18 +00:00
{
2014-09-18 11:44:20 +00:00
const auto function = typeid_cast<const ASTFunction *>(condition);
if (!function)
2014-09-17 13:05:18 +00:00
return false;
2014-09-18 11:44:20 +00:00
/** we are only considering conditions of form `equals(one, another)` or `one = another`,
2014-09-17 13:05:18 +00:00
* especially if either `one` or `another` is ASTIdentifier */
2014-09-18 11:44:20 +00:00
if (function->name != equals_function_name)
2014-09-17 13:05:18 +00:00
return false;
2014-09-18 11:44:20 +00:00
auto left_arg = function->arguments->children.front().get();
auto right_arg = function->arguments->children.back().get();
2014-09-17 13:05:18 +00:00
/// try to ensure left_arg points to ASTIdentifier
if (!typeid_cast<const ASTIdentifier *>(left_arg) && typeid_cast<const ASTIdentifier *>(right_arg))
std::swap(left_arg, right_arg);
if (const auto identifier = typeid_cast<const ASTIdentifier *>(left_arg))
{
/// if the identifier is part of the primary key, the condition is not "good"
if (primary_key_columns.count(identifier->name))
return false;
column_name = identifier->name;
/// condition may be "good" if only right_arg is a constant and its value is outside the threshold
if (const auto literal = typeid_cast<const ASTLiteral *>(right_arg))
{
const auto & field = literal->value;
const auto type = field.getType();
/// check the value with respect to threshold
if (type == Field::Types::UInt64)
{
const auto value = field.get<UInt64>();
return value > threshold;
}
else if (type == Field::Types::Int64)
{
const auto value = field.get<Int64>();
return value < -threshold || threshold < value;
}
else if (type == Field::Types::Float64)
{
const auto value = field.get<Float64>();
return value < threshold || threshold < value;
}
}
}
return false;
}
/// names of the columns required by the primary key expression (filled by fillPrimaryKeyColumns)
std::unordered_set<std::string> primary_key_columns;
/// unescaped file name cut at its last '.' => accumulated size of such files over all parts
std::unordered_map<std::string, std::size_t> column_sizes;
/// accumulated size of every file accounted for by calculateColumnSizes
std::size_t total_size = 0;
};
}