2021-04-24 15:24:32 +00:00
|
|
|
#include <Storages/MergeTree/SubstituteColumnOptimizer.h>
|
|
|
|
#include <Interpreters/TreeCNFConverter.h>
|
|
|
|
#include <Interpreters/ComparisonGraph.h>
|
|
|
|
#include <Parsers/IAST_fwd.h>
|
|
|
|
#include <Parsers/ASTFunction.h>
|
2021-04-26 11:26:54 +00:00
|
|
|
#include <Parsers/ASTIdentifier.h>
|
2021-04-24 15:24:32 +00:00
|
|
|
#include <Storages/StorageInMemoryMetadata.h>
|
|
|
|
#include <Parsers/ASTSelectQuery.h>
|
|
|
|
#include <Poco/Logger.h>
|
|
|
|
#include <Interpreters/InDepthNodeVisitor.h>
|
|
|
|
#include <Storages/IStorage.h>
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int LOGICAL_ERROR;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
2021-04-26 11:26:54 +00:00
|
|
|
|
2021-05-04 14:34:00 +00:00
|
|
|
const String COMPONENT = "__aorLwT30aH_comp";
|
|
|
|
const String COMPONENT_SEPARATOR = "_";
|
2021-04-29 18:54:54 +00:00
|
|
|
constexpr UInt64 COLUMN_PENALTY = 10 * 1024 * 1024;
|
2021-04-26 11:26:54 +00:00
|
|
|
|
|
|
|
class ComponentMatcher
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
using Visitor = InDepthNodeVisitor<ComponentMatcher, true>;
|
|
|
|
|
|
|
|
struct Data
|
|
|
|
{
|
|
|
|
const ComparisonGraph & graph;
|
2021-04-29 18:12:51 +00:00
|
|
|
std::set<UInt64> & components;
|
2021-05-04 14:34:00 +00:00
|
|
|
std::unordered_map<String, String> & old_name;
|
|
|
|
std::unordered_map<String, UInt64> & component;
|
|
|
|
UInt64 & current_id;
|
|
|
|
|
|
|
|
Data(const ComparisonGraph & graph_,
|
|
|
|
std::set<UInt64> & components_,
|
|
|
|
std::unordered_map<String, String> & old_name_,
|
|
|
|
std::unordered_map<String, UInt64> & component_,
|
|
|
|
UInt64 & current_id_)
|
|
|
|
: graph(graph_)
|
|
|
|
, components(components_)
|
|
|
|
, old_name(old_name_)
|
|
|
|
, component(component_)
|
|
|
|
, current_id(current_id_)
|
2021-04-26 11:26:54 +00:00
|
|
|
{
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
static void visit(ASTPtr & ast, Data & data)
|
|
|
|
{
|
2021-05-04 14:34:00 +00:00
|
|
|
const auto id = data.graph.getComponentId(ast);
|
2021-04-26 11:26:54 +00:00
|
|
|
if (id)
|
2021-04-29 18:12:51 +00:00
|
|
|
{
|
2021-05-04 14:34:00 +00:00
|
|
|
const String name = COMPONENT + std::to_string(id.value()) + COMPONENT_SEPARATOR + std::to_string(++data.current_id);
|
|
|
|
data.old_name[name] = ast->getAliasOrColumnName();
|
|
|
|
data.component[name] = id.value();
|
|
|
|
ast = std::make_shared<ASTIdentifier>(name);
|
2021-04-29 18:12:51 +00:00
|
|
|
data.components.insert(id.value());
|
|
|
|
}
|
2021-04-26 11:26:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool needChildVisit(const ASTPtr &, const ASTPtr &)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
using ComponentVisitor = ComponentMatcher::Visitor;
|
|
|
|
|
|
|
|
|
2021-04-29 18:12:51 +00:00
|
|
|
void collectIdentifiers(const ASTPtr & ast, std::unordered_set<String> & identifiers)
|
2021-04-26 11:26:54 +00:00
|
|
|
{
|
2021-04-29 18:12:51 +00:00
|
|
|
const auto * identifier = ast->as<ASTIdentifier>();
|
|
|
|
if (identifier)
|
|
|
|
identifiers.insert(identifier->name());
|
|
|
|
else
|
2021-04-26 11:26:54 +00:00
|
|
|
{
|
2021-04-29 18:12:51 +00:00
|
|
|
for (const auto & child : ast->children)
|
|
|
|
collectIdentifiers(child, identifiers);
|
|
|
|
}
|
|
|
|
}
|
2021-04-26 11:26:54 +00:00
|
|
|
|
2021-04-29 18:12:51 +00:00
|
|
|
/// Price of reading a column (or several columns), expressed as a pair of sizes.
/// Prices are ordered by compressed size first and by uncompressed size second.
struct ColumnPrice
{
    size_t compressed_size = 0;
    size_t uncompressed_size = 0;

    /// A default-constructed price is zero in both components.
    ColumnPrice() = default;

    ColumnPrice(const size_t compressed_size_, const size_t uncompressed_size_)
        : compressed_size(compressed_size_)
        , uncompressed_size(uncompressed_size_)
    {
    }

    /// Lexicographic comparison on (compressed_size, uncompressed_size).
    bool operator<(const ColumnPrice & that) const
    {
        if (compressed_size != that.compressed_size)
            return compressed_size < that.compressed_size;
        return uncompressed_size < that.uncompressed_size;
    }

    /// Component-wise sum.
    ColumnPrice operator+(ColumnPrice that) const
    {
        return ColumnPrice(compressed_size + that.compressed_size, uncompressed_size + that.uncompressed_size);
    }

    ColumnPrice & operator+=(const ColumnPrice & that)
    {
        compressed_size += that.compressed_size;
        uncompressed_size += that.uncompressed_size;
        return *this;
    }

    /// Component-wise difference. The fields are unsigned, so subtracting more than is stored wraps around.
    ColumnPrice & operator-=(const ColumnPrice & that)
    {
        compressed_size -= that.compressed_size;
        uncompressed_size -= that.uncompressed_size;
        return *this;
    }
};
|
2021-04-26 11:26:54 +00:00
|
|
|
|
2021-04-24 15:24:32 +00:00
|
|
|
class SubstituteColumnMatcher
|
|
|
|
{
|
|
|
|
public:
|
2021-04-26 11:26:54 +00:00
|
|
|
using Visitor = InDepthNodeVisitor<SubstituteColumnMatcher, false>;
|
2021-04-24 15:24:32 +00:00
|
|
|
|
|
|
|
struct Data
|
|
|
|
{
|
2021-05-04 14:34:00 +00:00
|
|
|
std::unordered_map<UInt64, ASTPtr> id_to_expression_map;
|
|
|
|
std::unordered_map<String, UInt64> name_to_component_id;
|
|
|
|
std::unordered_map<String, String> old_name;
|
|
|
|
bool is_select;
|
2021-04-24 15:24:32 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static void visit(ASTPtr & ast, Data & data)
|
|
|
|
{
|
2021-04-26 11:26:54 +00:00
|
|
|
const auto * identifier = ast->as<ASTIdentifier>();
|
2021-05-04 14:34:00 +00:00
|
|
|
if (identifier && data.name_to_component_id.contains(identifier->name()))
|
|
|
|
{
|
|
|
|
const auto name = identifier->name();
|
|
|
|
const auto component_id = data.name_to_component_id.at(name);
|
|
|
|
ast = data.id_to_expression_map.at(component_id)->clone();
|
|
|
|
if (data.is_select)
|
|
|
|
ast->setAlias(data.old_name.at(name));
|
|
|
|
}
|
2021-04-24 15:24:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool needChildVisit(const ASTPtr &, const ASTPtr &)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
using SubstituteColumnVisitor = SubstituteColumnMatcher::Visitor;
|
2021-04-29 18:12:51 +00:00
|
|
|
|
|
|
|
/// Returns the total price of the columns named in `identifiers`.
///
/// An identifier that has no entry in `column_prices` contributes nothing to the total.
/// It used to be looked up with at(), which threw std::out_of_range and aborted the whole
/// optimization for any identifier without known sizes.
ColumnPrice calculatePrice(
    const std::unordered_map<std::string, ColumnPrice> & column_prices,
    const std::unordered_set<String> & identifiers)
{
    ColumnPrice result(0, 0);
    for (const auto & ident : identifiers)
    {
        const auto it = column_prices.find(ident);
        if (it != column_prices.end())
            result += it->second;
    }
    return result;
}
|
|
|
|
|
|
|
|
// TODO: branch-and-bound
|
|
|
|
void bruteforce(
|
|
|
|
const ComparisonGraph & graph,
|
|
|
|
const std::vector<UInt64> & components,
|
|
|
|
size_t current_component,
|
|
|
|
const std::unordered_map<std::string, ColumnPrice> & column_prices,
|
|
|
|
ColumnPrice current_price,
|
|
|
|
std::vector<ASTPtr> & expressions_stack,
|
|
|
|
ColumnPrice & min_price,
|
|
|
|
std::vector<ASTPtr> & min_expressions)
|
|
|
|
{
|
|
|
|
if (current_component == components.size())
|
|
|
|
{
|
|
|
|
if (current_price < min_price)
|
|
|
|
{
|
|
|
|
min_price = current_price;
|
|
|
|
min_expressions = expressions_stack;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for (const auto & ast : graph.getComponent(components[current_component]))
|
|
|
|
{
|
|
|
|
std::unordered_set<String> identifiers;
|
|
|
|
collectIdentifiers(ast, identifiers);
|
|
|
|
ColumnPrice expression_price = calculatePrice(column_prices, identifiers);
|
|
|
|
|
|
|
|
expressions_stack.push_back(ast);
|
|
|
|
current_price += expression_price;
|
|
|
|
|
|
|
|
std::unordered_map<std::string, ColumnPrice> new_prices(column_prices);
|
|
|
|
for (const auto & identifier : identifiers)
|
|
|
|
new_prices[identifier] = ColumnPrice(0, 0);
|
|
|
|
|
|
|
|
bruteforce(graph,
|
|
|
|
components,
|
|
|
|
current_component + 1,
|
|
|
|
new_prices,
|
|
|
|
current_price,
|
|
|
|
expressions_stack,
|
|
|
|
min_price,
|
|
|
|
min_expressions);
|
|
|
|
|
|
|
|
current_price -= expression_price;
|
|
|
|
expressions_stack.pop_back();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-24 15:24:32 +00:00
|
|
|
}
|
|
|
|
|
2021-04-26 11:26:54 +00:00
|
|
|
|
2021-04-24 15:24:32 +00:00
|
|
|
/// Remembers the query to optimize together with the metadata snapshot and storage used by perform().
/// The aliases, source column set and tables-with-columns arguments are accepted but not stored.
SubstituteColumnOptimizer::SubstituteColumnOptimizer(
    ASTSelectQuery * select_query_,
    Aliases & /*aliases_*/,
    const NameSet & /*source_columns_set_*/,
    const std::vector<TableWithColumnNamesAndTypes> & /*tables_with_columns_*/,
    const StorageMetadataPtr & metadata_snapshot_,
    const ConstStoragePtr & storage_)
    : select_query(select_query_)
    // Disabled initializers, kept for reference:
    // , aliases(aliases_)
    // , source_columns_set(source_columns_set_)
    // , tables_with_columns(tables_with_columns_)
    , metadata_snapshot(metadata_snapshot_)
    , storage(storage_)
{
}
|
|
|
|
|
|
|
|
void SubstituteColumnOptimizer::perform()
|
|
|
|
{
|
|
|
|
if (!storage)
|
|
|
|
return;
|
2021-04-29 18:12:51 +00:00
|
|
|
const auto column_sizes = storage->getColumnSizes();
|
|
|
|
if (column_sizes.empty())
|
|
|
|
{
|
|
|
|
Poco::Logger::get("SubstituteColumnOptimizer").information("skip: column sizes not available");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2021-04-24 15:24:32 +00:00
|
|
|
const auto compare_graph = metadata_snapshot->getConstraints().getGraph();
|
2021-04-26 11:26:54 +00:00
|
|
|
|
2021-05-04 14:34:00 +00:00
|
|
|
// Fill aliases
|
|
|
|
if (select_query->select())
|
|
|
|
{
|
|
|
|
auto * list = select_query->refSelect()->as<ASTExpressionList>();
|
|
|
|
if (!list)
|
|
|
|
throw Exception("Bad select list.", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
for (ASTPtr & ast : list->children)
|
|
|
|
ast->setAlias(ast->getAliasOrColumnName());
|
|
|
|
}
|
|
|
|
|
2021-04-26 11:26:54 +00:00
|
|
|
auto run_for_all = [&](const auto func) {
|
|
|
|
if (select_query->where())
|
2021-05-04 14:34:00 +00:00
|
|
|
func(select_query->refWhere(), false);
|
2021-04-26 11:26:54 +00:00
|
|
|
if (select_query->prewhere())
|
2021-05-04 14:34:00 +00:00
|
|
|
func(select_query->refPrewhere(), false);
|
2021-04-26 11:26:54 +00:00
|
|
|
if (select_query->select())
|
2021-05-04 14:34:00 +00:00
|
|
|
func(select_query->refSelect(), true);
|
2021-04-26 11:26:54 +00:00
|
|
|
if (select_query->having())
|
2021-05-04 14:34:00 +00:00
|
|
|
func(select_query->refHaving(), false);
|
2021-04-26 11:26:54 +00:00
|
|
|
};
|
|
|
|
|
2021-04-29 18:12:51 +00:00
|
|
|
std::set<UInt64> components;
|
2021-05-04 14:34:00 +00:00
|
|
|
std::unordered_map<String, String> old_name;
|
|
|
|
std::unordered_map<String, UInt64> name_to_component;
|
|
|
|
UInt64 counter_id = 0;
|
|
|
|
ComponentVisitor::Data component_data(
|
|
|
|
compare_graph, components, old_name, name_to_component, counter_id);
|
2021-04-29 18:12:51 +00:00
|
|
|
std::unordered_set<String> identifiers;
|
2021-05-04 14:34:00 +00:00
|
|
|
auto preprocess = [&](ASTPtr & ast, bool) {
|
2021-04-26 11:26:54 +00:00
|
|
|
ComponentVisitor(component_data).visit(ast);
|
2021-04-29 18:12:51 +00:00
|
|
|
collectIdentifiers(ast, identifiers);
|
2021-04-26 11:26:54 +00:00
|
|
|
};
|
|
|
|
|
2021-04-29 18:12:51 +00:00
|
|
|
run_for_all(preprocess);
|
|
|
|
|
|
|
|
const auto primary_key = metadata_snapshot->getColumnsRequiredForPrimaryKey();
|
|
|
|
const std::unordered_set<std::string_view> primary_key_set(std::begin(primary_key), std::end(primary_key));
|
|
|
|
std::unordered_map<std::string, ColumnPrice> column_prices;
|
|
|
|
for (const auto & [column_name, column_size] : column_sizes)
|
|
|
|
column_prices[column_name] = ColumnPrice(
|
|
|
|
column_size.data_compressed + COLUMN_PENALTY, column_size.data_uncompressed);
|
|
|
|
for (const auto & column_name : primary_key)
|
|
|
|
column_prices[column_name] = ColumnPrice(0, 0);
|
|
|
|
for (const auto & column_name : identifiers)
|
|
|
|
column_prices[column_name] = ColumnPrice(0, 0);
|
|
|
|
|
2021-05-04 14:34:00 +00:00
|
|
|
std::unordered_map<UInt64, ASTPtr> id_to_expression_map;
|
2021-04-29 18:12:51 +00:00
|
|
|
std::vector<UInt64> components_list;
|
|
|
|
for (const UInt64 component : components)
|
|
|
|
if (compare_graph.getComponent(component).size() == 1)
|
2021-05-04 14:34:00 +00:00
|
|
|
id_to_expression_map[component] = compare_graph.getComponent(component).front();
|
2021-04-29 18:12:51 +00:00
|
|
|
else
|
|
|
|
components_list.push_back(component);
|
|
|
|
|
|
|
|
std::vector<ASTPtr> expressions_stack;
|
|
|
|
ColumnPrice min_price(std::numeric_limits<size_t>::max(), std::numeric_limits<size_t>::max());
|
|
|
|
std::vector<ASTPtr> min_expressions;
|
|
|
|
bruteforce(compare_graph,
|
|
|
|
components_list,
|
|
|
|
0,
|
|
|
|
column_prices,
|
|
|
|
ColumnPrice(0, 0),
|
|
|
|
expressions_stack,
|
|
|
|
min_price,
|
|
|
|
min_expressions);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < min_expressions.size(); ++i)
|
2021-05-04 14:34:00 +00:00
|
|
|
id_to_expression_map[components_list[i]] = min_expressions[i];
|
2021-04-29 18:12:51 +00:00
|
|
|
|
2021-05-04 14:34:00 +00:00
|
|
|
auto process = [&](ASTPtr & ast, bool is_select) {
|
|
|
|
SubstituteColumnVisitor::Data substitute_data{id_to_expression_map, name_to_component, old_name, is_select};
|
2021-04-26 11:26:54 +00:00
|
|
|
SubstituteColumnVisitor(substitute_data).visit(ast);
|
|
|
|
};
|
|
|
|
|
|
|
|
run_for_all(process);
|
2021-04-24 15:24:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|