mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
rewrite ExpressionAnalyzer.collectUsedColumns CLICKHOUSE-3996
This commit is contained in:
parent
8f306e8b45
commit
6ea13516f0
113
dbms/src/Interpreters/ColumnNamesContext.cpp
Normal file
113
dbms/src/Interpreters/ColumnNamesContext.cpp
Normal file
@ -0,0 +1,113 @@
|
||||
#include <Interpreters/ColumnNamesContext.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
bool ColumnNamesContext::addTableAliasIfAny(const IAST & ast)
|
||||
{
|
||||
String alias = ast.tryGetAlias();
|
||||
if (alias.empty())
|
||||
return false;
|
||||
|
||||
table_aliases.insert(alias);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ColumnNamesContext::addColumnAliasIfAny(const IAST & ast, bool is_public)
|
||||
{
|
||||
String alias = ast.tryGetAlias();
|
||||
if (alias.empty())
|
||||
return false;
|
||||
|
||||
if (required_names.count(alias))
|
||||
masked_columns.insert(alias);
|
||||
|
||||
if (is_public)
|
||||
public_columns.insert(alias);
|
||||
column_aliases.insert(alias);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ColumnNamesContext::addColumnIdentifier(const ASTIdentifier & node, bool is_public)
|
||||
{
|
||||
if (!node.general())
|
||||
return;
|
||||
|
||||
required_names.insert(node.name);
|
||||
|
||||
if (!addColumnAliasIfAny(node, is_public) && is_public)
|
||||
public_columns.insert(node.name);
|
||||
}
|
||||
|
||||
bool ColumnNamesContext::addArrayJoinAliasIfAny(const IAST & ast)
|
||||
{
|
||||
String alias = ast.tryGetAlias();
|
||||
if (alias.empty())
|
||||
return false;
|
||||
|
||||
array_join_columns.insert(alias);
|
||||
return true;
|
||||
}
|
||||
|
||||
void ColumnNamesContext::addArrayJoinIdentifier(const ASTIdentifier & node)
|
||||
{
|
||||
array_join_columns.insert(node.name);
|
||||
}
|
||||
|
||||
/// Builds the set of required column names out of `required_names`.
/// Skipped are:
///  - names that are ARRAY JOIN columns (directly or via their nested table name);
///  - names that are column aliases, unless the alias is masked.
NameSet ColumnNamesContext::requiredColumns() const
{
    NameSet result;

    for (const auto & column : required_names)
    {
        const String nested_table = Nested::extractTableName(column);

        /// Tech debt. There's its own logic for ARRAY JOIN columns.
        const bool belongs_to_array_join = array_join_columns.count(column) || array_join_columns.count(nested_table);
        if (belongs_to_array_join)
            continue;

        const bool is_unmasked_alias = column_aliases.count(column) && !masked_columns.count(column);
        if (!is_unmasked_alias)
            result.insert(column);
    }

    return result;
}
|
||||
|
||||
std::ostream & operator << (std::ostream & os, const ColumnNamesContext & cols)
|
||||
{
|
||||
os << "required_names: ";
|
||||
for (const auto & x : cols.required_names)
|
||||
os << "'" << x << "' ";
|
||||
os << "source_tables: ";
|
||||
for (const auto & x : cols.tables)
|
||||
{
|
||||
auto alias = x.alias();
|
||||
auto name = x.name();
|
||||
if (alias && name)
|
||||
os << "'" << *alias << "'/'" << *name << "' ";
|
||||
else if (alias)
|
||||
os << "'" << *alias << "' ";
|
||||
else if (name)
|
||||
os << "'" << *name << "' ";
|
||||
}
|
||||
os << "table_aliases: ";
|
||||
for (const auto & x : cols.table_aliases)
|
||||
os << "'" << x << "' ";
|
||||
os << "private_aliases: ";
|
||||
for (const auto & x : cols.private_aliases)
|
||||
os << "'" << x << "' ";
|
||||
os << "column_aliases: ";
|
||||
for (const auto & x : cols.column_aliases)
|
||||
os << "'" << x << "' ";
|
||||
os << "public_columns: ";
|
||||
for (const auto & x : cols.public_columns)
|
||||
os << "'" << x << "' ";
|
||||
os << "masked_columns: ";
|
||||
for (const auto & x : cols.masked_columns)
|
||||
os << "'" << x << "' ";
|
||||
os << "array_join_columns: ";
|
||||
for (const auto & x : cols.array_join_columns)
|
||||
os << "'" << x << "' ";
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
74
dbms/src/Interpreters/ColumnNamesContext.h
Normal file
74
dbms/src/Interpreters/ColumnNamesContext.h
Normal file
@ -0,0 +1,74 @@
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
|
||||
#include <optional>
|
||||
|
||||
#include <Common/typeid_cast.h>
|
||||
#include <Core/Names.h>
|
||||
#include <Parsers/ASTIdentifier.h>
|
||||
#include <Parsers/ASTTablesInSelectQuery.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// Information about table and column names extracted from ASTSelectQuery block. Do not include info from subselects.
|
||||
struct ColumnNamesContext
|
||||
{
|
||||
struct JoinedTable
|
||||
{
|
||||
const ASTTableExpression * expr;
|
||||
const ASTTableJoin * join;
|
||||
|
||||
std::optional<String> alias() const
|
||||
{
|
||||
String alias;
|
||||
if (expr->database_and_table_name)
|
||||
alias = expr->database_and_table_name->tryGetAlias();
|
||||
else if (expr->table_function)
|
||||
alias = expr->table_function->tryGetAlias();
|
||||
else if (expr->subquery)
|
||||
alias = expr->subquery->tryGetAlias();
|
||||
if (!alias.empty())
|
||||
return alias;
|
||||
return {};
|
||||
}
|
||||
|
||||
std::optional<String> name() const
|
||||
{
|
||||
if (auto * node = expr->database_and_table_name.get())
|
||||
if (auto * identifier = typeid_cast<const ASTIdentifier *>(node))
|
||||
return identifier->name;
|
||||
return {};
|
||||
}
|
||||
|
||||
std::optional<ASTTableJoin::Kind> joinKind() const
|
||||
{
|
||||
if (join)
|
||||
return join->kind;
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
NameSet required_names;
|
||||
NameSet table_aliases;
|
||||
NameSet private_aliases;
|
||||
NameSet column_aliases;
|
||||
NameSet masked_columns;
|
||||
NameSet public_columns;
|
||||
NameSet array_join_columns;
|
||||
std::vector<JoinedTable> tables; /// ordered list of visited tables in FROM section with joins
|
||||
bool has_table_join = false;
|
||||
bool has_array_join = false;
|
||||
|
||||
bool addTableAliasIfAny(const IAST & ast);
|
||||
bool addColumnAliasIfAny(const IAST & ast, bool is_public = false);
|
||||
void addColumnIdentifier(const ASTIdentifier & node, bool is_public = false);
|
||||
bool addArrayJoinAliasIfAny(const IAST & ast);
|
||||
void addArrayJoinIdentifier(const ASTIdentifier & node);
|
||||
|
||||
NameSet requiredColumns() const;
|
||||
};
|
||||
|
||||
std::ostream & operator << (std::ostream & os, const ColumnNamesContext & cols);
|
||||
|
||||
}
|
@ -106,10 +106,10 @@ ExpressionAnalyzer::ExpressionAnalyzer(
|
||||
removeDuplicateColumns(source_columns);
|
||||
}
|
||||
|
||||
/// Delete the unnecessary from `source_columns` list. Create `unknown_required_source_columns`. Form `columns_added_by_join`.
|
||||
/// Delete the unnecessary from `source_columns` list. Form `columns_added_by_join`.
|
||||
collectUsedColumns();
|
||||
|
||||
/// external_tables, subqueries_for_sets for global subqueries.
|
||||
/// external_tables, subqueries_for_sets for global subqueries.
|
||||
/// Replaces global subqueries with the generated names of temporary tables that will be sent to remote servers.
|
||||
initGlobalSubqueriesAndExternalTables();
|
||||
|
||||
@ -1001,6 +1001,15 @@ void ExpressionAnalyzer::getAggregateInfo(Names & key_names, AggregateDescriptio
|
||||
aggregates = aggregate_descriptions;
|
||||
}
|
||||
|
||||
/// db.table.column -> table.column / table.column -> column
|
||||
static String cropDatabaseOrTableName(const String & name)
|
||||
{
|
||||
size_t pos = name.find('.', 0);
|
||||
if (pos != std::string::npos)
|
||||
return name.substr(pos + 1, name.size() - pos - 1);
|
||||
return name;
|
||||
}
|
||||
|
||||
void ExpressionAnalyzer::collectUsedColumns()
|
||||
{
|
||||
/** Calculate which columns are required to execute the expression.
|
||||
@ -1008,83 +1017,131 @@ void ExpressionAnalyzer::collectUsedColumns()
|
||||
* After execution, columns will only contain the list of columns needed to read from the table.
|
||||
*/
|
||||
|
||||
NameSet required;
|
||||
NameSet ignored;
|
||||
RequiredSourceColumnsVisitor::Data columns_context;
|
||||
RequiredSourceColumnsVisitor(columns_context).visit(query);
|
||||
|
||||
NameSet available_columns;
|
||||
for (const auto & column : source_columns)
|
||||
available_columns.insert(column.name);
|
||||
NameSet required = columns_context.requiredColumns();
|
||||
|
||||
if (select_query && select_query->array_join_expression_list())
|
||||
#if 0
|
||||
std::cerr << "Query: " << query << std::endl;
|
||||
std::cerr << "CTX: " << columns_context << std::endl;
|
||||
std::cerr << "source_columns: ";
|
||||
for (const auto & name : source_columns)
|
||||
std::cerr << "'" << name.name << "' ";
|
||||
std::cerr << "required: ";
|
||||
for (const auto & name : required)
|
||||
std::cerr << "'" << name << "' ";
|
||||
std::cerr << std::endl;
|
||||
#endif
|
||||
|
||||
if (columns_context.has_table_join)
|
||||
{
|
||||
ASTs & expressions = select_query->array_join_expression_list()->children;
|
||||
for (size_t i = 0; i < expressions.size(); ++i)
|
||||
const AnalyzedJoin & analyzed_join = analyzedJoin();
|
||||
#if 0
|
||||
std::cerr << "key_names_left: ";
|
||||
for (const auto & name : analyzed_join.key_names_left)
|
||||
std::cerr << "'" << name << "' ";
|
||||
std::cerr << "key_names_right: ";
|
||||
for (const auto & name : analyzed_join.key_names_right)
|
||||
std::cerr << "'" << name << "' ";
|
||||
std::cerr << "columns_from_joined_table: ";
|
||||
for (const auto & column : analyzed_join.columns_from_joined_table)
|
||||
std::cerr << "'" << column.name_and_type.name << '/' << column.original_name << "' ";
|
||||
std::cerr << "available_joined_columns: ";
|
||||
for (const auto & column : analyzed_join.available_joined_columns)
|
||||
std::cerr << "'" << column.name_and_type.name << '/' << column.original_name << "' ";
|
||||
std::cerr << std::endl;
|
||||
#endif
|
||||
NameSet avaliable_columns;
|
||||
for (const auto & name : source_columns)
|
||||
avaliable_columns.insert(name.name);
|
||||
|
||||
NameSet right_keys;
|
||||
for (const auto & right_key_name : analyzed_join.key_names_right)
|
||||
right_keys.insert(right_key_name);
|
||||
|
||||
/** You also need to ignore the identifiers of the columns that are obtained by JOIN.
|
||||
* (Do not assume that they are required for reading from the "left" table).
|
||||
*/
|
||||
columns_added_by_join.clear();
|
||||
for (const auto & joined_column : analyzed_join.available_joined_columns)
|
||||
{
|
||||
/// Ignore the top-level identifiers from the ARRAY JOIN section.
|
||||
/// Then add them separately.
|
||||
if (typeid_cast<ASTIdentifier *>(expressions[i].get()))
|
||||
auto & name = joined_column.name_and_type.name;
|
||||
if (required.count(name) && !avaliable_columns.count(name))
|
||||
{
|
||||
ignored.insert(expressions[i]->getColumnName());
|
||||
}
|
||||
else
|
||||
{
|
||||
/// Nothing needs to be ignored for expressions in ARRAY JOIN.
|
||||
NameSet empty;
|
||||
RequiredSourceColumnsVisitor::Data visitor_data{available_columns, required, empty, empty, empty};
|
||||
RequiredSourceColumnsVisitor(visitor_data).visit(expressions[i]);
|
||||
}
|
||||
columns_added_by_join.push_back(joined_column);
|
||||
required.erase(name);
|
||||
|
||||
ignored.insert(expressions[i]->getAliasOrColumnName());
|
||||
/// Some columns from right join key may be used in query. This columns will be appended to block during join.
|
||||
if (right_keys.count(name))
|
||||
columns_added_by_join_from_right_keys.insert(name);
|
||||
}
|
||||
}
|
||||
|
||||
/// @fix filter required columns according to misqualified names in JOIN ON
|
||||
if (columns_context.has_table_join &&
|
||||
columns_context.tables.size() >= 2 &&
|
||||
columns_context.tables[1].join &&
|
||||
columns_context.tables[1].join->on_expression)
|
||||
{
|
||||
NameSet fixed_required;
|
||||
|
||||
for (const auto & req_name : required)
|
||||
{
|
||||
bool collated = false;
|
||||
String cropped_name = req_name;
|
||||
static const constexpr size_t max_column_prefix = 2;
|
||||
|
||||
for (size_t i = 0; i < max_column_prefix && !collated; ++i)
|
||||
{
|
||||
cropped_name = cropDatabaseOrTableName(cropped_name);
|
||||
|
||||
if (avaliable_columns.count(cropped_name))
|
||||
{
|
||||
fixed_required.insert(cropped_name);
|
||||
collated = true;
|
||||
break;
|
||||
}
|
||||
|
||||
for (const auto & joined_column : analyzed_join.available_joined_columns)
|
||||
{
|
||||
auto & name = joined_column.name_and_type.name;
|
||||
|
||||
if (cropped_name == name)
|
||||
{
|
||||
columns_added_by_join.push_back(joined_column);
|
||||
if (right_keys.count(name))
|
||||
columns_added_by_join_from_right_keys.insert(name);
|
||||
collated = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!collated)
|
||||
fixed_required.insert(req_name);
|
||||
}
|
||||
|
||||
required.swap(fixed_required);
|
||||
}
|
||||
|
||||
/// @note required_columns_from_joined_table is output
|
||||
joined_block_actions = analyzed_join.createJoinedBlockActions(
|
||||
columns_added_by_join, select_query, context, required_columns_from_joined_table);
|
||||
}
|
||||
|
||||
/** You also need to ignore the identifiers of the columns that are obtained by JOIN.
|
||||
* (Do not assume that they are required for reading from the "left" table).
|
||||
*/
|
||||
NameSet available_joined_columns;
|
||||
for (const auto & joined_column : analyzedJoin().available_joined_columns)
|
||||
available_joined_columns.insert(joined_column.name_and_type.name);
|
||||
|
||||
NameSet required_joined_columns;
|
||||
|
||||
for (const auto & left_key_ast : syntax->analyzed_join.key_asts_left)
|
||||
if (columns_context.has_array_join)
|
||||
{
|
||||
NameSet empty;
|
||||
RequiredSourceColumnsVisitor::Data columns_data{available_columns, required, ignored, empty, required_joined_columns};
|
||||
ASTPtr tmp = left_key_ast;
|
||||
RequiredSourceColumnsVisitor(columns_data).visit(tmp);
|
||||
/// Insert the columns required for the ARRAY JOIN calculation into the required columns list.
|
||||
NameSet array_join_sources;
|
||||
for (const auto & result_source : syntax->array_join_result_to_source)
|
||||
array_join_sources.insert(result_source.second);
|
||||
|
||||
for (const auto & column_name_type : source_columns)
|
||||
if (array_join_sources.count(column_name_type.name))
|
||||
required.insert(column_name_type.name);
|
||||
}
|
||||
|
||||
RequiredSourceColumnsVisitor::Data columns_visitor_data{available_columns, required, ignored,
|
||||
available_joined_columns, required_joined_columns};
|
||||
RequiredSourceColumnsVisitor(columns_visitor_data).visit(query);
|
||||
|
||||
columns_added_by_join = analyzedJoin().available_joined_columns;
|
||||
for (auto it = columns_added_by_join.begin(); it != columns_added_by_join.end();)
|
||||
{
|
||||
if (required_joined_columns.count(it->name_and_type.name))
|
||||
++it;
|
||||
else
|
||||
columns_added_by_join.erase(it++);
|
||||
}
|
||||
|
||||
joined_block_actions = analyzedJoin().createJoinedBlockActions(
|
||||
columns_added_by_join, select_query, context, required_columns_from_joined_table);
|
||||
|
||||
/// Some columns from right join key may be used in query. This columns will be appended to block during join.
|
||||
for (const auto & right_key_name : analyzedJoin().key_names_right)
|
||||
if (required_joined_columns.count(right_key_name))
|
||||
columns_added_by_join_from_right_keys.insert(right_key_name);
|
||||
|
||||
/// Insert the columns required for the ARRAY JOIN calculation into the required columns list.
|
||||
NameSet array_join_sources;
|
||||
for (const auto & result_source : syntax->array_join_result_to_source)
|
||||
array_join_sources.insert(result_source.second);
|
||||
|
||||
for (const auto & column_name_type : source_columns)
|
||||
if (array_join_sources.count(column_name_type.name))
|
||||
required.insert(column_name_type.name);
|
||||
|
||||
/// You need to read at least one column to find the number of rows.
|
||||
if (select_query && required.empty())
|
||||
required.insert(ExpressionActions::getSmallestColumn(source_columns));
|
||||
@ -1118,9 +1175,17 @@ void ExpressionAnalyzer::collectUsedColumns()
|
||||
}
|
||||
|
||||
if (!unknown_required_source_columns.empty())
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << columns_context;
|
||||
ss << "source_columns: ";
|
||||
for (const auto & name : source_columns)
|
||||
ss << "'" << name.name << "' ";
|
||||
|
||||
throw Exception("Unknown identifier: " + *unknown_required_source_columns.begin()
|
||||
+ (select_query && !select_query->tables ? ". Note that there is no tables (FROM clause) in your query" : ""),
|
||||
ErrorCodes::UNKNOWN_IDENTIFIER);
|
||||
+ (select_query && !select_query->tables ? ". Note that there is no tables (FROM clause) in your query" : "")
|
||||
+ ", context: " + ss.str(), ErrorCodes::UNKNOWN_IDENTIFIER);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -232,8 +232,7 @@ private:
|
||||
const AnalyzedJoin & analyzedJoin() const { return syntax->analyzed_join; }
|
||||
|
||||
/** Remove all unnecessary columns from the list of all available columns of the table (`columns`).
|
||||
* At the same time, form a set of unknown columns (`unknown_required_source_columns`),
|
||||
* as well as the columns added by JOIN (`columns_added_by_join`).
|
||||
* At the same time, form a set of columns added by JOIN (`columns_added_by_join`).
|
||||
*/
|
||||
void collectUsedColumns();
|
||||
|
||||
|
232
dbms/src/Interpreters/RequiredSourceColumnsVisitor.cpp
Normal file
232
dbms/src/Interpreters/RequiredSourceColumnsVisitor.cpp
Normal file
@ -0,0 +1,232 @@
|
||||
#include <Interpreters/RequiredSourceColumnsVisitor.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
#include <Core/Names.h>
|
||||
#include <Parsers/IAST.h>
|
||||
#include <Parsers/ASTIdentifier.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTSelectQuery.h>
|
||||
#include <Parsers/ASTSubquery.h>
|
||||
#include <Parsers/ASTTablesInSelectQuery.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
static std::vector<String> extractNamesFromLambda(const ASTFunction & node)
|
||||
{
|
||||
if (node.arguments->children.size() != 2)
|
||||
throw Exception("lambda requires two arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
ASTFunction * lambda_args_tuple = typeid_cast<ASTFunction *>(node.arguments->children[0].get());
|
||||
|
||||
if (!lambda_args_tuple || lambda_args_tuple->name != "tuple")
|
||||
throw Exception("First argument of lambda must be a tuple", ErrorCodes::TYPE_MISMATCH);
|
||||
|
||||
std::vector<String> names;
|
||||
for (auto & child : lambda_args_tuple->arguments->children)
|
||||
{
|
||||
ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(child.get());
|
||||
if (!identifier)
|
||||
throw Exception("lambda argument declarations must be identifiers", ErrorCodes::TYPE_MISMATCH);
|
||||
|
||||
names.push_back(identifier->name);
|
||||
}
|
||||
|
||||
return names;
|
||||
}
|
||||
|
||||
/// Tells the visitor whether it should descend from `node` into `child` by itself.
bool RequiredSourceColumnsMatcher::needChildVisit(ASTPtr & node, const ASTPtr & child)
{
    /// Never enter a select query through the generic traversal.
    const bool child_is_select = typeid_cast<ASTSelectQuery *>(child.get()) != nullptr;
    if (child_is_select)
        return false;

    /// These nodes are fully processed by the matcher. Do not need children.
    const bool processed_by_matcher =
        typeid_cast<ASTIdentifier *>(node.get())
        || typeid_cast<ASTTableExpression *>(node.get())
        || typeid_cast<ASTArrayJoin *>(node.get())
        || typeid_cast<ASTSelectQuery *>(node.get());
    if (processed_by_matcher)
        return false;

    /// "indexHint" is a special function for index analysis. Everything that is inside it is not calculated. @sa KeyCondition
    /// "lambda" visit children itself.
    if (auto * function = typeid_cast<ASTFunction *>(node.get()))
        return function->name != "indexHint" && function->name != "lambda";

    return true;
}
|
||||
|
||||
/// Entry point of the matcher: dispatches on the concrete type of the node.
/// Returns the list of nodes that the visitor has to visit additionally (may be empty).
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTPtr & ast, Data & data)
{
    /// Nodes whose results are columns: save the alias, then process the node itself.

    if (auto * identifier = typeid_cast<ASTIdentifier *>(ast.get()))
    {
        data.addColumnAliasIfAny(*ast);
        visit(*identifier, ast, data);
        return {};
    }

    if (auto * function = typeid_cast<ASTFunction *>(ast.get()))
    {
        data.addColumnAliasIfAny(*ast);
        visit(*function, ast, data);
        return {};
    }

    /// Nodes whose results are tables.

    if (auto * tables_element = typeid_cast<ASTTablesInSelectQueryElement *>(ast.get()))
    {
        visit(*tables_element, ast, data);
        return {};
    }

    if (auto * table_expression = typeid_cast<ASTTableExpression *>(ast.get()))
    {
        /// No addTableAliasIfAny() here: alias is attached to child.
        /// The nodes returned by the typed visit are intentionally not passed to the visitor.
        visit(*table_expression, ast, data);
        return {};
    }

    if (auto * select = typeid_cast<ASTSelectQuery *>(ast.get()))
    {
        data.addTableAliasIfAny(*ast);
        return visit(*select, ast, data);
    }

    if (typeid_cast<ASTSubquery *>(ast.get()))
    {
        data.addTableAliasIfAny(*ast);
        return {};
    }

    /// Everything else.

    if (auto * array_join = typeid_cast<ASTArrayJoin *>(ast.get()))
    {
        data.has_array_join = true;
        return visit(*array_join, ast, data);
    }

    return {};
}
|
||||
|
||||
/// Registers the top-level SELECT items as public columns/aliases and returns the children of the query
/// in the order they have to be visited: everything except the select list first, the select list last.
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTSelectQuery & select, const ASTPtr &, Data & data)
{
    /// Top-level SELECT items are a special case: they are public.
    for (auto & item : select.select_expression_list->children)
    {
        auto * identifier = typeid_cast<ASTIdentifier *>(item.get());
        if (identifier)
            data.addColumnIdentifier(*identifier, true);
        else
            data.addColumnAliasIfAny(*item, true);
    }

    std::vector<ASTPtr *> to_visit;
    for (auto & child : select.children)
    {
        if (child == select.select_expression_list)
            continue;
        to_visit.push_back(&child);
    }

    /// The select list (with children) goes last, when all the aliases are set.
    to_visit.push_back(&select.select_expression_list);
    return to_visit;
}
|
||||
|
||||
void RequiredSourceColumnsMatcher::visit(const ASTIdentifier & node, const ASTPtr &, Data & data)
|
||||
{
|
||||
if (node.name.empty())
|
||||
throw Exception("Expected not empty name", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
if (!data.private_aliases.count(node.name))
|
||||
data.addColumnIdentifier(node);
|
||||
}
|
||||
|
||||
void RequiredSourceColumnsMatcher::visit(const ASTFunction & node, const ASTPtr &, Data & data)
|
||||
{
|
||||
/// Do not add formal parameters of the lambda expression
|
||||
if (node.name == "lambda")
|
||||
{
|
||||
Names local_aliases;
|
||||
for (const auto & name : extractNamesFromLambda(node))
|
||||
if (data.private_aliases.insert(name).second)
|
||||
local_aliases.push_back(name);
|
||||
|
||||
/// visit child with masked local aliases
|
||||
visit(node.arguments->children[1], data);
|
||||
|
||||
for (const auto & name : local_aliases)
|
||||
data.private_aliases.erase(name);
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends an entry for the element of the FROM section to `data.tables`
/// and raises `has_table_join` if the element contains a join.
/// Either part of the entry stays nullptr when the element has no child of the corresponding type.
void RequiredSourceColumnsMatcher::visit(ASTTablesInSelectQueryElement & node, const ASTPtr &, Data & data)
{
    ASTTableExpression * table_expression = nullptr;
    ASTTableJoin * table_join = nullptr;

    /// If there are several children of the same type, the last one wins.
    for (auto & child : node.children)
    {
        if (auto * found_expression = typeid_cast<ASTTableExpression *>(child.get()))
            table_expression = found_expression;
        if (auto * found_join = typeid_cast<ASTTableJoin *>(child.get()))
            table_join = found_join;
    }

    if (table_join != nullptr)
        data.has_table_join = true;

    data.tables.push_back(ColumnNamesContext::JoinedTable{table_expression, table_join});
}
|
||||
|
||||
/// Saves the table alias of every part of the table expression that is set
/// and returns the table function and the subquery (if present) as the nodes to visit further.
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(ASTTableExpression & node, const ASTPtr &, Data & data)
{
    std::vector<ASTPtr *> to_visit;

    /// ASTIdentifiers here are tables. Do not visit them as generic ones: only the alias is taken.
    if (node.database_and_table_name)
        data.addTableAliasIfAny(*node.database_and_table_name);

    /// Table function and subquery: take the alias and pass the node on.
    for (ASTPtr * part : {&node.table_function, &node.subquery})
    {
        if (!*part)
            continue;

        data.addTableAliasIfAny(**part);
        to_visit.push_back(part);
    }

    return to_visit;
}
|
||||
|
||||
std::vector<ASTPtr *> RequiredSourceColumnsMatcher::visit(const ASTArrayJoin & node, const ASTPtr &, Data & data)
|
||||
{
|
||||
ASTPtr expression_list = node.expression_list;
|
||||
if (!expression_list || expression_list->children.empty())
|
||||
throw Exception("Expected not empty expression_list", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
std::vector<ASTPtr *> out;
|
||||
|
||||
/// Tech debt. Ignore ARRAY JOIN top-level identifiers and aliases. There's its own logic for them.
|
||||
for (auto & expr : expression_list->children)
|
||||
{
|
||||
data.addArrayJoinAliasIfAny(*expr);
|
||||
|
||||
if (auto * identifier = typeid_cast<ASTIdentifier *>(expr.get()))
|
||||
{
|
||||
data.addArrayJoinIdentifier(*identifier);
|
||||
continue;
|
||||
}
|
||||
|
||||
out.push_back(&expr);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
}
|
@ -1,140 +1,45 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Names.h>
|
||||
#include <Parsers/IAST.h>
|
||||
#include <Parsers/ASTIdentifier.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTSelectQuery.h>
|
||||
#include <Parsers/ASTTablesInSelectQuery.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
#include "InDepthNodeVisitor.h"
|
||||
#include <Interpreters/ColumnNamesContext.h>
|
||||
#include <Interpreters/InDepthNodeVisitor.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TYPE_MISMATCH;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
/** Get a set of necessary columns to read from the table.
|
||||
* In this case, the columns specified in ignored_names are considered unnecessary. And the ignored_names parameter can be modified.
|
||||
* The set of columns available_joined_columns are the columns available from JOIN, they are not needed for reading from the main table.
|
||||
* Put in required_joined_columns the set of columns available from JOIN and needed.
|
||||
*/
|
||||
class ASTIdentifier;
|
||||
class ASTFunction;
|
||||
class ASTSelectQuery;
|
||||
struct ASTTablesInSelectQueryElement;
|
||||
struct ASTArrayJoin;
|
||||
struct ASTTableExpression;
|
||||
|
||||
class RequiredSourceColumnsMatcher
|
||||
{
|
||||
public:
|
||||
struct Data
|
||||
{
|
||||
const NameSet & available_columns;
|
||||
NameSet & required_source_columns;
|
||||
NameSet & ignored_names;
|
||||
const NameSet & available_joined_columns;
|
||||
NameSet & required_joined_columns;
|
||||
};
|
||||
using Data = ColumnNamesContext;
|
||||
|
||||
static constexpr const char * label = "RequiredSourceColumns";
|
||||
|
||||
static bool needChildVisit(ASTPtr & node, const ASTPtr & child)
|
||||
{
|
||||
/// We will not go to the ARRAY JOIN section, because we need to look at the names of non-ARRAY-JOIN columns.
|
||||
/// There, `collectUsedColumns` will send us separately.
|
||||
if (typeid_cast<ASTSelectQuery *>(child.get()) ||
|
||||
typeid_cast<ASTArrayJoin *>(child.get()) ||
|
||||
typeid_cast<ASTTableExpression *>(child.get()) ||
|
||||
typeid_cast<ASTTableJoin *>(child.get()))
|
||||
return false;
|
||||
|
||||
/// Processed. Do not need children.
|
||||
if (typeid_cast<ASTIdentifier *>(node.get()))
|
||||
return false;
|
||||
|
||||
if (auto * f = typeid_cast<ASTFunction *>(node.get()))
|
||||
{
|
||||
/// "indexHint" is a special function for index analysis. Everything that is inside it is not calculated. @sa KeyCondition
|
||||
/// "lambda" visit children itself.
|
||||
if (f->name == "indexHint" || f->name == "lambda")
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Find all the identifiers in the query.
|
||||
* We will use depth first search in AST.
|
||||
* In this case
|
||||
* - for lambda functions we will not take formal parameters;
|
||||
* - do not go into subqueries (they have their own identifiers);
|
||||
* - there is some exception for the ARRAY JOIN clause (it has a slightly different identifiers);
|
||||
* - we put identifiers available from JOIN in required_joined_columns.
|
||||
*/
|
||||
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data)
|
||||
{
|
||||
if (auto * t = typeid_cast<ASTIdentifier *>(ast.get()))
|
||||
visit(*t, ast, data);
|
||||
if (auto * t = typeid_cast<ASTFunction *>(ast.get()))
|
||||
visit(*t, ast, data);
|
||||
return {};
|
||||
}
|
||||
static bool needChildVisit(ASTPtr & node, const ASTPtr & child);
|
||||
static std::vector<ASTPtr *> visit(ASTPtr & ast, Data & data);
|
||||
|
||||
private:
|
||||
static void visit(const ASTIdentifier & node, const ASTPtr &, Data & data)
|
||||
{
|
||||
if (node.general()
|
||||
&& !data.ignored_names.count(node.name)
|
||||
&& !data.ignored_names.count(Nested::extractTableName(node.name)))
|
||||
{
|
||||
/// Read column from left table if has.
|
||||
if (!data.available_joined_columns.count(node.name) || data.available_columns.count(node.name))
|
||||
data.required_source_columns.insert(node.name);
|
||||
else
|
||||
data.required_joined_columns.insert(node.name);
|
||||
}
|
||||
}
|
||||
|
||||
static void visit(const ASTFunction & node, const ASTPtr &, Data & data)
|
||||
{
|
||||
NameSet & ignored_names = data.ignored_names;
|
||||
|
||||
if (node.name == "lambda")
|
||||
{
|
||||
if (node.arguments->children.size() != 2)
|
||||
throw Exception("lambda requires two arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
ASTFunction * lambda_args_tuple = typeid_cast<ASTFunction *>(node.arguments->children.at(0).get());
|
||||
|
||||
if (!lambda_args_tuple || lambda_args_tuple->name != "tuple")
|
||||
throw Exception("First argument of lambda must be a tuple", ErrorCodes::TYPE_MISMATCH);
|
||||
|
||||
/// You do not need to add formal parameters of the lambda expression in required_source_columns.
|
||||
Names added_ignored;
|
||||
for (auto & child : lambda_args_tuple->arguments->children)
|
||||
{
|
||||
ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(child.get());
|
||||
if (!identifier)
|
||||
throw Exception("lambda argument declarations must be identifiers", ErrorCodes::TYPE_MISMATCH);
|
||||
|
||||
String & name = identifier->name;
|
||||
if (!ignored_names.count(name))
|
||||
{
|
||||
ignored_names.insert(name);
|
||||
added_ignored.push_back(name);
|
||||
}
|
||||
}
|
||||
|
||||
/// @note It's a special case where we visit children inside the matcher, not in visitor.
|
||||
visit(node.arguments->children[1], data);
|
||||
|
||||
for (size_t i = 0; i < added_ignored.size(); ++i)
|
||||
ignored_names.erase(added_ignored[i]);
|
||||
}
|
||||
}
|
||||
static void visit(const ASTIdentifier & node, const ASTPtr &, Data & data);
|
||||
static void visit(const ASTFunction & node, const ASTPtr &, Data & data);
|
||||
static void visit(ASTTablesInSelectQueryElement & node, const ASTPtr &, Data & data);
|
||||
static std::vector<ASTPtr *> visit(ASTTableExpression & node, const ASTPtr &, Data & data);
|
||||
static std::vector<ASTPtr *> visit(const ASTArrayJoin & node, const ASTPtr &, Data & data);
|
||||
static std::vector<ASTPtr *> visit(ASTSelectQuery & select, const ASTPtr &, Data & data);
|
||||
};
|
||||
|
||||
/// Get a set of necessary columns to read from the table.
|
||||
using RequiredSourceColumnsVisitor = InDepthNodeVisitor<RequiredSourceColumnsMatcher, true>;
|
||||
/// Extracts all the information about columns and tables from ASTSelectQuery block into ColumnNamesContext object.
|
||||
/// It doesn't use anything but AST. It visits nodes from bottom to top except ASTFunction content to get aliases in right manner.
|
||||
/// @note There's some ambiguousness with nested columns names that can't be solved without schema.
|
||||
using RequiredSourceColumnsVisitor = InDepthNodeVisitor<RequiredSourceColumnsMatcher, false>;
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user