From c45002a2eebff403632ead4ed5a34957dd443fdc Mon Sep 17 00:00:00 2001 From: Artem Zuikov Date: Fri, 20 Mar 2020 03:58:20 +0300 Subject: [PATCH] multiple join rewriter v2 (in progress) --- .../JoinToSubqueryTransformVisitor.cpp | 422 ++++++++++++------ 1 file changed, 292 insertions(+), 130 deletions(-) diff --git a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index 00cb6059a6a..6cf280f4fa7 100644 --- a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -34,12 +34,19 @@ namespace ASTPtr makeSubqueryTemplate() { ParserTablesInSelectQueryElement parser(true); - ASTPtr subquery_template = parseQuery(parser, "(select * from _t)", 0); + ASTPtr subquery_template = parseQuery(parser, "(select * from _t) as `--s`", 0); if (!subquery_template) throw Exception("Cannot parse subquery template", ErrorCodes::LOGICAL_ERROR); return subquery_template; } +ASTPtr makeSubqueryQualifiedAsterisk() +{ + auto asterisk = std::make_shared(); + asterisk->children.emplace_back(std::make_shared("--s")); + return asterisk; +} + /// Replace asterisks in select_expression_list with column identifiers class ExtractAsterisksMatcher { @@ -50,6 +57,7 @@ public: std::vector tables_order; std::shared_ptr new_select_expression_list; + /// V1 Data(const Context & context, const std::vector & table_expressions) { tables_order.reserve(table_expressions.size()); @@ -69,6 +77,19 @@ public: } } + /// V2 + Data(const std::vector & tables) + { + tables_order.reserve(tables.size()); + for (const auto & table : tables) + { + String table_name = table.table.getQualifiedNamePrefix(false); + NamesAndTypesList columns = table.columns; + tables_order.push_back(table_name); + table_columns.emplace(std::move(table_name), std::move(columns)); + } + } + void addTableColumns(const String & table_name) { auto it = table_columns.find(table_name); @@ -380,9 +401,10 @@ struct 
CollectColumnIdentifiersMatcher static bool needChildVisit(const ASTPtr & node, const ASTPtr &) { - /// Do not go into subqueries. Do not collect table identifiers. + /// Do not go into subqueries. Do not collect table identifiers. Do not get identifier from 't.*'. return !node->as() && - !node->as(); + !node->as() && + !node->as(); } static void visit(const ASTPtr & ast, Data & data) @@ -396,45 +418,45 @@ struct CollectColumnIdentifiersMatcher data.push_back(const_cast(&ident)); } }; +using CollectColumnIdentifiersVisitor = ConstInDepthNodeVisitor; -struct TableNeededColumns +struct CheckAliasDependencyVisitorData { - const DatabaseAndTableWithAlias & table; - NameSet no_clashes = {}; - NameSet column_clashes = {}; /// It's column for sure - NameSet alias_clashes = {}; /// It's column or alias + using TypeToVisit = ASTIdentifier; - void fillExpressionList(ASTExpressionList & expression_list) const + const Aliases & aliases; + const ASTIdentifier * dependency = nullptr; + + void visit(ASTIdentifier & ident, ASTPtr &) { - size_t columns_count = no_clashes.size() + column_clashes.size() + alias_clashes.size(); - expression_list.children.reserve(expression_list.children.size() + columns_count); - - String table_name = table.getQualifiedNamePrefix(false); - - for (auto & column : no_clashes) - addShortName(column, expression_list); - - for (auto & column : column_clashes) - addAliasedName(table_name, column, expression_list); - - for (auto & column : alias_clashes) - addShortName(column, expression_list); - } - - static void addShortName(const String & column, ASTExpressionList & expression_list) - { - auto ident = std::make_shared(column); - expression_list.children.emplace_back(std::move(ident)); - } - - /// t.x as `t.x` - static void addAliasedName(const String & table, const String & column, ASTExpressionList & expression_list) - { - auto ident = std::make_shared(std::vector{table, column}); - ident->setAlias(table + '.' 
+ column); - expression_list.children.emplace_back(std::move(ident)); + if (!dependency && aliases.count(ident.name)) + dependency = &ident; } }; +using CheckAliasDependencyMatcher = OneTypeMatcher; +using CheckAliasDependencyVisitor = InDepthNodeVisitor; + +struct RewriteWithAliasMatcher +{ + using Data = std::unordered_map; + + static bool needChildVisit(const ASTPtr & node, const ASTPtr &) + { + return !node->as(); + } + + static void visit(ASTPtr & ast, Data & data) + { + String alias = ast->tryGetAlias(); + if (!alias.empty()) + { + auto it = data.find(alias); + if (it != data.end() && it->second.get() == ast.get()) + ast = std::make_shared(alias); + } + } +}; +using RewriteWithAliasVisitor = InDepthNodeVisitor; class SubqueryExpressionsRewriteMatcher { @@ -442,9 +464,7 @@ public: struct Data { ASTPtr expression_list; - const String & alias; - bool rewritten = false; - bool aliased = false; + bool done = false; }; static bool needChildVisit(ASTPtr & node, ASTPtr &) @@ -456,28 +476,179 @@ public: { if (auto * t = ast->as()) visit(*t, ast, data); - if (auto * t = ast->as()) - visit(*t, ast, data); } private: static void visit(ASTSelectQuery & select, ASTPtr &, Data & data) { - if (!data.rewritten) + if (!data.done) select.setExpression(ASTSelectQuery::Expression::SELECT, std::move(data.expression_list)); - data.rewritten = true; + data.done = true; + } +}; +using SubqueryExpressionsRewriteVisitor = InDepthNodeVisitor; + +struct TableNeededColumns +{ + const DatabaseAndTableWithAlias & table; + NameSet no_clashes = {}; + NameSet alias_clashes = {}; + std::unordered_map column_clashes = {}; + + void fillExpressionList(ASTExpressionList & expression_list) const + { + size_t columns_count = no_clashes.size() + column_clashes.size() + alias_clashes.size(); + expression_list.children.reserve(expression_list.children.size() + columns_count); + + String table_name = table.getQualifiedNamePrefix(false); + + for (auto & column : no_clashes) + addShortName(column, 
expression_list); + + for (auto & column : alias_clashes) + addShortName(column, expression_list); + + for (auto & [column, alias] : column_clashes) + addAliasedName(table_name, column, alias, expression_list); } - static void visit(ASTSubquery &, ASTPtr & ast, Data & data) + static void addShortName(const String & column, ASTExpressionList & expression_list) { - if (!data.aliased) - ast->setAlias(data.alias); - data.aliased = true; + auto ident = std::make_shared(column); + expression_list.children.emplace_back(std::move(ident)); + } + + /// t.x as `some` + static void addAliasedName(const String & table, const String & column, const String & alias, ASTExpressionList & expression_list) + { + auto ident = std::make_shared(std::vector{table, column}); + ident->setAlias(alias); + expression_list.children.emplace_back(std::move(ident)); } }; -using CollectColumnIdentifiersVisitor = ConstInDepthNodeVisitor; -using SubqueryExpressionsRewriteVisitor = InDepthNodeVisitor; +class UniqueShortNames +{ +public: + String get(const String & long_name) + { + auto it = names.find(long_name); + if (it != names.end()) + return it->second; + + String unique_name = generateUniqueName(); + names.emplace(long_name, unique_name); + return unique_name; + } + +private: + std::unordered_map names; + size_t counter = 0; + + String generateUniqueName() + { + static constexpr const char * pattern = "--x"; + return String(pattern) + std::to_string(counter++); + } +}; + +size_t countSuchColumns(const std::vector & tables, const String & short_name) +{ + size_t count = 0; + for (auto & table : tables) + if (table.hasColumn(short_name)) + ++count; + return count; +} + +/// Find clashes and normalize names +/// 1. If column name has no clashes make all its occurrences short: 'table.column' -> 'column', 'table_alias.column' -> 'column'. +/// 2. If column name can't be short cause of alias with same name generate and use unique name for it. +/// 3. 
If column clashes with another column generate and use unique names for them. +/// 4. If column clashes with another column and it's short - it's 'ambiguous column' error. +/// 5. If column clashes with alias add short column name to select list. It would be removed later if not needed. +std::vector normalizeColumnNamesExtractNeeded( + const std::vector & tables, + const Aliases & aliases, + std::vector & identifiers) +{ + UniqueShortNames unique_names; + size_t last_table_pos = tables.size() - 1; + + std::vector needed_columns; + needed_columns.reserve(tables.size()); + for (auto & table : tables) + needed_columns.push_back(TableNeededColumns{table.table}); + + for (ASTIdentifier * ident : identifiers) + { + bool got_alias = aliases.count(ident->name); + + if (auto table_pos = IdentifierSemantic::chooseTable(*ident, tables)) + { + if (!ident->isShort()) + { + if (got_alias) + throw Exception("Alias clashes with qualified column '" + ident->name + "'", ErrorCodes::AMBIGUOUS_COLUMN_NAME); + + String short_name = ident->shortName(); + size_t count = countSuchColumns(tables, short_name); + + if (count > 1 || aliases.count(short_name)) + { + auto & table = tables[*table_pos]; + IdentifierSemantic::setColumnLongName(*ident, table.table); /// table.column -> table_alias.column + + /// For tables moved into subselects we need unique short names for clashed names + if (*table_pos != last_table_pos) + { + auto & unique_long_name = ident->name; + String unique_short_name = unique_names.get(unique_long_name); + ident->setShortName(unique_short_name); + needed_columns[*table_pos].column_clashes.emplace(short_name, unique_short_name); + } + } + else + { + ident->setShortName(short_name); /// table.column -> column + needed_columns[*table_pos].no_clashes.emplace(std::move(short_name)); + } + } + else if (got_alias) + needed_columns[*table_pos].alias_clashes.emplace(ident->shortName()); + else + needed_columns[*table_pos].no_clashes.emplace(ident->shortName()); + } + else if 
(!got_alias) + throw Exception("Unknown column name '" + ident->name + "'", ErrorCodes::UNKNOWN_IDENTIFIER); + } + + return needed_columns; +} + +/// Make expression list for current subselect +std::shared_ptr subqueryExpressionList( + size_t table_pos, + const std::vector & needed_columns, + const std::vector> & alias_pushdown) +{ + auto expression_list = std::make_shared(); + + /// First time extract needed left table columns manually. + /// Next times extract left table columns via QualifiedAsterisk: `--s`.* + if (table_pos == 1) + needed_columns[0].fillExpressionList(*expression_list); + else + expression_list->children.emplace_back(makeSubqueryQualifiedAsterisk()); + + /// Add needed right table columns + needed_columns[table_pos].fillExpressionList(*expression_list); + + for (auto & expr : alias_pushdown[table_pos]) + expression_list->children.emplace_back(std::move(expr)); + + return expression_list; +} } /// namelesspace @@ -493,121 +664,112 @@ void JoinToSubqueryTransformMatcher::visit(ASTPtr & ast, Data & data) } } -/// The reason for V2: not to alias columns without clashes, use better `t.x` style aliases for others. +/// The reason for V2: not to alias columns without clashes. +/// It allows USING and 'select *' for queries with subselects. It doesn't need AsterisksSemantic and related stuff. +/// 1. Expand asterisks in select expression list. +/// 2. Normalize column names and find name clashes +/// 3. Rewrite multiple JOINs with subqueries: +/// SELECT ... FROM (SELECT `--join`.*, ... FROM (...) AS `--join` JOIN tableY ON ...) AS `--join` JOIN tableZ ON ...' +/// 4. 
Push down expressions of aliases used in ON section into expression list of first related subquery void JoinToSubqueryTransformMatcher::visitV2(ASTSelectQuery & select, ASTPtr & ast, Data & data) { std::vector table_expressions; if (!needRewrite<2>(select, table_expressions)) return; - /// TODO: check table_expressions vs data.tables consistency + auto & src_tables = select.tables()->children; + size_t tables_count = src_tables.size(); + + if (table_expressions.size() != data.tables.size() || + tables_count != data.tables.size()) + throw Exception("Inconsistent tables count in JOIN rewriter", ErrorCodes::LOGICAL_ERROR); + + /// Replace * and t.* with columns in select expression list. + { + ExtractAsterisksVisitor::Data asterisks_data(data.tables); + ExtractAsterisksVisitor(asterisks_data).visit(select.select()); + if (asterisks_data.new_select_expression_list) + select.setExpression(ASTSelectQuery::Expression::SELECT, std::move(asterisks_data.new_select_expression_list)); + } /// Collect column identifiers std::vector identifiers; CollectColumnIdentifiersVisitor(identifiers).visit(ast); - /// JOIN sections - for (auto & child : select.tables()->children) + std::vector using_identifiers; + std::vector> alias_pushdown(tables_count); + std::unordered_map on_aliases; + + /// Collect columns from JOIN sections. Detect if we have aliases there (they need pushdown). + for (size_t table_pos = 0; table_pos < tables_count; ++table_pos) { - auto * table = child->as(); + auto * table = src_tables[table_pos]->as(); if (table->table_join) { auto & join = table->table_join->as(); if (join.on_expression) - CollectColumnIdentifiersVisitor(identifiers).visit(join.on_expression); - /// Nothing special for join.using_expression_list cause it contains short names - } - } - - /// Find clashes and normalize names: - /// 1. If column name has no clashes make all its occurrences short: 'table.column' -> 'column', 'table_alias.column' -> 'column'. - /// 2. 
If column name can't be short cause of same alias we keep it long converting 'table.column' -> 'table_alias.column' if any. - /// 3. If column clashes with another column keep their names long but convert 'table.column' -> 'table_alias.column' if any. - /// 4. If column clashes with another column and it's short - it's 'ambiguous column' error. - /// 5. If column clashes with alias add short column name to select list. It would be removed later if not needed. - /// @note Source query aliases should not clash with qualified names. - - std::vector needed_columns; - needed_columns.reserve(data.tables.size()); - for (auto & table : data.tables) - needed_columns.push_back(TableNeededColumns{table.table}); - NameSet alias_uses; - - for (ASTIdentifier * ident : identifiers) - { - bool got_alias = data.aliases.count(ident->name); - - if (auto table_pos = IdentifierSemantic::chooseTable(*ident, data.tables)) - { - const String & short_name = ident->shortName(); - if (!ident->isShort()) { - if (got_alias) - throw Exception("Alias clashes with qualified column '" + ident->name + "'", ErrorCodes::AMBIGUOUS_COLUMN_NAME); + std::vector on_identifiers; + CollectColumnIdentifiersVisitor(on_identifiers).visit(join.on_expression); + identifiers.insert(identifiers.end(), on_identifiers.begin(), on_identifiers.end()); - size_t count = 0; - for (auto & table : data.tables) - if (table.hasColumn(short_name)) - ++count; - - if (count > 1 || data.aliases.count(short_name)) + /// Extract aliases used in ON section for pushdown. Exclude the last table. 
+ if (table_pos < tables_count - 1) { - auto & table = data.tables[*table_pos]; - IdentifierSemantic::setColumnLongName(*ident, table.table); /// table.column -> table_alias.column - needed_columns[*table_pos].column_clashes.emplace(short_name); - } - else - { - ident->setShortName(short_name); /// table.column -> column - needed_columns[*table_pos].no_clashes.emplace(short_name); + for (auto * ident : on_identifiers) + { + auto it = data.aliases.find(ident->name); + if (!on_aliases.count(ident->name) && it != data.aliases.end()) + { + auto alias_expression = it->second; + alias_pushdown[table_pos].push_back(alias_expression); + on_aliases[ident->name] = alias_expression; + } + } } } - else if (got_alias) - needed_columns[*table_pos].alias_clashes.emplace(short_name); - else - needed_columns[*table_pos].no_clashes.emplace(short_name); + else if (join.using_expression_list) + CollectColumnIdentifiersVisitor(using_identifiers).visit(join.using_expression_list); } - else if (got_alias) - alias_uses.insert(ident->name); - else - throw Exception("Unknown column name '" + ident->name + "'", ErrorCodes::UNKNOWN_IDENTIFIER); } - /// Rewrite tables + /// Check if alias expression is too complex to push it down. + for (auto & expr : on_aliases) + { + CheckAliasDependencyVisitor::Data check{data.aliases}; + CheckAliasDependencyVisitor(check).visit(expr.second); + if (check.dependency) + throw Exception("Cannot rewrite JOINs. Alias '" + expr.first + + "' used in ON section depends on another alias '" + check.dependency->name + "'", + ErrorCodes::NOT_IMPLEMENTED); + } + + /// Check same name in aliases, USING and ON sections. Cannot push down alias to ON through USING cause of name masquerading. + for (auto * ident : using_identifiers) + if (on_aliases.count(ident->name)) + throw Exception("Cannot rewrite JOINs. 
Alias '" + ident->name + "' appears both in ON and USING", ErrorCodes::NOT_IMPLEMENTED); + using_identifiers.clear(); + + /// Replace pushdowned expressions with aliases names in original expression lists. + RewriteWithAliasVisitor(on_aliases).visit(ast); + on_aliases.clear(); + + std::vector needed_columns = + normalizeColumnNamesExtractNeeded(data.tables, data.aliases, identifiers); + + /// Rewrite JOINs with subselects - auto & src_tables = select.tables()->children; ASTPtr left_table = src_tables[0]; static ASTPtr subquery_template = makeSubqueryTemplate(); - static constexpr const char * join_subquery_alias = "--join"; for (size_t i = 1; i < src_tables.size() - 1; ++i) { - String prev_join_alias = String(join_subquery_alias) + std::to_string(i-1); - String current_join_alias = String(join_subquery_alias) + std::to_string(i); - - auto expression_list = std::make_shared(); - { - if (i == 1) - { - /// First time extract needed left table columns manually - needed_columns[0].fillExpressionList(*expression_list); - } - else - { - /// Next times extract left tables via QualifiedAsterisk - auto asterisk = std::make_shared(); - asterisk->children.emplace_back(std::make_shared(prev_join_alias)); - expression_list->children.emplace_back(std::move(asterisk)); - } - - /// Add needed right table columns - needed_columns[i].fillExpressionList(*expression_list); - } + auto expression_list = subqueryExpressionList(i, needed_columns, alias_pushdown); ASTPtr subquery = subquery_template->clone(); - SubqueryExpressionsRewriteVisitor::Data expr_rewrite_data{std::move(expression_list), current_join_alias}; + SubqueryExpressionsRewriteVisitor::Data expr_rewrite_data{std::move(expression_list)}; SubqueryExpressionsRewriteVisitor(expr_rewrite_data).visit(subquery); left_table = replaceJoin(left_table, src_tables[i], subquery);