#include #include #include #include #include #include #include #include #include #include #include #include /// getSmallestColumn() #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace DB { namespace ErrorCodes { extern const int EMPTY_LIST_OF_COLUMNS_QUERIED; extern const int EMPTY_NESTED_TABLE; extern const int EXPECTED_ALL_OR_ANY; extern const int INVALID_JOIN_ON_EXPRESSION; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int UNKNOWN_IDENTIFIER; extern const int UNEXPECTED_EXPRESSION; } namespace { using LogAST = DebugASTLog; /// set to true to enable logs void optimizeGroupingSets(ASTPtr & query) { GroupingSetsRewriterVisitor::Data data; GroupingSetsRewriterVisitor(data).visit(query); } /// Select implementation of a function based on settings. /// Important that it is done as query rewrite. It means rewritten query /// will be sent to remote servers during distributed query execution, /// and on all remote servers, function implementation will be same. 
template struct CustomizeFunctionsData { using TypeToVisit = ASTFunction; const String & customized_func_name; void visit(ASTFunction & func, ASTPtr &) const { if (Poco::toLower(func.name) == func_name) { func.name = customized_func_name; } } }; char countdistinct[] = "countdistinct"; using CustomizeCountDistinctVisitor = InDepthNodeVisitor>, true>; char countifdistinct[] = "countifdistinct"; using CustomizeCountIfDistinctVisitor = InDepthNodeVisitor>, true>; char countdistinctif[] = "countdistinctif"; using CustomizeCountDistinctIfVisitor = InDepthNodeVisitor>, true>; char in[] = "in"; using CustomizeInVisitor = InDepthNodeVisitor>, true>; char notIn[] = "notin"; using CustomizeNotInVisitor = InDepthNodeVisitor>, true>; char globalIn[] = "globalin"; using CustomizeGlobalInVisitor = InDepthNodeVisitor>, true>; char globalNotIn[] = "globalnotin"; using CustomizeGlobalNotInVisitor = InDepthNodeVisitor>, true>; template struct CustomizeFunctionsSuffixData { using TypeToVisit = ASTFunction; const String & customized_func_suffix; void visit(ASTFunction & func, ASTPtr &) const { if (endsWith(Poco::toLower(func.name), func_suffix)) { size_t prefix_len = func.name.length() - strlen(func_suffix); func.name = func.name.substr(0, prefix_len) + customized_func_suffix; } } }; /// Swap 'if' and 'distinct' suffixes to make execution more optimal. char ifDistinct[] = "ifdistinct"; using CustomizeIfDistinctVisitor = InDepthNodeVisitor>, true>; /// Used to rewrite all aggregate functions to add -OrNull suffix to them if setting `aggregate_functions_null_for_empty` is set. 
struct CustomizeAggregateFunctionsSuffixData { using TypeToVisit = ASTFunction; const String & customized_func_suffix; void visit(ASTFunction & func, ASTPtr &) const { const auto & instance = AggregateFunctionFactory::instance(); if (instance.isAggregateFunctionName(func.name) && !endsWith(func.name, customized_func_suffix) && !endsWith(func.name, customized_func_suffix + "If")) { auto properties = instance.tryGetProperties(func.name, func.nulls_action); if (properties && !properties->returns_default_when_only_null) { func.name += customized_func_suffix; } } } }; // Used to rewrite aggregate functions with -OrNull suffix in some cases, such as sumIfOrNull, we should rewrite to sumOrNullIf struct CustomizeAggregateFunctionsMoveSuffixData { using TypeToVisit = ASTFunction; const String & customized_func_suffix; String moveSuffixAhead(const String & name) const { auto prefix = name.substr(0, name.size() - customized_func_suffix.size()); auto prefix_size = prefix.size(); if (endsWith(prefix, "MergeState")) return prefix.substr(0, prefix_size - 10) + customized_func_suffix + "MergeState"; if (endsWith(prefix, "Merge")) return prefix.substr(0, prefix_size - 5) + customized_func_suffix + "Merge"; if (endsWith(prefix, "State")) return prefix.substr(0, prefix_size - 5) + customized_func_suffix + "State"; if (endsWith(prefix, "If")) return prefix.substr(0, prefix_size - 2) + customized_func_suffix + "If"; return name; } void visit(ASTFunction & func, ASTPtr &) const { const auto & instance = AggregateFunctionFactory::instance(); if (instance.isAggregateFunctionName(func.name)) { if (endsWith(func.name, customized_func_suffix)) { auto properties = instance.tryGetProperties(func.name, func.nulls_action); if (properties && !properties->returns_default_when_only_null) { func.name = moveSuffixAhead(func.name); } } } } }; using CustomizeAggregateFunctionsOrNullVisitor = InDepthNodeVisitor, true>; using CustomizeAggregateFunctionsMoveOrNullVisitor = InDepthNodeVisitor, true>; 
struct ExistsExpressionData { using TypeToVisit = ASTFunction; static void visit(ASTFunction & func, ASTPtr) { bool exists_expression = func.name == "exists" && func.arguments && func.arguments->children.size() == 1 && typeid_cast(func.arguments->children[0].get()); if (!exists_expression) return; /// EXISTS(subquery) --> 1 IN (SELECT 1 FROM subquery LIMIT 1) auto subquery_node = func.arguments->children[0]; auto table_expression = std::make_shared(); table_expression->subquery = std::move(subquery_node); table_expression->children.push_back(table_expression->subquery); auto tables_in_select_element = std::make_shared(); tables_in_select_element->table_expression = std::move(table_expression); tables_in_select_element->children.push_back(tables_in_select_element->table_expression); auto tables_in_select = std::make_shared(); tables_in_select->children.push_back(std::move(tables_in_select_element)); auto select_expr_list = std::make_shared(); select_expr_list->children.push_back(std::make_shared(1u)); auto select_query = std::make_shared(); select_query->children.push_back(select_expr_list); select_query->setExpression(ASTSelectQuery::Expression::SELECT, select_expr_list); select_query->setExpression(ASTSelectQuery::Expression::TABLES, tables_in_select); ASTPtr limit_length_ast = std::make_shared(Field(static_cast(1))); select_query->setExpression(ASTSelectQuery::Expression::LIMIT_LENGTH, std::move(limit_length_ast)); auto select_with_union_query = std::make_shared(); select_with_union_query->list_of_selects = std::make_shared(); select_with_union_query->list_of_selects->children.push_back(std::move(select_query)); select_with_union_query->children.push_back(select_with_union_query->list_of_selects); auto new_subquery = std::make_shared(); new_subquery->children.push_back(select_with_union_query); auto function = makeASTFunction("in", std::make_shared(1u), new_subquery); func = *function; } }; using ExistsExpressionVisitor = InDepthNodeVisitor, false>; struct 
ReplacePositionalArgumentsData { using TypeToVisit = ASTSelectQuery; static void visit(ASTSelectQuery & select_query, ASTPtr &) { if (select_query.groupBy()) { for (auto & expr : select_query.groupBy()->children) replaceForPositionalArguments(expr, &select_query, ASTSelectQuery::Expression::GROUP_BY); } if (select_query.orderBy()) { for (auto & expr : select_query.orderBy()->children) { auto & elem = assert_cast(*expr).children.at(0); replaceForPositionalArguments(elem, &select_query, ASTSelectQuery::Expression::ORDER_BY); } } if (select_query.limitBy()) { for (auto & expr : select_query.limitBy()->children) replaceForPositionalArguments(expr, &select_query, ASTSelectQuery::Expression::LIMIT_BY); } } }; using ReplacePositionalArgumentsVisitor = InDepthNodeVisitor, false>; /// Translate qualified names such as db.table.column, table.column, table_alias.column to names' normal form. /// Expand asterisks and qualified asterisks with column names. /// There would be columns in normal form & column aliases after translation. Column & column alias would be normalized in QueryNormalizer. void translateQualifiedNames(ASTPtr & query, const ASTSelectQuery & select_query, const NameSet & source_columns_set, const TablesWithColumns & tables_with_columns) { LogAST log; TranslateQualifiedNamesVisitor::Data visitor_data(source_columns_set, tables_with_columns, true/* has_columns */); TranslateQualifiedNamesVisitor visitor(visitor_data, log.stream()); visitor.visit(query); /// This may happen after expansion of COLUMNS('regexp'). 
if (select_query.select()->children.empty()) throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED, "Empty list of columns in SELECT query"); } bool hasArrayJoin(const ASTPtr & ast) { if (const ASTFunction * function = ast->as()) if (function->name == "arrayJoin") return true; for (const auto & child : ast->children) if (!child->as() && hasArrayJoin(child)) return true; return false; } /// Keep number of columns for 'GLOBAL IN (SELECT 1 AS a, a)' void renameDuplicatedColumns(const ASTSelectQuery * select_query) { ASTs & elements = select_query->select()->children; std::set all_column_names; std::set assigned_column_names; for (auto & expr : elements) all_column_names.insert(expr->getAliasOrColumnName()); for (auto & expr : elements) { auto name = expr->getAliasOrColumnName(); if (!assigned_column_names.insert(name).second) { size_t i = 1; while (all_column_names.end() != all_column_names.find(name + "_" + toString(i))) ++i; name = name + "_" + toString(i); expr = expr->clone(); /// Cancels fuse of the same expressions in the tree. expr->setAlias(name); all_column_names.insert(name); assigned_column_names.insert(name); } } } /// Sometimes we have to calculate more columns in SELECT clause than will be returned from query. /// This is the case when we have DISTINCT or arrayJoin: we require more columns in SELECT even if we need less columns in result. /// Also we have to remove duplicates in case of GLOBAL subqueries. Their results are placed into tables so duplicates are impossible. /// Also remove all INTERPOLATE columns which are not in SELECT anymore. void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups) { ASTs & elements = select_query->select()->children; std::map required_columns_with_duplicate_count; if (!required_result_columns.empty()) { /// Some columns may be queried multiple times, like SELECT x, y, y FROM table. 
for (const auto & name : required_result_columns) { if (remove_dups) required_columns_with_duplicate_count[name] = 1; else ++required_columns_with_duplicate_count[name]; } } else if (remove_dups) { /// Even if we have no requirements there could be duplicates cause of asterisks. SELECT *, t.* for (const auto & elem : elements) required_columns_with_duplicate_count.emplace(elem->getAliasOrColumnName(), 1); } else return; NameSet required_by_interpolate; if (select_query->interpolate()) { auto & children = select_query->interpolate()->children; if (!children.empty()) { NameToNameSetMap expressions; auto interpolate_visitor = [](const ASTPtr ast, NameSet & columns) -> void { auto interpolate_visitor_impl = [](const ASTPtr node, NameSet & cols, auto self) -> void { if (const auto * ident = node->as()) cols.insert(ident->name()); else if (const auto * func = node->as()) for (const auto & elem : func->arguments->children) self(elem, cols, self); }; interpolate_visitor_impl(ast, columns, interpolate_visitor_impl); }; for (const auto & elem : children) { if (auto * interpolate = elem->as()) { NameSet needed_columns; interpolate_visitor(interpolate->expr, needed_columns); expressions.emplace(interpolate->column, std::move(needed_columns)); } } for (const auto & name : required_result_columns) if (const auto it = expressions.find(name); it != expressions.end()) required_by_interpolate.insert(it->second.begin(), it->second.end()); } } ASTs new_elements; new_elements.reserve(elements.size()); NameSet remove_columns; for (const auto & elem : elements) { String name = elem->getAliasOrColumnName(); auto it = required_columns_with_duplicate_count.find(name); if (required_columns_with_duplicate_count.end() != it && it->second) { new_elements.push_back(elem); --it->second; } else if (required_by_interpolate.contains(name)) { /// Columns required by interpolate expression are not always in the required_result_columns new_elements.push_back(elem); } else if (select_query->distinct || 
hasArrayJoin(elem)) { /// ARRAY JOIN cannot be optimized out since it may change number of rows, /// so as DISTINCT. new_elements.push_back(elem); } else { remove_columns.insert(name); ASTFunction * func = elem->as(); /// Never remove untuple. It's result column may be in required columns. /// It is not easy to analyze untuple here, because types were not calculated yet. if (func && func->name == "untuple") new_elements.push_back(elem); /// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong if (func && !select_query->groupBy()) { GetAggregatesVisitor::Data data = {}; GetAggregatesVisitor(data).visit(elem); if (!data.aggregates.empty()) new_elements.push_back(elem); } } } if (select_query->interpolate()) { auto & children = select_query->interpolate()->children; if (!children.empty()) { for (auto * it = children.begin(); it != children.end();) { if (remove_columns.contains((*it)->as()->column)) it = select_query->interpolate()->children.erase(it); else ++it; } if (children.empty()) select_query->setExpression(ASTSelectQuery::Expression::INTERPOLATE, nullptr); } } elements = std::move(new_elements); } /// Replacing scalar subqueries with constant values. 
void executeScalarSubqueries( ASTPtr & query, ContextPtr context, size_t subquery_depth, Scalars & scalars, Scalars & local_scalars, bool only_analyze, bool is_create_parameterized_view) { LogAST log; ExecuteScalarSubqueriesVisitor::Data visitor_data{ WithContext{context}, subquery_depth, scalars, local_scalars, only_analyze, is_create_parameterized_view, /*replace_only_to_literals=*/ false, /*max_literal_size=*/ std::nullopt}; ExecuteScalarSubqueriesVisitor(visitor_data, log.stream()).visit(query); } void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const ASTSelectQuery * select_query, const NamesAndTypesList & source_columns, const NameSet & source_columns_set) { if (!select_query->arrayJoinExpressionList().first) return; ArrayJoinedColumnsVisitor::Data visitor_data{ result.aliases, result.array_join_name_to_alias, result.array_join_alias_to_name, result.array_join_result_to_source}; ArrayJoinedColumnsVisitor(visitor_data).visit(query); /// If the result of ARRAY JOIN is not used, it is necessary to ARRAY-JOIN any column, /// to get the correct number of rows. if (result.array_join_result_to_source.empty()) { if (select_query->arrayJoinExpressionList().first->children.empty()) throw DB::Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "ARRAY JOIN requires an argument"); ASTPtr expr = select_query->arrayJoinExpressionList().first->children.at(0); String source_name = expr->getColumnName(); String result_name = expr->getAliasOrColumnName(); /// This is an array. if (!expr->as() || source_columns_set.contains(source_name)) { result.array_join_result_to_source[result_name] = source_name; } else /// This is a nested table. 
{ bool found = false; for (const auto & column : source_columns) { auto split = Nested::splitName(column.name, /*reverse=*/ true); if (split.first == source_name && !split.second.empty()) { result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name; found = true; break; } } if (!found) throw Exception(ErrorCodes::EMPTY_NESTED_TABLE, "No columns in nested table {}", source_name); } } } void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_default_strictness, bool old_any, std::shared_ptr & analyzed_join) { const ASTTablesInSelectQueryElement * node = select_query.join(); if (!node) return; auto & table_join = const_cast(node)->table_join->as(); if (table_join.strictness == JoinStrictness::Unspecified && table_join.kind != JoinKind::Cross) { if (join_default_strictness == JoinStrictness::Any) table_join.strictness = JoinStrictness::Any; else if (join_default_strictness == JoinStrictness::All) table_join.strictness = JoinStrictness::All; else throw Exception(DB::ErrorCodes::EXPECTED_ALL_OR_ANY, "Expected ANY or ALL in JOIN section, because setting (join_default_strictness) is empty"); } if (old_any) { if (table_join.strictness == JoinStrictness::Any && table_join.kind == JoinKind::Inner) { table_join.strictness = JoinStrictness::Semi; table_join.kind = JoinKind::Left; } if (table_join.strictness == JoinStrictness::Any) table_join.strictness = JoinStrictness::RightAny; } else { if (table_join.strictness == JoinStrictness::Any && table_join.kind == JoinKind::Full) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ANY FULL JOINs are not implemented"); } analyzed_join->getTableJoin() = table_join; } /// Evaluate expression and return boolean value if it can be interpreted as bool. /// Only UInt8 or NULL are allowed. /// Returns `false` for 0 or NULL values, `true` for any non-negative value. 
std::optional tryEvaluateConstCondition(ASTPtr expr, ContextPtr context) { if (!expr) return {}; Field eval_res; DataTypePtr eval_res_type; { auto constant_expression_result = tryEvaluateConstantExpression(expr, context); if (!constant_expression_result) return {}; std::tie(eval_res, eval_res_type) = std::move(constant_expression_result.value()); } /// UInt8, maybe Nullable, maybe LowCardinality, and NULL are allowed eval_res_type = removeNullable(removeLowCardinality(eval_res_type)); if (auto which = WhichDataType(eval_res_type); !which.isUInt8() && !which.isNothing()) return {}; if (eval_res.isNull()) return false; UInt8 res = eval_res.template safeGet(); return res > 0; } bool tryJoinOnConst(TableJoin & analyzed_join, const ASTPtr & on_expression, ContextPtr context) { if (!analyzed_join.isEnabledAlgorithm(JoinAlgorithm::HASH)) return false; if (analyzed_join.strictness() == JoinStrictness::Asof) return false; if (analyzed_join.isSpecialStorage()) return false; if (auto eval_const_res = tryEvaluateConstCondition(on_expression, context)) { if (eval_const_res.value()) { /// JOIN ON 1 == 1 LOG_DEBUG(getLogger("TreeRewriter"), "Join on constant executed as cross join"); analyzed_join.resetToCross(); } else { /// JOIN ON 1 != 1 LOG_DEBUG(getLogger("TreeRewriter"), "Join on constant executed as empty join"); analyzed_join.resetKeys(); } return true; } return false; } /// Find the columns that are obtained by JOIN. 
void collectJoinedColumns(TableJoin & analyzed_join, ASTTableJoin & table_join, const TablesWithColumns & tables, const Aliases & aliases, ContextPtr context) { assert(tables.size() >= 2); if (table_join.using_expression_list) { const auto & keys = table_join.using_expression_list->as(); analyzed_join.addDisjunct(); for (const auto & key : keys.children) analyzed_join.addUsingKey(key); } else if (table_join.on_expression) { bool join_on_const_ok = tryJoinOnConst(analyzed_join, table_join.on_expression, context); if (join_on_const_ok) return; bool is_asof = (table_join.strictness == JoinStrictness::Asof); CollectJoinOnKeysVisitor::Data data{analyzed_join, tables[0], tables[1], aliases, is_asof}; if (auto * or_func = table_join.on_expression->as(); or_func && or_func->name == "or") { for (auto & disjunct : or_func->arguments->children) { analyzed_join.addDisjunct(); CollectJoinOnKeysVisitor(data).visit(disjunct); } assert(analyzed_join.getClauses().size() == or_func->arguments->children.size()); } else { analyzed_join.addDisjunct(); CollectJoinOnKeysVisitor(data).visit(table_join.on_expression); assert(analyzed_join.oneDisjunct()); } auto check_keys_empty = [] (auto e) { return e.key_names_left.empty(); }; bool any_keys_empty = std::any_of(analyzed_join.getClauses().begin(), analyzed_join.getClauses().end(), check_keys_empty); if (any_keys_empty) throw DB::Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, "Cannot get JOIN keys from JOIN ON section: '{}', found keys: {}", queryToString(table_join.on_expression), TableJoin::formatClauses(analyzed_join.getClauses())); if (is_asof) { if (!analyzed_join.oneDisjunct()) throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "ASOF join doesn't support multiple ORs for keys in JOIN ON section"); data.asofToJoinKeys(); } if (!analyzed_join.oneDisjunct() && !analyzed_join.isEnabledAlgorithm(JoinAlgorithm::HASH)) throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "Only `hash` join supports multiple ORs for keys in JOIN ON section"); } } 
std::pair recursivelyCollectMaxOrdinaryExpressions(const ASTPtr & expr, ASTExpressionList & into) { checkStackSize(); if (expr->as()) { into.children.push_back(expr); return {false, 1}; } auto * function = expr->as(); if (!function) return {false, 0}; if (AggregateUtils::isAggregateFunction(*function)) return {true, 0}; UInt64 pushed_children = 0; bool has_aggregate = false; for (const auto & child : function->arguments->children) { auto [child_has_aggregate, child_pushed_children] = recursivelyCollectMaxOrdinaryExpressions(child, into); has_aggregate |= child_has_aggregate; pushed_children += child_pushed_children; } /// The current function is not aggregate function and there is no aggregate function in its arguments, /// so use the current function to replace its arguments if (!has_aggregate) { for (UInt64 i = 0; i < pushed_children; i++) into.children.pop_back(); into.children.push_back(expr); pushed_children = 1; } return {has_aggregate, pushed_children}; } /** Expand GROUP BY ALL by extracting all the SELECT-ed expressions that are not aggregate functions. * * For a special case that if there is a function having both aggregate functions and other fields as its arguments, * the `GROUP BY` keys will contain the maximum non-aggregate fields we can extract from it. 
* * Example: * SELECT substring(a, 4, 2), substring(substring(a, 1, 2), 1, count(b)) FROM t GROUP BY ALL * will expand as * SELECT substring(a, 4, 2), substring(substring(a, 1, 2), 1, count(b)) FROM t GROUP BY substring(a, 4, 2), substring(a, 1, 2) */ void expandGroupByAll(ASTSelectQuery * select_query) { auto group_expression_list = std::make_shared(); for (const auto & expr : select_query->select()->children) recursivelyCollectMaxOrdinaryExpressions(expr, *group_expression_list); select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, group_expression_list); } void expandOrderByAll(ASTSelectQuery * select_query) { auto * all_elem = select_query->orderBy()->children[0]->as(); if (!all_elem) throw Exception(ErrorCodes::LOGICAL_ERROR, "Select analyze for not order by asts."); auto order_expression_list = std::make_shared(); for (const auto & expr : select_query->select()->children) { if (auto * identifier = expr->as(); identifier != nullptr) if (Poco::toUpper(identifier->name()) == "ALL" || Poco::toUpper(identifier->alias) == "ALL") throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again"); if (auto * function = expr->as(); function != nullptr) if (Poco::toUpper(function->alias) == "ALL") throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION, "Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again"); auto elem = std::make_shared(); elem->direction = all_elem->direction; elem->nulls_direction = all_elem->nulls_direction; elem->nulls_direction_was_explicitly_specified = all_elem->nulls_direction_was_explicitly_specified; elem->children.push_back(expr); order_expression_list->children.push_back(elem); } select_query->setExpression(ASTSelectQuery::Expression::ORDER_BY, order_expression_list); } ASTs getAggregates(ASTPtr & query, const ASTSelectQuery & select_query) { /// There can not be 
aggregate functions inside the WHERE and PREWHERE. if (select_query.where()) assertNoAggregates(select_query.where(), "in WHERE"); if (select_query.prewhere()) assertNoAggregates(select_query.prewhere(), "in PREWHERE"); GetAggregatesVisitor::Data data; GetAggregatesVisitor(data).visit(query); /// There can not be other aggregate functions within the aggregate functions. for (const ASTPtr & ast : data.aggregates) { const ASTFunction & node = typeid_cast(*ast); if (node.arguments) { for (auto & arg : node.arguments->children) { assertNoAggregates(arg, "inside another aggregate function"); // We also can't have window functions inside aggregate functions, // because the window functions are calculated later. assertNoWindows(arg, "inside an aggregate function"); } } } return data.aggregates; } ASTs getWindowFunctions(ASTPtr & query, const ASTSelectQuery & select_query) { /// There can not be window functions inside the WHERE, PREWHERE and HAVING if (select_query.having()) assertNoWindows(select_query.having(), "in HAVING"); if (select_query.where()) assertNoWindows(select_query.where(), "in WHERE"); if (select_query.prewhere()) assertNoWindows(select_query.prewhere(), "in PREWHERE"); if (select_query.window()) assertNoWindows(select_query.window(), "in WINDOW"); GetAggregatesVisitor::Data data; GetAggregatesVisitor(data).visit(query); /// Window functions cannot be inside aggregates or other window functions. /// Aggregate functions can be inside window functions because they are /// calculated earlier. 
for (const ASTPtr & ast : data.window_functions) { const ASTFunction & node = typeid_cast(*ast); if (node.arguments) for (auto & arg : node.arguments->children) assertNoWindows(arg, "inside another window function"); if (node.window_definition) assertNoWindows(node.window_definition, "inside window definition"); } return data.window_functions; } class MarkTupleLiteralsAsLegacyData { public: struct Data { }; static void visitLiteral(ASTLiteral & literal, ASTPtr &) { if (literal.value.getType() == Field::Types::Tuple) literal.use_legacy_column_name_of_tuple = true; } static void visitFunction(ASTFunction & func, ASTPtr &ast) { if (func.name == "tuple" && func.arguments && !func.arguments->children.empty()) { // re-write tuple() function as literal if (auto literal = func.toLiteral()) { ast = literal; visitLiteral(*typeid_cast(ast.get()), ast); } } } static void visit(ASTPtr & ast, Data &) { if (auto * identifier = typeid_cast(ast.get())) visitFunction(*identifier, ast); if (auto * identifier = typeid_cast(ast.get())) visitLiteral(*identifier, ast); } static bool needChildVisit(const ASTPtr & /*parent*/, const ASTPtr & /*child*/) { return true; } }; using MarkTupleLiteralsAsLegacyVisitor = InDepthNodeVisitor; void markTupleLiteralsAsLegacy(ASTPtr & query) { MarkTupleLiteralsAsLegacyVisitor::Data data; MarkTupleLiteralsAsLegacyVisitor(data).visit(query); } /// Rewrite _shard_num -> shardNum() AS _shard_num struct RewriteShardNum { struct Data { }; static bool needChildVisit(const ASTPtr & parent, const ASTPtr & /*child*/) { /// ON section should not be rewritten. 
return typeid_cast(parent.get()) == nullptr; } static void visit(ASTPtr & ast, Data &) { if (auto * identifier = typeid_cast(ast.get())) visit(*identifier, ast); } static void visit(ASTIdentifier & identifier, ASTPtr & ast) { if (identifier.shortName() != "_shard_num") return; String alias = identifier.tryGetAlias(); if (alias.empty()) alias = "_shard_num"; ast = makeASTFunction("shardNum"); ast->setAlias(alias); } }; using RewriteShardNumVisitor = InDepthNodeVisitor; } TreeRewriterResult::TreeRewriterResult( const NamesAndTypesList & source_columns_, ConstStoragePtr storage_, const StorageSnapshotPtr & storage_snapshot_, bool add_special) : storage(storage_) , storage_snapshot(storage_snapshot_) , source_columns(source_columns_) { collectSourceColumns(add_special); is_remote_storage = storage && storage->isRemote(); } /// Add columns from storage to source_columns list. Deduplicate resulted list. /// Special columns are non physical columns, for example ALIAS void TreeRewriterResult::collectSourceColumns(bool add_special) { if (storage) { auto options = GetColumnsOptions(add_special ? GetColumnsOptions::All : GetColumnsOptions::AllPhysical); options.withExtendedObjects(); if (storage->supportsSubcolumns()) options.withSubcolumns(); auto columns_from_storage = storage_snapshot->getColumns(options); if (source_columns.empty()) source_columns.swap(columns_from_storage); else source_columns.insert(source_columns.end(), columns_from_storage.begin(), columns_from_storage.end()); auto metadata_snapshot = storage->getInMemoryMetadataPtr(); auto metadata_column_descriptions = metadata_snapshot->getColumns(); source_columns_ordinary = metadata_column_descriptions.getOrdinary(); } source_columns_set = removeDuplicateColumns(source_columns); } /// Calculate which columns are required to execute the expression. /// Then, delete all other columns from the list of available columns. /// After execution, columns will only contain the list of columns needed to read from the table. 
/// Parameters:
///  - query: the AST whose required source columns are collected.
///  - is_select: enables the trivial-count handling below (choosing the cheapest column when no column
///    is required explicitly, and the partition-key check).
///  - no_throw: if columns are missing, return false instead of throwing UNKNOWN_IDENTIFIER.
/// Returns true when every required column was resolved.
/// Besides required_source_columns this updates has_explicit_columns, optimize_trivial_count,
/// has_virtual_shard_num and missed_subcolumns, and may register joined columns in analyzed_join.
bool TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select, bool no_throw)
{
    /// We calculate required_source_columns with source_columns modifications and swap them on exit
    required_source_columns = source_columns;

    RequiredSourceColumnsVisitor::Data columns_context;
    RequiredSourceColumnsVisitor(columns_context).visit(query);

    /// Names of all source columns before anything is pruned (used only for the error message below).
    NameSet source_column_names;
    for (const auto & column : source_columns)
        source_column_names.insert(column.name);

    NameSet required = columns_context.requiredColumns();
    if (columns_context.has_table_join)
    {
        NameSet available_columns;
        for (const auto & name : source_columns)
            available_columns.insert(name.name);

        /// Add columns obtained by JOIN (if needed).
        for (const auto & joined_column : analyzed_join->columnsFromJoinedTable())
        {
            const auto & name = joined_column.name;
            /// A source column with the same name takes precedence over the joined one.
            if (available_columns.contains(name))
                continue;

            if (required.contains(name))
            {
                /// Optimization: do not add columns needed only in JOIN ON section.
                if (columns_context.nameInclusion(name) > analyzed_join->rightKeyInclusion(name))
                    analyzed_join->addJoinedColumn(joined_column);

                /// Either way the name is not something to read from the source table.
                required.erase(name);
            }
        }
    }

    NameSet array_join_sources;
    if (columns_context.has_array_join)
    {
        /// Insert the columns required for the ARRAY JOIN calculation into the required columns list.
        for (const auto & result_source : array_join_result_to_source)
            array_join_sources.insert(result_source.second);

        for (const auto & column_name_type : source_columns)
            if (array_join_sources.contains(column_name_type.name))
                required.insert(column_name_type.name);
    }

    /// Figure out if we're able to use the trivial count optimization.
    has_explicit_columns = !required.empty();
    if (is_select && !has_explicit_columns)
    {
        optimize_trivial_count = !columns_context.has_array_join;

        /// You need to read at least one column to find the number of rows.
        /// We will find a column with minimum <compressed_size, type_size, uncompressed_size>.
        /// Because it is the column that is cheapest to read.
        struct ColumnSizeTuple
        {
            size_t compressed_size;
            size_t type_size;
            size_t uncompressed_size;
            String name;

            /// Lexicographic order on the three sizes; `name` does not take part in the comparison.
            bool operator<(const ColumnSizeTuple & that) const
            {
                return std::tie(compressed_size, type_size, uncompressed_size)
                    < std::tie(that.compressed_size, that.type_size, that.uncompressed_size);
            }
        };

        /// NOTE(review): no element type is visible on std::vector here - confirm it was not lost.
        std::vector columns;
        if (storage)
        {
            auto column_sizes = storage->getColumnSizes();
            for (auto & source_column : source_columns)
            {
                /// Columns the storage reports no size for are not candidates.
                auto c = column_sizes.find(source_column.name);
                if (c == column_sizes.end())
                    continue;
                /// Types without a maximum value size get the fixed value 100 as their type size.
                size_t type_size = source_column.type->haveMaximumSizeOfValue() ? source_column.type->getMaximumSizeOfValueInMemory() : 100;
                columns.emplace_back(ColumnSizeTuple{c->second.data_compressed, type_size, c->second.data_uncompressed, source_column.name});
            }
        }

        if (!columns.empty())
            required.insert(std::min_element(columns.begin(), columns.end())->name);
        else if (!source_columns.empty())
            /// If we have no information about columns sizes, choose a column of minimum size of its data type.
            required.insert(ExpressionActions::getSmallestColumn(source_columns).name);
    }
    else if (is_select && storage_snapshot && !columns_context.has_array_join)
    {
        /// Columns are required explicitly: trivial count stays possible only when every required column
        /// is either used by the partition key expression or is one of the part-level names pushed below.
        /// Without a partition key expression optimize_trivial_count is left untouched.
        const auto & partition_desc = storage_snapshot->metadata->getPartitionKey();
        if (partition_desc.expression)
        {
            auto partition_source_columns = partition_desc.expression->getRequiredColumns();
            partition_source_columns.push_back("_part");
            partition_source_columns.push_back("_partition_id");
            partition_source_columns.push_back("_part_uuid");
            partition_source_columns.push_back("_partition_value");
            optimize_trivial_count = true;
            for (const auto & required_column : required)
            {
                if (std::find(partition_source_columns.begin(), partition_source_columns.end(), required_column)
                    == partition_source_columns.end())
                {
                    optimize_trivial_count = false;
                    break;
                }
            }
        }
    }

    /// Starts as everything that is required; names found in source_columns are removed in the loop below.
    NameSet unknown_required_source_columns = required;

    /// Prune source_columns down to the required ones.
    for (NamesAndTypesList::iterator it = source_columns.begin(); it != source_columns.end();)
    {
        const String & column_name = it->name;
        unknown_required_source_columns.erase(column_name);

        if (!required.contains(column_name))
            it = source_columns.erase(it);
        else
            ++it;
    }

    has_virtual_shard_num = false;
    /// If there are virtual columns among the unknown columns. Remove them from the list of unknown and add
    /// in columns list, so that when further processing they are also considered.
    if (storage)
    {
        const auto storage_virtuals = storage->getVirtuals();
        for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
        {
            auto column = storage_virtuals.tryGetByName(*it);
            if (column)
            {
                source_columns.push_back(*column);
                it = unknown_required_source_columns.erase(it);
            }
            else
                ++it;
        }

        /// For remote storages remember whether `_shard_num` is one of the storage's virtual columns.
        /// Note that this does not depend on whether the query actually uses `_shard_num`.
        if (is_remote_storage)
        {
            for (const auto & name_type : storage_virtuals)
            {
                if (name_type.name == "_shard_num" && storage->isVirtualColumn("_shard_num", storage_snapshot->getMetadataForQuery()))
                {
                    has_virtual_shard_num = true;
                    break;
                }
            }
        }
    }

    /// Collect missed object subcolumns
    /// An unknown name `a.b...` is treated as a missed subcolumn when the part before the first dot is an
    /// ordinary column of type Object with schema format "json" and nullable subcolumns.
    /// Such names are moved from the unknown list into missed_subcolumns.
    if (!unknown_required_source_columns.empty())
    {
        for (const NameAndTypePair & pair : source_columns_ordinary)
        {
            for (auto it = unknown_required_source_columns.begin(); it != unknown_required_source_columns.end();)
            {
                size_t object_pos = it->find('.');
                if (object_pos != std::string::npos)
                {
                    String object_name = it->substr(0, object_pos);
                    if (pair.name == object_name && pair.type->getTypeId() == TypeIndex::Object)
                    {
                        /// The cast result is used without a null check; it relies on the TypeIndex test above.
                        const auto * object_type = typeid_cast(pair.type.get());
                        if (object_type->getSchemaFormat() == "json" && object_type->hasNullableSubcolumns())
                        {
                            missed_subcolumns.insert(*it);
                            it = unknown_required_source_columns.erase(it);
                            continue;
                        }
                    }
                }
                ++it;
            }
        }
    }

    /// Anything still unknown is an error: build a detailed message and either report failure or throw.
    if (!unknown_required_source_columns.empty())
    {
        /// Passed to PreformattedMessage below together with the already formatted text.
        constexpr auto format_string = "Missing columns: {} while processing query: '{}', required columns:{}{}";
        WriteBufferFromOwnString ss;
        ss << "Missing columns:";
        for (const auto & name : unknown_required_source_columns)
            ss << " '" << name << "'";
        ss << " while processing query: '" << queryToString(query) << "'";

        ss << ", required columns:";
        for (const auto & name : columns_context.requiredColumns())
            ss << " '" << name << "'";

        if (storage)
        {
            /// hint_name keeps the hints in first-seen order, helper_hint_name is used only to drop duplicates.
            std::vector hint_name{};
            std::set helper_hint_name{};
            for (const auto & name : columns_context.requiredColumns())
            {
                auto hints = storage->getHints(name);
                for (const auto & hint : hints)
                {
                    // We want to preserve the ordering of the hints
                    // (as they are ordered by Levenshtein distance)
                    auto [_, inserted] = helper_hint_name.insert(hint);
                    if (inserted)
                        hint_name.push_back(hint);
                }
            }

            if (!hint_name.empty())
            {
                ss << ", maybe you meant: ";
                ss << toStringWithFinalSeparator(hint_name, " or ");
            }
        }
        else
        {
            /// NOTE(review): with source columns present this appends the required column names a second
            /// time, without any label - confirm this is the intended message.
            if (!source_column_names.empty())
                for (const auto & name : columns_context.requiredColumns())
                    ss << " '" << name << "'";
            else
                ss << ", no source columns";
        }

        if (columns_context.has_table_join)
        {
            ss << ", joined columns:";
            for (const auto & column : analyzed_join->columnsFromJoinedTable())
                ss << " '" << column.name << "'";
        }

        if (!array_join_sources.empty())
        {
            ss << ", arrayJoin columns:";
            for (const auto & name : array_join_sources)
                ss << " '" << name << "'";
        }

        /// In no_throw mode the message is discarded and the members modified above are left as they are.
        if (no_throw)
            return false;
        throw Exception(PreformattedMessage{ss.str(), format_string}, ErrorCodes::UNKNOWN_IDENTIFIER);
    }

    /// After the swap required_source_columns holds the pruned list (plus the virtual columns added above)
    /// and source_columns holds the full list that was saved at the beginning.
    required_source_columns.swap(source_columns);
    /// NOTE(review): source_column_names is a local and is not read after this loop in this function.
    for (const auto & column : required_source_columns)
    {
        source_column_names.insert(column.name);
    }
    return true;
}

/// Returns the set of keys (`first`) of array_join_result_to_source.
/// NOTE(review): collectUsedColumns treats `second` of the same map as the ARRAY JOIN source, so despite
/// the function name these look like the result names - confirm against the callers.
NameSet TreeRewriterResult::getArrayJoinSourceNameSet() const
{
    NameSet forbidden_columns;
    for (const auto & elem : array_join_result_to_source)
        forbidden_columns.insert(elem.first);
    return forbidden_columns;
}

/// Rewrites and analyzes a SELECT query in place and returns the analysis result.
///
/// `query` must be a select AST, otherwise LOGICAL_ERROR is thrown.
/// `result` carries the source columns and the storage; it is filled in along the way (aliases,
/// analyzed_join, aggregates, window functions, required source columns, optimize_trivial_count, ...).
/// `table_join`, if not null, is reused as result.analyzed_join after resetCollected();
/// otherwise a new object is created for it.
/// The steps depend on each other, so their order matters (see the comments on the individual steps).
TreeRewriterResultPtr TreeRewriter::analyzeSelect(
    ASTPtr & query,
    TreeRewriterResult && result,
    const SelectQueryOptions & select_options,
    const TablesWithColumns & tables_with_columns,
    const Names & required_result_columns,
    std::shared_ptr table_join) const
{
    auto * select_query = query->as();
    if (!select_query)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Select analyze for not select asts.");

    size_t subquery_depth = select_options.subquery_depth;
    bool remove_duplicates = select_options.remove_duplicates;

    const auto & settings = getContext()->getSettingsRef();

    const NameSet & source_columns_set = result.source_columns_set;

    if (table_join)
    {
        result.analyzed_join = table_join;
        result.analyzed_join->resetCollected();
    }
    else /// TODO: remove. For now ExpressionAnalyzer expects some not empty object here
        result.analyzed_join = std::make_shared();

    if (remove_duplicates)
        renameDuplicatedColumns(select_query);

    /// Perform it before analyzing JOINs, because it may change number of columns with names unique and break some logic inside JOINs
    if (settings.optimize_normalize_count_variants)
        TreeOptimizer::optimizeCountConstantAndSumOne(query, getContext());

    /// Only the second table (index 1) is registered as the joined table here.
    if (tables_with_columns.size() > 1)
    {
        const auto & right_table = tables_with_columns[1];
        auto columns_from_joined_table = right_table.columns;
        /// query can use materialized or aliased columns from right joined table,
        /// we want to request it for right table
        columns_from_joined_table.insert(columns_from_joined_table.end(), right_table.hidden_columns.begin(), right_table.hidden_columns.end());
        result.analyzed_join->setColumnsFromJoinedTable(std::move(columns_from_joined_table), source_columns_set, right_table.table.getQualifiedNamePrefix());
    }

    translateQualifiedNames(query, *select_query, source_columns_set, tables_with_columns);

    /// Optimizes logical expressions.
    LogicalExpressionsOptimizer(select_query, tables_with_columns, settings.optimize_min_equality_disjunction_chain_length.value).perform();

    /// Normalization sees the source columns plus the names of the joined table's columns
    /// (the latter only when a table_join object was passed in).
    NameSet all_source_columns_set = source_columns_set;
    if (table_join)
    {
        for (const auto & [name, _] : table_join->columnsFromJoinedTable())
            all_source_columns_set.insert(name);
    }

    normalize(query, result.aliases, all_source_columns_set, select_options.ignore_alias, settings, /* allow_self_aliases = */ true, getContext(), select_options.is_create_parameterized_view);

    // expand GROUP BY ALL
    if (select_query->group_by_all)
        expandGroupByAll(select_query);

    // expand ORDER BY ALL
    if (settings.enable_order_by_all && select_query->order_by_all)
        expandOrderByAll(select_query);

    /// Remove unneeded columns according to 'required_result_columns'.
    /// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
    /// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost)
    ///  and before 'executeScalarSubqueries', 'analyzeAggregation', etc. to avoid excessive calculations.
    removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates);

    /// Executing scalar subqueries - replacing them with constant values.
    Scalars scalars;
    Scalars local_scalars;
    executeScalarSubqueries(
        query,
        getContext(),
        subquery_depth,
        scalars,
        local_scalars,
        select_options.only_analyze,
        select_options.is_create_parameterized_view);

    /// Save scalar sub queries's results in the query context
    /// Note that we are only saving scalars and not local_scalars since the latter can't be safely shared across contexts
    if (!select_options.only_analyze && getContext()->hasQueryContext())
    {
        for (const auto & it : scalars)
            getContext()->getQueryContext()->addScalar(it.first, it.second);
    }

    if (settings.legacy_column_name_of_tuple_literal)
        markTupleLiteralsAsLegacy(query);

    /// Push the predicate expression down to subqueries. The optimization should be applied to both initial and secondary queries.
    result.rewrite_subqueries = PredicateExpressionsOptimizer(getContext(), tables_with_columns, settings).optimize(*select_query);

    /// Only apply AST optimization for initial queries.
    const bool ast_optimizations_allowed
        = getContext()->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY
        && !select_options.ignore_ast_optimizations;

    /// optimizeIf itself always runs; only its multiIf -> if part is tied to ast_optimizations_allowed.
    bool optimize_multiif_to_if = ast_optimizations_allowed && settings.optimize_multiif_to_if;
    TreeOptimizer::optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif, optimize_multiif_to_if);

    if (ast_optimizations_allowed)
        TreeOptimizer::apply(query, result, tables_with_columns, getContext());

    /// array_join_alias_to_name, array_join_result_to_source.
    getArrayJoinedColumns(query, result, select_query, result.source_columns, source_columns_set);

    setJoinStrictness(
        *select_query, settings.join_default_strictness, settings.any_join_distinct_right_table_keys, result.analyzed_join);

    /// Null when the query has no JOIN (or when the cast of the table_join node fails).
    auto * table_join_ast = select_query->join() ? select_query->join()->table_join->as() : nullptr;
    if (table_join_ast && tables_with_columns.size() >= 2)
        collectJoinedColumns(*result.analyzed_join, *table_join_ast, tables_with_columns, result.aliases, getContext());

    result.aggregates = getAggregates(query, *select_query);
    result.window_function_asts = getWindowFunctions(query, *select_query);
    result.expressions_with_window_function = getExpressionsWithWindowFunctions(query);

    /// Called without no_throw (presumably defaulted in the declaration), so missing columns throw here.
    result.collectUsedColumns(query, true);

    /// Rewrite the references that collectUsedColumns classified as missed object subcolumns.
    if (!result.missed_subcolumns.empty())
    {
        for (const String & column_name : result.missed_subcolumns)
            replaceMissedSubcolumnsInQuery(query, column_name);
        result.missed_subcolumns.clear();
    }

    /// Snapshot of the required names taken before ALIAS columns are (possibly) substituted below.
    result.required_source_columns_before_expanding_alias_columns = result.required_source_columns.getNames();

    /// rewrite filters for select query, must go after getArrayJoinedColumns
    bool is_initiator = getContext()->getClientInfo().distributed_depth == 0;
    if (settings.optimize_respect_aliases && result.storage_snapshot && is_initiator)
    {
        std::unordered_set excluded_nodes;
        {
            /// Do not replace ALIASed columns in JOIN ON/USING sections
            if (table_join_ast && table_join_ast->on_expression)
                excluded_nodes.insert(table_join_ast->on_expression.get());
            if (table_join_ast && table_join_ast->using_expression_list)
                excluded_nodes.insert(table_join_ast->using_expression_list.get());
        }

        bool is_changed = replaceAliasColumnsInQuery(query, result.storage_snapshot->metadata->getColumns(),
                                                     result.array_join_result_to_source, getContext(), excluded_nodes);
        /// If query is changed, we need to redo some work to correct name resolution.
        if (is_changed)
        {
            /// We should re-apply the optimization, because an expression substituted from alias column might be a function of a group key.
            if (ast_optimizations_allowed && settings.optimize_group_by_function_keys)
                TreeOptimizer::optimizeGroupByFunctionKeys(select_query);

            result.aggregates = getAggregates(query, *select_query);
            result.window_function_asts = getWindowFunctions(query, *select_query);
            result.expressions_with_window_function = getExpressionsWithWindowFunctions(query);
            result.collectUsedColumns(query, true);
        }
    }

    /// Rewrite _shard_num to shardNum()
    if (result.has_virtual_shard_num)
    {
        RewriteShardNumVisitor::Data data_rewrite_shard_num;
        RewriteShardNumVisitor(data_rewrite_shard_num).visit(query);
    }

    result.ast_join = select_query->join();

    /// collectUsedColumns only looked at the columns. Trivial count additionally requires the setting to be
    /// enabled, no GROUP BY / HAVING / SAMPLE (size or offset) / FINAL, and either fewer than two tables
    /// or a join kind accepted by isLeft().
    if (result.optimize_trivial_count)
        result.optimize_trivial_count = settings.optimize_trivial_count_query &&
            !select_query->groupBy() && !select_query->having() &&
            !select_query->sampleSize() && !select_query->sampleOffset() && !select_query->final() &&
            (tables_with_columns.size() < 2 || isLeft(result.analyzed_join->kind()));

    // remove outer braces in order by
    RewriteOrderByVisitor::Data data;
    RewriteOrderByVisitor(data).visit(query);

    /// NOTE(review): `result` is passed as an lvalue (no std::move), so this presumably copies - confirm.
    return std::make_shared(result);
}

/// Rewrites and analyzes a standalone (non-SELECT) expression in place.
///
/// Throws LOGICAL_ERROR when `query` is a select AST.
/// Aggregate functions are accepted only with allow_aggregations, and never nested in one another;
/// otherwise assertNoAggregates is applied to the whole query.
/// Scalar subqueries are executed only when execute_scalar_subqueries is set (the negated flag is passed
/// on to executeScalarSubqueries).
/// Returns an empty pointer when collectUsedColumns reports failure, which it only does in no_throw mode.
TreeRewriterResultPtr TreeRewriter::analyze(
    ASTPtr & query,
    const NamesAndTypesList & source_columns,
    ConstStoragePtr storage,
    const StorageSnapshotPtr & storage_snapshot,
    bool allow_aggregations,
    bool allow_self_aliases,
    bool execute_scalar_subqueries,
    bool is_create_parameterized_view) const
{
    if (query->as())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Not select analyze for select asts.");

    const auto & settings = getContext()->getSettingsRef();

    /// add_special = false: see TreeRewriterResult::collectSourceColumns for what that selects.
    TreeRewriterResult result(source_columns, storage, storage_snapshot, false);

    normalize(query, result.aliases, result.source_columns_set, false, settings, allow_self_aliases, getContext(), is_create_parameterized_view);

    /// Executing scalar subqueries. Column defaults could be a scalar subquery.
    Scalars scalars;
    Scalars local_scalars;
    executeScalarSubqueries(query, getContext(), 0, scalars, local_scalars, !execute_scalar_subqueries, is_create_parameterized_view);

    /// Save scalar sub queries's results in the query context
    /// Note that we are only saving scalars and not local_scalars since the latter can't be safely shared across contexts
    if (execute_scalar_subqueries && getContext()->hasQueryContext())
    {
        for (const auto & it : scalars)
            getContext()->getQueryContext()->addScalar(it.first, it.second);
    }

    if (settings.legacy_column_name_of_tuple_literal)
        markTupleLiteralsAsLegacy(query);

    /// The last argument (multiIf -> if) is always off here.
    TreeOptimizer::optimizeIf(query, result.aliases, settings.optimize_if_chain_to_multiif, false);

    if (allow_aggregations)
    {
        GetAggregatesVisitor::Data data;
        GetAggregatesVisitor(data).visit(query);

        /// There can not be other aggregate functions within the aggregate functions.
        for (const ASTPtr & node : data.aggregates)
            for (auto & arg : typeid_cast(*node).arguments->children)
                assertNoAggregates(arg, "inside another aggregate function");
        result.aggregates = data.aggregates;
    }
    else
        assertNoAggregates(query, "in wrong place");

    /// NOTE(review): `no_throw` is neither a parameter nor a local of this function; presumably it is a
    /// TreeRewriter member - confirm.
    bool is_ok = result.collectUsedColumns(query, false, no_throw);
    if (!is_ok)
        return {};

    /// Rewrite the references that collectUsedColumns classified as missed object subcolumns.
    if (!result.missed_subcolumns.empty())
    {
        for (const String & column_name : result.missed_subcolumns)
            replaceMissedSubcolumnsInQuery(query, column_name);
        result.missed_subcolumns.clear();
    }

    return std::make_shared(result);
}

/// Applies a fixed sequence of rewriting visitors to `query` and fills `aliases`.
///
/// Most steps are controlled by `settings`; enable_positional_arguments is read from
/// context_->getSettingsRef() instead of `settings`.
/// The remaining parameters (source_columns_set, ignore_alias, allow_self_aliases,
/// is_create_parameterized_view) are only forwarded to QueryNormalizer::Data.
/// Some steps depend on earlier ones (e.g. moving the -OrNull suffix runs after adding it), so the order
/// should be preserved.
void TreeRewriter::normalize(
    ASTPtr & query,
    Aliases & aliases,
    const NameSet & source_columns_set,
    bool ignore_alias,
    const Settings & settings,
    bool allow_self_aliases,
    ContextPtr context_,
    bool is_create_parameterized_view)
{
    if (!UserDefinedSQLFunctionFactory::instance().empty())
        UserDefinedSQLFunctionVisitor::visit(query);

    /// The count-distinct variants are renamed according to the count_distinct_implementation setting.
    CustomizeCountDistinctVisitor::Data data_count_distinct{settings.count_distinct_implementation};
    CustomizeCountDistinctVisitor(data_count_distinct).visit(query);

    CustomizeCountIfDistinctVisitor::Data data_count_if_distinct{settings.count_distinct_implementation.toString() + "If"};
    CustomizeCountIfDistinctVisitor(data_count_if_distinct).visit(query);

    CustomizeIfDistinctVisitor::Data data_distinct_if{"DistinctIf"};
    CustomizeIfDistinctVisitor(data_distinct_if).visit(query);

    if (settings.rewrite_count_distinct_if_with_count_distinct_implementation)
    {
        CustomizeCountDistinctIfVisitor::Data data_count_distinct_if{settings.count_distinct_implementation.toString() + "If"};
        CustomizeCountDistinctIfVisitor(data_count_distinct_if).visit(query);
    }

    ExistsExpressionVisitor::Data exists;
    ExistsExpressionVisitor(exists).visit(query);

    if (context_->getSettingsRef().enable_positional_arguments)
    {
        ReplacePositionalArgumentsVisitor::Data data_replace_positional_arguments;
        ReplacePositionalArgumentsVisitor(data_replace_positional_arguments).visit(query);
    }

    /// With transform_null_in the IN-family functions are renamed to their *NullIn counterparts.
    if (settings.transform_null_in)
    {
        CustomizeInVisitor::Data data_null_in{"nullIn"};
        CustomizeInVisitor(data_null_in).visit(query);

        CustomizeNotInVisitor::Data data_not_null_in{"notNullIn"};
        CustomizeNotInVisitor(data_not_null_in).visit(query);

        CustomizeGlobalInVisitor::Data data_global_null_in{"globalNullIn"};
        CustomizeGlobalInVisitor(data_global_null_in).visit(query);

        CustomizeGlobalNotInVisitor::Data data_global_not_null_in{"globalNotNullIn"};
        CustomizeGlobalNotInVisitor(data_global_not_null_in).visit(query);
    }

    /// Rewrite all aggregate functions to add -OrNull suffix to them
    if (settings.aggregate_functions_null_for_empty)
    {
        CustomizeAggregateFunctionsOrNullVisitor::Data data_or_null{"OrNull"};
        CustomizeAggregateFunctionsOrNullVisitor(data_or_null).visit(query);
    }

    /// Move -OrNull suffix ahead, this should execute after add -OrNull suffix
    CustomizeAggregateFunctionsMoveOrNullVisitor::Data data_or_null{"OrNull"};
    CustomizeAggregateFunctionsMoveOrNullVisitor(data_or_null).visit(query);

    /// Creates a dictionary `aliases`: alias -> ASTPtr
    QueryAliasesVisitor(aliases).visit(query);

    /// Mark table ASTIdentifiers with not a column marker
    MarkTableIdentifiersVisitor::Data identifiers_data{aliases};
    MarkTableIdentifiersVisitor(identifiers_data).visit(query);

    /// Rewrite function names to their canonical ones.
    /// Notice: function name normalization is disabled when it's a secondary query, because queries are either
    /// already normalized on initiator node, or not normalized and should remain unnormalized for
    /// compatibility.
    if (context_->getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && settings.normalize_function_names)
        FunctionNameNormalizer().visit(query.get());

    if (settings.optimize_move_to_prewhere)
    {
        /// Required for PREWHERE
        ComparisonTupleEliminationVisitor::Data data_comparison_tuple_elimination;
        ComparisonTupleEliminationVisitor(data_comparison_tuple_elimination).visit(query);
    }

    /// Common subexpression elimination. Rewrite rules.
    QueryNormalizer::Data normalizer_data(aliases, source_columns_set, ignore_alias, settings, allow_self_aliases, is_create_parameterized_view);
    QueryNormalizer(normalizer_data).visit(query);

    optimizeGroupingSets(query);
}

}