Merge pull request #37887 from azat/union-fix

Fix incorrect columns order in subqueries of UNION
2024-09-20 08:40:50 +00:00 · 2022-06-29 20:40:23 +02:00 · 2022-06-29 20:40:23 +02:00 · 4f7967ab9e
commit 4f7967ab9e
parent cd2cb10f60 4a52d3f6b9
6 changed files with 74 additions and 53 deletions
--- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp
+++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp
@ -52,10 +52,6 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(
    if (!num_children)
        throw Exception("Logical error: no children in ASTSelectWithUnionQuery", ErrorCodes::LOGICAL_ERROR);

-    /// This is required for UNION to match headers correctly.
-    if (num_children > 1)
-        options.reorderColumns();
-
    /// Note that we pass 'required_result_column_names' to first SELECT.
    /// And for the rest, we pass names at the corresponding positions of 'required_result_column_names' in the result of first SELECT,
    ///  because names could be different.
@ -163,7 +159,22 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(
    {
        Blocks headers(num_children);
        for (size_t query_num = 0; query_num < num_children; ++query_num)
+        {
            headers[query_num] = nested_interpreters[query_num]->getSampleBlock();
+            const auto & current_required_result_column_names = required_result_column_names_for_other_selects[query_num];
+            if (!current_required_result_column_names.empty())
+            {
+                const auto & header_columns = headers[query_num].getNames();
+                if (current_required_result_column_names != header_columns)
+                {
+                    throw Exception(ErrorCodes::LOGICAL_ERROR,
+                        "Different order of columns in UNION subquery: {} and {}",
+                        fmt::join(current_required_result_column_names, ", "),
+                        fmt::join(header_columns, ", "));
+                }
+
+            }
+        }

        result_header = getCommonHeaderForUnion(headers);
    }
--- a/src/Interpreters/SelectQueryOptions.h
+++ b/src/Interpreters/SelectQueryOptions.h
@ -31,8 +31,6 @@ struct SelectQueryOptions
    bool only_analyze = false;
    bool modify_inplace = false;
    bool remove_duplicates = false;
-    /// This is required for UNION to match headers correctly.
-    bool reorder_columns_as_required_header = false;
    bool ignore_quota = false;
    bool ignore_limits = false;
    /// This flag is needed to analyze query ignoring table projections.
@ -99,12 +97,6 @@ struct SelectQueryOptions
        return *this;
    }

-    SelectQueryOptions & reorderColumns(bool value = true)
-    {
-        reorder_columns_as_required_header = value;
-        return *this;
-    }
-
    SelectQueryOptions & noSubquery()
    {
        subquery_depth = 0;
--- a/src/Interpreters/TreeRewriter.cpp
+++ b/src/Interpreters/TreeRewriter.cpp
@ -429,21 +429,28 @@ void renameDuplicatedColumns(const ASTSelectQuery * select_query)
 /// This is the case when we have DISTINCT or arrayJoin: we require more columns in SELECT even if we need less columns in result.
 /// Also we have to remove duplicates in case of GLOBAL subqueries. Their results are placed into tables so duplicates are impossible.
 /// Also remove all INTERPOLATE columns which are not in SELECT anymore.
-void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups, bool reorder_columns_as_required_header)
+void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups)
 {
    ASTs & elements = select_query->select()->children;

-    std::map<String, size_t> required_columns_with_duplicate_count;
+    std::unordered_map<String, size_t> required_columns_with_duplicate_count;
+    /// Order of output columns should match order in required_result_columns,
+    /// otherwise UNION queries may have incorrect header when subselect has duplicated columns.
+    ///
+    /// NOTE: multimap is required since there can be duplicated column names.
+    std::unordered_multimap<String, size_t> output_columns_positions;

    if (!required_result_columns.empty())
    {
        /// Some columns may be queried multiple times, like SELECT x, y, y FROM table.
-        for (const auto & name : required_result_columns)
+        for (size_t i = 0; i < required_result_columns.size(); ++i)
        {
+            const auto & name = required_result_columns[i];
            if (remove_dups)
                required_columns_with_duplicate_count[name] = 1;
            else
                ++required_columns_with_duplicate_count[name];
+            output_columns_positions.emplace(name, i);
        }
    }
    else if (remove_dups)
@ -455,49 +462,44 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
    else
        return;

-    ASTs new_elements;
-    new_elements.reserve(elements.size());
+    ASTs new_elements(elements.size() + output_columns_positions.size());
+    size_t new_elements_size = 0;

    NameSet remove_columns;

-    /// Resort columns according to required_result_columns.
-    if (reorder_columns_as_required_header && !required_result_columns.empty())
-    {
-        std::unordered_map<String, size_t> name_pos;
-        {
-            size_t pos = 0;
-            for (const auto & name : required_result_columns)
-                name_pos[name] = pos++;
-        }
-        ::sort(elements.begin(), elements.end(), [&](const auto & lhs, const auto & rhs)
-        {
-            String lhs_name = lhs->getAliasOrColumnName();
-            String rhs_name = rhs->getAliasOrColumnName();
-            size_t lhs_pos = name_pos.size();
-            size_t rhs_pos = name_pos.size();
-            if (auto it = name_pos.find(lhs_name); it != name_pos.end())
-                lhs_pos = it->second;
-            if (auto it = name_pos.find(rhs_name); it != name_pos.end())
-                rhs_pos = it->second;
-            return lhs_pos < rhs_pos;
-        });
-    }
-
    for (const auto & elem : elements)
    {
        String name = elem->getAliasOrColumnName();

+        /// Columns that are presented in output_columns_positions should
+        /// appears in the same order in the new_elements, hence default
+        /// result_index goes after all elements of output_columns_positions
+        /// (it is for columns that are not located in
+        /// output_columns_positions, i.e. untuple())
+        size_t result_index = output_columns_positions.size() + new_elements_size;
+
+        /// Note, order of duplicated columns is not important here (since they
+        /// are the same), only order for unique columns is important, so it is
+        /// fine to use multimap here.
+        if (auto it = output_columns_positions.find(name); it != output_columns_positions.end())
+        {
+            result_index = it->second;
+            output_columns_positions.erase(it);
+        }
+
        auto it = required_columns_with_duplicate_count.find(name);
        if (required_columns_with_duplicate_count.end() != it && it->second)
        {
-            new_elements.push_back(elem);
+            new_elements[result_index] = elem;
            --it->second;
+            ++new_elements_size;
        }
        else if (select_query->distinct || hasArrayJoin(elem))
        {
            /// ARRAY JOIN cannot be optimized out since it may change number of rows,
            /// so as DISTINCT.
-            new_elements.push_back(elem);
+            new_elements[result_index] = elem;
+            ++new_elements_size;
        }
        else
        {
@ -508,13 +510,20 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
            /// Never remove untuple. It's result column may be in required columns.
            /// It is not easy to analyze untuple here, because types were not calculated yet.
            if (func && func->name == "untuple")
-                new_elements.push_back(elem);
-
+            {
+                new_elements[result_index] = elem;
+                ++new_elements_size;
+            }
            /// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong
            if (func && AggregateUtils::isAggregateFunction(*func) && !select_query->groupBy())
-                new_elements.push_back(elem);
+            {
+                new_elements[result_index] = elem;
+                ++new_elements_size;
+            }
        }
    }
+    /// Remove empty nodes.
+    std::erase(new_elements, ASTPtr{});

    if (select_query->interpolate())
    {
@ -1193,7 +1202,6 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(

    size_t subquery_depth = select_options.subquery_depth;
    bool remove_duplicates = select_options.remove_duplicates;
-    bool reorder_columns_as_required_header = select_options.reorder_columns_as_required_header;

    const auto & settings = getContext()->getSettingsRef();

@ -1245,7 +1253,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
    /// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
    /// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost)
    ///  and before 'executeScalarSubqueries', 'analyzeAggregation', etc. to avoid excessive calculations.
-    removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates, reorder_columns_as_required_header);
+    removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates);

    /// Executing scalar subqueries - replacing them with constant values.
    executeScalarSubqueries(query, getContext(), subquery_depth, result.scalars, result.local_scalars, select_options.only_analyze);
--- a/tests/queries/0_stateless/00597_push_down_predicate_long.reference
+++ b/tests/queries/0_stateless/00597_push_down_predicate_long.reference
@ -402,8 +402,8 @@ FROM
 ANY LEFT JOIN
 (
    SELECT
-        date,
        id,
+        date,
        name,
        value
    FROM test_00597
@ -472,8 +472,8 @@ FROM
 ANY LEFT JOIN
 (
    SELECT
-        date,
        id,
+        date,
        name,
        value
    FROM test_00597
@ -537,10 +537,10 @@ FROM
    ANY LEFT JOIN
    (
        SELECT
-            date,
-            id,
            name,
-            value
+            value,
+            date,
+            id
        FROM test_00597
    ) AS b ON id = b.id
    WHERE id = 1
@ -567,8 +567,8 @@ FROM
 SEMI LEFT JOIN
 (
    SELECT
-        date,
        id,
+        date,
        name,
        value
    FROM
--- a/tests/queries/0_stateless/02340_union_header.reference
+++ b/tests/queries/0_stateless/02340_union_header.reference
@ -0,0 +1,7 @@
+-- { echo }
+SELECT a, b, c FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ORDER BY a;
+-2	\N	-2
+3	2147483647	1048575
+SELECT a, b, c, d FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ALL LEFT JOIN (SELECT 100 AS a, -9223372036854775808 AS b, NULL AS d UNION ALL SELECT 256, 256, NULL) AS js2 USING (a, b) ORDER BY a DESC NULLS FIRST, '-0.02' ASC, b ASC NULLS FIRST, c DESC NULLS FIRST, 1048575 ASC NULLS LAST, d DESC SETTINGS enable_positional_arguments=0;
+3	2147483647	1048575	\N
+-2	\N	-2	\N
--- a/tests/queries/0_stateless/02340_union_header.sql
+++ b/tests/queries/0_stateless/02340_union_header.sql
@ -0,0 +1,3 @@
+-- { echo }
+SELECT a, b, c FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ORDER BY a;
+SELECT a, b, c, d FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ALL LEFT JOIN (SELECT 100 AS a, -9223372036854775808 AS b, NULL AS d UNION ALL SELECT 256, 256, NULL) AS js2 USING (a, b) ORDER BY a DESC NULLS FIRST, '-0.02' ASC, b ASC NULLS FIRST, c DESC NULLS FIRST, 1048575 ASC NULLS LAST, d DESC SETTINGS enable_positional_arguments=0;