Fix incorrect columns order in subqueries of UNION

Consider the following query: SELECT avgWeighted(x, y) FROM (SELECT NULL, 255 AS x, 1 AS y UNION ALL SELECT y, NULL AS x, 1 AS y) Here is UNION from two SELECT queries - `SELECT NULL, 255 AS x, 1 AS y` - `SELECT y, NULL AS x, 1 AS y` UNION queries matches columns by positions, not names, so the following columns should be used by `avgWeighted()`: - `255 AS x, 1 AS y` - `NULL AS x, 1 AS y` Result types of arguments should be: - `x Nullable(UInt8)` - `y UInt8` And in case of UNION query is a subselect itself, it will return only required columns, for the example above it needs only `x` and `y`. For this it will get positions of these arguments from the first query, and then use those positions to get required column names from the second query (since there is no ability to get columns by positions instead of names internally), and due to duplicated columns the second query will return (`y`, `x`) not (`x`, `y`), and this will make the result incorrect: EXPLAIN header = 1, optimize = 0, actions=1 SELECT avgWeighted(x, y) FROM (SELECT NULL, 255 AS x, 1 AS y UNION ALL SELECT y, NULL AS x, 1 AS y) Aggregates: avgWeighted(x, y) Function: avgWeighted(Nullable(UInt8), UInt8) → Nullable(Float64) Arguments: x, y Argument positions: 0, 1 Expression (Before GROUP BY) Header: x UInt8 y Nullable(UInt8) ... Union Header: x UInt8 y Nullable(UInt8) Expression (Conversion before UNION) Header: x UInt8 y Nullable(UInt8) Expression (Conversion before UNION) Header: x UInt8 y Nullable(UInt8) And the query itself fails with an error: Logical error: 'Bad cast from type DB::ColumnVector<char8_t> to DB::ColumnNullable'. _NOTE: `avgWeighted()` here is required to trigger `LOGICAL_ERROR`_ CI: https://s3.amazonaws.com/clickhouse-test-reports/37796/e637489f81768df582fe7389e57f7ed12893087c/fuzzer_astfuzzerdebug,actions//report.html Fixes: 02227_union_match_by_name v2: fix untuple() (reserve space for output_columns_positions too) Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2024-09-20 08:40:50 +00:00 · 2022-06-06 16:11:13 +03:00 · 2022-06-06 16:11:13 +03:00 · d98336ad83
commit d98336ad83
parent 563fac5333
1 changed files with 41 additions and 9 deletions
--- a/src/Interpreters/TreeRewriter.cpp
+++ b/src/Interpreters/TreeRewriter.cpp
@ -433,17 +433,24 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
 {
    ASTs & elements = select_query->select()->children;

-    std::map<String, size_t> required_columns_with_duplicate_count;
+    std::unordered_map<String, size_t> required_columns_with_duplicate_count;
+    /// Order of output columns should match order in required_result_columns,
+    /// otherwise UNION queries may have incorrect header when subselect has duplicated columns.
+    ///
+    /// NOTE: multimap is required since there can be duplicated column names.
+    std::unordered_multimap<String, size_t> output_columns_positions;

    if (!required_result_columns.empty())
    {
        /// Some columns may be queried multiple times, like SELECT x, y, y FROM table.
-        for (const auto & name : required_result_columns)
+        for (size_t i = 0; i < required_result_columns.size(); ++i)
        {
+            const auto & name = required_result_columns[i];
            if (remove_dups)
                required_columns_with_duplicate_count[name] = 1;
            else
                ++required_columns_with_duplicate_count[name];
+            output_columns_positions.emplace(name, i);
        }
    }
    else if (remove_dups)
@ -455,8 +462,8 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
    else
        return;

-    ASTs new_elements;
-    new_elements.reserve(elements.size());
+    ASTs new_elements(elements.size() + output_columns_positions.size());
+    size_t new_elements_size = 0;

    NameSet remove_columns;

@ -464,17 +471,35 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
    {
        String name = elem->getAliasOrColumnName();

+        /// Columns that are presented in output_columns_positions should
+        /// appears in the same order in the new_elements, hence default
+        /// result_index goes after all elements of output_columns_positions
+        /// (it is for columns that are not located in
+        /// output_columns_positions, i.e. untuple())
+        size_t result_index = output_columns_positions.size() + new_elements_size;
+
+        /// Note, order of duplicated columns is not important here (since they
+        /// are the same), only order for unique columns is important, so it is
+        /// fine to use multimap here.
+        if (auto it = output_columns_positions.find(name); it != output_columns_positions.end())
+        {
+            result_index = it->second;
+            output_columns_positions.erase(it);
+        }
+
        auto it = required_columns_with_duplicate_count.find(name);
        if (required_columns_with_duplicate_count.end() != it && it->second)
        {
-            new_elements.push_back(elem);
+            new_elements[result_index] = elem;
            --it->second;
+            ++new_elements_size;
        }
        else if (select_query->distinct || hasArrayJoin(elem))
        {
            /// ARRAY JOIN cannot be optimized out since it may change number of rows,
            /// so as DISTINCT.
-            new_elements.push_back(elem);
+            new_elements[result_index] = elem;
+            ++new_elements_size;
        }
        else
        {
@ -485,13 +510,20 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
            /// Never remove untuple. It's result column may be in required columns.
            /// It is not easy to analyze untuple here, because types were not calculated yet.
            if (func && func->name == "untuple")
-                new_elements.push_back(elem);
-
+            {
+                new_elements[result_index] = elem;
+                ++new_elements_size;
+            }
            /// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong
            if (func && AggregateUtils::isAggregateFunction(*func) && !select_query->groupBy())
-                new_elements.push_back(elem);
+            {
+                new_elements[result_index] = elem;
+                ++new_elements_size;
+            }
        }
    }
+    /// Remove empty nodes.
+    std::erase(new_elements, ASTPtr{});

    if (select_query->interpolate())
    {