Fix incorrect columns order in subqueries of UNION

Consider the following query:

    SELECT avgWeighted(x, y) FROM (SELECT NULL, 255 AS x, 1 AS y UNION ALL SELECT y, NULL AS x, 1 AS y)

Here is UNION from two SELECT queries
- `SELECT NULL, 255 AS x, 1 AS y`
- `SELECT y, NULL AS x, 1 AS y`

UNION queries matches columns by positions, not names, so the following
columns should be used by `avgWeighted()`:
- `255 AS x,  1 AS y`
- `NULL AS x, 1 AS y`

Result types of arguments should be:
- `x Nullable(UInt8)`
- `y UInt8`

And in case of UNION query is a subselect itself, it will return only
required columns, for the example above it needs only `x` and `y`.
For this it will get positions of these arguments from the first query,
and then use those positions to get required column names from the
second query (since there is no ability to get columns by positions
instead of names internally), and due to duplicated columns the second
query will return (`y`, `x`) not (`x`, `y`), and this will make the
result incorrect:

    EXPLAIN header = 1, optimize = 0, actions=1 SELECT avgWeighted(x, y) FROM (SELECT NULL, 255 AS x, 1 AS y UNION ALL SELECT y, NULL AS x, 1 AS y)

    Aggregates:
        avgWeighted(x, y)
          Function: avgWeighted(Nullable(UInt8), UInt8) → Nullable(Float64)
          Arguments: x, y
          Argument positions: 0, 1
      Expression (Before GROUP BY)
      Header: x UInt8
              y Nullable(UInt8)
      ...
      Union
      Header: x UInt8
              y Nullable(UInt8)
        Expression (Conversion before UNION)
        Header: x UInt8
                y Nullable(UInt8)
        Expression (Conversion before UNION)
        Header: x UInt8
                y Nullable(UInt8)

And the query itself fails with an error:

    Logical error: 'Bad cast from type DB::ColumnVector<char8_t> to DB::ColumnNullable'.

_NOTE: `avgWeighted()` here is required to trigger `LOGICAL_ERROR`_

CI: https://s3.amazonaws.com/clickhouse-test-reports/37796/e637489f81768df582fe7389e57f7ed12893087c/fuzzer_astfuzzerdebug,actions//report.html
Fixes: 02227_union_match_by_name
v2: fix untuple() (reserve space for output_columns_positions too)
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
This commit is contained in:
Azat Khuzhin 2022-06-06 16:11:13 +03:00
parent 563fac5333
commit d98336ad83

View File

@ -433,17 +433,24 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
{
ASTs & elements = select_query->select()->children;
std::map<String, size_t> required_columns_with_duplicate_count;
std::unordered_map<String, size_t> required_columns_with_duplicate_count;
/// Order of output columns should match order in required_result_columns,
/// otherwise UNION queries may have incorrect header when subselect has duplicated columns.
///
/// NOTE: multimap is required since there can be duplicated column names.
std::unordered_multimap<String, size_t> output_columns_positions;
if (!required_result_columns.empty())
{
/// Some columns may be queried multiple times, like SELECT x, y, y FROM table.
for (const auto & name : required_result_columns)
for (size_t i = 0; i < required_result_columns.size(); ++i)
{
const auto & name = required_result_columns[i];
if (remove_dups)
required_columns_with_duplicate_count[name] = 1;
else
++required_columns_with_duplicate_count[name];
output_columns_positions.emplace(name, i);
}
}
else if (remove_dups)
@ -455,8 +462,8 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
else
return;
ASTs new_elements;
new_elements.reserve(elements.size());
ASTs new_elements(elements.size() + output_columns_positions.size());
size_t new_elements_size = 0;
NameSet remove_columns;
@ -464,17 +471,35 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
{
String name = elem->getAliasOrColumnName();
/// Columns that are presented in output_columns_positions should
/// appears in the same order in the new_elements, hence default
/// result_index goes after all elements of output_columns_positions
/// (it is for columns that are not located in
/// output_columns_positions, i.e. untuple())
size_t result_index = output_columns_positions.size() + new_elements_size;
/// Note, order of duplicated columns is not important here (since they
/// are the same), only order for unique columns is important, so it is
/// fine to use multimap here.
if (auto it = output_columns_positions.find(name); it != output_columns_positions.end())
{
result_index = it->second;
output_columns_positions.erase(it);
}
auto it = required_columns_with_duplicate_count.find(name);
if (required_columns_with_duplicate_count.end() != it && it->second)
{
new_elements.push_back(elem);
new_elements[result_index] = elem;
--it->second;
++new_elements_size;
}
else if (select_query->distinct || hasArrayJoin(elem))
{
/// ARRAY JOIN cannot be optimized out since it may change number of rows,
/// so as DISTINCT.
new_elements.push_back(elem);
new_elements[result_index] = elem;
++new_elements_size;
}
else
{
@ -485,13 +510,20 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
/// Never remove untuple. It's result column may be in required columns.
/// It is not easy to analyze untuple here, because types were not calculated yet.
if (func && func->name == "untuple")
new_elements.push_back(elem);
{
new_elements[result_index] = elem;
++new_elements_size;
}
/// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong
if (func && AggregateUtils::isAggregateFunction(*func) && !select_query->groupBy())
new_elements.push_back(elem);
{
new_elements[result_index] = elem;
++new_elements_size;
}
}
}
/// Remove empty nodes.
std::erase(new_elements, ASTPtr{});
if (select_query->interpolate())
{