Merge pull request #37887 from azat/union-fix

Fix incorrect columns order in subqueries of UNION
This commit is contained in:
Dmitry Novik 2022-06-29 20:40:23 +02:00 committed by GitHub
commit 4f7967ab9e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 74 additions and 53 deletions

View File

@ -52,10 +52,6 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(
if (!num_children)
throw Exception("Logical error: no children in ASTSelectWithUnionQuery", ErrorCodes::LOGICAL_ERROR);
/// This is required for UNION to match headers correctly.
if (num_children > 1)
options.reorderColumns();
/// Note that we pass 'required_result_column_names' to first SELECT.
/// And for the rest, we pass names at the corresponding positions of 'required_result_column_names' in the result of first SELECT,
/// because names could be different.
@ -163,7 +159,22 @@ InterpreterSelectWithUnionQuery::InterpreterSelectWithUnionQuery(
{
Blocks headers(num_children);
for (size_t query_num = 0; query_num < num_children; ++query_num)
{
headers[query_num] = nested_interpreters[query_num]->getSampleBlock();
const auto & current_required_result_column_names = required_result_column_names_for_other_selects[query_num];
if (!current_required_result_column_names.empty())
{
const auto & header_columns = headers[query_num].getNames();
if (current_required_result_column_names != header_columns)
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Different order of columns in UNION subquery: {} and {}",
fmt::join(current_required_result_column_names, ", "),
fmt::join(header_columns, ", "));
}
}
}
result_header = getCommonHeaderForUnion(headers);
}

View File

@ -31,8 +31,6 @@ struct SelectQueryOptions
bool only_analyze = false;
bool modify_inplace = false;
bool remove_duplicates = false;
/// This is required for UNION to match headers correctly.
bool reorder_columns_as_required_header = false;
bool ignore_quota = false;
bool ignore_limits = false;
/// This flag is needed to analyze query ignoring table projections.
@ -99,12 +97,6 @@ struct SelectQueryOptions
return *this;
}
SelectQueryOptions & reorderColumns(bool value = true)
{
reorder_columns_as_required_header = value;
return *this;
}
SelectQueryOptions & noSubquery()
{
subquery_depth = 0;

View File

@ -429,21 +429,28 @@ void renameDuplicatedColumns(const ASTSelectQuery * select_query)
/// This is the case when we have DISTINCT or arrayJoin: we require more columns in SELECT even if we need less columns in result.
/// Also we have to remove duplicates in case of GLOBAL subqueries. Their results are placed into tables so duplicates are impossible.
/// Also remove all INTERPOLATE columns which are not in SELECT anymore.
void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups, bool reorder_columns_as_required_header)
void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const Names & required_result_columns, bool remove_dups)
{
ASTs & elements = select_query->select()->children;
std::map<String, size_t> required_columns_with_duplicate_count;
std::unordered_map<String, size_t> required_columns_with_duplicate_count;
/// Order of output columns should match order in required_result_columns,
/// otherwise UNION queries may have incorrect header when subselect has duplicated columns.
///
/// NOTE: multimap is required since there can be duplicated column names.
std::unordered_multimap<String, size_t> output_columns_positions;
if (!required_result_columns.empty())
{
/// Some columns may be queried multiple times, like SELECT x, y, y FROM table.
for (const auto & name : required_result_columns)
for (size_t i = 0; i < required_result_columns.size(); ++i)
{
const auto & name = required_result_columns[i];
if (remove_dups)
required_columns_with_duplicate_count[name] = 1;
else
++required_columns_with_duplicate_count[name];
output_columns_positions.emplace(name, i);
}
}
else if (remove_dups)
@ -455,49 +462,44 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
else
return;
ASTs new_elements;
new_elements.reserve(elements.size());
ASTs new_elements(elements.size() + output_columns_positions.size());
size_t new_elements_size = 0;
NameSet remove_columns;
/// Resort columns according to required_result_columns.
if (reorder_columns_as_required_header && !required_result_columns.empty())
{
std::unordered_map<String, size_t> name_pos;
{
size_t pos = 0;
for (const auto & name : required_result_columns)
name_pos[name] = pos++;
}
::sort(elements.begin(), elements.end(), [&](const auto & lhs, const auto & rhs)
{
String lhs_name = lhs->getAliasOrColumnName();
String rhs_name = rhs->getAliasOrColumnName();
size_t lhs_pos = name_pos.size();
size_t rhs_pos = name_pos.size();
if (auto it = name_pos.find(lhs_name); it != name_pos.end())
lhs_pos = it->second;
if (auto it = name_pos.find(rhs_name); it != name_pos.end())
rhs_pos = it->second;
return lhs_pos < rhs_pos;
});
}
for (const auto & elem : elements)
{
String name = elem->getAliasOrColumnName();
/// Columns that are presented in output_columns_positions should
/// appears in the same order in the new_elements, hence default
/// result_index goes after all elements of output_columns_positions
/// (it is for columns that are not located in
/// output_columns_positions, i.e. untuple())
size_t result_index = output_columns_positions.size() + new_elements_size;
/// Note, order of duplicated columns is not important here (since they
/// are the same), only order for unique columns is important, so it is
/// fine to use multimap here.
if (auto it = output_columns_positions.find(name); it != output_columns_positions.end())
{
result_index = it->second;
output_columns_positions.erase(it);
}
auto it = required_columns_with_duplicate_count.find(name);
if (required_columns_with_duplicate_count.end() != it && it->second)
{
new_elements.push_back(elem);
new_elements[result_index] = elem;
--it->second;
++new_elements_size;
}
else if (select_query->distinct || hasArrayJoin(elem))
{
/// ARRAY JOIN cannot be optimized out since it may change number of rows,
/// so as DISTINCT.
new_elements.push_back(elem);
new_elements[result_index] = elem;
++new_elements_size;
}
else
{
@ -508,13 +510,20 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
/// Never remove untuple. It's result column may be in required columns.
/// It is not easy to analyze untuple here, because types were not calculated yet.
if (func && func->name == "untuple")
new_elements.push_back(elem);
{
new_elements[result_index] = elem;
++new_elements_size;
}
/// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong
if (func && AggregateUtils::isAggregateFunction(*func) && !select_query->groupBy())
new_elements.push_back(elem);
{
new_elements[result_index] = elem;
++new_elements_size;
}
}
}
/// Remove empty nodes.
std::erase(new_elements, ASTPtr{});
if (select_query->interpolate())
{
@ -1193,7 +1202,6 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
size_t subquery_depth = select_options.subquery_depth;
bool remove_duplicates = select_options.remove_duplicates;
bool reorder_columns_as_required_header = select_options.reorder_columns_as_required_header;
const auto & settings = getContext()->getSettingsRef();
@ -1245,7 +1253,7 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect(
/// Leave all selected columns in case of DISTINCT; columns that contain arrayJoin function inside.
/// Must be after 'normalizeTree' (after expanding aliases, for aliases not get lost)
/// and before 'executeScalarSubqueries', 'analyzeAggregation', etc. to avoid excessive calculations.
removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates, reorder_columns_as_required_header);
removeUnneededColumnsFromSelectClause(select_query, required_result_columns, remove_duplicates);
/// Executing scalar subqueries - replacing them with constant values.
executeScalarSubqueries(query, getContext(), subquery_depth, result.scalars, result.local_scalars, select_options.only_analyze);

View File

@ -402,8 +402,8 @@ FROM
ANY LEFT JOIN
(
SELECT
date,
id,
date,
name,
value
FROM test_00597
@ -472,8 +472,8 @@ FROM
ANY LEFT JOIN
(
SELECT
date,
id,
date,
name,
value
FROM test_00597
@ -537,10 +537,10 @@ FROM
ANY LEFT JOIN
(
SELECT
date,
id,
name,
value
value,
date,
id
FROM test_00597
) AS b ON id = b.id
WHERE id = 1
@ -567,8 +567,8 @@ FROM
SEMI LEFT JOIN
(
SELECT
date,
id,
date,
name,
value
FROM

View File

@ -0,0 +1,7 @@
-- { echo }
SELECT a, b, c FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ORDER BY a;
-2 \N -2
3 2147483647 1048575
SELECT a, b, c, d FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ALL LEFT JOIN (SELECT 100 AS a, -9223372036854775808 AS b, NULL AS d UNION ALL SELECT 256, 256, NULL) AS js2 USING (a, b) ORDER BY a DESC NULLS FIRST, '-0.02' ASC, b ASC NULLS FIRST, c DESC NULLS FIRST, 1048575 ASC NULLS LAST, d DESC SETTINGS enable_positional_arguments=0;
3 2147483647 1048575 \N
-2 \N -2 \N

View File

@ -0,0 +1,3 @@
-- { echo }
SELECT a, b, c FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ORDER BY a;
SELECT a, b, c, d FROM (SELECT 3 AS a, 2147483647 AS b, 1048575 AS c UNION ALL SELECT -2, NULL, -2) AS js1 ALL LEFT JOIN (SELECT 100 AS a, -9223372036854775808 AS b, NULL AS d UNION ALL SELECT 256, 256, NULL) AS js2 USING (a, b) ORDER BY a DESC NULLS FIRST, '-0.02' ASC, b ASC NULLS FIRST, c DESC NULLS FIRST, 1048575 ASC NULLS LAST, d DESC SETTINGS enable_positional_arguments=0;