rewrite duplicate distinct optimization

This commit is contained in:
Artem Zuikov 2020-08-20 20:04:42 +03:00
parent 7445ab94bf
commit 911e6efe3e
4 changed files with 271 additions and 78 deletions

View File

@ -1,72 +0,0 @@
#pragma once
#include <Functions/FunctionFactory.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/IAST.h>
#include <Common/typeid_cast.h>
namespace DB
{
/// Removes duplicate DISTINCT from queries.
class DuplicateDistinctMatcher
{
public:
struct Data
{
bool is_distinct;
std::vector<String> last_ids;
};
static void visit(const ASTPtr & ast, Data & data)
{
auto * select_query = ast->as<ASTSelectQuery>();
if (select_query)
visit(*select_query, data);
}
static void visit(ASTSelectQuery & select_query, Data & data)
{
if (!select_query.distinct || !select_query.select())
return;
/// Optimize shouldn't work for distributed tables
for (const auto & elem : select_query.children)
{
if (elem->as<ASTSetQuery>() && !elem->as<ASTSetQuery>()->is_standalone)
return;
}
auto expression_list = select_query.select();
std::vector<String> current_ids;
if (expression_list->children.empty())
return;
current_ids.reserve(expression_list->children.size());
for (const auto & id : expression_list->children)
current_ids.push_back(id->getColumnName());
if (data.is_distinct && current_ids == data.last_ids)
select_query.distinct = false;
data.is_distinct = true;
data.last_ids = std::move(current_ids);
}
static bool needChildVisit(const ASTPtr &, const ASTPtr &)
{
return true;
}
};
using DuplicateDistinctVisitor = InDepthNodeVisitor<DuplicateDistinctMatcher, false>;
}

View File

@ -4,7 +4,6 @@
#include <Interpreters/OptimizeIfChains.h>
#include <Interpreters/OptimizeIfWithConstantConditionVisitor.h>
#include <Interpreters/ArithmeticOperationsInAgrFuncOptimize.h>
#include <Interpreters/DuplicateDistinctVisitor.h>
#include <Interpreters/DuplicateOrderByVisitor.h>
#include <Interpreters/GroupByFunctionKeysVisitor.h>
#include <Interpreters/AggregateFunctionOfGroupByKeysVisitor.h>
@ -22,6 +21,8 @@
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSubquery.h>
#include <Parsers/ASTSelectWithUnionQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Functions/FunctionFactory.h>
@ -311,13 +312,118 @@ void optimizeDuplicatesInOrderBy(const ASTSelectQuery * select_query)
elems = std::move(unique_elems);
}
/// Optimize duplicate ORDER BY and DISTINCT
void optimizeDuplicateOrderByAndDistinct(ASTPtr & query, const Context & context)
/// Optimize duplicate ORDER BY
void optimizeDuplicateOrderBy(ASTPtr & query, const Context & context)
{
DuplicateOrderByVisitor::Data order_by_data{context};
DuplicateOrderByVisitor(order_by_data).visit(query);
DuplicateDistinctVisitor::Data distinct_data{};
DuplicateDistinctVisitor(distinct_data).visit(query);
}
/// Return simple subselect (without UNIONs or JOINs) if any
const ASTSelectQuery * getSimpleSubselect(const ASTSelectQuery & select)
{
const auto & tables = select.tables()->children;
if (tables.empty() || tables.size() != 1)
return nullptr;
const auto & ast_table_expression = tables[0]->as<ASTTablesInSelectQueryElement>()->table_expression;
if (!ast_table_expression)
return nullptr;
const auto & table_expression = ast_table_expression->as<ASTTableExpression>();
if (!table_expression->subquery)
return nullptr;
const auto & subquery = table_expression->subquery->as<ASTSubquery>();
if (!subquery || subquery->children.size() != 1)
return nullptr;
const auto & subselect_union = subquery->children[0]->as<ASTSelectWithUnionQuery>();
if (!subselect_union || !subselect_union->list_of_selects ||
subselect_union->list_of_selects->children.size() != 1)
return nullptr;
return subselect_union->list_of_selects->children[0]->as<ASTSelectQuery>();
}
std::unordered_set<String> getDistinctNames(const ASTSelectQuery & select)
{
if (!select.select() || select.select()->children.empty())
return {};
std::unordered_set<String> names;
std::unordered_set<String> implicit_distinct;
if (!select.distinct)
{
/// SELECT a, b FROM (SELECT DISTINCT a FROM ...)
if (const ASTSelectQuery * subselect = getSimpleSubselect(select))
implicit_distinct = getDistinctNames(*subselect);
if (implicit_distinct.empty())
return {};
}
/// Extract result column names (prefer aliases, ignore table name)
for (const auto & id : select.select()->children)
{
String alias = id->tryGetAlias();
if (const auto * identifier = id->as<ASTIdentifier>())
{
String name = identifier->shortName();
if (select.distinct || implicit_distinct.count(name))
{
if (alias.empty())
names.insert(name);
else
names.insert(alias);
}
}
else if (select.distinct && !alias.empty())
{
/// It's not possible to use getAliasOrColumnName() cause name is context specific (function arguments)
names.insert(alias);
}
}
return names;
}
/// Remove DISTINCT from query if columns are known as DISTINCT from subquery
void optimizeDuplicateDistinct(ASTSelectQuery & select)
{
if (!select.select() || select.select()->children.empty())
return;
const ASTSelectQuery * subselect = getSimpleSubselect(select);
if (!subselect)
return;
std::unordered_set<String> distinct_names = getDistinctNames(*subselect);
std::unordered_set<String> selected_names;
/// Check source column names from select list (ignore aliases and table names)
for (const auto & id : select.select()->children)
{
const auto * identifier = id->as<ASTIdentifier>();
if (!identifier)
return;
String name = identifier->shortName();
if (!distinct_names.count(name))
return; /// Not a distinct column, keep DISTINCT for it.
selected_names.insert(name);
}
/// select columns list != distinct columns list
/// SELECT DISTINCT a FROM (SELECT DISTINCT a, b FROM ...)) -- cannot remove DISTINCT
if (selected_names.size() != distinct_names.size())
return;
select.distinct = false;
}
/// Replace monotonous functions in ORDER BY if they don't participate in GROUP BY expression,
@ -537,7 +643,10 @@ void TreeOptimizer::apply(ASTPtr & query, Aliases & aliases, const NameSet & sou
/// Remove duplicate ORDER BY and DISTINCT from subqueries.
if (settings.optimize_duplicate_order_by_and_distinct)
optimizeDuplicateOrderByAndDistinct(query, context);
{
optimizeDuplicateOrderBy(query, context);
optimizeDuplicateDistinct(*select_query);
}
/// Remove functions from ORDER BY if its argument is also in ORDER BY
if (settings.optimize_redundant_functions_in_order_by)

View File

@ -0,0 +1,124 @@
SELECT DISTINCT number
FROM numbers(1)
SELECT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
)
SELECT DISTINCT number * 2
FROM
(
SELECT DISTINCT
number * 2,
number
FROM numbers(1)
)
SELECT number
FROM
(
SELECT DISTINCT number * 2 AS number
FROM numbers(1)
)
SELECT
b,
a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
SELECT DISTINCT a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
SELECT a
FROM
(
SELECT DISTINCT a
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT DISTINCT a
FROM
(
SELECT
a,
b
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT
a,
b
FROM
(
SELECT
b,
a
FROM
(
SELECT DISTINCT
number AS a,
number AS b
FROM numbers(1)
)
)
SELECT
a,
b
FROM
(
SELECT
b,
a,
a + b
FROM
(
SELECT DISTINCT
number % 2 AS a,
number % 3 AS b
FROM numbers(100)
)
)
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
) AS t1
CROSS JOIN numbers(2) AS t2
SELECT number
FROM
(
SELECT DISTINCT number
FROM numbers(1) AS t1
CROSS JOIN numbers(2) AS t2
)
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM numbers(1)
UNION ALL
SELECT DISTINCT number
FROM numbers(2)
)
0
1

View File

@ -0,0 +1,32 @@
SET enable_debug_queries = 1;
SET optimize_duplicate_order_by_and_distinct = 1;
ANALYZE SELECT DISTINCT number FROM numbers(1);
ANALYZE SELECT DISTINCT number FROM (SELECT DISTINCT number FROM numbers(1));
ANALYZE SELECT DISTINCT number * 2 FROM (SELECT DISTINCT number * 2, number FROM numbers(1));
ANALYZE SELECT DISTINCT number FROM (SELECT DISTINCT number * 2 AS number FROM numbers(1));
ANALYZE SELECT DISTINCT b, a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100));
ANALYZE SELECT DISTINCT a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100));
ANALYZE SELECT DISTINCT a FROM (SELECT DISTINCT a FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
ANALYZE SELECT DISTINCT a FROM (SELECT DISTINCT a, b FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
ANALYZE SELECT DISTINCT a, b FROM (SELECT DISTINCT b, a FROM (SELECT DISTINCT number a, number b FROM numbers(1)));
ANALYZE SELECT DISTINCT a, b FROM (SELECT b, a, a + b FROM (SELECT DISTINCT number % 2 AS a, number % 3 AS b FROM numbers(100)));
ANALYZE SELECT DISTINCT number FROM (SELECT DISTINCT number FROM numbers(1)) t1 CROSS JOIN numbers(2) t2;
ANALYZE SELECT DISTINCT number FROM (SELECT DISTINCT number FROM numbers(1) t1 CROSS JOIN numbers(2) t2);
ANALYZE SELECT DISTINCT number FROM
(
(SELECT DISTINCT number FROM numbers(1))
UNION ALL
(SELECT DISTINCT number FROM numbers(2))
);
--
SELECT DISTINCT number FROM
(
(SELECT DISTINCT number FROM numbers(1))
UNION ALL
(SELECT DISTINCT number FROM numbers(2))
)
ORDER BY number;