Remove duplicate ORDER BY and DISTINCT from subqueries (#10067)

This commit is contained in:
Mikhail Malafeev 2020-06-15 17:03:01 +05:00 committed by GitHub
parent 20b8e1e9ec
commit 10566e2b43
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 388 additions and 0 deletions

View File

@ -360,6 +360,7 @@ struct Settings : public SettingsCollection<Settings>
M(SettingBool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ M(SettingBool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \
M(SettingUInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ M(SettingUInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \
M(SettingBool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \ M(SettingBool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \
M(SettingBool, optimize_duplicate_order_by_and_distinct, true, "Remove duplicate ORDER BY and DISTINCT if it's possible", 0) \
M(SettingBool, optimize_if_chain_to_miltiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ M(SettingBool, optimize_if_chain_to_miltiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \
M(SettingBool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \ M(SettingBool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \
M(SettingBool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \ M(SettingBool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \

View File

@ -0,0 +1,72 @@
#pragma once
#include <Functions/FunctionFactory.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/IAST.h>
#include <Common/typeid_cast.h>
namespace DB
{
/// Removes duplicate DISTINCT from queries.
class DuplicateDistinctMatcher
{
public:
struct Data
{
bool is_distinct;
std::vector<String> last_ids;
};
static void visit(const ASTPtr & ast, Data & data)
{
auto * select_query = ast->as<ASTSelectQuery>();
if (select_query)
visit(*select_query, data);
}
static void visit(ASTSelectQuery & select_query, Data & data)
{
if (!select_query.distinct || !select_query.select())
return;
/// Optimize shouldn't work for distributed tables
for (const auto & elem : select_query.children)
{
if (elem->as<ASTSetQuery>() && !elem->as<ASTSetQuery>()->is_standalone)
return;
}
auto expression_list = select_query.select();
std::vector<String> current_ids;
if (expression_list->children.empty())
return;
current_ids.reserve(expression_list->children.size());
for (const auto & id : expression_list->children)
current_ids.push_back(id->getColumnName());
if (data.is_distinct && current_ids == data.last_ids)
select_query.distinct = false;
data.is_distinct = true;
data.last_ids = std::move(current_ids);
}
static bool needChildVisit(const ASTPtr &, const ASTPtr &)
{
return true;
}
};
using DuplicateDistinctVisitor = InDepthNodeVisitor<DuplicateDistinctMatcher, false>;
}

View File

@ -0,0 +1,127 @@
#pragma once
#include <Functions/FunctionFactory.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/IAST.h>
#include <Common/typeid_cast.h>
namespace DB
{
/// Checks if SELECT has stateful functions
class ASTFunctionStatefulData
{
public:
using TypeToVisit = ASTFunction;
const Context & context;
bool & is_stateful;
void visit(ASTFunction & ast_function, ASTPtr &)
{
if (ast_function.name == "any" || ast_function.name == "groupArray")
{
is_stateful = true;
return;
}
const auto & function = FunctionFactory::instance().tryGet(ast_function.name, context);
if (function && function->isStateful())
{
is_stateful = true;
return;
}
}
};
using ASTFunctionStatefulMatcher = OneTypeMatcher<ASTFunctionStatefulData>;
using ASTFunctionStatefulVisitor = InDepthNodeVisitor<ASTFunctionStatefulMatcher, true>;
/// Erases unnecessary ORDER BY from subquery
class DuplicateOrderByFromSubqueriesData
{
public:
using TypeToVisit = ASTSelectQuery;
bool done = false;
void visit(ASTSelectQuery & select_query, ASTPtr &)
{
if (done)
return;
if (select_query.orderBy() && !select_query.limitBy() && !select_query.limitByOffset() &&
!select_query.limitByLength() && !select_query.limitLength() && !select_query.limitOffset())
{
select_query.setExpression(ASTSelectQuery::Expression::ORDER_BY, nullptr);
}
done = true;
}
};
using DuplicateOrderByFromSubqueriesMatcher = OneTypeMatcher<DuplicateOrderByFromSubqueriesData>;
using DuplicateOrderByFromSubqueriesVisitor = InDepthNodeVisitor<DuplicateOrderByFromSubqueriesMatcher, true>;
/// Finds SELECT that can be optimized
class DuplicateOrderByData
{
public:
using TypeToVisit = ASTSelectQuery;
const Context & context;
bool done = false;
void visit(ASTSelectQuery & select_query, ASTPtr &)
{
if (done)
return;
/// Disable optimization for distributed tables
for (const auto & elem : select_query.children)
{
if (elem->as<ASTSetQuery>() && !elem->as<ASTSetQuery>()->is_standalone)
return;
}
if (select_query.orderBy() || select_query.groupBy())
{
for (auto & elem : select_query.children)
{
if (elem->as<ASTExpressionList>())
{
bool is_stateful = false;
ASTFunctionStatefulVisitor::Data data{context, is_stateful};
ASTFunctionStatefulVisitor(data).visit(elem);
if (is_stateful)
return;
}
}
if (auto select_table_ptr = select_query.tables())
{
if (auto * select_table = select_table_ptr->as<ASTTablesInSelectQuery>())
{
if (!select_table->children.empty())
{
DuplicateOrderByFromSubqueriesVisitor::Data data{false};
DuplicateOrderByFromSubqueriesVisitor(data).visit(select_table->children[0]);
}
}
}
}
}
};
using DuplicateOrderByMatcher = OneTypeMatcher<DuplicateOrderByData>;
using DuplicateOrderByVisitor = InDepthNodeVisitor<DuplicateOrderByMatcher, true>;
}

View File

@ -23,12 +23,15 @@
#include <Interpreters/getTableExpressions.h> #include <Interpreters/getTableExpressions.h>
#include <Interpreters/OptimizeIfChains.h> #include <Interpreters/OptimizeIfChains.h>
#include <Interpreters/ArithmeticOperationsInAgrFuncOptimize.h> #include <Interpreters/ArithmeticOperationsInAgrFuncOptimize.h>
#include <Interpreters/DuplicateDistinctVisitor.h>
#include <Interpreters/DuplicateOrderByVisitor.h>
#include <Parsers/ASTExpressionList.h> #include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h> #include <Parsers/ASTFunction.h>
#include <Parsers/ASTLiteral.h> #include <Parsers/ASTLiteral.h>
#include <Parsers/ASTOrderByElement.h> #include <Parsers/ASTOrderByElement.h>
#include <Parsers/ASTSelectQuery.h> #include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h> #include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/queryToString.h> #include <Parsers/queryToString.h>
@ -370,6 +373,18 @@ void optimizeOrderBy(const ASTSelectQuery * select_query)
elems = std::move(unique_elems); elems = std::move(unique_elems);
} }
/// Optimize duplicate ORDER BY and DISTINCT
void optimizeDuplicateOrderByAndDistinct(ASTPtr & query, bool optimize_duplicate_order_by_and_distinct, const Context & context)
{
if (optimize_duplicate_order_by_and_distinct)
{
DuplicateOrderByVisitor::Data order_by_data{context, false};
DuplicateOrderByVisitor(order_by_data).visit(query);
DuplicateDistinctVisitor::Data distinct_data{};
DuplicateDistinctVisitor(distinct_data).visit(query);
}
}
/// Remove duplicate items from LIMIT BY. /// Remove duplicate items from LIMIT BY.
void optimizeLimitBy(const ASTSelectQuery * select_query) void optimizeLimitBy(const ASTSelectQuery * select_query)
{ {
@ -831,6 +846,9 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyzeSelect(
/// Remove duplicate items from ORDER BY. /// Remove duplicate items from ORDER BY.
optimizeOrderBy(select_query); optimizeOrderBy(select_query);
/// Remove duplicate ORDER BY and DISTINCT from subqueries.
optimizeDuplicateOrderByAndDistinct(query, settings.optimize_duplicate_order_by_and_distinct, context);
/// Remove duplicated elements from LIMIT BY clause. /// Remove duplicated elements from LIMIT BY clause.
optimizeLimitBy(select_query); optimizeLimitBy(select_query);

View File

@ -0,0 +1,10 @@
<test>
<preconditions>
<table_exists>hits_10m_single</table_exists>
</preconditions>
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY EventDate, CounterID FORMAT Null</query>
<query>SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single) FORMAT Null</query>
<query>SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY toStartOfWeek(EventDate) FORMAT Null</query>
</test>

View File

@ -0,0 +1,14 @@
SELECT number\nFROM \n(\n SELECT number\n FROM \n (\n SELECT DISTINCT number\n FROM numbers(3)\n )\n)\nORDER BY number ASC
0
1
2
SELECT DISTINCT number\nFROM \n(\n SELECT DISTINCT number\n FROM \n (\n SELECT DISTINCT number\n FROM numbers(3)\n ORDER BY number ASC\n )\n ORDER BY number ASC\n)\nORDER BY number ASC
0
1
2
SELECT number\nFROM \n(\n SELECT DISTINCT number\n FROM \n (\n SELECT DISTINCT number % 2 AS number\n FROM numbers(3)\n )\n)\nORDER BY number ASC
0
1
SELECT DISTINCT number\nFROM \n(\n SELECT DISTINCT number\n FROM \n (\n SELECT DISTINCT number % 2 AS number\n FROM numbers(3)\n ORDER BY number ASC\n )\n ORDER BY number ASC\n)\nORDER BY number ASC
0
1

View File

@ -0,0 +1,124 @@
set enable_debug_queries = 1;
set optimize_duplicate_order_by_and_distinct = 1;
analyze SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
set optimize_duplicate_order_by_and_distinct = 0;
analyze SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
set optimize_duplicate_order_by_and_distinct = 1;
analyze SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
set optimize_duplicate_order_by_and_distinct = 0;
analyze SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;
SELECT DISTINCT *
FROM
(
SELECT DISTINCT *
FROM
(
SELECT DISTINCT number % 2
AS number
FROM numbers(3)
ORDER BY number
)
ORDER BY number
)
ORDER BY number;

View File

@ -0,0 +1,20 @@
set optimize_duplicate_order_by_and_distinct = 1;
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM remote('127.0.0.{1,2}', system.numbers)
LIMIT 1
SETTINGS distributed_group_by_no_merge = 1
);
set optimize_duplicate_order_by_and_distinct = 0;
SELECT DISTINCT number
FROM
(
SELECT DISTINCT number
FROM remote('127.0.0.{1,2}', system.numbers)
LIMIT 1
SETTINGS distributed_group_by_no_merge = 1
);