From 10566e2b43705364fc1d54224a5393e681f16a5b Mon Sep 17 00:00:00 2001 From: Mikhail Malafeev <50805089+demo-99@users.noreply.github.com> Date: Mon, 15 Jun 2020 17:03:01 +0500 Subject: [PATCH] Remove duplicate ORDER BY and DISTINCT from subqueries (#10067) --- src/Core/Settings.h | 1 + src/Interpreters/DuplicateDistinctVisitor.h | 72 ++++++++++ src/Interpreters/DuplicateOrderByVisitor.h | 127 ++++++++++++++++++ src/Interpreters/SyntaxAnalyzer.cpp | 18 +++ .../duplicate_order_by_and_distinct.xml | 10 ++ ..._duplicate_order_by_and_distinct.reference | 14 ++ .../01305_duplicate_order_by_and_distinct.sql | 124 +++++++++++++++++ ...t_optimize_for_distributed_table.reference | 2 + ...istinct_optimize_for_distributed_table.sql | 20 +++ 9 files changed, 388 insertions(+) create mode 100644 src/Interpreters/DuplicateDistinctVisitor.h create mode 100644 src/Interpreters/DuplicateOrderByVisitor.h create mode 100644 tests/performance/duplicate_order_by_and_distinct.xml create mode 100644 tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.reference create mode 100644 tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.sql create mode 100644 tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.reference create mode 100644 tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1e7728709ba..adc804c3a28 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -360,6 +360,7 @@ struct Settings : public SettingsCollection M(SettingBool, optimize_trivial_count_query, true, "Process trivial 'SELECT count() FROM table' query from metadata.", 0) \ M(SettingUInt64, mutations_sync, 0, "Wait for synchronous execution of ALTER TABLE UPDATE/DELETE queries (mutations). 0 - execute asynchronously. 1 - wait current server. 2 - wait all replicas if they exist.", 0) \ M(SettingBool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \ + M(SettingBool, optimize_duplicate_order_by_and_distinct, true, "Remove duplicate ORDER BY and DISTINCT if it's possible", 0) \ M(SettingBool, optimize_if_chain_to_miltiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ M(SettingBool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \ M(SettingBool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \ diff --git a/src/Interpreters/DuplicateDistinctVisitor.h b/src/Interpreters/DuplicateDistinctVisitor.h new file mode 100644 index 00000000000..9ce2624f5bd --- /dev/null +++ b/src/Interpreters/DuplicateDistinctVisitor.h @@ -0,0 +1,72 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Removes duplicate DISTINCT from queries. +class DuplicateDistinctMatcher +{ +public: + struct Data + { + bool is_distinct; + std::vector last_ids; + }; + + static void visit(const ASTPtr & ast, Data & data) + { + auto * select_query = ast->as(); + if (select_query) + visit(*select_query, data); + } + + static void visit(ASTSelectQuery & select_query, Data & data) + { + if (!select_query.distinct || !select_query.select()) + return; + + /// Optimize shouldn't work for distributed tables + for (const auto & elem : select_query.children) + { + if (elem->as() && !elem->as()->is_standalone) + return; + } + + auto expression_list = select_query.select(); + std::vector current_ids; + + if (expression_list->children.empty()) + return; + + current_ids.reserve(expression_list->children.size()); + for (const auto & id : expression_list->children) + current_ids.push_back(id->getColumnName()); + + if (data.is_distinct && current_ids == data.last_ids) + select_query.distinct = false; + + data.is_distinct = true; + data.last_ids = std::move(current_ids); + } + + static bool needChildVisit(const ASTPtr &, const ASTPtr &) + { + return true; + } + +}; + +using DuplicateDistinctVisitor = InDepthNodeVisitor; + +} diff --git a/src/Interpreters/DuplicateOrderByVisitor.h b/src/Interpreters/DuplicateOrderByVisitor.h new file mode 100644 index 00000000000..85f34377e54 --- /dev/null +++ b/src/Interpreters/DuplicateOrderByVisitor.h @@ -0,0 +1,127 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Checks if SELECT has stateful functions +class ASTFunctionStatefulData +{ +public: + using TypeToVisit = ASTFunction; + + const Context & context; + bool & is_stateful; + void visit(ASTFunction & ast_function, ASTPtr &) + { + if (ast_function.name == "any" || ast_function.name == "groupArray") + { + is_stateful = true; + return; + } + + const auto & function = FunctionFactory::instance().tryGet(ast_function.name, context); + + if (function && function->isStateful()) + { + is_stateful = true; + return; + } + } +}; + +using ASTFunctionStatefulMatcher = OneTypeMatcher; +using ASTFunctionStatefulVisitor = InDepthNodeVisitor; + + +/// Erases unnecessary ORDER BY from subquery +class DuplicateOrderByFromSubqueriesData +{ +public: + using TypeToVisit = ASTSelectQuery; + + bool done = false; + + void visit(ASTSelectQuery & select_query, ASTPtr &) + { + if (done) + return; + + if (select_query.orderBy() && !select_query.limitBy() && !select_query.limitByOffset() && + !select_query.limitByLength() && !select_query.limitLength() && !select_query.limitOffset()) + { + select_query.setExpression(ASTSelectQuery::Expression::ORDER_BY, nullptr); + } + + done = true; + } +}; + +using DuplicateOrderByFromSubqueriesMatcher = OneTypeMatcher; +using DuplicateOrderByFromSubqueriesVisitor = InDepthNodeVisitor; + + +/// Finds SELECT that can be optimized +class DuplicateOrderByData +{ +public: + using TypeToVisit = ASTSelectQuery; + + const Context & context; + bool done = false; + + void visit(ASTSelectQuery & select_query, ASTPtr &) + { + if (done) + return; + + /// Disable optimization for distributed tables + for (const auto & elem : select_query.children) + { + if (elem->as() && !elem->as()->is_standalone) + return; + } + + if (select_query.orderBy() || select_query.groupBy()) + { + for (auto & elem : select_query.children) + { + if (elem->as()) + { + bool is_stateful = false; + ASTFunctionStatefulVisitor::Data data{context, is_stateful}; + ASTFunctionStatefulVisitor(data).visit(elem); + if (is_stateful) + return; + } + } + + if (auto select_table_ptr = select_query.tables()) + { + if (auto * select_table = select_table_ptr->as()) + { + if (!select_table->children.empty()) + { + DuplicateOrderByFromSubqueriesVisitor::Data data{false}; + DuplicateOrderByFromSubqueriesVisitor(data).visit(select_table->children[0]); + } + } + } + } + } +}; + +using DuplicateOrderByMatcher = OneTypeMatcher; +using DuplicateOrderByVisitor = InDepthNodeVisitor; + +} diff --git a/src/Interpreters/SyntaxAnalyzer.cpp b/src/Interpreters/SyntaxAnalyzer.cpp index 8f6d368e6ad..4bfae18f9a5 100644 --- a/src/Interpreters/SyntaxAnalyzer.cpp +++ b/src/Interpreters/SyntaxAnalyzer.cpp @@ -23,12 +23,15 @@ #include #include #include +#include +#include #include #include #include #include #include +#include #include #include @@ -370,6 +373,18 @@ void optimizeOrderBy(const ASTSelectQuery * select_query) elems = std::move(unique_elems); } +/// Optimize duplicate ORDER BY and DISTINCT +void optimizeDuplicateOrderByAndDistinct(ASTPtr & query, bool optimize_duplicate_order_by_and_distinct, const Context & context) +{ + if (optimize_duplicate_order_by_and_distinct) + { + DuplicateOrderByVisitor::Data order_by_data{context, false}; + DuplicateOrderByVisitor(order_by_data).visit(query); + DuplicateDistinctVisitor::Data distinct_data{}; + DuplicateDistinctVisitor(distinct_data).visit(query); + } +} + /// Remove duplicate items from LIMIT BY. void optimizeLimitBy(const ASTSelectQuery * select_query) { @@ -831,6 +846,9 @@ SyntaxAnalyzerResultPtr SyntaxAnalyzer::analyzeSelect( /// Remove duplicate items from ORDER BY. optimizeOrderBy(select_query); + /// Remove duplicate ORDER BY and DISTINCT from subqueries. + optimizeDuplicateOrderByAndDistinct(query, settings.optimize_duplicate_order_by_and_distinct, context); + /// Remove duplicated elements from LIMIT BY clause. optimizeLimitBy(select_query); diff --git a/tests/performance/duplicate_order_by_and_distinct.xml b/tests/performance/duplicate_order_by_and_distinct.xml new file mode 100644 index 00000000000..0c05af3fc56 --- /dev/null +++ b/tests/performance/duplicate_order_by_and_distinct.xml @@ -0,0 +1,10 @@ + + + hits_10m_single + + + SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY EventDate, CounterID FORMAT Null + SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single) FORMAT Null + SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY toStartOfWeek(EventDate) FORMAT Null + + diff --git a/tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.reference b/tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.reference new file mode 100644 index 00000000000..208f3d1abe5 --- /dev/null +++ b/tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.reference @@ -0,0 +1,14 @@ +SELECT number\nFROM \n(\n SELECT number\n FROM \n (\n SELECT DISTINCT number\n FROM numbers(3)\n )\n)\nORDER BY number ASC +0 +1 +2 +SELECT DISTINCT number\nFROM \n(\n SELECT DISTINCT number\n FROM \n (\n SELECT DISTINCT number\n FROM numbers(3)\n ORDER BY number ASC\n )\n ORDER BY number ASC\n)\nORDER BY number ASC +0 +1 +2 +SELECT number\nFROM \n(\n SELECT DISTINCT number\n FROM \n (\n SELECT DISTINCT number % 2 AS number\n FROM numbers(3)\n )\n)\nORDER BY number ASC +0 +1 +SELECT DISTINCT number\nFROM \n(\n SELECT DISTINCT number\n FROM \n (\n SELECT DISTINCT number % 2 AS number\n FROM numbers(3)\n ORDER BY number ASC\n )\n ORDER BY number ASC\n)\nORDER BY number ASC +0 +1 diff --git a/tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.sql b/tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.sql new file mode 100644 index 00000000000..a660e5f0b77 --- /dev/null +++ b/tests/queries/0_stateless/01305_duplicate_order_by_and_distinct.sql @@ -0,0 +1,124 @@ +set enable_debug_queries = 1; +set optimize_duplicate_order_by_and_distinct = 1; + +analyze SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT * + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT * + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +set optimize_duplicate_order_by_and_distinct = 0; + +analyze SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT * + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT * + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +set optimize_duplicate_order_by_and_distinct = 1; + +analyze SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT number % 2 + AS number + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT number % 2 + AS number + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +set optimize_duplicate_order_by_and_distinct = 0; + +analyze SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT number % 2 + AS number + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; + +SELECT DISTINCT * +FROM +( + SELECT DISTINCT * + FROM + ( + SELECT DISTINCT number % 2 + AS number + FROM numbers(3) + ORDER BY number + ) + ORDER BY number +) +ORDER BY number; diff --git a/tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.reference b/tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.sql b/tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.sql new file mode 100644 index 00000000000..e1467bacf2f --- /dev/null +++ b/tests/queries/0_stateless/01306_disable_duplicate_order_by_and_distinct_optimize_for_distributed_table.sql @@ -0,0 +1,20 @@ +set optimize_duplicate_order_by_and_distinct = 1; +SELECT DISTINCT number +FROM +( + SELECT DISTINCT number + FROM remote('127.0.0.{1,2}', system.numbers) + LIMIT 1 + SETTINGS distributed_group_by_no_merge = 1 +); + +set optimize_duplicate_order_by_and_distinct = 0; +SELECT DISTINCT number +FROM +( + SELECT DISTINCT number + FROM remote('127.0.0.{1,2}', system.numbers) + LIMIT 1 + SETTINGS distributed_group_by_no_merge = 1 +); +