From 7b91e62ae01d7461831d758785a069ae3d947df8 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 27 Feb 2022 16:43:56 +0000 Subject: [PATCH 1/5] Transform OR LIKE chain to multiMatchAny Closes #5895 --- src/Core/Settings.h | 1 + .../ConvertFunctionOrLikeVisitor.cpp | 67 +++++++++++++++++++ .../ConvertFunctionOrLikeVisitor.h | 22 ++++++ src/Interpreters/TreeOptimizer.cpp | 15 +++++ .../02226_or_like_combine.reference | 34 ++++++++++ .../0_stateless/02226_or_like_combine.sql | 15 +++++ 6 files changed, 154 insertions(+) create mode 100644 src/Interpreters/ConvertFunctionOrLikeVisitor.cpp create mode 100644 src/Interpreters/ConvertFunctionOrLikeVisitor.h create mode 100644 tests/queries/0_stateless/02226_or_like_combine.reference create mode 100644 tests/queries/0_stateless/02226_or_like_combine.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c4b4ab77867..6a8ae829b55 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -436,6 +436,7 @@ class IColumn; M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ M(Bool, convert_query_to_cnf, false, "Convert SELECT query to CNF", 0) \ + M(Bool, optimize_or_like_chain, false, "Optimize multiple OR LIKE into multiMatchAny", 0) \ M(Bool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \ M(Bool, optimize_duplicate_order_by_and_distinct, true, "Remove duplicate ORDER BY and DISTINCT if it's possible", 0) \ M(Bool, optimize_redundant_functions_in_order_by, true, "Remove functions from ORDER BY if its argument is also in ORDER BY", 0) \ diff --git a/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp b/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp new file mode 100644 index 00000000000..23d11e25437 --- /dev/null +++ 
b/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +void ConvertFunctionOrLikeData::visit(ASTFunction & function, ASTPtr &) +{ + if (function.name != "or") + return; + std::unordered_map> identifier_to_literals; + for (auto & child : function.children) + { + if (auto expr_list_fn = child->as()) + { + ASTs unique_elems; + for (auto child_expr_fn : expr_list_fn->children) + { + unique_elems.push_back(child_expr_fn); + if (auto child_fn = child_expr_fn->as()) + { + const bool is_like = child_fn->name == "like"; + const bool is_ilike = child_fn->name == "ilike"; + // Not {i}like -> bail out. + if (!is_like && !is_ilike) + continue; + const auto & arguments = child_fn->arguments->children; + // They should have 2 arguments. + if (arguments.size() != 2) + continue; + // Second one is string literal. + auto identifier = arguments[0]; + auto literal = arguments[1]->as(); + if (!identifier || !literal || literal->value.getType() != Field::Types::String) + continue; + String regexp = likePatternToRegexp(literal->value.get()); + // Case insensitive. Works with UTF-8 as well. + if (is_ilike) + regexp = "(?i)" + regexp; + unique_elems.pop_back(); + auto it = identifier_to_literals.find(identifier); + if (it == identifier_to_literals.end()) + { + it = identifier_to_literals.insert({identifier, std::make_shared(Field{Array{}})}).first; + auto match = makeASTFunction("multiMatchAny"); + match->arguments->children.push_back(std::move(arguments[0])); + match->arguments->children.push_back(it->second); + unique_elems.push_back(std::move(match)); + } + it->second->value.get().push_back(regexp); + } + } + // OR must have at least two arguments. 
+ if (unique_elems.size() == 1) + unique_elems.push_back(std::make_shared(Field(false))); + expr_list_fn->children = std::move(unique_elems); + } + } +} + +} diff --git a/src/Interpreters/ConvertFunctionOrLikeVisitor.h b/src/Interpreters/ConvertFunctionOrLikeVisitor.h new file mode 100644 index 00000000000..ba4a0073448 --- /dev/null +++ b/src/Interpreters/ConvertFunctionOrLikeVisitor.h @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class ASTFunction; + +/// Replaces all the "or"'s with {i}like to multiMatchAny +class ConvertFunctionOrLikeData +{ +public: + using TypeToVisit = ASTFunction; + + void visit(ASTFunction & function, ASTPtr & ast); +}; + +using ConvertFunctionOrLikeVisitor = InDepthNodeVisitor, true>; + +} diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 64b25ca9777..c28d07bee5b 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -720,6 +721,12 @@ void optimizeFuseQuantileFunctions(ASTPtr & query) } } +void optimizeOrLikeChain(ASTPtr & query) +{ + ConvertFunctionOrLikeVisitor::Data data = {}; + ConvertFunctionOrLikeVisitor(data).visit(query); +} + } void TreeOptimizer::optimizeIf(ASTPtr & query, Aliases & aliases, bool if_chain_to_multiif) @@ -828,6 +835,14 @@ void TreeOptimizer::apply(ASTPtr & query, TreeRewriterResult & result, if (settings.optimize_syntax_fuse_functions) optimizeFuseQuantileFunctions(query); + + if (settings.optimize_or_like_chain + && settings.allow_hyperscan + && settings.max_hyperscan_regexp_length == 0 + && settings.max_hyperscan_regexp_total_length == 0) + { + optimizeOrLikeChain(query); + } } } diff --git a/tests/queries/0_stateless/02226_or_like_combine.reference b/tests/queries/0_stateless/02226_or_like_combine.reference new file mode 100644 index 00000000000..50555468435 --- /dev/null +++ 
b/tests/queries/0_stateless/02226_or_like_combine.reference @@ -0,0 +1,34 @@ +SELECT materialize(\'Привет, World\') AS s +WHERE (s LIKE \'hell%\') OR (s ILIKE \'%привет%\') OR (s ILIKE \'world%\') +SELECT materialize(\'Привет, World\') AS s +WHERE multiMatchAny(s, [\'^hell\', \'(?i)привет\', \'(?i)^world\']) OR false +SETTINGS optimize_or_like_chain = 1 +SELECT + materialize(\'Привет, World\') AS s1, + materialize(\'Привет, World\') AS s2 +WHERE multiMatchAny(s1, [\'^hell\', \'(?i)^world\']) OR multiMatchAny(s2, [\'(?i)привет\']) +SETTINGS optimize_or_like_chain = 1 +SELECT + materialize(\'Привет, World\') AS s1, + materialize(\'Привет, World\') AS s2 +WHERE (s1 LIKE \'hell%\') OR (s2 ILIKE \'%привет%\') OR (s1 ILIKE \'world%\') +SETTINGS optimize_or_like_chain = 1 +SELECT + materialize(\'Привет, World\') AS s1, + materialize(\'Привет, World\') AS s2 +WHERE (s1 LIKE \'hell%\') OR (s2 ILIKE \'%привет%\') OR (s1 ILIKE \'world%\') +SETTINGS optimize_or_like_chain = 1 +SELECT + materialize(\'Привет, World\') AS s1, + materialize(\'Привет, World\') AS s2 +WHERE (s1 LIKE \'hell%\') OR (s2 ILIKE \'%привет%\') OR (s1 ILIKE \'world%\') +SETTINGS optimize_or_like_chain = 1 +SELECT + materialize(\'Привет, World\') AS s1, + materialize(\'Привет, World\') AS s2 +WHERE multiMatchAny(s1, [\'^hell\', \'(?i)^world\']) OR multiMatchAny(s2, [\'(?i)привет\']) OR (s1 = \'Привет\') +SETTINGS optimize_or_like_chain = 1 +Привет, optimized World +Привет, World +Привет, optimized World +Привет, World diff --git a/tests/queries/0_stateless/02226_or_like_combine.sql b/tests/queries/0_stateless/02226_or_like_combine.sql new file mode 100644 index 00000000000..79a70b31398 --- /dev/null +++ b/tests/queries/0_stateless/02226_or_like_combine.sql @@ -0,0 +1,15 @@ +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s ILIKE 'world%'); +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s 
ILIKE 'world%') SETTINGS optimize_or_like_chain=1; + + +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1 SETTINGS allow_hyperscan=0; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1 SETTINGS max_hyperscan_regexp_length=10; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1 SETTINGS max_hyperscan_regexp_total_length=10; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') OR s1 == 'Привет' SETTINGS optimize_or_like_chain=1; + + +SELECT materialize('Привет, optimized World') AS s WHERE (s LIKE 'hell%') OR (s LIKE '%привет%') OR (s ILIKE '%world') SETTINGS optimize_or_like_chain = 1; +SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s LIKE '%привет%') OR (s ILIKE '%world'); +SELECT materialize('Привет, optimized World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s LIKE 'world%') SETTINGS optimize_or_like_chain = 1; +SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s LIKE 'world%'); From d6a7841b4eef47c093a46cba8a30f48e18153ddf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 7 May 2022 14:09:32 +0300 Subject: [PATCH 2/5] Update Settings.h --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/Core/Settings.h b/src/Core/Settings.h index 56ebca87212..aa35cdf7a94 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -444,7 +444,7 @@ class IColumn; M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ M(Bool, convert_query_to_cnf, false, "Convert SELECT query to CNF", 0) \ - M(Bool, optimize_or_like_chain, false, "Optimize multiple OR LIKE into multiMatchAny", 0) \ + M(Bool, optimize_or_like_chain, true, "Optimize multiple OR LIKE into multiMatchAny", 0) \ M(Bool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \ M(Bool, optimize_duplicate_order_by_and_distinct, true, "Remove duplicate ORDER BY and DISTINCT if it's possible", 0) \ M(Bool, optimize_redundant_functions_in_order_by, true, "Remove functions from ORDER BY if its argument is also in ORDER BY", 0) \ From 2b88c0898afa31f82dc83226fc65f5afe5227a7c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 7 May 2022 14:11:06 +0300 Subject: [PATCH 3/5] Update ConvertFunctionOrLikeVisitor.cpp --- .../ConvertFunctionOrLikeVisitor.cpp | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp b/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp index 23d11e25437..523c7c109dd 100644 --- a/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp +++ b/src/Interpreters/ConvertFunctionOrLikeVisitor.cpp @@ -14,6 +14,7 @@ void ConvertFunctionOrLikeData::visit(ASTFunction & function, ASTPtr &) { if (function.name != "or") return; + std::unordered_map> identifier_to_literals; for (auto & child : function.children) { @@ -23,26 +24,32 @@ void ConvertFunctionOrLikeData::visit(ASTFunction & function, ASTPtr &) for (auto child_expr_fn : 
expr_list_fn->children) { unique_elems.push_back(child_expr_fn); - if (auto child_fn = child_expr_fn->as()) + if (const auto * child_fn = child_expr_fn->as()) { const bool is_like = child_fn->name == "like"; const bool is_ilike = child_fn->name == "ilike"; - // Not {i}like -> bail out. + + /// Not {i}like -> bail out. if (!is_like && !is_ilike) continue; + const auto & arguments = child_fn->arguments->children; - // They should have 2 arguments. + + /// They should have 2 arguments. if (arguments.size() != 2) continue; - // Second one is string literal. + + /// Second one is string literal. auto identifier = arguments[0]; auto literal = arguments[1]->as(); if (!identifier || !literal || literal->value.getType() != Field::Types::String) continue; + String regexp = likePatternToRegexp(literal->value.get()); - // Case insensitive. Works with UTF-8 as well. + /// Case insensitive. Works with UTF-8 as well. if (is_ilike) regexp = "(?i)" + regexp; + unique_elems.pop_back(); auto it = identifier_to_literals.find(identifier); if (it == identifier_to_literals.end()) @@ -56,9 +63,11 @@ void ConvertFunctionOrLikeData::visit(ASTFunction & function, ASTPtr &) it->second->value.get().push_back(regexp); } } - // OR must have at least two arguments. + + /// OR must have at least two arguments. 
if (unique_elems.size() == 1) unique_elems.push_back(std::make_shared(Field(false))); + expr_list_fn->children = std::move(unique_elems); } } From bd658cbaec8c5e150d0168c420510e48a5a60687 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 7 May 2022 14:12:48 +0300 Subject: [PATCH 4/5] Update 02226_or_like_combine.sql --- .../0_stateless/02226_or_like_combine.sql | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/02226_or_like_combine.sql b/tests/queries/0_stateless/02226_or_like_combine.sql index 79a70b31398..9a6fded98e0 100644 --- a/tests/queries/0_stateless/02226_or_like_combine.sql +++ b/tests/queries/0_stateless/02226_or_like_combine.sql @@ -1,15 +1,15 @@ -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s ILIKE 'world%'); -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s ILIKE 'world%') SETTINGS optimize_or_like_chain=1; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s ILIKE 'world%') SETTINGS optimize_or_like_chain = 0; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s ILIKE 'world%') SETTINGS optimize_or_like_chain = 1; -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1; -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1 SETTINGS allow_hyperscan=0; -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1 SETTINGS 
max_hyperscan_regexp_length=10; -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain=1 SETTINGS max_hyperscan_regexp_total_length=10; -EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') OR s1 == 'Привет' SETTINGS optimize_or_like_chain=1; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain = 1; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain = 1 SETTINGS allow_hyperscan = 0; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain = 1 SETTINGS max_hyperscan_regexp_length = 10; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') SETTINGS optimize_or_like_chain = 1 SETTINGS max_hyperscan_regexp_total_length = 10; +EXPLAIN SYNTAX SELECT materialize('Привет, World') AS s1, materialize('Привет, World') AS s2 WHERE (s1 LIKE 'hell%') OR (s2 ILIKE '%привет%') OR (s1 ILIKE 'world%') OR s1 == 'Привет' SETTINGS optimize_or_like_chain = 1; SELECT materialize('Привет, optimized World') AS s WHERE (s LIKE 'hell%') OR (s LIKE '%привет%') OR (s ILIKE '%world') SETTINGS optimize_or_like_chain = 1; -SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s LIKE '%привет%') OR (s ILIKE '%world'); +SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s LIKE 
'%привет%') OR (s ILIKE '%world') SETTINGS optimize_or_like_chain = 0; SELECT materialize('Привет, optimized World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s LIKE 'world%') SETTINGS optimize_or_like_chain = 1; -SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s LIKE 'world%'); +SELECT materialize('Привет, World') AS s WHERE (s LIKE 'hell%') OR (s ILIKE '%привет%') OR (s LIKE 'world%') SETTINGS optimize_or_like_chain = 0; From 00680a7b65619e17bd6fd42a4db70023476aaeba Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 8 May 2022 16:31:12 +0300 Subject: [PATCH 5/5] Update Settings.h --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index aa35cdf7a94..808350c5e86 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -444,7 +444,7 @@ class IColumn; M(Bool, optimize_normalize_count_variants, true, "Rewrite aggregate functions that semantically equals to count() as count().", 0) \ M(Bool, optimize_injective_functions_inside_uniq, true, "Delete injective functions of one argument inside uniq*() functions.", 0) \ M(Bool, convert_query_to_cnf, false, "Convert SELECT query to CNF", 0) \ - M(Bool, optimize_or_like_chain, true, "Optimize multiple OR LIKE into multiMatchAny", 0) \ + M(Bool, optimize_or_like_chain, false, "Optimize multiple OR LIKE into multiMatchAny. This optimization should not be enabled by default, because it defies index analysis in some cases.", 0) \ M(Bool, optimize_arithmetic_operations_in_aggregate_functions, true, "Move arithmetic operations out of aggregation functions", 0) \ M(Bool, optimize_duplicate_order_by_and_distinct, true, "Remove duplicate ORDER BY and DISTINCT if it's possible", 0) \ M(Bool, optimize_redundant_functions_in_order_by, true, "Remove functions from ORDER BY if its argument is also in ORDER BY", 0) \