diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 2744811d336..24ec314d98e 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -449,44 +449,27 @@ struct NameMultiSearchFirstPositionCaseInsensitiveUTF8 using FunctionPosition = FunctionsStringSearch, NamePosition>; using FunctionPositionUTF8 = FunctionsStringSearch, NamePositionUTF8>; using FunctionPositionCaseInsensitive = FunctionsStringSearch, NamePositionCaseInsensitive>; -using FunctionPositionCaseInsensitiveUTF8 - = FunctionsStringSearch, NamePositionCaseInsensitiveUTF8>; +using FunctionPositionCaseInsensitiveUTF8 = FunctionsStringSearch, NamePositionCaseInsensitiveUTF8>; -using FunctionMultiSearchAllPositions - = FunctionsMultiStringPosition, NameMultiSearchAllPositions>; -using FunctionMultiSearchAllPositionsUTF8 - = FunctionsMultiStringPosition, NameMultiSearchAllPositionsUTF8>; -using FunctionMultiSearchAllPositionsCaseInsensitive - = FunctionsMultiStringPosition, NameMultiSearchAllPositionsCaseInsensitive>; -using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition< - MultiSearchAllPositionsImpl, - NameMultiSearchAllPositionsCaseInsensitiveUTF8>; +using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition, NameMultiSearchAllPositions>; +using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition, NameMultiSearchAllPositionsUTF8>; +using FunctionMultiSearchAllPositionsCaseInsensitive = FunctionsMultiStringPosition, NameMultiSearchAllPositionsCaseInsensitive>; +using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition, NameMultiSearchAllPositionsCaseInsensitiveUTF8>; using FunctionMultiSearch = FunctionsMultiStringSearch, NameMultiSearchAny>; using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch, NameMultiSearchAnyUTF8>; -using FunctionMultiSearchCaseInsensitive - = FunctionsMultiStringSearch, 
NameMultiSearchAnyCaseInsensitive>; -using FunctionMultiSearchCaseInsensitiveUTF8 - = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitiveUTF8>; +using FunctionMultiSearchCaseInsensitive = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitive>; +using FunctionMultiSearchCaseInsensitiveUTF8 = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitiveUTF8>; -using FunctionMultiSearchFirstIndex - = FunctionsMultiStringSearch, NameMultiSearchFirstIndex>; -using FunctionMultiSearchFirstIndexUTF8 - = FunctionsMultiStringSearch, NameMultiSearchFirstIndexUTF8>; -using FunctionMultiSearchFirstIndexCaseInsensitive - = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitive>; -using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8 - = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitiveUTF8>; +using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch, NameMultiSearchFirstIndex>; +using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstIndexUTF8>; +using FunctionMultiSearchFirstIndexCaseInsensitive = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitive>; +using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitiveUTF8>; -using FunctionMultiSearchFirstPosition - = FunctionsMultiStringSearch, NameMultiSearchFirstPosition>; -using FunctionMultiSearchFirstPositionUTF8 - = FunctionsMultiStringSearch, NameMultiSearchFirstPositionUTF8>; -using FunctionMultiSearchFirstPositionCaseInsensitive - = FunctionsMultiStringSearch, NameMultiSearchFirstPositionCaseInsensitive>; -using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch< - MultiSearchFirstPositionImpl, - NameMultiSearchFirstPositionCaseInsensitiveUTF8>; +using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch, NameMultiSearchFirstPosition>; +using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch, 
NameMultiSearchFirstPositionUTF8>; +using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch, NameMultiSearchFirstPositionCaseInsensitive>; +using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstPositionCaseInsensitiveUTF8>; void registerFunctionsStringSearch(FunctionFactory & factory) diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index 73d6d4b5e51..255301c00ed 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -339,11 +339,9 @@ struct NameNgramDistanceUTF8CaseInsensitive }; using FunctionNgramDistance = FunctionsStringSimilarity, NameNgramDistance>; -using FunctionNgramDistanceCaseInsensitive - = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; +using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8>; -using FunctionNgramDistanceCaseInsensitiveUTF8 - = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; +using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; void registerFunctionsStringSimilarity(FunctionFactory & factory) { diff --git a/dbms/src/Functions/Regexps.h b/dbms/src/Functions/Regexps.h index d815d475aa5..48d285e394a 100644 --- a/dbms/src/Functions/Regexps.h +++ b/dbms/src/Functions/Regexps.h @@ -135,9 +135,10 @@ namespace MultiRegexps for (const StringRef ref : str_patterns) { ptrns.push_back(ref.data); - flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH); + flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8); if constexpr (CompileForEditDistance) { + flags.back() &= ~HS_FLAG_UTF8; ext_exprs.emplace_back(); ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE; 
ext_exprs.back().edit_distance = edit_distance.value(); diff --git a/dbms/tests/queries/0_stateless/00926_multimatch.reference b/dbms/tests/queries/0_stateless/00926_multimatch.reference index 3f410b40875..8e3a8ec4820 100644 --- a/dbms/tests/queries/0_stateless/00926_multimatch.reference +++ b/dbms/tests/queries/0_stateless/00926_multimatch.reference @@ -599,3 +599,4 @@ 1 1 1 +1 diff --git a/dbms/tests/queries/0_stateless/00926_multimatch.sql b/dbms/tests/queries/0_stateless/00926_multimatch.sql index a9f09ee3014..797c59f52a5 100644 --- a/dbms/tests/queries/0_stateless/00926_multimatch.sql +++ b/dbms/tests/queries/0_stateless/00926_multimatch.sql @@ -79,3 +79,4 @@ select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;; SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']); +SELECT 1 = multiMatchAny('фабрикант', ['f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 'фа[беьв]+е?[рлко]']); diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index 28b30f153a1..3674869bd81 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -15,27 +15,27 @@ The same as `position`, but the position is returned in Unicode code points. Wor For a case-insensitive search, use the function `positionCaseInsensitiveUTF8`. -## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n]) +## multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) -The same as `position`, but returns `Array` of the `position`s for all `needle_i`. 
+The same as `position`, but returns `Array` of the `position`s for all needle<sub>i</sub>. For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`. -## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n]) +## multiSearchFirstPosition(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) The same as `position` but returns the leftmost offset of the string `haystack` that is matched to some of the needles. For a case-insensitive search or/and in UTF-8 format use functions `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. -## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n]) +## multiSearchFirstIndex(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) -Returns the index `i` (starting from 1) of the leftmost found `needle_i` in the string `haystack` and 0 otherwise. +Returns the index `i` (starting from 1) of the leftmost found needle<sub>i</sub> in the string `haystack` and 0 otherwise. For a case-insensitive search or/and in UTF-8 format use functions `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n]) +## multiSearchAny(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) -Returns 1, if at least one string `needle_i` matches the string `haystack` and 0 otherwise. +Returns 1, if at least one string needle<sub>i</sub> matches the string `haystack` and 0 otherwise. For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. @@ -52,24 +52,26 @@ Note that the backslash symbol (`\`) is used for escaping in the regular express The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes.
For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster. -## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) +## multiMatchAny(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. **Note: the length of any of the `haystack` string must be less than 2<sup>32</sup> bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API.** -## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) +## multiMatchAnyIndex(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) The same as `multiMatchAny`, but returns any index that matches the haystack. -## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) +## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). -## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) +## multiFuzzyMatchAnyIndex(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) The same as `multiFuzzyMatchAny`, but returns any index that matches the haystack within a constant edit distance.
+**Note: `multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction.** + **Note: to turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`.** ## extract(haystack, pattern) diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 8c099a9c369..f7981f32365 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -13,28 +13,28 @@ Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`. -## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n]) +## multiSearchAllPositions(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) Так же, как и `position`, только возвращает `Array` первых вхождений. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`. -## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n]) +## multiSearchFirstPosition(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`. -## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n]) -Возвращает индекс `i` (нумерация с единицы) первой найденной строки `needle_i` в строке `haystack` и 0 иначе. +## multiSearchFirstIndex(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) +Возвращает индекс `i` (нумерация с единицы) первой найденной строки needle<sub>i</sub> в строке `haystack` и 0 иначе.
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`. -## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n]) -Возвращает 1, если хотя бы одна подстрока `needle_i` нашлась в строке `haystack` и 0 иначе. +## multiSearchAny(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>]) +Возвращает 1, если хотя бы одна подстрока needle<sub>i</sub> нашлась в строке `haystack` и 0 иначе. Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. -**Примечание: во всех функциях `multiSearch*` количество needles должно быть меньше 2<sup>8</sup> из-за внутренностей реализации.** +**Примечание: во всех функциях `multiSearch*` количество needles должно быть меньше 2<sup>8</sup> из-за особенностей реализации.** ## match(haystack, pattern) Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение **re2**. Синтаксис регулярных выражений **re2** является более ограниченным по сравнению с регулярными выражениями **Perl** ([подробнее](https://github.com/google/re2/wiki/Syntax)). @@ -45,24 +45,26 @@ Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты. Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее. -## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) +## multiMatchAny(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется библиотека [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее.
**Примечание: длина любой строки из `haystack` должна быть меньше 2<sup>32</sup> байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API.** -## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) +## multiMatchAnyIndex(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения. -## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) +## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). -## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) +## multiFuzzyMatchAnyIndex(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>]) То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния. +**Примечание: `multiFuzzyMatch*` функции не поддерживают UTF-8 закодированные регулярные выражения, и такие выражения рассматриваются как байтовые из-за ограничения hyperscan.** + **Примечание: чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`.** ## extract(haystack, pattern)