mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
fix hyperscan to treat regular expressions as utf-8 expressions
This commit is contained in:
parent
eb0a8d1709
commit
ca4f098362
@ -449,44 +449,27 @@ struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
|
||||
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
|
||||
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
|
||||
using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveASCII>, NamePositionCaseInsensitive>;
|
||||
using FunctionPositionCaseInsensitiveUTF8
|
||||
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
|
||||
using FunctionPositionCaseInsensitiveUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearchAllPositions
|
||||
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
|
||||
using FunctionMultiSearchAllPositionsUTF8
|
||||
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
|
||||
using FunctionMultiSearchAllPositionsCaseInsensitive
|
||||
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
|
||||
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition<
|
||||
MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>,
|
||||
NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
|
||||
using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
|
||||
using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
|
||||
using FunctionMultiSearchAllPositionsCaseInsensitive = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
|
||||
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
|
||||
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
|
||||
using FunctionMultiSearchCaseInsensitive
|
||||
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
|
||||
using FunctionMultiSearchCaseInsensitiveUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
|
||||
using FunctionMultiSearchCaseInsensitive = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
|
||||
using FunctionMultiSearchCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearchFirstIndex
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
|
||||
using FunctionMultiSearchFirstIndexUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
|
||||
using FunctionMultiSearchFirstIndexCaseInsensitive
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
|
||||
using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
|
||||
using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
|
||||
using FunctionMultiSearchFirstIndexCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
|
||||
|
||||
using FunctionMultiSearchFirstPosition
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
|
||||
using FunctionMultiSearchFirstPositionUTF8
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitive
|
||||
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<
|
||||
MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>,
|
||||
NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
||||
using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
|
||||
using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
|
||||
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
|
||||
|
||||
|
||||
void registerFunctionsStringSearch(FunctionFactory & factory)
|
||||
|
@ -339,11 +339,9 @@ struct NameNgramDistanceUTF8CaseInsensitive
|
||||
};
|
||||
|
||||
using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false>, NameNgramDistance>;
|
||||
using FunctionNgramDistanceCaseInsensitive
|
||||
= FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true>, NameNgramDistanceCaseInsensitive>;
|
||||
using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true>, NameNgramDistanceCaseInsensitive>;
|
||||
using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false>, NameNgramDistanceUTF8>;
|
||||
using FunctionNgramDistanceCaseInsensitiveUTF8
|
||||
= FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
|
||||
using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
|
||||
|
||||
void registerFunctionsStringSimilarity(FunctionFactory & factory)
|
||||
{
|
||||
|
@ -135,9 +135,10 @@ namespace MultiRegexps
|
||||
for (const StringRef ref : str_patterns)
|
||||
{
|
||||
ptrns.push_back(ref.data);
|
||||
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH);
|
||||
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8);
|
||||
if constexpr (CompileForEditDistance)
|
||||
{
|
||||
flags.back() &= ~HS_FLAG_UTF8;
|
||||
ext_exprs.emplace_back();
|
||||
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
|
||||
ext_exprs.back().edit_distance = edit_distance.value();
|
||||
|
@ -599,3 +599,4 @@
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
|
@ -79,3 +79,4 @@ select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*',
|
||||
select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;;
|
||||
|
||||
SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']);
|
||||
SELECT 1 = multiMatchAny('фабрикант', ['f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 'фа[беьв]+е?[рлко]']);
|
||||
|
@ -15,27 +15,27 @@ The same as `position`, but the position is returned in Unicode code points. Wor
|
||||
|
||||
For a case-insensitive search, use the function `positionCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
## multiSearchAllPositions(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
|
||||
The same as `position`, but returns `Array` of the `position`s for all `needle_i`.
|
||||
The same as `position`, but returns an `Array` of positions, one for each needle<sub>i</sub>.
|
||||
|
||||
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchAllPositionsCaseInsensitive`, `multiSearchAllPositionsUTF8`, `multiSearchAllPositionsCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
## multiSearchFirstPosition(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
|
||||
The same as `position`, but returns the leftmost offset in the string `haystack` at which any of the needles matches.
|
||||
|
||||
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchFirstPositionCaseInsensitive`, `multiSearchFirstPositionUTF8`, `multiSearchFirstPositionCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
## multiSearchFirstIndex(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
|
||||
Returns the index `i` (starting from 1) of the leftmost found `needle_i` in the string `haystack` and 0 otherwise.
|
||||
Returns the index `i` (starting from 1) of the leftmost found needle<sub>i</sub> in the string `haystack` and 0 otherwise.
|
||||
|
||||
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchFirstIndexCaseInsensitive`, `multiSearchFirstIndexUTF8`, `multiSearchFirstIndexCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
## multiSearchAny(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
|
||||
Returns 1, if at least one string `needle_i` matches the string `haystack` and 0 otherwise.
|
||||
Returns 1, if at least one string needle<sub>i</sub> matches the string `haystack` and 0 otherwise.
|
||||
|
||||
For a case-insensitive search and/or a search in UTF-8 format, use the functions `multiSearchAnyCaseInsensitive`, `multiSearchAnyUTF8`, `multiSearchAnyCaseInsensitiveUTF8`.
|
||||
|
||||
@ -52,24 +52,26 @@ Note that the backslash symbol (`\`) is used for escaping in the regular express
|
||||
The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes.
|
||||
For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster.
|
||||
|
||||
## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiMatchAny(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
The same as `match`, but returns 0 if none of the regular expressions match and 1 if any of the patterns matches. It uses the [hyperscan](https://github.com/intel/hyperscan) library. To search for substrings in a string, it is better to use `multiSearchAny`, since it works much faster.
|
||||
|
||||
**Note: the length of any `haystack` string must be less than 2<sup>32</sup> bytes; otherwise an exception is thrown. This restriction exists because of the hyperscan API.**
|
||||
|
||||
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiMatchAnyIndex(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
The same as `multiMatchAny`, but returns any index that matches the haystack.
|
||||
|
||||
## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
||||
|
||||
## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAnyIndex(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
The same as `multiFuzzyMatchAny`, but returns any index that matches the haystack within a constant edit distance.
|
||||
|
||||
**Note: `multiFuzzyMatch*` functions do not support UTF-8 regular expressions; such expressions are treated as sequences of bytes because of a hyperscan restriction.**
|
||||
|
||||
**Note: to turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`.**
|
||||
|
||||
## extract(haystack, pattern)
|
||||
|
@ -13,28 +13,28 @@
|
||||
|
||||
Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchAllPositions(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
## multiSearchAllPositions(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
Так же, как и `position`, только возвращает `Array` первых вхождений.
|
||||
|
||||
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAllPositionsCaseInsensitive, multiSearchAllPositionsUTF8, multiSearchAllPositionsCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchFirstPosition(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
## multiSearchFirstPosition(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
|
||||
Так же, как и `position`, только возвращает оффсет первого вхождения любого из needles.
|
||||
|
||||
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstPositionCaseInsensitive, multiSearchFirstPositionUTF8, multiSearchFirstPositionCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchFirstIndex(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
Возвращает индекс `i` (нумерация с единицы) первой найденной строки `needle_i` в строке `haystack` и 0 иначе.
|
||||
## multiSearchFirstIndex(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
Возвращает индекс `i` (нумерация с единицы) первой найденной строки needle<sub>i</sub> в строке `haystack` и 0 иначе.
|
||||
|
||||
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchFirstIndexCaseInsensitive, multiSearchFirstIndexUTF8, multiSearchFirstIndexCaseInsensitiveUTF8`.
|
||||
|
||||
## multiSearchAny(haystack, [needle_1, needle_2, ..., needle_n])
|
||||
Возвращает 1, если хотя бы одна подстрока `needle_i` нашлась в строке `haystack` и 0 иначе.
|
||||
## multiSearchAny(haystack, [needle<sub>1</sub>, needle<sub>2</sub>, ..., needle<sub>n</sub>])
|
||||
Возвращает 1, если хотя бы одна подстрока needle<sub>i</sub> нашлась в строке `haystack` и 0 иначе.
|
||||
|
||||
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
|
||||
|
||||
**Примечание: во всех функциях `multiSearch*` количество needles должно быть меньше 2<sup>8</sup> из-за внутренностей реализации.**
|
||||
**Примечание: во всех функциях `multiSearch*` количество needles должно быть меньше 2<sup>8</sup> из-за особенностей реализации.**
|
||||
|
||||
## match(haystack, pattern)
|
||||
Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение **re2**. Синтаксис регулярных выражений **re2** является более ограниченным по сравнению с регулярными выражениями **Perl** ([подробнее](https://github.com/google/re2/wiki/Syntax)).
|
||||
@ -45,24 +45,26 @@
|
||||
Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты.
|
||||
Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее.
|
||||
|
||||
## multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiMatchAny(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется библиотека [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее.
|
||||
|
||||
**Примечание: длина любой строки из `haystack` должна быть меньше 2<sup>32</sup> байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API.**
|
||||
|
||||
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiMatchAnyIndex(haystack, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
|
||||
|
||||
## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAny(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
||||
|
||||
## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAnyIndex(haystack, distance, [pattern<sub>1</sub>, pattern<sub>2</sub>, ..., pattern<sub>n</sub>])
|
||||
|
||||
То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
|
||||
|
||||
**Примечание: `multiFuzzyMatch*` функции не поддерживают UTF-8 закодированные регулярные выражения, и такие выражения рассматриваются как байтовые из-за ограничения hyperscan.**
|
||||
|
||||
**Примечание: чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`.**
|
||||
|
||||
## extract(haystack, pattern)
|
||||
|
Loading…
Reference in New Issue
Block a user