Better renamings of fuzzy string search

This commit is contained in:
Danila Kutenin 2019-03-29 04:27:17 +03:00
parent 965b478a8e
commit b822923f5a
6 changed files with 43 additions and 44 deletions

View File

@ -863,14 +863,13 @@ struct NameMultiMatchAnyIndex
{ {
static constexpr auto name = "multiMatchAnyIndex"; static constexpr auto name = "multiMatchAnyIndex";
}; };
/// FunctionsStringSimilarity or Regex? struct NameMultiFuzzyMatchAny
struct NameMultiMatchAnyEditDistance
{ {
static constexpr auto name = "multiMatchAnyEditDistance"; static constexpr auto name = "multiFuzzyMatchAny";
}; };
struct NameMultiMatchAnyIndexEditDistance struct NameMultiFuzzyMatchAnyIndex
{ {
static constexpr auto name = "multiMatchAnyIndexEditDistance"; static constexpr auto name = "multiFuzzyMatchAnyIndex";
}; };
struct NameExtract struct NameExtract
{ {
@ -906,14 +905,14 @@ using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<
NameMultiMatchAnyIndex, NameMultiMatchAnyIndex,
std::numeric_limits<UInt32>::max()>; std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyEditDistance = FunctionsMultiStringSearchEditDistance< using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch<
MultiMatchAnyImpl<UInt8, true, false, true>, MultiMatchAnyImpl<UInt8, true, false, true>,
NameMultiMatchAnyEditDistance, NameMultiFuzzyMatchAny,
std::numeric_limits<UInt32>::max()>; std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndexEditDistance = FunctionsMultiStringSearchEditDistance< using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch<
MultiMatchAnyImpl<UInt64, false, true, true>, MultiMatchAnyImpl<UInt64, false, true, true>,
NameMultiMatchAnyIndexEditDistance, NameMultiFuzzyMatchAnyIndex,
std::numeric_limits<UInt32>::max()>; std::numeric_limits<UInt32>::max()>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>; using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
@ -938,8 +937,8 @@ void registerFunctionsStringRegex(FunctionFactory & factory)
factory.registerFunction<FunctionMultiMatchAny>(); factory.registerFunction<FunctionMultiMatchAny>();
factory.registerFunction<FunctionMultiMatchAnyIndex>(); factory.registerFunction<FunctionMultiMatchAnyIndex>();
factory.registerFunction<FunctionMultiMatchAnyEditDistance>(); factory.registerFunction<FunctionMultiFuzzyMatchAny>();
factory.registerFunction<FunctionMultiMatchAnyIndexEditDistance>(); factory.registerFunction<FunctionMultiFuzzyMatchAnyIndex>();
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive); factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
} }
} }

View File

@ -29,7 +29,7 @@ namespace ErrorCodes
template <typename Impl, typename Name, size_t LimitArgs> template <typename Impl, typename Name, size_t LimitArgs>
class FunctionsMultiStringSearchEditDistance : public IFunction class FunctionsMultiStringFuzzySearch : public IFunction
{ {
static_assert(LimitArgs > 0); static_assert(LimitArgs > 0);
@ -41,7 +41,7 @@ public:
throw Exception( throw Exception(
"Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
return std::make_shared<FunctionsMultiStringSearchEditDistance>(); return std::make_shared<FunctionsMultiStringFuzzySearch>();
} }
String getName() const override { return name; } String getName() const override { return name; }

View File

@ -53,10 +53,10 @@
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk\\.74\\.ru', 'doctor\\.74\\.ru', 'transport\\.74\\.ru', 'm\\.74\\.ru', '//74\\.ru/', 'chel\\.74\\.ru', 'afisha\\.74\\.ru', 'diplom\\.74\\.ru', 'chelfin\\.ru', '//chel\\.ru', 'chelyabinsk\\.ru', 'cheldoctor\\.ru', '//mychel\\.ru', 'cheldiplom\\.ru', '74\\.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query> <query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk\\.74\\.ru', 'doctor\\.74\\.ru', 'transport\\.74\\.ru', 'm\\.74\\.ru', '//74\\.ru/', 'chel\\.74\\.ru', 'afisha\\.74\\.ru', 'diplom\\.74\\.ru', 'chelfin\\.ru', '//chel\\.ru', 'chelyabinsk\\.ru', 'cheldoctor\\.ru', '//mychel\\.ru', 'cheldiplom\\.ru', '74\\.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiSearchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query> <query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiSearchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
<query><![CDATA[SELECT DISTINCT Title, multiMatchAnyEditDistance(Title, 2, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query> <query><![CDATA[SELECT DISTINCT Title, multiFuzzyMatchAny(Title, 2, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query>
<query><![CDATA[SELECT DISTINCT Title, multiMatchAnyEditDistance(Title, 5, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query> <query><![CDATA[SELECT DISTINCT Title, multiFuzzyMatchAny(Title, 5, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query>
<query><![CDATA[SELECT sum(multiMatchAnyEditDistance(Title, 3, ['hello$', 'world$', '^hello'])) FROM hits_100m_single]]></query> <query><![CDATA[SELECT sum(multiFuzzyMatchAny(Title, 3, ['hello$', 'world$', '^hello'])) FROM hits_100m_single]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAnyEditDistance(URL, 2, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query> <query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiFuzzyMatchAny(URL, 2, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query>

View File

@ -1,26 +1,26 @@
SET send_logs_level = 'none'; SET send_logs_level = 'none';
select 0 = multiMatchAnyEditDistance('abc', 0, ['a1c']) from system.numbers limit 5; select 0 = multiFuzzyMatchAny('abc', 0, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('abc', 1, ['a1c']) from system.numbers limit 5; select 1 = multiFuzzyMatchAny('abc', 1, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('abc', 2, ['a1c']) from system.numbers limit 5; select 1 = multiFuzzyMatchAny('abc', 2, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('abc', 3, ['a1c']) from system.numbers limit 5; -- { serverError 49 } select 1 = multiFuzzyMatchAny('abc', 3, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
select 1 = multiMatchAnyEditDistance('abc', 4, ['a1c']) from system.numbers limit 5; -- { serverError 49 } select 1 = multiFuzzyMatchAny('abc', 4, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
select 1 = multiMatchAnyEditDistance('leftabcright', 1, ['a1c']) from system.numbers limit 5; select 1 = multiFuzzyMatchAny('leftabcright', 1, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('hello some world', 0, ['^hello.*world$']); select 1 = multiFuzzyMatchAny('hello some world', 0, ['^hello.*world$']);
select 1 = multiMatchAnyEditDistance('hallo some world', 1, ['^hello.*world$']); select 1 = multiFuzzyMatchAny('hallo some world', 1, ['^hello.*world$']);
select 0 = multiMatchAnyEditDistance('halo some wrld', 2, ['^hello.*world$']); select 0 = multiFuzzyMatchAny('halo some wrld', 2, ['^hello.*world$']);
select 1 = multiMatchAnyEditDistance('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']); select 1 = multiFuzzyMatchAny('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
select 1 = multiMatchAnyEditDistance('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']); select 1 = multiFuzzyMatchAny('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
select 1 = multiMatchAnyEditDistance('halo some wrld', 3, ['^hello.*world$']); select 1 = multiFuzzyMatchAny('halo some wrld', 3, ['^hello.*world$']);
select 1 = multiMatchAnyEditDistance('hello some world', 10, ['^hello.*world$']); -- { serverError 49 } select 1 = multiFuzzyMatchAny('hello some world', 10, ['^hello.*world$']); -- { serverError 49 }
select 1 = multiMatchAnyEditDistance('hello some world', -1, ['^hello.*world$']); -- { serverError 43 } select 1 = multiFuzzyMatchAny('hello some world', -1, ['^hello.*world$']); -- { serverError 43 }
select 1 = multiMatchAnyEditDistance('hello some world', 10000000000, ['^hello.*world$']); -- { serverError 44 } select 1 = multiFuzzyMatchAny('hello some world', 10000000000, ['^hello.*world$']); -- { serverError 44 }
select 1 = multiMatchAnyEditDistance('http://hyperscan_is_nice.ru/st', 2, ['http://hyperscan_is_nice.ru/(st\\d\\d$|st\\d\\d\\.|st1[0-4]\\d|st150|st\\d$|gl|rz|ch)']); select 1 = multiFuzzyMatchAny('http://hyperscan_is_nice.ru/st', 2, ['http://hyperscan_is_nice.ru/(st\\d\\d$|st\\d\\d\\.|st1[0-4]\\d|st150|st\\d$|gl|rz|ch)']);
select 0 = multiMatchAnyEditDistance('string', 0, ['zorro$', '^tring', 'in$', 'how.*', 'it{2}', 'works']); select 0 = multiFuzzyMatchAny('string', 0, ['zorro$', '^tring', 'in$', 'how.*', 'it{2}', 'works']);
select 1 = multiMatchAnyEditDistance('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']); select 1 = multiFuzzyMatchAny('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
select 2 = multiMatchAnyIndexEditDistance('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']); select 2 = multiFuzzyMatchAnyIndex('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
select 2 = multiMatchAnyIndexEditDistance('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']); select 2 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
select 1 = multiMatchAnyIndexEditDistance('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']); select 1 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);

View File

@ -62,13 +62,13 @@ The same as `match`, but returns 0 if none of the regular expressions are matche
The same as `multiMatchAny`, but returns the index of any pattern that matches the haystack. The same as `multiMatchAny`, but returns the index of any pattern that matches the haystack.
## multiMatchAnyEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) ## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in experimental mode and can be extremely slow. For more information, see the [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in experimental mode and can be extremely slow. For more information, see the [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
## multiMatchAnyIndexEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) ## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAnyEditDistance`, but returns the index of any pattern that matches the haystack within a constant edit distance. The same as `multiFuzzyMatchAny`, but returns the index of any pattern that matches the haystack within a constant edit distance.
**Note: to turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`.** **Note: to turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`.**

View File

@ -55,13 +55,13 @@
То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения. То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
## multiMatchAnyEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) ## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `multiMatchAny`, но возвращает 1, если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). То же, что и `multiMatchAny`, но возвращает 1, если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
## multiMatchAnyIndexEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n]) ## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `multiMatchAnyEditDistance`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния. То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
**Примечание: чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`.** **Примечание: чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`.**