mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Better renamings of fuzzy string search
This commit is contained in:
parent
965b478a8e
commit
b822923f5a
@ -863,14 +863,13 @@ struct NameMultiMatchAnyIndex
|
||||
{
|
||||
static constexpr auto name = "multiMatchAnyIndex";
|
||||
};
|
||||
/// FunctionsStringSimilarity or Regex?
|
||||
struct NameMultiMatchAnyEditDistance
|
||||
struct NameMultiFuzzyMatchAny
|
||||
{
|
||||
static constexpr auto name = "multiMatchAnyEditDistance";
|
||||
static constexpr auto name = "multiFuzzyMatchAny";
|
||||
};
|
||||
struct NameMultiMatchAnyIndexEditDistance
|
||||
struct NameMultiFuzzyMatchAnyIndex
|
||||
{
|
||||
static constexpr auto name = "multiMatchAnyIndexEditDistance";
|
||||
static constexpr auto name = "multiFuzzyMatchAnyIndex";
|
||||
};
|
||||
struct NameExtract
|
||||
{
|
||||
@ -906,14 +905,14 @@ using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<
|
||||
NameMultiMatchAnyIndex,
|
||||
std::numeric_limits<UInt32>::max()>;
|
||||
|
||||
using FunctionMultiMatchAnyEditDistance = FunctionsMultiStringSearchEditDistance<
|
||||
using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch<
|
||||
MultiMatchAnyImpl<UInt8, true, false, true>,
|
||||
NameMultiMatchAnyEditDistance,
|
||||
NameMultiFuzzyMatchAny,
|
||||
std::numeric_limits<UInt32>::max()>;
|
||||
|
||||
using FunctionMultiMatchAnyIndexEditDistance = FunctionsMultiStringSearchEditDistance<
|
||||
using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch<
|
||||
MultiMatchAnyImpl<UInt64, false, true, true>,
|
||||
NameMultiMatchAnyIndexEditDistance,
|
||||
NameMultiFuzzyMatchAnyIndex,
|
||||
std::numeric_limits<UInt32>::max()>;
|
||||
|
||||
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
|
||||
@ -938,8 +937,8 @@ void registerFunctionsStringRegex(FunctionFactory & factory)
|
||||
|
||||
factory.registerFunction<FunctionMultiMatchAny>();
|
||||
factory.registerFunction<FunctionMultiMatchAnyIndex>();
|
||||
factory.registerFunction<FunctionMultiMatchAnyEditDistance>();
|
||||
factory.registerFunction<FunctionMultiMatchAnyIndexEditDistance>();
|
||||
factory.registerFunction<FunctionMultiFuzzyMatchAny>();
|
||||
factory.registerFunction<FunctionMultiFuzzyMatchAnyIndex>();
|
||||
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
|
||||
}
|
||||
}
|
||||
|
@ -29,7 +29,7 @@ namespace ErrorCodes
|
||||
|
||||
|
||||
template <typename Impl, typename Name, size_t LimitArgs>
|
||||
class FunctionsMultiStringSearchEditDistance : public IFunction
|
||||
class FunctionsMultiStringFuzzySearch : public IFunction
|
||||
{
|
||||
static_assert(LimitArgs > 0);
|
||||
|
||||
@ -41,7 +41,7 @@ public:
|
||||
throw Exception(
|
||||
"Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
|
||||
|
||||
return std::make_shared<FunctionsMultiStringSearchEditDistance>();
|
||||
return std::make_shared<FunctionsMultiStringFuzzySearch>();
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
@ -53,10 +53,10 @@
|
||||
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAny(URL, ['chelyabinsk\\.74\\.ru', 'doctor\\.74\\.ru', 'transport\\.74\\.ru', 'm\\.74\\.ru', '//74\\.ru/', 'chel\\.74\\.ru', 'afisha\\.74\\.ru', 'diplom\\.74\\.ru', 'chelfin\\.ru', '//chel\\.ru', 'chelyabinsk\\.ru', 'cheldoctor\\.ru', '//mychel\\.ru', 'cheldiplom\\.ru', '74\\.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
|
||||
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiSearchAny(URL, ['chelyabinsk.74.ru', 'doctor.74.ru', 'transport.74.ru', 'm.74.ru', '//74.ru/', 'chel.74.ru', 'afisha.74.ru', 'diplom.74.ru', 'chelfin.ru', '//chel.ru', 'chelyabinsk.ru', 'cheldoctor.ru', '//mychel.ru', 'cheldiplom.ru', '74.ru/video', 'market', 'poll', 'mail', 'conference', 'consult', 'contest', 'tags', 'feedback', 'pages', 'text'])]]></query>
|
||||
|
||||
<query><![CDATA[SELECT DISTINCT Title, multiMatchAnyEditDistance(Title, 2, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query>
|
||||
<query><![CDATA[SELECT DISTINCT Title, multiMatchAnyEditDistance(Title, 5, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query>
|
||||
<query><![CDATA[SELECT sum(multiMatchAnyEditDistance(Title, 3, ['hello$', 'world$', '^hello'])) FROM hits_100m_single]]></query>
|
||||
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatchAnyEditDistance(URL, 2, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query>
|
||||
<query><![CDATA[SELECT DISTINCT Title, multiFuzzyMatchAny(Title, 2, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query>
|
||||
<query><![CDATA[SELECT DISTINCT Title, multiFuzzyMatchAny(Title, 5, ['^metrika\\.ry$']) AS distance FROM hits_100m_single WHERE distance = 1]]></query>
|
||||
<query><![CDATA[SELECT sum(multiFuzzyMatchAny(Title, 3, ['hello$', 'world$', '^hello'])) FROM hits_100m_single]]></query>
|
||||
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiFuzzyMatchAny(URL, 2, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query>
|
||||
|
||||
|
||||
|
||||
|
@ -1,26 +1,26 @@
|
||||
SET send_logs_level = 'none';
|
||||
|
||||
select 0 = multiMatchAnyEditDistance('abc', 0, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiMatchAnyEditDistance('abc', 1, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiMatchAnyEditDistance('abc', 2, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiMatchAnyEditDistance('abc', 3, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
|
||||
select 1 = multiMatchAnyEditDistance('abc', 4, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
|
||||
select 0 = multiFuzzyMatchAny('abc', 0, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiFuzzyMatchAny('abc', 1, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiFuzzyMatchAny('abc', 2, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiFuzzyMatchAny('abc', 3, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
|
||||
select 1 = multiFuzzyMatchAny('abc', 4, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
|
||||
|
||||
select 1 = multiMatchAnyEditDistance('leftabcright', 1, ['a1c']) from system.numbers limit 5;
|
||||
select 1 = multiFuzzyMatchAny('leftabcright', 1, ['a1c']) from system.numbers limit 5;
|
||||
|
||||
select 1 = multiMatchAnyEditDistance('hello some world', 0, ['^hello.*world$']);
|
||||
select 1 = multiMatchAnyEditDistance('hallo some world', 1, ['^hello.*world$']);
|
||||
select 0 = multiMatchAnyEditDistance('halo some wrld', 2, ['^hello.*world$']);
|
||||
select 1 = multiMatchAnyEditDistance('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
|
||||
select 1 = multiMatchAnyEditDistance('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
|
||||
select 1 = multiMatchAnyEditDistance('halo some wrld', 3, ['^hello.*world$']);
|
||||
select 1 = multiMatchAnyEditDistance('hello some world', 10, ['^hello.*world$']); -- { serverError 49 }
|
||||
select 1 = multiMatchAnyEditDistance('hello some world', -1, ['^hello.*world$']); -- { serverError 43 }
|
||||
select 1 = multiMatchAnyEditDistance('hello some world', 10000000000, ['^hello.*world$']); -- { serverError 44 }
|
||||
select 1 = multiMatchAnyEditDistance('http://hyperscan_is_nice.ru/st', 2, ['http://hyperscan_is_nice.ru/(st\\d\\d$|st\\d\\d\\.|st1[0-4]\\d|st150|st\\d$|gl|rz|ch)']);
|
||||
select 0 = multiMatchAnyEditDistance('string', 0, ['zorro$', '^tring', 'in$', 'how.*', 'it{2}', 'works']);
|
||||
select 1 = multiFuzzyMatchAny('hello some world', 0, ['^hello.*world$']);
|
||||
select 1 = multiFuzzyMatchAny('hallo some world', 1, ['^hello.*world$']);
|
||||
select 0 = multiFuzzyMatchAny('halo some wrld', 2, ['^hello.*world$']);
|
||||
select 1 = multiFuzzyMatchAny('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
|
||||
select 1 = multiFuzzyMatchAny('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
|
||||
select 1 = multiFuzzyMatchAny('halo some wrld', 3, ['^hello.*world$']);
|
||||
select 1 = multiFuzzyMatchAny('hello some world', 10, ['^hello.*world$']); -- { serverError 49 }
|
||||
select 1 = multiFuzzyMatchAny('hello some world', -1, ['^hello.*world$']); -- { serverError 43 }
|
||||
select 1 = multiFuzzyMatchAny('hello some world', 10000000000, ['^hello.*world$']); -- { serverError 44 }
|
||||
select 1 = multiFuzzyMatchAny('http://hyperscan_is_nice.ru/st', 2, ['http://hyperscan_is_nice.ru/(st\\d\\d$|st\\d\\d\\.|st1[0-4]\\d|st150|st\\d$|gl|rz|ch)']);
|
||||
select 0 = multiFuzzyMatchAny('string', 0, ['zorro$', '^tring', 'in$', 'how.*', 'it{2}', 'works']);
|
||||
|
||||
select 1 = multiMatchAnyEditDistance('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
|
||||
select 2 = multiMatchAnyIndexEditDistance('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
|
||||
select 2 = multiMatchAnyIndexEditDistance('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
|
||||
select 1 = multiMatchAnyIndexEditDistance('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
|
||||
select 1 = multiFuzzyMatchAny('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
|
||||
select 2 = multiFuzzyMatchAnyIndex('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
|
||||
select 2 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
|
||||
select 1 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
|
||||
|
@ -62,13 +62,13 @@ The same as `match`, but returns 0 if none of the regular expressions are matche
|
||||
|
||||
The same as `multiMatchAny`, but returns the index of any pattern that matches the haystack.
|
||||
|
||||
## multiMatchAnyEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
|
||||
The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in experimental mode and can be extremely slow. For more information see the [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
||||
|
||||
## multiMatchAnyIndexEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
|
||||
The same as `multiMatchAnyEditDistance`, but returns any index that matches the haystack within constant edit distance.
|
||||
The same as `multiFuzzyMatchAny`, but returns the index of any pattern that matches the haystack within a constant edit distance.
|
||||
|
||||
**Note: to turn off all functions that use hyperscan, use the setting `SET allow_hyperscan = 0;`.**
|
||||
|
||||
|
@ -55,13 +55,13 @@
|
||||
|
||||
То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
|
||||
|
||||
## multiMatchAnyEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAny(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
|
||||
То же, что и `multiMatchAny`, но возвращает 1, если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
|
||||
|
||||
## multiMatchAnyIndexEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
## multiFuzzyMatchAnyIndex(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
|
||||
|
||||
То же, что и `multiMatchAnyEditDistance`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
|
||||
То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
|
||||
|
||||
**Примечание: чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`.**
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user