Hyperscan multi regular expressions search

This commit is contained in:
Danila Kutenin 2019-03-23 22:40:16 +03:00
parent 7814ccbb29
commit f8001b41ba
17 changed files with 853 additions and 16 deletions

3
.gitmodules vendored
View File

@ -76,3 +76,6 @@
[submodule "contrib/brotli"]
path = contrib/brotli
url = https://github.com/google/brotli.git
[submodule "contrib/hyperscan"]
path = contrib/hyperscan
url = https://github.com/ClickHouse-Extras/hyperscan.git

View File

@ -318,6 +318,7 @@ include (cmake/find_pdqsort.cmake)
include (cmake/find_hdfs3.cmake) # uses protobuf
include (cmake/find_consistent-hashing.cmake)
include (cmake/find_base64.cmake)
include (cmake/find_hyperscan.cmake)
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(metrohash)

View File

@ -0,0 +1,7 @@
if (HAVE_SSSE3)
set (HYPERSCAN_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/hyperscan/src)
set (HYPERSCAN_LIBRARY hs)
set (USE_HYPERSCAN 1)
set (USE_INTERNAL_HYPERSCAN_LIBRARY 1)
message (STATUS "Using hyperscan: ${HYPERSCAN_INCLUDE_DIR} " : ${HYPERSCAN_LIBRARY})
endif()

View File

@ -304,3 +304,7 @@ endif ()
if (USE_BASE64)
add_subdirectory (base64-cmake)
endif()
if (USE_HYPERSCAN)
add_subdirectory (hyperscan)
endif()

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit 6a96e8b59f76148eb8ad54a9d15259f8ce84c606
Subproject commit 32abf16beb7bb8b243a4d100ccdd6acb271738c4

1
contrib/hyperscan vendored Submodule

@ -0,0 +1 @@
Subproject commit 1c8d85e309afe6932fe37acfc712766c1f80b793

View File

@ -64,3 +64,8 @@ if (USE_XXHASH)
target_link_libraries(clickhouse_functions PRIVATE ${XXHASH_LIBRARY})
target_include_directories(clickhouse_functions SYSTEM PRIVATE ${XXHASH_INCLUDE_DIR})
endif()
if (USE_HYPERSCAN)
target_link_libraries (clickhouse_functions PRIVATE ${HYPERSCAN_LIBRARY})
target_include_directories (clickhouse_functions SYSTEM PRIVATE ${HYPERSCAN_INCLUDE_DIR})
endif ()

View File

@ -15,6 +15,10 @@
#include <algorithm>
#include <memory>
#ifdef __SSSE3__
# include <hs.h>
#endif
#if USE_RE2_ST
# include <re2_st/re2.h> // Y_IGNORE
#else
@ -334,13 +338,13 @@ struct MultiPositionImpl
template <typename Impl>
struct MultiSearchImpl
{
using ResultType = UInt64;
using ResultType = UInt8;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
PaddedPODArray<UInt8> & res)
{
Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res);
}
@ -524,8 +528,8 @@ struct MatchImpl
res[i] = !revert;
else
{
const char * str_data = reinterpret_cast<const char *>(&data[i != 0 ? offsets[i - 1] : 0]);
size_t str_size = (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1;
const char * str_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
size_t str_size = offsets[i] - offsets[i - 1] - 1;
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
* so that it can match when `required_substring` occurs into the string several times,
@ -581,6 +585,65 @@ struct MatchImpl
};
struct MultiMatchImpl
{
using ResultType = UInt8;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<UInt8> & res)
{
#ifdef __SSSE3__
using ScratchPtr = std::unique_ptr<hs_scratch_t, DB::MultiRegexps::HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
const auto & hyperscan_regex = MultiRegexps::get(needles);
hs_scratch_t * scratch = nullptr;
hs_error_t err = hs_alloc_scratch(hyperscan_regex->get(), &scratch);
if (err != HS_SUCCESS)
throw Exception("Could not allocate scratch space for hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
ScratchPtr smart_scratch(scratch);
auto on_match = [](unsigned int /* id */,
unsigned long long /* from */,
unsigned long long /* to */,
unsigned int /* flags */,
void * context) -> int
{
*reinterpret_cast<UInt8 *>(context) = 1;
return 0;
};
const size_t haystack_offsets_size = haystack_offsets.size();
size_t offset = 0;
for (size_t i = 0; i < haystack_offsets_size; ++i)
{
res[i] = 0;
hs_scan(
hyperscan_regex->get(),
reinterpret_cast<const char *>(haystack_data.data()) + offset,
haystack_offsets[i] - offset - 1,
0,
smart_scratch.get(),
on_match,
&res[i]);
offset = haystack_offsets[i];
}
#else
PaddedPODArray<UInt8> accum(res.size());
memset(res.data(), 0, res.size());
memset(accum.data(), 0, accum.size());
for (const StringRef ref : needles)
{
MatchImpl<false, false>::vector_constant(haystack_data, haystack_offsets, ref.toString(), accum);
for (size_t i = 0; i < res.size(); ++i)
res[i] |= accum[i];
}
#endif // __SSSE3__
}
};
struct ExtractImpl
{
static void vector(
@ -1150,6 +1213,10 @@ struct NameNotLike
{
static constexpr auto name = "notLike";
};
struct NameMultiMatch
{
static constexpr auto name = "multiMatch";
};
struct NameExtract
{
static constexpr auto name = "extract";
@ -1201,6 +1268,7 @@ using FunctionFirstMatchCaseInsensitiveUTF8
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
using FunctionMultiMatch = FunctionsMultiStringSearch<MultiMatchImpl, NameMultiMatch, std::numeric_limits<UInt64>::max()>;
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<true>, NameReplaceOne>;
using FunctionReplaceAll = FunctionStringReplace<ReplaceStringImpl<false>, NameReplaceAll>;
@ -1238,6 +1306,7 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionMultiMatch>();
factory.registerFunction<FunctionExtract>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);

View File

@ -26,6 +26,7 @@ namespace DB
* notLike(haystack, pattern)
*
* match(haystack, pattern) - search by regular expression re2; Returns 0 or 1.
* multiMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches.
*
* Applies regexp re2 and pulls:
* - the first subpattern, if the regexp has a subpattern;
@ -269,9 +270,13 @@ public:
}
};
template <typename Impl, typename Name>
/// The argument limiting raises from Volnitsky searcher -- it is performance crucial to save only one byte for pattern number.
/// But some other searchers use this function, for example, multiMatch -- hyperscan does not have such restrictions
template <typename Impl, typename Name, size_t LimitArgs = std::numeric_limits<UInt8>::max()>
class FunctionsMultiStringSearch : public IFunction
{
static_assert(LimitArgs > 0);
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringSearch>(); }
@ -282,10 +287,10 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
if (arguments.size() + 1 >= LimitArgs)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
+ ", should be at most " + std::to_string(LimitArgs) + ".",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
@ -333,6 +338,7 @@ public:
vec_res.resize(column_haystack_size);
/// TODO support constant_constant version
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else

View File

@ -1,19 +1,32 @@
#pragma once
#include <Common/OptimizedRegularExpression.h>
#include <Common/ObjectPool.h>
#include <Common/ProfileEvents.h>
#include <Functions/likePatternToRegexp.h>
#include <Common/ObjectPool.h>
#include <Common/OptimizedRegularExpression.h>
#include <Common/ProfileEvents.h>
#include <common/StringRef.h>
#include <memory>
#include <string>
#include <vector>
#ifdef __SSSE3__
# include <hs.h>
#endif
namespace ProfileEvents
{
extern const Event RegexpCreated;
extern const Event RegexpCreated;
}
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_ALLOCATE_MEMORY;
extern const int LOGICAL_ERROR;
}
namespace Regexps
{
@ -21,16 +34,22 @@ namespace Regexps
using Pool = ObjectPoolMap<Regexp, String>;
template <bool like>
inline Regexp createRegexp(const std::string & pattern, int flags) { return {pattern, flags}; }
inline Regexp createRegexp(const std::string & pattern, int flags)
{
return {pattern, flags};
}
template <>
inline Regexp createRegexp<true>(const std::string & pattern, int flags) { return {likePatternToRegexp(pattern), flags}; }
inline Regexp createRegexp<true>(const std::string & pattern, int flags)
{
return {likePatternToRegexp(pattern), flags};
}
template <bool like, bool no_capture>
inline Pool::Pointer get(const std::string & pattern)
{
/// C++11 has thread-safe function-local statics on most modern compilers.
static Pool known_regexps; /// Different variables for different pattern parameters.
static Pool known_regexps; /// Different variables for different pattern parameters.
return known_regexps.get(pattern, [&pattern]
{
@ -44,4 +63,74 @@ namespace Regexps
}
}
#ifdef __SSSE3__
namespace MultiRegexps
{
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter
{
template <typename T>
void operator()(T * ptr) const
{
deleter(ptr);
}
};
using Regexps = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
using Pool = ObjectPoolMap<Regexps, std::vector<String>>;
inline Pool::Pointer get(const std::vector<StringRef> & patterns)
{
/// C++11 has thread-safe function-local statics on most modern compilers.
static Pool known_regexps; /// Different variables for different pattern parameters.
std::vector<String> str_patterns;
str_patterns.reserve(patterns.size());
for (const StringRef ref : patterns)
str_patterns.push_back(ref.toString());
return known_regexps.get(str_patterns, [&str_patterns]
{
std::vector<const char *> ptrns;
std::vector<unsigned int> flags;
ptrns.reserve(str_patterns.size());
flags.reserve(str_patterns.size());
for (const StringRef ref : str_patterns)
{
ptrns.push_back(ref.data);
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY);
#ifdef __AVX2__
flags.back() |= HS_CPU_FEATURES_AVX2;
#endif // __AVX2__
}
hs_database_t * db = nullptr;
hs_compile_error_t * compile_error;
hs_error_t err
= hs_compile_multi(ptrns.data(), flags.data(), nullptr, ptrns.size(), HS_MODE_BLOCK, nullptr, &db, &compile_error);
if (err != HS_SUCCESS)
{
std::unique_ptr<
hs_compile_error_t,
HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>> error(compile_error);
if (error->expression < 0)
throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR);
else
throw Exception(
"Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message),
ErrorCodes::LOGICAL_ERROR);
}
ProfileEvents::increment(ProfileEvents::RegexpCreated);
return new Regexps{db};
});
}
}
#endif // __SSSE3__
}

View File

@ -25,21 +25,29 @@
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiPosition(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiMatch(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|yahoo|pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'facebook')), sum(match(URL, 'wikipedia')), sum(match(URL, 'reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(multiMatch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|facebook|wikipedia|reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(firstMatch(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE multiMatch(URL, ['about/address', 'for_woman', '^https?://lm-company.ruy/$', 'ultimateguitar.com'])]]></query>
<query><![CDATA[SELECT count() FROM hits_100m_single WHERE match(URL, 'about/address|for_woman|^https?://lm-company.ruy/$|ultimateguitar.com')]]></query>
<main_metric>
<min_time/>
</main_metric>

View File

@ -0,0 +1,560 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,74 @@
select 0 = multiMatch(materialize('mpnsguhwsitzvuleiwebwjfitmsg'), ['wbirxqoabpblrnvvmjizj', 'cfcxhuvrexyzyjsh', 'oldhtubemyuqlqbwvwwkwin', 'bumoozxdkjglzu', 'intxlfohlxmajjomw', 'dxkeghohv', 'arsvmwwkjeopnlwnan', 'ouugllgowpqtaxslcopkytbfhifaxbgt', 'hkedmjlbcrzvryaopjqdjjc', 'tbqkljywstuahzh', 'o', 'wowoclosyfcuwotmvjygzuzhrery', 'vpefjiffkhlggntcu', 'ytdixvasrorhripzfhjdmlhqksmctyycwp']) from system.numbers limit 10;
select 0 = multiMatch(materialize('qjjzqexjpgkglgxpzrbqbnskq'), ['vaiatcjacmlffdzsejpdareqzy', 'xspcfzdufkmecud', 'bcvtbuqtctq', 'nkcopwbfytgemkqcfnnno', 'dylxnzuyhq', 'tno', 'scukuhufly', 'cdyquzuqlptv', 'ohluyfeksyxepezdhqmtfmgkvzsyph', 'ualzwtahvqvtijwp', 'jg', 'gwbawqlngzcknzgtmlj', 'qimvjcgbkkp', 'eaedbcgyrdvv', 'qcwrncjoewwedyyewcdkh', 'uqcvhngoqngmitjfxpznqomertqnqcveoqk', 'ydrgjiankgygpm', 'axepgap']) from system.numbers limit 10;
select 0 = multiMatch(materialize('fdkmtqmxnegwvnjhghjq'), ['vynkybvdmhgeezybbdqfrukibisj', 'knazzamgjjpavwhvdkwigykh', 'peumnifrmdhhmrqqnemw', 'lmsnyvqoisinlaqobxojlwfbi', 'oqwfzs', 'dymudxxeodwjpgbibnkvr', 'vomtfsnizkplgzktqyoiw', 'yoyfuhlpgrzds', 'cefao', 'gi', 'srpgxfjwl', 'etsjusdeiwbfe', 'ikvtzdopxo', 'ljfkavrau', 'soqdhxtenfrkmeic', 'ktprjwfcelzbup', 'pcvuoddqwsaurcqdtjfnczekwni', 'agkqkqxkfbkfgyqliahsljim']) from system.numbers limit 10;
select 1 = multiMatch(materialize('khljxzxlpcrxpkrfybbfk'), ['', 'lpc', 'rxpkrfybb', 'crxp', '', 'pkr', 'jxzxlpcrxpkrf', '', 'xzxlpcr', 'xpk', 'fyb', 'xzxlpcrxpkrfybbfk', 'k', 'lpcrxp', 'ljxzxlpcr', 'r', 'pkr', 'fk']) from system.numbers limit 10;
select 0 = multiMatch(materialize('rbrizgjbigvzfnpgmpkqxoqxvdj'), ['ee', 'cohqnb', 'msol', 'yhlujcvhklnhuomy', 'ietn', 'vgmnlkcsybtokrepzrm', 'wspiryefojxysgrzsxyrluykxfnnbzdstcel', 'mxisnsivndbefqxwznimwgazuulupbaihavg', 'vpzdjvqqeizascxmzdhuq', 'pgvncohlxcqjhfkm', 'mbaypcnfapltsegquurahlsruqvipfhrhq', 'ioxjbcyyqujfveujfhnfdfokfcrlsincjbdt', 'cnvlujyowompdrqjwjx', 'wobwed', 'kdfhaoxiuifotmptcmdbk', 'leoamsnorcvtlmokdomkzuo', 'jjw', 'ogugysetxuqmvggneosbsfbonszepsatq']) from system.numbers limit 10;
select 0 = multiMatch(materialize('uymwxzyjbfegbhgswiqhinf'), ['lizxzbzlwljkr', 'ukxygktlpzuyijcqeqktxenlaqi', 'onperabgbdiafsxwbvpjtyt', 'xfqgoqvhqph', 'aflmcwabtwgmajmmqelxwkaolyyhmdlc', 'yfz', 'meffuiaicvwed', 'hhzvgmifzamgftkifaeowayjrnnzw', 'nwewybtajv', 'ectiye', 'epjeiljegmqqjncubj', 'zsjgftqjrn', 'pssng', 'raqoarfhdoeujulvqmdo']) from system.numbers limit 10;
select 0 = multiMatch(materialize('omgghgnzjmecpzqmtcvw'), ['fjhlzbszodmzavzg', 'gfofrnwrxprkfiokv', 'jmjiiqpgznlmyrxwewzqzbe', 'pkyrsqkltlmxr', 'crqgkgqkkyujcyoc', 'endagbcxwqhueczuasykmajfsvtcmh', 'xytmxtrnkdysuwltqomehddp', 'etmdxyyfotfyifwvbykghijvwv', 'mwqtgrncyhkfhjdg', 'iuvymofrqpp', 'pgllsdanlhzqhkstwsmzzftp', 'disjylcceufxtjdvhy']) from system.numbers limit 10;
select 1 = multiMatch(materialize('mznihnmshftvnmmhnrulizzpslq'), ['nrul', 'mshftvnmmhnr', 'z', 'mhnrulizzps', 'hftvnmmhnrul', 'ihnmshftvnmmhnrulizzp', 'izz', '', 'uli', 'nihnmshftvnmmhnru', 'hnrulizzp', 'nrulizz']) from system.numbers limit 10;
select 1 = multiMatch(materialize('ruqmqrsxrbftvruvahonradau'), ['uqmqrsxrbft', 'ftv', 'tvruvahonrad', 'mqrsxrbftvruvahon', 'rbftvruvah', 'qrsxrbftvru', 'o', 'ahonradau', 'a', 'ft', '', 'u', 'rsxrbftvruvahonradau', 'ruvahon', 'bftvruvahonradau', 'qrsxrbftvru', 't', 'vahonrada', 'vruvahonradau', 'onra']) from system.numbers limit 10;
select 1 = multiMatch(materialize('gpsevxtcoeexrltyzduyidmtzxf'), ['exrltyzduyid', 'vxtcoeexrltyz', 'xr', 'ltyzduyidmt', 'yzduy', 'exr', 'coeexrltyzduy', 'coeexrltyzduy', 'rlty', 'rltyzduyidm', 'exrltyz', 'xtcoeexrlty', 'vxtcoeexrltyzduyidm', '', 'coeexrl', 'sevxtcoeexrltyzdu', 'dmt', '']) from system.numbers limit 10;
select 0 = multiMatch(materialize('dyhycfhzyewaikgursyxfkuv'), ['sktnofpugrmyxmbizzrivmhn', 'fhlgadpoqcvktbfzncxbllvwutdawmw', 'eewzjpcgzrqmltbgmhafwlwqb', 'tpogbkyj', 'rtllntxjgkzs', 'mirbvsqexscnzglogigbujgdwjvcv', 'iktwpgjsakemewmahgqza', 'xgfvzkvqgiuoihjjnxwwpznxhz', 'nxaumpaknreklbwynvxdsmatjekdlxvklh', 'zadzwqhgfxqllihuudozxeixyokhny', 'tdqpgfpzexlkslodps', 'slztannufxaabqfcjyfquafgfhfb', 'xvjldhfuwurvkb', 'aecv', 'uycfsughpikqsbcmwvqygdyexkcykhbnau', 'jr']) from system.numbers limit 10;
select 1 = multiMatch(materialize('vbcsettndwuntnruiyclvvwoo'), ['dwuntnru', '', 'ttndwuntnruiyclvv', 'ntnr', 'nruiyclvvw', 'wo', '', 'bcsettndwuntnruiycl', 'yc', 'untnruiyclvvw', 'csettndwuntnr', 'ntnruiyclvvwo']) from system.numbers limit 10;
select 0 = multiMatch(materialize('pqqnugshlczcuxhpjxjbcnro'), ['dpeedqy', 'rtsc', 'jdgla', 'qkgudqjiyzvlvsj', 'xmfxawhijgxxtydbd', 'ebgzazqthb', 'wyrjhvhwzhmpybnylirrn', 'iviqbyuclayqketooztwegtkgwnsezfl', 'bhvidy', 'hijctxxweboq', 't', 'osnzfbziidteiaifgaanm']) from system.numbers limit 10;
select 1 = multiMatch(materialize('loqchlxspwuvvccucskuytr'), ['', 'k', 'qchlxspwu', 'u', 'hlxspwuvv', 'wuvvccucsku', 'vcc', 'uyt', 'uvv', 'spwu', 'ytr', 'wuvvccucs', 'xspwuv', 'lxspwuvvccuc', 'spwuvvccu', 'oqchlxspwuvvccucskuy']) from system.numbers limit 10;
select 1 = multiMatch(materialize('pjjyzupzwllshlnatiujmwvaofr'), ['lnatiujmwvao', '', 'zupzwllsh', 'nati', 'wllshl', 'hlnatiujmwv', 'mwvao', 'shlnat', 'ati', 'wllshlnatiujmwvao', 'wllshlnatiujmwvaofr', 'nat']) from system.numbers limit 10;
select 1 = multiMatch(materialize('iketunkleyaqaxdlocci'), ['nkleyaqaxd', 'etunkleyaq', 'yaqaxdlocci', 'tunkleyaq', 'eyaqaxdlocc', 'leyaq', 'nkleyaqaxdl', 'tunkleya', 'kleyaqa', 'etunkleya', 'leyaqa', 'dlo', 'yaqa', 'leyaqaxd', 'etunkleyaq', '']) from system.numbers limit 10;
select 0 = multiMatch(materialize('drqianqtangmgbdwruvblkqd'), ['wusajejyucamkyl', 'wsgibljugzrpkniliy', 'lhwqqiuafwffyersqjgjvvvfurx', 'jfokpzzxfdonelorqu', 'ccwkpcgac', 'jmyulqpndkmzbfztobwtm', 'rwrgfkccgxht', 'ggldjecrgbngkonphtcxrkcviujihidjx', 'spwweavbiokizv', 'lv', 'krb', 'vstnhvkbwlqbconaxgbfobqky', 'pvxwdc', 'thrl', 'ahsblffdveamceonqwrbeyxzccmux', 'yozji', 'oejtaxwmeovtqtz', 'zsnzznvqpxdvdxhznxrjn', 'hse', 'kcmkrccxmljzizracxwmpoaggywhdfpxkq']) from system.numbers limit 10;
select 0 = multiMatch(materialize('yasnpckniistxcejowfijjsvkdajz'), ['slkpxhtsmrtvtm', 'crsbq', 'rdeshtxbfrlfwpsqojassxmvlfbzefldavmgme', 'ipetilcbpsfroefkjirquciwtxhrimbmwnlyv', 'knjpwkmdwbvdbapuyqbtsw', 'horueidziztxovqhsicnklmharuxhtgrsr', 'ofohrgpz', 'oneqnwyevbaqsonrcpmxcynflojmsnix', 'shg', 'nglqzczevgevwawdfperpeytuodjlf']) from system.numbers limit 10;
select 0 = multiMatch(materialize('ueptpscfgxhplwsueckkxs'), ['ohhygchclbpcdwmftperprn', 'dvpjdqmqckekndvcerqrpkxen', 'lohhvarnmyi', 'zppd', 'qmqxgfewitsunbuhffozcpjtc', 'hsjbioisycsrawktqssjovkmltxodjgv', 'dbzuunwbkrtosyvctdujqtvaawfnvuq', 'gupbvpqthqxae', 'abjdmijaaiasnccgxttmqdsz', 'uccyumqoyqe', 'kxxliepyzlc', 'wbqcqtbyyjbqcgdbpkmzugksmcxhvr', 'piedxm', 'uncpphzoif', 'exkdankwck', 'qeitzozdrqopsergzr', 'hesgrhaftgesnzflrrtjdobxhbepjoas', 'wfpexx']) from system.numbers limit 10;
select 1 = multiMatch(materialize('ldrzgttlqaphekkkdukgngl'), ['gttlqaphekkkdukgn', 'ekkkd', 'gttlqaphe', 'qaphek', 'h', 'kdu', 'he', 'phek', '', 'drzgttlqaphekkkd']) from system.numbers limit 10;
select 1 = multiMatch(materialize('ololo'), ['ololo', 'ololo', 'ololo']);
SELECT 1 = multiMatch(materialize('khljxzxlpcrxpkrfybbfk'), ['k']);
select 1 = multiMatch(materialize(''), ['']);
select 0 = multiMatch(materialize(''), ['some string']);
select 1 = multiMatch(materialize('abc'), ['']);
select 1 = multiMatch(materialize('abc'), ['']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['defgh']);
select 0 = multiMatch(materialize('abc'), ['defg']);
select 0 = multiMatch(materialize('abc'), ['def']);
select 0 = multiMatch(materialize('abc'), ['de']);
select 0 = multiMatch(materialize('abc'), ['d']);
select 1 = multiMatch(materialize('abc'), ['b']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abc'), ['bc']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['bcde']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['bcdef']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['bcdefg']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['bcdefgh']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['abcdefg']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['abcdef']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['abcde']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['abcd']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abc'), ['abc']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abc'), ['ab']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abc'), ['a']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abcd'), ['c']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abcd'), ['cd']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abcd'), ['cde']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abcd'), ['cdef']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abcd'), ['cdefg']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abcd'), ['cdefgh']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['defgh']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['defg']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['def']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['de']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abc'), ['d']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abc'), ['...']) from system.numbers limit 10;
select 1 = multiMatch(materialize('a\nbc'), ['a?bc']) from system.numbers limit 10;
select 1 = multiMatch(materialize('a\nbc'), ['a.bc']) from system.numbers limit 10;
select 1 = multiMatch(materialize('a\0bc'), ['a?bc']) from system.numbers limit 10;
select 1 = multiMatch(materialize('a\0bc'), ['a.bc']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abcdef'), ['a.....']) from system.numbers limit 10;
select 0 = multiMatch(materialize('abcdef'), ['a......']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abcdef'), ['a......', 'a.....']) from system.numbers limit 10;
select 0 = multiMatch(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10;
select 1 = multiMatch(materialize('abc'), ['a\0d']) from system.numbers limit 10;

View File

@ -10,6 +10,7 @@
| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) |
| FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |

View File

@ -44,6 +44,10 @@ Note that the backslash symbol (`\`) is used for escaping in the regular express
The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes.
For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster.
## multiMatch(haystack, [pattern_1, pattern_2, ..., pattern_n])
The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) algorithm. For patterns to search substrings in a string, it is better to use `multiSearch` since it works much faster.
## extract(haystack, pattern)
Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern.

View File

@ -1,4 +1,4 @@
# Используемые сторонние библиотеки
# Используемые сторонние библиотеки
| Библиотека | Лицензия |
| ------- | ------- |
@ -10,6 +10,7 @@
| double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) |
| FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) |
| googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) |
| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) |
| libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) |
| libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) |
| libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) |

View File

@ -37,6 +37,10 @@
Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты.
Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее.
## multiMatch(haystack, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется алгоритм [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearch`, так как она работает существенно быстрее.
## extract(haystack, pattern)
Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern.