diff --git a/.gitmodules b/.gitmodules index 6ad948c9a0a..6b6b734989d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -76,3 +76,6 @@ [submodule "contrib/brotli"] path = contrib/brotli url = https://github.com/google/brotli.git +[submodule "contrib/hyperscan"] + path = contrib/hyperscan + url = https://github.com/ClickHouse-Extras/hyperscan.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d9a5def427..750a3686c1d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -318,6 +318,7 @@ include (cmake/find_pdqsort.cmake) include (cmake/find_hdfs3.cmake) # uses protobuf include (cmake/find_consistent-hashing.cmake) include (cmake/find_base64.cmake) +include (cmake/find_hyperscan.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) find_contrib_lib(metrohash) diff --git a/cmake/find_hyperscan.cmake b/cmake/find_hyperscan.cmake new file mode 100644 index 00000000000..826ee555d53 --- /dev/null +++ b/cmake/find_hyperscan.cmake @@ -0,0 +1,7 @@ +if (HAVE_SSSE3) + set (HYPERSCAN_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/hyperscan/src) + set (HYPERSCAN_LIBRARY hs) + set (USE_HYPERSCAN 1) + set (USE_INTERNAL_HYPERSCAN_LIBRARY 1) + message (STATUS "Using hyperscan: ${HYPERSCAN_INCLUDE_DIR} " : ${HYPERSCAN_LIBRARY}) +endif() diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 636aaeb6bcc..dee9b3765de 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -304,3 +304,7 @@ endif () if (USE_BASE64) add_subdirectory (base64-cmake) endif() + +if (USE_HYPERSCAN) + add_subdirectory (hyperscan) +endif() diff --git a/contrib/boost b/contrib/boost index 6a96e8b59f7..32abf16beb7 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 6a96e8b59f76148eb8ad54a9d15259f8ce84c606 +Subproject commit 32abf16beb7bb8b243a4d100ccdd6acb271738c4 diff --git a/contrib/hyperscan b/contrib/hyperscan new file mode 160000 index 00000000000..1c8d85e309a --- /dev/null +++ b/contrib/hyperscan @@ -0,0 +1 @@ +Subproject commit 
1c8d85e309afe6932fe37acfc712766c1f80b793 diff --git a/dbms/src/Functions/CMakeLists.txt b/dbms/src/Functions/CMakeLists.txt index b313b0c5a15..47d86190562 100644 --- a/dbms/src/Functions/CMakeLists.txt +++ b/dbms/src/Functions/CMakeLists.txt @@ -64,3 +64,8 @@ if (USE_XXHASH) target_link_libraries(clickhouse_functions PRIVATE ${XXHASH_LIBRARY}) target_include_directories(clickhouse_functions SYSTEM PRIVATE ${XXHASH_INCLUDE_DIR}) endif() + +if (USE_HYPERSCAN) + target_link_libraries (clickhouse_functions PRIVATE ${HYPERSCAN_LIBRARY}) + target_include_directories (clickhouse_functions SYSTEM PRIVATE ${HYPERSCAN_INCLUDE_DIR}) +endif () diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index f977c3d3b9f..a6e6c9c6f56 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -15,6 +15,10 @@ #include #include +#ifdef __SSSE3__ +# include +#endif + #if USE_RE2_ST # include // Y_IGNORE #else @@ -334,13 +338,13 @@ struct MultiPositionImpl template struct MultiSearchImpl { - using ResultType = UInt64; + using ResultType = UInt8; static void vector_constant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res) + PaddedPODArray & res) { Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets, res); } @@ -524,8 +528,8 @@ struct MatchImpl res[i] = !revert; else { - const char * str_data = reinterpret_cast(&data[i != 0 ? offsets[i - 1] : 0]); - size_t str_size = (i != 0 ? 
offsets[i] - offsets[i - 1] : offsets[0]) - 1; + const char * str_data = reinterpret_cast(&data[offsets[i - 1]]); + size_t str_size = offsets[i] - offsets[i - 1] - 1; /** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp, * so that it can match when `required_substring` occurs into the string several times, @@ -581,6 +585,65 @@ struct MatchImpl }; +struct MultiMatchImpl +{ + using ResultType = UInt8; + + static void vector_constant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const std::vector & needles, + PaddedPODArray & res) + { +#ifdef __SSSE3__ + using ScratchPtr = std::unique_ptr>; + + const auto & hyperscan_regex = MultiRegexps::get(needles); + hs_scratch_t * scratch = nullptr; + hs_error_t err = hs_alloc_scratch(hyperscan_regex->get(), &scratch); + if (err != HS_SUCCESS) + throw Exception("Could not allocate scratch space for hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY); + ScratchPtr smart_scratch(scratch); + + auto on_match = [](unsigned int /* id */, + unsigned long long /* from */, + unsigned long long /* to */, + unsigned int /* flags */, + void * context) -> int + { + *reinterpret_cast(context) = 1; + return 0; + }; + const size_t haystack_offsets_size = haystack_offsets.size(); + size_t offset = 0; + for (size_t i = 0; i < haystack_offsets_size; ++i) + { + res[i] = 0; + hs_scan( + hyperscan_regex->get(), + reinterpret_cast(haystack_data.data()) + offset, + haystack_offsets[i] - offset - 1, + 0, + smart_scratch.get(), + on_match, + &res[i]); + offset = haystack_offsets[i]; + } +#else + PaddedPODArray accum(res.size()); + memset(res.data(), 0, res.size()); + memset(accum.data(), 0, accum.size()); + for (const StringRef ref : needles) + { + MatchImpl::vector_constant(haystack_data, haystack_offsets, ref.toString(), accum); + for (size_t i = 0; i < res.size(); ++i) + res[i] |= accum[i]; + } +#endif // __SSSE3__ + } +}; + + struct ExtractImpl { static void vector( @@ 
-1150,6 +1213,10 @@ struct NameNotLike { static constexpr auto name = "notLike"; }; +struct NameMultiMatch +{ + static constexpr auto name = "multiMatch"; +}; struct NameExtract { static constexpr auto name = "extract"; @@ -1201,6 +1268,7 @@ using FunctionFirstMatchCaseInsensitiveUTF8 using FunctionMatch = FunctionsStringSearch, NameMatch>; using FunctionLike = FunctionsStringSearch, NameLike>; using FunctionNotLike = FunctionsStringSearch, NameNotLike>; +using FunctionMultiMatch = FunctionsMultiStringSearch::max()>; using FunctionExtract = FunctionsStringSearchToString; using FunctionReplaceOne = FunctionStringReplace, NameReplaceOne>; using FunctionReplaceAll = FunctionStringReplace, NameReplaceAll>; @@ -1238,6 +1306,7 @@ void registerFunctionsStringSearch(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive); diff --git a/dbms/src/Functions/FunctionsStringSearch.h b/dbms/src/Functions/FunctionsStringSearch.h index 19edeeae61e..bb4cb1b4c3b 100644 --- a/dbms/src/Functions/FunctionsStringSearch.h +++ b/dbms/src/Functions/FunctionsStringSearch.h @@ -26,6 +26,7 @@ namespace DB * notLike(haystack, pattern) * * match(haystack, pattern) - search by regular expression re2; Returns 0 or 1. + * multiMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches. * * Applies regexp re2 and pulls: * - the first subpattern, if the regexp has a subpattern; @@ -269,9 +270,13 @@ public: } }; -template +/// The argument limiting raises from Volnitsky searcher -- it is performance crucial to save only one byte for pattern number. 
+/// But some other searchers use this function, for example, multiMatch -- hyperscan does not have such restrictions +template ::max()> class FunctionsMultiStringSearch : public IFunction { + static_assert(LimitArgs > 0); + public: static constexpr auto name = Name::name; static FunctionPtr create(const Context &) { return std::make_shared(); } @@ -282,10 +287,10 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() + 1 >= std::numeric_limits::max()) + if (arguments.size() + 1 >= LimitArgs) throw Exception( "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size()) - + ", should be at most 255.", + + ", should be at most " + std::to_string(LimitArgs) + ".", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); if (!isString(arguments[0])) @@ -333,6 +338,7 @@ public: vec_res.resize(column_haystack_size); + /// TODO support constant_constant version if (col_haystack_vector) Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res); else diff --git a/dbms/src/Functions/Regexps.h b/dbms/src/Functions/Regexps.h index 9ca9ed881e2..8c190ac26d4 100644 --- a/dbms/src/Functions/Regexps.h +++ b/dbms/src/Functions/Regexps.h @@ -1,19 +1,32 @@ #pragma once -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __SSSE3__ +# include +#endif namespace ProfileEvents { - extern const Event RegexpCreated; +extern const Event RegexpCreated; } namespace DB { +namespace ErrorCodes +{ + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int LOGICAL_ERROR; +} namespace Regexps { @@ -21,16 +34,22 @@ namespace Regexps using Pool = ObjectPoolMap; template - inline Regexp createRegexp(const std::string & pattern, int flags) { return {pattern, flags}; } + inline Regexp createRegexp(const std::string & pattern, int flags) + { + return {pattern, flags}; + } template <> - inline 
#ifdef __SSSE3__

/** Compilation and caching of hyperscan databases built from several regular expressions at once.
  * NOTE(review): the template arguments in this block were reconstructed from usage; confirm them against the repository.
  */
namespace MultiRegexps
{
    /// Stateless deleter for std::unique_ptr that calls the given hyperscan free function.
    template <typename Deleter, Deleter deleter>
    struct HyperscanDeleter
    {
        template <typename T>
        void operator()(T * ptr) const
        {
            deleter(ptr);
        }
    };

    /// Owning handle of a compiled hyperscan database.
    using Regexps = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;

    /// Pool of compiled databases keyed by the list of patterns they were built from.
    using Pool = ObjectPoolMap<Regexps, std::vector<String>>;

    /** Returns a compiled database for the given patterns, taking it from the pool when possible.
      * Throws Exception if hyperscan cannot compile the patterns.
      */
    inline Pool::Pointer get(const std::vector<StringRef> & patterns)
    {
        /// C++11 has thread-safe function-local statics on most modern compilers.
        static Pool known_regexps;

        std::vector<String> str_patterns;
        str_patterns.reserve(patterns.size());
        for (const StringRef ref : patterns)
            str_patterns.push_back(ref.toString());

        return known_regexps.get(str_patterns, [&str_patterns]
        {
            /// Only pattern flags (HS_FLAG_*) belong here. HS_CPU_FEATURES_AVX2 is a platform feature bit
            /// whose numeric value coincides with HS_FLAG_MULTILINE, so it must not be mixed in.
            /// Passing nullptr as the platform below lets hyperscan target the current host.
            constexpr unsigned int pattern_flags = HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY;

            std::vector<const char *> ptrns;
            ptrns.reserve(str_patterns.size());
            /// hs_compile_multi expects null-terminated expressions; the pointers stay valid while str_patterns lives.
            for (const auto & pattern : str_patterns)
                ptrns.push_back(pattern.c_str());

            const std::vector<unsigned int> flags(str_patterns.size(), pattern_flags);

            hs_database_t * db = nullptr;
            hs_compile_error_t * compile_error = nullptr;

            const hs_error_t err
                = hs_compile_multi(ptrns.data(), flags.data(), nullptr, ptrns.size(), HS_MODE_BLOCK, nullptr, &db, &compile_error);
            if (err != HS_SUCCESS)
            {
                /// Takes ownership of the error description so that it is freed when the exception is thrown.
                std::unique_ptr<
                    hs_compile_error_t,
                    HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>> error(compile_error);

                if (!error)
                    throw Exception("Hyperscan failed to compile patterns and provided no error description.", ErrorCodes::LOGICAL_ERROR);

                /// A negative expression index means that the error is not specific to one pattern.
                if (error->expression < 0)
                    throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR);
                else
                    throw Exception(
                        "Pattern '" + str_patterns[error->expression] + "' failed with error '" + String(error->message) + "'",
                        ErrorCodes::LOGICAL_ERROR);
            }

            ProfileEvents::increment(ProfileEvents::RegexpCreated);

            /// The pool takes ownership of the returned object, as Regexps::get does for single patterns.
            return new Regexps{db};
        });
    }
}

#endif // __SSSE3__
+1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/dbms/tests/queries/0_stateless/00926_multimatch.sql b/dbms/tests/queries/0_stateless/00926_multimatch.sql new file mode 100644 index 00000000000..24eec0c833f --- /dev/null +++ b/dbms/tests/queries/0_stateless/00926_multimatch.sql @@ -0,0 +1,74 @@ +select 0 = multiMatch(materialize('mpnsguhwsitzvuleiwebwjfitmsg'), ['wbirxqoabpblrnvvmjizj', 'cfcxhuvrexyzyjsh', 
'oldhtubemyuqlqbwvwwkwin', 'bumoozxdkjglzu', 'intxlfohlxmajjomw', 'dxkeghohv', 'arsvmwwkjeopnlwnan', 'ouugllgowpqtaxslcopkytbfhifaxbgt', 'hkedmjlbcrzvryaopjqdjjc', 'tbqkljywstuahzh', 'o', 'wowoclosyfcuwotmvjygzuzhrery', 'vpefjiffkhlggntcu', 'ytdixvasrorhripzfhjdmlhqksmctyycwp']) from system.numbers limit 10; +select 0 = multiMatch(materialize('qjjzqexjpgkglgxpzrbqbnskq'), ['vaiatcjacmlffdzsejpdareqzy', 'xspcfzdufkmecud', 'bcvtbuqtctq', 'nkcopwbfytgemkqcfnnno', 'dylxnzuyhq', 'tno', 'scukuhufly', 'cdyquzuqlptv', 'ohluyfeksyxepezdhqmtfmgkvzsyph', 'ualzwtahvqvtijwp', 'jg', 'gwbawqlngzcknzgtmlj', 'qimvjcgbkkp', 'eaedbcgyrdvv', 'qcwrncjoewwedyyewcdkh', 'uqcvhngoqngmitjfxpznqomertqnqcveoqk', 'ydrgjiankgygpm', 'axepgap']) from system.numbers limit 10; +select 0 = multiMatch(materialize('fdkmtqmxnegwvnjhghjq'), ['vynkybvdmhgeezybbdqfrukibisj', 'knazzamgjjpavwhvdkwigykh', 'peumnifrmdhhmrqqnemw', 'lmsnyvqoisinlaqobxojlwfbi', 'oqwfzs', 'dymudxxeodwjpgbibnkvr', 'vomtfsnizkplgzktqyoiw', 'yoyfuhlpgrzds', 'cefao', 'gi', 'srpgxfjwl', 'etsjusdeiwbfe', 'ikvtzdopxo', 'ljfkavrau', 'soqdhxtenfrkmeic', 'ktprjwfcelzbup', 'pcvuoddqwsaurcqdtjfnczekwni', 'agkqkqxkfbkfgyqliahsljim']) from system.numbers limit 10; +select 1 = multiMatch(materialize('khljxzxlpcrxpkrfybbfk'), ['', 'lpc', 'rxpkrfybb', 'crxp', '', 'pkr', 'jxzxlpcrxpkrf', '', 'xzxlpcr', 'xpk', 'fyb', 'xzxlpcrxpkrfybbfk', 'k', 'lpcrxp', 'ljxzxlpcr', 'r', 'pkr', 'fk']) from system.numbers limit 10; +select 0 = multiMatch(materialize('rbrizgjbigvzfnpgmpkqxoqxvdj'), ['ee', 'cohqnb', 'msol', 'yhlujcvhklnhuomy', 'ietn', 'vgmnlkcsybtokrepzrm', 'wspiryefojxysgrzsxyrluykxfnnbzdstcel', 'mxisnsivndbefqxwznimwgazuulupbaihavg', 'vpzdjvqqeizascxmzdhuq', 'pgvncohlxcqjhfkm', 'mbaypcnfapltsegquurahlsruqvipfhrhq', 'ioxjbcyyqujfveujfhnfdfokfcrlsincjbdt', 'cnvlujyowompdrqjwjx', 'wobwed', 'kdfhaoxiuifotmptcmdbk', 'leoamsnorcvtlmokdomkzuo', 'jjw', 'ogugysetxuqmvggneosbsfbonszepsatq']) from system.numbers limit 10; +select 0 = 
multiMatch(materialize('uymwxzyjbfegbhgswiqhinf'), ['lizxzbzlwljkr', 'ukxygktlpzuyijcqeqktxenlaqi', 'onperabgbdiafsxwbvpjtyt', 'xfqgoqvhqph', 'aflmcwabtwgmajmmqelxwkaolyyhmdlc', 'yfz', 'meffuiaicvwed', 'hhzvgmifzamgftkifaeowayjrnnzw', 'nwewybtajv', 'ectiye', 'epjeiljegmqqjncubj', 'zsjgftqjrn', 'pssng', 'raqoarfhdoeujulvqmdo']) from system.numbers limit 10; +select 0 = multiMatch(materialize('omgghgnzjmecpzqmtcvw'), ['fjhlzbszodmzavzg', 'gfofrnwrxprkfiokv', 'jmjiiqpgznlmyrxwewzqzbe', 'pkyrsqkltlmxr', 'crqgkgqkkyujcyoc', 'endagbcxwqhueczuasykmajfsvtcmh', 'xytmxtrnkdysuwltqomehddp', 'etmdxyyfotfyifwvbykghijvwv', 'mwqtgrncyhkfhjdg', 'iuvymofrqpp', 'pgllsdanlhzqhkstwsmzzftp', 'disjylcceufxtjdvhy']) from system.numbers limit 10; +select 1 = multiMatch(materialize('mznihnmshftvnmmhnrulizzpslq'), ['nrul', 'mshftvnmmhnr', 'z', 'mhnrulizzps', 'hftvnmmhnrul', 'ihnmshftvnmmhnrulizzp', 'izz', '', 'uli', 'nihnmshftvnmmhnru', 'hnrulizzp', 'nrulizz']) from system.numbers limit 10; +select 1 = multiMatch(materialize('ruqmqrsxrbftvruvahonradau'), ['uqmqrsxrbft', 'ftv', 'tvruvahonrad', 'mqrsxrbftvruvahon', 'rbftvruvah', 'qrsxrbftvru', 'o', 'ahonradau', 'a', 'ft', '', 'u', 'rsxrbftvruvahonradau', 'ruvahon', 'bftvruvahonradau', 'qrsxrbftvru', 't', 'vahonrada', 'vruvahonradau', 'onra']) from system.numbers limit 10; +select 1 = multiMatch(materialize('gpsevxtcoeexrltyzduyidmtzxf'), ['exrltyzduyid', 'vxtcoeexrltyz', 'xr', 'ltyzduyidmt', 'yzduy', 'exr', 'coeexrltyzduy', 'coeexrltyzduy', 'rlty', 'rltyzduyidm', 'exrltyz', 'xtcoeexrlty', 'vxtcoeexrltyzduyidm', '', 'coeexrl', 'sevxtcoeexrltyzdu', 'dmt', '']) from system.numbers limit 10; +select 0 = multiMatch(materialize('dyhycfhzyewaikgursyxfkuv'), ['sktnofpugrmyxmbizzrivmhn', 'fhlgadpoqcvktbfzncxbllvwutdawmw', 'eewzjpcgzrqmltbgmhafwlwqb', 'tpogbkyj', 'rtllntxjgkzs', 'mirbvsqexscnzglogigbujgdwjvcv', 'iktwpgjsakemewmahgqza', 'xgfvzkvqgiuoihjjnxwwpznxhz', 'nxaumpaknreklbwynvxdsmatjekdlxvklh', 'zadzwqhgfxqllihuudozxeixyokhny', 
'tdqpgfpzexlkslodps', 'slztannufxaabqfcjyfquafgfhfb', 'xvjldhfuwurvkb', 'aecv', 'uycfsughpikqsbcmwvqygdyexkcykhbnau', 'jr']) from system.numbers limit 10; +select 1 = multiMatch(materialize('vbcsettndwuntnruiyclvvwoo'), ['dwuntnru', '', 'ttndwuntnruiyclvv', 'ntnr', 'nruiyclvvw', 'wo', '', 'bcsettndwuntnruiycl', 'yc', 'untnruiyclvvw', 'csettndwuntnr', 'ntnruiyclvvwo']) from system.numbers limit 10; +select 0 = multiMatch(materialize('pqqnugshlczcuxhpjxjbcnro'), ['dpeedqy', 'rtsc', 'jdgla', 'qkgudqjiyzvlvsj', 'xmfxawhijgxxtydbd', 'ebgzazqthb', 'wyrjhvhwzhmpybnylirrn', 'iviqbyuclayqketooztwegtkgwnsezfl', 'bhvidy', 'hijctxxweboq', 't', 'osnzfbziidteiaifgaanm']) from system.numbers limit 10; +select 1 = multiMatch(materialize('loqchlxspwuvvccucskuytr'), ['', 'k', 'qchlxspwu', 'u', 'hlxspwuvv', 'wuvvccucsku', 'vcc', 'uyt', 'uvv', 'spwu', 'ytr', 'wuvvccucs', 'xspwuv', 'lxspwuvvccuc', 'spwuvvccu', 'oqchlxspwuvvccucskuy']) from system.numbers limit 10; +select 1 = multiMatch(materialize('pjjyzupzwllshlnatiujmwvaofr'), ['lnatiujmwvao', '', 'zupzwllsh', 'nati', 'wllshl', 'hlnatiujmwv', 'mwvao', 'shlnat', 'ati', 'wllshlnatiujmwvao', 'wllshlnatiujmwvaofr', 'nat']) from system.numbers limit 10; +select 1 = multiMatch(materialize('iketunkleyaqaxdlocci'), ['nkleyaqaxd', 'etunkleyaq', 'yaqaxdlocci', 'tunkleyaq', 'eyaqaxdlocc', 'leyaq', 'nkleyaqaxdl', 'tunkleya', 'kleyaqa', 'etunkleya', 'leyaqa', 'dlo', 'yaqa', 'leyaqaxd', 'etunkleyaq', '']) from system.numbers limit 10; +select 0 = multiMatch(materialize('drqianqtangmgbdwruvblkqd'), ['wusajejyucamkyl', 'wsgibljugzrpkniliy', 'lhwqqiuafwffyersqjgjvvvfurx', 'jfokpzzxfdonelorqu', 'ccwkpcgac', 'jmyulqpndkmzbfztobwtm', 'rwrgfkccgxht', 'ggldjecrgbngkonphtcxrkcviujihidjx', 'spwweavbiokizv', 'lv', 'krb', 'vstnhvkbwlqbconaxgbfobqky', 'pvxwdc', 'thrl', 'ahsblffdveamceonqwrbeyxzccmux', 'yozji', 'oejtaxwmeovtqtz', 'zsnzznvqpxdvdxhznxrjn', 'hse', 'kcmkrccxmljzizracxwmpoaggywhdfpxkq']) from system.numbers limit 10; +select 0 = 
multiMatch(materialize('yasnpckniistxcejowfijjsvkdajz'), ['slkpxhtsmrtvtm', 'crsbq', 'rdeshtxbfrlfwpsqojassxmvlfbzefldavmgme', 'ipetilcbpsfroefkjirquciwtxhrimbmwnlyv', 'knjpwkmdwbvdbapuyqbtsw', 'horueidziztxovqhsicnklmharuxhtgrsr', 'ofohrgpz', 'oneqnwyevbaqsonrcpmxcynflojmsnix', 'shg', 'nglqzczevgevwawdfperpeytuodjlf']) from system.numbers limit 10; +select 0 = multiMatch(materialize('ueptpscfgxhplwsueckkxs'), ['ohhygchclbpcdwmftperprn', 'dvpjdqmqckekndvcerqrpkxen', 'lohhvarnmyi', 'zppd', 'qmqxgfewitsunbuhffozcpjtc', 'hsjbioisycsrawktqssjovkmltxodjgv', 'dbzuunwbkrtosyvctdujqtvaawfnvuq', 'gupbvpqthqxae', 'abjdmijaaiasnccgxttmqdsz', 'uccyumqoyqe', 'kxxliepyzlc', 'wbqcqtbyyjbqcgdbpkmzugksmcxhvr', 'piedxm', 'uncpphzoif', 'exkdankwck', 'qeitzozdrqopsergzr', 'hesgrhaftgesnzflrrtjdobxhbepjoas', 'wfpexx']) from system.numbers limit 10; +select 1 = multiMatch(materialize('ldrzgttlqaphekkkdukgngl'), ['gttlqaphekkkdukgn', 'ekkkd', 'gttlqaphe', 'qaphek', 'h', 'kdu', 'he', 'phek', '', 'drzgttlqaphekkkd']) from system.numbers limit 10; +select 1 = multiMatch(materialize('ololo'), ['ololo', 'ololo', 'ololo']); +SELECT 1 = multiMatch(materialize('khljxzxlpcrxpkrfybbfk'), ['k']); + +select 1 = multiMatch(materialize(''), ['']); +select 0 = multiMatch(materialize(''), ['some string']); +select 1 = multiMatch(materialize('abc'), ['']); +select 1 = multiMatch(materialize('abc'), ['']) from system.numbers limit 10; + +select 0 = multiMatch(materialize('abc'), ['defgh']); +select 0 = multiMatch(materialize('abc'), ['defg']); +select 0 = multiMatch(materialize('abc'), ['def']); +select 0 = multiMatch(materialize('abc'), ['de']); +select 0 = multiMatch(materialize('abc'), ['d']); + + +select 1 = multiMatch(materialize('abc'), ['b']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abc'), ['bc']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['bcde']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['bcdef']) from 
system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['bcdefg']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['bcdefgh']) from system.numbers limit 10; + + +select 0 = multiMatch(materialize('abc'), ['abcdefg']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['abcdef']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['abcde']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['abcd']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abc'), ['abc']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abc'), ['ab']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abc'), ['a']) from system.numbers limit 10; + +select 1 = multiMatch(materialize('abcd'), ['c']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abcd'), ['cd']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abcd'), ['cde']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abcd'), ['cdef']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abcd'), ['cdefg']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abcd'), ['cdefgh']) from system.numbers limit 10; + +select 0 = multiMatch(materialize('abc'), ['defgh']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['defg']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['def']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['de']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abc'), ['d']) from system.numbers limit 10; + +select 1 = multiMatch(materialize('abc'), ['...']) from system.numbers limit 10; +select 1 = multiMatch(materialize('a\nbc'), ['a?bc']) from system.numbers limit 10; +select 1 = multiMatch(materialize('a\nbc'), ['a.bc']) from system.numbers limit 10; +select 1 = multiMatch(materialize('a\0bc'), 
['a?bc']) from system.numbers limit 10; +select 1 = multiMatch(materialize('a\0bc'), ['a.bc']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abcdef'), ['a.....']) from system.numbers limit 10; +select 0 = multiMatch(materialize('abcdef'), ['a......']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abcdef'), ['a......', 'a.....']) from system.numbers limit 10; +select 0 = multiMatch(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10; +select 1 = multiMatch(materialize('abc'), ['a\0d']) from system.numbers limit 10; diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index cfbdd28dbf3..744917449f4 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -10,6 +10,7 @@ | double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | | FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | | googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | +| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | | libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlib License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index dce9917776c..a523aa96d3d 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -44,6 +44,10 @@ Note that the backslash symbol (`\`) is used for escaping in the regular 
express The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes. For patterns to search for substrings in a string, it is better to use LIKE or 'position', since they work much faster. +## multiMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) + +The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) algorithm. For patterns to search substrings in a string, it is better to use `multiSearch` since it works much faster. + ## extract(haystack, pattern) Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern. diff --git a/docs/ru/development/contrib.md b/docs/ru/development/contrib.md index f7afd8bb0a8..3640f1f3a58 100644 --- a/docs/ru/development/contrib.md +++ b/docs/ru/development/contrib.md @@ -1,4 +1,4 @@ -# Используемые сторонние библиотеки +# Используемые сторонние библиотеки | Библиотека | Лицензия | | ------- | ------- | @@ -10,6 +10,7 @@ | double-conversion | [BSD 3-Clause License](https://github.com/google/double-conversion/blob/cf2f0f3d547dc73b4612028a155b80536902ba02/LICENSE) | | FastMemcpy | [MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libmemcpy/impl/LICENSE) | | googletest | [BSD 3-Clause License](https://github.com/google/googletest/blob/master/LICENSE) | +| hyperscan | [BSD 3-Clause License](https://github.com/intel/hyperscan/blob/master/LICENSE) | | libbtrie | [BSD 2-Clause License](https://github.com/yandex/ClickHouse/blob/master/contrib/libbtrie/LICENSE) | | libcxxabi | [BSD + MIT](https://github.com/yandex/ClickHouse/blob/master/libs/libglibc-compatibility/libcxxabi/LICENSE.TXT) | | libdivide | [Zlib 
License](https://github.com/yandex/ClickHouse/blob/master/contrib/libdivide/LICENSE.txt) | diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 4b335cce34c..1f5cf0ce54b 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -37,6 +37,10 @@ Регулярное выражение работает со строкой как с набором байт. Регулярное выражение не может содержать нулевые байты. Для шаблонов на поиск подстроки в строке, лучше используйте LIKE или position, так как они работают существенно быстрее. +## multiMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) + +То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется алгоритм [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearch`, так как она работает существенно быстрее. + ## extract(haystack, pattern) Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern.