From 57f20ba17e6295816d118a9eb68380809ebf44eb Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sun, 13 Oct 2019 15:22:09 +0200 Subject: [PATCH] All multi{Fuzzy}MatchAllIndices functions --- dbms/src/Functions/FunctionsStringRegex.cpp | 132 ++++++++++++++++-- dbms/src/Functions/FunctionsStringRegex.h | 19 +-- dbms/src/Functions/FunctionsStringSearch.cpp | 78 ++++++++--- dbms/src/Functions/FunctionsStringSearch.h | 19 +-- dbms/src/Functions/Regexps.h | 68 ++++++--- .../0_stateless/00926_multimatch.reference | 23 +++ .../queries/0_stateless/00926_multimatch.sql | 18 ++- .../0_stateless/00927_disable_hyperscan.sql | 1 + .../00929_multi_match_edit_distance.reference | 2 + .../00929_multi_match_edit_distance.sql | 3 + .../functions/string_search_functions.md | 8 ++ .../functions/string_search_functions.md | 8 ++ 12 files changed, 305 insertions(+), 74 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringRegex.cpp b/dbms/src/Functions/FunctionsStringRegex.cpp index 464260e6a33..6eb0c8f8cdb 100644 --- a/dbms/src/Functions/FunctionsStringRegex.cpp +++ b/dbms/src/Functions/FunctionsStringRegex.cpp @@ -268,14 +268,12 @@ struct MultiMatchAnyImpl static_assert(static_cast(FindAny) + static_cast(FindAnyIndex) == 1); using ResultType = Type; static constexpr bool is_using_hyperscan = true; - - static void vector_constant( - const ColumnString::Chars & haystack_data, - const ColumnString::Offsets & haystack_offsets, - const std::vector & needles, - PaddedPODArray & res) + /// Variable for understanding, if we used offsets for the output, most + /// likely to determine whether the function returns ColumnVector of ColumnArray. 
+ static constexpr bool is_column_array = false; + static auto ReturnType() { - vector_constant(haystack_data, haystack_offsets, needles, res, std::nullopt); + return std::make_shared>(); } static void vector_constant( @@ -283,10 +281,22 @@ struct MultiMatchAnyImpl const ColumnString::Offsets & haystack_offsets, const std::vector & needles, PaddedPODArray & res, + PaddedPODArray & offsets) + { + vector_constant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); + } + + static void vector_constant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const std::vector & needles, + PaddedPODArray & res, + [[maybe_unused]] PaddedPODArray & offsets, [[maybe_unused]] std::optional edit_distance) { (void)FindAny; (void)FindAnyIndex; + res.resize(haystack_offsets.size()); #if USE_HYPERSCAN const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); hs_scratch_t * scratch = nullptr; @@ -307,15 +317,18 @@ struct MultiMatchAnyImpl *reinterpret_cast(context) = id; else if constexpr (FindAny) *reinterpret_cast(context) = 1; - return 0; + /// Once we hit the callback, there is no need to search for others. + return 1; }; const size_t haystack_offsets_size = haystack_offsets.size(); UInt64 offset = 0; for (size_t i = 0; i < haystack_offsets_size; ++i) { UInt64 length = haystack_offsets[i] - offset - 1; + /// Hyperscan restriction. if (length > std::numeric_limits::max()) throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES); + /// Zero the result, scan, check, update the offset. 
res[i] = 0; err = hs_scan( hyperscan_regex->getDB(), @@ -325,7 +338,7 @@ struct MultiMatchAnyImpl smart_scratch.get(), on_match, &res[i]); - if (err != HS_SUCCESS) + if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) throw Exception("Failed to scan with hyperscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT); offset = haystack_offsets[i]; } @@ -353,6 +366,87 @@ struct MultiMatchAnyImpl } }; +template +struct MultiMatchAllIndicesImpl +{ + using ResultType = Type; + static constexpr bool is_using_hyperscan = true; + /// Variable for understanding, if we used offsets for the output, most + /// likely to determine whether the function returns ColumnVector of ColumnArray. + static constexpr bool is_column_array = true; + static auto ReturnType() + { + return std::make_shared(std::make_shared()); + } + + static void vector_constant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const std::vector & needles, + PaddedPODArray & res, + PaddedPODArray & offsets) + { + vector_constant(haystack_data, haystack_offsets, needles, res, offsets, std::nullopt); + } + + static void vector_constant( + const ColumnString::Chars & haystack_data, + const ColumnString::Offsets & haystack_offsets, + const std::vector & needles, + PaddedPODArray & res, + PaddedPODArray & offsets, + [[maybe_unused]] std::optional edit_distance) + { + offsets.resize(haystack_offsets.size()); +#if USE_HYPERSCAN + const auto & hyperscan_regex = MultiRegexps::get(needles, edit_distance); + hs_scratch_t * scratch = nullptr; + hs_error_t err = hs_clone_scratch(hyperscan_regex->getScratch(), &scratch); + + if (err != HS_SUCCESS) + throw Exception("Could not clone scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY); + + MultiRegexps::ScratchPtr smart_scratch(scratch); + + auto on_match = [](unsigned int id, + unsigned long long /* from */, + unsigned long long /* to */, + unsigned int /* flags */, + void * context) -> int + { + 
static_cast*>(context)->push_back(id); + return 0; + }; + const size_t haystack_offsets_size = haystack_offsets.size(); + UInt64 offset = 0; + for (size_t i = 0; i < haystack_offsets_size; ++i) + { + UInt64 length = haystack_offsets[i] - offset - 1; + /// Hyperscan restriction. + if (length > std::numeric_limits::max()) + throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES); + /// Scan, check, update the offsets array and the offset of haystack. + err = hs_scan( + hyperscan_regex->getDB(), + reinterpret_cast(haystack_data.data()) + offset, + length, + 0, + smart_scratch.get(), + on_match, + &res); + if (err != HS_SUCCESS) + throw Exception("Failed to scan with hyperscan", ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT); + offsets[i] = res.size(); + offset = haystack_offsets[i]; + } +#else + throw Exception( + "multi-search all indices is not implemented when hyperscan is off (is it Intel processor?)", + ErrorCodes::NOT_IMPLEMENTED); +#endif // USE_HYPERSCAN + } +}; + struct ExtractImpl { @@ -866,6 +960,10 @@ struct NameMultiMatchAnyIndex { static constexpr auto name = "multiMatchAnyIndex"; }; +struct NameMultiMatchAllIndices +{ + static constexpr auto name = "multiMatchAllIndices"; +}; struct NameMultiFuzzyMatchAny { static constexpr auto name = "multiFuzzyMatchAny"; @@ -874,6 +972,10 @@ struct NameMultiFuzzyMatchAnyIndex { static constexpr auto name = "multiFuzzyMatchAnyIndex"; }; +struct NameMultiFuzzyMatchAllIndices +{ + static constexpr auto name = "multiFuzzyMatchAllIndices"; +}; struct NameExtract { static constexpr auto name = "extract"; @@ -908,6 +1010,11 @@ using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch< NameMultiMatchAnyIndex, std::numeric_limits::max()>; +using FunctionMultiMatchAllIndices = FunctionsMultiStringSearch< + MultiMatchAllIndicesImpl, + NameMultiMatchAllIndices, + std::numeric_limits::max()>; + using FunctionMultiFuzzyMatchAny = FunctionsMultiStringFuzzySearch< MultiMatchAnyImpl, NameMultiFuzzyMatchAny, @@ -918,6 
+1025,11 @@ using FunctionMultiFuzzyMatchAnyIndex = FunctionsMultiStringFuzzySearch< NameMultiFuzzyMatchAnyIndex, std::numeric_limits::max()>; +using FunctionMultiFuzzyMatchAllIndices = FunctionsMultiStringFuzzySearch< + MultiMatchAllIndicesImpl, + NameMultiFuzzyMatchAllIndices, + std::numeric_limits::max()>; + using FunctionLike = FunctionsStringSearch, NameLike>; using FunctionNotLike = FunctionsStringSearch, NameNotLike>; using FunctionExtract = FunctionsStringSearchToString; @@ -940,8 +1052,10 @@ void registerFunctionsStringRegex(FunctionFactory & factory) factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); + factory.registerFunction(); factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive); } } diff --git a/dbms/src/Functions/FunctionsStringRegex.h b/dbms/src/Functions/FunctionsStringRegex.h index a3f508b74d9..072f813c02b 100644 --- a/dbms/src/Functions/FunctionsStringRegex.h +++ b/dbms/src/Functions/FunctionsStringRegex.h @@ -63,9 +63,7 @@ public: if (!array_type || !checkAndGetDataType(array_type->getNestedType().get())) throw Exception( "Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - - return std::make_shared>(); + return Impl::ReturnType(); } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override @@ -115,20 +113,23 @@ public: for (const auto & el : src_arr) refs.emplace_back(el.get()); - const size_t column_haystack_size = column_haystack->size(); - auto col_res = ColumnVector::create(); + auto col_offsets = ColumnArray::ColumnOffsets::create(); auto & vec_res = col_res->getData(); + auto & offsets_res = col_offsets->getData(); - vec_res.resize(column_haystack_size); - + /// The blame for resizing output is for the callee. 
if (col_haystack_vector) - Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, edit_distance); + Impl::vector_constant( + col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res, edit_distance); else throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN); - block.getByPosition(result).column = std::move(col_res); + if constexpr (Impl::is_column_array) + block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets)); + else + block.getByPosition(result).column = std::move(col_res); } }; diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index e44138bb482..c39d536927c 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -47,7 +47,7 @@ struct PositionCaseSensitiveASCII /// Convert string to lowercase. Only for case-insensitive search. /// Implementation is permitted to be inefficient because it is called for single string. - static void toLowerIfNeed(std::string &) {} + static void toLowerIfNeed(std::string &) { } }; struct PositionCaseInsensitiveASCII @@ -107,7 +107,7 @@ struct PositionCaseSensitiveUTF8 return res; } - static void toLowerIfNeed(std::string &) {} + static void toLowerIfNeed(std::string &) { } }; struct PositionCaseInsensitiveUTF8 @@ -335,15 +335,21 @@ struct MultiSearchImpl { using ResultType = UInt8; static constexpr bool is_using_hyperscan = false; + /// Variable for understanding, if we used offsets for the output, most + /// likely to determine whether the function returns ColumnVector of ColumnArray. 
+ static constexpr bool is_column_array = false; + static auto ReturnType() { return std::make_shared>(); } static void vector_constant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res) + PaddedPODArray & res, + [[maybe_unused]] PaddedPODArray & offsets) { auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); + res.resize(haystack_string_size); size_t iteration = 0; while (searcher.hasMoreToSearch()) { @@ -366,12 +372,17 @@ struct MultiSearchFirstPositionImpl { using ResultType = UInt64; static constexpr bool is_using_hyperscan = false; + /// Variable for understanding, if we used offsets for the output, most + /// likely to determine whether the function returns ColumnVector or ColumnArray. + static constexpr bool is_column_array = false; + static auto ReturnType() { return std::make_shared>(); } static void vector_constant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res) + PaddedPODArray & res, + [[maybe_unused]] PaddedPODArray & offsets) { auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 { @@ -379,6 +390,7 @@ struct MultiSearchFirstPositionImpl }; auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); + res.resize(haystack_string_size); size_t iteration = 0; while (searcher.hasMoreToSearch()) { @@ -407,15 +419,21 @@ struct MultiSearchFirstIndexImpl { using ResultType = UInt64; static constexpr bool is_using_hyperscan = false; + /// Variable for understanding, if we used offsets for the output, most + /// likely to determine whether the function returns ColumnVector or ColumnArray. 
+ static constexpr bool is_column_array = false; + static auto ReturnType() { return std::make_shared>(); } static void vector_constant( const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, const std::vector & needles, - PaddedPODArray & res) + PaddedPODArray & res, + [[maybe_unused]] PaddedPODArray & offsets) { auto searcher = Impl::createMultiSearcherInBigHaystack(needles); const size_t haystack_string_size = haystack_offsets.size(); + res.resize(haystack_string_size); size_t iteration = 0; while (searcher.hasMoreToSearch()) { @@ -598,30 +616,48 @@ struct NameHasTokenCaseInsensitive using FunctionPosition = FunctionsStringSearch, NamePosition>; using FunctionPositionUTF8 = FunctionsStringSearch, NamePositionUTF8>; using FunctionPositionCaseInsensitive = FunctionsStringSearch, NamePositionCaseInsensitive>; -using FunctionPositionCaseInsensitiveUTF8 = FunctionsStringSearch, NamePositionCaseInsensitiveUTF8>; +using FunctionPositionCaseInsensitiveUTF8 + = FunctionsStringSearch, NamePositionCaseInsensitiveUTF8>; -using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition, NameMultiSearchAllPositions>; -using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition, NameMultiSearchAllPositionsUTF8>; -using FunctionMultiSearchAllPositionsCaseInsensitive = FunctionsMultiStringPosition, NameMultiSearchAllPositionsCaseInsensitive>; -using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition, NameMultiSearchAllPositionsCaseInsensitiveUTF8>; +using FunctionMultiSearchAllPositions + = FunctionsMultiStringPosition, NameMultiSearchAllPositions>; +using FunctionMultiSearchAllPositionsUTF8 + = FunctionsMultiStringPosition, NameMultiSearchAllPositionsUTF8>; +using FunctionMultiSearchAllPositionsCaseInsensitive + = FunctionsMultiStringPosition, NameMultiSearchAllPositionsCaseInsensitive>; +using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition< + 
MultiSearchAllPositionsImpl, + NameMultiSearchAllPositionsCaseInsensitiveUTF8>; using FunctionMultiSearch = FunctionsMultiStringSearch, NameMultiSearchAny>; using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch, NameMultiSearchAnyUTF8>; -using FunctionMultiSearchCaseInsensitive = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitive>; -using FunctionMultiSearchCaseInsensitiveUTF8 = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitiveUTF8>; +using FunctionMultiSearchCaseInsensitive + = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitive>; +using FunctionMultiSearchCaseInsensitiveUTF8 + = FunctionsMultiStringSearch, NameMultiSearchAnyCaseInsensitiveUTF8>; -using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch, NameMultiSearchFirstIndex>; -using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstIndexUTF8>; -using FunctionMultiSearchFirstIndexCaseInsensitive = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitive>; -using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitiveUTF8>; +using FunctionMultiSearchFirstIndex + = FunctionsMultiStringSearch, NameMultiSearchFirstIndex>; +using FunctionMultiSearchFirstIndexUTF8 + = FunctionsMultiStringSearch, NameMultiSearchFirstIndexUTF8>; +using FunctionMultiSearchFirstIndexCaseInsensitive + = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitive>; +using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8 + = FunctionsMultiStringSearch, NameMultiSearchFirstIndexCaseInsensitiveUTF8>; -using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch, NameMultiSearchFirstPosition>; -using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstPositionUTF8>; -using FunctionMultiSearchFirstPositionCaseInsensitive = FunctionsMultiStringSearch, NameMultiSearchFirstPositionCaseInsensitive>; -using 
FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch, NameMultiSearchFirstPositionCaseInsensitiveUTF8>; +using FunctionMultiSearchFirstPosition + = FunctionsMultiStringSearch, NameMultiSearchFirstPosition>; +using FunctionMultiSearchFirstPositionUTF8 + = FunctionsMultiStringSearch, NameMultiSearchFirstPositionUTF8>; +using FunctionMultiSearchFirstPositionCaseInsensitive + = FunctionsMultiStringSearch, NameMultiSearchFirstPositionCaseInsensitive>; +using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch< + MultiSearchFirstPositionImpl, + NameMultiSearchFirstPositionCaseInsensitiveUTF8>; using FunctionHasToken = FunctionsStringSearch, NameHasToken>; -using FunctionHasTokenCaseInsensitive = FunctionsStringSearch, NameHasTokenCaseInsensitive>; +using FunctionHasTokenCaseInsensitive + = FunctionsStringSearch, NameHasTokenCaseInsensitive>; void registerFunctionsStringSearch(FunctionFactory & factory) { diff --git a/dbms/src/Functions/FunctionsStringSearch.h b/dbms/src/Functions/FunctionsStringSearch.h index 053240570d1..1f7963fca5f 100644 --- a/dbms/src/Functions/FunctionsStringSearch.h +++ b/dbms/src/Functions/FunctionsStringSearch.h @@ -28,6 +28,7 @@ namespace DB * match(haystack, pattern) - search by regular expression re2; Returns 0 or 1. * multiMatchAny(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns 0 or 1 if any pattern_i matches. 
* multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns index of any match or zero if none; + * multiMatchAllIndices(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- search by re2 regular expressions pattern_i; Returns an array of matched indices in any order; * * Applies regexp re2 and pulls: * - the first subpattern, if the regexp has a subpattern; @@ -312,9 +313,7 @@ public: if (!array_type || !checkAndGetDataType(array_type->getNestedType().get())) throw Exception( "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - - - return std::make_shared>(); + return Impl::ReturnType(); } void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override @@ -347,20 +346,22 @@ public: for (const auto & el : src_arr) refs.emplace_back(el.get()); - const size_t column_haystack_size = column_haystack->size(); - auto col_res = ColumnVector::create(); + auto col_offsets = ColumnArray::ColumnOffsets::create(); auto & vec_res = col_res->getData(); + auto & offsets_res = col_offsets->getData(); - vec_res.resize(column_haystack_size); - + /// The blame for resizing output is for the callee. 
if (col_haystack_vector) - Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res); + Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, offsets_res); else throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN); - block.getByPosition(result).column = std::move(col_res); + if constexpr (Impl::is_column_array) + block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets)); + else + block.getByPosition(result).column = std::move(col_res); } }; diff --git a/dbms/src/Functions/Regexps.h b/dbms/src/Functions/Regexps.h index ce81e62ac69..e7fec8027fb 100644 --- a/dbms/src/Functions/Regexps.h +++ b/dbms/src/Functions/Regexps.h @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include @@ -87,18 +87,20 @@ namespace MultiRegexps } }; + /// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception. using CompilerError = std::unique_ptr>; using ScratchPtr = std::unique_ptr>; using DataBasePtr = std::unique_ptr>; - /// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher + /// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher. class Regexps { public: - Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} {} + Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { } hs_database_t * getDB() const { return db.get(); } hs_scratch_t * getScratch() const { return scratch.get(); } + private: DataBasePtr db; ScratchPtr scratch; @@ -106,25 +108,25 @@ namespace MultiRegexps struct Pool { - /// Mutex for finding in map + /// Mutex for finding in map. 
std::mutex mutex; - /// Patterns + possible edit_distance to database and scratch + /// Patterns + possible edit_distance to database and scratch. std::map, std::optional>, Regexps> storage; }; - template + template inline Regexps constructRegexps(const std::vector & str_patterns, std::optional edit_distance) { (void)edit_distance; /// Common pointers - std::vector ptrns; + std::vector patterns; std::vector flags; /// Pointer for external edit distance compilation std::vector ext_exprs; std::vector ext_exprs_ptrs; - ptrns.reserve(str_patterns.size()); + patterns.reserve(str_patterns.size()); flags.reserve(str_patterns.size()); if constexpr (CompileForEditDistance) @@ -135,12 +137,22 @@ namespace MultiRegexps for (const StringRef ref : str_patterns) { - ptrns.push_back(ref.data); - flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH | HS_FLAG_UTF8); + patterns.push_back(ref.data); + /* Flags below are the pattern matching flags. + * HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good + * performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode + * HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match. + * HS_FLAG_UTF8 is a flag where UTF8 literals are matched. + * HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. It is a good performance practice + * as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag + */ + flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8); if constexpr (CompileForEditDistance) { + /// Hyperscan currently does not support UTF8 matching with edit distance. flags.back() &= ~HS_FLAG_UTF8; ext_exprs.emplace_back(); + /// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenshtein distance. 
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE; ext_exprs.back().edit_distance = edit_distance.value(); ext_exprs_ptrs.push_back(&ext_exprs.back()); @@ -152,31 +164,32 @@ namespace MultiRegexps std::unique_ptr ids; - if constexpr (FindAnyIndex) + /// We mark the patterns to provide the callback results. + if constexpr (SaveIndices) { - ids.reset(new unsigned int[ptrns.size()]); - for (size_t i = 0; i < ptrns.size(); ++i) + ids.reset(new unsigned int[patterns.size()]); + for (size_t i = 0; i < patterns.size(); ++i) ids[i] = i + 1; } hs_error_t err; if constexpr (!CompileForEditDistance) err = hs_compile_multi( - ptrns.data(), + patterns.data(), flags.data(), ids.get(), - ptrns.size(), + patterns.size(), HS_MODE_BLOCK, nullptr, &db, &compile_error); else err = hs_compile_ext_multi( - ptrns.data(), + patterns.data(), flags.data(), ids.get(), ext_exprs_ptrs.data(), - ptrns.size(), + patterns.size(), HS_MODE_BLOCK, nullptr, &db, @@ -184,6 +197,7 @@ namespace MultiRegexps if (err != HS_SUCCESS) { + /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown. CompilerError error(compile_error); if (error->expression < 0) @@ -196,9 +210,12 @@ namespace MultiRegexps ProfileEvents::increment(ProfileEvents::RegexpCreated); + /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch + /// function which is faster than allocating scratch space each time in each thread. hs_scratch_t * scratch = nullptr; err = hs_alloc_scratch(db, &scratch); + /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch. 
if (err != HS_SUCCESS) throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY); @@ -206,7 +223,10 @@ namespace MultiRegexps } /// If CompileForEditDistance is False, edit_distance must be nullopt - template + /// Also, we use templates here because each instantiation of function + /// template has its own copy of local static variables which must not be the same + /// for different hyperscan compilations. + template inline Regexps * get(const std::vector & patterns, std::optional edit_distance) { /// C++11 has thread-safe function-local statics on most modern compilers. @@ -217,15 +237,19 @@ namespace MultiRegexps for (const StringRef & ref : patterns) str_patterns.push_back(ref.toString()); + /// Get the lock for finding database. std::unique_lock lock(known_regexps.mutex); auto it = known_regexps.storage.find({str_patterns, edit_distance}); + /// If not found, compile and let other threads wait. if (known_regexps.storage.end() == it) - it = known_regexps.storage.emplace( - std::pair{str_patterns, edit_distance}, - constructRegexps(str_patterns, edit_distance)).first; - + it = known_regexps.storage + .emplace( + std::pair{str_patterns, edit_distance}, + constructRegexps(str_patterns, edit_distance)) + .first; + /// If found, unlock and return the database. lock.unlock(); return &it->second; diff --git a/dbms/tests/queries/0_stateless/00926_multimatch.reference b/dbms/tests/queries/0_stateless/00926_multimatch.reference index 8e3a8ec4820..4a2320de57b 100644 --- a/dbms/tests/queries/0_stateless/00926_multimatch.reference +++ b/dbms/tests/queries/0_stateless/00926_multimatch.reference @@ -600,3 +600,26 @@ 1 1 1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +All tests above must return 1, all tests below return something. 
+[] +[1,3] +[] +[1,2,3] diff --git a/dbms/tests/queries/0_stateless/00926_multimatch.sql b/dbms/tests/queries/0_stateless/00926_multimatch.sql index 797c59f52a5..d54e4fd2280 100644 --- a/dbms/tests/queries/0_stateless/00926_multimatch.sql +++ b/dbms/tests/queries/0_stateless/00926_multimatch.sql @@ -73,10 +73,20 @@ select 1 = multiMatchAny(materialize('abcdef'), ['a......', 'a.....']) from syst select 0 = multiMatchAny(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']) from system.numbers limit 10; select 1 = multiMatchAny(materialize('abc'), ['a\0d']) from system.numbers limit 10; -select 1 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google', 'yandex1']) from system.numbers limit 10;; -select 2 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google1', 'yandex']) from system.numbers limit 10;; -select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*']) from system.numbers limit 10;; -select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10;; +select 1 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google', 'yandex1']) from system.numbers limit 10; +select 2 = multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['google1', 'yandex']) from system.numbers limit 10; +select 0 != multiMatchAnyIndex(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*']) from system.numbers limit 10; +select 5 = multiMatchAnyIndex(materialize('vladizlvav dabe don\'t heart me no more'), ['what', 'is', 'love', 'baby', 'no mo??', 'dont', 'h.rt me']) from system.numbers limit 10; SELECT multiMatchAny(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']); SELECT 1 = multiMatchAny('фабрикант', ['f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 
'фа[беьв]+е?[рлко]']); + +-- All indices tests +SELECT [1, 2] = arraySort(multiMatchAllIndices(materialize('gogleuedeyandexgoogle'), ['.*goo.*', '.*yan.*'])) from system.numbers limit 5; +SELECT [1, 3] = arraySort(multiMatchAllIndices(materialize('gogleuedeyandexgoogle'), ['.*goo.*', 'neverexisted', '.*yan.*'])) from system.numbers limit 5; +SELECT [] = multiMatchAllIndices(materialize('gogleuedeyandexgoogle'), ['neverexisted', 'anotherone', 'andanotherone']) from system.numbers limit 5; +SELECT [1, 2, 3, 11] = arraySort(multiMatchAllIndices('фабрикант', ['', 'рикан', 'а', 'f[ae]b[ei]rl', 'ф[иаэе]б[еэи][рпл]', 'афиукд', 'a[ft],th', '^ф[аиеэ]?б?[еэи]?$', 'берлик', 'fab', 'фа[беьв]+е?[рлко]'])); +SELECT [1] = multiMatchAllIndices(materialize('/odezhda-dlya-bega/'), ['/odezhda-dlya-bega/', 'kurtki-i-vetrovki-dlya-bega', 'futbolki-i-mayki-dlya-bega']); +SELECT [] = multiMatchAllIndices(materialize('aaaa'), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}']); +SELECT 'All tests above must return 1, all tests below return something.'; +SELECT arraySort(multiMatchAllIndices(arrayJoin(['aaaa', 'aaaaaa', 'bbbb', 'aaaaaaaaaaaaaa']), ['.*aa.*aaa.*', 'aaaaaa{2}', '\(aa\){3}'])); diff --git a/dbms/tests/queries/0_stateless/00927_disable_hyperscan.sql b/dbms/tests/queries/0_stateless/00927_disable_hyperscan.sql index 1af9c129284..009ed2629a8 100644 --- a/dbms/tests/queries/0_stateless/00927_disable_hyperscan.sql +++ b/dbms/tests/queries/0_stateless/00927_disable_hyperscan.sql @@ -2,5 +2,6 @@ SET allow_hyperscan = 1; SELECT multiMatchAny(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']); SET allow_hyperscan = 0; SELECT multiMatchAny(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']); -- { serverError 446 } +SELECT multiMatchAllIndices(arrayJoin(['hello', 'world', 'hellllllllo', 'wororld', 'abc']), ['hel+o', 'w(or)*ld']); -- { serverError 446 } SELECT multiSearchAny(arrayJoin(['hello', 'world', 'hello, world', 
'abc']), ['hello', 'world']); diff --git a/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.reference b/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.reference index 16ee5335538..4600557506b 100644 --- a/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.reference +++ b/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.reference @@ -30,3 +30,5 @@ 1 1 1 +1 +1 diff --git a/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.sql b/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.sql index 5cb75a8bc3f..48b31070204 100644 --- a/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.sql +++ b/dbms/tests/queries/0_stateless/00929_multi_match_edit_distance.sql @@ -24,3 +24,6 @@ select 1 = multiFuzzyMatchAny('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', select 2 = multiFuzzyMatchAnyIndex('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']); select 2 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']); select 1 = multiFuzzyMatchAnyIndex('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']); + +select [2, 3, 4] = arraySort(multiFuzzyMatchAllIndices('halo some wrld', 2, ['some random string', '^halo.*world$', '^halo.*world$', '^halo.*world$', '^hallllo.*world$'])); +select [] = multiFuzzyMatchAllIndices('halo some wrld', 2, ['^halllllo.*world$', 'some random string']); diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index 723b8edc154..9886a8ee89b 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -64,6 +64,10 @@ The same as `match`, but returns 0 if none of the regular expressions are matche The same as `multiMatchAny`, but returns any index that matches the haystack. 
+## multiMatchAllIndices(haystack, [pattern1, pattern2, ..., patternn]) + +The same as `multiMatchAny`, but returns the array of all indices that match the haystack in any order. + ## multiFuzzyMatchAny(haystack, distance, [pattern1, pattern2, ..., patternn]) The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within a constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). @@ -72,6 +76,10 @@ The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack w The same as `multiFuzzyMatchAny`, but returns any index that matches the haystack within a constant edit distance. +## multiFuzzyMatchAllIndices(haystack, distance, [pattern1, pattern2, ..., patternn]) + +The same as `multiFuzzyMatchAny`, but returns the array of all indices that match the haystack in any order within a constant edit distance. + !!! note "Note" `multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 0f86554b552..a81b8e9276e 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -57,6 +57,10 @@ То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения. +## multiMatchAllIndices(haystack, [pattern1, pattern2, ..., patternn]) + +То же, что и `multiMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке. 
+ ## multiFuzzyMatchAny(haystack, distance, [pattern1, pattern2, ..., patternn]) То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching). @@ -65,6 +69,10 @@ То же, что и `multiFuzzyMatchAny`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния. +## multiFuzzyMatchAllIndices(haystack, distance, [pattern1, pattern2, ..., patternn]) + +То же, что и `multiFuzzyMatchAny`, только возвращает массив всех индексов всех подходящих регулярных выражений в любом порядке в пределах константного редакционного расстояния. + !!! note "Примечание" `multiFuzzyMatch*` функции не поддерживают UTF-8 закодированные регулярные выражения, и такие выражения рассматриваются как байтовые из-за ограничения hyperscan.