diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index 7d96feba1f3..68f5b86877e 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -14,13 +15,40 @@ namespace DB } } +namespace +{ -template -void OptimizedRegularExpressionImpl::analyze( +struct Literal +{ + std::string literal; + bool prefix; /// this literal string is the prefix of the whole string. + bool suffix; /// this literal string is the suffix of the whole string. + void clear() + { + literal.clear(); + prefix = false; + suffix = false; + } +}; + +using Literals = std::vector; + +size_t shortest_literal_length(const Literals & literals) +{ + if (literals.empty()) return 0; + size_t shortest = std::numeric_limits::max(); + for (const auto & lit : literals) + if (shortest > lit.literal.size()) + shortest = lit.literal.size(); + return shortest; +} + +const char * analyzeImpl( std::string_view regexp, - std::string & required_substring, + const char * pos, + Literal & required_substring, bool & is_trivial, - bool & required_substring_is_prefix) + Literals & global_alternatives) { /** The expression is trivial if all the metacharacters in it are escaped. * The non-alternative string is @@ -30,12 +58,11 @@ void OptimizedRegularExpressionImpl::analyze( * and also avoid substrings of the form `http://` or `www` and some other * (this is the hack for typical use case in web analytics applications). */ - const char * begin = regexp.data(); - const char * pos = begin; + const char * begin = pos; const char * end = regexp.data() + regexp.size(); + bool is_first_call = begin == regexp.data(); int depth = 0; is_trivial = true; - required_substring_is_prefix = false; required_substring.clear(); bool has_alternative_on_depth_0 = false; bool has_case_insensitive_flag = false; @@ -47,6 +74,80 @@ void OptimizedRegularExpressionImpl::analyze( Substrings trivial_substrings(1); Substring * last_substring = &trivial_substrings.back(); + Literals cur_alternatives; + + auto finish_cur_alternatives = [&]() + { + if (cur_alternatives.empty()) + return; + + if (global_alternatives.empty()) + { + global_alternatives = cur_alternatives; + cur_alternatives.clear(); + return; + } + /// that means current alternatives have better quality. + if (shortest_literal_length(global_alternatives) < shortest_literal_length(cur_alternatives)) + { + global_alternatives.clear(); + global_alternatives = cur_alternatives; + } + cur_alternatives.clear(); + }; + + auto finish_non_trivial_char = [&](bool create_new_substr = true) + { + if (depth != 0) + return; + + for (auto & alter : cur_alternatives) + { + if (alter.suffix) + { + alter.literal += last_substring->first; + } + } + + finish_cur_alternatives(); + + if (!last_substring->first.empty() && create_new_substr) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + }; + + /// Resolve the string or alters in a group (xxxxx) + auto finish_group = [&](Literal & group_required_string, Literals & group_alternatives) + { + for (auto & alter : group_alternatives) + { + if (alter.prefix) + { + alter.literal = last_substring->first + alter.literal; + } + } + + if (group_required_string.prefix) + last_substring->first += group_required_string.literal; + else + { + finish_non_trivial_char(); + last_substring->first = group_required_string.literal; + } + /// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg + if (!last_substring->first.empty() && !group_required_string.suffix) + { + trivial_substrings.resize(trivial_substrings.size() + 1); + last_substring = &trivial_substrings.back(); + } + + /// assign group alters to current alters. + finish_cur_alternatives(); + cur_alternatives = std::move(group_alternatives); + }; + bool in_curly_braces = false; bool in_square_braces = false; @@ -73,25 +174,19 @@ void OptimizedRegularExpressionImpl::analyze( case '$': case '.': case '[': + case ']': case '?': case '*': case '+': + case '-': case '{': - if (depth == 0 && !in_curly_braces && !in_square_braces) - { - if (last_substring->first.empty()) - last_substring->second = pos - begin; - last_substring->first.push_back(*pos); - } - break; + case '}': + case '/': + goto ordinary; default: /// all other escape sequences are not supported is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } + finish_non_trivial_char(); break; } @@ -100,28 +195,19 @@ void OptimizedRegularExpressionImpl::analyze( } case '|': - if (depth == 0) - has_alternative_on_depth_0 = true; is_trivial = false; - if (!in_square_braces && !last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } ++pos; + if (depth == 0) + { + has_alternative_on_depth_0 = true; + goto finish; + } break; case '(': + is_trivial = false; if (!in_square_braces) { - ++depth; - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } - /// Check for case-insensitive flag. if (pos + 1 < end && pos[1] == '?') { @@ -143,6 +229,28 @@ void OptimizedRegularExpressionImpl::analyze( break; } } + if (pos + 2 < end && pos[1] == '?' && pos[2] == ':') + { + pos += 2; + } + Literal group_required_substr; + bool group_is_trival = true; + Literals group_alters; + pos = analyzeImpl(regexp, pos + 1, group_required_substr, group_is_trival, group_alters); + /// pos should be ')', if not, then it is not a valid regular expression + if (pos == end) + return pos; + + /// For ()? or ()* or (){0,1}, we can just ignore the whole group. + if ((pos + 1 < end && (pos[1] == '?' || pos[1] == '*')) || + (pos + 2 < end && pos[1] == '{' && pos[2] == '0')) + { + finish_non_trivial_char(); + } + else + { + finish_group(group_required_substr, group_alters); + } } ++pos; break; @@ -151,11 +259,7 @@ void OptimizedRegularExpressionImpl::analyze( in_square_braces = true; ++depth; is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } + finish_non_trivial_char(); ++pos; break; @@ -163,38 +267,25 @@ void OptimizedRegularExpressionImpl::analyze( if (!in_square_braces) goto ordinary; - in_square_braces = false; --depth; + if (depth == 0) + in_square_braces = false; is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } + finish_non_trivial_char(); ++pos; break; case ')': if (!in_square_braces) { - --depth; - is_trivial = false; - if (!last_substring->first.empty()) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } + goto finish; } ++pos; break; case '^': case '$': case '.': case '+': is_trivial = false; - if (!last_substring->first.empty() && !in_square_braces) - { - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); - } + finish_non_trivial_char(); ++pos; break; @@ -206,12 +297,11 @@ void OptimizedRegularExpressionImpl::analyze( [[fallthrough]]; case '*': is_trivial = false; - if (!last_substring->first.empty() && !in_square_braces) + if (depth == 0 && !last_substring->first.empty() && !in_square_braces) { last_substring->first.resize(last_substring->first.size() - 1); - trivial_substrings.resize(trivial_substrings.size() + 1); - last_substring = &trivial_substrings.back(); } + finish_non_trivial_char(); ++pos; break; @@ -236,13 +326,15 @@ void OptimizedRegularExpressionImpl::analyze( break; } } +finish: - if (last_substring && last_substring->first.empty()) - trivial_substrings.pop_back(); + finish_non_trivial_char(false); if (!is_trivial) { - if (!has_alternative_on_depth_0 && !has_case_insensitive_flag) + /// we calculate required substring even though has_alternative_on_depth_0. + /// we will clear the required substring after putting it to alternatives. + if (!has_case_insensitive_flag) { /// We choose the non-alternative substring of the maximum length for first search. @@ -262,19 +354,45 @@ void OptimizedRegularExpressionImpl::analyze( } } - if (max_length >= MIN_LENGTH_FOR_STRSTR) + if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0)) { - required_substring = candidate_it->first; - required_substring_is_prefix = candidate_it->second == 0; + required_substring.literal = candidate_it->first; + required_substring.prefix = candidate_it->second == 0; + required_substring.suffix = candidate_it + 1 == trivial_substrings.end(); } } } else if (!trivial_substrings.empty()) { - required_substring = trivial_substrings.front().first; - required_substring_is_prefix = trivial_substrings.front().second == 0; + required_substring.literal = trivial_substrings.front().first; + required_substring.prefix = trivial_substrings.front().second == 0; + required_substring.suffix = true; } + /// if it is xxx|xxx|xxx, we should call the next xxx|xxx recursively and collect the result. + if (has_alternative_on_depth_0) + { + /// compare the quality of required substring and alternatives and choose the better one. + if (shortest_literal_length(global_alternatives) < required_substring.literal.size()) + global_alternatives = {required_substring}; + Literals next_alternatives; + /// this two vals are useless, xxx|xxx cannot be trivial nor prefix. + bool next_is_trivial = true; + pos = analyzeImpl(regexp, pos, required_substring, next_is_trivial, next_alternatives); + /// For xxx|xxx|xxx, we only conbine the alternatives and return a empty required_substring. + if (next_alternatives.empty() || shortest_literal_length(next_alternatives) < required_substring.literal.size()) + { + global_alternatives.push_back(required_substring); + } + else + { + global_alternatives.insert(global_alternatives.end(), next_alternatives.begin(), next_alternatives.end()); + } + required_substring.clear(); + } + + return pos; + /* std::cerr << "regexp: " << regexp << ", is_trivial: " << is_trivial @@ -282,12 +400,31 @@ void OptimizedRegularExpressionImpl::analyze( << ", required_substring_is_prefix: " << required_substring_is_prefix << std::endl;*/ } +} +template +void OptimizedRegularExpressionImpl::analyze( + std::string_view regexp_, + std::string & required_substring, + bool & is_trivial, + bool & required_substring_is_prefix, + std::vector & alternatives) +{ + Literals alternative_literals; + Literal required_literal; + analyzeImpl(regexp_, regexp_.data(), required_literal, is_trivial, alternative_literals); + required_substring = std::move(required_literal.literal); + required_substring_is_prefix = required_literal.prefix; + for (auto & lit : alternative_literals) + alternatives.push_back(std::move(lit.literal)); +} template OptimizedRegularExpressionImpl::OptimizedRegularExpressionImpl(const std::string & regexp_, int options) { - analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix); + std::vector alternativesDummy; /// this vector extracts patterns a,b,c from pattern (a|b|c). for now it's not used. + analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix, alternativesDummy); + /// Just three following options are supported if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL))) diff --git a/src/Common/OptimizedRegularExpression.h b/src/Common/OptimizedRegularExpression.h index d8ed1e205c8..f6b59f0a465 100644 --- a/src/Common/OptimizedRegularExpression.h +++ b/src/Common/OptimizedRegularExpression.h @@ -95,6 +95,15 @@ public: out_required_substring_is_prefix = required_substring_is_prefix; } + /// analyze function will extract the longest string literal or multiple alternative string literals from regexp for pre-checking if + /// a string contains the string literal(s). If not, we can tell this string can never match the regexp. + static void analyze( + std::string_view regexp_, + std::string & required_substring, + bool & is_trivial, + bool & required_substring_is_prefix, + std::vector & alternatives); + private: bool is_trivial; bool required_substring_is_prefix; @@ -104,8 +113,6 @@ private: std::optional case_insensitive_substring_searcher; std::unique_ptr re2; unsigned number_of_subpatterns; - - static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix); }; using OptimizedRegularExpression = OptimizedRegularExpressionImpl; diff --git a/src/Common/tests/gtest_optimize_re.cpp b/src/Common/tests/gtest_optimize_re.cpp new file mode 100644 index 00000000000..556700f1fcc --- /dev/null +++ b/src/Common/tests/gtest_optimize_re.cpp @@ -0,0 +1,46 @@ +#include + +#include + +TEST(OptimizeRE, analyze) +{ + auto test_f = [](const std::string & regexp, const std::string & answer, std::vector expect_alternatives = {}, bool trival_expected = false) + { + std::string required; + bool is_trivial; + bool is_prefix; + std::vector alternatives; + OptimizedRegularExpression::analyze(regexp, required, is_trivial, is_prefix, alternatives); + std::cerr << regexp << std::endl; + EXPECT_EQ(required, answer); + EXPECT_EQ(alternatives, expect_alternatives); + EXPECT_EQ(is_trivial, trival_expected); + }; + test_f("abc", "abc", {}, true); + test_f("c([^k]*)de", ""); + test_f("abc(de)fg", "abcdefg"); + test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}); + test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}); + test_f("abc|fgk|xyz", "", {"abc","fgk", "xyz"}); + test_f("(abc)", "abc"); + test_f("(abc|fgk)", "", {"abc","fgk"}); + test_f("(abc|fgk)(e|f|zkh|)", "", {"abc","fgk"}); + test_f("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz","abcfgxyzz"}); + test_f("abc[k]xyzz", "xyzz"); + test_f("(abc[k]xyzz)", "xyzz"); + test_f("abc((de)fg(hi))jk", "abcdefghijk"); + test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk"); + test_f("abc((de)fghi+zzz)jk", "abcdefghi"); + test_f("abc((de)fg(hi))?jk", "abc"); + test_f("abc((de)fghi?zzz)jk", "abcdefgh"); + test_f("abc(*cd)jk", "cdjk"); + test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"}); + test_f("abc(abc|fg)?xyzz", "xyzz"); + test_f("abc(abc|fg){0,1}xyzz", "xyzz"); + test_f("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"}); + test_f("abc(abc|fg)xyzz|bc(dd?x|kk?y|(f))k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bck", "bcfk", "bc"}); + test_f("((?:abc|efg|xyz)/[a-zA-Z0-9]{1-50})(/?[^ ]*|)", "", {"abc/", "efg/", "xyz/"}); + test_f(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"}); + test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"}); + test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"}); +} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ca89106dc08..9fa2ba0d32f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -934,7 +934,7 @@ class IColumn; M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \ \ M(Bool, regexp_dict_allow_other_sources, false, "Allow regexp_tree dictionary to use sources other than yaml source.", 0) \ - M(Bool, regexp_dict_allow_hyperscan, false, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ + M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \ // End of FORMAT_FACTORY_SETTINGS // Please add settings non-related to formats into the COMMON_SETTINGS above. diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index caba2a52a51..c072ba78d46 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -9,9 +9,10 @@ #include #include -#include "Common/Exception.h" #include +#include #include +#include #include #include #include @@ -34,6 +35,7 @@ #if USE_VECTORSCAN # include +# include #endif namespace DB @@ -46,6 +48,7 @@ namespace ErrorCodes extern const int HYPERSCAN_CANNOT_SCAN_TEXT; extern const int UNSUPPORTED_METHOD; extern const int INCORRECT_DICTIONARY_DEFINITION; + extern const int LOGICAL_ERROR; } const std::string kRegExp = "regexp"; @@ -172,10 +175,6 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) auto keys_column = block.getByName(kKeys).column; auto values_column = block.getByName(kValues).column; -#ifdef USE_VECTORSCAN - SlowWithHyperscanChecker checker; -#endif - size_t size = block.rows(); for (size_t i = 0; i < size; i++) { @@ -219,12 +218,36 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) } } regex_nodes.emplace(id, node); + #if USE_VECTORSCAN - if (use_vectorscan && !checker.isSlow(regex)) + String required_substring; + bool is_trivial, required_substring_is_prefix; + std::vector alternatives; + + if (use_vectorscan) + OptimizedRegularExpression::analyze(regex, required_substring, is_trivial, required_substring_is_prefix, alternatives); + + for (auto & alter : alternatives) { - simple_regexps.push_back(regex); + if (alter.size() < 3) + { + alternatives.clear(); + break; + } + } + if (!required_substring.empty()) + { + simple_regexps.push_back(required_substring); regexp_ids.push_back(id); } + else if (!alternatives.empty()) + { + for (auto & alternative : alternatives) + { + simple_regexps.push_back(alternative); + regexp_ids.push_back(id); + } + } else #endif complex_regexp_nodes.push_back(node); @@ -284,20 +307,50 @@ void RegExpTreeDictionary::loadData() use_vectorscan = false; if (!use_vectorscan) return; - #if USE_VECTORSCAN - try + +#if USE_VECTORSCAN + std::vector patterns; + std::vector flags; + std::vector lengths; + + for (const std::string & simple_regexp : simple_regexps) { - std::vector regexps_views(simple_regexps.begin(), simple_regexps.end()); - hyperscan_regex = MultiRegexps::getOrSet(regexps_views, std::nullopt); - hyperscan_regex->get(); + patterns.push_back(simple_regexp.data()); + lengths.push_back(simple_regexp.size()); + flags.push_back(HS_FLAG_SINGLEMATCH); } - catch (Exception & e) + + hs_database_t * db = nullptr; + hs_compile_error_t * compile_error; + + std::unique_ptr ids; + ids.reset(new unsigned int[patterns.size()]); + for (size_t i = 0; i < patterns.size(); i++) + ids[i] = static_cast(i+1); + + hs_error_t err = hs_compile_lit_multi(patterns.data(), flags.data(), ids.get(), lengths.data(), static_cast(patterns.size()), HS_MODE_BLOCK, nullptr, &db, &compile_error); + origin_db = (db); + if (err != HS_SUCCESS) { - /// Some compile errors will be thrown as LOGICAL ERROR and cause crash, e.g. empty expression or expressions are too large. - /// We catch the error here and rethrow again. - throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Error occurs when compiling regular expressions, reason: {}", e.message()); + /// CompilerError is a unique_ptr, so correct memory free after the exception is thrown. + MultiRegexps::CompilerErrorPtr error(compile_error); + + if (error->expression < 0) + throw Exception::createRuntime(ErrorCodes::LOGICAL_ERROR, String(error->message)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Pattern '{}' failed with error '{}'", patterns[error->expression], String(error->message)); } - #endif + + /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch + /// function which is faster than allocating scratch space each time in each thread. + hs_scratch_t * scratch = nullptr; + err = hs_alloc_scratch(db, &scratch); + origin_scratch.reset(scratch); + /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch. + if (err != HS_SUCCESS) + throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for vectorscan"); +#endif + } else { @@ -396,47 +449,70 @@ bool RegExpTreeDictionary::setAttributes( return attributes_to_set.size() == attributes.size(); } -namespace +/// a temp struct to store all the matched result. +struct MatchContext { - struct MatchContext + std::set matched_idx_set; + std::vector> matched_idx_sorted_list; + + const std::vector & regexp_ids ; + const std::unordered_map & topology_order; + const char * data; + size_t length; + const std::map & regex_nodes; + + size_t pre_match_counter = 0; + size_t match_counter = 0; + + MatchContext( + const std::vector & regexp_ids_, + const std::unordered_map & topology_order_, + const char * data_, size_t length_, + const std::map & regex_nodes_ + ) + : regexp_ids(regexp_ids_), + topology_order(topology_order_), + data(data_), + length(length_), + regex_nodes(regex_nodes_) + {} + + [[maybe_unused]] + void insertIdx(unsigned int idx) { - std::set matched_idx_set; - std::vector> matched_idx_sorted_list; - - const std::vector & regexp_ids ; - const std::unordered_map & topology_order; - - MatchContext(const std::vector & regexp_ids_, const std::unordered_map & topology_order_) - : regexp_ids(regexp_ids_), topology_order(topology_order_) {} - - [[maybe_unused]] - void insertIdx(unsigned int idx) + UInt64 node_id = regexp_ids[idx-1]; + pre_match_counter++; + if (!regex_nodes.at(node_id)->match(data, length)) { - UInt64 node_id = regexp_ids[idx-1]; - UInt64 topological_order = topology_order.at(node_id); - matched_idx_set.emplace(node_id); - matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id)); + return; } + match_counter++; + matched_idx_set.emplace(node_id); - void insertNodeID(UInt64 id) - { - UInt64 topological_order = topology_order.at(id); - matched_idx_set.emplace(id); - matched_idx_sorted_list.push_back(std::make_pair(topological_order, id)); - } + UInt64 topological_order = topology_order.at(node_id); + matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id)); + } - /// Sort by topological order, which indicates the matching priorities. - void sort() - { - std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end()); - } + [[maybe_unused]] + void insertNodeID(UInt64 id) + { + matched_idx_set.emplace(id); - bool contains(UInt64 idx) const - { - return matched_idx_set.contains(idx); - } - }; -} + UInt64 topological_order = topology_order.at(id); + matched_idx_sorted_list.push_back(std::make_pair(topological_order, id)); + } + + /// Sort by topological order, which indicates the matching priorities. + void sort() + { + std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end()); + } + + bool contains(UInt64 idx) const + { + return matched_idx_set.contains(idx); + } +}; std::unordered_map RegExpTreeDictionary::match( const ColumnString::Chars & keys_data, @@ -449,7 +525,7 @@ std::unordered_map RegExpTreeDictionary::match( hs_scratch_t * scratch = nullptr; if (use_vectorscan) { - hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch); + hs_error_t err = hs_clone_scratch(origin_scratch.get(), &scratch); if (err != HS_SUCCESS) { @@ -476,11 +552,14 @@ std::unordered_map RegExpTreeDictionary::match( auto key_offset = keys_offsets[key_idx]; UInt64 length = key_offset - offset - 1; - MatchContext match_result(regexp_ids, topology_order); + const char * begin = reinterpret_cast(keys_data.data()) + offset; + + MatchContext match_result(regexp_ids, topology_order, begin, length, regex_nodes); #if USE_VECTORSCAN if (use_vectorscan) { + /// pre-select all the possible matches auto on_match = [](unsigned int id, unsigned long long /* from */, // NOLINT unsigned long long /* to */, // NOLINT @@ -490,8 +569,9 @@ std::unordered_map RegExpTreeDictionary::match( static_cast(context)->insertIdx(id); return 0; }; + hs_error_t err = hs_scan( - hyperscan_regex->get()->getDB(), + origin_db, reinterpret_cast(keys_data.data()) + offset, static_cast(length), 0, @@ -501,6 +581,7 @@ std::unordered_map RegExpTreeDictionary::match( if (err != HS_SUCCESS) throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Failed to scan data with vectorscan"); + } #endif diff --git a/src/Dictionaries/RegExpTreeDictionary.h b/src/Dictionaries/RegExpTreeDictionary.h index 32206f25429..17a0c6bbef3 100644 --- a/src/Dictionaries/RegExpTreeDictionary.h +++ b/src/Dictionaries/RegExpTreeDictionary.h @@ -33,6 +33,7 @@ namespace ErrorCodes class RegExpTreeDictionary : public IDictionary { + friend struct MatchContext; public: struct Configuration { @@ -162,6 +163,8 @@ private: std::unordered_map topology_order; #if USE_VECTORSCAN MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex; + MultiRegexps::ScratchPtr origin_scratch; + hs_database_t* origin_db; #endif Poco::Logger * logger;