Merge pull request #47218 from hanfei1991/hanfei/optimize-regexp-tree-1

Refine OptimizeRegularExpression Function and RegexpTreeDict
2024-09-21 09:10:48 +00:00 · 2023-03-27 15:23:01 +02:00 · 2023-03-27 15:23:01 +02:00 · e3afa5090f
commit e3afa5090f
parent bc2b6257f2 02de4ad6df
6 changed files with 400 additions and 126 deletions
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@ -1,3 +1,4 @@
+#include <limits>
 #include <Common/Exception.h>
 #include <Common/PODArray.h>
 #include <Common/OptimizedRegularExpression.h>
@ -14,13 +15,40 @@ namespace DB
    }
 }

+namespace
+{

-template <bool thread_safe>
-void OptimizedRegularExpressionImpl<thread_safe>::analyze(
+struct Literal /// A literal string extracted from a (sub)pattern, plus two flags describing its position in that (sub)pattern.
+{
+    std::string literal; /// the extracted literal text
+    bool prefix; /// true when this literal is a prefix of the analyzed (sub)pattern.
+    bool suffix; /// true when this literal is a suffix of the analyzed (sub)pattern.
+    void clear() /// resets the text to empty and both flags to false
+    {
+        literal.clear();
+        prefix = false;
+        suffix = false;
+    }
+};
+
+using Literals = std::vector<Literal>;
+
+size_t shortest_literal_length(const Literals & literals) /// Returns the length of the shortest literal in `literals`, or 0 when the list is empty.
+{
+    if (literals.empty()) return 0;
+    size_t shortest = std::numeric_limits<size_t>::max(); /// start from the largest value so any real length replaces it
+    for (const auto & lit : literals)
+        if (shortest > lit.literal.size())
+            shortest = lit.literal.size();
+    return shortest;
+}
+
+const char * analyzeImpl(
    std::string_view regexp,
-    std::string & required_substring,
+    const char * pos,
+    Literal & required_substring,
    bool & is_trivial,
-    bool & required_substring_is_prefix)
+    Literals & global_alternatives)
 {
    /** The expression is trivial if all the metacharacters in it are escaped.
      * The non-alternative string is
@ -30,12 +58,11 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
      *  and also avoid substrings of the form `http://` or `www` and some other
      *   (this is the hack for typical use case in web analytics applications).
      */
-    const char * begin = regexp.data();
-    const char * pos = begin;
+    const char * begin = pos;
    const char * end = regexp.data() + regexp.size();
+    bool is_first_call = begin == regexp.data();
    int depth = 0;
    is_trivial = true;
-    required_substring_is_prefix = false;
    required_substring.clear();
    bool has_alternative_on_depth_0 = false;
    bool has_case_insensitive_flag = false;
@ -47,6 +74,80 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
    Substrings trivial_substrings(1);
    Substring * last_substring = &trivial_substrings.back();

+    Literals cur_alternatives;
+
+    auto finish_cur_alternatives = [&]() /// Folds cur_alternatives into global_alternatives (only one of the two sets is kept), then empties cur_alternatives.
+    {
+        if (cur_alternatives.empty())
+            return;
+
+        if (global_alternatives.empty()) /// nothing to compare against: the current set simply becomes the global one
+        {
+            global_alternatives = cur_alternatives;
+            cur_alternatives.clear();
+            return;
+        }
+        /// A longer shortest literal is treated as better quality, so in that case the current set replaces the global one.
+        if (shortest_literal_length(global_alternatives) < shortest_literal_length(cur_alternatives))
+        {
+            global_alternatives.clear();
+            global_alternatives = cur_alternatives;
+        }
+        cur_alternatives.clear();
+    };
+
+    auto finish_non_trivial_char = [&](bool create_new_substr = true) /// Closes the literal being accumulated when a non-literal element is met; does nothing unless depth == 0.
+    {
+        if (depth != 0)
+            return;
+
+        for (auto & alter : cur_alternatives)
+        {
+            if (alter.suffix) /// alternatives flagged as suffix get the literal accumulated so far appended to them
+            {
+                alter.literal += last_substring->first;
+            }
+        }
+
+        finish_cur_alternatives();
+
+        if (!last_substring->first.empty() && create_new_substr) /// open a fresh, empty substring slot for the next literal
+        {
+            trivial_substrings.resize(trivial_substrings.size() + 1);
+            last_substring = &trivial_substrings.back();
+        }
+    };
+
+    /// Merges the required literal and the alternatives found inside a group (xxxxx) into the state of the enclosing pattern.
+    auto finish_group = [&](Literal & group_required_string, Literals & group_alternatives)
+    {
+        for (auto & alter : group_alternatives)
+        {
+            if (alter.prefix) /// a prefix alternative gets the literal accumulated so far prepended to it
+            {
+                alter.literal = last_substring->first + alter.literal;
+            }
+        }
+
+        if (group_required_string.prefix) /// the group's literal continues the literal accumulated so far
+            last_substring->first += group_required_string.literal;
+        else
+        {
+            finish_non_trivial_char();
+            last_substring->first = group_required_string.literal;
+        }
+        /// If the group's literal is a suffix we can keep appending to it, so it is not closed here. E.g. abc(de)fg should capture abcdefg.
+        if (!last_substring->first.empty() && !group_required_string.suffix)
+        {
+            trivial_substrings.resize(trivial_substrings.size() + 1);
+            last_substring = &trivial_substrings.back();
+        }
+
+        /// The group's alternatives become the current alternatives; any previous current ones are folded into the global set first.
+        finish_cur_alternatives();
+        cur_alternatives = std::move(group_alternatives);
+    };
+
    bool in_curly_braces = false;
    bool in_square_braces = false;

@ -73,25 +174,19 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                    case '$':
                    case '.':
                    case '[':
+                    case ']':
                    case '?':
                    case '*':
                    case '+':
+                    case '-':
                    case '{':
-                        if (depth == 0 && !in_curly_braces && !in_square_braces)
-                        {
-                            if (last_substring->first.empty())
-                                last_substring->second = pos - begin;
-                            last_substring->first.push_back(*pos);
-                        }
-                        break;
+                    case '}':
+                    case '/':
+                        goto ordinary;
                    default:
                        /// all other escape sequences are not supported
                        is_trivial = false;
-                        if (!last_substring->first.empty())
-                        {
-                            trivial_substrings.resize(trivial_substrings.size() + 1);
-                            last_substring = &trivial_substrings.back();
-                        }
+                        finish_non_trivial_char();
                        break;
                }

@ -100,28 +195,19 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
            }

            case '|':
-                if (depth == 0)
-                    has_alternative_on_depth_0 = true;
                is_trivial = false;
-                if (!in_square_braces && !last_substring->first.empty())
-                {
-                    trivial_substrings.resize(trivial_substrings.size() + 1);
-                    last_substring = &trivial_substrings.back();
-                }
                ++pos;
+                if (depth == 0)
+                {
+                    has_alternative_on_depth_0 = true;
+                    goto finish;
+                }
                break;

            case '(':
+                is_trivial = false;
                if (!in_square_braces)
                {
-                    ++depth;
-                    is_trivial = false;
-                    if (!last_substring->first.empty())
-                    {
-                        trivial_substrings.resize(trivial_substrings.size() + 1);
-                        last_substring = &trivial_substrings.back();
-                    }
-
                    /// Check for case-insensitive flag.
                    if (pos + 1 < end && pos[1] == '?')
                    {
@ -143,6 +229,28 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                                break;
                        }
                    }
+                    if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
+                    {
+                        pos += 2;
+                    }
+                    Literal group_required_substr;
+                    bool group_is_trival = true;
+                    Literals group_alters;
+                    pos = analyzeImpl(regexp, pos + 1, group_required_substr, group_is_trival, group_alters);
+                    /// pos should be ')', if not, then it is not a valid regular expression
+                    if (pos == end)
+                        return pos;
+
+                    /// For ()? or ()* or (){0,1}, we can just ignore the whole group.
+                    if ((pos + 1 < end && (pos[1] == '?' || pos[1] == '*')) ||
+                        (pos + 2 < end && pos[1] == '{' && pos[2] == '0'))
+                    {
+                        finish_non_trivial_char();
+                    }
+                    else
+                    {
+                        finish_group(group_required_substr, group_alters);
+                    }
                }
                ++pos;
                break;
@ -151,11 +259,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                in_square_braces = true;
                ++depth;
                is_trivial = false;
-                if (!last_substring->first.empty())
-                {
-                    trivial_substrings.resize(trivial_substrings.size() + 1);
-                    last_substring = &trivial_substrings.back();
-                }
+                finish_non_trivial_char();
                ++pos;
                break;

@ -163,38 +267,25 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                if (!in_square_braces)
                    goto ordinary;

-                in_square_braces = false;
                --depth;
+                if (depth == 0)
+                    in_square_braces = false;
                is_trivial = false;
-                if (!last_substring->first.empty())
-                {
-                    trivial_substrings.resize(trivial_substrings.size() + 1);
-                    last_substring = &trivial_substrings.back();
-                }
+                finish_non_trivial_char();
                ++pos;
                break;

            case ')':
                if (!in_square_braces)
                {
-                    --depth;
-                    is_trivial = false;
-                    if (!last_substring->first.empty())
-                    {
-                        trivial_substrings.resize(trivial_substrings.size() + 1);
-                        last_substring = &trivial_substrings.back();
-                    }
+                    goto finish;
                }
                ++pos;
                break;

            case '^': case '$': case '.': case '+':
                is_trivial = false;
-                if (!last_substring->first.empty() && !in_square_braces)
-                {
-                    trivial_substrings.resize(trivial_substrings.size() + 1);
-                    last_substring = &trivial_substrings.back();
-                }
+                finish_non_trivial_char();
                ++pos;
                break;

@ -206,12 +297,11 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                [[fallthrough]];
            case '*':
                is_trivial = false;
-                if (!last_substring->first.empty() && !in_square_braces)
+                if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
                {
                    last_substring->first.resize(last_substring->first.size() - 1);
-                    trivial_substrings.resize(trivial_substrings.size() + 1);
-                    last_substring = &trivial_substrings.back();
                }
+                finish_non_trivial_char();
                ++pos;
                break;

@ -236,13 +326,15 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                break;
        }
    }
+finish:

-    if (last_substring && last_substring->first.empty())
-        trivial_substrings.pop_back();
+    finish_non_trivial_char(false);

    if (!is_trivial)
    {
-        if (!has_alternative_on_depth_0 && !has_case_insensitive_flag)
+        /// we calculate required substring even though has_alternative_on_depth_0.
+        /// we will clear the required substring after putting it to alternatives.
+        if (!has_case_insensitive_flag)
        {
            /// We choose the non-alternative substring of the maximum length for first search.

@ -262,19 +354,45 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
                }
            }

-            if (max_length >= MIN_LENGTH_FOR_STRSTR)
+            if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
            {
-                required_substring = candidate_it->first;
-                required_substring_is_prefix = candidate_it->second == 0;
+                required_substring.literal = candidate_it->first;
+                required_substring.prefix = candidate_it->second == 0;
+                required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
            }
        }
    }
    else if (!trivial_substrings.empty())
    {
-        required_substring = trivial_substrings.front().first;
-        required_substring_is_prefix = trivial_substrings.front().second == 0;
+        required_substring.literal = trivial_substrings.front().first;
+        required_substring.prefix = trivial_substrings.front().second == 0;
+        required_substring.suffix = true;
    }

+    /// if it is xxx|xxx|xxx, we should call the next xxx|xxx recursively and collect the result.
+    if (has_alternative_on_depth_0)
+    {
+        /// compare the quality of required substring and alternatives and choose the better one.
+        if (shortest_literal_length(global_alternatives) < required_substring.literal.size())
+            global_alternatives = {required_substring};
+        Literals next_alternatives;
+        /// these two values are not used afterwards: xxx|xxx can be neither trivial nor a prefix.
+        bool next_is_trivial = true;
+        pos = analyzeImpl(regexp, pos, required_substring, next_is_trivial, next_alternatives);
+        /// For xxx|xxx|xxx, we only combine the alternatives and return an empty required_substring.
+        if (next_alternatives.empty() || shortest_literal_length(next_alternatives) < required_substring.literal.size())
+        {
+            global_alternatives.push_back(required_substring);
+        }
+        else
+        {
+            global_alternatives.insert(global_alternatives.end(), next_alternatives.begin(), next_alternatives.end());
+        }
+        required_substring.clear();
+    }
+
+    return pos;
+
 /*    std::cerr
        << "regexp: " << regexp
        << ", is_trivial: " << is_trivial
@ -282,12 +400,31 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
        << ", required_substring_is_prefix: " << required_substring_is_prefix
        << std::endl;*/
 }
+}

+template <bool thread_safe>
+void OptimizedRegularExpressionImpl<thread_safe>::analyze( /// Runs analyzeImpl from the start of the regexp and converts its Literal results into plain strings.
+        std::string_view regexp_,
+        std::string & required_substring, /// out: text of the required literal reported by analyzeImpl
+        bool & is_trivial, /// out: set by analyzeImpl
+        bool & required_substring_is_prefix, /// out: the prefix flag of the required literal
+        std::vector<std::string> & alternatives) /// out: alternative literals are appended; the vector is not cleared first
+{
+    Literals alternative_literals;
+    Literal required_literal;
+    analyzeImpl(regexp_, regexp_.data(), required_literal, is_trivial, alternative_literals); /// the returned position is not needed here
+    required_substring = std::move(required_literal.literal);
+    required_substring_is_prefix = required_literal.prefix;
+    for (auto & lit : alternative_literals)
+        alternatives.push_back(std::move(lit.literal));
+}

 template <bool thread_safe>
 OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
 {
-    analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
+    std::vector<std::string> alternativesDummy; /// this vector extracts patterns a,b,c from pattern (a|b|c). for now it's not used.
+    analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix, alternativesDummy);
+

    /// Just three following options are supported
    if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
--- a/src/Common/OptimizedRegularExpression.h
+++ b/src/Common/OptimizedRegularExpression.h
@ -95,6 +95,15 @@ public:
        out_required_substring_is_prefix = required_substring_is_prefix;
    }

+    /// analyze function will extract the longest string literal or multiple alternative string literals from regexp for pre-checking if
+    /// a string contains the string literal(s). If not, we can tell this string can never match the regexp.
+    static void analyze(
+        std::string_view regexp_,
+        std::string & required_substring,
+        bool & is_trivial,
+        bool & required_substring_is_prefix,
+        std::vector<std::string> & alternatives);
+
 private:
    bool is_trivial;
    bool required_substring_is_prefix;
@ -104,8 +113,6 @@ private:
    std::optional<DB::ASCIICaseInsensitiveStringSearcher> case_insensitive_substring_searcher;
    std::unique_ptr<RegexType> re2;
    unsigned number_of_subpatterns;
-
-    static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
 };

 using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
--- a/src/Common/tests/gtest_optimize_re.cpp
+++ b/src/Common/tests/gtest_optimize_re.cpp
@ -0,0 +1,46 @@
+#include <gtest/gtest.h>
+
+#include <Common/OptimizedRegularExpression.h>
+
+TEST(OptimizeRE, analyze) /// Checks the required substring, the alternatives and the triviality flag that analyze() reports for a set of patterns.
+{
+    auto test_f = [](const std::string & regexp, const std::string & answer, std::vector<std::string> expect_alternatives = {}, bool trival_expected = false)
+    {
+        std::string required;
+        bool is_trivial;
+        bool is_prefix; /// filled in by analyze() but not checked below
+        std::vector<std::string> alternatives;
+        OptimizedRegularExpression::analyze(regexp, required, is_trivial, is_prefix, alternatives);
+        std::cerr << regexp << std::endl;
+        EXPECT_EQ(required, answer);
+        EXPECT_EQ(alternatives, expect_alternatives);
+        EXPECT_EQ(is_trivial, trival_expected);
+    };
+    test_f("abc", "abc", {}, true); /// the only case here that expects is_trivial == true
+    test_f("c([^k]*)de", "");
+    test_f("abc(de)fg", "abcdefg"); /// a plain group is merged into the surrounding literal
+    test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"});
+    test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"});
+    test_f("abc|fgk|xyz", "", {"abc","fgk", "xyz"}); /// top-level alternation: no required substring, only alternatives
+    test_f("(abc)", "abc");
+    test_f("(abc|fgk)", "", {"abc","fgk"});
+    test_f("(abc|fgk)(e|f|zkh|)", "", {"abc","fgk"});
+    test_f("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz","abcfgxyzz"});
+    test_f("abc[k]xyzz", "xyzz");
+    test_f("(abc[k]xyzz)", "xyzz");
+    test_f("abc((de)fg(hi))jk", "abcdefghijk"); /// nested groups
+    test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk"); /// non-capturing groups give the same result
+    test_f("abc((de)fghi+zzz)jk", "abcdefghi");
+    test_f("abc((de)fg(hi))?jk", "abc"); /// an optional group contributes nothing
+    test_f("abc((de)fghi?zzz)jk", "abcdefgh");
+    test_f("abc(*cd)jk", "cdjk");
+    test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"});
+    test_f("abc(abc|fg)?xyzz", "xyzz");
+    test_f("abc(abc|fg){0,1}xyzz", "xyzz");
+    test_f("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"});
+    test_f("abc(abc|fg)xyzz|bc(dd?x|kk?y|(f))k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bck", "bcfk", "bc"});
+    test_f("((?:abc|efg|xyz)/[a-zA-Z0-9]{1-50})(/?[^ ]*|)", "", {"abc/", "efg/", "xyz/"});
+    test_f(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"});
+    test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"}); /// NOTE(review): "(:?" may have been meant as the non-capturing "(?:" - confirm
+    test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"});
+}
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -934,7 +934,7 @@ class IColumn;
    M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \
    \
    M(Bool, regexp_dict_allow_other_sources, false, "Allow regexp_tree dictionary to use sources other than yaml source.", 0) \
-    M(Bool, regexp_dict_allow_hyperscan, false, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
+    M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \

 // End of FORMAT_FACTORY_SETTINGS
 // Please add settings non-related to formats into the COMMON_SETTINGS above.
--- a/src/Dictionaries/RegExpTreeDictionary.cpp
+++ b/src/Dictionaries/RegExpTreeDictionary.cpp
@ -9,9 +9,10 @@
 #include <Poco/Logger.h>
 #include <Poco/RegularExpression.h>

-#include "Common/Exception.h"
 #include <Common/ArenaUtils.h>
+#include <Common/Exception.h>
 #include <Common/logger_useful.h>
+#include <Common/OptimizedRegularExpression.h>
 #include <Core/ColumnsWithTypeAndName.h>
 #include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypesNumber.h>
@ -34,6 +35,7 @@

 #if USE_VECTORSCAN
 #    include <hs.h>
+#    include <hs_compile.h>
 #endif

 namespace DB
@ -46,6 +48,7 @@ namespace ErrorCodes
    extern const int HYPERSCAN_CANNOT_SCAN_TEXT;
    extern const int UNSUPPORTED_METHOD;
    extern const int INCORRECT_DICTIONARY_DEFINITION;
+    extern const int LOGICAL_ERROR;
 }

 const std::string kRegExp = "regexp";
@ -172,10 +175,6 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
    auto keys_column = block.getByName(kKeys).column;
    auto values_column = block.getByName(kValues).column;

-#ifdef USE_VECTORSCAN
-    SlowWithHyperscanChecker checker;
-#endif
-
    size_t size = block.rows();
    for (size_t i = 0; i < size; i++)
    {
@ -219,12 +218,36 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
            }
        }
        regex_nodes.emplace(id, node);
+
 #if USE_VECTORSCAN
-        if (use_vectorscan && !checker.isSlow(regex))
+        String required_substring;
+        bool is_trivial, required_substring_is_prefix;
+        std::vector<std::string> alternatives;
+
+        if (use_vectorscan)
+            OptimizedRegularExpression::analyze(regex, required_substring, is_trivial, required_substring_is_prefix, alternatives);
+
+        for (auto & alter : alternatives)
        {
-            simple_regexps.push_back(regex);
+            if (alter.size() < 3)
+            {
+                alternatives.clear();
+                break;
+            }
+        }
+        if (!required_substring.empty())
+        {
+            simple_regexps.push_back(required_substring);
            regexp_ids.push_back(id);
        }
+        else if (!alternatives.empty())
+        {
+            for (auto & alternative : alternatives)
+            {
+                simple_regexps.push_back(alternative);
+                regexp_ids.push_back(id);
+            }
+        }
        else
 #endif
            complex_regexp_nodes.push_back(node);
@ -284,20 +307,50 @@ void RegExpTreeDictionary::loadData()
            use_vectorscan = false;
        if (!use_vectorscan)
            return;
-        #if USE_VECTORSCAN
-        try
+
+#if USE_VECTORSCAN
+        std::vector<const char *> patterns;
+        std::vector<unsigned int> flags;
+        std::vector<size_t> lengths;
+
+        for (const std::string & simple_regexp : simple_regexps)
        {
-            std::vector<std::string_view> regexps_views(simple_regexps.begin(), simple_regexps.end());
-            hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
-            hyperscan_regex->get();
+            patterns.push_back(simple_regexp.data());
+            lengths.push_back(simple_regexp.size());
+            flags.push_back(HS_FLAG_SINGLEMATCH);
        }
-        catch (Exception & e)
+
+        hs_database_t * db = nullptr;
+        hs_compile_error_t * compile_error;
+
+        std::unique_ptr<unsigned int[]> ids;
+        ids.reset(new unsigned int[patterns.size()]);
+        for (size_t i = 0; i < patterns.size(); i++)
+            ids[i] = static_cast<unsigned>(i+1);
+
+        hs_error_t err = hs_compile_lit_multi(patterns.data(), flags.data(), ids.get(), lengths.data(), static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db, &compile_error);
+        origin_db = (db);
+        if (err != HS_SUCCESS)
        {
-            /// Some compile errors will be thrown as LOGICAL ERROR and cause crash, e.g. empty expression or expressions are too large.
-            /// We catch the error here and rethrow again.
-            throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Error occurs when compiling regular expressions, reason: {}", e.message());
+            /// CompilerError is a unique_ptr, so its memory is freed correctly after the exception is thrown.
+            MultiRegexps::CompilerErrorPtr error(compile_error);
+
+            if (error->expression < 0)
+                throw Exception::createRuntime(ErrorCodes::LOGICAL_ERROR, String(error->message));
+            else
+                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Pattern '{}' failed with error '{}'", patterns[error->expression], String(error->message));
        }
-        #endif
+
+        /// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
+        /// function which is faster than allocating scratch space each time in each thread.
+        hs_scratch_t * scratch = nullptr;
+        err = hs_alloc_scratch(db, &scratch);
+        origin_scratch.reset(scratch);
+        /// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
+        if (err != HS_SUCCESS)
+            throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for vectorscan");
+#endif
+
    }
    else
    {
@ -396,47 +449,70 @@ bool RegExpTreeDictionary::setAttributes(
    return attributes_to_set.size() == attributes.size();
 }

-namespace
+/// Temporary state that collects the regexp nodes matched for one input string.
+struct MatchContext
 {
-    struct MatchContext
+    std::set<UInt64> matched_idx_set; /// ids of the matched nodes, queried by contains()
+    std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list; /// (topological order, node id) pairs, ordered by sort()
+
+    const std::vector<UInt64> & regexp_ids ; /// element (idx - 1) is the node id for the idx passed to insertIdx
+    const std::unordered_map<UInt64, UInt64> & topology_order; /// node id -> topological order
+    const char * data; /// start of the string being matched
+    size_t length; /// length of that string
+    const std::map<UInt64, RegExpTreeDictionary::RegexTreeNodePtr> & regex_nodes; /// node id -> node, used to confirm candidates
+
+    size_t pre_match_counter = 0; /// number of candidates passed to insertIdx
+    size_t match_counter = 0; /// number of those candidates confirmed by the node's own match()
+
+    MatchContext(
+        const std::vector<UInt64> & regexp_ids_,
+        const std::unordered_map<UInt64, UInt64> & topology_order_,
+        const char * data_, size_t length_,
+        const std::map<UInt64, RegExpTreeDictionary::RegexTreeNodePtr> & regex_nodes_
+    )
+    : regexp_ids(regexp_ids_),
+        topology_order(topology_order_),
+        data(data_),
+        length(length_),
+        regex_nodes(regex_nodes_)
+    {}
+
+    [[maybe_unused]]
+    void insertIdx(unsigned int idx) /// Records the node behind a 1-based idx, but only if the node's own match() accepts the string.
    {
-        std::set<UInt64> matched_idx_set;
-        std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list;
-
-        const std::vector<UInt64> & regexp_ids ;
-        const std::unordered_map<UInt64, UInt64> & topology_order;
-
-        MatchContext(const std::vector<UInt64> & regexp_ids_, const std::unordered_map<UInt64, UInt64> & topology_order_)
-            : regexp_ids(regexp_ids_), topology_order(topology_order_) {}
-
-        [[maybe_unused]]
-        void insertIdx(unsigned int idx)
+        UInt64 node_id = regexp_ids[idx-1]; /// idx is 1-based
+        pre_match_counter++;
+        if (!regex_nodes.at(node_id)->match(data, length)) /// the reported hit is only a candidate; drop it if the node itself does not match
        {
-            UInt64 node_id = regexp_ids[idx-1];
-            UInt64 topological_order = topology_order.at(node_id);
-            matched_idx_set.emplace(node_id);
-            matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
+            return;
        }
+        match_counter++;
+        matched_idx_set.emplace(node_id);

-        void insertNodeID(UInt64 id)
-        {
-            UInt64 topological_order = topology_order.at(id);
-            matched_idx_set.emplace(id);
-            matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
-        }
+        UInt64 topological_order = topology_order.at(node_id);
+        matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
+    }

-        /// Sort by topological order, which indicates the matching priorities.
-        void sort()
-        {
-            std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
-        }
+    [[maybe_unused]]
+    void insertNodeID(UInt64 id) /// Records a node id directly, without re-running the node's match().
+    {
+        matched_idx_set.emplace(id);

-        bool contains(UInt64 idx) const
-        {
-            return matched_idx_set.contains(idx);
-        }
-    };
-}
+        UInt64 topological_order = topology_order.at(id);
+        matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
+    }
+
+    /// Sort by topological order, which indicates the matching priorities.
+    void sort()
+    {
+        std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
+    }
+
+    bool contains(UInt64 idx) const /// true if the given node id has been recorded as matched
+    {
+        return matched_idx_set.contains(idx);
+    }
+};

 std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
    const ColumnString::Chars & keys_data,
@ -449,7 +525,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
    hs_scratch_t * scratch = nullptr;
    if (use_vectorscan)
    {
-        hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
+        hs_error_t err = hs_clone_scratch(origin_scratch.get(), &scratch);

        if (err != HS_SUCCESS)
        {
@ -476,11 +552,14 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
        auto key_offset = keys_offsets[key_idx];
        UInt64 length = key_offset - offset - 1;

-        MatchContext match_result(regexp_ids, topology_order);
+        const char * begin = reinterpret_cast<const char *>(keys_data.data()) + offset;
+
+        MatchContext match_result(regexp_ids, topology_order, begin, length, regex_nodes);

 #if USE_VECTORSCAN
        if (use_vectorscan)
        {
+            /// pre-select all the possible matches
            auto on_match = [](unsigned int id,
                            unsigned long long /* from */, // NOLINT
                            unsigned long long /* to */, // NOLINT
@ -490,8 +569,9 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
                static_cast<MatchContext *>(context)->insertIdx(id);
                return 0;
            };
+
            hs_error_t err = hs_scan(
-                hyperscan_regex->get()->getDB(),
+                origin_db,
                reinterpret_cast<const char *>(keys_data.data()) + offset,
                static_cast<unsigned>(length),
                0,
@ -501,6 +581,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(

            if (err != HS_SUCCESS)
                throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Failed to scan data with vectorscan");
+
        }
 #endif

--- a/src/Dictionaries/RegExpTreeDictionary.h
+++ b/src/Dictionaries/RegExpTreeDictionary.h
@ -33,6 +33,7 @@ namespace ErrorCodes

 class RegExpTreeDictionary : public IDictionary
 {
+    friend struct MatchContext;
 public:
    struct Configuration
    {
@ -162,6 +163,8 @@ private:
    std::unordered_map<UInt64, UInt64> topology_order;
    #if USE_VECTORSCAN
    MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
+    MultiRegexps::ScratchPtr origin_scratch;
+    hs_database_t* origin_db;
    #endif

    Poco::Logger * logger;