From 49934a3dc865cc8131d94de4592d3bd4f21150c0 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Wed, 25 May 2022 21:22:45 +0200
Subject: [PATCH] Cache compiled regexps when evaluating non-const needles

Needles in a (non-const) needle column may repeat, and this commit allows
skipping compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.

------------------------------------------------------------------------

(sorry for the wall of text, dumping it here not for reading but just
for reference)

Write-up about considered design alternatives:

1. Keep the current global cache of const needles. For non-const
   needles, probe the cache but don't store values in it.
   Pros: need to maintain just a single cache, no problem with cache
         pollution assuming there are few distinct constant needles
   Cons: only useful if a non-const needle already occurred as a
         const needle
   --> overall too simplistic

2. Keep the current global cache for const needles. For non-const
   needles, create a local (e.g. per-query) cache
   Pros: unlike (1.), non-const needles can be skipped even if they
         did not occur yet, no pollution of the const pattern cache when
         there are very many non-const needles (e.g. large / highly
         distinct needle columns).
   Cons: caches may explode "horizontally", i.e. we'll end up with the
         const cache + caches for Q1, Q2, ... QN, this makes it harder
         to control the overall space consumption, also patterns
         residing in different caches cannot be reused between queries,
         another difficulty is that the concept of "query" does not
         really exist at matching level - there are only column chunks
         and we'd potentially end up with 1 cache / chunk

3. Queries with const and non-const needles insert into the same global
   cache.
   Pros: the advantages of (2.) + allows reusing compiled patterns
         across parallel queries
   Cons: needs an eviction strategy to control cache size and pollution
         (and btw. (2.) also needs eviction strategies for the
         individual caches)

4. Queries with const needle use global cache, queries with non-const
   needle use a different global cache
   --> Overall similar to (3) but ignores the (likely) edge case that
       const and non-const needles overlap.

In sum, (3.) seems the simplest and most beneficial approach.

Eviction strategies:

0. Don't ever evict --> cache may grow infinitely and eventually make
   the system unusable (may even pose a DoS risk)

1. Flush the cache after a certain threshold is exceeded --> very
   simple but may lead to periodic performance drops

2. Use LRU --> more graceful performance degradation at threshold but
   comes with a (constant) performance overhead to maintain the LRU
   queue

In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
---
 src/Functions/FunctionsStringArray.h |  4 ++--
 src/Functions/MatchImpl.h            | 30 ++++++++++---------------
 src/Functions/Regexps.h              | 33 +++++++++++-----------------
 src/Functions/countMatches.h         |  4 ++--
 4 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h
index 2680816670f..6545c3e3549 100644
--- a/src/Functions/FunctionsStringArray.h
+++ b/src/Functions/FunctionsStringArray.h
@@ -448,7 +448,7 @@ public:
 class SplitByRegexpImpl
 {
 private:
-    Regexps::Pool::Pointer re;
+    Regexps::RegexpPtr re;
     OptimizedRegularExpression::MatchVec matches;
 
     Pos pos;
@@ -532,7 +532,7 @@ public:
 class ExtractAllImpl
 {
 private:
-    Regexps::Pool::Pointer re;
+    Regexps::RegexpPtr re;
     OptimizedRegularExpression::MatchVec matches;
     size_t capture;
 
diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h
index 17bda74f8ab..9779eb8d608 100644
--- a/src/Functions/MatchImpl.h
+++ b/src/Functions/MatchImpl.h
@@ -166,7 +166,7 @@ struct MatchImpl
         }
         else
         {
-            auto regexp = Regexps::get<is_like, true, case_insensitive>(needle);
+            auto regexp = Regexps::get<is_like, /*no_capture*/ true, case_insensitive>(needle);
 
             String required_substring;
             bool is_trivial;
@@ -325,7 +325,7 @@ struct MatchImpl
         }
         else
         {
-            auto regexp = Regexps::get<is_like, true, case_insensitive>(needle);
+            auto regexp = Regexps::get<is_like, /*no_capture*/ true, case_insensitive>(needle);
 
             String required_substring;
             bool is_trivial;
@@ -479,22 +479,19 @@ struct MatchImpl
             }
             else
             {
-                // each row is expected to contain a different like/re2 pattern
-                // --> bypass the regexp cache, instead construct the pattern on-the-fly
-                const int flags = Regexps::buildRe2Flags</*no_capture*/ true, case_insensitive>();
-                const auto & regexp = Regexps::Regexp(Regexps::createRegexp<is_like>(needle, flags));
+                auto regexp = Regexps::get<is_like, /*no_capture*/ true, case_insensitive>(needle);
 
-                regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix);
+                regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix);
 
                 if (required_substr.empty())
                 {
-                    if (!regexp.getRE2()) /// An empty regexp. Always matches.
+                    if (!regexp->getRE2()) /// An empty regexp. Always matches.
                     {
                         res[i] = !negate;
                     }
                     else
                     {
-                        const bool match = regexp.getRE2()->Match(
+                        const bool match = regexp->getRE2()->Match(
                                 {reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
                                 0,
                                 cur_haystack_length,
@@ -524,7 +521,7 @@ struct MatchImpl
                             const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0;
                             const size_t end_pos = cur_haystack_length;
 
-                            const bool match2 = regexp.getRE2()->Match(
+                            const bool match2 = regexp->getRE2()->Match(
                                     {reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
                                     start_pos,
                                     end_pos,
@@ -593,22 +590,19 @@ struct MatchImpl
             }
             else
             {
-                // each row is expected to contain a different like/re2 pattern
-                // --> bypass the regexp cache, instead construct the pattern on-the-fly
-                const int flags = Regexps::buildRe2Flags</*no_capture*/ true, case_insensitive>();
-                const auto & regexp = Regexps::Regexp(Regexps::createRegexp<is_like>(needle, flags));
+                auto regexp = Regexps::get<is_like, /*no_capture*/ true, case_insensitive>(needle);
 
-                regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix);
+                regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix);
 
                 if (required_substr.empty())
                 {
-                    if (!regexp.getRE2()) /// An empty regexp. Always matches.
+                    if (!regexp->getRE2()) /// An empty regexp. Always matches.
                     {
                         res[i] = !negate;
                     }
                     else
                     {
-                        const bool match = regexp.getRE2()->Match(
+                        const bool match = regexp->getRE2()->Match(
                                 {reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
                                 0,
                                 cur_haystack_length,
@@ -638,7 +632,7 @@ struct MatchImpl
                             const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0;
                             const size_t end_pos = cur_haystack_length;
 
-                            const bool match2 = regexp.getRE2()->Match(
+                            const bool match2 = regexp->getRE2()->Match(
                                     {reinterpret_cast<const char *>(cur_haystack_data), cur_haystack_length},
                                     start_pos,
                                     end_pos,
diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h
index 2611afedc14..be3ce6cdeee 100644
--- a/src/Functions/Regexps.h
+++ b/src/Functions/Regexps.h
@@ -9,7 +9,7 @@
 #include <vector>
 #include <Functions/likePatternToRegexp.h>
 #include <Common/Exception.h>
-#include <Common/ObjectPool.h>
+#include <Common/LRUCache.h>
 #include <Common/OptimizedRegularExpression.h>
 #include <Common/ProfileEvents.h>
 #include <Common/config.h>
@@ -39,16 +39,8 @@ namespace ErrorCodes
 namespace Regexps
 {
 using Regexp = OptimizedRegularExpressionSingleThreaded;
-using Pool = ObjectPoolMap<Regexp, String>;
-
-template <bool like>
-inline Regexp createRegexp(const std::string & pattern, int flags)
-{
-    if constexpr (like)
-        return {likePatternToRegexp(pattern), flags};
-    else
-        return {pattern, flags};
-}
+using Cache = LRUCache<String, Regexp>;
+using RegexpPtr = Cache::MappedPtr;
 
 template<bool no_capture, bool case_insensitive>
 inline int buildRe2Flags()
@@ -61,22 +53,23 @@ inline int buildRe2Flags()
     return flags;
 }
 
-/** Returns holder of an object from Pool.
-  * You must hold the ownership while using the object.
-  * In destructor, it returns the object back to the Pool for further reuse.
-  */
+/// Probes the cache of known compiled regexps for the given string pattern and returns a compiled regexp if
+/// found. Otherwise, a new cache entry is created.
 template <bool like, bool no_capture, bool case_insensitive>
-inline Pool::Pointer get(const std::string & pattern)
+inline RegexpPtr get(const String & pattern)
 {
-    /// the Singleton is thread-safe in C++11
-    static Pool known_regexps; /// Different variables for different pattern parameters.
+    static Cache known_regexps(42'000);
 
-    return known_regexps.get(pattern, [&pattern]
+    auto [regexp_ptr, _] = known_regexps.getOrSet(pattern, [&pattern]()
     {
         const int flags = buildRe2Flags<no_capture, case_insensitive>();
         ProfileEvents::increment(ProfileEvents::RegexpCreated);
-        return new Regexp{createRegexp<like>(pattern, flags)};
+        if constexpr (like)
+            return std::make_shared<Regexp>(likePatternToRegexp(pattern), flags);
+        else
+            return std::make_shared<Regexp>(pattern, flags);
     });
+    return regexp_ptr;
 }
 
 }
diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h
index 6d60ca94c18..1d43b66d867 100644
--- a/src/Functions/countMatches.h
+++ b/src/Functions/countMatches.h
@@ -55,7 +55,7 @@ public:
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
     {
         const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
-        Regexps::Pool::Pointer re = Regexps::get<false /* like */, true /* is_no_capture */, CountMatchesBase::case_insensitive>(column_pattern->getValue<String>());
+        Regexps::RegexpPtr re = Regexps::get<false /* like */, true /* is_no_capture */, CountMatchesBase::case_insensitive>(column_pattern->getValue<String>());
         OptimizedRegularExpression::MatchVec matches;
 
         const IColumn * column_haystack = arguments[0].column.get();
@@ -95,7 +95,7 @@ public:
             throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()");
     }
 
-    static uint64_t countMatches(StringRef src, Regexps::Pool::Pointer & re, OptimizedRegularExpression::MatchVec & matches)
+    static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches)
     {
         /// Only one match is required, no need to copy more.
         static const unsigned matches_limit = 1;