From 49934a3dc865cc8131d94de4592d3bd4f21150c0 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 25 May 2022 21:22:45 +0200 Subject: [PATCH] Cache compiled regexps when evaluating non-const needles Needles in a (non-const) needle column may repeat and this commit allows skipping compilation for known needles. Out of the different design alternatives (see below, if someone is interested), we now maintain - one global pattern cache, - with a fixed size of 42k elements currently, - and use LRU as eviction strategy. ------------------------------------------------------------------------ (sorry for the wall of text, dumping it here not for reading but just for reference) Write-up about considered design alternatives: 1. Keep the current global cache of const needles. For non-const needles, probe the cache but don't store values in it. Pros: need to maintain just a single cache, no problem with cache pollution assuming there are few distinct constant needles Cons: only useful if a non-const needle already occurred as a const needle --> overall too simplistic 2. Keep the current global cache for const needles. For non-const needles, create a local (e.g. per-query) cache Pros: unlike (1.), compilation of non-const needles can be skipped even if they did not occur as const needles before, no pollution of the const pattern cache when there are very many non-const needles (e.g. large / highly distinct needle columns). Cons: caches may explode "horizontally", i.e. we'll end up with the const cache + caches for Q1, Q2, ... QN, this makes it harder to control the overall space consumption, also patterns residing in different caches cannot be reused between queries, another difficulty is that the concept of "query" does not really exist at matching level - there are only column chunks and we'd potentially end up with 1 cache / chunk 3. Queries with const and non-const needles insert into the same global cache. Pros: the advantages of (2.)
+ allows reusing compiled patterns across parallel queries Cons: needs an eviction strategy to control cache size and pollution (and btw. (2.) also needs eviction strategies for the individual caches) 4. Queries with const needle use global cache, queries with non-const needle use a different global cache --> Overall similar to (3) but ignores the (likely) edge case that const and non-const needles overlap. In sum, (3.) seems the simplest and most beneficial approach. Eviction strategies: 0. Don't ever evict --> cache may grow infinitely and eventually make the system unusable (may even pose a DoS risk) 1. Flush the cache after a certain threshold is exceeded --> very simple but may lead to periodic performance drops 2. Use LRU --> more graceful performance degradation at threshold but comes with a (constant) performance overhead to maintain the LRU queue In sum, given that the pattern compilation in RE2 should be quite costly (pattern-to-DFA/NFA), LRU may be acceptable. --- src/Functions/FunctionsStringArray.h | 4 ++-- src/Functions/MatchImpl.h | 30 ++++++++++--------------- src/Functions/Regexps.h | 33 +++++++++++----------------- src/Functions/countMatches.h | 4 ++-- 4 files changed, 29 insertions(+), 42 deletions(-) diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 2680816670f..6545c3e3549 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -448,7 +448,7 @@ public: class SplitByRegexpImpl { private: - Regexps::Pool::Pointer re; + Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; Pos pos; @@ -532,7 +532,7 @@ public: class ExtractAllImpl { private: - Regexps::Pool::Pointer re; + Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; size_t capture; diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 17bda74f8ab..9779eb8d608 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -166,7 +166,7 @@ struct
MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -325,7 +325,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -479,22 +479,19 @@ struct MatchImpl } else { - // each row is expected to contain a different like/re2 pattern - // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + auto regexp = Regexps::get(needle); - regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); if (required_substr.empty()) { - if (!regexp.getRE2()) /// An empty regexp. Always matches. + if (!regexp->getRE2()) /// An empty regexp. Always matches. { res[i] = !negate; } else { - const bool match = regexp.getRE2()->Match( + const bool match = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, 0, cur_haystack_length, @@ -524,7 +521,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? 
(match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - const bool match2 = regexp.getRE2()->Match( + const bool match2 = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, start_pos, end_pos, @@ -593,22 +590,19 @@ struct MatchImpl } else { - // each row is expected to contain a different like/re2 pattern - // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + auto regexp = Regexps::get(needle); - regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); if (required_substr.empty()) { - if (!regexp.getRE2()) /// An empty regexp. Always matches. + if (!regexp->getRE2()) /// An empty regexp. Always matches. { res[i] = !negate; } else { - const bool match = regexp.getRE2()->Match( + const bool match = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, 0, cur_haystack_length, @@ -638,7 +632,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? 
(match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - const bool match2 = regexp.getRE2()->Match( + const bool match2 = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, start_pos, end_pos, diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 2611afedc14..be3ce6cdeee 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -39,16 +39,8 @@ namespace ErrorCodes namespace Regexps { using Regexp = OptimizedRegularExpressionSingleThreaded; -using Pool = ObjectPoolMap; - -template -inline Regexp createRegexp(const std::string & pattern, int flags) -{ - if constexpr (like) - return {likePatternToRegexp(pattern), flags}; - else - return {pattern, flags}; -} +using Cache = LRUCache; +using RegexpPtr = Cache::MappedPtr; template inline int buildRe2Flags() @@ -61,22 +53,23 @@ inline int buildRe2Flags() return flags; } -/** Returns holder of an object from Pool. - * You must hold the ownership while using the object. - * In destructor, it returns the object back to the Pool for further reuse. - */ +/// Probes the cache of known compiled regexps for the given string pattern and returns a compiled regexp if +/// found. Otherwise, a new cache entry is created. template -inline Pool::Pointer get(const std::string & pattern) +inline RegexpPtr get(const String & pattern) { - /// the Singleton is thread-safe in C++11 - static Pool known_regexps; /// Different variables for different pattern parameters. 
+ static Cache known_regexps(42'000); - return known_regexps.get(pattern, [&pattern] + auto [regexp_ptr, _] = known_regexps.getOrSet(pattern, [&pattern]() { const int flags = buildRe2Flags(); ProfileEvents::increment(ProfileEvents::RegexpCreated); - return new Regexp{createRegexp(pattern, flags)}; + if constexpr (like) + return std::make_shared(likePatternToRegexp(pattern), flags); + else + return std::make_shared(pattern, flags); }); + return regexp_ptr; } } diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h index 6d60ca94c18..1d43b66d867 100644 --- a/src/Functions/countMatches.h +++ b/src/Functions/countMatches.h @@ -55,7 +55,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - Regexps::Pool::Pointer re = Regexps::get(column_pattern->getValue()); + Regexps::RegexpPtr re = Regexps::get(column_pattern->getValue()); OptimizedRegularExpression::MatchVec matches; const IColumn * column_haystack = arguments[0].column.get(); @@ -95,7 +95,7 @@ public: throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()"); } - static uint64_t countMatches(StringRef src, Regexps::Pool::Pointer & re, OptimizedRegularExpression::MatchVec & matches) + static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches) { /// Only one match is required, no need to copy more. static const unsigned matches_limit = 1;