diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 2680816670f..6545c3e3549 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -448,7 +448,7 @@ public: class SplitByRegexpImpl { private: - Regexps::Pool::Pointer re; + Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; Pos pos; @@ -532,7 +532,7 @@ public: class ExtractAllImpl { private: - Regexps::Pool::Pointer re; + Regexps::RegexpPtr re; OptimizedRegularExpression::MatchVec matches; size_t capture; diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 17bda74f8ab..9779eb8d608 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -166,7 +166,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -325,7 +325,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -479,22 +479,19 @@ struct MatchImpl } else { - // each row is expected to contain a different like/re2 pattern - // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + auto regexp = Regexps::get(needle); - regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); if (required_substr.empty()) { - if (!regexp.getRE2()) /// An empty regexp. Always matches. + if (!regexp->getRE2()) /// An empty regexp. Always matches. { res[i] = !negate; } else { - const bool match = regexp.getRE2()->Match( + const bool match = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, 0, cur_haystack_length, @@ -524,7 +521,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - const bool match2 = regexp.getRE2()->Match( + const bool match2 = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, start_pos, end_pos, @@ -593,22 +590,19 @@ struct MatchImpl } else { - // each row is expected to contain a different like/re2 pattern - // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + auto regexp = Regexps::get(needle); - regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); + regexp->getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); if (required_substr.empty()) { - if (!regexp.getRE2()) /// An empty regexp. Always matches. + if (!regexp->getRE2()) /// An empty regexp. Always matches. { res[i] = !negate; } else { - const bool match = regexp.getRE2()->Match( + const bool match = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, 0, cur_haystack_length, @@ -638,7 +632,7 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - const bool match2 = regexp.getRE2()->Match( + const bool match2 = regexp->getRE2()->Match( {reinterpret_cast(cur_haystack_data), cur_haystack_length}, start_pos, end_pos, diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 2611afedc14..be3ce6cdeee 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -39,16 +39,8 @@ namespace ErrorCodes namespace Regexps { using Regexp = OptimizedRegularExpressionSingleThreaded; -using Pool = ObjectPoolMap; - -template -inline Regexp createRegexp(const std::string & pattern, int flags) -{ - if constexpr (like) - return {likePatternToRegexp(pattern), flags}; - else - return {pattern, flags}; -} +using Cache = LRUCache; +using RegexpPtr = Cache::MappedPtr; template inline int buildRe2Flags() @@ -61,22 +53,23 @@ inline int buildRe2Flags() return flags; } -/** Returns holder of an object from Pool. - * You must hold the ownership while using the object. - * In destructor, it returns the object back to the Pool for further reuse. - */ +/// Probes the cache of known compiled regexps for the given string pattern and returns a compiled regexp if +/// found. Otherwise, a new cache entry is created. template -inline Pool::Pointer get(const std::string & pattern) +inline RegexpPtr get(const String & pattern) { - /// the Singleton is thread-safe in C++11 - static Pool known_regexps; /// Different variables for different pattern parameters. + static Cache known_regexps(42'000); - return known_regexps.get(pattern, [&pattern] + auto [regexp_ptr, _] = known_regexps.getOrSet(pattern, [&pattern]() { const int flags = buildRe2Flags(); ProfileEvents::increment(ProfileEvents::RegexpCreated); - return new Regexp{createRegexp(pattern, flags)}; + if constexpr (like) + return std::make_shared(likePatternToRegexp(pattern), flags); + else + return std::make_shared(pattern, flags); }); + return regexp_ptr; } } diff --git a/src/Functions/countMatches.h b/src/Functions/countMatches.h index 6d60ca94c18..1d43b66d867 100644 --- a/src/Functions/countMatches.h +++ b/src/Functions/countMatches.h @@ -55,7 +55,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { const ColumnConst * column_pattern = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get()); - Regexps::Pool::Pointer re = Regexps::get(column_pattern->getValue()); + Regexps::RegexpPtr re = Regexps::get(column_pattern->getValue()); OptimizedRegularExpression::MatchVec matches; const IColumn * column_haystack = arguments[0].column.get(); @@ -95,7 +95,7 @@ public: throw Exception(ErrorCodes::LOGICAL_ERROR, "Error in FunctionCountMatches::getReturnTypeImpl()"); } - static uint64_t countMatches(StringRef src, Regexps::Pool::Pointer & re, OptimizedRegularExpression::MatchVec & matches) + static uint64_t countMatches(StringRef src, Regexps::RegexpPtr & re, OptimizedRegularExpression::MatchVec & matches) { /// Only one match is required, no need to copy more. static const unsigned matches_limit = 1;