mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-15 19:02:04 +00:00
18eb8b6d48
After #37544 OptimizedRegularExpressionImpl started to be moved, but StringSearcher is not copyable since it holds pointers that go out of scope after the move (previously Regexps::get() returned std::shared_ptr<Regexp>, but it has been replaced with Regexps::createRegexp(), which returns a Regexp object). <details> <summary>ASan report</summary> ==48348==ERROR: AddressSanitizer: stack-use-after-scope on address 0x7fff577239a9 at pc 0x00001518209b bp 0x7fff57723820 sp 0x7fff57723818 READ of size 1 at 0x7fff577239a9 thread T0 0 0x1518209a in char8_t const* DB::StringSearcher<true, true>::search<char8_t>(char8_t const*, char8_t const*) const /bld/./src/Common/StringSearcher.h:730:41 1 0x1518dd3f in char8_t const* DB::StringSearcher<true, true>::search<char8_t>(char8_t const*, unsigned long) const /bld/./src/Common/StringSearcher.h:751:16 2 0x1518dd3f in OptimizedRegularExpressionImpl<false>::match(char const*, unsigned long, std::__1::vector<OptimizedRegularExpressionDetails::Match, std::__1::allocator<OptimizedRegularExpressionDetails::Match> >&, unsigned int) const /bld/./src/Common/OptimizedRegularExpression.cpp:463:54 3 0x1811cb42 in DB::ExtractAllImpl::get(char const*&, char const*&) /bld/./src/Functions/FunctionsStringArray.h:588:18 4 0x1811aa62 in DB::FunctionTokens<DB::ExtractAllImpl>::executeImpl(std::__1::vector<DB::ColumnWithTypeAndName, std::__1::allocator<DB::ColumnWithTypeAndName> > const&, std::__1::shared_ptr<DB::IDataType const> const&, unsigned long) const /bld/./src/Functions/FunctionsStringArray.h:704:30 5 0x14fe17b4 in DB::IFunction::executeImplDryRun(std::__1::vector<DB::ColumnWithTypeAndName, std::__1::allocator<DB::ColumnWithTypeAndName> > const&, std::__1::shared_ptr<DB::IDataType const> const&, unsigned long) const /bld/./src/Functions/IFunction.h:409:16 Address 0x7fff577239a9 is located in stack of thread T0 at offset 201 in frame 0 0x1518d98f in OptimizedRegularExpressionImpl<false>::match(char const*, unsigned long, 
std::__1::vector<OptimizedRegularExpressionDetails::Match, std::__1::allocator<OptimizedRegularExpressionDetails::Match> >&, unsigned int) const /bld/./src/Common/OptimizedRegularExpression.cpp:439 </details> CI: https://s3.amazonaws.com/clickhouse-test-reports/39342/c6f7698f9ad6ae22199182ebf7c3b2dac77d69d8/fuzzer_astfuzzerasan,actions//report.html Fixes: #37544 (cc @rschu1ze) Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
114 lines
4.1 KiB
C++
114 lines
4.1 KiB
C++
#pragma once
|
|
|
|
#include <string>
|
|
#include <vector>
|
|
#include <memory>
|
|
#include <optional>
|
|
#include <Common/StringSearcher.h>
|
|
#include <re2/re2.h>
|
|
|
|
#include <Common/config.h>
|
|
#include <re2_st/re2.h>
|
|
|
|
|
|
/** Uses two ways to optimize a regular expression:
  * 1. If the regular expression is trivial (reduces to finding a substring in a string),
  *    then the search is replaced with strstr or strcasestr.
  * 2. If the regular expression contains a non-alternative substring of sufficient length,
  *    then strstr or strcasestr is used to look for that substring before testing;
  *    the regular expression is only fully checked if the substring is found.
  * 3. In other cases, the re2 engine is used.
  *
  * This makes sense, since strstr and strcasestr in libc for Linux are well optimized.
  *
  * Suitable if the following conditions are simultaneously met:
  * - in most calls, the regular expression does not match;
  * - the regular expression is compatible with the re2 engine;
  * - you accept using it at your own risk, since probably not all cases are taken into account.
  *
  * NOTE: Multi-character metasymbols such as \Pl are handled incorrectly.
  */
|
|
|
|
namespace OptimizedRegularExpressionDetails
{
    /// Plain aggregate describing one matched fragment as an (offset, length) pair.
    /// NOTE(review): presumably both values are measured in bytes relative to the
    /// start of the subject passed to match() - confirm against the implementation.
    struct Match
    {
        std::string::size_type offset;  /// Where the fragment starts.
        std::string::size_type length;  /// How long the fragment is.
    };
}
|
|
|
|
template <bool thread_safe>
|
|
class OptimizedRegularExpressionImpl
|
|
{
|
|
public:
|
|
enum Options
|
|
{
|
|
RE_CASELESS = 0x00000001,
|
|
RE_NO_CAPTURE = 0x00000010,
|
|
RE_DOT_NL = 0x00000100
|
|
};
|
|
|
|
using Match = OptimizedRegularExpressionDetails::Match;
|
|
using MatchVec = std::vector<Match>;
|
|
|
|
using RegexType = std::conditional_t<thread_safe, re2::RE2, re2_st::RE2>;
|
|
using StringPieceType = std::conditional_t<thread_safe, re2::StringPiece, re2_st::StringPiece>;
|
|
|
|
OptimizedRegularExpressionImpl(const std::string & regexp_, int options = 0); /// NOLINT
|
|
/// StringSearcher store pointers to required_substring, it must be updated on move.
|
|
OptimizedRegularExpressionImpl(OptimizedRegularExpressionImpl && rhs) noexcept;
|
|
OptimizedRegularExpressionImpl(const OptimizedRegularExpressionImpl & rhs) = delete;
|
|
|
|
bool match(const std::string & subject) const
|
|
{
|
|
return match(subject.data(), subject.size());
|
|
}
|
|
|
|
bool match(const std::string & subject, Match & match_) const
|
|
{
|
|
return match(subject.data(), subject.size(), match_);
|
|
}
|
|
|
|
unsigned match(const std::string & subject, MatchVec & matches) const
|
|
{
|
|
return match(subject.data(), subject.size(), matches);
|
|
}
|
|
|
|
unsigned match(const char * subject, size_t subject_size, MatchVec & matches) const
|
|
{
|
|
return match(subject, subject_size, matches, number_of_subpatterns + 1);
|
|
}
|
|
|
|
bool match(const char * subject, size_t subject_size) const;
|
|
bool match(const char * subject, size_t subject_size, Match & match) const;
|
|
unsigned match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const;
|
|
|
|
unsigned getNumberOfSubpatterns() const { return number_of_subpatterns; }
|
|
|
|
/// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log).
|
|
const std::unique_ptr<RegexType> & getRE2() const { return re2; }
|
|
|
|
void getAnalyzeResult(std::string & out_required_substring, bool & out_is_trivial, bool & out_required_substring_is_prefix) const
|
|
{
|
|
out_required_substring = required_substring;
|
|
out_is_trivial = is_trivial;
|
|
out_required_substring_is_prefix = required_substring_is_prefix;
|
|
}
|
|
|
|
private:
|
|
bool is_trivial;
|
|
bool required_substring_is_prefix;
|
|
bool is_case_insensitive;
|
|
std::string required_substring;
|
|
std::optional<DB::StringSearcher<true, true>> case_sensitive_substring_searcher;
|
|
std::optional<DB::StringSearcher<false, true>> case_insensitive_substring_searcher;
|
|
std::unique_ptr<RegexType> re2;
|
|
unsigned number_of_subpatterns;
|
|
|
|
static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
|
|
};
|
|
|
|
/// Instantiation with thread_safe = true, i.e. the one whose RegexType is re2::RE2.
using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
|
|
/// Instantiation with thread_safe = false, i.e. the one whose RegexType is re2_st::RE2.
using OptimizedRegularExpressionSingleThreaded = OptimizedRegularExpressionImpl<false>;
|