mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-18 13:42:02 +00:00
1219 lines
36 KiB
C++
1219 lines
36 KiB
C++
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Regular expression interface RE2.
|
|
//
|
|
// Originally the PCRE C++ wrapper, but adapted to use
|
|
// the new automata-based regular expression engines.
|
|
|
|
#include "re2/re2.h"
|
|
|
|
#include <stdio.h>
|
|
#include <string>
|
|
#include <pthread.h>
|
|
#include <errno.h>
|
|
#include "util/atomicops.h"
|
|
#include "util/util.h"
|
|
#include "util/flags.h"
|
|
#include "re2/prog.h"
|
|
#include "re2/regexp.h"
|
|
|
|
// Flag consulted by RE2::Match (as FLAGS_trace_re2): when true, Match logs
// which matching engine handled each call.
DEFINE_bool(trace_re2, false, "trace RE2 execution");
|
|
|
|
namespace re2 {
|
|
|
|
// Number of capture arguments the fixed-size, on-stack submatch arrays in
// this file are sized for.
static const int kMaxArgs = 16;
// Capacity of those arrays: slot 0 holds the overall match, followed by one
// slot per capture argument.
static const int kVecSize = kMaxArgs + 1;
|
|
|
|
// Out-of-class definitions of RE2's variadic convenience entry points. Each
// is an empty object whose type is parameterized on the matching *N function
// defined later in this file.
const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg,
                        RE2::FullMatchN>
    RE2::FullMatch = {};
const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg,
                        RE2::PartialMatchN>
    RE2::PartialMatch = {};
const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg,
                        RE2::ConsumeN>
    RE2::Consume = {};
const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg,
                        RE2::FindAndConsumeN>
    RE2::FindAndConsume = {};
|
|
|
|
// This will trigger LNK2005 error in MSVC.
|
|
#ifndef COMPILER_MSVC
|
|
const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
|
|
#endif // COMPILER_MSVC
|
|
|
|
// Builds an Options value from a canned preset. Only the encoding, the
// POSIX-syntax / longest-match pair and error logging depend on the preset;
// all remaining settings start at the fixed defaults listed below.
RE2::Options::Options(RE2::CannedOptions opt)
    : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
      posix_syntax_(opt == RE2::POSIX),
      longest_match_(opt == RE2::POSIX),
      log_errors_(opt != RE2::Quiet),
      max_mem_(kDefaultMaxMem),
      literal_(false),
      never_nl_(false),
      dot_nl_(false),
      never_capture_(false),
      case_sensitive_(true),
      perl_classes_(false),
      word_boundary_(false),
      one_line_(false) {}
|
|
|
|
// static empty things for use as const references.
|
|
// To avoid global constructors, initialized on demand.
|
|
GLOBAL_MUTEX(empty_mutex);
|
|
static const string *empty_string;
|
|
static const map<string, int> *empty_named_groups;
|
|
static const map<int, string> *empty_group_names;
|
|
|
|
static void InitEmpty() {
|
|
GLOBAL_MUTEX_LOCK(empty_mutex);
|
|
if (empty_string == NULL) {
|
|
empty_string = new string;
|
|
empty_named_groups = new map<string, int>;
|
|
empty_group_names = new map<int, string>;
|
|
}
|
|
GLOBAL_MUTEX_UNLOCK(empty_mutex);
|
|
}
|
|
|
|
// Converts from Regexp error code to RE2 error code.
|
|
// Maybe some day they will diverge. In any event, this
|
|
// hides the existence of Regexp from RE2 users.
|
|
static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
|
|
switch (code) {
|
|
case re2::kRegexpSuccess:
|
|
return RE2::NoError;
|
|
case re2::kRegexpInternalError:
|
|
return RE2::ErrorInternal;
|
|
case re2::kRegexpBadEscape:
|
|
return RE2::ErrorBadEscape;
|
|
case re2::kRegexpBadCharClass:
|
|
return RE2::ErrorBadCharClass;
|
|
case re2::kRegexpBadCharRange:
|
|
return RE2::ErrorBadCharRange;
|
|
case re2::kRegexpMissingBracket:
|
|
return RE2::ErrorMissingBracket;
|
|
case re2::kRegexpMissingParen:
|
|
return RE2::ErrorMissingParen;
|
|
case re2::kRegexpTrailingBackslash:
|
|
return RE2::ErrorTrailingBackslash;
|
|
case re2::kRegexpRepeatArgument:
|
|
return RE2::ErrorRepeatArgument;
|
|
case re2::kRegexpRepeatSize:
|
|
return RE2::ErrorRepeatSize;
|
|
case re2::kRegexpRepeatOp:
|
|
return RE2::ErrorRepeatOp;
|
|
case re2::kRegexpBadPerlOp:
|
|
return RE2::ErrorBadPerlOp;
|
|
case re2::kRegexpBadUTF8:
|
|
return RE2::ErrorBadUTF8;
|
|
case re2::kRegexpBadNamedCapture:
|
|
return RE2::ErrorBadNamedCapture;
|
|
}
|
|
return RE2::ErrorInternal;
|
|
}
|
|
|
|
// Returns pattern as a string suitable for log messages: patterns shorter
// than 100 bytes are returned whole, longer ones are cut to their first 100
// bytes followed by "...".
static string trunc(const StringPiece& pattern) {
  if (pattern.size() >= 100) {
    string shortened = pattern.substr(0, 100).as_string();
    shortened += "...";
    return shortened;
  }
  return pattern.as_string();
}
|
|
|
|
|
|
// Constructors. Every overload hands its pattern to Init(); the first three
// use DefaultOptions, the last uses the options supplied by the caller.

// From a C string.
RE2::RE2(const char* pattern) { Init(pattern, DefaultOptions); }

// From a string.
RE2::RE2(const string& pattern) { Init(pattern, DefaultOptions); }

// From a StringPiece.
RE2::RE2(const StringPiece& pattern) { Init(pattern, DefaultOptions); }

// From a StringPiece with explicit options.
RE2::RE2(const StringPiece& pattern, const Options& options) {
  Init(pattern, options);
}
|
|
|
|
int RE2::Options::ParseFlags() const {
|
|
int flags = Regexp::ClassNL;
|
|
switch (encoding()) {
|
|
default:
|
|
if (log_errors())
|
|
LOG(ERROR) << "Unknown encoding " << encoding();
|
|
break;
|
|
case RE2::Options::EncodingUTF8:
|
|
break;
|
|
case RE2::Options::EncodingLatin1:
|
|
flags |= Regexp::Latin1;
|
|
break;
|
|
}
|
|
|
|
if (!posix_syntax())
|
|
flags |= Regexp::LikePerl;
|
|
|
|
if (literal())
|
|
flags |= Regexp::Literal;
|
|
|
|
if (never_nl())
|
|
flags |= Regexp::NeverNL;
|
|
|
|
if (dot_nl())
|
|
flags |= Regexp::DotNL;
|
|
|
|
if (never_capture())
|
|
flags |= Regexp::NeverCapture;
|
|
|
|
if (!case_sensitive())
|
|
flags |= Regexp::FoldCase;
|
|
|
|
if (perl_classes())
|
|
flags |= Regexp::PerlClasses;
|
|
|
|
if (word_boundary())
|
|
flags |= Regexp::PerlB;
|
|
|
|
if (one_line())
|
|
flags |= Regexp::OneLine;
|
|
|
|
return flags;
|
|
}
|
|
|
|
void RE2::Init(const StringPiece& pattern, const Options& options) {
|
|
mutex_ = new Mutex;
|
|
pattern_ = pattern.as_string();
|
|
options_.Copy(options);
|
|
InitEmpty();
|
|
error_ = empty_string;
|
|
error_code_ = NoError;
|
|
suffix_regexp_ = NULL;
|
|
entire_regexp_ = NULL;
|
|
prog_ = NULL;
|
|
rprog_ = NULL;
|
|
named_groups_ = NULL;
|
|
group_names_ = NULL;
|
|
num_captures_ = -1;
|
|
|
|
RegexpStatus status;
|
|
entire_regexp_ = Regexp::Parse(
|
|
pattern_,
|
|
static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
|
|
&status);
|
|
if (entire_regexp_ == NULL) {
|
|
if (error_ == empty_string)
|
|
error_ = new string(status.Text());
|
|
if (options_.log_errors()) {
|
|
LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
|
|
<< status.Text();
|
|
}
|
|
error_arg_ = status.error_arg().as_string();
|
|
error_code_ = RegexpErrorToRE2(status.code());
|
|
return;
|
|
}
|
|
|
|
prefix_.clear();
|
|
prefix_foldcase_ = false;
|
|
re2::Regexp* suffix;
|
|
if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
|
|
suffix_regexp_ = suffix;
|
|
else
|
|
suffix_regexp_ = entire_regexp_->Incref();
|
|
|
|
// Two thirds of the memory goes to the forward Prog,
|
|
// one third to the reverse prog, because the forward
|
|
// Prog has two DFAs but the reverse prog has one.
|
|
prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
|
|
if (prog_ == NULL) {
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
|
|
error_ = new string("pattern too large - compile failed");
|
|
error_code_ = RE2::ErrorPatternTooLarge;
|
|
return;
|
|
}
|
|
|
|
// Could delay this until the first match call that
|
|
// cares about submatch information, but the one-pass
|
|
// machine's memory gets cut from the DFA memory budget,
|
|
// and that is harder to do if the DFA has already
|
|
// been built.
|
|
is_one_pass_ = prog_->IsOnePass();
|
|
}
|
|
|
|
// Returns rprog_, computing it if needed.
|
|
re2::Prog* RE2::ReverseProg() const {
|
|
MutexLock l(mutex_);
|
|
if (rprog_ == NULL && error_ == empty_string) {
|
|
rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
|
|
if (rprog_ == NULL) {
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
|
|
error_ = new string("pattern too large - reverse compile failed");
|
|
error_code_ = RE2::ErrorPatternTooLarge;
|
|
return NULL;
|
|
}
|
|
}
|
|
return rprog_;
|
|
}
|
|
|
|
// Releases everything Init() and the lazy accessors created. The shared
// empty_* objects are never deleted: a member that still points at one of
// them is simply left alone.
RE2::~RE2() {
  if (suffix_regexp_ != NULL)
    suffix_regexp_->Decref();
  if (entire_regexp_ != NULL)
    entire_regexp_->Decref();

  delete mutex_;
  delete prog_;
  delete rprog_;

  if (error_ != empty_string)
    delete error_;
  // Deleting a NULL pointer is a no-op, so only the shared empties need to be
  // excluded here.
  if (named_groups_ != empty_named_groups)
    delete named_groups_;
  if (group_names_ != empty_group_names)
    delete group_names_;
}
|
|
|
|
int RE2::ProgramSize() const {
|
|
if (prog_ == NULL)
|
|
return -1;
|
|
return prog_->size();
|
|
}
|
|
|
|
// Returns named_groups_, computing it if needed.
|
|
const map<string, int>& RE2::NamedCapturingGroups() const {
|
|
MutexLock l(mutex_);
|
|
if (!ok())
|
|
return *empty_named_groups;
|
|
if (named_groups_ == NULL) {
|
|
named_groups_ = suffix_regexp_->NamedCaptures();
|
|
if (named_groups_ == NULL)
|
|
named_groups_ = empty_named_groups;
|
|
}
|
|
return *named_groups_;
|
|
}
|
|
|
|
// Returns group_names_, computing it if needed.
|
|
const map<int, string>& RE2::CapturingGroupNames() const {
|
|
MutexLock l(mutex_);
|
|
if (!ok())
|
|
return *empty_group_names;
|
|
if (group_names_ == NULL) {
|
|
group_names_ = suffix_regexp_->CaptureNames();
|
|
if (group_names_ == NULL)
|
|
group_names_ = empty_group_names;
|
|
}
|
|
return *group_names_;
|
|
}
|
|
|
|
/***** Convenience interfaces *****/
|
|
|
|
bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
|
|
const Arg* const args[], int n) {
|
|
return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
|
|
}
|
|
|
|
bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
|
|
const Arg* const args[], int n) {
|
|
return re.DoMatch(text, UNANCHORED, NULL, args, n);
|
|
}
|
|
|
|
// Matches re at the start of *input. On success the matched prefix is removed
// from *input and true is returned; on failure *input is left untouched.
bool RE2::ConsumeN(StringPiece* input, const RE2& re,
                   const Arg* const args[], int n) {
  int consumed;
  const bool matched = re.DoMatch(*input, ANCHOR_START, &consumed, args, n);
  if (matched)
    input->remove_prefix(consumed);
  return matched;
}
|
|
|
|
// Searches for re anywhere in *input. On success everything up to and
// including the match is removed from *input and true is returned; on failure
// *input is left untouched.
bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
                          const Arg* const args[], int n) {
  int consumed;
  const bool matched = re.DoMatch(*input, UNANCHORED, &consumed, args, n);
  if (matched)
    input->remove_prefix(consumed);
  return matched;
}
|
|
|
|
// Returns the maximum submatch needed for the rewrite to be done by Replace().
|
|
// E.g. if rewrite == "foo \\2,\\1", returns 2.
|
|
int RE2::MaxSubmatch(const StringPiece& rewrite) {
|
|
int max = 0;
|
|
for (const char *s = rewrite.data(), *end = s + rewrite.size();
|
|
s < end; s++) {
|
|
if (*s == '\\') {
|
|
s++;
|
|
int c = (s < end) ? *s : -1;
|
|
if (isdigit(c)) {
|
|
int n = (c - '0');
|
|
if (n > max)
|
|
max = n;
|
|
}
|
|
}
|
|
}
|
|
return max;
|
|
}
|
|
|
|
// Replaces the first match of re in *str with the expansion of rewrite.
// Returns false, leaving *str unchanged, if rewrite refers to more groups
// than the on-stack vector can hold, if there is no match, or if the rewrite
// cannot be expanded.
bool RE2::Replace(string *str,
                  const RE2& re,
                  const StringPiece& rewrite) {
  StringPiece vec[kVecSize];
  const int nvec = 1 + MaxSubmatch(rewrite);
  if (static_cast<size_t>(nvec) > arraysize(vec))
    return false;

  if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
    return false;

  string replacement;
  if (!re.Rewrite(&replacement, rewrite, vec, nvec))
    return false;

  // vec[0] is the overall match and must lie inside *str.
  const StringPiece& whole = vec[0];
  assert(whole.begin() >= str->data());
  assert(whole.end() <= str->data()+str->size());
  str->replace(whole.data() - str->data(), whole.size(), replacement);
  return true;
}
|
|
|
|
// Replaces every non-overlapping match of re in *str with the expansion of
// rewrite and returns the number of replacements made. Returns 0, leaving
// *str unchanged, when nothing matched or when rewrite refers to more groups
// than the on-stack vector can hold.
int RE2::GlobalReplace(string *str,
                       const RE2& re,
                       const StringPiece& rewrite) {
  StringPiece vec[kVecSize];
  const int nvec = 1 + MaxSubmatch(rewrite);
  if (static_cast<size_t>(nvec) > arraysize(vec))
    return 0;

  const char* const text_begin = str->data();
  const char* const text_end = text_begin + str->size();
  const char* cursor = text_begin;    // where the next search starts
  const char* prev_match_end = NULL;  // end of the last replaced match
  const StringPiece& whole = vec[0];  // overall match, refreshed by Match()
  string out;
  int count = 0;

  for (; cursor <= text_end; ) {
    if (!re.Match(*str, cursor - text_begin, str->size(), UNANCHORED,
                  vec, nvec))
      break;

    // Copy the unmatched text between the cursor and the match.
    if (cursor < whole.begin())
      out.append(cursor, whole.begin() - cursor);

    const bool empty_at_prev_end =
        whole.begin() == prev_match_end && whole.size() == 0;
    if (empty_at_prev_end) {
      // Disallow empty match at end of last match: copy one byte (if any is
      // left) and retry one position further on.
      if (cursor < text_end)
        out.append(cursor, 1);
      cursor++;
      continue;
    }

    re.Rewrite(&out, rewrite, vec, nvec);
    cursor = whole.end();
    prev_match_end = cursor;
    count++;
  }

  if (count == 0)
    return 0;

  // Copy whatever follows the final match, then publish the result.
  if (cursor < text_end)
    out.append(cursor, text_end - cursor);
  swap(out, *str);
  return count;
}
|
|
|
|
bool RE2::Extract(const StringPiece &text,
|
|
const RE2& re,
|
|
const StringPiece &rewrite,
|
|
string *out) {
|
|
StringPiece vec[kVecSize];
|
|
int nvec = 1 + MaxSubmatch(rewrite);
|
|
if (static_cast<size_t>(nvec) > arraysize(vec))
|
|
return false;
|
|
|
|
if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
|
|
return false;
|
|
|
|
out->clear();
|
|
return re.Rewrite(out, rewrite, vec, nvec);
|
|
}
|
|
|
|
// Returns a copy of unquoted in which every byte that could be a regexp
// metacharacter is escaped, so the result matches the input literally.
//
// Bytes in [A-Za-z0-9_] and bytes with the high bit set (parts of UTF-8 or
// Latin-1 characters) are copied unchanged; a NUL byte becomes "\x00"; every
// other byte is prefixed with a backslash. Escaping characters that have no
// special meaning is legal, and it makes this behave like Perl's quotemeta
// apart from the NUL case.
string RE2::QuoteMeta(const StringPiece& unquoted) {
  string result;
  result.reserve(unquoted.size() << 1);

  for (int ii = 0; ii < unquoted.length(); ++ii) {
    const char c = unquoted[ii];

    // Explicit range tests rather than isalnum(): the original notes that
    // isalnum raised the benchmark time from 32ns to 58ns.
    const bool is_word_char = ('a' <= c && c <= 'z') ||
                              ('A' <= c && c <= 'Z') ||
                              ('0' <= c && c <= '9') ||
                              c == '_';
    // Part of a UTF8 or Latin1 character: must be copied without escaping.
    const bool has_high_bit = (c & 128) != 0;
    if (is_word_char || has_high_bit) {
      result += c;
      continue;
    }

    if (c == '\0') {
      // Not strictly required for RE2, but other regexp libraries such as
      // PCRE need it. Can't use "\\0" since the next character might be a
      // digit.
      result += "\\x00";
      continue;
    }

    result += '\\';
    result += c;
  }

  return result;
}
|
|
|
|
// Computes strings *min and *max bounding what this regexp can match, looking
// at no more than maxlen bytes. The required literal prefix (if any) supplies
// the start of both bounds and the compiled program is asked to extend them.
// Returns false, with *min and *max set to "", when nothing useful is known;
// returns false without touching them when there is no compiled program.
bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
  if (prog_ == NULL)
    return false;

  // Use at most maxlen bytes of the required prefix.
  int n = prefix_.size();
  if (maxlen < n)
    n = maxlen;

  // Both bounds start out as the (possibly shortened) prefix.
  string pmin = prefix_.substr(0, n);
  string pmax = pmin;
  if (prefix_foldcase_) {
    // prefix is ASCII lowercase; change pmin to uppercase.
    for (int i = 0; i < n; i++) {
      char& ch = pmin[i];
      if ('a' <= ch && ch <= 'z')
        ch += 'A' - 'a';
    }
  }

  // Ask the program to extend the bounds with whatever length is left.
  const int remaining = maxlen - n;
  string dmin, dmax;
  const bool extended =
      remaining > 0 && prog_->PossibleMatchRange(&dmin, &dmax, remaining);

  if (!extended && pmax.empty()) {
    // Nothing useful.
    *min = "";
    *max = "";
    return false;
  }

  if (extended) {
    pmin += dmin;
    pmax += dmax;
  } else {
    // prog_->PossibleMatchRange has failed us, but the prefix is still
    // useful: round pmax up to allow any possible suffix.
    pmax = PrefixSuccessor(pmax);
  }

  *min = pmin;
  *max = pmax;
  return true;
}
|
|
|
|
// Avoid possible locale nonsense in standard strcasecmp.
|
|
// The string a is known to be all lowercase.
|
|
static int ascii_strcasecmp(const char* a, const char* b, int len) {
|
|
const char *ae = a + len;
|
|
|
|
for (; a < ae; a++, b++) {
|
|
uint8 x = *a;
|
|
uint8 y = *b;
|
|
if ('A' <= y && y <= 'Z')
|
|
y += 'a' - 'A';
|
|
if (x != y)
|
|
return x - y;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
/***** Actual matching and rewriting code *****/
|
|
|
|
bool RE2::Match(const StringPiece& text,
|
|
int startpos,
|
|
int endpos,
|
|
Anchor re_anchor,
|
|
StringPiece* submatch,
|
|
int nsubmatch) const {
|
|
if (!ok() || suffix_regexp_ == NULL) {
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "Invalid RE2: " << *error_;
|
|
return false;
|
|
}
|
|
|
|
if (startpos < 0 || startpos > endpos || endpos > text.size()) {
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
|
|
return false;
|
|
}
|
|
|
|
StringPiece subtext = text;
|
|
subtext.remove_prefix(startpos);
|
|
subtext.remove_suffix(text.size() - endpos);
|
|
|
|
// Use DFAs to find exact location of match, filter out non-matches.
|
|
|
|
// Don't ask for the location if we won't use it.
|
|
// SearchDFA can do extra optimizations in that case.
|
|
StringPiece match;
|
|
StringPiece* matchp = &match;
|
|
if (nsubmatch == 0)
|
|
matchp = NULL;
|
|
|
|
int ncap = 1 + NumberOfCapturingGroups();
|
|
if (ncap > nsubmatch)
|
|
ncap = nsubmatch;
|
|
|
|
// If the regexp is anchored explicitly, must not be in middle of text.
|
|
if (prog_->anchor_start() && startpos != 0)
|
|
return false;
|
|
|
|
// If the regexp is anchored explicitly, update re_anchor
|
|
// so that we can potentially fall into a faster case below.
|
|
if (prog_->anchor_start() && prog_->anchor_end())
|
|
re_anchor = ANCHOR_BOTH;
|
|
else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
|
|
re_anchor = ANCHOR_START;
|
|
|
|
// Check for the required prefix, if any.
|
|
int prefixlen = 0;
|
|
if (!prefix_.empty()) {
|
|
if (startpos != 0)
|
|
return false;
|
|
prefixlen = prefix_.size();
|
|
if (prefixlen > subtext.size())
|
|
return false;
|
|
if (prefix_foldcase_) {
|
|
if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
|
|
return false;
|
|
} else {
|
|
if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
|
|
return false;
|
|
}
|
|
subtext.remove_prefix(prefixlen);
|
|
// If there is a required prefix, the anchor must be at least ANCHOR_START.
|
|
if (re_anchor != ANCHOR_BOTH)
|
|
re_anchor = ANCHOR_START;
|
|
}
|
|
|
|
Prog::Anchor anchor = Prog::kUnanchored;
|
|
Prog::MatchKind kind = Prog::kFirstMatch;
|
|
if (options_.longest_match())
|
|
kind = Prog::kLongestMatch;
|
|
bool skipped_test = false;
|
|
|
|
bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
|
|
|
|
// SearchBitState allocates a bit vector of size prog_->size() * text.size().
|
|
// It also allocates a stack of 3-word structures which could potentially
|
|
// grow as large as prog_->size() * text.size() but in practice is much
|
|
// smaller.
|
|
// Conditions for using SearchBitState:
|
|
const int MaxBitStateProg = 500; // prog_->size() <= Max.
|
|
const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
|
|
bool can_bit_state = prog_->size() <= MaxBitStateProg;
|
|
int bit_state_text_max = MaxBitStateVector / prog_->size();
|
|
|
|
bool dfa_failed = false;
|
|
switch (re_anchor) {
|
|
default:
|
|
case UNANCHORED: {
|
|
if (!prog_->SearchDFA(subtext, text, anchor, kind,
|
|
matchp, &dfa_failed, NULL)) {
|
|
if (dfa_failed) {
|
|
// Fall back to NFA below.
|
|
skipped_test = true;
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " DFA failed.";
|
|
break;
|
|
}
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " used DFA - no match.";
|
|
return false;
|
|
}
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " used DFA - match";
|
|
if (matchp == NULL) // Matched. Don't care where
|
|
return true;
|
|
// SearchDFA set match[0].end() but didn't know where the
|
|
// match started. Run the regexp backward from match[0].end()
|
|
// to find the longest possible match -- that's where it started.
|
|
Prog* prog = ReverseProg();
|
|
if (prog == NULL)
|
|
return false;
|
|
if (!prog->SearchDFA(match, text, Prog::kAnchored,
|
|
Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
|
|
if (dfa_failed) {
|
|
// Fall back to NFA below.
|
|
skipped_test = true;
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " reverse DFA failed.";
|
|
break;
|
|
}
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " DFA inconsistency.";
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "DFA inconsistency";
|
|
return false;
|
|
}
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " used reverse DFA.";
|
|
break;
|
|
}
|
|
|
|
case ANCHOR_BOTH:
|
|
case ANCHOR_START:
|
|
if (re_anchor == ANCHOR_BOTH)
|
|
kind = Prog::kFullMatch;
|
|
anchor = Prog::kAnchored;
|
|
|
|
// If only a small amount of text and need submatch
|
|
// information anyway and we're going to use OnePass or BitState
|
|
// to get it, we might as well not even bother with the DFA:
|
|
// OnePass or BitState will be fast enough.
|
|
// On tiny texts, OnePass outruns even the DFA, and
|
|
// it doesn't have the shared state and occasional mutex that
|
|
// the DFA does.
|
|
if (can_one_pass && text.size() <= 4096 &&
|
|
(ncap > 1 || text.size() <= 8)) {
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " skipping DFA for OnePass.";
|
|
skipped_test = true;
|
|
break;
|
|
}
|
|
if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " skipping DFA for BitState.";
|
|
skipped_test = true;
|
|
break;
|
|
}
|
|
if (!prog_->SearchDFA(subtext, text, anchor, kind,
|
|
&match, &dfa_failed, NULL)) {
|
|
if (dfa_failed) {
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " DFA failed.";
|
|
skipped_test = true;
|
|
break;
|
|
}
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " used DFA - no match.";
|
|
return false;
|
|
}
|
|
break;
|
|
}
|
|
|
|
if (!skipped_test && ncap <= 1) {
|
|
// We know exactly where it matches. That's enough.
|
|
if (ncap == 1)
|
|
submatch[0] = match;
|
|
} else {
|
|
StringPiece subtext1;
|
|
if (skipped_test) {
|
|
// DFA ran out of memory or was skipped:
|
|
// need to search in entire original text.
|
|
subtext1 = subtext;
|
|
} else {
|
|
// DFA found the exact match location:
|
|
// let NFA run an anchored, full match search
|
|
// to find submatch locations.
|
|
subtext1 = match;
|
|
anchor = Prog::kAnchored;
|
|
kind = Prog::kFullMatch;
|
|
}
|
|
|
|
if (can_one_pass && anchor != Prog::kUnanchored) {
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " using OnePass.";
|
|
if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
|
|
if (!skipped_test && options_.log_errors())
|
|
LOG(ERROR) << "SearchOnePass inconsistency";
|
|
return false;
|
|
}
|
|
} else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " using BitState.";
|
|
if (!prog_->SearchBitState(subtext1, text, anchor,
|
|
kind, submatch, ncap)) {
|
|
if (!skipped_test && options_.log_errors())
|
|
LOG(ERROR) << "SearchBitState inconsistency";
|
|
return false;
|
|
}
|
|
} else {
|
|
if (FLAGS_trace_re2)
|
|
LOG(INFO) << "Match " << trunc(pattern_)
|
|
<< " [" << CEscape(subtext) << "]"
|
|
<< " using NFA.";
|
|
if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
|
|
if (!skipped_test && options_.log_errors())
|
|
LOG(ERROR) << "SearchNFA inconsistency";
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Adjust overall match for required prefix that we stripped off.
|
|
if (prefixlen > 0 && nsubmatch > 0)
|
|
submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
|
|
submatch[0].size() + prefixlen);
|
|
|
|
// Zero submatches that don't exist in the regexp.
|
|
for (int i = ncap; i < nsubmatch; i++)
|
|
submatch[i] = NULL;
|
|
return true;
|
|
}
|
|
|
|
// Internal matcher - like Match() but takes Args not StringPieces.
|
|
bool RE2::DoMatch(const StringPiece& text,
|
|
Anchor anchor,
|
|
int* consumed,
|
|
const Arg* const* args,
|
|
int n) const {
|
|
if (!ok()) {
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "Invalid RE2: " << *error_;
|
|
return false;
|
|
}
|
|
|
|
// Count number of capture groups needed.
|
|
int nvec;
|
|
if (n == 0 && consumed == NULL)
|
|
nvec = 0;
|
|
else
|
|
nvec = n+1;
|
|
|
|
StringPiece* vec;
|
|
StringPiece stkvec[kVecSize];
|
|
StringPiece* heapvec = NULL;
|
|
|
|
if (static_cast<size_t>(nvec) <= arraysize(stkvec)) {
|
|
vec = stkvec;
|
|
} else {
|
|
vec = new StringPiece[nvec];
|
|
heapvec = vec;
|
|
}
|
|
|
|
if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
|
|
delete[] heapvec;
|
|
return false;
|
|
}
|
|
|
|
if(consumed != NULL)
|
|
*consumed = vec[0].end() - text.begin();
|
|
|
|
if (n == 0 || args == NULL) {
|
|
// We are not interested in results
|
|
delete[] heapvec;
|
|
return true;
|
|
}
|
|
|
|
int ncap = NumberOfCapturingGroups();
|
|
if (ncap < n) {
|
|
// RE has fewer capturing groups than number of arg pointers passed in
|
|
VLOG(1) << "Asked for " << n << " but only have " << ncap;
|
|
delete[] heapvec;
|
|
return false;
|
|
}
|
|
|
|
// If we got here, we must have matched the whole pattern.
|
|
for (int i = 0; i < n; i++) {
|
|
const StringPiece& s = vec[i+1];
|
|
if (!args[i]->Parse(s.data(), s.size())) {
|
|
// TODO: Should we indicate what the error was?
|
|
VLOG(1) << "Parse error on #" << i << " " << s << " "
|
|
<< (void*)s.data() << "/" << s.size();
|
|
delete[] heapvec;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
delete[] heapvec;
|
|
return true;
|
|
}
|
|
|
|
// Append the "rewrite" string, with backslash subsitutions from "vec",
|
|
// to string "out".
|
|
bool RE2::Rewrite(string *out, const StringPiece &rewrite,
|
|
const StringPiece *vec, int veclen) const {
|
|
for (const char *s = rewrite.data(), *end = s + rewrite.size();
|
|
s < end; s++) {
|
|
int c = *s;
|
|
if (c == '\\') {
|
|
s++;
|
|
c = (s < end) ? *s : -1;
|
|
if (isdigit(c)) {
|
|
int n = (c - '0');
|
|
if (n >= veclen) {
|
|
if (options_.log_errors()) {
|
|
LOG(ERROR) << "requested group " << n
|
|
<< " in regexp " << rewrite.data();
|
|
}
|
|
return false;
|
|
}
|
|
StringPiece snip = vec[n];
|
|
if (snip.size() > 0)
|
|
out->append(snip.data(), snip.size());
|
|
} else if (c == '\\') {
|
|
out->push_back('\\');
|
|
} else {
|
|
if (options_.log_errors())
|
|
LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
|
|
return false;
|
|
}
|
|
} else {
|
|
out->push_back(c);
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Return the number of capturing subpatterns, or -1 if the
|
|
// regexp wasn't valid on construction.
|
|
int RE2::NumberOfCapturingGroups() const {
|
|
if (suffix_regexp_ == NULL)
|
|
return -1;
|
|
int n;
|
|
ATOMIC_LOAD_RELAXED(n, &num_captures_);
|
|
if (n == -1) {
|
|
n = suffix_regexp_->NumCaptures();
|
|
ATOMIC_STORE_RELAXED(&num_captures_, n);
|
|
}
|
|
return n;
|
|
}
|
|
|
|
// Checks that the rewrite string is well-formed with respect to this
|
|
// regular expression.
|
|
bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
|
|
int max_token = -1;
|
|
for (const char *s = rewrite.data(), *end = s + rewrite.size();
|
|
s < end; s++) {
|
|
int c = *s;
|
|
if (c != '\\') {
|
|
continue;
|
|
}
|
|
if (++s == end) {
|
|
*error = "Rewrite schema error: '\\' not allowed at end.";
|
|
return false;
|
|
}
|
|
c = *s;
|
|
if (c == '\\') {
|
|
continue;
|
|
}
|
|
if (!isdigit(c)) {
|
|
*error = "Rewrite schema error: "
|
|
"'\\' must be followed by a digit or '\\'.";
|
|
return false;
|
|
}
|
|
int n = (c - '0');
|
|
if (max_token < n) {
|
|
max_token = n;
|
|
}
|
|
}
|
|
|
|
if (max_token > NumberOfCapturingGroups()) {
|
|
SStringPrintf(error, "Rewrite schema requests %d matches, "
|
|
"but the regexp only has %d parenthesized subexpressions.",
|
|
max_token, NumberOfCapturingGroups());
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/***** Parsers for various types *****/
|
|
|
|
// Parser for an Arg with no destination: the text (str, n) is ignored, and
// the call succeeds only if dest is NULL, i.e. nobody asked for the value to
// be stored.
bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
  const bool nothing_to_store = (dest == NULL);
  return nothing_to_store;
}
|
|
|
|
bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
|
|
if (dest == NULL) return true;
|
|
reinterpret_cast<string*>(dest)->assign(str, n);
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
|
|
if (dest == NULL) return true;
|
|
reinterpret_cast<StringPiece*>(dest)->set(str, n);
|
|
return true;
|
|
}
|
|
|
|
// Accepts exactly one byte of text and stores it in the char that dest points
// to. Fails for any other length; a NULL dest only validates the length.
bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
  if (n != 1)
    return false;
  if (dest != NULL)
    *static_cast<char*>(dest) = str[0];
  return true;
}
|
|
|
|
// Accepts exactly one byte of text and stores it in the unsigned char that
// dest points to. Fails for any other length; a NULL dest only validates the
// length.
bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
  if (n != 1)
    return false;
  if (dest != NULL)
    *static_cast<unsigned char*>(dest) = str[0];
  return true;
}
|
|
|
|
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// Copies the n = *np bytes at str into buf and NUL-terminates the copy so it
// can be handed to the strtoxxx() routines.
//
// REQUIRES "buf" must have length at least kMaxNumberLength+1.
//
// Returns buf and overwrites *np with the length of the copy. Returns ""
// (leaving *np unchanged) when n <= 0, when the text starts with whitespace,
// or when it is still longer than kMaxNumberLength after leading zeros have
// been trimmed.
static const char* TerminateNumber(char* buf, const char* str, int* np) {
  int n = *np;
  if (n <= 0) return "";

  // We are less forgiving than the strtoxxx() routines and do not
  // allow leading spaces. The cast matters: isspace() has undefined behavior
  // for negative char values (bytes >= 0x80 where char is signed).
  if (isspace(static_cast<unsigned char>(*str))) return "";

  // Although buf has a fixed maximum size, we can still handle
  // arbitrarily large integers correctly by omitting leading zeros.
  // (Numbers that are still too long will be out of range.)
  // Before deciding whether str is too long,
  // remove leading zeros with s/000+/00/.
  // Leaving the leading two zeros in place means that
  // we don't change 0000x123 (invalid) into 0x123 (valid).
  // Skip over leading - before replacing.
  const bool neg = (str[0] == '-');
  if (neg) {
    n--;
    str++;
  }

  if (n >= 3 && str[0] == '0' && str[1] == '0') {
    while (n >= 3 && str[2] == '0') {
      n--;
      str++;
    }
  }

  if (neg) {  // make room in buf for -
    n++;
    str--;
  }

  if (n > kMaxNumberLength) return "";

  memmove(buf, str, n);
  if (neg) {
    // After zero trimming, the byte before the digits may be a '0' rather
    // than the original '-'; put the sign back.
    buf[0] = '-';
  }
  buf[n] = '\0';
  *np = n;
  return buf;
}
|
|
|
|
bool RE2::Arg::parse_long_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
if (n == 0) return false;
|
|
char buf[kMaxNumberLength+1];
|
|
str = TerminateNumber(buf, str, &n);
|
|
char* end;
|
|
errno = 0;
|
|
long r = strtol(str, &end, radix);
|
|
if (end != str + n) return false; // Leftover junk
|
|
if (errno) return false;
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<long*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_ulong_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
if (n == 0) return false;
|
|
char buf[kMaxNumberLength+1];
|
|
str = TerminateNumber(buf, str, &n);
|
|
if (str[0] == '-') {
|
|
// strtoul() will silently accept negative numbers and parse
|
|
// them. This module is more strict and treats them as errors.
|
|
return false;
|
|
}
|
|
|
|
char* end;
|
|
errno = 0;
|
|
unsigned long r = strtoul(str, &end, radix);
|
|
if (end != str + n) return false; // Leftover junk
|
|
if (errno) return false;
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<unsigned long*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_short_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
long r;
|
|
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
|
if ((short)r != r) return false; // Out of range
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<short*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_ushort_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
unsigned long r;
|
|
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
|
if ((ushort)r != r) return false; // Out of range
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<unsigned short*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_int_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
long r;
|
|
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
|
if ((int)r != r) return false; // Out of range
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<int*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_uint_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
unsigned long r;
|
|
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
|
if ((uint)r != r) return false; // Out of range
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<unsigned int*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
#ifdef RE2_HAVE_LONGLONG
|
|
bool RE2::Arg::parse_longlong_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
if (n == 0) return false;
|
|
char buf[kMaxNumberLength+1];
|
|
str = TerminateNumber(buf, str, &n);
|
|
char* end;
|
|
errno = 0;
|
|
int64 r = strtoll(str, &end, radix);
|
|
if (end != str + n) return false; // Leftover junk
|
|
if (errno) return false;
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<int64*>(dest)) = r;
|
|
return true;
|
|
}
|
|
|
|
bool RE2::Arg::parse_ulonglong_radix(const char* str,
|
|
int n,
|
|
void* dest,
|
|
int radix) {
|
|
if (n == 0) return false;
|
|
char buf[kMaxNumberLength+1];
|
|
str = TerminateNumber(buf, str, &n);
|
|
if (str[0] == '-') {
|
|
// strtoull() will silently accept negative numbers and parse
|
|
// them. This module is more strict and treats them as errors.
|
|
return false;
|
|
}
|
|
char* end;
|
|
errno = 0;
|
|
uint64 r = strtoull(str, &end, radix);
|
|
if (end != str + n) return false; // Leftover junk
|
|
if (errno) return false;
|
|
if (dest == NULL) return true;
|
|
*(reinterpret_cast<uint64*>(dest)) = r;
|
|
return true;
|
|
}
|
|
#endif
|
|
|
|
// Parses the first n bytes of str as a floating-point number and, when dest
// is non-NULL, stores the result through it.
//
//   str      text to parse; it need not be NUL-terminated.
//   n        number of bytes of str to consider.
//   isfloat  true:  convert with strtof() and store a float;
//            false: convert with strtod() and store a double.
//   dest     float* or double* (matching isfloat), or NULL to validate only.
//
// Returns true only if the conversion consumed all n bytes and did not set
// errno.  Returns false for empty or negative-length input and for input of
// kMaxLength bytes or more.
static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
  // A negative length would turn into a huge size_t in memcpy() below, so it
  // is rejected together with the empty string.
  if (n <= 0) return false;
  static const int kMaxLength = 200;
  if (n >= kMaxLength) return false;  // Must leave room for the terminator.

  // strtof()/strtod() need a NUL-terminated string, so parse a local copy.
  char buf[kMaxLength];
  memcpy(buf, str, n);
  buf[n] = '\0';

  errno = 0;
  char* end;
  double r;
  if (isfloat) {
    r = strtof(buf, &end);
  } else {
    r = strtod(buf, &end);
  }
  if (end != buf + n) return false;   // Leftover junk
  if (errno) return false;
  if (dest == NULL) return true;
  if (isfloat) {
    // r came from strtof(), so narrowing back to float is exact.
    *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
  } else {
    *(reinterpret_cast<double*>(dest)) = r;
  }
  return true;
}
|
|
|
|
bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
|
|
return parse_double_float(str, n, false, dest);
|
|
}
|
|
|
|
bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
|
|
return parse_double_float(str, n, true, dest);
|
|
}
|
|
|
|
|
|
// Stamps out the four fixed-radix entry points for one integer type tag.
// For a tag T this defines RE2::Arg::parse_T, parse_T_hex, parse_T_octal and
// parse_T_cradix, each of which forwards to parse_T_radix with radix
// 10, 16, 8 and 0 respectively.
#define DEFINE_INTEGER_PARSERS(type_tag)                                     \
  /* Decimal. */                                                             \
  bool RE2::Arg::parse_##type_tag(const char* str, int n, void* dest) {      \
    return parse_##type_tag##_radix(str, n, dest, 10);                       \
  }                                                                          \
  /* Hexadecimal. */                                                         \
  bool RE2::Arg::parse_##type_tag##_hex(const char* str, int n,              \
                                        void* dest) {                        \
    return parse_##type_tag##_radix(str, n, dest, 16);                       \
  }                                                                          \
  /* Octal. */                                                               \
  bool RE2::Arg::parse_##type_tag##_octal(const char* str, int n,            \
                                          void* dest) {                      \
    return parse_##type_tag##_radix(str, n, dest, 8);                        \
  }                                                                          \
  /* Radix 0 is handed unchanged to the _radix parser. */                    \
  bool RE2::Arg::parse_##type_tag##_cradix(const char* str, int n,           \
                                           void* dest) {                     \
    return parse_##type_tag##_radix(str, n, dest, 0);                        \
  }
|
|
|
|
DEFINE_INTEGER_PARSERS(short);
|
|
DEFINE_INTEGER_PARSERS(ushort);
|
|
DEFINE_INTEGER_PARSERS(int);
|
|
DEFINE_INTEGER_PARSERS(uint);
|
|
DEFINE_INTEGER_PARSERS(long);
|
|
DEFINE_INTEGER_PARSERS(ulong);
|
|
DEFINE_INTEGER_PARSERS(longlong);
|
|
DEFINE_INTEGER_PARSERS(ulonglong);
|
|
|
|
#undef DEFINE_INTEGER_PARSERS
|
|
|
|
} // namespace re2
|