mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-14 11:33:46 +00:00
186 lines
5.8 KiB
C++
186 lines
5.8 KiB
C++
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Determine whether this library should match PCRE exactly
|
|
// for a particular Regexp. (If so, the testing framework can
|
|
// check that it does.)
|
|
//
|
|
// This library matches PCRE except in these cases:
|
|
// * the regexp contains a repetition of an empty string,
|
|
// like (a*)* or (a*)+. In this case, PCRE will treat
|
|
// the repetition sequence as ending with an empty string,
|
|
// while this library does not.
|
|
// * Perl and PCRE differ on whether \v matches \n.
|
|
// For historical reasons, this library implements the Perl behavior.
|
|
// * Perl and PCRE allow $ in one-line mode to match either the very
|
|
// end of the text or just before a \n at the end of the text.
|
|
// This library requires it to match only the end of the text.
|
|
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
|
// match the end of the text if the last character is a \n.
|
|
// This library does allow it.
|
|
//
|
|
// Regexp::MimicsPCRE checks for any of these conditions.
|
|
|
|
#include "util/util.h"
|
|
#include "re2/regexp.h"
|
|
#include "re2/walker-inl.h"
|
|
|
|
namespace re2 {
|
|
|
|
// Returns whether re might match an empty string.
|
|
static bool CanBeEmptyString(Regexp *re);
|
|
|
|
// Walker class to compute whether library handles a regexp
|
|
// exactly as PCRE would. See comment at top for conditions.
|
|
|
|
class PCREWalker : public Regexp::Walker<bool> {
|
|
public:
|
|
PCREWalker() {}
|
|
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
|
|
int nchild_args);
|
|
|
|
bool ShortVisit(Regexp* re, bool a) {
|
|
// Should never be called: we use Walk not WalkExponential.
|
|
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
|
return a;
|
|
}
|
|
};
|
|
|
|
// Called after visiting each of re's children and accumulating
|
|
// the return values in child_args. So child_args contains whether
|
|
// this library mimics PCRE for those subexpressions.
|
|
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
|
bool* child_args, int nchild_args) {
|
|
// If children failed, so do we.
|
|
for (int i = 0; i < nchild_args; i++)
|
|
if (!child_args[i])
|
|
return false;
|
|
|
|
// Otherwise look for other reasons to fail.
|
|
switch (re->op()) {
|
|
// Look for repeated empty string.
|
|
case kRegexpStar:
|
|
case kRegexpPlus:
|
|
case kRegexpQuest:
|
|
if (CanBeEmptyString(re->sub()[0]))
|
|
return false;
|
|
break;
|
|
case kRegexpRepeat:
|
|
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
|
|
return false;
|
|
break;
|
|
|
|
// Look for \v
|
|
case kRegexpLiteral:
|
|
if (re->rune() == '\v')
|
|
return false;
|
|
break;
|
|
|
|
// Look for $ in single-line mode.
|
|
case kRegexpEndText:
|
|
case kRegexpEmptyMatch:
|
|
if (re->parse_flags() & Regexp::WasDollar)
|
|
return false;
|
|
break;
|
|
|
|
// Look for ^ in multi-line mode.
|
|
case kRegexpBeginLine:
|
|
// No condition: in single-line mode ^ becomes kRegexpBeginText.
|
|
return false;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
// Not proven guilty.
|
|
return true;
|
|
}
|
|
|
|
// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
|
bool Regexp::MimicsPCRE() {
|
|
PCREWalker w;
|
|
return w.Walk(this, true);
|
|
}
|
|
|
|
|
|
// Walker class to compute whether a Regexp can match an empty string.
|
|
// It is okay to overestimate. For example, \b\B cannot match an empty
|
|
// string, because \b and \B are mutually exclusive, but this isn't
|
|
// that smart and will say it can. Spurious empty strings
|
|
// will reduce the number of regexps we sanity check against PCRE,
|
|
// but they won't break anything.
|
|
|
|
class EmptyStringWalker : public Regexp::Walker<bool> {
|
|
public:
|
|
EmptyStringWalker() { }
|
|
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
|
bool* child_args, int nchild_args);
|
|
|
|
bool ShortVisit(Regexp* re, bool a) {
|
|
// Should never be called: we use Walk not WalkExponential.
|
|
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
|
return a;
|
|
}
|
|
|
|
private:
|
|
DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
|
|
};
|
|
|
|
// Called after visiting re's children. child_args contains the return
|
|
// value from each of the children's PostVisits (i.e., whether each child
|
|
// can match an empty string). Returns whether this clause can match an
|
|
// empty string.
|
|
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
|
bool* child_args, int nchild_args) {
|
|
switch (re->op()) {
|
|
case kRegexpNoMatch: // never empty
|
|
case kRegexpLiteral:
|
|
case kRegexpAnyChar:
|
|
case kRegexpAnyByte:
|
|
case kRegexpCharClass:
|
|
case kRegexpLiteralString:
|
|
return false;
|
|
|
|
case kRegexpEmptyMatch: // always empty
|
|
case kRegexpBeginLine: // always empty, when they match
|
|
case kRegexpEndLine:
|
|
case kRegexpNoWordBoundary:
|
|
case kRegexpWordBoundary:
|
|
case kRegexpBeginText:
|
|
case kRegexpEndText:
|
|
case kRegexpStar: // can always be empty
|
|
case kRegexpQuest:
|
|
case kRegexpHaveMatch:
|
|
return true;
|
|
|
|
case kRegexpConcat: // can be empty if all children can
|
|
for (int i = 0; i < nchild_args; i++)
|
|
if (!child_args[i])
|
|
return false;
|
|
return true;
|
|
|
|
case kRegexpAlternate: // can be empty if any child can
|
|
for (int i = 0; i < nchild_args; i++)
|
|
if (child_args[i])
|
|
return true;
|
|
return false;
|
|
|
|
case kRegexpPlus: // can be empty if the child can
|
|
case kRegexpCapture:
|
|
return child_args[0];
|
|
|
|
case kRegexpRepeat: // can be empty if child can or is x{0}
|
|
return child_args[0] || re->min() == 0;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Returns whether re can match an empty string.
|
|
static bool CanBeEmptyString(Regexp* re) {
|
|
EmptyStringWalker w;
|
|
return w.Walk(re, true);
|
|
}
|
|
|
|
} // namespace re2
|