mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-05 15:21:43 +00:00
342 lines
8.1 KiB
C++
342 lines
8.1 KiB
C++
|
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
// Format a regular expression structure as a string.
|
||
|
// Tested by parse_test.cc
|
||
|
|
||
|
#include "util/util.h"
|
||
|
#include "re2/regexp.h"
|
||
|
#include "re2/walker-inl.h"
|
||
|
|
||
|
namespace re2 {
|
||
|
|
||
|
enum {
|
||
|
PrecAtom,
|
||
|
PrecUnary,
|
||
|
PrecConcat,
|
||
|
PrecAlternate,
|
||
|
PrecEmpty,
|
||
|
PrecParen,
|
||
|
PrecToplevel,
|
||
|
};
|
||
|
|
||
|
// Helper function. See description below.
|
||
|
static void AppendCCRange(string* t, Rune lo, Rune hi);
|
||
|
|
||
|
// Walker to generate string in s_.
|
||
|
// The arg pointers are actually integers giving the
|
||
|
// context precedence.
|
||
|
// The child_args are always NULL.
|
||
|
class ToStringWalker : public Regexp::Walker<int> {
|
||
|
public:
|
||
|
explicit ToStringWalker(string* t) : t_(t) {}
|
||
|
|
||
|
virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
|
||
|
virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
||
|
int* child_args, int nchild_args);
|
||
|
virtual int ShortVisit(Regexp* re, int parent_arg) {
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
private:
|
||
|
string* t_; // The string the walker appends to.
|
||
|
|
||
|
DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
|
||
|
};
|
||
|
|
||
|
string Regexp::ToString() {
|
||
|
string t;
|
||
|
ToStringWalker w(&t);
|
||
|
w.WalkExponential(this, PrecToplevel, 100000);
|
||
|
if (w.stopped_early())
|
||
|
t += " [truncated]";
|
||
|
return t;
|
||
|
}
|
||
|
|
||
|
#define ToString DontCallToString // Avoid accidental recursion.
|
||
|
|
||
|
// Visits re before children are processed.
|
||
|
// Appends ( if needed and passes new precedence to children.
|
||
|
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
||
|
int prec = parent_arg;
|
||
|
int nprec = PrecAtom;
|
||
|
|
||
|
switch (re->op()) {
|
||
|
case kRegexpNoMatch:
|
||
|
case kRegexpEmptyMatch:
|
||
|
case kRegexpLiteral:
|
||
|
case kRegexpAnyChar:
|
||
|
case kRegexpAnyByte:
|
||
|
case kRegexpBeginLine:
|
||
|
case kRegexpEndLine:
|
||
|
case kRegexpBeginText:
|
||
|
case kRegexpEndText:
|
||
|
case kRegexpWordBoundary:
|
||
|
case kRegexpNoWordBoundary:
|
||
|
case kRegexpCharClass:
|
||
|
case kRegexpHaveMatch:
|
||
|
nprec = PrecAtom;
|
||
|
break;
|
||
|
|
||
|
case kRegexpConcat:
|
||
|
case kRegexpLiteralString:
|
||
|
if (prec < PrecConcat)
|
||
|
t_->append("(?:");
|
||
|
nprec = PrecConcat;
|
||
|
break;
|
||
|
|
||
|
case kRegexpAlternate:
|
||
|
if (prec < PrecAlternate)
|
||
|
t_->append("(?:");
|
||
|
nprec = PrecAlternate;
|
||
|
break;
|
||
|
|
||
|
case kRegexpCapture:
|
||
|
t_->append("(");
|
||
|
if (re->name()) {
|
||
|
t_->append("?P<");
|
||
|
t_->append(*re->name());
|
||
|
t_->append(">");
|
||
|
}
|
||
|
nprec = PrecParen;
|
||
|
break;
|
||
|
|
||
|
case kRegexpStar:
|
||
|
case kRegexpPlus:
|
||
|
case kRegexpQuest:
|
||
|
case kRegexpRepeat:
|
||
|
if (prec < PrecUnary)
|
||
|
t_->append("(?:");
|
||
|
// The subprecedence here is PrecAtom instead of PrecUnary
|
||
|
// because PCRE treats two unary ops in a row as a parse error.
|
||
|
nprec = PrecAtom;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
return nprec;
|
||
|
}
|
||
|
|
||
|
static void AppendLiteral(string *t, Rune r, bool foldcase) {
|
||
|
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
|
||
|
t->append(1, '\\');
|
||
|
t->append(1, r);
|
||
|
} else if (foldcase && 'a' <= r && r <= 'z') {
|
||
|
if ('a' <= r && r <= 'z')
|
||
|
r += 'A' - 'a';
|
||
|
t->append(1, '[');
|
||
|
t->append(1, r);
|
||
|
t->append(1, r + 'a' - 'A');
|
||
|
t->append(1, ']');
|
||
|
} else {
|
||
|
AppendCCRange(t, r, r);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Visits re after children are processed.
|
||
|
// For childless regexps, all the work is done here.
|
||
|
// For regexps with children, append any unary suffixes or ).
|
||
|
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
||
|
int* child_args, int nchild_args) {
|
||
|
int prec = parent_arg;
|
||
|
switch (re->op()) {
|
||
|
case kRegexpNoMatch:
|
||
|
// There's no simple symbol for "no match", but
|
||
|
// [^0-Runemax] excludes everything.
|
||
|
t_->append("[^\\x00-\\x{10ffff}]");
|
||
|
break;
|
||
|
|
||
|
case kRegexpEmptyMatch:
|
||
|
// Append (?:) to make empty string visible,
|
||
|
// unless this is already being parenthesized.
|
||
|
if (prec < PrecEmpty)
|
||
|
t_->append("(?:)");
|
||
|
break;
|
||
|
|
||
|
case kRegexpLiteral:
|
||
|
AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
|
||
|
break;
|
||
|
|
||
|
case kRegexpLiteralString:
|
||
|
for (int i = 0; i < re->nrunes(); i++)
|
||
|
AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
|
||
|
if (prec < PrecConcat)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpConcat:
|
||
|
if (prec < PrecConcat)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpAlternate:
|
||
|
// Clumsy but workable: the children all appended |
|
||
|
// at the end of their strings, so just remove the last one.
|
||
|
if ((*t_)[t_->size()-1] == '|')
|
||
|
t_->erase(t_->size()-1);
|
||
|
else
|
||
|
LOG(DFATAL) << "Bad final char: " << t_;
|
||
|
if (prec < PrecAlternate)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpStar:
|
||
|
t_->append("*");
|
||
|
if (re->parse_flags() & Regexp::NonGreedy)
|
||
|
t_->append("?");
|
||
|
if (prec < PrecUnary)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpPlus:
|
||
|
t_->append("+");
|
||
|
if (re->parse_flags() & Regexp::NonGreedy)
|
||
|
t_->append("?");
|
||
|
if (prec < PrecUnary)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpQuest:
|
||
|
t_->append("?");
|
||
|
if (re->parse_flags() & Regexp::NonGreedy)
|
||
|
t_->append("?");
|
||
|
if (prec < PrecUnary)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpRepeat:
|
||
|
if (re->max() == -1)
|
||
|
t_->append(StringPrintf("{%d,}", re->min()));
|
||
|
else if (re->min() == re->max())
|
||
|
t_->append(StringPrintf("{%d}", re->min()));
|
||
|
else
|
||
|
t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
|
||
|
if (re->parse_flags() & Regexp::NonGreedy)
|
||
|
t_->append("?");
|
||
|
if (prec < PrecUnary)
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpAnyChar:
|
||
|
t_->append(".");
|
||
|
break;
|
||
|
|
||
|
case kRegexpAnyByte:
|
||
|
t_->append("\\C");
|
||
|
break;
|
||
|
|
||
|
case kRegexpBeginLine:
|
||
|
t_->append("^");
|
||
|
break;
|
||
|
|
||
|
case kRegexpEndLine:
|
||
|
t_->append("$");
|
||
|
break;
|
||
|
|
||
|
case kRegexpBeginText:
|
||
|
t_->append("(?-m:^)");
|
||
|
break;
|
||
|
|
||
|
case kRegexpEndText:
|
||
|
if (re->parse_flags() & Regexp::WasDollar)
|
||
|
t_->append("(?-m:$)");
|
||
|
else
|
||
|
t_->append("\\z");
|
||
|
break;
|
||
|
|
||
|
case kRegexpWordBoundary:
|
||
|
t_->append("\\b");
|
||
|
break;
|
||
|
|
||
|
case kRegexpNoWordBoundary:
|
||
|
t_->append("\\B");
|
||
|
break;
|
||
|
|
||
|
case kRegexpCharClass: {
|
||
|
if (re->cc()->size() == 0) {
|
||
|
t_->append("[^\\x00-\\x{10ffff}]");
|
||
|
break;
|
||
|
}
|
||
|
t_->append("[");
|
||
|
// Heuristic: show class as negated if it contains the
|
||
|
// non-character 0xFFFE.
|
||
|
CharClass* cc = re->cc();
|
||
|
if (cc->Contains(0xFFFE)) {
|
||
|
cc = cc->Negate();
|
||
|
t_->append("^");
|
||
|
}
|
||
|
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
|
||
|
AppendCCRange(t_, i->lo, i->hi);
|
||
|
if (cc != re->cc())
|
||
|
cc->Delete();
|
||
|
t_->append("]");
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case kRegexpCapture:
|
||
|
t_->append(")");
|
||
|
break;
|
||
|
|
||
|
case kRegexpHaveMatch:
|
||
|
// There's no syntax accepted by the parser to generate
|
||
|
// this node (it is generated by RE2::Set) so make something
|
||
|
// up that is readable but won't compile.
|
||
|
t_->append("(?HaveMatch:%d)", re->match_id());
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
// If the parent is an alternation, append the | for it.
|
||
|
if (prec == PrecAlternate)
|
||
|
t_->append("|");
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
// Appends a rune for use in a character class to the string t.
|
||
|
static void AppendCCChar(string* t, Rune r) {
|
||
|
if (0x20 <= r && r <= 0x7E) {
|
||
|
if (strchr("[]^-\\", r))
|
||
|
t->append("\\");
|
||
|
t->append(1, r);
|
||
|
return;
|
||
|
}
|
||
|
switch (r) {
|
||
|
default:
|
||
|
break;
|
||
|
|
||
|
case '\r':
|
||
|
t->append("\\r");
|
||
|
return;
|
||
|
|
||
|
case '\t':
|
||
|
t->append("\\t");
|
||
|
return;
|
||
|
|
||
|
case '\n':
|
||
|
t->append("\\n");
|
||
|
return;
|
||
|
|
||
|
case '\f':
|
||
|
t->append("\\f");
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if (r < 0x100) {
|
||
|
StringAppendF(t, "\\x%02x", static_cast<int>(r));
|
||
|
return;
|
||
|
}
|
||
|
StringAppendF(t, "\\x{%x}", static_cast<int>(r));
|
||
|
}
|
||
|
|
||
|
static void AppendCCRange(string* t, Rune lo, Rune hi) {
|
||
|
if (lo > hi)
|
||
|
return;
|
||
|
AppendCCChar(t, lo);
|
||
|
if (lo < hi) {
|
||
|
t->append("-");
|
||
|
AppendCCChar(t, hi);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
} // namespace re2
|