mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
contrib/re2 as submodule
This commit is contained in:
parent
98c52ede21
commit
282448e67d
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -28,3 +28,6 @@
|
|||||||
[submodule "contrib/double-conversion"]
|
[submodule "contrib/double-conversion"]
|
||||||
path = contrib/double-conversion
|
path = contrib/double-conversion
|
||||||
url = https://github.com/google/double-conversion.git
|
url = https://github.com/google/double-conversion.git
|
||||||
|
[submodule "contrib/re2"]
|
||||||
|
path = contrib/re2
|
||||||
|
url = https://github.com/google/re2.git
|
||||||
|
4
contrib/CMakeLists.txt
vendored
4
contrib/CMakeLists.txt
vendored
@ -13,7 +13,9 @@ if (USE_INTERNAL_ZSTD_LIBRARY)
|
|||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (USE_INTERNAL_RE2_LIBRARY)
|
if (USE_INTERNAL_RE2_LIBRARY)
|
||||||
add_subdirectory (libre2)
|
set(RE2_BUILD_TESTING 0 CACHE INTERNAL "")
|
||||||
|
add_subdirectory (re2)
|
||||||
|
add_subdirectory (re2_st)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY)
|
if (USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY)
|
||||||
|
@ -1,13 +0,0 @@
|
|||||||
# This is the official list of RE2 authors for copyright purposes.
|
|
||||||
# This file is distinct from the CONTRIBUTORS files.
|
|
||||||
# See the latter for an explanation.
|
|
||||||
|
|
||||||
# Names should be added to this file as
|
|
||||||
# Name or Organization <email address>
|
|
||||||
# The email address is not required for organizations.
|
|
||||||
|
|
||||||
# Please keep the list sorted.
|
|
||||||
|
|
||||||
Google Inc.
|
|
||||||
Samsung Electronics
|
|
||||||
Stefano Rivera <stefano.rivera@gmail.com>
|
|
@ -1,39 +0,0 @@
|
|||||||
# This is the official list of people who can contribute
|
|
||||||
# (and typically have contributed) code to the RE2 repository.
|
|
||||||
# The AUTHORS file lists the copyright holders; this file
|
|
||||||
# lists people. For example, Google employees are listed here
|
|
||||||
# but not in AUTHORS, because Google holds the copyright.
|
|
||||||
#
|
|
||||||
# The submission process automatically checks to make sure
|
|
||||||
# that people submitting code are listed in this file (by email address).
|
|
||||||
#
|
|
||||||
# Names should be added to this file only after verifying that
|
|
||||||
# the individual or the individual's organization has agreed to
|
|
||||||
# the appropriate Contributor License Agreement, found here:
|
|
||||||
#
|
|
||||||
# http://code.google.com/legal/individual-cla-v1.0.html
|
|
||||||
# http://code.google.com/legal/corporate-cla-v1.0.html
|
|
||||||
#
|
|
||||||
# The agreement for individuals can be filled out on the web.
|
|
||||||
#
|
|
||||||
# When adding J Random Contributor's name to this file,
|
|
||||||
# either J's name or J's organization's name should be
|
|
||||||
# added to the AUTHORS file, depending on whether the
|
|
||||||
# individual or corporate CLA was used.
|
|
||||||
|
|
||||||
# Names should be added to this file like so:
|
|
||||||
# Name <email address>
|
|
||||||
|
|
||||||
# Please keep the list sorted.
|
|
||||||
|
|
||||||
Dominic Battré <battre@chromium.org>
|
|
||||||
Dmitriy Vyukov <dvyukov@google.com>
|
|
||||||
John Millikin <jmillikin@gmail.com>
|
|
||||||
Mike Nazarewicz <mpn@google.com>
|
|
||||||
Pawel Hajdan <phajdan.jr@gmail.com>
|
|
||||||
Rob Pike <r@google.com>
|
|
||||||
Russ Cox <rsc@swtch.com>
|
|
||||||
Sanjay Ghemawat <sanjay@google.com>
|
|
||||||
Stefano Rivera <stefano.rivera@gmail.com>
|
|
||||||
Srinivasan Venkatachary <vsri@google.com>
|
|
||||||
Viatcheslav Ostapenko <sl.ostapenko@samsung.com>
|
|
@ -1,27 +0,0 @@
|
|||||||
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
|
|
||||||
//
|
|
||||||
// Redistribution and use in source and binary forms, with or without
|
|
||||||
// modification, are permitted provided that the following conditions are
|
|
||||||
// met:
|
|
||||||
//
|
|
||||||
// * Redistributions of source code must retain the above copyright
|
|
||||||
// notice, this list of conditions and the following disclaimer.
|
|
||||||
// * Redistributions in binary form must reproduce the above
|
|
||||||
// copyright notice, this list of conditions and the following disclaimer
|
|
||||||
// in the documentation and/or other materials provided with the
|
|
||||||
// distribution.
|
|
||||||
// * Neither the name of Google Inc. nor the names of its
|
|
||||||
// contributors may be used to endorse or promote products derived from
|
|
||||||
// this software without specific prior written permission.
|
|
||||||
//
|
|
||||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
||||||
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
||||||
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
||||||
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
||||||
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
||||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
||||||
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
||||||
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
||||||
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
||||||
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
||||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1 +0,0 @@
|
|||||||
https://github.com/google/re2/tree/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0
|
|
@ -1,113 +0,0 @@
|
|||||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_BITMAP256_H_
|
|
||||||
#define RE2_BITMAP256_H_
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#include <intrin.h>
|
|
||||||
#endif
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
class Bitmap256 {
|
|
||||||
public:
|
|
||||||
Bitmap256() {
|
|
||||||
memset(words_, 0, sizeof words_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tests the bit with index c.
|
|
||||||
bool Test(int c) const {
|
|
||||||
DCHECK_GE(c, 0);
|
|
||||||
DCHECK_LE(c, 255);
|
|
||||||
|
|
||||||
return (words_[c / 64] & (1ULL << (c % 64))) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sets the bit with index c.
|
|
||||||
void Set(int c) {
|
|
||||||
DCHECK_GE(c, 0);
|
|
||||||
DCHECK_LE(c, 255);
|
|
||||||
|
|
||||||
words_[c / 64] |= (1ULL << (c % 64));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finds the next non-zero bit with index >= c.
|
|
||||||
// Returns -1 if no such bit exists.
|
|
||||||
int FindNextSetBit(int c) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
// Finds the least significant non-zero bit in n.
|
|
||||||
static int FindLSBSet(uint64_t n) {
|
|
||||||
DCHECK_NE(n, 0);
|
|
||||||
|
|
||||||
#if defined(__GNUC__)
|
|
||||||
return __builtin_ctzll(n);
|
|
||||||
#elif defined(_MSC_VER) && defined(_M_X64)
|
|
||||||
unsigned long c;
|
|
||||||
_BitScanForward64(&c, n);
|
|
||||||
return static_cast<int>(c);
|
|
||||||
#elif defined(_MSC_VER) && defined(_M_IX86)
|
|
||||||
unsigned long c;
|
|
||||||
if (static_cast<uint32_t>(n) != 0) {
|
|
||||||
_BitScanForward(&c, static_cast<uint32_t>(n));
|
|
||||||
return static_cast<int>(c);
|
|
||||||
} else {
|
|
||||||
_BitScanForward(&c, static_cast<uint32_t>(n >> 32));
|
|
||||||
return static_cast<int>(c) + 32;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
int c = 63;
|
|
||||||
for (int shift = 1 << 5; shift != 0; shift >>= 1) {
|
|
||||||
uint64_t word = n << shift;
|
|
||||||
if (word != 0) {
|
|
||||||
n = word;
|
|
||||||
c -= shift;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t words_[4];
|
|
||||||
};
|
|
||||||
|
|
||||||
int Bitmap256::FindNextSetBit(int c) const {
|
|
||||||
DCHECK_GE(c, 0);
|
|
||||||
DCHECK_LE(c, 255);
|
|
||||||
|
|
||||||
// Check the word that contains the bit. Mask out any lower bits.
|
|
||||||
int i = c / 64;
|
|
||||||
uint64_t word = words_[i] & (~0ULL << (c % 64));
|
|
||||||
if (word != 0)
|
|
||||||
return (i * 64) + FindLSBSet(word);
|
|
||||||
|
|
||||||
// Check any following words.
|
|
||||||
i++;
|
|
||||||
switch (i) {
|
|
||||||
case 1:
|
|
||||||
if (words_[1] != 0)
|
|
||||||
return (1 * 64) + FindLSBSet(words_[1]);
|
|
||||||
FALLTHROUGH_INTENDED;
|
|
||||||
case 2:
|
|
||||||
if (words_[2] != 0)
|
|
||||||
return (2 * 64) + FindLSBSet(words_[2]);
|
|
||||||
FALLTHROUGH_INTENDED;
|
|
||||||
case 3:
|
|
||||||
if (words_[3] != 0)
|
|
||||||
return (3 * 64) + FindLSBSet(words_[3]);
|
|
||||||
FALLTHROUGH_INTENDED;
|
|
||||||
default:
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_BITMAP256_H_
|
|
@ -1,401 +0,0 @@
|
|||||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
|
||||||
|
|
||||||
// Prog::SearchBitState is a regular expression search with submatch
|
|
||||||
// tracking for small regular expressions and texts. Like
|
|
||||||
// testing/backtrack.cc, it allocates a bit vector with (length of
|
|
||||||
// text) * (length of prog) bits, to make sure it never explores the
|
|
||||||
// same (character position, instruction) state multiple times. This
|
|
||||||
// limits the search to run in time linear in the length of the text.
|
|
||||||
//
|
|
||||||
// Unlike testing/backtrack.cc, SearchBitState is not recursive
|
|
||||||
// on the text.
|
|
||||||
//
|
|
||||||
// SearchBitState is a fast replacement for the NFA code on small
|
|
||||||
// regexps and texts when SearchOnePass cannot be used.
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "re2/prog.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// One entry of the BitState work stack.
struct Job {
  int id;         // instruction index to resume at
  int arg;        // 0 for a fresh visit; non-zero when continuing a visit
  const char* p;  // pointer stored with the job (text position, or a saved
                  // capture value for Capture continuations)
};
|
|
||||||
|
|
||||||
class BitState {
|
|
||||||
public:
|
|
||||||
explicit BitState(Prog* prog);
|
|
||||||
~BitState();
|
|
||||||
|
|
||||||
// The usual Search prototype.
|
|
||||||
// Can only call Search once per BitState.
|
|
||||||
bool Search(const StringPiece& text, const StringPiece& context,
|
|
||||||
bool anchored, bool longest,
|
|
||||||
StringPiece* submatch, int nsubmatch);
|
|
||||||
|
|
||||||
private:
|
|
||||||
inline bool ShouldVisit(int id, const char* p);
|
|
||||||
void Push(int id, const char* p, int arg);
|
|
||||||
bool GrowStack();
|
|
||||||
bool TrySearch(int id, const char* p);
|
|
||||||
|
|
||||||
// Search parameters
|
|
||||||
Prog* prog_; // program being run
|
|
||||||
StringPiece text_; // text being searched
|
|
||||||
StringPiece context_; // greater context of text being searched
|
|
||||||
bool anchored_; // whether search is anchored at text.begin()
|
|
||||||
bool longest_; // whether search wants leftmost-longest match
|
|
||||||
bool endmatch_; // whether match must end at text.end()
|
|
||||||
StringPiece *submatch_; // submatches to fill in
|
|
||||||
int nsubmatch_; // # of submatches to fill in
|
|
||||||
|
|
||||||
// Search state
|
|
||||||
const char** cap_; // capture registers
|
|
||||||
int ncap_;
|
|
||||||
|
|
||||||
static const int VisitedBits = 32;
|
|
||||||
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
|
||||||
size_t nvisited_; // # of words in bitmap
|
|
||||||
|
|
||||||
Job *job_; // stack of text positions to explore
|
|
||||||
int njob_;
|
|
||||||
int maxjob_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Records the program to run and leaves every other field zeroed or null;
// all scratch buffers are allocated later by Search().
BitState::BitState(Prog* prog)
    : prog_(prog),
      anchored_(false), longest_(false), endmatch_(false),
      submatch_(nullptr), nsubmatch_(0),
      cap_(nullptr), ncap_(0),
      visited_(nullptr), nvisited_(0),
      job_(nullptr), njob_(0), maxjob_(0) {
}
|
|
||||||
|
|
||||||
// Frees the scratch buffers owned by this object (each may still be null).
BitState::~BitState() {
  delete[] cap_;
  delete[] job_;
  delete[] visited_;
}
|
|
||||||
|
|
||||||
// Should the search visit the pair ip, p?
|
|
||||||
// If so, remember that it was visited so that the next time,
|
|
||||||
// we don't repeat the visit.
|
|
||||||
bool BitState::ShouldVisit(int id, const char* p) {
|
|
||||||
size_t n = id * (text_.size() + 1) + (p - text_.begin());
|
|
||||||
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
|
||||||
return false;
|
|
||||||
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Grow the stack.
|
|
||||||
bool BitState::GrowStack() {
|
|
||||||
maxjob_ *= 2;
|
|
||||||
Job* newjob = new Job[maxjob_];
|
|
||||||
memmove(newjob, job_, njob_*sizeof job_[0]);
|
|
||||||
delete[] job_;
|
|
||||||
job_ = newjob;
|
|
||||||
if (njob_ >= maxjob_) {
|
|
||||||
LOG(DFATAL) << "Job stack overflow.";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
|
|
||||||
void BitState::Push(int id, const char* p, int arg) {
|
|
||||||
if (njob_ >= maxjob_) {
|
|
||||||
if (!GrowStack())
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
int op = prog_->inst(id)->opcode();
|
|
||||||
if (op == kInstFail)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Only check ShouldVisit when arg == 0.
|
|
||||||
// When arg > 0, we are continuing a previous visit.
|
|
||||||
if (arg == 0 && !ShouldVisit(id, p))
|
|
||||||
return;
|
|
||||||
|
|
||||||
Job* j = &job_[njob_++];
|
|
||||||
j->id = id;
|
|
||||||
j->p = p;
|
|
||||||
j->arg = arg;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Try a search from instruction id0 in state p0.
|
|
||||||
// Return whether it succeeded.
|
|
||||||
bool BitState::TrySearch(int id0, const char* p0) {
|
|
||||||
bool matched = false;
|
|
||||||
bool inaltmatch = false;
|
|
||||||
const char* end = text_.end();
|
|
||||||
njob_ = 0;
|
|
||||||
Push(id0, p0, 0);
|
|
||||||
while (njob_ > 0) {
|
|
||||||
// Pop job off stack.
|
|
||||||
--njob_;
|
|
||||||
int id = job_[njob_].id;
|
|
||||||
const char* p = job_[njob_].p;
|
|
||||||
int arg = job_[njob_].arg;
|
|
||||||
|
|
||||||
// Optimization: rather than push and pop,
|
|
||||||
// code that is going to Push and continue
|
|
||||||
// the loop simply updates ip, p, and arg
|
|
||||||
// and jumps to CheckAndLoop. We have to
|
|
||||||
// do the ShouldVisit check that Push
|
|
||||||
// would have, but we avoid the stack
|
|
||||||
// manipulation.
|
|
||||||
if (0) {
|
|
||||||
Next:
|
|
||||||
// If the Match of a non-greedy AltMatch failed,
|
|
||||||
// we stop ourselves from trying the ByteRange,
|
|
||||||
// which would steer us off the short circuit.
|
|
||||||
if (prog_->inst(id)->last() || inaltmatch)
|
|
||||||
continue;
|
|
||||||
id++;
|
|
||||||
|
|
||||||
CheckAndLoop:
|
|
||||||
if (!ShouldVisit(id, p))
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Visit ip, p.
|
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
|
||||||
return false;
|
|
||||||
|
|
||||||
case kInstFail:
|
|
||||||
continue;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
switch (arg) {
|
|
||||||
case 0:
|
|
||||||
inaltmatch = true;
|
|
||||||
Push(id, p, 1); // come back when we're done
|
|
||||||
|
|
||||||
// One opcode is ByteRange; the other leads to Match
|
|
||||||
// (possibly via Nop or Capture).
|
|
||||||
if (ip->greedy(prog_)) {
|
|
||||||
// out1 is the match
|
|
||||||
Push(ip->out1(), p, 0);
|
|
||||||
id = ip->out1();
|
|
||||||
p = end;
|
|
||||||
goto CheckAndLoop;
|
|
||||||
}
|
|
||||||
// out is the match - non-greedy
|
|
||||||
Push(ip->out(), end, 0);
|
|
||||||
id = ip->out();
|
|
||||||
goto CheckAndLoop;
|
|
||||||
|
|
||||||
case 1:
|
|
||||||
inaltmatch = false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
|
|
||||||
continue;
|
|
||||||
|
|
||||||
case kInstByteRange: {
|
|
||||||
int c = -1;
|
|
||||||
if (p < end)
|
|
||||||
c = *p & 0xFF;
|
|
||||||
if (!ip->Matches(c))
|
|
||||||
goto Next;
|
|
||||||
|
|
||||||
if (!ip->last())
|
|
||||||
Push(id+1, p, 0); // try the next when we're done
|
|
||||||
id = ip->out();
|
|
||||||
p++;
|
|
||||||
goto CheckAndLoop;
|
|
||||||
}
|
|
||||||
|
|
||||||
case kInstCapture:
|
|
||||||
switch (arg) {
|
|
||||||
case 0:
|
|
||||||
if (!ip->last())
|
|
||||||
Push(id+1, p, 0); // try the next when we're done
|
|
||||||
|
|
||||||
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
|
||||||
// Capture p to register, but save old value.
|
|
||||||
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
|
||||||
cap_[ip->cap()] = p;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Continue on.
|
|
||||||
id = ip->out();
|
|
||||||
goto CheckAndLoop;
|
|
||||||
|
|
||||||
case 1:
|
|
||||||
// Finished ip->out(); restore the old value.
|
|
||||||
cap_[ip->cap()] = p;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
|
||||||
continue;
|
|
||||||
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
|
||||||
goto Next;
|
|
||||||
|
|
||||||
if (!ip->last())
|
|
||||||
Push(id+1, p, 0); // try the next when we're done
|
|
||||||
id = ip->out();
|
|
||||||
goto CheckAndLoop;
|
|
||||||
|
|
||||||
case kInstNop:
|
|
||||||
if (!ip->last())
|
|
||||||
Push(id+1, p, 0); // try the next when we're done
|
|
||||||
id = ip->out();
|
|
||||||
goto CheckAndLoop;
|
|
||||||
|
|
||||||
case kInstMatch: {
|
|
||||||
if (endmatch_ && p != text_.end())
|
|
||||||
goto Next;
|
|
||||||
|
|
||||||
// We found a match. If the caller doesn't care
|
|
||||||
// where the match is, no point going further.
|
|
||||||
if (nsubmatch_ == 0)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// Record best match so far.
|
|
||||||
// Only need to check end point, because this entire
|
|
||||||
// call is only considering one start position.
|
|
||||||
matched = true;
|
|
||||||
cap_[1] = p;
|
|
||||||
if (submatch_[0].data() == NULL ||
|
|
||||||
(longest_ && p > submatch_[0].end())) {
|
|
||||||
for (int i = 0; i < nsubmatch_; i++)
|
|
||||||
submatch_[i] =
|
|
||||||
StringPiece(cap_[2 * i],
|
|
||||||
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
|
|
||||||
}
|
|
||||||
|
|
||||||
// If going for first match, we're done.
|
|
||||||
if (!longest_)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// If we used the entire text, no longer match is possible.
|
|
||||||
if (p == text_.end())
|
|
||||||
return true;
|
|
||||||
|
|
||||||
// Otherwise, continue on in hope of a longer match.
|
|
||||||
goto Next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return matched;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Search text (within context) for prog_.
|
|
||||||
bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
|
||||||
bool anchored, bool longest,
|
|
||||||
StringPiece* submatch, int nsubmatch) {
|
|
||||||
// Search parameters.
|
|
||||||
text_ = text;
|
|
||||||
context_ = context;
|
|
||||||
if (context_.begin() == NULL)
|
|
||||||
context_ = text;
|
|
||||||
if (prog_->anchor_start() && context_.begin() != text.begin())
|
|
||||||
return false;
|
|
||||||
if (prog_->anchor_end() && context_.end() != text.end())
|
|
||||||
return false;
|
|
||||||
anchored_ = anchored || prog_->anchor_start();
|
|
||||||
longest_ = longest || prog_->anchor_end();
|
|
||||||
endmatch_ = prog_->anchor_end();
|
|
||||||
submatch_ = submatch;
|
|
||||||
nsubmatch_ = nsubmatch;
|
|
||||||
for (int i = 0; i < nsubmatch_; i++)
|
|
||||||
submatch_[i] = StringPiece();
|
|
||||||
|
|
||||||
// Allocate scratch space.
|
|
||||||
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
|
||||||
visited_ = new uint32_t[nvisited_];
|
|
||||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
|
||||||
|
|
||||||
ncap_ = 2*nsubmatch;
|
|
||||||
if (ncap_ < 2)
|
|
||||||
ncap_ = 2;
|
|
||||||
cap_ = new const char*[ncap_];
|
|
||||||
memset(cap_, 0, ncap_*sizeof cap_[0]);
|
|
||||||
|
|
||||||
maxjob_ = 256;
|
|
||||||
job_ = new Job[maxjob_];
|
|
||||||
|
|
||||||
// Anchored search must start at text.begin().
|
|
||||||
if (anchored_) {
|
|
||||||
cap_[0] = text.begin();
|
|
||||||
return TrySearch(prog_->start(), text.begin());
|
|
||||||
}
|
|
||||||
|
|
||||||
// Unanchored search, starting from each possible text position.
|
|
||||||
// Notice that we have to try the empty string at the end of
|
|
||||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
|
||||||
// This looks like it's quadratic in the size of the text,
|
|
||||||
// but we are not clearing visited_ between calls to TrySearch,
|
|
||||||
// so no work is duplicated and it ends up still being linear.
|
|
||||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
|
||||||
// Try to use memchr to find the first byte quickly.
|
|
||||||
int fb = prog_->first_byte();
|
|
||||||
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
|
|
||||||
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
|
|
||||||
if (p == NULL)
|
|
||||||
p = text.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
cap_[0] = p;
|
|
||||||
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bit-state search.
|
|
||||||
bool Prog::SearchBitState(const StringPiece& text,
|
|
||||||
const StringPiece& context,
|
|
||||||
Anchor anchor,
|
|
||||||
MatchKind kind,
|
|
||||||
StringPiece* match,
|
|
||||||
int nmatch) {
|
|
||||||
// If full match, we ask for an anchored longest match
|
|
||||||
// and then check that match[0] == text.
|
|
||||||
// So make sure match[0] exists.
|
|
||||||
StringPiece sp0;
|
|
||||||
if (kind == kFullMatch) {
|
|
||||||
anchor = kAnchored;
|
|
||||||
if (nmatch < 1) {
|
|
||||||
match = &sp0;
|
|
||||||
nmatch = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run the search.
|
|
||||||
BitState b(this);
|
|
||||||
bool anchored = anchor == kAnchored;
|
|
||||||
bool longest = kind != kFirstMatch;
|
|
||||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
|
||||||
return false;
|
|
||||||
if (kind == kFullMatch && match[0].end() != text.end())
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,121 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "re2/filtered_re2.h"
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "re2/prefilter.h"
|
|
||||||
#include "re2/prefilter_tree.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Creates an empty, not-yet-compiled filter with a default-constructed
// PrefilterTree.
FilteredRE2::FilteredRE2() {
  compiled_ = false;
  prefilter_tree_ = new PrefilterTree();
}
|
|
||||||
|
|
||||||
// Creates an empty, not-yet-compiled filter; min_atom_len is handed to the
// PrefilterTree constructor.
FilteredRE2::FilteredRE2(int min_atom_len) {
  compiled_ = false;
  prefilter_tree_ = new PrefilterTree(min_atom_len);
}
|
|
||||||
|
|
||||||
// Destroys every RE2 object created by Add(), then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (RE2* re : re2_vec_)
    delete re;
  delete prefilter_tree_;
}
|
|
||||||
|
|
||||||
// Compiles pattern with the given options and, on success, appends the
// resulting RE2 to re2_vec_ and stores its index in *id.
// Returns the RE2 error code. On failure the RE2 object is destroyed,
// *id is left untouched, and the failure is logged if options.log_errors()
// is set.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    if (options.log_errors()) {
      // Log the pattern text; streaming `re` itself would only print the
      // address of the RE2 object.
      LOG(ERROR) << "Couldn't compile regular expression, skipping: "
                 << re->pattern() << " due to error " << re->error();
    }
    delete re;
  } else {
    *id = static_cast<int>(re2_vec_.size());
    re2_vec_.push_back(re);
  }

  return code;
}
|
|
||||||
|
|
||||||
void FilteredRE2::Compile(std::vector<string>* atoms) {
|
|
||||||
if (compiled_) {
|
|
||||||
LOG(ERROR) << "Compile called already.";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (re2_vec_.empty()) {
|
|
||||||
LOG(ERROR) << "Compile called before Add.";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < re2_vec_.size(); i++) {
|
|
||||||
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
|
|
||||||
prefilter_tree_->Add(prefilter);
|
|
||||||
}
|
|
||||||
atoms->clear();
|
|
||||||
prefilter_tree_->Compile(atoms);
|
|
||||||
compiled_ = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
|
||||||
for (size_t i = 0; i < re2_vec_.size(); i++)
|
|
||||||
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
|
||||||
return static_cast<int>(i);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int FilteredRE2::FirstMatch(const StringPiece& text,
|
|
||||||
const std::vector<int>& atoms) const {
|
|
||||||
if (!compiled_) {
|
|
||||||
LOG(DFATAL) << "FirstMatch called before Compile.";
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
std::vector<int> regexps;
|
|
||||||
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
|
||||||
for (size_t i = 0; i < regexps.size(); i++)
|
|
||||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
|
||||||
return regexps[i];
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool FilteredRE2::AllMatches(
|
|
||||||
const StringPiece& text,
|
|
||||||
const std::vector<int>& atoms,
|
|
||||||
std::vector<int>* matching_regexps) const {
|
|
||||||
matching_regexps->clear();
|
|
||||||
std::vector<int> regexps;
|
|
||||||
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
|
||||||
for (size_t i = 0; i < regexps.size(); i++)
|
|
||||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
|
||||||
matching_regexps->push_back(regexps[i]);
|
|
||||||
return !matching_regexps->empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
void FilteredRE2::AllPotentials(
|
|
||||||
const std::vector<int>& atoms,
|
|
||||||
std::vector<int>* potential_regexps) const {
|
|
||||||
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Thin wrapper around PrefilterTree::RegexpsGivenStrings for the tree owned
// by this object.
void FilteredRE2::RegexpsGivenStrings(
    const std::vector<int>& matched_atoms,
    std::vector<int>* passed_regexps) {
  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
|
|
||||||
|
|
||||||
// Thin wrapper around PrefilterTree::PrintPrefilter for the regexp with
// index regexpid.
void FilteredRE2::PrintPrefilter(int regexpid) {
  prefilter_tree_->PrintPrefilter(regexpid);
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,110 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_FILTERED_RE2_H_
|
|
||||||
#define RE2_FILTERED_RE2_H_
|
|
||||||
|
|
||||||
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
|
||||||
// It provides a prefilter mechanism that helps in cutting down the
|
|
||||||
// number of regexps that need to be actually searched.
|
|
||||||
//
|
|
||||||
// By design, it does not include a string matching engine. This is to
|
|
||||||
// allow the user of the class to use their favorite string match
|
|
||||||
// engine. The overall flow is: Add all the regexps using Add, then
|
|
||||||
// Compile the FilteredRE2. The compile returns strings that need to
|
|
||||||
// be matched. Note that all returned strings are lowercase. For
|
|
||||||
// applying regexps to a search text, the caller does the string
|
|
||||||
// matching using the strings returned. When doing the string match,
|
|
||||||
// note that the caller has to do that on lower cased version of the
|
|
||||||
// search text. Then call FirstMatch or AllMatches with a vector of
|
|
||||||
// indices of strings that were found in the text to get the actual
|
|
||||||
// regexp matches.
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "re2/re2.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
class PrefilterTree;
|
|
||||||
|
|
||||||
class FilteredRE2 {
|
|
||||||
public:
|
|
||||||
FilteredRE2();
|
|
||||||
explicit FilteredRE2(int min_atom_len);
|
|
||||||
~FilteredRE2();
|
|
||||||
|
|
||||||
// Uses RE2 constructor to create a RE2 object (re). Returns
|
|
||||||
// re->error_code(). If error_code is other than NoError, then re is
|
|
||||||
// deleted and not added to re2_vec_.
|
|
||||||
RE2::ErrorCode Add(const StringPiece& pattern,
|
|
||||||
const RE2::Options& options,
|
|
||||||
int *id);
|
|
||||||
|
|
||||||
// Prepares the regexps added by Add for filtering. Returns a set
|
|
||||||
// of strings that the caller should check for in candidate texts.
|
|
||||||
// The returned strings are lowercased. When doing string matching,
|
|
||||||
// the search text should be lowercased first to find matching
|
|
||||||
// strings from the set of strings returned by Compile. Call after
|
|
||||||
// all Add calls are done.
|
|
||||||
void Compile(std::vector<string>* strings_to_match);
|
|
||||||
|
|
||||||
// Returns the index of the first matching regexp.
|
|
||||||
// Returns -1 on no match. Can be called prior to Compile.
|
|
||||||
// Does not do any filtering: simply tries to Match the
|
|
||||||
// regexps in a loop.
|
|
||||||
int SlowFirstMatch(const StringPiece& text) const;
|
|
||||||
|
|
||||||
// Returns the index of the first matching regexp.
|
|
||||||
// Returns -1 on no match. Compile has to be called before
|
|
||||||
// calling this.
|
|
||||||
int FirstMatch(const StringPiece& text,
|
|
||||||
const std::vector<int>& atoms) const;
|
|
||||||
|
|
||||||
// Returns the indices of all matching regexps, after first clearing
|
|
||||||
// matched_regexps.
|
|
||||||
bool AllMatches(const StringPiece& text,
|
|
||||||
const std::vector<int>& atoms,
|
|
||||||
std::vector<int>* matching_regexps) const;
|
|
||||||
|
|
||||||
// Returns the indices of all potentially matching regexps after first
|
|
||||||
// clearing potential_regexps.
|
|
||||||
// A regexp is potentially matching if it passes the filter.
|
|
||||||
// If a regexp passes the filter it may still not match.
|
|
||||||
// A regexp that does not pass the filter is guaranteed to not match.
|
|
||||||
void AllPotentials(const std::vector<int>& atoms,
|
|
||||||
std::vector<int>* potential_regexps) const;
|
|
||||||
|
|
||||||
// The number of regexps added.
|
|
||||||
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
// Get the individual RE2 objects. Useful for testing.
|
|
||||||
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
|
|
||||||
|
|
||||||
// Print prefilter.
|
|
||||||
void PrintPrefilter(int regexpid);
|
|
||||||
|
|
||||||
// Useful for testing and debugging.
|
|
||||||
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
|
||||||
std::vector<int>* passed_regexps);
|
|
||||||
|
|
||||||
// All the regexps in the FilteredRE2.
|
|
||||||
std::vector<RE2*> re2_vec_;
|
|
||||||
|
|
||||||
// Has the FilteredRE2 been compiled using Compile()
|
|
||||||
bool compiled_;
|
|
||||||
|
|
||||||
// An AND-OR tree of string atoms used for filtering regexps.
|
|
||||||
PrefilterTree* prefilter_tree_;
|
|
||||||
|
|
||||||
FilteredRE2(const FilteredRE2&) = delete;
|
|
||||||
FilteredRE2& operator=(const FilteredRE2&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_FILTERED_RE2_H_
|
|
@ -1,187 +0,0 @@
|
|||||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Determine whether this library should match PCRE exactly
|
|
||||||
// for a particular Regexp. (If so, the testing framework can
|
|
||||||
// check that it does.)
|
|
||||||
//
|
|
||||||
// This library matches PCRE except in these cases:
|
|
||||||
// * the regexp contains a repetition of an empty string,
|
|
||||||
// like (a*)* or (a*)+. In this case, PCRE will treat
|
|
||||||
// the repetition sequence as ending with an empty string,
|
|
||||||
// while this library does not.
|
|
||||||
// * Perl and PCRE differ on whether \v matches \n.
|
|
||||||
// For historical reasons, this library implements the Perl behavior.
|
|
||||||
// * Perl and PCRE allow $ in one-line mode to match either the very
|
|
||||||
// end of the text or just before a \n at the end of the text.
|
|
||||||
// This library requires it to match only the end of the text.
|
|
||||||
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
|
||||||
// match the end of the text if the last character is a \n.
|
|
||||||
// This library does allow it.
|
|
||||||
//
|
|
||||||
// Regexp::MimicsPCRE checks for any of these conditions.
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
#include "re2/walker-inl.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Returns whether re might match an empty string.
|
|
||||||
static bool CanBeEmptyString(Regexp *re);
|
|
||||||
|
|
||||||
// Walker class to compute whether library handles a regexp
|
|
||||||
// exactly as PCRE would. See comment at top for conditions.
|
|
||||||
|
|
||||||
class PCREWalker : public Regexp::Walker<bool> {
|
|
||||||
public:
|
|
||||||
PCREWalker() {}
|
|
||||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
|
|
||||||
int nchild_args);
|
|
||||||
|
|
||||||
bool ShortVisit(Regexp* re, bool a) {
|
|
||||||
// Should never be called: we use Walk not WalkExponential.
|
|
||||||
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Called after visiting each of re's children and accumulating
|
|
||||||
// the return values in child_args. So child_args contains whether
|
|
||||||
// this library mimics PCRE for those subexpressions.
|
|
||||||
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
|
||||||
bool* child_args, int nchild_args) {
|
|
||||||
// If children failed, so do we.
|
|
||||||
for (int i = 0; i < nchild_args; i++)
|
|
||||||
if (!child_args[i])
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// Otherwise look for other reasons to fail.
|
|
||||||
switch (re->op()) {
|
|
||||||
// Look for repeated empty string.
|
|
||||||
case kRegexpStar:
|
|
||||||
case kRegexpPlus:
|
|
||||||
case kRegexpQuest:
|
|
||||||
if (CanBeEmptyString(re->sub()[0]))
|
|
||||||
return false;
|
|
||||||
break;
|
|
||||||
case kRegexpRepeat:
|
|
||||||
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
|
|
||||||
return false;
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Look for \v
|
|
||||||
case kRegexpLiteral:
|
|
||||||
if (re->rune() == '\v')
|
|
||||||
return false;
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Look for $ in single-line mode.
|
|
||||||
case kRegexpEndText:
|
|
||||||
case kRegexpEmptyMatch:
|
|
||||||
if (re->parse_flags() & Regexp::WasDollar)
|
|
||||||
return false;
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Look for ^ in multi-line mode.
|
|
||||||
case kRegexpBeginLine:
|
|
||||||
// No condition: in single-line mode ^ becomes kRegexpBeginText.
|
|
||||||
return false;
|
|
||||||
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Not proven guilty.
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
|
||||||
bool Regexp::MimicsPCRE() {
|
|
||||||
PCREWalker w;
|
|
||||||
return w.Walk(this, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Walker class to compute whether a Regexp can match an empty string.
|
|
||||||
// It is okay to overestimate. For example, \b\B cannot match an empty
|
|
||||||
// string, because \b and \B are mutually exclusive, but this isn't
|
|
||||||
// that smart and will say it can. Spurious empty strings
|
|
||||||
// will reduce the number of regexps we sanity check against PCRE,
|
|
||||||
// but they won't break anything.
|
|
||||||
|
|
||||||
class EmptyStringWalker : public Regexp::Walker<bool> {
|
|
||||||
public:
|
|
||||||
EmptyStringWalker() { }
|
|
||||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
|
||||||
bool* child_args, int nchild_args);
|
|
||||||
|
|
||||||
bool ShortVisit(Regexp* re, bool a) {
|
|
||||||
// Should never be called: we use Walk not WalkExponential.
|
|
||||||
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
EmptyStringWalker(const EmptyStringWalker&) = delete;
|
|
||||||
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Called after visiting re's children. child_args contains the return
|
|
||||||
// value from each of the children's PostVisits (i.e., whether each child
|
|
||||||
// can match an empty string). Returns whether this clause can match an
|
|
||||||
// empty string.
|
|
||||||
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
|
||||||
bool* child_args, int nchild_args) {
|
|
||||||
switch (re->op()) {
|
|
||||||
case kRegexpNoMatch: // never empty
|
|
||||||
case kRegexpLiteral:
|
|
||||||
case kRegexpAnyChar:
|
|
||||||
case kRegexpAnyByte:
|
|
||||||
case kRegexpCharClass:
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
return false;
|
|
||||||
|
|
||||||
case kRegexpEmptyMatch: // always empty
|
|
||||||
case kRegexpBeginLine: // always empty, when they match
|
|
||||||
case kRegexpEndLine:
|
|
||||||
case kRegexpNoWordBoundary:
|
|
||||||
case kRegexpWordBoundary:
|
|
||||||
case kRegexpBeginText:
|
|
||||||
case kRegexpEndText:
|
|
||||||
case kRegexpStar: // can always be empty
|
|
||||||
case kRegexpQuest:
|
|
||||||
case kRegexpHaveMatch:
|
|
||||||
return true;
|
|
||||||
|
|
||||||
case kRegexpConcat: // can be empty if all children can
|
|
||||||
for (int i = 0; i < nchild_args; i++)
|
|
||||||
if (!child_args[i])
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
|
|
||||||
case kRegexpAlternate: // can be empty if any child can
|
|
||||||
for (int i = 0; i < nchild_args; i++)
|
|
||||||
if (child_args[i])
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
|
|
||||||
case kRegexpPlus: // can be empty if the child can
|
|
||||||
case kRegexpCapture:
|
|
||||||
return child_args[0];
|
|
||||||
|
|
||||||
case kRegexpRepeat: // can be empty if child can or is x{0}
|
|
||||||
return child_args[0] || re->min() == 0;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns whether re can match an empty string.
|
|
||||||
static bool CanBeEmptyString(Regexp* re) {
|
|
||||||
EmptyStringWalker w;
|
|
||||||
return w.Walk(re, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,791 +0,0 @@
|
|||||||
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Tested by search_test.cc.
|
|
||||||
//
|
|
||||||
// Prog::SearchNFA, an NFA search.
|
|
||||||
// This is an actual NFA like the theorists talk about,
|
|
||||||
// not the pseudo-NFA found in backtracking regexp implementations.
|
|
||||||
//
|
|
||||||
// IMPLEMENTATION
|
|
||||||
//
|
|
||||||
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
|
|
||||||
// which is a variant of the one described in Thompson's 1968 CACM paper.
|
|
||||||
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
|
|
||||||
// over the DFA implementation is that it tracks submatch boundaries.
|
|
||||||
//
|
|
||||||
// When the choice of submatch boundaries is ambiguous, this particular
|
|
||||||
// implementation makes the same choices that traditional backtracking
|
|
||||||
// implementations (in particular, Perl and PCRE) do.
|
|
||||||
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
|
|
||||||
// time in the length of the input.
|
|
||||||
//
|
|
||||||
// Like Thompson's original machine and like the DFA implementation, this
|
|
||||||
// implementation notices a match only once it is one byte past it.
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <string>
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "re2/prog.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/sparse_array.h"
|
|
||||||
#include "util/sparse_set.h"
|
|
||||||
#include "util/strutil.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
static const bool ExtraDebug = false;
|
|
||||||
|
|
||||||
class NFA {
|
|
||||||
public:
|
|
||||||
NFA(Prog* prog);
|
|
||||||
~NFA();
|
|
||||||
|
|
||||||
// Searches for a matching string.
|
|
||||||
// * If anchored is true, only considers matches starting at offset.
|
|
||||||
// Otherwise finds lefmost match at or after offset.
|
|
||||||
// * If longest is true, returns the longest match starting
|
|
||||||
// at the chosen start point. Otherwise returns the so-called
|
|
||||||
// left-biased match, the one traditional backtracking engines
|
|
||||||
// (like Perl and PCRE) find.
|
|
||||||
// Records submatch boundaries in submatch[1..nsubmatch-1].
|
|
||||||
// Submatch[0] is the entire match. When there is a choice in
|
|
||||||
// which text matches each subexpression, the submatch boundaries
|
|
||||||
// are chosen to match what a backtracking implementation would choose.
|
|
||||||
bool Search(const StringPiece& text, const StringPiece& context,
|
|
||||||
bool anchored, bool longest,
|
|
||||||
StringPiece* submatch, int nsubmatch);
|
|
||||||
|
|
||||||
private:
|
|
||||||
struct Thread {
|
|
||||||
union {
|
|
||||||
int ref;
|
|
||||||
Thread* next; // when on free list
|
|
||||||
};
|
|
||||||
const char** capture;
|
|
||||||
};
|
|
||||||
|
|
||||||
// State for explicit stack in AddToThreadq.
|
|
||||||
struct AddState {
|
|
||||||
int id; // Inst to process
|
|
||||||
Thread* t; // if not null, set t0 = t before processing id
|
|
||||||
|
|
||||||
AddState()
|
|
||||||
: id(0), t(NULL) {}
|
|
||||||
explicit AddState(int id)
|
|
||||||
: id(id), t(NULL) {}
|
|
||||||
AddState(int id, Thread* t)
|
|
||||||
: id(id), t(t) {}
|
|
||||||
};
|
|
||||||
|
|
||||||
// Threadq is a list of threads. The list is sorted by the order
|
|
||||||
// in which Perl would explore that particular state -- the earlier
|
|
||||||
// choices appear earlier in the list.
|
|
||||||
typedef SparseArray<Thread*> Threadq;
|
|
||||||
|
|
||||||
inline Thread* AllocThread();
|
|
||||||
inline Thread* Incref(Thread* t);
|
|
||||||
inline void Decref(Thread* t);
|
|
||||||
|
|
||||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
|
||||||
// Enqueues only the ByteRange instructions that match byte c.
|
|
||||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
|
||||||
// p is the current input position, and t0 is the current thread.
|
|
||||||
void AddToThreadq(Threadq* q, int id0, int c, int flag,
|
|
||||||
const char* p, Thread* t0);
|
|
||||||
|
|
||||||
// Run runq on byte c, appending new states to nextq.
|
|
||||||
// Updates matched_ and match_ as new, better matches are found.
|
|
||||||
// p is the position of byte c in the input string for AddToThreadq;
|
|
||||||
// p-1 will be used when processing Match instructions.
|
|
||||||
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
|
|
||||||
// ^, $ and \b match the current input position (after c).
|
|
||||||
// Frees all the threads on runq.
|
|
||||||
// If there is a shortcut to the end, returns that shortcut.
|
|
||||||
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
|
|
||||||
|
|
||||||
// Returns text version of capture information, for debugging.
|
|
||||||
string FormatCapture(const char** capture);
|
|
||||||
|
|
||||||
inline void CopyCapture(const char** dst, const char** src);
|
|
||||||
|
|
||||||
Prog* prog_; // underlying program
|
|
||||||
int start_; // start instruction in program
|
|
||||||
int ncapture_; // number of submatches to track
|
|
||||||
bool longest_; // whether searching for longest match
|
|
||||||
bool endmatch_; // whether match must end at text.end()
|
|
||||||
const char* btext_; // beginning of text being matched (for FormatSubmatch)
|
|
||||||
const char* etext_; // end of text being matched (for endmatch_)
|
|
||||||
Threadq q0_, q1_; // pre-allocated for Search.
|
|
||||||
const char** match_; // best match so far
|
|
||||||
bool matched_; // any match so far?
|
|
||||||
AddState* astack_; // pre-allocated for AddToThreadq
|
|
||||||
int nastack_;
|
|
||||||
|
|
||||||
Thread* free_threads_; // free list
|
|
||||||
|
|
||||||
NFA(const NFA&) = delete;
|
|
||||||
NFA& operator=(const NFA&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Binds the NFA to prog, sizes both thread queues to the program size
// and pre-allocates the explicit stack used by AddToThreadq.
// Search parameters start out cleared; Search() fills them in.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      match_(NULL),
      matched_(false),
      astack_(NULL),
      nastack_(0),
      free_threads_(NULL) {
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());

  // See NFA::AddToThreadq() for why this is so:
  // two entries per Capture, one per EmptyWidth, one per Nop,
  // and one more for the start instruction.
  nastack_ = 2*prog_->inst_count(kInstCapture) +
             prog_->inst_count(kInstEmptyWidth) +
             prog_->inst_count(kInstNop) + 1;
  astack_ = new AddState[nastack_];
}
|
|
||||||
|
|
||||||
// Releases the best-match array, the AddToThreadq stack and every
// Thread (with its capture array) that is parked on the free list.
NFA::~NFA() {
  delete[] match_;
  delete[] astack_;

  Thread* cur = free_threads_;
  while (cur != NULL) {
    // Grab the link first: cur is about to be destroyed.
    Thread* following = cur->next;
    delete[] cur->capture;
    delete cur;
    cur = following;
  }
}
|
|
||||||
|
|
||||||
// Hands out a Thread with a reference count of 1.  A Thread from the
// free list is reused when one is available (keeping its existing
// capture array); otherwise a new Thread is allocated together with a
// capture array of ncapture_ entries.
NFA::Thread* NFA::AllocThread() {
  Thread* recycled = free_threads_;
  if (recycled != NULL) {
    // Unlink before touching ref: ref and next share storage.
    free_threads_ = recycled->next;
    recycled->ref = 1;
    return recycled;
  }

  Thread* fresh = new Thread;
  fresh->ref = 1;
  fresh->capture = new const char*[ncapture_];
  return fresh;
}
|
|
||||||
|
|
||||||
// Adds one reference to t and returns t for convenience.
// t must not be NULL.
NFA::Thread* NFA::Incref(Thread* t) {
  DCHECK(t != NULL);
  ++t->ref;
  return t;
}
|
|
||||||
|
|
||||||
// Drops one reference from t (a NULL t is ignored).  When the count
// is no longer positive the Thread is pushed onto the free list.
void NFA::Decref(Thread* t) {
  if (t != NULL && --t->ref <= 0) {
    DCHECK_EQ(t->ref, 0);
    // ref and next share storage, so the count is gone from here on.
    t->next = free_threads_;
    free_threads_ = t;
  }
}
|
|
||||||
|
|
||||||
// Copies the capture slots of src into dst, one (begin, end) pair at
// a time, for slot indices below ncapture_.
void NFA::CopyCapture(const char** dst, const char** src) {
  int slot = 0;
  while (slot < ncapture_) {
    dst[slot] = src[slot];
    dst[slot + 1] = src[slot + 1];
    slot += 2;
  }
}
|
|
||||||
|
|
||||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
|
||||||
// Enqueues only the ByteRange instructions that match byte c.
|
|
||||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
|
||||||
// p is the current input position, and t0 is the current thread.
|
|
||||||
void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
|
|
||||||
const char* p, Thread* t0) {
|
|
||||||
if (id0 == 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Use astack_ to hold our stack of instructions yet to process.
|
|
||||||
// It was preallocated as follows:
|
|
||||||
// two entries per Capture;
|
|
||||||
// one entry per EmptyWidth; and
|
|
||||||
// one entry per Nop.
|
|
||||||
// This reflects the maximum number of stack pushes that each can
|
|
||||||
// perform. (Each instruction can be processed at most once.)
|
|
||||||
AddState* stk = astack_;
|
|
||||||
int nstk = 0;
|
|
||||||
|
|
||||||
stk[nstk++] = AddState(id0);
|
|
||||||
while (nstk > 0) {
|
|
||||||
DCHECK_LE(nstk, nastack_);
|
|
||||||
AddState a = stk[--nstk];
|
|
||||||
|
|
||||||
Loop:
|
|
||||||
if (a.t != NULL) {
|
|
||||||
// t0 was a thread that we allocated and copied in order to
|
|
||||||
// record the capture, so we must now decref it.
|
|
||||||
Decref(t0);
|
|
||||||
t0 = a.t;
|
|
||||||
}
|
|
||||||
|
|
||||||
int id = a.id;
|
|
||||||
if (id == 0)
|
|
||||||
continue;
|
|
||||||
if (q->has_index(id)) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create entry in q no matter what. We might fill it in below,
|
|
||||||
// or we might not. Even if not, it is necessary to have it,
|
|
||||||
// so that we don't revisit id0 during the recursion.
|
|
||||||
q->set_new(id, NULL);
|
|
||||||
|
|
||||||
Thread** tp = &q->find(id)->second;
|
|
||||||
int j;
|
|
||||||
Thread* t;
|
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstFail:
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
// Save state; will pick up at next byte.
|
|
||||||
t = Incref(t0);
|
|
||||||
*tp = t;
|
|
||||||
|
|
||||||
DCHECK(!ip->last());
|
|
||||||
a = AddState(id+1);
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstNop:
|
|
||||||
if (!ip->last())
|
|
||||||
stk[nstk++] = AddState(id+1);
|
|
||||||
|
|
||||||
// Continue on.
|
|
||||||
a = AddState(ip->out());
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstCapture:
|
|
||||||
if (!ip->last())
|
|
||||||
stk[nstk++] = AddState(id+1);
|
|
||||||
|
|
||||||
if ((j=ip->cap()) < ncapture_) {
|
|
||||||
// Push a dummy whose only job is to restore t0
|
|
||||||
// once we finish exploring this possibility.
|
|
||||||
stk[nstk++] = AddState(0, t0);
|
|
||||||
|
|
||||||
// Record capture.
|
|
||||||
t = AllocThread();
|
|
||||||
CopyCapture(t->capture, t0->capture);
|
|
||||||
t->capture[j] = p;
|
|
||||||
t0 = t;
|
|
||||||
}
|
|
||||||
a = AddState(ip->out());
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstByteRange:
|
|
||||||
if (!ip->Matches(c))
|
|
||||||
goto Next;
|
|
||||||
FALLTHROUGH_INTENDED;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
// Save state; will pick up at next byte.
|
|
||||||
t = Incref(t0);
|
|
||||||
*tp = t;
|
|
||||||
if (ExtraDebug)
|
|
||||||
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
|
|
||||||
|
|
||||||
Next:
|
|
||||||
if (ip->last())
|
|
||||||
break;
|
|
||||||
a = AddState(id+1);
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
if (!ip->last())
|
|
||||||
stk[nstk++] = AddState(id+1);
|
|
||||||
|
|
||||||
// Continue on if we have all the right flag bits.
|
|
||||||
if (ip->empty() & ~flag)
|
|
||||||
break;
|
|
||||||
a = AddState(ip->out());
|
|
||||||
goto Loop;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Run runq on byte c, appending new states to nextq.
|
|
||||||
// Updates matched_ and match_ as new, better matches are found.
|
|
||||||
// p is the position of byte c in the input string for AddToThreadq;
|
|
||||||
// p-1 will be used when processing Match instructions.
|
|
||||||
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
|
|
||||||
// ^, $ and \b match the current input position (after c).
|
|
||||||
// Frees all the threads on runq.
|
|
||||||
// If there is a shortcut to the end, returns that shortcut.
|
|
||||||
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
|
||||||
nextq->clear();
|
|
||||||
|
|
||||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
|
||||||
Thread* t = i->second;
|
|
||||||
if (t == NULL)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (longest_) {
|
|
||||||
// Can skip any threads started after our current best match.
|
|
||||||
if (matched_ && match_[0] < t->capture[0]) {
|
|
||||||
Decref(t);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int id = i->index();
|
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
|
||||||
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
// Should only see the values handled below.
|
|
||||||
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstByteRange:
|
|
||||||
AddToThreadq(nextq, ip->out(), c, flag, p, t);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
if (i != runq->begin())
|
|
||||||
break;
|
|
||||||
// The match is ours if we want it.
|
|
||||||
if (ip->greedy(prog_) || longest_) {
|
|
||||||
CopyCapture(match_, t->capture);
|
|
||||||
matched_ = true;
|
|
||||||
|
|
||||||
Decref(t);
|
|
||||||
for (++i; i != runq->end(); ++i)
|
|
||||||
Decref(i->second);
|
|
||||||
runq->clear();
|
|
||||||
if (ip->greedy(prog_))
|
|
||||||
return ip->out1();
|
|
||||||
return ip->out();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstMatch: {
|
|
||||||
// Avoid invoking undefined behavior when p happens
|
|
||||||
// to be null - and p-1 would be meaningless anyway.
|
|
||||||
if (p == NULL)
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (endmatch_ && p-1 != etext_)
|
|
||||||
break;
|
|
||||||
|
|
||||||
if (longest_) {
|
|
||||||
// Leftmost-longest mode: save this match only if
|
|
||||||
// it is either farther to the left or at the same
|
|
||||||
// point but longer than an existing match.
|
|
||||||
if (!matched_ || t->capture[0] < match_[0] ||
|
|
||||||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
|
|
||||||
CopyCapture(match_, t->capture);
|
|
||||||
match_[1] = p-1;
|
|
||||||
matched_ = true;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Leftmost-biased mode: this match is by definition
|
|
||||||
// better than what we've already found (see next line).
|
|
||||||
CopyCapture(match_, t->capture);
|
|
||||||
match_[1] = p-1;
|
|
||||||
matched_ = true;
|
|
||||||
|
|
||||||
// Cut off the threads that can only find matches
|
|
||||||
// worse than the one we just found: don't run the
|
|
||||||
// rest of the current Threadq.
|
|
||||||
Decref(t);
|
|
||||||
for (++i; i != runq->end(); ++i)
|
|
||||||
Decref(i->second);
|
|
||||||
runq->clear();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Decref(t);
|
|
||||||
}
|
|
||||||
runq->clear();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Renders the capture pairs as text for debugging output.  Each pair
// is printed as (begin,end) offsets relative to btext_, with ? in
// place of a position that is still NULL.
string NFA::FormatCapture(const char** capture) {
  string out;

  for (int k = 0; k < ncapture_; k += 2) {
    if (capture[k] == NULL) {
      StringAppendF(&out, "(?,?)");
      continue;
    }

    const int from = static_cast<int>(capture[k] - btext_);
    if (capture[k+1] == NULL) {
      StringAppendF(&out, "(%d,?)", from);
      continue;
    }

    const int to = static_cast<int>(capture[k+1] - btext_);
    StringAppendF(&out, "(%d,%d)", from, to);
  }
  return out;
}
|
|
||||||
|
|
||||||
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|
||||||
bool anchored, bool longest,
|
|
||||||
StringPiece* submatch, int nsubmatch) {
|
|
||||||
if (start_ == 0)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
StringPiece context = const_context;
|
|
||||||
if (context.begin() == NULL)
|
|
||||||
context = text;
|
|
||||||
|
|
||||||
// Sanity check: make sure that text lies within context.
|
|
||||||
if (text.begin() < context.begin() || text.end() > context.end()) {
|
|
||||||
LOG(DFATAL) << "context does not contain text";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (prog_->anchor_start() && context.begin() != text.begin())
|
|
||||||
return false;
|
|
||||||
if (prog_->anchor_end() && context.end() != text.end())
|
|
||||||
return false;
|
|
||||||
anchored |= prog_->anchor_start();
|
|
||||||
if (prog_->anchor_end()) {
|
|
||||||
longest = true;
|
|
||||||
endmatch_ = true;
|
|
||||||
etext_ = text.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nsubmatch < 0) {
|
|
||||||
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Save search parameters.
|
|
||||||
ncapture_ = 2*nsubmatch;
|
|
||||||
longest_ = longest;
|
|
||||||
|
|
||||||
if (nsubmatch == 0) {
|
|
||||||
// We need to maintain match[0], both to distinguish the
|
|
||||||
// longest match (if longest is true) and also to tell
|
|
||||||
// whether we've seen any matches at all.
|
|
||||||
ncapture_ = 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
match_ = new const char*[ncapture_];
|
|
||||||
matched_ = false;
|
|
||||||
|
|
||||||
// For debugging prints.
|
|
||||||
btext_ = context.begin();
|
|
||||||
|
|
||||||
if (ExtraDebug)
|
|
||||||
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
|
||||||
text.ToString().c_str(), context.ToString().c_str(), anchored,
|
|
||||||
longest);
|
|
||||||
|
|
||||||
// Set up search.
|
|
||||||
Threadq* runq = &q0_;
|
|
||||||
Threadq* nextq = &q1_;
|
|
||||||
runq->clear();
|
|
||||||
nextq->clear();
|
|
||||||
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
|
|
||||||
int wasword = 0;
|
|
||||||
|
|
||||||
if (text.begin() > context.begin())
|
|
||||||
wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
|
|
||||||
|
|
||||||
// Loop over the text, stepping the machine.
|
|
||||||
for (const char* p = text.begin();; p++) {
|
|
||||||
// Check for empty-width specials.
|
|
||||||
int flag = 0;
|
|
||||||
|
|
||||||
// ^ and \A
|
|
||||||
if (p == context.begin())
|
|
||||||
flag |= kEmptyBeginText | kEmptyBeginLine;
|
|
||||||
else if (p <= context.end() && p[-1] == '\n')
|
|
||||||
flag |= kEmptyBeginLine;
|
|
||||||
|
|
||||||
// $ and \z
|
|
||||||
if (p == context.end())
|
|
||||||
flag |= kEmptyEndText | kEmptyEndLine;
|
|
||||||
else if (p < context.end() && p[0] == '\n')
|
|
||||||
flag |= kEmptyEndLine;
|
|
||||||
|
|
||||||
// \b and \B
|
|
||||||
int isword = 0;
|
|
||||||
if (p < context.end())
|
|
||||||
isword = Prog::IsWordChar(p[0] & 0xFF);
|
|
||||||
|
|
||||||
if (isword != wasword)
|
|
||||||
flag |= kEmptyWordBoundary;
|
|
||||||
else
|
|
||||||
flag |= kEmptyNonWordBoundary;
|
|
||||||
|
|
||||||
if (ExtraDebug) {
|
|
||||||
int c = 0;
|
|
||||||
if (p == context.begin())
|
|
||||||
c = '^';
|
|
||||||
else if (p > text.end())
|
|
||||||
c = '$';
|
|
||||||
else if (p < text.end())
|
|
||||||
c = p[0] & 0xFF;
|
|
||||||
|
|
||||||
fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
|
|
||||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
|
||||||
Thread* t = i->second;
|
|
||||||
if (t == NULL)
|
|
||||||
continue;
|
|
||||||
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
|
|
||||||
}
|
|
||||||
fprintf(stderr, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is a no-op the first time around the loop because runq is empty.
|
|
||||||
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
|
|
||||||
DCHECK_EQ(runq->size(), 0);
|
|
||||||
using std::swap;
|
|
||||||
swap(nextq, runq);
|
|
||||||
nextq->clear();
|
|
||||||
if (id != 0) {
|
|
||||||
// We're done: full match ahead.
|
|
||||||
p = text.end();
|
|
||||||
for (;;) {
|
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstCapture:
|
|
||||||
if (ip->cap() < ncapture_)
|
|
||||||
match_[ip->cap()] = p;
|
|
||||||
id = ip->out();
|
|
||||||
continue;
|
|
||||||
|
|
||||||
case kInstNop:
|
|
||||||
id = ip->out();
|
|
||||||
continue;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
match_[1] = p;
|
|
||||||
matched_ = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (p > text.end())
|
|
||||||
break;
|
|
||||||
|
|
||||||
// Start a new thread if there have not been any matches.
|
|
||||||
// (No point in starting a new thread if there have been
|
|
||||||
// matches, since it would be to the right of the match
|
|
||||||
// we already found.)
|
|
||||||
if (!matched_ && (!anchored || p == text.begin())) {
|
|
||||||
// If there's a required first byte for an unanchored search
|
|
||||||
// and we're not in the middle of any possible matches,
|
|
||||||
// use memchr to search for the byte quickly.
|
|
||||||
int fb = prog_->first_byte();
|
|
||||||
if (!anchored && runq->size() == 0 &&
|
|
||||||
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
|
|
||||||
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
|
|
||||||
if (p == NULL) {
|
|
||||||
p = text.end();
|
|
||||||
isword = 0;
|
|
||||||
} else {
|
|
||||||
isword = Prog::IsWordChar(p[0] & 0xFF);
|
|
||||||
}
|
|
||||||
flag = Prog::EmptyFlags(context, p);
|
|
||||||
}
|
|
||||||
|
|
||||||
Thread* t = AllocThread();
|
|
||||||
CopyCapture(t->capture, match_);
|
|
||||||
t->capture[0] = p;
|
|
||||||
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
|
|
||||||
Decref(t);
|
|
||||||
}
|
|
||||||
|
|
||||||
// If all the threads have died, stop early.
|
|
||||||
if (runq->size() == 0) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
fprintf(stderr, "dead\n");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
wasword = isword;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
|
|
||||||
Decref(i->second);
|
|
||||||
|
|
||||||
if (matched_) {
|
|
||||||
for (int i = 0; i < nsubmatch; i++)
|
|
||||||
submatch[i] =
|
|
||||||
StringPiece(match_[2 * i],
|
|
||||||
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
|
|
||||||
if (ExtraDebug)
|
|
||||||
fprintf(stderr, "match (%td,%td)\n",
|
|
||||||
match_[0] - btext_, match_[1] - btext_);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Computes whether all successful matches have a common first byte,
|
|
||||||
// and if so, returns that byte. If not, returns -1.
|
|
||||||
int Prog::ComputeFirstByte() {
|
|
||||||
int b = -1;
|
|
||||||
SparseSet q(size());
|
|
||||||
q.insert(start());
|
|
||||||
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
|
|
||||||
int id = *it;
|
|
||||||
Prog::Inst* ip = inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
// The empty string matches: no first byte.
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
case kInstByteRange:
|
|
||||||
if (!ip->last())
|
|
||||||
q.insert(id+1);
|
|
||||||
|
|
||||||
// Must match only a single byte
|
|
||||||
if (ip->lo() != ip->hi())
|
|
||||||
return -1;
|
|
||||||
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
|
|
||||||
return -1;
|
|
||||||
// If we haven't seen any bytes yet, record it;
|
|
||||||
// otherwise must match the one we saw before.
|
|
||||||
if (b == -1)
|
|
||||||
b = ip->lo();
|
|
||||||
else if (b != ip->lo())
|
|
||||||
return -1;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstNop:
|
|
||||||
case kInstCapture:
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
if (!ip->last())
|
|
||||||
q.insert(id+1);
|
|
||||||
|
|
||||||
// Continue on.
|
|
||||||
// Ignore ip->empty() flags for kInstEmptyWidth
|
|
||||||
// in order to be as conservative as possible
|
|
||||||
// (assume all possible empty-width flags are true).
|
|
||||||
if (ip->out())
|
|
||||||
q.insert(ip->out());
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
DCHECK(!ip->last());
|
|
||||||
q.insert(id+1);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstFail:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return b;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
|
||||||
Anchor anchor, MatchKind kind,
|
|
||||||
StringPiece* match, int nmatch) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
Dump();
|
|
||||||
|
|
||||||
NFA nfa(this);
|
|
||||||
StringPiece sp;
|
|
||||||
if (kind == kFullMatch) {
|
|
||||||
anchor = kAnchored;
|
|
||||||
if (nmatch == 0) {
|
|
||||||
match = &sp;
|
|
||||||
nmatch = 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
|
|
||||||
return false;
|
|
||||||
if (kind == kFullMatch && match[0].end() != text.end())
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// For each instruction i in the program reachable from the start, compute the
|
|
||||||
// number of instructions reachable from i by following only empty transitions
|
|
||||||
// and record that count as fanout[i].
|
|
||||||
//
|
|
||||||
// fanout holds the results and is also the work queue for the outer iteration.
|
|
||||||
// reachable holds the reached nodes for the inner iteration.
|
|
||||||
void Prog::Fanout(SparseArray<int>* fanout) {
|
|
||||||
DCHECK_EQ(fanout->max_size(), size());
|
|
||||||
SparseSet reachable(size());
|
|
||||||
fanout->clear();
|
|
||||||
fanout->set_new(start(), 0);
|
|
||||||
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
|
|
||||||
int* count = &i->second;
|
|
||||||
reachable.clear();
|
|
||||||
reachable.insert(i->index());
|
|
||||||
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
|
|
||||||
int id = *j;
|
|
||||||
Prog::Inst* ip = inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstByteRange:
|
|
||||||
if (!ip->last())
|
|
||||||
reachable.insert(id+1);
|
|
||||||
|
|
||||||
(*count)++;
|
|
||||||
if (!fanout->has_index(ip->out())) {
|
|
||||||
fanout->set_new(ip->out(), 0);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
DCHECK(!ip->last());
|
|
||||||
reachable.insert(id+1);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstCapture:
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
case kInstNop:
|
|
||||||
if (!ip->last())
|
|
||||||
reachable.insert(id+1);
|
|
||||||
|
|
||||||
reachable.insert(ip->out());
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
if (!ip->last())
|
|
||||||
reachable.insert(id+1);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstFail:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,627 +0,0 @@
|
|||||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Tested by search_test.cc.
|
|
||||||
//
|
|
||||||
// Prog::SearchOnePass is an efficient implementation of
|
|
||||||
// regular expression search with submatch tracking for
|
|
||||||
// what I call "one-pass regular expressions". (An alternate
|
|
||||||
// name might be "backtracking-free regular expressions".)
|
|
||||||
//
|
|
||||||
// One-pass regular expressions have the property that
|
|
||||||
// at each input byte during an anchored match, there may be
|
|
||||||
// multiple alternatives but only one can proceed for any
|
|
||||||
// given input byte.
|
|
||||||
//
|
|
||||||
// For example, the regexp /x*yx*/ is one-pass: you read
|
|
||||||
// x's until a y, then you read the y, then you keep reading x's.
|
|
||||||
// At no point do you have to guess what to do or back up
|
|
||||||
// and try a different guess.
|
|
||||||
//
|
|
||||||
// On the other hand, /x*x/ is not one-pass: when you're
|
|
||||||
// looking at an input "x", it's not clear whether you should
|
|
||||||
// use it to extend the x* or as the final x.
|
|
||||||
//
|
|
||||||
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
|
|
||||||
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
|
|
||||||
//
|
|
||||||
// A simple intuition for identifying one-pass regular expressions
|
|
||||||
// is that it's always immediately obvious when a repetition ends.
|
|
||||||
// It must also be immediately obvious which branch of an | to take:
|
|
||||||
//
|
|
||||||
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
|
|
||||||
//
|
|
||||||
// The NFA-based search in nfa.cc does some bookkeeping to
|
|
||||||
// avoid the need for backtracking and its associated exponential blowup.
|
|
||||||
// But if we have a one-pass regular expression, there is no
|
|
||||||
// possibility of backtracking, so there is no need for the
|
|
||||||
// extra bookkeeping. Hence, this code.
|
|
||||||
//
|
|
||||||
// On a one-pass regular expression, the NFA code in nfa.cc
|
|
||||||
// runs at about 1/20 of the backtracking-based PCRE speed.
|
|
||||||
// In contrast, the code in this file runs at about the same
|
|
||||||
// speed as PCRE.
|
|
||||||
//
|
|
||||||
// One-pass regular expressions get used a lot when RE is
|
|
||||||
// used for parsing simple strings, so it pays off to
|
|
||||||
// notice them and handle them efficiently.
|
|
||||||
//
|
|
||||||
// See also Anne Brüggemann-Klein and Derick Wood,
|
|
||||||
// "One-unambiguous regular languages", Information and Computation 142(2).
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <map>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/sparse_set.h"
|
|
||||||
#include "util/strutil.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
#include "re2/prog.h"
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
|
|
||||||
// Silence "zero-sized array in struct/union" warning for OneState::action.
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning(disable: 4200)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
static const bool ExtraDebug = false;
|
|
||||||
|
|
||||||
// The key insight behind this implementation is that the
|
|
||||||
// non-determinism in an NFA for a one-pass regular expression
|
|
||||||
// is contained. To explain what that means, first a
|
|
||||||
// refresher about what regular expression programs look like
|
|
||||||
// and how the usual NFA execution runs.
|
|
||||||
//
|
|
||||||
// In a regular expression program, only the kInstByteRange
|
|
||||||
// instruction processes an input byte c and moves on to the
|
|
||||||
// next byte in the string (it does so if c is in the given range).
|
|
||||||
// The kInstByteRange instructions correspond to literal characters
|
|
||||||
// and character classes in the regular expression.
|
|
||||||
//
|
|
||||||
// The kInstAlt instructions are used as wiring to connect the
|
|
||||||
// kInstByteRange instructions together in interesting ways when
|
|
||||||
// implementing | + and *.
|
|
||||||
// The kInstAlt instruction forks execution, like a goto that
|
|
||||||
// jumps to ip->out() and ip->out1() in parallel. Each of the
|
|
||||||
// resulting computation paths is called a thread.
|
|
||||||
//
|
|
||||||
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
|
|
||||||
// are interesting in their own right but like kInstAlt they don't
|
|
||||||
// advance the input pointer. Only kInstByteRange does.
|
|
||||||
//
|
|
||||||
// The automaton execution in nfa.cc runs all the possible
|
|
||||||
// threads of execution in lock-step over the input. To process
|
|
||||||
// a particular byte, each thread gets run until it either dies
|
|
||||||
// or finds a kInstByteRange instruction matching the byte.
|
|
||||||
// If the latter happens, the thread stops just past the
|
|
||||||
// kInstByteRange instruction (at ip->out()) and waits for
|
|
||||||
// the other threads to finish processing the input byte.
|
|
||||||
// Then, once all the threads have processed that input byte,
|
|
||||||
// the whole process repeats. The kInstAlt state instruction
|
|
||||||
// might create new threads during input processing, but no
|
|
||||||
// matter what, all the threads stop after a kInstByteRange
|
|
||||||
// and wait for the other threads to "catch up".
|
|
||||||
// Running in lock step like this ensures that the NFA reads
|
|
||||||
// the input string only once.
|
|
||||||
//
|
|
||||||
// Each thread maintains its own set of capture registers
|
|
||||||
// (the string positions at which it executed the kInstCapture
|
|
||||||
// instructions corresponding to capturing parentheses in the
|
|
||||||
// regular expression). Repeated copying of the capture registers
|
|
||||||
// is the main performance bottleneck in the NFA implementation.
|
|
||||||
//
|
|
||||||
// A regular expression program is "one-pass" if, no matter what
|
|
||||||
// the input string, there is only one thread that makes it
|
|
||||||
// past a kInstByteRange instruction at each input byte. This means
|
|
||||||
// that there is in some sense only one active thread throughout
|
|
||||||
// the execution. Other threads might be created during the
|
|
||||||
// processing of an input byte, but they are ephemeral: only one
|
|
||||||
// thread is left to start processing the next input byte.
|
|
||||||
// This is what I meant above when I said the non-determinism
|
|
||||||
// was "contained".
|
|
||||||
//
|
|
||||||
// To execute a one-pass regular expression program, we can build
|
|
||||||
// a DFA (no non-determinism) that has at most as many states as
|
|
||||||
// the NFA (compare this to the possibly exponential number of states
|
|
||||||
// in the general case). Each state records, for each possible
|
|
||||||
// input byte, the next state along with the conditions required
|
|
||||||
// before entering that state -- empty-width flags that must be true
|
|
||||||
// and capture operations that must be performed. It also records
|
|
||||||
// the set of conditions required to finish a match at that
|
|
||||||
// point in the input rather than process the next byte.
|
|
||||||
|
|
||||||
// One state of the one-pass automaton.
//
// action[] is indexed by bytemap_[c] for the next input byte c, i.e. by the
// byte's equivalence class rather than the raw byte, which keeps states
// small.  The array has no declared length; code in this file sizes each
// state as sizeof(OneState) + bytemap_range()*sizeof(uint32_t).
struct OneState {
  // Conditions under which this state matches right now.
  uint32_t matchcond;
  // Per-byte-class action word.
  uint32_t action[];
};
|
|
||||||
|
|
||||||
// The uint32_t conditions in the action are a combination of
|
|
||||||
// condition and capture bits and the next state. The bottom 16 bits
|
|
||||||
// are the condition and capture bits, and the top 16 are the index of
|
|
||||||
// the next state.
|
|
||||||
//
|
|
||||||
// Bits 0-5 are the empty-width flags from prog.h.
|
|
||||||
// Bit 6 is kMatchWins, which means the match takes
|
|
||||||
// priority over moving to next in a first-match search.
|
|
||||||
// The remaining bits mark capture registers that should
|
|
||||||
// be set to the current input position. The capture bits
|
|
||||||
// start at index 2, since the search loop can take care of
|
|
||||||
// cap[0], cap[1] (the overall match position).
|
|
||||||
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
|
|
||||||
// No input position can satisfy both kEmptyWordBoundary
|
|
||||||
// and kEmptyNonWordBoundary, so we can use that as a sentinel
|
|
||||||
// instead of needing an extra bit.
|
|
||||||
|
|
||||||
static const int kIndexShift = 16; // number of bits below index
|
|
||||||
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
|
||||||
static const int kRealCapShift = kEmptyShift + 1;
|
|
||||||
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
|
||||||
|
|
||||||
// Parameters used to skip over cap[0], cap[1].
|
|
||||||
static const int kCapShift = kRealCapShift - 2;
|
|
||||||
static const int kMaxCap = kRealMaxCap + 2;
|
|
||||||
|
|
||||||
static const uint32_t kMatchWins = 1 << kEmptyShift;
|
|
||||||
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
|
||||||
|
|
||||||
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
|
||||||
|
|
||||||
// Check, at compile time, that prog.h agrees with math above.
|
|
||||||
// This function is never called.
|
|
||||||
void OnePass_Checks() {
|
|
||||||
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
|
||||||
"kEmptyShift disagrees with kEmptyAllFlags");
|
|
||||||
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
|
||||||
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
|
|
||||||
"kMaxCap disagrees with kMaxOnePassCapture");
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
|
|
||||||
uint32_t satisfied = Prog::EmptyFlags(context, p);
|
|
||||||
if (cond & kEmptyAllFlags & ~satisfied)
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Apply the capture bits in cond, saving p to the appropriate
|
|
||||||
// locations in cap[].
|
|
||||||
static void ApplyCaptures(uint32_t cond, const char* p,
|
|
||||||
const char** cap, int ncap) {
|
|
||||||
for (int i = 2; i < ncap; i++)
|
|
||||||
if (cond & (1 << kCapShift << i))
|
|
||||||
cap[i] = p;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Computes the OneState* for the given nodeindex.
|
|
||||||
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
|
|
||||||
int nodeindex) {
|
|
||||||
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Prog::SearchOnePass(const StringPiece& text,
|
|
||||||
const StringPiece& const_context,
|
|
||||||
Anchor anchor, MatchKind kind,
|
|
||||||
StringPiece* match, int nmatch) {
|
|
||||||
if (anchor != kAnchored && kind != kFullMatch) {
|
|
||||||
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Make sure we have at least cap[1],
|
|
||||||
// because we use it to tell if we matched.
|
|
||||||
int ncap = 2*nmatch;
|
|
||||||
if (ncap < 2)
|
|
||||||
ncap = 2;
|
|
||||||
|
|
||||||
const char* cap[kMaxCap];
|
|
||||||
for (int i = 0; i < ncap; i++)
|
|
||||||
cap[i] = NULL;
|
|
||||||
|
|
||||||
const char* matchcap[kMaxCap];
|
|
||||||
for (int i = 0; i < ncap; i++)
|
|
||||||
matchcap[i] = NULL;
|
|
||||||
|
|
||||||
StringPiece context = const_context;
|
|
||||||
if (context.begin() == NULL)
|
|
||||||
context = text;
|
|
||||||
if (anchor_start() && context.begin() != text.begin())
|
|
||||||
return false;
|
|
||||||
if (anchor_end() && context.end() != text.end())
|
|
||||||
return false;
|
|
||||||
if (anchor_end())
|
|
||||||
kind = kFullMatch;
|
|
||||||
|
|
||||||
uint8_t* nodes = onepass_nodes_;
|
|
||||||
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
|
||||||
// start() is always mapped to the zeroth OneState.
|
|
||||||
OneState* state = IndexToNode(nodes, statesize, 0);
|
|
||||||
uint8_t* bytemap = bytemap_;
|
|
||||||
const char* bp = text.begin();
|
|
||||||
const char* ep = text.end();
|
|
||||||
const char* p;
|
|
||||||
bool matched = false;
|
|
||||||
matchcap[0] = bp;
|
|
||||||
cap[0] = bp;
|
|
||||||
uint32_t nextmatchcond = state->matchcond;
|
|
||||||
for (p = bp; p < ep; p++) {
|
|
||||||
int c = bytemap[*p & 0xFF];
|
|
||||||
uint32_t matchcond = nextmatchcond;
|
|
||||||
uint32_t cond = state->action[c];
|
|
||||||
|
|
||||||
// Determine whether we can reach act->next.
|
|
||||||
// If so, advance state and nextmatchcond.
|
|
||||||
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
|
||||||
uint32_t nextindex = cond >> kIndexShift;
|
|
||||||
state = IndexToNode(nodes, statesize, nextindex);
|
|
||||||
nextmatchcond = state->matchcond;
|
|
||||||
} else {
|
|
||||||
state = NULL;
|
|
||||||
nextmatchcond = kImpossible;
|
|
||||||
}
|
|
||||||
|
|
||||||
// This code section is carefully tuned.
|
|
||||||
// The goto sequence is about 10% faster than the
|
|
||||||
// obvious rewrite as a large if statement in the
|
|
||||||
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
|
|
||||||
|
|
||||||
// Saving the match capture registers is expensive.
|
|
||||||
// Is this intermediate match worth thinking about?
|
|
||||||
|
|
||||||
// Not if we want a full match.
|
|
||||||
if (kind == kFullMatch)
|
|
||||||
goto skipmatch;
|
|
||||||
|
|
||||||
// Not if it's impossible.
|
|
||||||
if (matchcond == kImpossible)
|
|
||||||
goto skipmatch;
|
|
||||||
|
|
||||||
// Not if the possible match is beaten by the certain
|
|
||||||
// match at the next byte. When this test is useless
|
|
||||||
// (e.g., HTTPPartialMatchRE2) it slows the loop by
|
|
||||||
// about 10%, but when it avoids work (e.g., DotMatchRE2),
|
|
||||||
// it cuts the loop execution by about 45%.
|
|
||||||
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
|
|
||||||
goto skipmatch;
|
|
||||||
|
|
||||||
// Finally, the match conditions must be satisfied.
|
|
||||||
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
|
|
||||||
for (int i = 2; i < 2*nmatch; i++)
|
|
||||||
matchcap[i] = cap[i];
|
|
||||||
if (nmatch > 1 && (matchcond & kCapMask))
|
|
||||||
ApplyCaptures(matchcond, p, matchcap, ncap);
|
|
||||||
matchcap[1] = p;
|
|
||||||
matched = true;
|
|
||||||
|
|
||||||
// If we're in longest match mode, we have to keep
|
|
||||||
// going and see if we find a longer match.
|
|
||||||
// In first match mode, we can stop if the match
|
|
||||||
// takes priority over the next state for this input byte.
|
|
||||||
// That bit is per-input byte and thus in cond, not matchcond.
|
|
||||||
if (kind == kFirstMatch && (cond & kMatchWins))
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
|
|
||||||
skipmatch:
|
|
||||||
if (state == NULL)
|
|
||||||
goto done;
|
|
||||||
if ((cond & kCapMask) && nmatch > 1)
|
|
||||||
ApplyCaptures(cond, p, cap, ncap);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Look for match at end of input.
|
|
||||||
{
|
|
||||||
uint32_t matchcond = state->matchcond;
|
|
||||||
if (matchcond != kImpossible &&
|
|
||||||
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
|
||||||
if (nmatch > 1 && (matchcond & kCapMask))
|
|
||||||
ApplyCaptures(matchcond, p, cap, ncap);
|
|
||||||
for (int i = 2; i < ncap; i++)
|
|
||||||
matchcap[i] = cap[i];
|
|
||||||
matchcap[1] = p;
|
|
||||||
matched = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
done:
|
|
||||||
if (!matched)
|
|
||||||
return false;
|
|
||||||
for (int i = 0; i < nmatch; i++)
|
|
||||||
match[i] =
|
|
||||||
StringPiece(matchcap[2 * i],
|
|
||||||
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Analysis to determine whether a given regexp program is one-pass.
|
|
||||||
|
|
||||||
// If ip is not on workq, adds ip to work queue and returns true.
|
|
||||||
// If ip is already on work queue, does nothing and returns false.
|
|
||||||
// If ip is NULL, does nothing and returns true (pretends to add it).
|
|
||||||
typedef SparseSet Instq;
|
|
||||||
static bool AddQ(Instq *q, int id) {
|
|
||||||
if (id == 0)
|
|
||||||
return true;
|
|
||||||
if (q->contains(id))
|
|
||||||
return false;
|
|
||||||
q->insert(id);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Entry of the manual stack used by Prog::IsOnePass: an instruction id
// paired with the condition bits accumulated on the way to it.
struct InstCond {
  int id;         // instruction to resume at
  uint32_t cond;  // condition bits in effect for that instruction
};
|
|
||||||
|
|
||||||
// Returns whether this is a one-pass program; that is,
|
|
||||||
// returns whether it is safe to use SearchOnePass on this program.
|
|
||||||
// These conditions must be true for any instruction ip:
|
|
||||||
//
|
|
||||||
// (1) for any other Inst nip, there is at most one input-free
|
|
||||||
// path from ip to nip.
|
|
||||||
// (2) there is at most one kInstByte instruction reachable from
|
|
||||||
// ip that matches any particular byte c.
|
|
||||||
// (3) there is at most one input-free path from ip to a kInstMatch
|
|
||||||
// instruction.
|
|
||||||
//
|
|
||||||
// This is actually just a conservative approximation: it might
|
|
||||||
// return false when the answer is true, when kInstEmptyWidth
|
|
||||||
// instructions are involved.
|
|
||||||
// Constructs and saves corresponding one-pass NFA on success.
|
|
||||||
bool Prog::IsOnePass() {
|
|
||||||
if (did_onepass_)
|
|
||||||
return onepass_nodes_ != NULL;
|
|
||||||
did_onepass_ = true;
|
|
||||||
|
|
||||||
if (start() == 0) // no match
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// Steal memory for the one-pass NFA from the overall DFA budget.
|
|
||||||
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
|
||||||
// Limit max node count to 65000 as a conservative estimate to
|
|
||||||
// avoid overflowing 16-bit node index in encoding.
|
|
||||||
int maxnodes = 2 + inst_count(kInstByteRange);
|
|
||||||
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
|
||||||
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
// Flood the graph starting at the start state, and check
|
|
||||||
// that in each reachable state, each possible byte leads
|
|
||||||
// to a unique next state.
|
|
||||||
int stacksize = inst_count(kInstCapture) +
|
|
||||||
inst_count(kInstEmptyWidth) +
|
|
||||||
inst_count(kInstNop) + 1; // + 1 for start inst
|
|
||||||
InstCond* stack = new InstCond[stacksize];
|
|
||||||
|
|
||||||
int size = this->size();
|
|
||||||
int* nodebyid = new int[size]; // indexed by ip
|
|
||||||
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
|
||||||
|
|
||||||
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
|
|
||||||
// unnecessarily optimistic: why allocate a large amount of memory
|
|
||||||
// upfront for a large program when it is unlikely to be one-pass?
|
|
||||||
std::vector<uint8_t> nodes;
|
|
||||||
|
|
||||||
Instq tovisit(size), workq(size);
|
|
||||||
AddQ(&tovisit, start());
|
|
||||||
nodebyid[start()] = 0;
|
|
||||||
int nalloc = 1;
|
|
||||||
nodes.insert(nodes.end(), statesize, 0);
|
|
||||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
|
||||||
int id = *it;
|
|
||||||
int nodeindex = nodebyid[id];
|
|
||||||
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
|
||||||
|
|
||||||
// Flood graph using manual stack, filling in actions as found.
|
|
||||||
// Default is none.
|
|
||||||
for (int b = 0; b < bytemap_range_; b++)
|
|
||||||
node->action[b] = kImpossible;
|
|
||||||
node->matchcond = kImpossible;
|
|
||||||
|
|
||||||
workq.clear();
|
|
||||||
bool matched = false;
|
|
||||||
int nstack = 0;
|
|
||||||
stack[nstack].id = id;
|
|
||||||
stack[nstack++].cond = 0;
|
|
||||||
while (nstack > 0) {
|
|
||||||
int id = stack[--nstack].id;
|
|
||||||
uint32_t cond = stack[nstack].cond;
|
|
||||||
|
|
||||||
Loop:
|
|
||||||
Prog::Inst* ip = inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
|
||||||
// Should implement it in this engine, but it's subtle.
|
|
||||||
DCHECK(!ip->last());
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
|
||||||
if (!AddQ(&workq, id+1))
|
|
||||||
goto fail;
|
|
||||||
id = id+1;
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstByteRange: {
|
|
||||||
int nextindex = nodebyid[ip->out()];
|
|
||||||
if (nextindex == -1) {
|
|
||||||
if (nalloc >= maxnodes) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << StringPrintf(
|
|
||||||
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
nextindex = nalloc;
|
|
||||||
AddQ(&tovisit, ip->out());
|
|
||||||
nodebyid[ip->out()] = nalloc;
|
|
||||||
nalloc++;
|
|
||||||
nodes.insert(nodes.end(), statesize, 0);
|
|
||||||
// Update node because it might have been invalidated.
|
|
||||||
node = IndexToNode(nodes.data(), statesize, nodeindex);
|
|
||||||
}
|
|
||||||
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
|
||||||
int b = bytemap_[c];
|
|
||||||
// Skip any bytes immediately after c that are also in b.
|
|
||||||
while (c < 256-1 && bytemap_[c+1] == b)
|
|
||||||
c++;
|
|
||||||
uint32_t act = node->action[b];
|
|
||||||
uint32_t newact = (nextindex << kIndexShift) | cond;
|
|
||||||
if (matched)
|
|
||||||
newact |= kMatchWins;
|
|
||||||
if ((act & kImpossible) == kImpossible) {
|
|
||||||
node->action[b] = newact;
|
|
||||||
} else if (act != newact) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << StringPrintf(
|
|
||||||
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ip->foldcase()) {
|
|
||||||
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
|
||||||
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
|
||||||
for (int c = lo; c <= hi; c++) {
|
|
||||||
int b = bytemap_[c];
|
|
||||||
// Skip any bytes immediately after c that are also in b.
|
|
||||||
while (c < 256-1 && bytemap_[c+1] == b)
|
|
||||||
c++;
|
|
||||||
uint32_t act = node->action[b];
|
|
||||||
uint32_t newact = (nextindex << kIndexShift) | cond;
|
|
||||||
if (matched)
|
|
||||||
newact |= kMatchWins;
|
|
||||||
if ((act & kImpossible) == kImpossible) {
|
|
||||||
node->action[b] = newact;
|
|
||||||
} else if (act != newact) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << StringPrintf(
|
|
||||||
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ip->last())
|
|
||||||
break;
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
|
||||||
if (!AddQ(&workq, id+1))
|
|
||||||
goto fail;
|
|
||||||
id = id+1;
|
|
||||||
goto Loop;
|
|
||||||
}
|
|
||||||
|
|
||||||
case kInstCapture:
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
case kInstNop:
|
|
||||||
if (!ip->last()) {
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
|
||||||
if (!AddQ(&workq, id+1))
|
|
||||||
goto fail;
|
|
||||||
stack[nstack].id = id+1;
|
|
||||||
stack[nstack++].cond = cond;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
|
|
||||||
cond |= (1 << kCapShift) << ip->cap();
|
|
||||||
if (ip->opcode() == kInstEmptyWidth)
|
|
||||||
cond |= ip->empty();
|
|
||||||
|
|
||||||
// kInstCapture and kInstNop always proceed to ip->out().
|
|
||||||
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
|
||||||
// but as a conservative approximation we assume it always does.
|
|
||||||
// We could be a little more precise by looking at what c
|
|
||||||
// is, but that seems like overkill.
|
|
||||||
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
|
||||||
if (!AddQ(&workq, ip->out())) {
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << StringPrintf(
|
|
||||||
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
id = ip->out();
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
if (matched) {
|
|
||||||
// (3) is violated
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << StringPrintf(
|
|
||||||
"Not OnePass: multiple matches from %d\n", *it);
|
|
||||||
goto fail;
|
|
||||||
}
|
|
||||||
matched = true;
|
|
||||||
node->matchcond = cond;
|
|
||||||
|
|
||||||
if (ip->last())
|
|
||||||
break;
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
|
||||||
if (!AddQ(&workq, id+1))
|
|
||||||
goto fail;
|
|
||||||
id = id+1;
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstFail:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
|
||||||
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
|
|
||||||
LOG(ERROR) << "prog:\n" << Dump();
|
|
||||||
|
|
||||||
std::map<int, int> idmap;
|
|
||||||
for (int i = 0; i < size; i++)
|
|
||||||
if (nodebyid[i] != -1)
|
|
||||||
idmap[nodebyid[i]] = i;
|
|
||||||
|
|
||||||
string dump;
|
|
||||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
|
||||||
int id = *it;
|
|
||||||
int nodeindex = nodebyid[id];
|
|
||||||
if (nodeindex == -1)
|
|
||||||
continue;
|
|
||||||
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
|
||||||
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
|
||||||
nodeindex, id, node->matchcond);
|
|
||||||
for (int i = 0; i < bytemap_range_; i++) {
|
|
||||||
if ((node->action[i] & kImpossible) == kImpossible)
|
|
||||||
continue;
|
|
||||||
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
|
|
||||||
i, node->action[i] & 0xFFFF,
|
|
||||||
node->action[i] >> kIndexShift,
|
|
||||||
idmap[node->action[i] >> kIndexShift]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
LOG(ERROR) << "nodes:\n" << dump;
|
|
||||||
}
|
|
||||||
|
|
||||||
dfa_mem_ -= nalloc*statesize;
|
|
||||||
onepass_nodes_ = new uint8_t[nalloc*statesize];
|
|
||||||
memmove(onepass_nodes_, nodes.data(), nalloc*statesize);
|
|
||||||
|
|
||||||
delete[] stack;
|
|
||||||
delete[] nodebyid;
|
|
||||||
return true;
|
|
||||||
|
|
||||||
fail:
|
|
||||||
delete[] stack;
|
|
||||||
delete[] nodebyid;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
File diff suppressed because it is too large
Load Diff
@ -1,119 +0,0 @@
|
|||||||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
|
|
||||||
// make_perl_groups.pl >perl_groups.cc
|
|
||||||
|
|
||||||
#include "re2/unicode_groups.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
static const URange16 code1[] = { /* \d */
|
|
||||||
{ 0x30, 0x39 },
|
|
||||||
};
|
|
||||||
static const URange16 code2[] = { /* \s */
|
|
||||||
{ 0x9, 0xa },
|
|
||||||
{ 0xc, 0xd },
|
|
||||||
{ 0x20, 0x20 },
|
|
||||||
};
|
|
||||||
static const URange16 code3[] = { /* \w */
|
|
||||||
{ 0x30, 0x39 },
|
|
||||||
{ 0x41, 0x5a },
|
|
||||||
{ 0x5f, 0x5f },
|
|
||||||
{ 0x61, 0x7a },
|
|
||||||
};
|
|
||||||
const UGroup perl_groups[] = {
|
|
||||||
{ "\\d", +1, code1, 1 },
|
|
||||||
{ "\\D", -1, code1, 1 },
|
|
||||||
{ "\\s", +1, code2, 3 },
|
|
||||||
{ "\\S", -1, code2, 3 },
|
|
||||||
{ "\\w", +1, code3, 4 },
|
|
||||||
{ "\\W", -1, code3, 4 },
|
|
||||||
};
|
|
||||||
const int num_perl_groups = 6;
|
|
||||||
static const URange16 code4[] = { /* [:alnum:] */
|
|
||||||
{ 0x30, 0x39 },
|
|
||||||
{ 0x41, 0x5a },
|
|
||||||
{ 0x61, 0x7a },
|
|
||||||
};
|
|
||||||
static const URange16 code5[] = { /* [:alpha:] */
|
|
||||||
{ 0x41, 0x5a },
|
|
||||||
{ 0x61, 0x7a },
|
|
||||||
};
|
|
||||||
static const URange16 code6[] = { /* [:ascii:] */
|
|
||||||
{ 0x0, 0x7f },
|
|
||||||
};
|
|
||||||
static const URange16 code7[] = { /* [:blank:] */
|
|
||||||
{ 0x9, 0x9 },
|
|
||||||
{ 0x20, 0x20 },
|
|
||||||
};
|
|
||||||
static const URange16 code8[] = { /* [:cntrl:] */
|
|
||||||
{ 0x0, 0x1f },
|
|
||||||
{ 0x7f, 0x7f },
|
|
||||||
};
|
|
||||||
static const URange16 code9[] = { /* [:digit:] */
|
|
||||||
{ 0x30, 0x39 },
|
|
||||||
};
|
|
||||||
static const URange16 code10[] = { /* [:graph:] */
|
|
||||||
{ 0x21, 0x7e },
|
|
||||||
};
|
|
||||||
static const URange16 code11[] = { /* [:lower:] */
|
|
||||||
{ 0x61, 0x7a },
|
|
||||||
};
|
|
||||||
static const URange16 code12[] = { /* [:print:] */
|
|
||||||
{ 0x20, 0x7e },
|
|
||||||
};
|
|
||||||
static const URange16 code13[] = { /* [:punct:] */
|
|
||||||
{ 0x21, 0x2f },
|
|
||||||
{ 0x3a, 0x40 },
|
|
||||||
{ 0x5b, 0x60 },
|
|
||||||
{ 0x7b, 0x7e },
|
|
||||||
};
|
|
||||||
static const URange16 code14[] = { /* [:space:] */
|
|
||||||
{ 0x9, 0xd },
|
|
||||||
{ 0x20, 0x20 },
|
|
||||||
};
|
|
||||||
static const URange16 code15[] = { /* [:upper:] */
|
|
||||||
{ 0x41, 0x5a },
|
|
||||||
};
|
|
||||||
static const URange16 code16[] = { /* [:word:] */
|
|
||||||
{ 0x30, 0x39 },
|
|
||||||
{ 0x41, 0x5a },
|
|
||||||
{ 0x5f, 0x5f },
|
|
||||||
{ 0x61, 0x7a },
|
|
||||||
};
|
|
||||||
static const URange16 code17[] = { /* [:xdigit:] */
|
|
||||||
{ 0x30, 0x39 },
|
|
||||||
{ 0x41, 0x46 },
|
|
||||||
{ 0x61, 0x66 },
|
|
||||||
};
|
|
||||||
const UGroup posix_groups[] = {
|
|
||||||
{ "[:alnum:]", +1, code4, 3 },
|
|
||||||
{ "[:^alnum:]", -1, code4, 3 },
|
|
||||||
{ "[:alpha:]", +1, code5, 2 },
|
|
||||||
{ "[:^alpha:]", -1, code5, 2 },
|
|
||||||
{ "[:ascii:]", +1, code6, 1 },
|
|
||||||
{ "[:^ascii:]", -1, code6, 1 },
|
|
||||||
{ "[:blank:]", +1, code7, 2 },
|
|
||||||
{ "[:^blank:]", -1, code7, 2 },
|
|
||||||
{ "[:cntrl:]", +1, code8, 2 },
|
|
||||||
{ "[:^cntrl:]", -1, code8, 2 },
|
|
||||||
{ "[:digit:]", +1, code9, 1 },
|
|
||||||
{ "[:^digit:]", -1, code9, 1 },
|
|
||||||
{ "[:graph:]", +1, code10, 1 },
|
|
||||||
{ "[:^graph:]", -1, code10, 1 },
|
|
||||||
{ "[:lower:]", +1, code11, 1 },
|
|
||||||
{ "[:^lower:]", -1, code11, 1 },
|
|
||||||
{ "[:print:]", +1, code12, 1 },
|
|
||||||
{ "[:^print:]", -1, code12, 1 },
|
|
||||||
{ "[:punct:]", +1, code13, 4 },
|
|
||||||
{ "[:^punct:]", -1, code13, 4 },
|
|
||||||
{ "[:space:]", +1, code14, 2 },
|
|
||||||
{ "[:^space:]", -1, code14, 2 },
|
|
||||||
{ "[:upper:]", +1, code15, 1 },
|
|
||||||
{ "[:^upper:]", -1, code15, 1 },
|
|
||||||
{ "[:word:]", +1, code16, 4 },
|
|
||||||
{ "[:^word:]", -1, code16, 4 },
|
|
||||||
{ "[:xdigit:]", +1, code17, 3 },
|
|
||||||
{ "[:^xdigit:]", -1, code17, 3 },
|
|
||||||
};
|
|
||||||
const int num_posix_groups = 28;
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,711 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "re2/prefilter.h"
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/strutil.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
#include "re2/re2.h"
|
|
||||||
#include "re2/unicode_casefold.h"
|
|
||||||
#include "re2/walker-inl.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
static const bool ExtraDebug = false;
|
|
||||||
|
|
||||||
typedef std::set<string>::iterator SSIter;
|
|
||||||
typedef std::set<string>::const_iterator ConstSSIter;
|
|
||||||
|
|
||||||
// Initializes a Prefilter, allocating subs_ as necessary.
|
|
||||||
Prefilter::Prefilter(Op op) {
|
|
||||||
op_ = op;
|
|
||||||
subs_ = NULL;
|
|
||||||
if (op_ == AND || op_ == OR)
|
|
||||||
subs_ = new std::vector<Prefilter*>;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Destroys a Prefilter.
|
|
||||||
Prefilter::~Prefilter() {
|
|
||||||
if (subs_) {
|
|
||||||
for (size_t i = 0; i < subs_->size(); i++)
|
|
||||||
delete (*subs_)[i];
|
|
||||||
delete subs_;
|
|
||||||
subs_ = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Simplify if the node is an empty Or or And.
|
|
||||||
Prefilter* Prefilter::Simplify() {
|
|
||||||
if (op_ != AND && op_ != OR) {
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Nothing left in the AND/OR.
|
|
||||||
if (subs_->empty()) {
|
|
||||||
if (op_ == AND)
|
|
||||||
op_ = ALL; // AND of nothing is true
|
|
||||||
else
|
|
||||||
op_ = NONE; // OR of nothing is false
|
|
||||||
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Just one subnode: throw away wrapper.
|
|
||||||
if (subs_->size() == 1) {
|
|
||||||
Prefilter* a = (*subs_)[0];
|
|
||||||
subs_->clear();
|
|
||||||
delete this;
|
|
||||||
return a->Simplify();
|
|
||||||
}
|
|
||||||
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Combines two Prefilters together to create an "op" (AND or OR).
|
|
||||||
// The passed Prefilters will be part of the returned Prefilter or deleted.
|
|
||||||
// Does lots of work to avoid creating unnecessarily complicated structures.
|
|
||||||
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
|
|
||||||
// If a, b can be rewritten as op, do so.
|
|
||||||
a = a->Simplify();
|
|
||||||
b = b->Simplify();
|
|
||||||
|
|
||||||
// Canonicalize: a->op <= b->op.
|
|
||||||
if (a->op() > b->op()) {
|
|
||||||
Prefilter* t = a;
|
|
||||||
a = b;
|
|
||||||
b = t;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trivial cases.
|
|
||||||
// ALL AND b = b
|
|
||||||
// NONE OR b = b
|
|
||||||
// ALL OR b = ALL
|
|
||||||
// NONE AND b = NONE
|
|
||||||
// Don't need to look at b, because of canonicalization above.
|
|
||||||
// ALL and NONE are smallest opcodes.
|
|
||||||
if (a->op() == ALL || a->op() == NONE) {
|
|
||||||
if ((a->op() == ALL && op == AND) ||
|
|
||||||
(a->op() == NONE && op == OR)) {
|
|
||||||
delete a;
|
|
||||||
return b;
|
|
||||||
} else {
|
|
||||||
delete b;
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If a and b match op, merge their contents.
|
|
||||||
if (a->op() == op && b->op() == op) {
|
|
||||||
for (size_t i = 0; i < b->subs()->size(); i++) {
|
|
||||||
Prefilter* bb = (*b->subs())[i];
|
|
||||||
a->subs()->push_back(bb);
|
|
||||||
}
|
|
||||||
b->subs()->clear();
|
|
||||||
delete b;
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If a already has the same op as the op that is under construction
|
|
||||||
// add in b (similarly if b already has the same op, add in a).
|
|
||||||
if (b->op() == op) {
|
|
||||||
Prefilter* t = a;
|
|
||||||
a = b;
|
|
||||||
b = t;
|
|
||||||
}
|
|
||||||
if (a->op() == op) {
|
|
||||||
a->subs()->push_back(b);
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Otherwise just return the op.
|
|
||||||
Prefilter* c = new Prefilter(op);
|
|
||||||
c->subs()->push_back(a);
|
|
||||||
c->subs()->push_back(b);
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the conjunction of a and b.  Takes ownership of both arguments;
// see AndOr for the simplifications applied.
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  const Op kind = AND;
  return AndOr(kind, a, b);
}
|
|
||||||
|
|
||||||
// Returns the disjunction of a and b.  Takes ownership of both arguments;
// see AndOr for the simplifications applied.
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  const Op kind = OR;
  return AndOr(kind, a, b);
}
|
|
||||||
|
|
||||||
static void SimplifyStringSet(std::set<string> *ss) {
|
|
||||||
// Now make sure that the strings aren't redundant. For example, if
|
|
||||||
// we know "ab" is a required string, then it doesn't help at all to
|
|
||||||
// know that "abc" is also a required string, so delete "abc". This
|
|
||||||
// is because, when we are performing a string search to filter
|
|
||||||
// regexps, matching ab will already allow this regexp to be a
|
|
||||||
// candidate for match, so further matching abc is redundant.
|
|
||||||
|
|
||||||
for (SSIter i = ss->begin(); i != ss->end(); ++i) {
|
|
||||||
SSIter j = i;
|
|
||||||
++j;
|
|
||||||
while (j != ss->end()) {
|
|
||||||
// Increment j early so that we can erase the element it points to.
|
|
||||||
SSIter old_j = j;
|
|
||||||
++j;
|
|
||||||
if (old_j->find(*i) != string::npos)
|
|
||||||
ss->erase(old_j);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Builds an OR over the strings in *ss after simplifying the set in place.
// Returns NULL when the simplified set is empty.
Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
  SimplifyStringSet(ss);
  if (ss->empty())
    return NULL;

  // Start from NONE (the identity for OR) and fold each atom in.
  Prefilter* result = new Prefilter(NONE);
  SSIter it = ss->begin();
  while (it != ss->end()) {
    result = Or(result, FromString(*it));
    ++it;
  }
  return result;
}
|
|
||||||
|
|
||||||
// Maps r to lower case.  Runes below Runeself take the ASCII fast path;
// everything else goes through the unicode_tolower case-folding table.
// A rune with no applicable fold entry is returned unchanged.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold* fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    if (fold != NULL && r >= fold->lo)
      return ApplyFold(fold, r);
    return r;
  }

  const bool is_ascii_upper = ('A' <= r && r <= 'Z');
  return is_ascii_upper ? r + ('a' - 'A') : r;
}
|
|
||||||
|
|
||||||
// Lower-cases r for Latin-1 input: only 'A'..'Z' are folded, every other
// value is returned as is.
static Rune ToLowerRuneLatin1(Rune r) {
  const bool is_ascii_upper = (r >= 'A' && r <= 'Z');
  return is_ascii_upper ? r + ('a' - 'A') : r;
}
|
|
||||||
|
|
||||||
// Returns a new ATOM node whose atom is a copy of str.
// The caller owns the returned node.
Prefilter* Prefilter::FromString(const string& str) {
  Prefilter* atom_node = new Prefilter(ATOM);
  atom_node->atom_ = str;
  return atom_node;
}
|
|
||||||
|
|
||||||
// Information about a regexp used during computation of Prefilter.
|
|
||||||
// Can be thought of as information about the set of strings matching
|
|
||||||
// the given regular expression.
|
|
||||||
class Prefilter::Info {
|
|
||||||
public:
|
|
||||||
Info();
|
|
||||||
~Info();
|
|
||||||
|
|
||||||
// More constructors. They delete their Info* arguments.
|
|
||||||
static Info* Alt(Info* a, Info* b);
|
|
||||||
static Info* Concat(Info* a, Info* b);
|
|
||||||
static Info* And(Info* a, Info* b);
|
|
||||||
static Info* Star(Info* a);
|
|
||||||
static Info* Plus(Info* a);
|
|
||||||
static Info* Quest(Info* a);
|
|
||||||
static Info* EmptyString();
|
|
||||||
static Info* NoMatch();
|
|
||||||
static Info* AnyChar();
|
|
||||||
static Info* CClass(CharClass* cc, bool latin1);
|
|
||||||
static Info* Literal(Rune r);
|
|
||||||
static Info* LiteralLatin1(Rune r);
|
|
||||||
static Info* AnyMatch();
|
|
||||||
|
|
||||||
// Format Info as a string.
|
|
||||||
string ToString();
|
|
||||||
|
|
||||||
// Caller takes ownership of the Prefilter.
|
|
||||||
Prefilter* TakeMatch();
|
|
||||||
|
|
||||||
std::set<string>& exact() { return exact_; }
|
|
||||||
|
|
||||||
bool is_exact() const { return is_exact_; }
|
|
||||||
|
|
||||||
class Walker;
|
|
||||||
|
|
||||||
private:
|
|
||||||
std::set<string> exact_;
|
|
||||||
|
|
||||||
// When is_exact_ is true, the strings that match
|
|
||||||
// are placed in exact_. When it is no longer an exact
|
|
||||||
// set of strings that match this RE, then is_exact_
|
|
||||||
// is false and the match_ contains the required match
|
|
||||||
// criteria.
|
|
||||||
bool is_exact_;
|
|
||||||
|
|
||||||
// Accumulated Prefilter query that any
|
|
||||||
// match for this regexp is guaranteed to match.
|
|
||||||
Prefilter* match_;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// A fresh Info is inexact, has no match condition and an empty exact set.
Prefilter::Info::Info() : is_exact_(false), match_(NULL) {}
|
|
||||||
|
|
||||||
// Deletes the match condition if it is still held (TakeMatch() resets
// match_ to NULL when it hands the Prefilter to a caller).
Prefilter::Info::~Info() {
  Prefilter* owned = match_;
  delete owned;
}
|
|
||||||
|
|
||||||
// Transfers the match condition to the caller.
// An exact Info is first converted to an OR over its strings (which also
// simplifies exact_ in place) and is marked inexact.  After the call match_
// is NULL, so the destructor will not delete the returned Prefilter.
Prefilter* Prefilter::Info::TakeMatch() {
  Prefilter* taken;
  if (is_exact_) {
    taken = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  } else {
    taken = match_;
  }
  match_ = NULL;
  return taken;
}
|
|
||||||
|
|
||||||
// Format a Info in string form.
|
|
||||||
string Prefilter::Info::ToString() {
|
|
||||||
if (is_exact_) {
|
|
||||||
int n = 0;
|
|
||||||
string s;
|
|
||||||
for (std::set<string>::iterator i = exact_.begin();
|
|
||||||
i != exact_.end();
|
|
||||||
++i) {
|
|
||||||
if (n++ > 0)
|
|
||||||
s += ",";
|
|
||||||
s += *i;
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (match_)
|
|
||||||
return match_->DebugString();
|
|
||||||
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add the strings from src to dst.
|
|
||||||
static void CopyIn(const std::set<string>& src,
|
|
||||||
std::set<string>* dst) {
|
|
||||||
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
|
|
||||||
dst->insert(*i);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add the cross-product of a and b to dst.
|
|
||||||
// (For each string i in a and j in b, add i+j.)
|
|
||||||
static void CrossProduct(const std::set<string>& a,
|
|
||||||
const std::set<string>& b,
|
|
||||||
std::set<string>* dst) {
|
|
||||||
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
|
|
||||||
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
|
|
||||||
dst->insert(*i + *j);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Concats a and b. Requires that both are exact sets.
|
|
||||||
// Forms an exact set that is a crossproduct of a and b.
|
|
||||||
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
|
|
||||||
if (a == NULL)
|
|
||||||
return b;
|
|
||||||
DCHECK(a->is_exact_);
|
|
||||||
DCHECK(b && b->is_exact_);
|
|
||||||
Info *ab = new Info();
|
|
||||||
|
|
||||||
CrossProduct(a->exact_, b->exact_, &ab->exact_);
|
|
||||||
ab->is_exact_ = true;
|
|
||||||
|
|
||||||
delete a;
|
|
||||||
delete b;
|
|
||||||
return ab;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs an inexact Info for ab given a and b.
|
|
||||||
// Used only when a or b is not exact or when the
|
|
||||||
// exact cross product is likely to be too big.
|
|
||||||
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
|
|
||||||
if (a == NULL)
|
|
||||||
return b;
|
|
||||||
if (b == NULL)
|
|
||||||
return a;
|
|
||||||
|
|
||||||
Info *ab = new Info();
|
|
||||||
|
|
||||||
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
|
|
||||||
ab->is_exact_ = false;
|
|
||||||
delete a;
|
|
||||||
delete b;
|
|
||||||
return ab;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Info for a|b given a and b.
|
|
||||||
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
|
|
||||||
Info *ab = new Info();
|
|
||||||
|
|
||||||
if (a->is_exact_ && b->is_exact_) {
|
|
||||||
CopyIn(a->exact_, &ab->exact_);
|
|
||||||
CopyIn(b->exact_, &ab->exact_);
|
|
||||||
ab->is_exact_ = true;
|
|
||||||
} else {
|
|
||||||
// Either a or b has is_exact_ = false. If the other
|
|
||||||
// one has is_exact_ = true, we move it to match_ and
|
|
||||||
// then create a OR of a,b. The resulting Info has
|
|
||||||
// is_exact_ = false.
|
|
||||||
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
|
|
||||||
ab->is_exact_ = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
delete a;
|
|
||||||
delete b;
|
|
||||||
return ab;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Info for a? given a.
|
|
||||||
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
|
|
||||||
Info *ab = new Info();
|
|
||||||
|
|
||||||
ab->is_exact_ = false;
|
|
||||||
ab->match_ = new Prefilter(ALL);
|
|
||||||
delete a;
|
|
||||||
return ab;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Info for a* given a.
|
|
||||||
// Same as a? -- not much to do.
|
|
||||||
Prefilter::Info* Prefilter::Info::Star(Info *a) {
|
|
||||||
return Quest(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Info for a+ given a. If a was exact set, it isn't
|
|
||||||
// anymore.
|
|
||||||
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
|
|
||||||
Info *ab = new Info();
|
|
||||||
|
|
||||||
ab->match_ = a->TakeMatch();
|
|
||||||
ab->is_exact_ = false;
|
|
||||||
|
|
||||||
delete a;
|
|
||||||
return ab;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Encodes r with runetochar and returns the resulting bytes as a string.
static string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int length = runetochar(encoded, &r);
  return string(encoded, length);
}
|
|
||||||
|
|
||||||
// Returns a one-byte string holding the low 8 bits of r.
static string RuneToStringLatin1(Rune r) {
  const char byte = r & 0xff;
  return string(1, byte);
}
|
|
||||||
|
|
||||||
// Constructs Info for literal rune.
|
|
||||||
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
|
|
||||||
Info* info = new Info();
|
|
||||||
info->exact_.insert(RuneToString(ToLowerRune(r)));
|
|
||||||
info->is_exact_ = true;
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Info for literal rune for Latin1 encoded string.
|
|
||||||
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
|
|
||||||
Info* info = new Info();
|
|
||||||
info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
|
|
||||||
info->is_exact_ = true;
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Info for dot (any character).
|
|
||||||
Prefilter::Info* Prefilter::Info::AnyChar() {
|
|
||||||
Prefilter::Info* info = new Prefilter::Info();
|
|
||||||
info->match_ = new Prefilter(ALL);
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Prefilter::Info for no possible match.
|
|
||||||
Prefilter::Info* Prefilter::Info::NoMatch() {
|
|
||||||
Prefilter::Info* info = new Prefilter::Info();
|
|
||||||
info->match_ = new Prefilter(NONE);
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Prefilter::Info for any possible match.
|
|
||||||
// This Prefilter::Info is valid for any regular expression,
|
|
||||||
// since it makes no assertions whatsoever about the
|
|
||||||
// strings being matched.
|
|
||||||
Prefilter::Info* Prefilter::Info::AnyMatch() {
|
|
||||||
Prefilter::Info *info = new Prefilter::Info();
|
|
||||||
info->match_ = new Prefilter(ALL);
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Prefilter::Info for just the empty string.
|
|
||||||
Prefilter::Info* Prefilter::Info::EmptyString() {
|
|
||||||
Prefilter::Info* info = new Prefilter::Info();
|
|
||||||
info->is_exact_ = true;
|
|
||||||
info->exact_.insert("");
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs Prefilter::Info for a character class.
|
|
||||||
typedef CharClass::iterator CCIter;
|
|
||||||
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
|
||||||
bool latin1) {
|
|
||||||
if (ExtraDebug) {
|
|
||||||
LOG(ERROR) << "CharClassInfo:";
|
|
||||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
|
||||||
LOG(ERROR) << " " << i->lo << "-" << i->hi;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the class is too large, it's okay to overestimate.
|
|
||||||
if (cc->size() > 10)
|
|
||||||
return AnyChar();
|
|
||||||
|
|
||||||
Prefilter::Info *a = new Prefilter::Info();
|
|
||||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
|
||||||
for (Rune r = i->lo; r <= i->hi; r++) {
|
|
||||||
if (latin1) {
|
|
||||||
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
|
|
||||||
} else {
|
|
||||||
a->exact_.insert(RuneToString(ToLowerRune(r)));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
a->is_exact_ = true;
|
|
||||||
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << " = " << a->ToString();
|
|
||||||
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Regexp walker that computes a Prefilter::Info for each visited node.
// latin1 selects how literal runes are turned into strings.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
 public:
  Walker(bool latin1) : latin1_(latin1) {}

  // Combines the Infos of re's children into the Info for re.
  virtual Info* PostVisit(Regexp* re,
                          Info* parent_arg,
                          Info* pre_arg,
                          Info** child_args,
                          int nchild_args);

  // Produces the Info for a node whose subtree is not walked.
  virtual Info* ShortVisit(Regexp* re, Info* parent_arg);

  bool latin1() { return latin1_; }

 private:
  bool latin1_;

  // Not copyable.
  Walker(const Walker&) = delete;
  Walker& operator=(const Walker&) = delete;
};
|
|
||||||
|
|
||||||
// Walks re and returns the Info computed for it.  The caller owns the
// result.  Returns NULL if the walk stopped early (visit budget exhausted).
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (ExtraDebug)
    LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();

  const int max_visits = 100000;
  const bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
  Prefilter::Info::Walker walker(latin1);
  Prefilter::Info* result = walker.WalkExponential(re, NULL, max_visits);

  if (!walker.stopped_early())
    return result;

  // The walk was cut short: discard whatever was computed.
  delete result;
  return NULL;
}
|
|
||||||
|
|
||||||
// Returns AnyMatch() regardless of the arguments: an Info that makes no
// assertions, and is therefore valid for any regexp.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
    Regexp* re, Prefilter::Info* parent_arg) {
  Prefilter::Info* unconstrained = AnyMatch();
  return unconstrained;
}
|
|
||||||
|
|
||||||
// Constructs the Prefilter::Info for the given regular expression.
|
|
||||||
// Assumes re is simplified.
|
|
||||||
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
|
||||||
Regexp* re, Prefilter::Info* parent_arg,
|
|
||||||
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
|
|
||||||
int nchild_args) {
|
|
||||||
Prefilter::Info *info;
|
|
||||||
switch (re->op()) {
|
|
||||||
default:
|
|
||||||
case kRegexpRepeat:
|
|
||||||
LOG(DFATAL) << "Bad regexp op " << re->op();
|
|
||||||
info = EmptyString();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpNoMatch:
|
|
||||||
info = NoMatch();
|
|
||||||
break;
|
|
||||||
|
|
||||||
// These ops match the empty string:
|
|
||||||
case kRegexpEmptyMatch: // anywhere
|
|
||||||
case kRegexpBeginLine: // at beginning of line
|
|
||||||
case kRegexpEndLine: // at end of line
|
|
||||||
case kRegexpBeginText: // at beginning of text
|
|
||||||
case kRegexpEndText: // at end of text
|
|
||||||
case kRegexpWordBoundary: // at word boundary
|
|
||||||
case kRegexpNoWordBoundary: // not at word boundary
|
|
||||||
info = EmptyString();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpLiteral:
|
|
||||||
if (latin1()) {
|
|
||||||
info = LiteralLatin1(re->rune());
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
info = Literal(re->rune());
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
if (re->nrunes() == 0) {
|
|
||||||
info = NoMatch();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (latin1()) {
|
|
||||||
info = LiteralLatin1(re->runes()[0]);
|
|
||||||
for (int i = 1; i < re->nrunes(); i++) {
|
|
||||||
info = Concat(info, LiteralLatin1(re->runes()[i]));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
info = Literal(re->runes()[0]);
|
|
||||||
for (int i = 1; i < re->nrunes(); i++) {
|
|
||||||
info = Concat(info, Literal(re->runes()[i]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpConcat: {
|
|
||||||
// Accumulate in info.
|
|
||||||
// Exact is concat of recent contiguous exact nodes.
|
|
||||||
info = NULL;
|
|
||||||
Info* exact = NULL;
|
|
||||||
for (int i = 0; i < nchild_args; i++) {
|
|
||||||
Info* ci = child_args[i]; // child info
|
|
||||||
if (!ci->is_exact() ||
|
|
||||||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
|
|
||||||
// Exact run is over.
|
|
||||||
info = And(info, exact);
|
|
||||||
exact = NULL;
|
|
||||||
// Add this child's info.
|
|
||||||
info = And(info, ci);
|
|
||||||
} else {
|
|
||||||
// Append to exact run.
|
|
||||||
exact = Concat(exact, ci);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
info = And(info, exact);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpAlternate:
|
|
||||||
info = child_args[0];
|
|
||||||
for (int i = 1; i < nchild_args; i++)
|
|
||||||
info = Alt(info, child_args[i]);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpStar:
|
|
||||||
info = Star(child_args[0]);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpQuest:
|
|
||||||
info = Quest(child_args[0]);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpPlus:
|
|
||||||
info = Plus(child_args[0]);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpAnyChar:
|
|
||||||
// Claim nothing, except that it's not empty.
|
|
||||||
info = AnyChar();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpCharClass:
|
|
||||||
info = CClass(re->cc(), latin1());
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpCapture:
|
|
||||||
// These don't affect the set of matching strings.
|
|
||||||
info = child_args[0];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ExtraDebug)
|
|
||||||
LOG(ERROR) << "BuildInfo " << re->ToString()
|
|
||||||
<< ": " << (info ? info->ToString() : "");
|
|
||||||
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Builds a Prefilter for re.  The caller owns the result.
// Returns NULL when re is NULL, when re cannot be simplified, or when no
// Info could be built for it (see BuildInfo).
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  // Guard against a failed simplification: both BuildInfo() and Decref()
  // below dereference the pointer.
  Regexp* simple = re->Simplify();
  if (simple == NULL)
    return NULL;

  Prefilter::Info* info = BuildInfo(simple);
  simple->Decref();  // BuildInfo is done with the simplified regexp
  if (info == NULL)
    return NULL;

  // Take the Prefilter out of the Info before the Info is destroyed.
  Prefilter* m = info->TakeMatch();
  delete info;
  return m;
}
|
|
||||||
|
|
||||||
// Returns a readable rendering of the prefilter tree:
//   NONE -> "*no-matches*", ATOM -> the atom, ALL -> "",
//   AND  -> children separated by " ",
//   OR   -> children separated by "|" inside parentheses.
// A NULL child is rendered as "<nil>".
string Prefilter::DebugString() const {
  switch (op_) {
    default:
      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
      return StringPrintf("op%d", op_);

    case NONE:
      return "*no-matches*";

    case ATOM:
      return atom_;

    case ALL:
      return "";

    case AND:
    case OR: {
      // The two cases differ only in separator and in the surrounding
      // parentheses.
      const bool is_or = (op_ == OR);
      const char* separator = is_or ? "|" : " ";
      string s = is_or ? "(" : "";
      for (size_t i = 0; i < subs_->size(); i++) {
        if (i > 0)
          s += separator;
        Prefilter* sub = (*subs_)[i];
        s += sub ? sub->DebugString() : "<nil>";
      }
      if (is_or)
        s += ")";
      return s;
    }
  }
}
|
|
||||||
|
|
||||||
// Builds a Prefilter for a compiled RE2.  The caller owns the result.
// Returns NULL if re2 is NULL, if it has no Regexp, or if no Prefilter can
// be formed.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  Regexp* regexp = (re2 != NULL) ? re2->Regexp() : NULL;
  if (regexp != NULL)
    return FromRegexp(regexp);
  return NULL;
}
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,108 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_PREFILTER_H_
|
|
||||||
#define RE2_PREFILTER_H_
|
|
||||||
|
|
||||||
// Prefilter is the class used to extract string guards from regexps.
|
|
||||||
// Rather than using Prefilter class directly, use FilteredRE2.
|
|
||||||
// See filtered_re2.h
|
|
||||||
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
class RE2;
|
|
||||||
|
|
||||||
class Regexp;
|
|
||||||
|
|
||||||
class Prefilter {
|
|
||||||
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
|
|
||||||
public:
|
|
||||||
enum Op {
|
|
||||||
ALL = 0, // Everything matches
|
|
||||||
NONE, // Nothing matches
|
|
||||||
ATOM, // The string atom() must match
|
|
||||||
AND, // All in subs() must match
|
|
||||||
OR, // One of subs() must match
|
|
||||||
};
|
|
||||||
|
|
||||||
explicit Prefilter(Op op);
|
|
||||||
~Prefilter();
|
|
||||||
|
|
||||||
Op op() { return op_; }
|
|
||||||
const string& atom() const { return atom_; }
|
|
||||||
void set_unique_id(int id) { unique_id_ = id; }
|
|
||||||
int unique_id() const { return unique_id_; }
|
|
||||||
|
|
||||||
// The children of the Prefilter node.
|
|
||||||
std::vector<Prefilter*>* subs() {
|
|
||||||
DCHECK(op_ == AND || op_ == OR);
|
|
||||||
return subs_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set the children vector. Prefilter takes ownership of subs and
|
|
||||||
// subs_ will be deleted when Prefilter is deleted.
|
|
||||||
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
|
|
||||||
|
|
||||||
// Given a RE2, return a Prefilter. The caller takes ownership of
|
|
||||||
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
|
||||||
// cannot be formed.
|
|
||||||
static Prefilter* FromRE2(const RE2* re2);
|
|
||||||
|
|
||||||
// Returns a readable debug string of the prefilter.
|
|
||||||
string DebugString() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
class Info;
|
|
||||||
|
|
||||||
// Combines two prefilters together to create an AND. The passed
|
|
||||||
// Prefilters will be part of the returned Prefilter or deleted.
|
|
||||||
static Prefilter* And(Prefilter* a, Prefilter* b);
|
|
||||||
|
|
||||||
// Combines two prefilters together to create an OR. The passed
|
|
||||||
// Prefilters will be part of the returned Prefilter or deleted.
|
|
||||||
static Prefilter* Or(Prefilter* a, Prefilter* b);
|
|
||||||
|
|
||||||
// Generalized And/Or
|
|
||||||
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
|
|
||||||
|
|
||||||
static Prefilter* FromRegexp(Regexp* a);
|
|
||||||
|
|
||||||
static Prefilter* FromString(const string& str);
|
|
||||||
|
|
||||||
static Prefilter* OrStrings(std::set<string>* ss);
|
|
||||||
|
|
||||||
static Info* BuildInfo(Regexp* re);
|
|
||||||
|
|
||||||
Prefilter* Simplify();
|
|
||||||
|
|
||||||
// Kind of Prefilter.
|
|
||||||
Op op_;
|
|
||||||
|
|
||||||
// Sub-matches for AND or OR Prefilter.
|
|
||||||
std::vector<Prefilter*>* subs_;
|
|
||||||
|
|
||||||
// Actual string to match in leaf node.
|
|
||||||
string atom_;
|
|
||||||
|
|
||||||
// If different prefilters have the same string atom, or if they are
|
|
||||||
// structurally the same (e.g., OR of same atom strings) they are
|
|
||||||
// considered the same unique nodes. This is the id for each unique
|
|
||||||
// node. This field is populated with a unique id for every node,
|
|
||||||
// and -1 for duplicate nodes.
|
|
||||||
int unique_id_;
|
|
||||||
|
|
||||||
Prefilter(const Prefilter&) = delete;
|
|
||||||
Prefilter& operator=(const Prefilter&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_PREFILTER_H_
|
|
@ -1,405 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "re2/prefilter_tree.h"
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <map>
|
|
||||||
#include <memory>
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/strutil.h"
|
|
||||||
#include "re2/prefilter.h"
|
|
||||||
#include "re2/re2.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
static const bool ExtraDebug = false;
|
|
||||||
|
|
||||||
// Creates an uncompiled tree with a minimum atom length of 3.
PrefilterTree::PrefilterTree() : compiled_(false), min_atom_len_(3) {}
|
|
||||||
|
|
||||||
// Creates an uncompiled tree with the given minimum atom length.
PrefilterTree::PrefilterTree(int min_atom_len)
    : compiled_(false), min_atom_len_(min_atom_len) {}
|
|
||||||
|
|
||||||
// Deletes every prefilter that was added, then the parents map of every
// entry.
PrefilterTree::~PrefilterTree() {
  const size_t num_prefilters = prefilter_vec_.size();
  for (size_t p = 0; p != num_prefilters; ++p)
    delete prefilter_vec_[p];

  const size_t num_entries = entries_.size();
  for (size_t e = 0; e != num_entries; ++e)
    delete entries_[e].parents;
}
|
|
||||||
|
|
||||||
// Registers the prefilter of the next regexp. The position in
// prefilter_vec_ is the regexp's index, so a slot is always appended:
// either the prefilter itself or NULL when KeepNode() rejects it (in which
// case the rejected prefilter is deleted here).
void PrefilterTree::Add(Prefilter* prefilter) {
  if (compiled_) {
    LOG(DFATAL) << "Add called after Compile.";
    return;
  }

  Prefilter* kept = prefilter;
  if (kept != NULL) {
    if (!KeepNode(kept)) {
      delete kept;
      kept = NULL;
    }
  }

  prefilter_vec_.push_back(kept);
}
|
|
||||||
|
|
||||||
void PrefilterTree::Compile(std::vector<string>* atom_vec) {
|
|
||||||
if (compiled_) {
|
|
||||||
LOG(DFATAL) << "Compile called already.";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// We do this check to support some legacy uses of
|
|
||||||
// PrefilterTree that call Compile before adding any regexps,
|
|
||||||
// and expect Compile not to have effect.
|
|
||||||
if (prefilter_vec_.empty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
compiled_ = true;
|
|
||||||
|
|
||||||
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
|
|
||||||
NodeMap nodes;
|
|
||||||
AssignUniqueIds(&nodes, atom_vec);
|
|
||||||
|
|
||||||
// Identify nodes that are too common among prefilters and are
|
|
||||||
// triggering too many parents. Then get rid of them if possible.
|
|
||||||
// Note that getting rid of a prefilter node simply means they are
|
|
||||||
// no longer necessary for their parent to trigger; that is, we do
|
|
||||||
// not miss out on any regexps triggering by getting rid of a
|
|
||||||
// prefilter node.
|
|
||||||
for (size_t i = 0; i < entries_.size(); i++) {
|
|
||||||
StdIntMap* parents = entries_[i].parents;
|
|
||||||
if (parents->size() > 8) {
|
|
||||||
// This one triggers too many things. If all the parents are AND
|
|
||||||
// nodes and have other things guarding them, then get rid of
|
|
||||||
// this trigger. TODO(vsri): Adjust the threshold appropriately,
|
|
||||||
// make it a function of total number of nodes?
|
|
||||||
bool have_other_guard = true;
|
|
||||||
for (StdIntMap::iterator it = parents->begin();
|
|
||||||
it != parents->end(); ++it) {
|
|
||||||
have_other_guard = have_other_guard &&
|
|
||||||
(entries_[it->first].propagate_up_at_count > 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (have_other_guard) {
|
|
||||||
for (StdIntMap::iterator it = parents->begin();
|
|
||||||
it != parents->end(); ++it)
|
|
||||||
entries_[it->first].propagate_up_at_count -= 1;
|
|
||||||
|
|
||||||
parents->clear(); // Forget the parents
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ExtraDebug)
|
|
||||||
PrintDebugInfo(&nodes);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Looks up the node registered in `nodes` under the same NodeString() as
// `node`. Returns NULL when no such node has been registered.
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
  const NodeMap::iterator found = nodes->find(NodeString(node));
  if (found != nodes->end())
    return found->second;
  return NULL;
}
|
|
||||||
|
|
||||||
// Produces the key that identifies structurally equal nodes:
// "<op>:<atom>" for an ATOM, otherwise "<op>:" followed by the
// comma-separated unique ids of the children (which must therefore have
// been assigned already).
string PrefilterTree::NodeString(Prefilter* node) const {
  // Prefixing the operation disambiguates AND/OR/atom nodes.
  string key = StringPrintf("%d", node->op());
  key += ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  const std::vector<Prefilter*>& children = *node->subs();
  for (size_t i = 0; i < children.size(); i++) {
    if (i != 0)
      key += ',';
    key += StringPrintf("%d", children[i]->unique_id());
  }
  return key;
}
|
|
||||||
|
|
||||||
// Decides whether `node` is worth keeping in the tree.
//   NULL / ALL : never kept.
//   ATOM       : kept when the atom is at least min_atom_len_ long.
//   AND        : children that are not kept are deleted and removed in
//                place; kept when at least one child survives.
//   OR         : kept only when every child is kept; evaluation stops at
//                the first rejected child, and nothing is deleted here.
bool PrefilterTree::KeepNode(Prefilter* node) const {
  if (node == NULL)
    return false;

  switch (node->op()) {
    case Prefilter::ALL:
      return false;

    case Prefilter::ATOM:
      return node->atom().size() >= static_cast<size_t>(min_atom_len_);

    case Prefilter::AND: {
      std::vector<Prefilter*>* subs = node->subs();
      size_t kept = 0;
      // Compact the surviving children towards the front of the vector.
      for (size_t i = 0; i < subs->size(); i++) {
        Prefilter* child = (*subs)[i];
        if (KeepNode(child)) {
          (*subs)[kept] = child;
          kept++;
        } else {
          delete child;
        }
      }
      subs->resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      const std::vector<Prefilter*>& subs = *node->subs();
      for (size_t i = 0; i < subs.size(); i++) {
        if (!KeepNode(subs[i]))
          return false;
      }
      return true;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
      return false;
  }
}
|
|
||||||
|
|
||||||
// Assigns a unique id to every structurally distinct prefilter node,
// fills `atom_vec` with the distinct atoms, and builds entries_ (one
// Entry per unique node) with parent links, trigger counts and the
// regexps each top-level node triggers. `nodes` receives the mapping from
// NodeString() to the canonical node.
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
                                    std::vector<string>* atom_vec) {
  atom_vec->clear();

  // v holds every filter node, ordered top-down: roots first, then their
  // descendants level by level.
  std::vector<Prefilter*> v;

  // Seed v with the root of each regexp's prefilter. NULL roots are kept
  // too, so that index == regexp id holds for the first
  // prefilter_vec_.size() slots; their indices are recorded as unfiltered.
  for (size_t i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* root = prefilter_vec_[i];
    if (root == NULL)
      unfiltered_.push_back(static_cast<int>(i));
    v.push_back(root);
  }

  // Append all descendants. v grows while it is being scanned, so this
  // loop must stay index-based.
  for (size_t i = 0; i < v.size(); i++) {
    Prefilter* f = v[i];
    if (f == NULL)
      continue;
    if (f->op() != Prefilter::AND && f->op() != Prefilter::OR)
      continue;
    const std::vector<Prefilter*>& subs = *f->subs();
    v.insert(v.end(), subs.begin(), subs.end());
  }

  // Identify unique nodes, walking bottom-up so that children receive
  // their ids before the parents whose NodeString() depends on them.
  int next_id = 0;
  for (std::vector<Prefilter*>::reverse_iterator it = v.rbegin();
       it != v.rend(); ++it) {
    Prefilter* node = *it;
    if (node == NULL)
      continue;
    node->set_unique_id(-1);
    Prefilter* canonical = CanonicalNode(nodes, node);
    if (canonical != NULL) {
      // Duplicate: share the id of the node already registered.
      node->set_unique_id(canonical->unique_id());
      continue;
    }
    // First node with this node string; later duplicates will find it
    // as their canonical node.
    nodes->emplace(NodeString(node), node);
    if (node->op() == Prefilter::ATOM) {
      atom_vec->push_back(node->atom());
      atom_index_to_id_.push_back(next_id);
    }
    node->set_unique_id(next_id++);
  }
  entries_.resize(nodes->size());

  // Create the parent map of every canonical node's entry.
  for (std::vector<Prefilter*>::reverse_iterator it = v.rbegin();
       it != v.rend(); ++it) {
    Prefilter* prefilter = *it;
    if (prefilter == NULL)
      continue;
    if (CanonicalNode(nodes, prefilter) != prefilter)
      continue;
    entries_[prefilter->unique_id()].parents = new StdIntMap();
  }

  // Fill the entries: trigger counts and child -> parent links.
  for (std::vector<Prefilter*>::reverse_iterator it = v.rbegin();
       it != v.rend(); ++it) {
    Prefilter* prefilter = *it;
    if (prefilter == NULL)
      continue;
    if (CanonicalNode(nodes, prefilter) != prefilter)
      continue;

    Entry* entry = &entries_[prefilter->unique_id()];

    switch (prefilter->op()) {
      case Prefilter::ATOM:
        entry->propagate_up_at_count = 1;
        break;

      case Prefilter::OR:
      case Prefilter::AND: {
        std::set<int> uniq_child;
        const std::vector<Prefilter*>& subs = *prefilter->subs();
        for (size_t j = 0; j < subs.size(); j++) {
          Prefilter* canonical = CanonicalNode(nodes, subs[j]);
          if (canonical == NULL) {
            LOG(DFATAL) << "Null canonical node";
            return;
          }
          const int child_id = canonical->unique_id();
          uniq_child.insert(child_id);
          // Record this node as a parent of the child. insert() leaves
          // an already-present link untouched.
          entries_[child_id].parents->insert(
              std::make_pair(prefilter->unique_id(), 1));
        }
        // An AND waits for all of its distinct children; an OR for one.
        if (prefilter->op() == Prefilter::AND)
          entry->propagate_up_at_count = static_cast<int>(uniq_child.size());
        else
          entry->propagate_up_at_count = 1;
        break;
      }

      case Prefilter::ALL:
      default:
        LOG(DFATAL) << "Unexpected op: " << prefilter->op();
        return;
    }
  }

  // For top level nodes, populate regexp id.
  for (size_t i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* root = prefilter_vec_[i];
    if (root == NULL)
      continue;
    const int id = CanonicalNode(nodes, root)->unique_id();
    DCHECK_LE(0, id);
    entries_[id].regexps.push_back(static_cast<int>(i));
  }
}
|
|
||||||
|
|
||||||
// Functions for triggering during search.
|
|
||||||
void PrefilterTree::RegexpsGivenStrings(
|
|
||||||
const std::vector<int>& matched_atoms,
|
|
||||||
std::vector<int>* regexps) const {
|
|
||||||
regexps->clear();
|
|
||||||
if (!compiled_) {
|
|
||||||
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
|
|
||||||
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
|
|
||||||
regexps->push_back(static_cast<int>(i));
|
|
||||||
} else {
|
|
||||||
if (!prefilter_vec_.empty()) {
|
|
||||||
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
|
|
||||||
std::vector<int> matched_atom_ids;
|
|
||||||
for (size_t j = 0; j < matched_atoms.size(); j++) {
|
|
||||||
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
|
||||||
}
|
|
||||||
PropagateMatch(matched_atom_ids, ®exps_map);
|
|
||||||
for (IntMap::iterator it = regexps_map.begin();
|
|
||||||
it != regexps_map.end();
|
|
||||||
++it)
|
|
||||||
regexps->push_back(it->index());
|
|
||||||
|
|
||||||
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
std::sort(regexps->begin(), regexps->end());
|
|
||||||
}
|
|
||||||
|
|
||||||
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
|
|
||||||
IntMap* regexps) const {
|
|
||||||
IntMap count(static_cast<int>(entries_.size()));
|
|
||||||
IntMap work(static_cast<int>(entries_.size()));
|
|
||||||
for (size_t i = 0; i < atom_ids.size(); i++)
|
|
||||||
work.set(atom_ids[i], 1);
|
|
||||||
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
|
||||||
const Entry& entry = entries_[it->index()];
|
|
||||||
// Record regexps triggered.
|
|
||||||
for (size_t i = 0; i < entry.regexps.size(); i++)
|
|
||||||
regexps->set(entry.regexps[i], 1);
|
|
||||||
int c;
|
|
||||||
// Pass trigger up to parents.
|
|
||||||
for (StdIntMap::iterator it = entry.parents->begin();
|
|
||||||
it != entry.parents->end();
|
|
||||||
++it) {
|
|
||||||
int j = it->first;
|
|
||||||
const Entry& parent = entries_[j];
|
|
||||||
// Delay until all the children have succeeded.
|
|
||||||
if (parent.propagate_up_at_count > 1) {
|
|
||||||
if (count.has_index(j)) {
|
|
||||||
c = count.get_existing(j) + 1;
|
|
||||||
count.set_existing(j, c);
|
|
||||||
} else {
|
|
||||||
c = 1;
|
|
||||||
count.set_new(j, c);
|
|
||||||
}
|
|
||||||
if (c < parent.propagate_up_at_count)
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// Trigger the parent.
|
|
||||||
work.set(j, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Debugging help.
|
|
||||||
void PrefilterTree::PrintPrefilter(int regexpid) {
|
|
||||||
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dumps the compiled structure to the ERROR log: counts of unique atoms
// and nodes, the parent ids and regexp count of every entry, and the
// node-string -> unique-id mapping held in `nodes`.
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
  LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
  LOG(ERROR) << "#Unique Nodes: " << entries_.size();

  for (size_t i = 0; i < entries_.size(); ++i) {
    const Entry& entry = entries_[i];
    LOG(ERROR) << "EntryId: " << i
               << " N: " << entry.parents->size()
               << " R: " << entry.regexps.size();
    for (const StdIntMap::value_type& parent : *entry.parents) {
      LOG(ERROR) << parent.first;
    }
  }

  LOG(ERROR) << "Map:";
  for (const NodeMap::value_type& named : *nodes) {
    LOG(ERROR) << "NodeId: " << named.second->unique_id()
               << " Str: " << named.first;
  }
}
|
|
||||||
|
|
||||||
// Recursively renders `node` for debugging: an ATOM is its atom string;
// any other node is "AND(...)" or "OR(...)" listing each child as
// "<unique id>:<child rendering>", separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // Naming the operation disambiguates AND and OR nodes.
  string text = (node->op() == Prefilter::AND) ? "AND" : "OR";
  text += "(";
  const std::vector<Prefilter*>& children = *node->subs();
  for (size_t i = 0; i < children.size(); i++) {
    if (i != 0)
      text += ',';
    text += StringPrintf("%d", children[i]->unique_id());
    text += ":";
    text += DebugNodeString(children[i]);
  }
  text += ")";
  return text;
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,139 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_PREFILTER_TREE_H_
|
|
||||||
#define RE2_PREFILTER_TREE_H_
|
|
||||||
|
|
||||||
// The PrefilterTree class is used to form an AND-OR tree of strings
|
|
||||||
// that would trigger each regexp. The 'prefilter' of each regexp is
|
|
||||||
// added to PrefilterTree, and then PrefilterTree is used to find all
|
|
||||||
// the unique strings across the prefilters. During search, by using
|
|
||||||
// matches from a string matching engine, PrefilterTree deduces the
|
|
||||||
// set of regexps that are to be triggered. The 'string matching
|
|
||||||
// engine' itself is outside of this class, and the caller can use any
|
|
||||||
// favorite engine. PrefilterTree provides a set of strings (called
|
|
||||||
// atoms) that the user of this class should use to do the string
|
|
||||||
// matching.
|
|
||||||
|
|
||||||
#include <map>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/sparse_array.h"
|
|
||||||
#include "re2/prefilter.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
class PrefilterTree {
|
|
||||||
public:
|
|
||||||
PrefilterTree();
|
|
||||||
explicit PrefilterTree(int min_atom_len);
|
|
||||||
~PrefilterTree();
|
|
||||||
|
|
||||||
// Adds the prefilter for the next regexp. Note that we assume that
|
|
||||||
// Add called sequentially for all regexps. All Add calls
|
|
||||||
// must precede Compile.
|
|
||||||
void Add(Prefilter* prefilter);
|
|
||||||
|
|
||||||
// The Compile returns a vector of string in atom_vec.
|
|
||||||
// Call this after all the prefilters are added through Add.
|
|
||||||
// No calls to Add after Compile are allowed.
|
|
||||||
// The caller should use the returned set of strings to do string matching.
|
|
||||||
// Each time a string matches, the corresponding index then has to be
|
|
||||||
// and passed to RegexpsGivenStrings below.
|
|
||||||
void Compile(std::vector<string>* atom_vec);
|
|
||||||
|
|
||||||
// Given the indices of the atoms that matched, returns the indexes
|
|
||||||
// of regexps that should be searched. The matched_atoms should
|
|
||||||
// contain all the ids of string atoms that were found to match the
|
|
||||||
// content. The caller can use any string match engine to perform
|
|
||||||
// this function. This function is thread safe.
|
|
||||||
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
|
||||||
std::vector<int>* regexps) const;
|
|
||||||
|
|
||||||
// Print debug prefilter. Also prints unique ids associated with
|
|
||||||
// nodes of the prefilter of the regexp.
|
|
||||||
void PrintPrefilter(int regexpid);
|
|
||||||
|
|
||||||
private:
|
|
||||||
typedef SparseArray<int> IntMap;
|
|
||||||
typedef std::map<int, int> StdIntMap;
|
|
||||||
typedef std::map<string, Prefilter*> NodeMap;
|
|
||||||
|
|
||||||
// Each unique node has a corresponding Entry that helps in
|
|
||||||
// passing the matching trigger information along the tree.
|
|
||||||
struct Entry {
|
|
||||||
public:
|
|
||||||
// How many children should match before this node triggers the
|
|
||||||
// parent. For an atom and an OR node, this is 1 and for an AND
|
|
||||||
// node, it is the number of unique children.
|
|
||||||
int propagate_up_at_count;
|
|
||||||
|
|
||||||
// When this node is ready to trigger the parent, what are the indices
|
|
||||||
// of the parent nodes to trigger. The reason there may be more than
|
|
||||||
// one is because of sharing. For example (abc | def) and (xyz | def)
|
|
||||||
// are two different nodes, but they share the atom 'def'. So when
|
|
||||||
// 'def' matches, it triggers two parents, corresponding to the two
|
|
||||||
// different OR nodes.
|
|
||||||
StdIntMap* parents;
|
|
||||||
|
|
||||||
// When this node is ready to trigger the parent, what are the
|
|
||||||
// regexps that are triggered.
|
|
||||||
std::vector<int> regexps;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Returns true if the prefilter node should be kept.
|
|
||||||
bool KeepNode(Prefilter* node) const;
|
|
||||||
|
|
||||||
// This function assigns unique ids to various parts of the
|
|
||||||
// prefilter, by looking at if these nodes are already in the
|
|
||||||
// PrefilterTree.
|
|
||||||
void AssignUniqueIds(NodeMap* nodes, std::vector<string>* atom_vec);
|
|
||||||
|
|
||||||
// Given the matching atoms, find the regexps to be triggered.
|
|
||||||
void PropagateMatch(const std::vector<int>& atom_ids,
|
|
||||||
IntMap* regexps) const;
|
|
||||||
|
|
||||||
// Returns the prefilter node that has the same NodeString as this
|
|
||||||
// node. For the canonical node, returns node.
|
|
||||||
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
|
|
||||||
|
|
||||||
// A string that uniquely identifies the node. Assumes that the
|
|
||||||
// children of node has already been assigned unique ids.
|
|
||||||
string NodeString(Prefilter* node) const;
|
|
||||||
|
|
||||||
// Recursively constructs a readable prefilter string.
|
|
||||||
string DebugNodeString(Prefilter* node) const;
|
|
||||||
|
|
||||||
// Used for debugging.
|
|
||||||
void PrintDebugInfo(NodeMap* nodes);
|
|
||||||
|
|
||||||
// These are all the nodes formed by Compile. Essentially, there is
|
|
||||||
// one node for each unique atom and each unique AND/OR node.
|
|
||||||
std::vector<Entry> entries_;
|
|
||||||
|
|
||||||
// indices of regexps that always pass through the filter (since we
|
|
||||||
// found no required literals in these regexps).
|
|
||||||
std::vector<int> unfiltered_;
|
|
||||||
|
|
||||||
// vector of Prefilter for all regexps.
|
|
||||||
std::vector<Prefilter*> prefilter_vec_;
|
|
||||||
|
|
||||||
// Atom index in returned strings to entry id mapping.
|
|
||||||
std::vector<int> atom_index_to_id_;
|
|
||||||
|
|
||||||
// Has the prefilter tree been compiled.
|
|
||||||
bool compiled_;
|
|
||||||
|
|
||||||
// Strings less than this length are not stored as atoms.
|
|
||||||
const int min_atom_len_;
|
|
||||||
|
|
||||||
PrefilterTree(const PrefilterTree&) = delete;
|
|
||||||
PrefilterTree& operator=(const PrefilterTree&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
#endif // RE2_PREFILTER_TREE_H_
|
|
@ -1,826 +0,0 @@
|
|||||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Compiled regular expression representation.
|
|
||||||
// Tested by compile_test.cc
|
|
||||||
|
|
||||||
#include "re2/prog.h"
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <memory>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/strutil.h"
|
|
||||||
#include "re2/bitmap256.h"
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Constructors per Inst opcode
|
|
||||||
|
|
||||||
// Turns a still-blank instruction into an Alt with the two successors
// `out` and `out1`.
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
  DCHECK_EQ(out_opcode_, 0);  // Must not have been initialized yet.
  out1_ = out1;
  set_out_opcode(out, kInstAlt);
}
|
|
||||||
|
|
||||||
// Turns a still-blank instruction into a ByteRange over [lo, hi] that
// continues at `out`. Only the low 8 bits of lo, hi and foldcase are
// stored.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // Must not have been initialized yet.
  set_out_opcode(out, kInstByteRange);
  foldcase_ = foldcase & 0xFF;
  hi_ = hi & 0xFF;
  lo_ = lo & 0xFF;
}
|
|
||||||
|
|
||||||
// Turns a still-blank instruction into a Capture of register `cap` that
// continues at `out`.
void Prog::Inst::InitCapture(int cap, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // Must not have been initialized yet.
  cap_ = cap;
  set_out_opcode(out, kInstCapture);
}
|
|
||||||
|
|
||||||
// Turns a still-blank instruction into an EmptyWidth assertion for the
// `empty` flags that continues at `out`.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
  DCHECK_EQ(out_opcode_, 0);  // Must not have been initialized yet.
  empty_ = empty;
  set_out_opcode(out, kInstEmptyWidth);
}
|
|
||||||
|
|
||||||
// Turns a still-blank instruction into a Match carrying match id `id`.
void Prog::Inst::InitMatch(int32_t id) {
  DCHECK_EQ(out_opcode_, 0);  // Must not have been initialized yet.
  match_id_ = id;
  set_opcode(kInstMatch);
}
|
|
||||||
|
|
||||||
void Prog::Inst::InitNop(uint32_t out) {
|
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
|
||||||
set_opcode(kInstNop);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Turns a still-blank instruction into a Fail.
void Prog::Inst::InitFail() {
  DCHECK_EQ(out_opcode_, 0);  // Must not have been initialized yet.
  set_opcode(kInstFail);
}
|
|
||||||
|
|
||||||
// Returns a one-line, human-readable rendering of this instruction.
// Unknown opcodes are rendered as "opcode N".
string Prog::Inst::Dump() {
  switch (opcode()) {
    case kInstAlt:
      return StringPrintf("alt -> %d | %d", out(), out1_);

    case kInstAltMatch:
      return StringPrintf("altmatch -> %d | %d", out(), out1_);

    case kInstByteRange: {
      const char* fold_suffix = foldcase_ ? "/i" : "";
      return StringPrintf("byte%s [%02x-%02x] -> %d",
                          fold_suffix, lo_, hi_, out());
    }

    case kInstCapture:
      return StringPrintf("capture %d -> %d", cap_, out());

    case kInstEmptyWidth:
      return StringPrintf("emptywidth %#x -> %d",
                          static_cast<int>(empty_), out());

    case kInstMatch:
      return StringPrintf("match! %d", match_id());

    case kInstNop:
      return StringPrintf("nop -> %d", out());

    case kInstFail:
      return StringPrintf("fail");

    default:
      return StringPrintf("opcode %d", static_cast<int>(opcode()));
  }
}
|
|
||||||
|
|
||||||
// Creates an empty program: all flags cleared, all counters zero, no
// instruction array, no one-pass nodes and no DFAs. first_byte_ starts
// at -1.
Prog::Prog()
    : anchor_start_(false), anchor_end_(false), reversed_(false),
      did_flatten_(false), did_onepass_(false),
      start_(0), start_unanchored_(0),
      size_(0), bytemap_range_(0),
      first_byte_(-1),
      flags_(0), list_count_(0),
      inst_(NULL), onepass_nodes_(NULL),
      dfa_mem_(0), dfa_first_(NULL), dfa_longest_(NULL) {}
|
|
||||||
|
|
||||||
// Destroys both DFAs first, then frees the one-pass node array and the
// instruction array.
Prog::~Prog() {
  DeleteDFA(dfa_longest_);
  DeleteDFA(dfa_first_);

  delete[] onepass_nodes_;
  delete[] inst_;
}
|
|
||||||
|
|
||||||
typedef SparseSet Workq;
|
|
||||||
|
|
||||||
// Inserts instruction `id` into the work queue; id 0 is never queued.
static inline void AddToQueue(Workq* q, int id) {
  if (id == 0)
    return;
  q->insert(id);
}
|
|
||||||
|
|
||||||
// Renders every instruction reachable from the ids already in `q`, one
// per line as "<id>. <dump>". The queue is extended with successors
// while it is being walked, so end() is re-read on each iteration.
static string ProgToString(Prog* prog, Workq* q) {
  string text;
  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
    const int id = *it;
    Prog::Inst* ip = prog->inst(id);
    StringAppendF(&text, "%d. %s\n", id, ip->Dump().c_str());
    AddToQueue(q, ip->out());
    switch (ip->opcode()) {
      case kInstAlt:
      case kInstAltMatch:
        // Alternations have a second successor.
        AddToQueue(q, ip->out1());
        break;
      default:
        break;
    }
  }
  return text;
}
|
|
||||||
|
|
||||||
// Renders the instructions with ids in [start, prog->size()) in order,
// one per line. An instruction for which last() is true is written as
// "<id>. <dump>", any other as "<id>+ <dump>".
static string FlattenedProgToString(Prog* prog, int start) {
  string text;
  for (int id = start; id < prog->size(); id++) {
    Prog::Inst* ip = prog->inst(id);
    const bool is_last = ip->last();
    const string dump = ip->Dump();
    if (is_last) {
      StringAppendF(&text, "%d. %s\n", id, dump.c_str());
    } else {
      StringAppendF(&text, "%d+ %s\n", id, dump.c_str());
    }
  }
  return text;
}
|
|
||||||
|
|
||||||
// Returns a listing of the program starting at start_. A flattened
// program is listed linearly; otherwise the instructions reachable from
// start_ are listed in discovery order.
string Prog::Dump() {
  if (!did_flatten_) {
    Workq reachable(size_);
    AddToQueue(&reachable, start_);
    return ProgToString(this, &reachable);
  }
  return FlattenedProgToString(this, start_);
}
|
|
||||||
|
|
||||||
// Same as Dump(), but the listing starts at start_unanchored_.
string Prog::DumpUnanchored() {
  if (!did_flatten_) {
    Workq reachable(size_);
    AddToQueue(&reachable, start_unanchored_);
    return ProgToString(this, &reachable);
  }
  return FlattenedProgToString(this, start_unanchored_);
}
|
|
||||||
|
|
||||||
// Renders bytemap_ as maximal runs of consecutive bytes that share a
// value, one run per line: "[lo-hi] -> value" with lo/hi in hex.
string Prog::DumpByteMap() {
  string text;
  int lo = 0;
  while (lo < 256) {
    const int value = bytemap_[lo];
    // Extend the run while the next byte maps to the same value.
    int hi = lo;
    while (hi < 256 - 1 && bytemap_[hi + 1] == value)
      hi++;
    StringAppendF(&text, "[%02x-%02x] -> %d\n", lo, hi, value);
    lo = hi + 1;
  }
  return text;
}
|
|
||||||
|
|
||||||
int Prog::first_byte() {
|
|
||||||
std::call_once(first_byte_once_, [](Prog* prog) {
|
|
||||||
prog->first_byte_ = prog->ComputeFirstByte();
|
|
||||||
}, this);
|
|
||||||
return first_byte_;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool IsMatch(Prog*, Prog::Inst*);
|
|
||||||
|
|
||||||
// Peep-hole optimizer.
|
|
||||||
void Prog::Optimize() {
|
|
||||||
Workq q(size_);
|
|
||||||
|
|
||||||
// Eliminate nops. Most are taken out during compilation
|
|
||||||
// but a few are hard to avoid.
|
|
||||||
q.clear();
|
|
||||||
AddToQueue(&q, start_);
|
|
||||||
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
|
||||||
int id = *i;
|
|
||||||
|
|
||||||
Inst* ip = inst(id);
|
|
||||||
int j = ip->out();
|
|
||||||
Inst* jp;
|
|
||||||
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
|
||||||
j = jp->out();
|
|
||||||
}
|
|
||||||
ip->set_out(j);
|
|
||||||
AddToQueue(&q, ip->out());
|
|
||||||
|
|
||||||
if (ip->opcode() == kInstAlt) {
|
|
||||||
j = ip->out1();
|
|
||||||
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
|
||||||
j = jp->out();
|
|
||||||
}
|
|
||||||
ip->out1_ = j;
|
|
||||||
AddToQueue(&q, ip->out1());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Insert kInstAltMatch instructions
|
|
||||||
// Look for
|
|
||||||
// ip: Alt -> j | k
|
|
||||||
// j: ByteRange [00-FF] -> ip
|
|
||||||
// k: Match
|
|
||||||
// or the reverse (the above is the greedy one).
|
|
||||||
// Rewrite Alt to AltMatch.
|
|
||||||
q.clear();
|
|
||||||
AddToQueue(&q, start_);
|
|
||||||
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
|
||||||
int id = *i;
|
|
||||||
Inst* ip = inst(id);
|
|
||||||
AddToQueue(&q, ip->out());
|
|
||||||
if (ip->opcode() == kInstAlt)
|
|
||||||
AddToQueue(&q, ip->out1());
|
|
||||||
|
|
||||||
if (ip->opcode() == kInstAlt) {
|
|
||||||
Inst* j = inst(ip->out());
|
|
||||||
Inst* k = inst(ip->out1());
|
|
||||||
if (j->opcode() == kInstByteRange && j->out() == id &&
|
|
||||||
j->lo() == 0x00 && j->hi() == 0xFF &&
|
|
||||||
IsMatch(this, k)) {
|
|
||||||
ip->set_opcode(kInstAltMatch);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (IsMatch(this, j) &&
|
|
||||||
k->opcode() == kInstByteRange && k->out() == id &&
|
|
||||||
k->lo() == 0x00 && k->hi() == 0xFF) {
|
|
||||||
ip->set_opcode(kInstAltMatch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Is ip a guaranteed match at end of text, perhaps after some capturing?
|
|
||||||
// Is ip a guaranteed match at end of text, perhaps after some capturing?
// Follows Capture and Nop instructions until it reaches either a Match
// (true) or any other instruction (false).
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
  for (;;) {
    switch (ip->opcode()) {
      case kInstMatch:
        return true;

      case kInstCapture:
      case kInstNop:
        // Transparent for this purpose: keep following the chain.
        ip = prog->inst(ip->out());
        break;

      case kInstAlt:
      case kInstAltMatch:
      case kInstByteRange:
      case kInstFail:
      case kInstEmptyWidth:
        return false;

      default:
        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
        return false;
    }
  }
}
|
|
||||||
|
|
||||||
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
|
||||||
int flags = 0;
|
|
||||||
|
|
||||||
// ^ and \A
|
|
||||||
if (p == text.begin())
|
|
||||||
flags |= kEmptyBeginText | kEmptyBeginLine;
|
|
||||||
else if (p[-1] == '\n')
|
|
||||||
flags |= kEmptyBeginLine;
|
|
||||||
|
|
||||||
// $ and \z
|
|
||||||
if (p == text.end())
|
|
||||||
flags |= kEmptyEndText | kEmptyEndLine;
|
|
||||||
else if (p < text.end() && p[0] == '\n')
|
|
||||||
flags |= kEmptyEndLine;
|
|
||||||
|
|
||||||
// \b and \B
|
|
||||||
if (p == text.begin() && p == text.end()) {
|
|
||||||
// no word boundary here
|
|
||||||
} else if (p == text.begin()) {
|
|
||||||
if (IsWordChar(p[0]))
|
|
||||||
flags |= kEmptyWordBoundary;
|
|
||||||
} else if (p == text.end()) {
|
|
||||||
if (IsWordChar(p[-1]))
|
|
||||||
flags |= kEmptyWordBoundary;
|
|
||||||
} else {
|
|
||||||
if (IsWordChar(p[-1]) != IsWordChar(p[0]))
|
|
||||||
flags |= kEmptyWordBoundary;
|
|
||||||
}
|
|
||||||
if (!(flags & kEmptyWordBoundary))
|
|
||||||
flags |= kEmptyNonWordBoundary;
|
|
||||||
|
|
||||||
return flags;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ByteMapBuilder implements a coloring algorithm.
|
|
||||||
//
|
|
||||||
// The first phase is a series of "mark and merge" batches: we mark one or more
|
|
||||||
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
|
|
||||||
// performance; rather, it means that the ranges are treated indistinguishably.
|
|
||||||
//
|
|
||||||
// Internally, the ranges are represented using a bitmap that stores the splits
|
|
||||||
// and a vector that stores the colors; both of them are indexed by the ranges'
|
|
||||||
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
|
|
||||||
// hi (if not already split), then recolor each range in between. The color map
|
|
||||||
// (i.e. from the old color to the new color) is maintained for the lifetime of
|
|
||||||
// the batch and so underpins this somewhat obscure approach to set operations.
|
|
||||||
//
|
|
||||||
// The second phase builds the bytemap from our internal state: we recolor each
|
|
||||||
// range, then store the new color (which is now the byte class) in each of the
|
|
||||||
// corresponding array elements. Finally, we output the number of byte classes.
|
|
||||||
class ByteMapBuilder {
|
|
||||||
public:
|
|
||||||
ByteMapBuilder() {
|
|
||||||
// Initial state: the [0-255] range has color 256.
|
|
||||||
// This will avoid problems during the second phase,
|
|
||||||
// in which we assign byte classes numbered from 0.
|
|
||||||
splits_.Set(255);
|
|
||||||
colors_.resize(256);
|
|
||||||
colors_[255] = 256;
|
|
||||||
nextcolor_ = 257;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Mark(int lo, int hi);
|
|
||||||
void Merge();
|
|
||||||
void Build(uint8_t* bytemap, int* bytemap_range);
|
|
||||||
|
|
||||||
private:
|
|
||||||
int Recolor(int oldcolor);
|
|
||||||
|
|
||||||
Bitmap256 splits_;
|
|
||||||
std::vector<int> colors_;
|
|
||||||
int nextcolor_;
|
|
||||||
std::vector<std::pair<int, int>> colormap_;
|
|
||||||
std::vector<std::pair<int, int>> ranges_;
|
|
||||||
|
|
||||||
ByteMapBuilder(const ByteMapBuilder&) = delete;
|
|
||||||
ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Queues the inclusive byte range [lo-hi] for the next Merge().
// Both bounds must lie in [0, 255] and lo must not exceed hi.
void ByteMapBuilder::Mark(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  // A [0-255] range would make us recolor every range, which cannot change
  // the eventual result, so it is dropped instead of being queued.
  const bool covers_every_byte = (lo == 0 && hi == 255);
  if (!covers_every_byte)
    ranges_.emplace_back(lo, hi);
}
|
|
||||||
|
|
||||||
void ByteMapBuilder::Merge() {
|
|
||||||
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
|
|
||||||
it != ranges_.end();
|
|
||||||
++it) {
|
|
||||||
int lo = it->first-1;
|
|
||||||
int hi = it->second;
|
|
||||||
|
|
||||||
if (0 <= lo && !splits_.Test(lo)) {
|
|
||||||
splits_.Set(lo);
|
|
||||||
int next = splits_.FindNextSetBit(lo+1);
|
|
||||||
colors_[lo] = colors_[next];
|
|
||||||
}
|
|
||||||
if (!splits_.Test(hi)) {
|
|
||||||
splits_.Set(hi);
|
|
||||||
int next = splits_.FindNextSetBit(hi+1);
|
|
||||||
colors_[hi] = colors_[next];
|
|
||||||
}
|
|
||||||
|
|
||||||
int c = lo+1;
|
|
||||||
while (c < 256) {
|
|
||||||
int next = splits_.FindNextSetBit(c);
|
|
||||||
colors_[next] = Recolor(colors_[next]);
|
|
||||||
if (next == hi)
|
|
||||||
break;
|
|
||||||
c = next+1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
colormap_.clear();
|
|
||||||
ranges_.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
|
|
||||||
// Assign byte classes numbered from 0.
|
|
||||||
nextcolor_ = 0;
|
|
||||||
|
|
||||||
int c = 0;
|
|
||||||
while (c < 256) {
|
|
||||||
int next = splits_.FindNextSetBit(c);
|
|
||||||
uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
|
|
||||||
while (c <= next) {
|
|
||||||
bytemap[c] = b;
|
|
||||||
c++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*bytemap_range = nextcolor_;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ByteMapBuilder::Recolor(int oldcolor) {
|
|
||||||
// Yes, this is a linear search. There can be at most 256
|
|
||||||
// colors and there will typically be far fewer than that.
|
|
||||||
// Also, we need to consider keys *and* values in order to
|
|
||||||
// avoid recoloring a given range more than once per batch.
|
|
||||||
std::vector<std::pair<int, int>>::const_iterator it =
|
|
||||||
std::find_if(colormap_.begin(), colormap_.end(),
|
|
||||||
[=](const std::pair<int, int>& kv) -> bool {
|
|
||||||
return kv.first == oldcolor || kv.second == oldcolor;
|
|
||||||
});
|
|
||||||
if (it != colormap_.end())
|
|
||||||
return it->second;
|
|
||||||
int newcolor = nextcolor_;
|
|
||||||
nextcolor_++;
|
|
||||||
colormap_.emplace_back(oldcolor, newcolor);
|
|
||||||
return newcolor;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Prog::ComputeByteMap() {
|
|
||||||
// Fill in bytemap with byte classes for the program.
|
|
||||||
// Ranges of bytes that are treated indistinguishably
|
|
||||||
// will be mapped to a single byte class.
|
|
||||||
ByteMapBuilder builder;
|
|
||||||
|
|
||||||
// Don't repeat the work for ^ and $.
|
|
||||||
bool marked_line_boundaries = false;
|
|
||||||
// Don't repeat the work for \b and \B.
|
|
||||||
bool marked_word_boundaries = false;
|
|
||||||
|
|
||||||
for (int id = 0; id < size(); id++) {
|
|
||||||
Inst* ip = inst(id);
|
|
||||||
if (ip->opcode() == kInstByteRange) {
|
|
||||||
int lo = ip->lo();
|
|
||||||
int hi = ip->hi();
|
|
||||||
builder.Mark(lo, hi);
|
|
||||||
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
|
|
||||||
int foldlo = lo;
|
|
||||||
int foldhi = hi;
|
|
||||||
if (foldlo < 'a')
|
|
||||||
foldlo = 'a';
|
|
||||||
if (foldhi > 'z')
|
|
||||||
foldhi = 'z';
|
|
||||||
if (foldlo <= foldhi)
|
|
||||||
builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
|
|
||||||
}
|
|
||||||
// If this Inst is not the last Inst in its list AND the next Inst is
|
|
||||||
// also a ByteRange AND the Insts have the same out, defer the merge.
|
|
||||||
if (!ip->last() &&
|
|
||||||
inst(id+1)->opcode() == kInstByteRange &&
|
|
||||||
ip->out() == inst(id+1)->out())
|
|
||||||
continue;
|
|
||||||
builder.Merge();
|
|
||||||
} else if (ip->opcode() == kInstEmptyWidth) {
|
|
||||||
if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
|
|
||||||
!marked_line_boundaries) {
|
|
||||||
builder.Mark('\n', '\n');
|
|
||||||
builder.Merge();
|
|
||||||
marked_line_boundaries = true;
|
|
||||||
}
|
|
||||||
if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
|
|
||||||
!marked_word_boundaries) {
|
|
||||||
// We require two batches here: the first for ranges that are word
|
|
||||||
// characters, the second for ranges that are not word characters.
|
|
||||||
for (bool isword : {true, false}) {
|
|
||||||
int j;
|
|
||||||
for (int i = 0; i < 256; i = j) {
|
|
||||||
for (j = i + 1; j < 256 &&
|
|
||||||
Prog::IsWordChar(static_cast<uint8_t>(i)) ==
|
|
||||||
Prog::IsWordChar(static_cast<uint8_t>(j));
|
|
||||||
j++)
|
|
||||||
;
|
|
||||||
if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
|
|
||||||
builder.Mark(i, j - 1);
|
|
||||||
}
|
|
||||||
builder.Merge();
|
|
||||||
}
|
|
||||||
marked_word_boundaries = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
builder.Build(bytemap_, &bytemap_range_);
|
|
||||||
|
|
||||||
if (0) { // For debugging, use trivial bytemap.
|
|
||||||
LOG(ERROR) << "Using trivial bytemap.";
|
|
||||||
for (int i = 0; i < 256; i++)
|
|
||||||
bytemap_[i] = static_cast<uint8_t>(i);
|
|
||||||
bytemap_range_ = 256;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prog::Flatten() implements a graph rewriting algorithm.
|
|
||||||
//
|
|
||||||
// The overall process is similar to epsilon removal, but retains some epsilon
|
|
||||||
// transitions: those from Capture and EmptyWidth instructions; and those from
|
|
||||||
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
|
|
||||||
// in the worst case.) It might be best thought of as Alt instruction elision.
|
|
||||||
//
|
|
||||||
// In conceptual terms, it divides the Prog into "trees" of instructions, then
|
|
||||||
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
|
|
||||||
// is one or more instructions that grow from one "root" instruction to one or
|
|
||||||
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
|
|
||||||
// "root" is also the "leaf". In most cases, a "root" is the successor of some
|
|
||||||
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
|
|
||||||
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
|
|
||||||
// EmptyWidth or Match instruction. However, this is insufficient for handling
|
|
||||||
// nested nullable subexpressions correctly, so in some cases, a "root" is the
|
|
||||||
// dominator of the instructions reachable from some "successor root" (i.e. it
|
|
||||||
// has an unreachable predecessor) and is considered a "dominator root". Since
|
|
||||||
// only Alt instructions can be "dominator roots" (other instructions would be
|
|
||||||
// "leaves"), only Alt instructions are required to be marked as predecessors.
|
|
||||||
//
|
|
||||||
// Dividing the Prog into "trees" comprises two passes: marking the "successor
|
|
||||||
// roots" and the predecessors; and marking the "dominator roots". Sorting the
|
|
||||||
// "successor roots" by their bytecode offsets enables iteration in order from
|
|
||||||
// greatest to least during the second pass; by working backwards in this case
|
|
||||||
// and flooding the graph no further than "leaves" and already marked "roots",
|
|
||||||
// it becomes possible to mark "dominator roots" without doing excessive work.
|
|
||||||
//
|
|
||||||
// Traversing the "trees" is just iterating over the "roots" in order of their
|
|
||||||
// marking and flooding the graph no further than "leaves" and "roots". When a
|
|
||||||
// "leaf" is reached, the instruction is copied with its successor remapped to
|
|
||||||
// its "root" number. When a "root" is reached, a Nop instruction is generated
|
|
||||||
// with its successor remapped similarly. As each "list" is produced, its last
|
|
||||||
// instruction is marked as such. After all of the "lists" have been produced,
|
|
||||||
// a pass over their instructions remaps their successors to bytecode offsets.
|
|
||||||
void Prog::Flatten() {
|
|
||||||
if (did_flatten_)
|
|
||||||
return;
|
|
||||||
did_flatten_ = true;
|
|
||||||
|
|
||||||
// Scratch structures. It's important that these are reused by functions
|
|
||||||
// that we call in loops because they would thrash the heap otherwise.
|
|
||||||
SparseSet reachable(size());
|
|
||||||
std::vector<int> stk;
|
|
||||||
stk.reserve(size());
|
|
||||||
|
|
||||||
// First pass: Marks "successor roots" and predecessors.
|
|
||||||
// Builds the mapping from inst-ids to root-ids.
|
|
||||||
SparseArray<int> rootmap(size());
|
|
||||||
SparseArray<int> predmap(size());
|
|
||||||
std::vector<std::vector<int>> predvec;
|
|
||||||
MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
|
|
||||||
|
|
||||||
// Second pass: Marks "dominator roots".
|
|
||||||
SparseArray<int> sorted(rootmap);
|
|
||||||
std::sort(sorted.begin(), sorted.end(), sorted.less);
|
|
||||||
for (SparseArray<int>::const_iterator i = sorted.end() - 1;
|
|
||||||
i != sorted.begin();
|
|
||||||
--i) {
|
|
||||||
if (i->index() != start_unanchored() && i->index() != start())
|
|
||||||
MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Third pass: Emits "lists". Remaps outs to root-ids.
|
|
||||||
// Builds the mapping from root-ids to flat-ids.
|
|
||||||
std::vector<int> flatmap(rootmap.size());
|
|
||||||
std::vector<Inst> flat;
|
|
||||||
flat.reserve(size());
|
|
||||||
for (SparseArray<int>::const_iterator i = rootmap.begin();
|
|
||||||
i != rootmap.end();
|
|
||||||
++i) {
|
|
||||||
flatmap[i->value()] = static_cast<int>(flat.size());
|
|
||||||
EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
|
|
||||||
flat.back().set_last();
|
|
||||||
}
|
|
||||||
|
|
||||||
list_count_ = static_cast<int>(flatmap.size());
|
|
||||||
for (int i = 0; i < kNumInst; i++)
|
|
||||||
inst_count_[i] = 0;
|
|
||||||
|
|
||||||
// Fourth pass: Remaps outs to flat-ids.
|
|
||||||
// Counts instructions by opcode.
|
|
||||||
for (int id = 0; id < static_cast<int>(flat.size()); id++) {
|
|
||||||
Inst* ip = &flat[id];
|
|
||||||
if (ip->opcode() != kInstAltMatch) // handled in EmitList()
|
|
||||||
ip->set_out(flatmap[ip->out()]);
|
|
||||||
inst_count_[ip->opcode()]++;
|
|
||||||
}
|
|
||||||
|
|
||||||
int total = 0;
|
|
||||||
for (int i = 0; i < kNumInst; i++)
|
|
||||||
total += inst_count_[i];
|
|
||||||
DCHECK_EQ(total, static_cast<int>(flat.size()));
|
|
||||||
|
|
||||||
// Remap start_unanchored and start.
|
|
||||||
if (start_unanchored() == 0) {
|
|
||||||
DCHECK_EQ(start(), 0);
|
|
||||||
} else if (start_unanchored() == start()) {
|
|
||||||
set_start_unanchored(flatmap[1]);
|
|
||||||
set_start(flatmap[1]);
|
|
||||||
} else {
|
|
||||||
set_start_unanchored(flatmap[1]);
|
|
||||||
set_start(flatmap[2]);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finally, replace the old instructions with the new instructions.
|
|
||||||
size_ = static_cast<int>(flat.size());
|
|
||||||
delete[] inst_;
|
|
||||||
inst_ = new Inst[size_];
|
|
||||||
memmove(inst_, flat.data(), size_ * sizeof *inst_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// First pass of Flatten(): walks the instructions reachable from
// start_unanchored(), recording in rootmap the out of every ByteRange, Capture
// and EmptyWidth instruction as a "root", and recording in predmap/predvec the
// Alt and AltMatch instructions that precede each instruction.
// reachable and stk are scratch structures; both are cleared on entry.
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Mark the kInstFail instruction.
  rootmap->set_new(0, rootmap->size());

  // Mark the start_unanchored and start instructions.
  if (!rootmap->has_index(start_unanchored()))
    rootmap->set_new(start_unanchored(), rootmap->size());
  if (!rootmap->has_index(start()))
    rootmap->set_new(start(), rootmap->size());

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();

    // Follow the chain of outs starting at id until an instruction that was
    // already visited, or one that has no successor to follow, is reached.
    bool follow = true;
    while (follow && !reachable->contains(id)) {
      reachable->insert_new(id);

      Inst* ip = inst(id);
      switch (ip->opcode()) {
        case kInstAltMatch:
        case kInstAlt: {
          // Mark this instruction as a predecessor of each out.
          const int outs[] = {ip->out(), ip->out1()};
          for (int out : outs) {
            if (!predmap->has_index(out)) {
              predmap->set_new(out, static_cast<int>(predvec->size()));
              predvec->emplace_back();
            }
            (*predvec)[predmap->get_existing(out)].emplace_back(id);
          }
          // Visit out1 later; keep following out now.
          stk->push_back(ip->out1());
          id = ip->out();
          break;
        }

        case kInstByteRange:
        case kInstCapture:
        case kInstEmptyWidth:
          // Mark the out of this instruction as a "root".
          if (!rootmap->has_index(ip->out()))
            rootmap->set_new(ip->out(), rootmap->size());
          id = ip->out();
          break;

        case kInstNop:
          id = ip->out();
          break;

        case kInstMatch:
        case kInstFail:
          follow = false;
          break;

        default:
          LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
          follow = false;
          break;
      }
    }
  }
}
|
|
||||||
|
|
||||||
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
|
|
||||||
SparseArray<int>* predmap,
|
|
||||||
std::vector<std::vector<int>>* predvec,
|
|
||||||
SparseSet* reachable, std::vector<int>* stk) {
|
|
||||||
reachable->clear();
|
|
||||||
stk->clear();
|
|
||||||
stk->push_back(root);
|
|
||||||
while (!stk->empty()) {
|
|
||||||
int id = stk->back();
|
|
||||||
stk->pop_back();
|
|
||||||
Loop:
|
|
||||||
if (reachable->contains(id))
|
|
||||||
continue;
|
|
||||||
reachable->insert_new(id);
|
|
||||||
|
|
||||||
if (id != root && rootmap->has_index(id)) {
|
|
||||||
// We reached another "tree" via epsilon transition.
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Inst* ip = inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
case kInstAlt:
|
|
||||||
stk->push_back(ip->out1());
|
|
||||||
id = ip->out();
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstByteRange:
|
|
||||||
case kInstCapture:
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstNop:
|
|
||||||
id = ip->out();
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
case kInstFail:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (SparseSet::const_iterator i = reachable->begin();
|
|
||||||
i != reachable->end();
|
|
||||||
++i) {
|
|
||||||
int id = *i;
|
|
||||||
if (predmap->has_index(id)) {
|
|
||||||
for (int pred : (*predvec)[predmap->get_existing(id)]) {
|
|
||||||
if (!reachable->contains(pred)) {
|
|
||||||
// id has a predecessor that cannot be reached from root!
|
|
||||||
// Therefore, id must be a "root" too - mark it as such.
|
|
||||||
if (!rootmap->has_index(id))
|
|
||||||
rootmap->set_new(id, rootmap->size());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void Prog::EmitList(int root, SparseArray<int>* rootmap,
|
|
||||||
std::vector<Inst>* flat,
|
|
||||||
SparseSet* reachable, std::vector<int>* stk) {
|
|
||||||
reachable->clear();
|
|
||||||
stk->clear();
|
|
||||||
stk->push_back(root);
|
|
||||||
while (!stk->empty()) {
|
|
||||||
int id = stk->back();
|
|
||||||
stk->pop_back();
|
|
||||||
Loop:
|
|
||||||
if (reachable->contains(id))
|
|
||||||
continue;
|
|
||||||
reachable->insert_new(id);
|
|
||||||
|
|
||||||
if (id != root && rootmap->has_index(id)) {
|
|
||||||
// We reached another "tree" via epsilon transition. Emit a kInstNop
|
|
||||||
// instruction so that the Prog does not become quadratically larger.
|
|
||||||
flat->emplace_back();
|
|
||||||
flat->back().set_opcode(kInstNop);
|
|
||||||
flat->back().set_out(rootmap->get_existing(id));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Inst* ip = inst(id);
|
|
||||||
switch (ip->opcode()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstAltMatch:
|
|
||||||
flat->emplace_back();
|
|
||||||
flat->back().set_opcode(kInstAltMatch);
|
|
||||||
flat->back().set_out(static_cast<int>(flat->size()));
|
|
||||||
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
|
|
||||||
FALLTHROUGH_INTENDED;
|
|
||||||
|
|
||||||
case kInstAlt:
|
|
||||||
stk->push_back(ip->out1());
|
|
||||||
id = ip->out();
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstByteRange:
|
|
||||||
case kInstCapture:
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
flat->emplace_back();
|
|
||||||
memmove(&flat->back(), ip, sizeof *ip);
|
|
||||||
flat->back().set_out(rootmap->get_existing(ip->out()));
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstNop:
|
|
||||||
id = ip->out();
|
|
||||||
goto Loop;
|
|
||||||
|
|
||||||
case kInstMatch:
|
|
||||||
case kInstFail:
|
|
||||||
flat->emplace_back();
|
|
||||||
memmove(&flat->back(), ip, sizeof *ip);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,410 +0,0 @@
|
|||||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_PROG_H_
|
|
||||||
#define RE2_PROG_H_
|
|
||||||
|
|
||||||
// Compiled representation of regular expressions.
|
|
||||||
// See regexp.h for the Regexp class, which represents a regular
|
|
||||||
// expression symbolically.
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <functional>
|
|
||||||
#include <mutex>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/sparse_array.h"
|
|
||||||
#include "util/sparse_set.h"
|
|
||||||
#include "re2/re2.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Opcodes for Inst
|
|
||||||
enum InstOp {
  kInstAlt        = 0,  // choose between out_ and out1_
  kInstAltMatch   = 1,  // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
  kInstByteRange  = 2,  // next (possible case-folded) byte must be in [lo_, hi_]
  kInstCapture    = 3,  // capturing parenthesis number cap_
  kInstEmptyWidth = 4,  // empty-width special (^ $ ...); bit(s) set in empty_
  kInstMatch      = 5,  // found a match!
  kInstNop        = 6,  // no-op; occasionally unavoidable
  kInstFail       = 7,  // never match; occasionally unavoidable
  kNumInst        = 8,  // number of opcodes above
};
|
|
||||||
|
|
||||||
// Bit flags for empty-width specials
|
|
||||||
enum EmptyOp {
  kEmptyBeginLine       = 0x01,  // ^ - beginning of line
  kEmptyEndLine         = 0x02,  // $ - end of line
  kEmptyBeginText       = 0x04,  // \A - beginning of text
  kEmptyEndText         = 0x08,  // \z - end of text
  kEmptyWordBoundary    = 0x10,  // \b - word boundary
  kEmptyNonWordBoundary = 0x20,  // \B - not \b
  kEmptyAllFlags        = 0x3F,  // every flag above, OR'ed together
};
|
|
||||||
|
|
||||||
class DFA;
|
|
||||||
class Regexp;
|
|
||||||
|
|
||||||
// Compiled form of regexp program.
|
|
||||||
class Prog {
|
|
||||||
public:
|
|
||||||
Prog();
|
|
||||||
~Prog();
|
|
||||||
|
|
||||||
// Single instruction in regexp program.
|
|
||||||
class Inst {
|
|
||||||
public:
|
|
||||||
Inst() : out_opcode_(0), out1_(0) {}
|
|
||||||
|
|
||||||
// Copyable.
|
|
||||||
Inst(const Inst&) = default;
|
|
||||||
Inst& operator=(const Inst&) = default;
|
|
||||||
|
|
||||||
// Constructors per opcode
|
|
||||||
void InitAlt(uint32_t out, uint32_t out1);
|
|
||||||
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
|
|
||||||
void InitCapture(int cap, uint32_t out);
|
|
||||||
void InitEmptyWidth(EmptyOp empty, uint32_t out);
|
|
||||||
void InitMatch(int id);
|
|
||||||
void InitNop(uint32_t out);
|
|
||||||
void InitFail();
|
|
||||||
|
|
||||||
// Getters
|
|
||||||
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
|
|
||||||
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
|
||||||
int last() { return (out_opcode_>>3)&1; }
|
|
||||||
int out() { return out_opcode_>>4; }
|
|
||||||
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
|
||||||
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
|
||||||
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
|
||||||
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
|
|
||||||
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
|
||||||
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
|
||||||
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
|
||||||
|
|
||||||
bool greedy(Prog* p) {
|
|
||||||
DCHECK_EQ(opcode(), kInstAltMatch);
|
|
||||||
return p->inst(out())->opcode() == kInstByteRange ||
|
|
||||||
(p->inst(out())->opcode() == kInstNop &&
|
|
||||||
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Does this inst (an kInstByteRange) match c?
|
|
||||||
inline bool Matches(int c) {
|
|
||||||
DCHECK_EQ(opcode(), kInstByteRange);
|
|
||||||
if (foldcase_ && 'A' <= c && c <= 'Z')
|
|
||||||
c += 'a' - 'A';
|
|
||||||
return lo_ <= c && c <= hi_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns string representation for debugging.
|
|
||||||
string Dump();
|
|
||||||
|
|
||||||
// Maximum instruction id.
|
|
||||||
// (Must fit in out_opcode_. PatchList/last steal another bit.)
|
|
||||||
static const int kMaxInst = (1<<28) - 1;
|
|
||||||
|
|
||||||
private:
|
|
||||||
void set_opcode(InstOp opcode) {
|
|
||||||
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_last() {
|
|
||||||
out_opcode_ = (out()<<4) | (1<<3) | opcode();
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_out(int out) {
|
|
||||||
out_opcode_ = (out<<4) | (last()<<3) | opcode();
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_out_opcode(int out, InstOp opcode) {
|
|
||||||
out_opcode_ = (out<<4) | (last()<<3) | opcode;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
|
|
||||||
union { // additional instruction arguments:
|
|
||||||
uint32_t out1_; // opcode == kInstAlt
|
|
||||||
// alternate next instruction
|
|
||||||
|
|
||||||
int32_t cap_; // opcode == kInstCapture
|
|
||||||
// Index of capture register (holds text
|
|
||||||
// position recorded by capturing parentheses).
|
|
||||||
// For \n (the submatch for the nth parentheses),
|
|
||||||
// the left parenthesis captures into register 2*n
|
|
||||||
// and the right one captures into register 2*n+1.
|
|
||||||
|
|
||||||
int32_t match_id_; // opcode == kInstMatch
|
|
||||||
// Match ID to identify this match (for re2::Set).
|
|
||||||
|
|
||||||
struct { // opcode == kInstByteRange
|
|
||||||
uint8_t lo_; // byte range is lo_-hi_ inclusive
|
|
||||||
uint8_t hi_; //
|
|
||||||
uint8_t foldcase_; // convert A-Z to a-z before checking range.
|
|
||||||
};
|
|
||||||
|
|
||||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
|
||||||
// empty_ is bitwise OR of kEmpty* flags above.
|
|
||||||
};
|
|
||||||
|
|
||||||
friend class Compiler;
|
|
||||||
friend struct PatchList;
|
|
||||||
friend class Prog;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Whether to anchor the search.
|
|
||||||
enum Anchor {
  kUnanchored = 0,  // match anywhere
  kAnchored   = 1,  // match only starting at beginning of text
};
|
|
||||||
|
|
||||||
// Kind of match to look for (for anchor != kFullMatch)
|
|
||||||
//
|
|
||||||
// kLongestMatch mode finds the overall longest
|
|
||||||
// match but still makes its submatch choices the way
|
|
||||||
// Perl would, not in the way prescribed by POSIX.
|
|
||||||
// The POSIX rules are much more expensive to implement,
|
|
||||||
// and no one has needed them.
|
|
||||||
//
|
|
||||||
// kFullMatch is not strictly necessary -- we could use
|
|
||||||
// kLongestMatch and then check the length of the match -- but
|
|
||||||
// the matching code can run faster if it knows to consider only
|
|
||||||
// full matches.
|
|
||||||
enum MatchKind {
  kFirstMatch   = 0,  // like Perl, PCRE
  kLongestMatch = 1,  // like egrep or POSIX
  kFullMatch    = 2,  // match only entire text; implies anchor==kAnchored
  kManyMatch    = 3   // for SearchDFA, records set of matches
};
|
|
||||||
|
|
||||||
Inst *inst(int id) { return &inst_[id]; }
|
|
||||||
int start() { return start_; }
|
|
||||||
int start_unanchored() { return start_unanchored_; }
|
|
||||||
void set_start(int start) { start_ = start; }
|
|
||||||
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
|
||||||
int size() { return size_; }
|
|
||||||
bool reversed() { return reversed_; }
|
|
||||||
void set_reversed(bool reversed) { reversed_ = reversed; }
|
|
||||||
int list_count() { return list_count_; }
|
|
||||||
int inst_count(InstOp op) { return inst_count_[op]; }
|
|
||||||
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
|
|
||||||
int64_t dfa_mem() { return dfa_mem_; }
|
|
||||||
int flags() { return flags_; }
|
|
||||||
void set_flags(int flags) { flags_ = flags; }
|
|
||||||
bool anchor_start() { return anchor_start_; }
|
|
||||||
void set_anchor_start(bool b) { anchor_start_ = b; }
|
|
||||||
bool anchor_end() { return anchor_end_; }
|
|
||||||
void set_anchor_end(bool b) { anchor_end_ = b; }
|
|
||||||
int bytemap_range() { return bytemap_range_; }
|
|
||||||
const uint8_t* bytemap() { return bytemap_; }
|
|
||||||
|
|
||||||
// Lazily computed.
|
|
||||||
int first_byte();
|
|
||||||
|
|
||||||
// Returns string representation of program for debugging.
|
|
||||||
string Dump();
|
|
||||||
string DumpUnanchored();
|
|
||||||
string DumpByteMap();
|
|
||||||
|
|
||||||
// Returns the set of kEmpty flags that are in effect at
|
|
||||||
// position p within context.
|
|
||||||
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
|
|
||||||
|
|
||||||
// Returns whether byte c is a word character: ASCII only.
|
|
||||||
// Used by the implementation of \b and \B.
|
|
||||||
// This is not right for Unicode, but:
|
|
||||||
// - it's hard to get right in a byte-at-a-time matching world
|
|
||||||
// (the DFA has only one-byte lookahead).
|
|
||||||
// - even if the lookahead were possible, the Progs would be huge.
|
|
||||||
// This crude approximation is the same one PCRE uses.
|
|
||||||
static bool IsWordChar(uint8_t c) {
  // ASCII letters, ASCII digits and underscore are the only word characters.
  if ('A' <= c && c <= 'Z')
    return true;
  if ('a' <= c && c <= 'z')
    return true;
  if ('0' <= c && c <= '9')
    return true;
  return c == '_';
}
|
|
||||||
|
|
||||||
// Execution engines. They all search for the regexp (run the prog)
|
|
||||||
// in text, which is in the larger context (used for ^ $ \b etc).
|
|
||||||
// Anchor and kind control the kind of search.
|
|
||||||
// Returns true if match found, false if not.
|
|
||||||
// If match found, fills match[0..nmatch-1] with submatch info.
|
|
||||||
// match[0] is overall match, match[1] is first set of parens, etc.
|
|
||||||
// If a particular submatch is not matched during the regexp match,
|
|
||||||
// it is set to NULL.
|
|
||||||
//
|
|
||||||
// Matching text == StringPiece(NULL, 0) is treated as any other empty
|
|
||||||
// string, but note that on return, it will not be possible to distinguish
|
|
||||||
// submatches that matched that empty string from submatches that didn't
|
|
||||||
// match anything. Either way, match[i] == NULL.
|
|
||||||
|
|
||||||
// Search using NFA: can find submatches but kind of slow.
|
|
||||||
bool SearchNFA(const StringPiece& text, const StringPiece& context,
|
|
||||||
Anchor anchor, MatchKind kind,
|
|
||||||
StringPiece* match, int nmatch);
|
|
||||||
|
|
||||||
// Search using DFA: much faster than NFA but only finds
|
|
||||||
// end of match and can use a lot more memory.
|
|
||||||
// Returns whether a match was found.
|
|
||||||
// If the DFA runs out of memory, sets *failed to true and returns false.
|
|
||||||
// If matches != NULL and kind == kManyMatch and there is a match,
|
|
||||||
// SearchDFA fills matches with the match IDs of the final matching state.
|
|
||||||
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
|
||||||
Anchor anchor, MatchKind kind, StringPiece* match0,
|
|
||||||
bool* failed, SparseSet* matches);
|
|
||||||
|
|
||||||
// The callback issued after building each DFA state with BuildEntireDFA().
|
|
||||||
// If next is null, then the memory budget has been exhausted and building
|
|
||||||
// will halt. Otherwise, the state has been built and next points to an array
|
|
||||||
// of bytemap_range()+1 slots holding the next states as per the bytemap and
|
|
||||||
// kByteEndText. The number of the state is implied by the callback sequence:
|
|
||||||
// the first callback is for state 0, the second callback is for state 1, ...
|
|
||||||
// match indicates whether the state is a matching state.
|
|
||||||
using DFAStateCallback = std::function<void(const int* next, bool match)>;
|
|
||||||
|
|
||||||
// Build the entire DFA for the given match kind.
|
|
||||||
// Usually the DFA is built out incrementally, as needed, which
|
|
||||||
// avoids lots of unnecessary work.
|
|
||||||
// If cb is not empty, it receives one callback per state built.
|
|
||||||
// Returns the number of states built.
|
|
||||||
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
|
|
||||||
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
|
|
||||||
|
|
||||||
// Controls whether the DFA should bail out early if the NFA would be faster.
|
|
||||||
// FOR TESTING ONLY.
|
|
||||||
static void TEST_dfa_should_bail_when_slow(bool b);
|
|
||||||
|
|
||||||
// Compute bytemap.
|
|
||||||
void ComputeByteMap();
|
|
||||||
|
|
||||||
// Computes whether all matches must begin with the same first
|
|
||||||
// byte, and if so, returns that byte. If not, returns -1.
|
|
||||||
int ComputeFirstByte();
|
|
||||||
|
|
||||||
// Run peep-hole optimizer on program.
|
|
||||||
void Optimize();
|
|
||||||
|
|
||||||
// One-pass NFA: only correct if IsOnePass() is true,
|
|
||||||
// but much faster than NFA (competitive with PCRE)
|
|
||||||
// for those expressions.
|
|
||||||
bool IsOnePass();
|
|
||||||
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
|
|
||||||
Anchor anchor, MatchKind kind,
|
|
||||||
StringPiece* match, int nmatch);
|
|
||||||
|
|
||||||
// Bit-state backtracking. Fast on small cases but uses memory
|
|
||||||
// proportional to the product of the program size and the text size.
|
|
||||||
bool SearchBitState(const StringPiece& text, const StringPiece& context,
|
|
||||||
Anchor anchor, MatchKind kind,
|
|
||||||
StringPiece* match, int nmatch);
|
|
||||||
|
|
||||||
static const int kMaxOnePassCapture = 5; // $0 through $4
|
|
||||||
|
|
||||||
// Backtracking search: the gold standard against which the other
|
|
||||||
// implementations are checked. FOR TESTING ONLY.
|
|
||||||
// It allocates a ton of memory to avoid running forever.
|
|
||||||
// It is also recursive, so can't use in production (will overflow stacks).
|
|
||||||
// The name "Unsafe" here is supposed to be a flag that
|
|
||||||
// you should not be using this function.
|
|
||||||
bool UnsafeSearchBacktrack(const StringPiece& text,
|
|
||||||
const StringPiece& context,
|
|
||||||
Anchor anchor, MatchKind kind,
|
|
||||||
StringPiece* match, int nmatch);
|
|
||||||
|
|
||||||
// Computes range for any strings matching regexp. The min and max can in
|
|
||||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
|
||||||
// maximum desired length of string returned.
|
|
||||||
//
|
|
||||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
|
||||||
// string s that is an anchored match for this regexp satisfies
|
|
||||||
// min <= s && s <= max.
|
|
||||||
//
|
|
||||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
|
||||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
|
||||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
|
||||||
// do not compile down to infinite repetitions.
|
|
||||||
//
|
|
||||||
// Returns true on success, false on error.
|
|
||||||
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
|
||||||
|
|
||||||
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
|
||||||
// Outputs the program fanout into the given sparse array.
|
|
||||||
void Fanout(SparseArray<int>* fanout);
|
|
||||||
|
|
||||||
// Compiles a collection of regexps to Prog. Each regexp will have
|
|
||||||
// its own Match instruction recording the index in the output vector.
|
|
||||||
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
|
|
||||||
|
|
||||||
// Flattens the Prog from "tree" form to "list" form. This is an in-place
|
|
||||||
// operation in the sense that the old instructions are lost.
|
|
||||||
void Flatten();
|
|
||||||
|
|
||||||
// Walks the Prog; the "successor roots" or predecessors of the reachable
|
|
||||||
// instructions are marked in rootmap or predmap/predvec, respectively.
|
|
||||||
// reachable and stk are preallocated scratch structures.
|
|
||||||
void MarkSuccessors(SparseArray<int>* rootmap,
|
|
||||||
SparseArray<int>* predmap,
|
|
||||||
std::vector<std::vector<int>>* predvec,
|
|
||||||
SparseSet* reachable, std::vector<int>* stk);
|
|
||||||
|
|
||||||
// Walks the Prog from the given "root" instruction; the "dominator root"
|
|
||||||
// of the reachable instructions (if such exists) is marked in rootmap.
|
|
||||||
// reachable and stk are preallocated scratch structures.
|
|
||||||
void MarkDominator(int root, SparseArray<int>* rootmap,
|
|
||||||
SparseArray<int>* predmap,
|
|
||||||
std::vector<std::vector<int>>* predvec,
|
|
||||||
SparseSet* reachable, std::vector<int>* stk);
|
|
||||||
|
|
||||||
// Walks the Prog from the given "root" instruction; the reachable
|
|
||||||
// instructions are emitted in "list" form and appended to flat.
|
|
||||||
// reachable and stk are preallocated scratch structures.
|
|
||||||
void EmitList(int root, SparseArray<int>* rootmap,
|
|
||||||
std::vector<Inst>* flat,
|
|
||||||
SparseSet* reachable, std::vector<int>* stk);
|
|
||||||
|
|
||||||
private:
|
|
||||||
friend class Compiler;
|
|
||||||
|
|
||||||
DFA* GetDFA(MatchKind kind);
|
|
||||||
void DeleteDFA(DFA* dfa);
|
|
||||||
|
|
||||||
bool anchor_start_; // regexp has explicit start anchor
|
|
||||||
bool anchor_end_; // regexp has explicit end anchor
|
|
||||||
bool reversed_; // whether program runs backward over input
|
|
||||||
bool did_flatten_; // has Flatten been called?
|
|
||||||
bool did_onepass_; // has IsOnePass been called?
|
|
||||||
|
|
||||||
int start_; // entry point for program
|
|
||||||
int start_unanchored_; // unanchored entry point for program
|
|
||||||
int size_; // number of instructions
|
|
||||||
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
|
||||||
int first_byte_; // required first byte for match, or -1 if none
|
|
||||||
int flags_; // regexp parse flags
|
|
||||||
|
|
||||||
int list_count_; // count of lists (see above)
|
|
||||||
int inst_count_[kNumInst]; // count of instructions by opcode
|
|
||||||
|
|
||||||
Inst* inst_; // pointer to instruction array
|
|
||||||
uint8_t* onepass_nodes_; // data for OnePass nodes
|
|
||||||
|
|
||||||
int64_t dfa_mem_; // Maximum memory for DFAs.
|
|
||||||
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
|
|
||||||
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
|
|
||||||
|
|
||||||
uint8_t bytemap_[256]; // map from input bytes to byte classes
|
|
||||||
|
|
||||||
std::once_flag first_byte_once_;
|
|
||||||
std::once_flag dfa_first_once_;
|
|
||||||
std::once_flag dfa_longest_once_;
|
|
||||||
|
|
||||||
Prog(const Prog&) = delete;
|
|
||||||
Prog& operator=(const Prog&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_PROG_H_
|
|
File diff suppressed because it is too large
Load Diff
@ -1,941 +0,0 @@
|
|||||||
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_RE2_H_
|
|
||||||
#define RE2_RE2_H_
|
|
||||||
|
|
||||||
// C++ interface to the re2 regular-expression library.
|
|
||||||
// RE2 supports Perl-style regular expressions (with extensions like
|
|
||||||
// \d, \w, \s, ...).
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// REGEXP SYNTAX:
|
|
||||||
//
|
|
||||||
// This module uses the re2 library and hence supports
|
|
||||||
// its syntax for regular expressions, which is similar to Perl's with
|
|
||||||
// some of the more complicated things thrown away. In particular,
|
|
||||||
// backreferences and generalized assertions are not available, nor is \Z.
|
|
||||||
//
|
|
||||||
// See https://github.com/google/re2/wiki/Syntax for the syntax
|
|
||||||
// supported by RE2, and a comparison with PCRE and PERL regexps.
|
|
||||||
//
|
|
||||||
// For those not familiar with Perl's regular expressions,
|
|
||||||
// here are some examples of the most commonly used extensions:
|
|
||||||
//
|
|
||||||
// "hello (\\w+) world" -- \w matches a "word" character
|
|
||||||
// "version (\\d+)" -- \d matches a digit
|
|
||||||
// "hello\\s+world" -- \s matches any whitespace character
|
|
||||||
// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary
|
|
||||||
// "(?i)hello" -- (?i) turns on case-insensitive matching
|
|
||||||
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// MATCHING INTERFACE:
|
|
||||||
//
|
|
||||||
// The "FullMatch" operation checks that supplied text matches a
|
|
||||||
// supplied pattern exactly.
|
|
||||||
//
|
|
||||||
// Example: successful match
|
|
||||||
// CHECK(RE2::FullMatch("hello", "h.*o"));
|
|
||||||
//
|
|
||||||
// Example: unsuccessful match (requires full match):
|
|
||||||
// CHECK(!RE2::FullMatch("hello", "e"));
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// UTF-8 AND THE MATCHING INTERFACE:
|
|
||||||
//
|
|
||||||
// By default, the pattern and input text are interpreted as UTF-8.
|
|
||||||
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
|
|
||||||
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// MATCHING WITH SUB-STRING EXTRACTION:
|
|
||||||
//
|
|
||||||
// You can supply extra pointer arguments to extract matched subpieces.
|
|
||||||
//
|
|
||||||
// Example: extracts "ruby" into "s" and 1234 into "i"
|
|
||||||
// int i;
|
|
||||||
// string s;
|
|
||||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
|
|
||||||
//
|
|
||||||
// Example: fails because string cannot be stored in integer
|
|
||||||
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
|
|
||||||
//
|
|
||||||
// Example: fails because there aren't enough sub-patterns:
|
|
||||||
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
|
|
||||||
//
|
|
||||||
// Example: does not try to extract any extra sub-patterns
|
|
||||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
|
|
||||||
//
|
|
||||||
// Example: does not try to extract into NULL
|
|
||||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
|
|
||||||
//
|
|
||||||
// Example: integer overflow causes failure
|
|
||||||
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
|
|
||||||
//
|
|
||||||
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
|
|
||||||
// This may get a little faster in the future, but right now is slower
|
|
||||||
// than PCRE. On the other hand, failed matches run *very* fast (faster
|
|
||||||
// than PCRE), as do matches without substring extraction.
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// PARTIAL MATCHES
|
|
||||||
//
|
|
||||||
// You can use the "PartialMatch" operation when you want the pattern
|
|
||||||
// to match any substring of the text.
|
|
||||||
//
|
|
||||||
// Example: simple search for a string:
|
|
||||||
// CHECK(RE2::PartialMatch("hello", "ell"));
|
|
||||||
//
|
|
||||||
// Example: find first number in a string
|
|
||||||
// int number;
|
|
||||||
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
|
|
||||||
// CHECK_EQ(number, 100);
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// PRE-COMPILED REGULAR EXPRESSIONS
|
|
||||||
//
|
|
||||||
// RE2 makes it easy to use any string as a regular expression, without
|
|
||||||
// requiring a separate compilation step.
|
|
||||||
//
|
|
||||||
// If speed is of the essence, you can create a pre-compiled "RE2"
|
|
||||||
// object from the pattern and use it multiple times. If you do so,
|
|
||||||
// you can typically parse text faster than with sscanf.
|
|
||||||
//
|
|
||||||
// Example: precompile pattern for faster matching:
|
|
||||||
// RE2 pattern("h.*o");
|
|
||||||
// while (ReadLine(&str)) {
|
|
||||||
// if (RE2::FullMatch(str, pattern)) ...;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// SCANNING TEXT INCREMENTALLY
|
|
||||||
//
|
|
||||||
// The "Consume" operation may be useful if you want to repeatedly
|
|
||||||
// match regular expressions at the front of a string and skip over
|
|
||||||
// them as they match. This requires use of the "StringPiece" type,
|
|
||||||
// which represents a sub-range of a real string.
|
|
||||||
//
|
|
||||||
// Example: read lines of the form "var = value" from a string.
|
|
||||||
// string contents = ...; // Fill string somehow
|
|
||||||
// StringPiece input(contents); // Wrap a StringPiece around it
|
|
||||||
//
|
|
||||||
// string var;
|
|
||||||
// int value;
|
|
||||||
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
|
|
||||||
// ...;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// Each successful call to "Consume" will set "var/value", and also
|
|
||||||
// advance "input" so it points past the matched text. Note that if the
|
|
||||||
// regular expression matches an empty string, input will advance
|
|
||||||
// by 0 bytes. If the regular expression being used might match
|
|
||||||
// an empty string, the loop body must check for this case and either
|
|
||||||
// advance the string or break out of the loop.
|
|
||||||
//
|
|
||||||
// The "FindAndConsume" operation is similar to "Consume" but does not
|
|
||||||
// anchor your match at the beginning of the string. For example, you
|
|
||||||
// could extract all words from a string by repeatedly calling
|
|
||||||
// RE2::FindAndConsume(&input, "(\\w+)", &word)
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// USING VARIABLE NUMBER OF ARGUMENTS
|
|
||||||
//
|
|
||||||
// The above operations require you to know the number of arguments
|
|
||||||
// when you write the code. This is not always possible or easy (for
|
|
||||||
// example, the regular expression may be calculated at run time).
|
|
||||||
// You can use the "N" version of the operations when the number of
|
|
||||||
// match arguments is determined at run time.
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
// const RE2::Arg* args[10];
|
|
||||||
// int n;
|
|
||||||
// // ... populate args with pointers to RE2::Arg values ...
|
|
||||||
// // ... set n to the number of RE2::Arg objects ...
|
|
||||||
// bool match = RE2::FullMatchN(input, pattern, args, n);
|
|
||||||
//
|
|
||||||
// The last statement is equivalent to
|
|
||||||
//
|
|
||||||
// bool match = RE2::FullMatch(input, pattern,
|
|
||||||
// *args[0], *args[1], ..., *args[n - 1]);
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// PARSING HEX/OCTAL/C-RADIX NUMBERS
|
|
||||||
//
|
|
||||||
// By default, if you pass a pointer to a numeric value, the
|
|
||||||
// corresponding text is interpreted as a base-10 number. You can
|
|
||||||
// instead wrap the pointer with a call to one of the operators Hex(),
|
|
||||||
// Octal(), or CRadix() to interpret the text in another base. The
|
|
||||||
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
|
|
||||||
// prefixes, but defaults to base-10.
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
// int a, b, c, d;
|
|
||||||
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
|
|
||||||
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)));
|
|
||||||
// will leave 64 in a, b, c, and d.
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <map>
|
|
||||||
#include <mutex>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
class Prog;
|
|
||||||
class Regexp;
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// TODO(junyer): Get rid of this.
|
|
||||||
using std::string;
|
|
||||||
|
|
||||||
// Interface for regular expression matching. Also corresponds to a
|
|
||||||
// pre-compiled regular expression. An "RE2" object is safe for
|
|
||||||
// concurrent use by multiple threads.
|
|
||||||
class RE2 {
|
|
||||||
public:
|
|
||||||
// We convert user-passed pointers into special Arg objects
|
|
||||||
class Arg;
|
|
||||||
class Options;
|
|
||||||
|
|
||||||
// Defined in set.h.
|
|
||||||
class Set;
|
|
||||||
|
|
||||||
enum ErrorCode {
|
|
||||||
NoError = 0,
|
|
||||||
|
|
||||||
// Unexpected error
|
|
||||||
ErrorInternal,
|
|
||||||
|
|
||||||
// Parse errors
|
|
||||||
ErrorBadEscape, // bad escape sequence
|
|
||||||
ErrorBadCharClass, // bad character class
|
|
||||||
ErrorBadCharRange, // bad character class range
|
|
||||||
ErrorMissingBracket, // missing closing ]
|
|
||||||
ErrorMissingParen, // missing closing )
|
|
||||||
ErrorTrailingBackslash, // trailing \ at end of regexp
|
|
||||||
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
|
|
||||||
ErrorRepeatSize, // bad repetition argument
|
|
||||||
ErrorRepeatOp, // bad repetition operator
|
|
||||||
ErrorBadPerlOp, // bad perl operator
|
|
||||||
ErrorBadUTF8, // invalid UTF-8 in regexp
|
|
||||||
ErrorBadNamedCapture, // bad named capture group
|
|
||||||
ErrorPatternTooLarge // pattern too large (compile failed)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Predefined common options.
|
|
||||||
// If you need more complicated things, instantiate
|
|
||||||
// an Options object, possibly passing one of these to
|
|
||||||
// the Options constructor, change the settings, and pass that
|
|
||||||
// Options object to the RE2 constructor.
|
|
||||||
enum CannedOptions {
|
|
||||||
DefaultOptions = 0,
|
|
||||||
Latin1, // treat input as Latin-1 (default UTF-8)
|
|
||||||
POSIX, // POSIX syntax, leftmost-longest match
|
|
||||||
Quiet // do not log about regexp parse errors
|
|
||||||
};
|
|
||||||
|
|
||||||
// Need to have the const char* and const string& forms for implicit
|
|
||||||
// conversions when passing string literals to FullMatch and PartialMatch.
|
|
||||||
// Otherwise the StringPiece form would be sufficient.
|
|
||||||
#ifndef SWIG
|
|
||||||
RE2(const char* pattern);
|
|
||||||
RE2(const string& pattern);
|
|
||||||
#endif
|
|
||||||
RE2(const StringPiece& pattern);
|
|
||||||
RE2(const StringPiece& pattern, const Options& options);
|
|
||||||
~RE2();
|
|
||||||
|
|
||||||
// Returns whether RE2 was created properly.
|
|
||||||
bool ok() const { return error_code() == NoError; }
|
|
||||||
|
|
||||||
// The string specification for this RE2. E.g.
|
|
||||||
// RE2 re("ab*c?d+");
|
|
||||||
// re.pattern(); // "ab*c?d+"
|
|
||||||
const string& pattern() const { return pattern_; }
|
|
||||||
|
|
||||||
// If RE2 could not be created properly, returns an error string.
|
|
||||||
// Else returns the empty string.
|
|
||||||
const string& error() const { return *error_; }
|
|
||||||
|
|
||||||
// If RE2 could not be created properly, returns an error code.
|
|
||||||
// Else returns RE2::NoError (== 0).
|
|
||||||
ErrorCode error_code() const { return error_code_; }
|
|
||||||
|
|
||||||
// If RE2 could not be created properly, returns the offending
|
|
||||||
// portion of the regexp.
|
|
||||||
const string& error_arg() const { return error_arg_; }
|
|
||||||
|
|
||||||
// Returns the program size, a very approximate measure of a regexp's "cost".
|
|
||||||
// Larger numbers are more expensive than smaller numbers.
|
|
||||||
int ProgramSize() const;
|
|
||||||
|
|
||||||
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
|
||||||
// Outputs the program fanout as a histogram bucketed by powers of 2.
|
|
||||||
// Returns the number of the largest non-empty bucket.
|
|
||||||
int ProgramFanout(std::map<int, int>* histogram) const;
|
|
||||||
|
|
||||||
// Returns the underlying Regexp; not for general use.
|
|
||||||
// Returns entire_regexp_ so that callers don't need
|
|
||||||
// to know about prefix_ and prefix_foldcase_.
|
|
||||||
re2::Regexp* Regexp() const { return entire_regexp_; }
|
|
||||||
|
|
||||||
/***** The useful part: the matching interface *****/
|
|
||||||
|
|
||||||
// Matches "text" against "re". If pointer arguments are
|
|
||||||
// supplied, copies matched sub-patterns into them.
|
|
||||||
//
|
|
||||||
// You can pass in a "const char*" or a "string" for "text".
|
|
||||||
// You can pass in a "const char*" or a "string" or a "RE2" for "re".
|
|
||||||
//
|
|
||||||
// The provided pointer arguments can be pointers to any scalar numeric
|
|
||||||
// type, or one of:
|
|
||||||
// string (matched piece is copied to string)
|
|
||||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
|
||||||
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
|
|
||||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
|
||||||
//
|
|
||||||
// Returns true iff all of the following conditions are satisfied:
|
|
||||||
// a. "text" matches "re" exactly
|
|
||||||
// b. The number of matched sub-patterns is >= number of supplied pointers
|
|
||||||
// c. The "i"th argument has a suitable type for holding the
|
|
||||||
// string captured as the "i"th sub-pattern. If you pass in
|
|
||||||
// NULL for the "i"th argument, or pass fewer arguments than
|
|
||||||
// number of sub-patterns, "i"th captured sub-pattern is
|
|
||||||
// ignored.
|
|
||||||
//
|
|
||||||
// CAVEAT: An optional sub-pattern that does not exist in the
|
|
||||||
// matched string is assigned the empty string. Therefore, the
|
|
||||||
// following will return false (because the empty string is not a
|
|
||||||
// valid number):
|
|
||||||
// int number;
|
|
||||||
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
|
||||||
static bool FullMatchN(const StringPiece& text, const RE2& re,
|
|
||||||
const Arg* const args[], int argc);
|
|
||||||
|
|
||||||
// Exactly like FullMatch(), except that "re" is allowed to match
|
|
||||||
// a substring of "text".
|
|
||||||
static bool PartialMatchN(const StringPiece& text, const RE2& re,
|
|
||||||
const Arg* const args[], int argc);
|
|
||||||
|
|
||||||
// Like FullMatch() and PartialMatch(), except that "re" has to match
|
|
||||||
// a prefix of the text, and "input" is advanced past the matched
|
|
||||||
// text. Note: "input" is modified iff this routine returns true.
|
|
||||||
static bool ConsumeN(StringPiece* input, const RE2& re,
|
|
||||||
const Arg* const args[], int argc);
|
|
||||||
|
|
||||||
// Like Consume(), but does not anchor the match at the beginning of
|
|
||||||
// the text. That is, "re" need not start its match at the beginning
|
|
||||||
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
|
|
||||||
// the next word in "s" and stores it in "word".
|
|
||||||
static bool FindAndConsumeN(StringPiece* input, const RE2& re,
|
|
||||||
const Arg* const args[], int argc);
|
|
||||||
|
|
||||||
#ifndef SWIG
|
|
||||||
private:
|
|
||||||
// Apply() overload for calls that supply no capture arguments: invokes f
// with sp and re, passing NULL as the argument array and 0 as the argument
// count, and returns whatever f returns.
template <typename F, typename SP>
|
|
||||||
static inline bool Apply(F f, SP sp, const RE2& re) {
|
|
||||||
return f(sp, re, NULL, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Apply() overload for calls that supply capture arguments: gathers the
// addresses of the arguments in the pack "a" into a local array of Arg
// pointers, then invokes f with sp, re, that array and the number of
// arguments in the pack. Returns whatever f returns.
// NOTE(review): the array stores pointers to the caller's objects, so they
// presumably only need to outlive the call to f -- confirm against callers.
template <typename F, typename SP, typename... A>
|
|
||||||
static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
|
|
||||||
const Arg* const args[] = {&a...};
|
|
||||||
const int argc = sizeof...(a);
|
|
||||||
return f(sp, re, args, argc);
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
|
||||||
// In order to allow FullMatch() et al. to be called with a varying number
|
|
||||||
// of arguments of varying types, we use two layers of variadic templates.
|
|
||||||
// The first layer constructs the temporary Arg objects. The second layer
|
|
||||||
// (above) constructs the array of pointers to the temporary Arg objects.
|
|
||||||
|
|
||||||
// Variadic front end for FullMatchN(): constructs an Arg from each
// forwarded argument and passes text, re and those Arg objects to Apply()
// together with FullMatchN. Returns the result of that call.
template <typename... A>
|
|
||||||
static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
|
|
||||||
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Variadic front end for PartialMatchN(): constructs an Arg from each
// forwarded argument and passes text, re and those Arg objects to Apply()
// together with PartialMatchN. Returns the result of that call.
template <typename... A>
|
|
||||||
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
|
|
||||||
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Variadic front end for ConsumeN(): constructs an Arg from each forwarded
// argument and passes the input pointer, re and those Arg objects to
// Apply() together with ConsumeN. Returns the result of that call.
template <typename... A>
|
|
||||||
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
|
|
||||||
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Variadic front end for FindAndConsumeN(): constructs an Arg from each
// forwarded argument and passes the input pointer, re and those Arg
// objects to Apply() together with FindAndConsumeN. Returns the result of
// that call.
template <typename... A>
|
|
||||||
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
|
|
||||||
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Replace the first match of "pattern" in "str" with "rewrite".
|
|
||||||
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
|
||||||
// used to insert text matching corresponding parenthesized group
|
|
||||||
// from the pattern. \0 in "rewrite" refers to the entire matching
|
|
||||||
// text. E.g.,
|
|
||||||
//
|
|
||||||
// string s = "yabba dabba doo";
|
|
||||||
// CHECK(RE2::Replace(&s, "b+", "d"));
|
|
||||||
//
|
|
||||||
// will leave "s" containing "yada dabba doo"
|
|
||||||
//
|
|
||||||
// Returns true if the pattern matches and a replacement occurs,
|
|
||||||
// false otherwise.
|
|
||||||
static bool Replace(string *str,
|
|
||||||
const RE2& pattern,
|
|
||||||
const StringPiece& rewrite);
|
|
||||||
|
|
||||||
// Like Replace(), except replaces successive non-overlapping occurrences
|
|
||||||
// of the pattern in the string with the rewrite. E.g.
|
|
||||||
//
|
|
||||||
// string s = "yabba dabba doo";
|
|
||||||
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
|
|
||||||
//
|
|
||||||
// will leave "s" containing "yada dada doo"
|
|
||||||
// Replacements are not subject to re-matching.
|
|
||||||
//
|
|
||||||
// Because GlobalReplace only replaces non-overlapping matches,
|
|
||||||
// replacing "ana" within "banana" makes only one replacement, not two.
|
|
||||||
//
|
|
||||||
// Returns the number of replacements made.
|
|
||||||
static int GlobalReplace(string *str,
|
|
||||||
const RE2& pattern,
|
|
||||||
const StringPiece& rewrite);
|
|
||||||
|
|
||||||
// Like Replace, except that if the pattern matches, "rewrite"
|
|
||||||
// is copied into "out" with substitutions. The non-matching
|
|
||||||
// portions of "text" are ignored.
|
|
||||||
//
|
|
||||||
// Returns true iff a match occurred and the extraction happened
|
|
||||||
// successfully; if no match occurs, the string is left unaffected.
|
|
||||||
//
|
|
||||||
// REQUIRES: "text" must not alias any part of "*out".
|
|
||||||
static bool Extract(const StringPiece &text,
|
|
||||||
const RE2& pattern,
|
|
||||||
const StringPiece &rewrite,
|
|
||||||
string *out);
|
|
||||||
|
|
||||||
// Escapes all potentially meaningful regexp characters in
|
|
||||||
// 'unquoted'. The returned string, used as a regular expression,
|
|
||||||
// will exactly match the original string. For example,
|
|
||||||
// 1.5-2.0?
|
|
||||||
// may become:
|
|
||||||
// 1\.5\-2\.0\?
|
|
||||||
static string QuoteMeta(const StringPiece& unquoted);
|
|
||||||
|
|
||||||
// Computes range for any strings matching regexp. The min and max can in
|
|
||||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
|
||||||
// maximum desired length of string returned.
|
|
||||||
//
|
|
||||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
|
||||||
// string s that is an anchored match for this regexp satisfies
|
|
||||||
// min <= s && s <= max.
|
|
||||||
//
|
|
||||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
|
||||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
|
||||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
|
||||||
// do not compile down to infinite repetitions.
|
|
||||||
//
|
|
||||||
// Returns true on success, false on error.
|
|
||||||
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
|
|
||||||
|
|
||||||
// Generic matching interface
|
|
||||||
|
|
||||||
// Type of match.
|
|
||||||
enum Anchor {
|
|
||||||
UNANCHORED, // No anchoring
|
|
||||||
ANCHOR_START, // Anchor at start only
|
|
||||||
ANCHOR_BOTH // Anchor at start and end
|
|
||||||
};
|
|
||||||
|
|
||||||
// Return the number of capturing subpatterns, or -1 if the
|
|
||||||
// regexp wasn't valid on construction. The overall match ($0)
|
|
||||||
// does not count: if the regexp is "(a)(b)", returns 2.
|
|
||||||
int NumberOfCapturingGroups() const;
|
|
||||||
|
|
||||||
// Return a map from names to capturing indices.
|
|
||||||
// The map records the index of the leftmost group
|
|
||||||
// with the given name.
|
|
||||||
// Only valid until the re is deleted.
|
|
||||||
const std::map<string, int>& NamedCapturingGroups() const;
|
|
||||||
|
|
||||||
// Return a map from capturing indices to names.
|
|
||||||
// The map has no entries for unnamed groups.
|
|
||||||
// Only valid until the re is deleted.
|
|
||||||
const std::map<int, string>& CapturingGroupNames() const;
|
|
||||||
|
|
||||||
// General matching routine.
|
|
||||||
// Match against text starting at offset startpos
|
|
||||||
// and stopping the search at offset endpos.
|
|
||||||
// Returns true if match found, false if not.
|
|
||||||
// On a successful match, fills in match[] (up to nmatch entries)
|
|
||||||
// with information about submatches.
|
|
||||||
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
|
|
||||||
// setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar",
|
|
||||||
// match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL.
|
|
||||||
//
|
|
||||||
// Don't ask for more match information than you will use:
|
|
||||||
// runs much faster with nmatch == 1 than nmatch > 1, and
|
|
||||||
// runs even faster if nmatch == 0.
|
|
||||||
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
|
|
||||||
// but will be handled correctly.
|
|
||||||
//
|
|
||||||
// Passing text == StringPiece(NULL, 0) will be handled like any other
|
|
||||||
// empty string, but note that on return, it will not be possible to tell
|
|
||||||
// whether submatch i matched the empty string or did not match:
|
|
||||||
// either way, match[i].data() == NULL.
|
|
||||||
bool Match(const StringPiece& text,
|
|
||||||
size_t startpos,
|
|
||||||
size_t endpos,
|
|
||||||
Anchor anchor,
|
|
||||||
StringPiece *match,
|
|
||||||
int nmatch) const;
|
|
||||||
|
|
||||||
// Check that the given rewrite string is suitable for use with this
|
|
||||||
// regular expression. It checks that:
|
|
||||||
// * The regular expression has enough parenthesized subexpressions
|
|
||||||
// to satisfy all of the \N tokens in rewrite
|
|
||||||
// * The rewrite string doesn't have any syntax errors. E.g.,
|
|
||||||
// '\' followed by anything other than a digit or '\'.
|
|
||||||
// A true return value guarantees that Replace() and Extract() won't
|
|
||||||
// fail because of a bad rewrite string.
|
|
||||||
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
|
|
||||||
|
|
||||||
// Returns the maximum submatch needed for the rewrite to be done by
|
|
||||||
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
|
|
||||||
static int MaxSubmatch(const StringPiece& rewrite);
|
|
||||||
|
|
||||||
// Append the "rewrite" string, with backslash substitutions from "vec",
|
|
||||||
// to string "out".
|
|
||||||
// Returns true on success. This method can fail because of a malformed
|
|
||||||
// rewrite string. CheckRewriteString guarantees that the rewrite will
|
|
||||||
// be successful.
|
|
||||||
bool Rewrite(string* out,
|
|
||||||
const StringPiece& rewrite,
|
|
||||||
const StringPiece* vec,
|
|
||||||
int veclen) const;
|
|
||||||
|
|
||||||
// Constructor options
|
|
||||||
class Options {
|
|
||||||
public:
|
|
||||||
// The options are (defaults in parentheses):
|
|
||||||
//
|
|
||||||
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
|
|
||||||
// posix_syntax (false) restrict regexps to POSIX egrep syntax
|
|
||||||
// longest_match (false) search for longest match, not first match
|
|
||||||
// log_errors (true) log syntax and execution errors to ERROR
|
|
||||||
// max_mem (see below) approx. max memory footprint of RE2
|
|
||||||
// literal (false) interpret string as literal, not regexp
|
|
||||||
// never_nl (false) never match \n, even if it is in regexp
|
|
||||||
// dot_nl (false) dot matches everything including new line
|
|
||||||
// never_capture (false) parse all parens as non-capturing
|
|
||||||
// case_sensitive (true) match is case-sensitive (regexp can override
|
|
||||||
// with (?i) unless in posix_syntax mode)
|
|
||||||
//
|
|
||||||
// The following options are only consulted when posix_syntax == true.
|
|
||||||
// (When posix_syntax == false these features are always enabled and
|
|
||||||
// cannot be turned off.)
|
|
||||||
// perl_classes (false) allow Perl's \d \s \w \D \S \W
|
|
||||||
// word_boundary (false) allow Perl's \b \B (word boundary and not)
|
|
||||||
// one_line (false) ^ and $ only match beginning and end of text
|
|
||||||
//
|
|
||||||
// The max_mem option controls how much memory can be used
|
|
||||||
// to hold the compiled form of the regexp (the Prog) and
|
|
||||||
// its cached DFA graphs. Code Search placed limits on the number
|
|
||||||
// of Prog instructions and DFA states: 10,000 for both.
|
|
||||||
// In RE2, those limits would translate to about 240 KB per Prog
|
|
||||||
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
|
|
||||||
// better job of keeping them small than Code Search did).
|
|
||||||
// Each RE2 has two Progs (one forward, one reverse), and each Prog
|
|
||||||
// can have two DFAs (one first match, one longest match).
|
|
||||||
// That makes 4 DFAs:
|
|
||||||
//
|
|
||||||
// forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
|
|
||||||
// if opt.longest_match() == false
|
|
||||||
// forward, longest-match - used for all ANCHOR_BOTH searches,
|
|
||||||
// and the other two kinds if
|
|
||||||
// opt.longest_match() == true
|
|
||||||
// reverse, first-match - never used
|
|
||||||
// reverse, longest-match - used as second phase for unanchored searches
|
|
||||||
//
|
|
||||||
// The RE2 memory budget is statically divided between the two
|
|
||||||
// Progs and then the DFAs: two thirds to the forward Prog
|
|
||||||
// and one third to the reverse Prog. The forward Prog gives half
|
|
||||||
// of what it has left over to each of its DFAs. The reverse Prog
|
|
||||||
// gives it all to its longest-match DFA.
|
|
||||||
//
|
|
||||||
// Once a DFA fills its budget, it flushes its cache and starts over.
|
|
||||||
// If this happens too often, RE2 falls back on the NFA implementation.
|
|
||||||
|
|
||||||
// For now, make the default budget something close to Code Search.
|
|
||||||
static const int kDefaultMaxMem = 8<<20;
|
|
||||||
|
|
||||||
enum Encoding {
|
|
||||||
EncodingUTF8 = 1,
|
|
||||||
EncodingLatin1
|
|
||||||
};
|
|
||||||
|
|
||||||
Options() :
|
|
||||||
encoding_(EncodingUTF8),
|
|
||||||
posix_syntax_(false),
|
|
||||||
longest_match_(false),
|
|
||||||
log_errors_(true),
|
|
||||||
max_mem_(kDefaultMaxMem),
|
|
||||||
literal_(false),
|
|
||||||
never_nl_(false),
|
|
||||||
dot_nl_(false),
|
|
||||||
never_capture_(false),
|
|
||||||
case_sensitive_(true),
|
|
||||||
perl_classes_(false),
|
|
||||||
word_boundary_(false),
|
|
||||||
one_line_(false) {
|
|
||||||
}
|
|
||||||
|
|
||||||
/*implicit*/ Options(CannedOptions);
|
|
||||||
|
|
||||||
Encoding encoding() const { return encoding_; }
|
|
||||||
void set_encoding(Encoding encoding) { encoding_ = encoding; }
|
|
||||||
|
|
||||||
// Legacy interface to encoding.
|
|
||||||
// TODO(rsc): Remove once clients have been converted.
|
|
||||||
bool utf8() const { return encoding_ == EncodingUTF8; }
|
|
||||||
void set_utf8(bool b) {
|
|
||||||
if (b) {
|
|
||||||
encoding_ = EncodingUTF8;
|
|
||||||
} else {
|
|
||||||
encoding_ = EncodingLatin1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bool posix_syntax() const { return posix_syntax_; }
|
|
||||||
void set_posix_syntax(bool b) { posix_syntax_ = b; }
|
|
||||||
|
|
||||||
bool longest_match() const { return longest_match_; }
|
|
||||||
void set_longest_match(bool b) { longest_match_ = b; }
|
|
||||||
|
|
||||||
bool log_errors() const { return log_errors_; }
|
|
||||||
void set_log_errors(bool b) { log_errors_ = b; }
|
|
||||||
|
|
||||||
int64_t max_mem() const { return max_mem_; }
|
|
||||||
void set_max_mem(int64_t m) { max_mem_ = m; }
|
|
||||||
|
|
||||||
bool literal() const { return literal_; }
|
|
||||||
void set_literal(bool b) { literal_ = b; }
|
|
||||||
|
|
||||||
bool never_nl() const { return never_nl_; }
|
|
||||||
void set_never_nl(bool b) { never_nl_ = b; }
|
|
||||||
|
|
||||||
bool dot_nl() const { return dot_nl_; }
|
|
||||||
void set_dot_nl(bool b) { dot_nl_ = b; }
|
|
||||||
|
|
||||||
bool never_capture() const { return never_capture_; }
|
|
||||||
void set_never_capture(bool b) { never_capture_ = b; }
|
|
||||||
|
|
||||||
bool case_sensitive() const { return case_sensitive_; }
|
|
||||||
void set_case_sensitive(bool b) { case_sensitive_ = b; }
|
|
||||||
|
|
||||||
bool perl_classes() const { return perl_classes_; }
|
|
||||||
void set_perl_classes(bool b) { perl_classes_ = b; }
|
|
||||||
|
|
||||||
bool word_boundary() const { return word_boundary_; }
|
|
||||||
void set_word_boundary(bool b) { word_boundary_ = b; }
|
|
||||||
|
|
||||||
bool one_line() const { return one_line_; }
|
|
||||||
void set_one_line(bool b) { one_line_ = b; }
|
|
||||||
|
|
||||||
void Copy(const Options& src) {
|
|
||||||
*this = src;
|
|
||||||
}
|
|
||||||
|
|
||||||
int ParseFlags() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
Encoding encoding_;
|
|
||||||
bool posix_syntax_;
|
|
||||||
bool longest_match_;
|
|
||||||
bool log_errors_;
|
|
||||||
int64_t max_mem_;
|
|
||||||
bool literal_;
|
|
||||||
bool never_nl_;
|
|
||||||
bool dot_nl_;
|
|
||||||
bool never_capture_;
|
|
||||||
bool case_sensitive_;
|
|
||||||
bool perl_classes_;
|
|
||||||
bool word_boundary_;
|
|
||||||
bool one_line_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Returns the options set in the constructor.
|
|
||||||
const Options& options() const { return options_; };
|
|
||||||
|
|
||||||
// Argument converters; see below.
|
|
||||||
static inline Arg CRadix(short* x);
|
|
||||||
static inline Arg CRadix(unsigned short* x);
|
|
||||||
static inline Arg CRadix(int* x);
|
|
||||||
static inline Arg CRadix(unsigned int* x);
|
|
||||||
static inline Arg CRadix(long* x);
|
|
||||||
static inline Arg CRadix(unsigned long* x);
|
|
||||||
static inline Arg CRadix(long long* x);
|
|
||||||
static inline Arg CRadix(unsigned long long* x);
|
|
||||||
|
|
||||||
static inline Arg Hex(short* x);
|
|
||||||
static inline Arg Hex(unsigned short* x);
|
|
||||||
static inline Arg Hex(int* x);
|
|
||||||
static inline Arg Hex(unsigned int* x);
|
|
||||||
static inline Arg Hex(long* x);
|
|
||||||
static inline Arg Hex(unsigned long* x);
|
|
||||||
static inline Arg Hex(long long* x);
|
|
||||||
static inline Arg Hex(unsigned long long* x);
|
|
||||||
|
|
||||||
static inline Arg Octal(short* x);
|
|
||||||
static inline Arg Octal(unsigned short* x);
|
|
||||||
static inline Arg Octal(int* x);
|
|
||||||
static inline Arg Octal(unsigned int* x);
|
|
||||||
static inline Arg Octal(long* x);
|
|
||||||
static inline Arg Octal(unsigned long* x);
|
|
||||||
static inline Arg Octal(long long* x);
|
|
||||||
static inline Arg Octal(unsigned long long* x);
|
|
||||||
|
|
||||||
private:
|
|
||||||
void Init(const StringPiece& pattern, const Options& options);
|
|
||||||
|
|
||||||
bool DoMatch(const StringPiece& text,
|
|
||||||
Anchor anchor,
|
|
||||||
size_t* consumed,
|
|
||||||
const Arg* const args[],
|
|
||||||
int n) const;
|
|
||||||
|
|
||||||
re2::Prog* ReverseProg() const;
|
|
||||||
|
|
||||||
string pattern_; // string regular expression
|
|
||||||
Options options_; // option flags
|
|
||||||
string prefix_; // required prefix (before regexp_)
|
|
||||||
bool prefix_foldcase_; // prefix is ASCII case-insensitive
|
|
||||||
re2::Regexp* entire_regexp_; // parsed regular expression
|
|
||||||
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
|
|
||||||
re2::Prog* prog_; // compiled program for regexp
|
|
||||||
bool is_one_pass_; // can use prog_->SearchOnePass?
|
|
||||||
|
|
||||||
mutable re2::Prog* rprog_; // reverse program for regexp
|
|
||||||
mutable const string* error_; // Error indicator
|
|
||||||
// (or points to empty string)
|
|
||||||
mutable ErrorCode error_code_; // Error code
|
|
||||||
mutable string error_arg_; // Fragment of regexp showing error
|
|
||||||
mutable int num_captures_; // Number of capturing groups
|
|
||||||
|
|
||||||
// Map from capture names to indices
|
|
||||||
mutable const std::map<string, int>* named_groups_;
|
|
||||||
|
|
||||||
// Map from capture indices to names
|
|
||||||
mutable const std::map<int, string>* group_names_;
|
|
||||||
|
|
||||||
// Onces for lazy computations.
|
|
||||||
mutable std::once_flag rprog_once_;
|
|
||||||
mutable std::once_flag num_captures_once_;
|
|
||||||
mutable std::once_flag named_groups_once_;
|
|
||||||
mutable std::once_flag group_names_once_;
|
|
||||||
|
|
||||||
RE2(const RE2&) = delete;
|
|
||||||
RE2& operator=(const RE2&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
/***** Implementation details *****/
|
|
||||||
|
|
||||||
// Hex/Octal/Binary?
|
|
||||||
|
|
||||||
// Parser adapter for destination types that provide their own
//   ParseFrom(const char* str, size_t n)
// member function.
template <class T>
class _RE2_MatchObject {
 public:
  // Treats dest as a T* and hands it the n bytes starting at str.
  // A NULL dest is accepted without parsing anything and reports success.
  static inline bool Parse(const char* str, size_t n, void* dest) {
    T* target = reinterpret_cast<T*>(dest);
    return target == NULL || target->ParseFrom(str, n);
  }
};
|
|
||||||
|
|
||||||
// Pairs a type-erased destination pointer with the function that knows how
// to parse text into it.
class RE2::Arg {
 public:
  // Empty constructor so we can declare arrays of RE2::Arg
  Arg();

  // Constructor specially designed for NULL arguments
  Arg(void*);

  // Signature shared by all parsers: parse the n bytes at str into *dest.
  typedef bool (*Parser)(const char* str, size_t n, void* dest);

// For each built-in destination type, generate two constructors: one that
// selects the type's default parser and one that takes an explicit parser.
#define MAKE_PARSER(type, default_parser)                          \
  Arg(type* dest) : arg_(dest), parser_(default_parser) {}         \
  Arg(type* dest, Parser parser) : arg_(dest), parser_(parser) {}

  MAKE_PARSER(char, parse_char);
  MAKE_PARSER(signed char, parse_schar);
  MAKE_PARSER(unsigned char, parse_uchar);
  MAKE_PARSER(float, parse_float);
  MAKE_PARSER(double, parse_double);
  MAKE_PARSER(string, parse_string);
  MAKE_PARSER(StringPiece, parse_stringpiece);

  MAKE_PARSER(short, parse_short);
  MAKE_PARSER(unsigned short, parse_ushort);
  MAKE_PARSER(int, parse_int);
  MAKE_PARSER(unsigned int, parse_uint);
  MAKE_PARSER(long, parse_long);
  MAKE_PARSER(unsigned long, parse_ulong);
  MAKE_PARSER(long long, parse_longlong);
  MAKE_PARSER(unsigned long long, parse_ulonglong);

#undef MAKE_PARSER

  // Generic constructors for any other pointee type.  Without an explicit
  // parser the destination is parsed through _RE2_MatchObject<T>::Parse.
  template <class T>
  Arg(T* dest) : arg_(dest), parser_(_RE2_MatchObject<T>::Parse) {}
  template <class T>
  Arg(T* dest, Parser parser) : arg_(dest), parser_(parser) {}

  // Parse the data
  bool Parse(const char* str, size_t n) const;

 private:
  void* arg_;      // destination handed to parser_
  Parser parser_;  // function invoked by Parse()

  static bool parse_null(const char* str, size_t n, void* dest);
  static bool parse_char(const char* str, size_t n, void* dest);
  static bool parse_schar(const char* str, size_t n, void* dest);
  static bool parse_uchar(const char* str, size_t n, void* dest);
  static bool parse_float(const char* str, size_t n, void* dest);
  static bool parse_double(const char* str, size_t n, void* dest);
  static bool parse_string(const char* str, size_t n, void* dest);
  static bool parse_stringpiece(const char* str, size_t n, void* dest);

// Declares the parser family for one integer type.  The plain and _radix
// forms are private; the _hex, _octal and _cradix forms are public (they are
// the ones referenced by RE2::Hex, RE2::Octal and RE2::CRadix).
#define DECLARE_INTEGER_PARSER(name)                                        \
 private:                                                                   \
  static bool parse_##name(const char* str, size_t n, void* dest);          \
  static bool parse_##name##_radix(const char* str, size_t n, void* dest,   \
                                   int radix);                              \
                                                                            \
 public:                                                                    \
  static bool parse_##name##_hex(const char* str, size_t n, void* dest);    \
  static bool parse_##name##_octal(const char* str, size_t n, void* dest);  \
  static bool parse_##name##_cradix(const char* str, size_t n, void* dest)

  DECLARE_INTEGER_PARSER(short);
  DECLARE_INTEGER_PARSER(ushort);
  DECLARE_INTEGER_PARSER(int);
  DECLARE_INTEGER_PARSER(uint);
  DECLARE_INTEGER_PARSER(long);
  DECLARE_INTEGER_PARSER(ulong);
  DECLARE_INTEGER_PARSER(longlong);
  DECLARE_INTEGER_PARSER(ulonglong);

#undef DECLARE_INTEGER_PARSER

};
|
|
||||||
|
|
||||||
// Default argument: no destination, parsed with parse_null.
inline RE2::Arg::Arg()
    : arg_(NULL),
      parser_(parse_null) {}

// Untyped (void*) argument: keeps the pointer, parsed with parse_null.
inline RE2::Arg::Arg(void* p)
    : arg_(p),
      parser_(parse_null) {}
|
|
||||||
|
|
||||||
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
|
|
||||||
return (*parser_)(str, n, arg_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This part of the parser, appropriate only for ints, deals with bases
|
|
||||||
#define MAKE_INTEGER_PARSER(type, name) \
|
|
||||||
inline RE2::Arg RE2::Hex(type* ptr) { \
|
|
||||||
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
|
|
||||||
} \
|
|
||||||
inline RE2::Arg RE2::Octal(type* ptr) { \
|
|
||||||
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
|
|
||||||
} \
|
|
||||||
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
|
||||||
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
|
|
||||||
}
|
|
||||||
|
|
||||||
MAKE_INTEGER_PARSER(short, short)
|
|
||||||
MAKE_INTEGER_PARSER(unsigned short, ushort)
|
|
||||||
MAKE_INTEGER_PARSER(int, int)
|
|
||||||
MAKE_INTEGER_PARSER(unsigned int, uint)
|
|
||||||
MAKE_INTEGER_PARSER(long, long)
|
|
||||||
MAKE_INTEGER_PARSER(unsigned long, ulong)
|
|
||||||
MAKE_INTEGER_PARSER(long long, longlong)
|
|
||||||
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
|
|
||||||
|
|
||||||
#undef MAKE_INTEGER_PARSER
|
|
||||||
|
|
||||||
#ifndef SWIG
|
|
||||||
|
|
||||||
// Silence warnings about missing initializers for members of LazyRE2.
|
|
||||||
// Note that we test for Clang first because it defines __GNUC__ as well.
|
|
||||||
#if defined(__clang__)
|
|
||||||
#elif defined(__GNUC__) && __GNUC__ >= 6
|
|
||||||
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Helper for writing global or static RE2s safely.
|
|
||||||
// Write
|
|
||||||
// static LazyRE2 re = {".*"};
|
|
||||||
// and then use *re instead of writing
|
|
||||||
// static RE2 re(".*");
|
|
||||||
// The former is more careful about multithreaded
|
|
||||||
// situations than the latter.
|
|
||||||
//
|
|
||||||
// N.B. This class never deletes the RE2 object that
|
|
||||||
// it constructs: that's a feature, so that it can be used
|
|
||||||
// for global and function static variables.
|
|
||||||
class LazyRE2 {
|
|
||||||
private:
|
|
||||||
struct NoArg {};
|
|
||||||
|
|
||||||
public:
|
|
||||||
typedef RE2 element_type; // support std::pointer_traits
|
|
||||||
|
|
||||||
// Constructor omitted to preserve braced initialization in C++98.
|
|
||||||
|
|
||||||
// Pretend to be a pointer to Type (never NULL due to on-demand creation):
|
|
||||||
RE2& operator*() const { return *get(); }
|
|
||||||
RE2* operator->() const { return get(); }
|
|
||||||
|
|
||||||
// Named accessor/initializer:
|
|
||||||
RE2* get() const {
|
|
||||||
std::call_once(once_, &LazyRE2::Init, this);
|
|
||||||
return ptr_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// All data fields must be public to support {"foo"} initialization.
|
|
||||||
const char* pattern_;
|
|
||||||
RE2::CannedOptions options_;
|
|
||||||
NoArg barrier_against_excess_initializers_;
|
|
||||||
|
|
||||||
mutable RE2* ptr_;
|
|
||||||
mutable std::once_flag once_;
|
|
||||||
|
|
||||||
private:
|
|
||||||
static void Init(const LazyRE2* lazy_re2) {
|
|
||||||
lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void operator=(const LazyRE2&); // disallowed
|
|
||||||
};
|
|
||||||
#endif // SWIG
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
using re2::RE2;
|
|
||||||
using re2::LazyRE2;
|
|
||||||
|
|
||||||
#endif // RE2_RE2_H_
|
|
@ -1,968 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Regular expression representation.
|
|
||||||
// Tested by parse_test.cc
|
|
||||||
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <map>
|
|
||||||
#include <mutex>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/mutex.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
#include "re2/walker-inl.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Constructor. Allocates vectors as appropriate for operator.
|
|
||||||
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
|
|
||||||
: op_(static_cast<uint8_t>(op)),
|
|
||||||
simple_(false),
|
|
||||||
parse_flags_(static_cast<uint16_t>(parse_flags)),
|
|
||||||
ref_(1),
|
|
||||||
nsub_(0),
|
|
||||||
down_(NULL) {
|
|
||||||
subone_ = NULL;
|
|
||||||
memset(the_union_, 0, sizeof the_union_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Destructor. Assumes already cleaned up children.
|
|
||||||
// Private: use Decref() instead of delete to destroy Regexps.
|
|
||||||
// Can't call Decref on the sub-Regexps here because
|
|
||||||
// that could cause arbitrarily deep recursion, so
|
|
||||||
// required Decref() to have handled them for us.
|
|
||||||
Regexp::~Regexp() {
|
|
||||||
if (nsub_ > 0)
|
|
||||||
LOG(DFATAL) << "Regexp not destroyed.";
|
|
||||||
|
|
||||||
switch (op_) {
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
case kRegexpCapture:
|
|
||||||
delete name_;
|
|
||||||
break;
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
delete[] runes_;
|
|
||||||
break;
|
|
||||||
case kRegexpCharClass:
|
|
||||||
if (cc_)
|
|
||||||
cc_->Delete();
|
|
||||||
delete ccb_;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If it's possible to destroy this regexp without recurring,
|
|
||||||
// do so and return true. Else return false.
|
|
||||||
bool Regexp::QuickDestroy() {
|
|
||||||
if (nsub_ == 0) {
|
|
||||||
delete this;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Lazily allocated.
|
|
||||||
static Mutex* ref_mutex;
|
|
||||||
static std::map<Regexp*, int>* ref_map;
|
|
||||||
|
|
||||||
int Regexp::Ref() {
|
|
||||||
if (ref_ < kMaxRef)
|
|
||||||
return ref_;
|
|
||||||
|
|
||||||
MutexLock l(ref_mutex);
|
|
||||||
return (*ref_map)[this];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Increments reference count, returns object as convenience.
|
|
||||||
Regexp* Regexp::Incref() {
|
|
||||||
if (ref_ >= kMaxRef-1) {
|
|
||||||
static std::once_flag ref_once;
|
|
||||||
std::call_once(ref_once, []() {
|
|
||||||
ref_mutex = new Mutex;
|
|
||||||
ref_map = new std::map<Regexp*, int>;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Store ref count in overflow map.
|
|
||||||
MutexLock l(ref_mutex);
|
|
||||||
if (ref_ == kMaxRef) {
|
|
||||||
// already overflowed
|
|
||||||
(*ref_map)[this]++;
|
|
||||||
} else {
|
|
||||||
// overflowing now
|
|
||||||
(*ref_map)[this] = kMaxRef;
|
|
||||||
ref_ = kMaxRef;
|
|
||||||
}
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
ref_++;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Decrements reference count and deletes this object if count reaches 0.
|
|
||||||
void Regexp::Decref() {
|
|
||||||
if (ref_ == kMaxRef) {
|
|
||||||
// Ref count is stored in overflow map.
|
|
||||||
MutexLock l(ref_mutex);
|
|
||||||
int r = (*ref_map)[this] - 1;
|
|
||||||
if (r < kMaxRef) {
|
|
||||||
ref_ = static_cast<uint16_t>(r);
|
|
||||||
ref_map->erase(this);
|
|
||||||
} else {
|
|
||||||
(*ref_map)[this] = r;
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ref_--;
|
|
||||||
if (ref_ == 0)
|
|
||||||
Destroy();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Deletes this object; ref count has count reached 0.
|
|
||||||
void Regexp::Destroy() {
|
|
||||||
if (QuickDestroy())
|
|
||||||
return;
|
|
||||||
|
|
||||||
// Handle recursive Destroy with explicit stack
|
|
||||||
// to avoid arbitrarily deep recursion on process stack [sigh].
|
|
||||||
down_ = NULL;
|
|
||||||
Regexp* stack = this;
|
|
||||||
while (stack != NULL) {
|
|
||||||
Regexp* re = stack;
|
|
||||||
stack = re->down_;
|
|
||||||
if (re->ref_ != 0)
|
|
||||||
LOG(DFATAL) << "Bad reference count " << re->ref_;
|
|
||||||
if (re->nsub_ > 0) {
|
|
||||||
Regexp** subs = re->sub();
|
|
||||||
for (int i = 0; i < re->nsub_; i++) {
|
|
||||||
Regexp* sub = subs[i];
|
|
||||||
if (sub == NULL)
|
|
||||||
continue;
|
|
||||||
if (sub->ref_ == kMaxRef)
|
|
||||||
sub->Decref();
|
|
||||||
else
|
|
||||||
--sub->ref_;
|
|
||||||
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
|
|
||||||
sub->down_ = stack;
|
|
||||||
stack = sub;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (re->nsub_ > 1)
|
|
||||||
delete[] subs;
|
|
||||||
re->nsub_ = 0;
|
|
||||||
}
|
|
||||||
delete re;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Appends r to the rune array of a kRegexpLiteralString node.  Storage
// starts at 8 runes and is doubled each time the current length is a power
// of two that is at least 8.
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);

  const bool length_is_pow2 = (nrunes_ & (nrunes_ - 1)) == 0;
  if (nrunes_ == 0) {
    // start with 8
    runes_ = new Rune[8];
  } else if (nrunes_ >= 8 && length_is_pow2) {
    // double on powers of two
    Rune* grown = new Rune[nrunes_ * 2];
    std::copy(runes_, runes_ + nrunes_, grown);
    delete[] runes_;
    runes_ = grown;
  }

  runes_[nrunes_] = r;
  nrunes_++;
}
|
|
||||||
|
|
||||||
// Returns a new kRegexpHaveMatch node recording match_id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
|
|
||||||
|
|
||||||
// Shared implementation of Star(), Plus() and Quest(): applies the repetition
// operator `op` to `sub`, collapsing directly stacked repetition operators
// when the parse flags agree.
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
  if (flags == sub->parse_flags()) {
    // Squash **, ++ and ??.
    if (sub->op() == op)
      return sub;

    // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
    // op is Star/Plus/Quest, we just have to check that sub->op() is too.

    // If sub is Star, no need to rewrite it.
    if (sub->op() == kRegexpStar)
      return sub;

    if (sub->op() == kRegexpPlus || sub->op() == kRegexpQuest) {
      // Rewrite sub to Star.
      Regexp* star = new Regexp(kRegexpStar, flags);
      star->AllocSub(1);
      star->sub()[0] = sub->sub()[0]->Incref();
      sub->Decref();  // We didn't consume the reference after all.
      return star;
    }
  }

  Regexp* node = new Regexp(op, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
|
||||||
|
|
||||||
// One-or-more repetition of sub; delegates to StarPlusOrQuest.
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  Regexp* result = StarPlusOrQuest(kRegexpPlus, sub, flags);
  return result;
}
|
|
||||||
|
|
||||||
// Zero-or-more repetition of sub; delegates to StarPlusOrQuest.
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  Regexp* result = StarPlusOrQuest(kRegexpStar, sub, flags);
  return result;
}
|
|
||||||
|
|
||||||
// Zero-or-one repetition of sub; delegates to StarPlusOrQuest.
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  Regexp* result = StarPlusOrQuest(kRegexpQuest, sub, flags);
  return result;
}
|
|
||||||
|
|
||||||
// Shared implementation of Concat(), Alternate() and AlternateNoFactor():
// builds an `op` node over the nsub regexps in sub.  When can_factor is set
// for an alternation, a private copy of the operand array is first run
// through FactorAlternation.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                  ParseFlags flags, bool can_factor) {
  // A single operand needs no wrapper node.
  if (nsub == 1)
    return sub[0];

  // No operands: an empty alternation is kRegexpNoMatch, an empty
  // concatenation is kRegexpEmptyMatch.
  if (nsub == 0) {
    return new Regexp(op == kRegexpAlternate ? kRegexpNoMatch
                                             : kRegexpEmptyMatch,
                      flags);
  }

  Regexp** scratch = NULL;
  if (op == kRegexpAlternate && can_factor) {
    // Going to edit sub; make a copy so we don't step on caller.
    scratch = new Regexp*[nsub];
    memmove(scratch, sub, nsub * sizeof sub[0]);
    sub = scratch;
    nsub = FactorAlternation(sub, nsub, flags);
    if (nsub == 1) {
      Regexp* only = sub[0];
      delete[] scratch;
      return only;
    }
  }

  Regexp* node = new Regexp(op, flags);
  if (nsub > kMaxNsub) {
    // Too many subexpressions to fit in a single Regexp.
    // Make a two-level tree.  Two levels gets us to 65535^2.
    // Every group but the last holds exactly kMaxNsub operands; the last
    // holds the remainder.
    int ngroups = (nsub + kMaxNsub - 1) / kMaxNsub;
    node->AllocSub(ngroups);
    Regexp** groups = node->sub();
    for (int g = 0; g < ngroups; g++) {
      int first = g * kMaxNsub;
      int count = (g == ngroups - 1) ? nsub - first : kMaxNsub;
      groups[g] = ConcatOrAlternate(op, sub + first, count, flags, false);
    }
  } else {
    node->AllocSub(nsub);
    Regexp** kids = node->sub();
    for (int i = 0; i < nsub; i++)
      kids[i] = sub[i];
  }

  delete[] scratch;
  return node;
}
|
|
||||||
|
|
||||||
// Concatenation of the nsub regexps in sub (never factored).
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
}
|
|
||||||
|
|
||||||
// Alternation of the nsub regexps in sub, with factoring enabled.
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = true;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
|
||||||
|
|
||||||
// Alternation of the nsub regexps in sub, with factoring disabled.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
|
||||||
|
|
||||||
// Returns a new kRegexpCapture node wrapping sub, with capture index cap.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->AllocSub(1);
  group->sub()[0] = sub;
  group->cap_ = cap;
  return group;
}
|
|
||||||
|
|
||||||
// Returns a new kRegexpRepeat node wrapping sub, recording the min and max
// repetition bounds.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->AllocSub(1);
  rep->sub()[0] = sub;
  rep->min_ = min;
  rep->max_ = max;
  return rep;
}
|
|
||||||
|
|
||||||
// Returns a new kRegexpLiteral node for the single rune given.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
|
|
||||||
|
|
||||||
// Returns a regexp for the literal sequence runes[0..nrunes).  No runes
// yields kRegexpEmptyMatch, one rune yields a plain literal, and anything
// longer yields a kRegexpLiteralString built one rune at a time.
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes <= 0)
    return new Regexp(kRegexpEmptyMatch, flags);
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);

  Regexp* lit = new Regexp(kRegexpLiteralString, flags);
  for (Rune* p = runes; p != runes + nrunes; ++p)
    lit->AddRuneToString(*p);
  return lit;
}
|
|
||||||
|
|
||||||
// Returns a new kRegexpCharClass node that stores cc.
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
|
|
||||||
|
|
||||||
// Swaps this and that in place.
|
|
||||||
void Regexp::Swap(Regexp* that) {
|
|
||||||
// Can use memmove because Regexp is just a struct (no vtable).
|
|
||||||
char tmp[sizeof *this];
|
|
||||||
memmove(tmp, this, sizeof tmp);
|
|
||||||
memmove(this, that, sizeof tmp);
|
|
||||||
memmove(that, tmp, sizeof tmp);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Tests equality of all top-level structure but not subregexps.
|
|
||||||
static bool TopEqual(Regexp* a, Regexp* b) {
|
|
||||||
if (a->op() != b->op())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
switch (a->op()) {
|
|
||||||
case kRegexpNoMatch:
|
|
||||||
case kRegexpEmptyMatch:
|
|
||||||
case kRegexpAnyChar:
|
|
||||||
case kRegexpAnyByte:
|
|
||||||
case kRegexpBeginLine:
|
|
||||||
case kRegexpEndLine:
|
|
||||||
case kRegexpWordBoundary:
|
|
||||||
case kRegexpNoWordBoundary:
|
|
||||||
case kRegexpBeginText:
|
|
||||||
return true;
|
|
||||||
|
|
||||||
case kRegexpEndText:
|
|
||||||
// The parse flags remember whether it's \z or (?-m:$),
|
|
||||||
// which matters when testing against PCRE.
|
|
||||||
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
|
|
||||||
|
|
||||||
case kRegexpLiteral:
|
|
||||||
return a->rune() == b->rune() &&
|
|
||||||
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
|
|
||||||
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
return a->nrunes() == b->nrunes() &&
|
|
||||||
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
|
|
||||||
memcmp(a->runes(), b->runes(),
|
|
||||||
a->nrunes() * sizeof a->runes()[0]) == 0;
|
|
||||||
|
|
||||||
case kRegexpAlternate:
|
|
||||||
case kRegexpConcat:
|
|
||||||
return a->nsub() == b->nsub();
|
|
||||||
|
|
||||||
case kRegexpStar:
|
|
||||||
case kRegexpPlus:
|
|
||||||
case kRegexpQuest:
|
|
||||||
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
|
|
||||||
|
|
||||||
case kRegexpRepeat:
|
|
||||||
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
|
|
||||||
a->min() == b->min() &&
|
|
||||||
a->max() == b->max();
|
|
||||||
|
|
||||||
case kRegexpCapture:
|
|
||||||
return a->cap() == b->cap() && a->name() == b->name();
|
|
||||||
|
|
||||||
case kRegexpHaveMatch:
|
|
||||||
return a->match_id() == b->match_id();
|
|
||||||
|
|
||||||
case kRegexpCharClass: {
|
|
||||||
CharClass* acc = a->cc();
|
|
||||||
CharClass* bcc = b->cc();
|
|
||||||
return acc->size() == bcc->size() &&
|
|
||||||
acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
|
|
||||||
memcmp(acc->begin(), bcc->begin(),
|
|
||||||
(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reports whether a and b are structurally identical regexps.  Two NULLs are
// equal; a NULL never equals a non-NULL.  The comparison is iterative: pairs
// still to be compared are kept on an explicit stack rather than the call
// stack.
bool Regexp::Equal(Regexp* a, Regexp* b) {
  if (a == NULL || b == NULL)
    return a == b;

  if (!TopEqual(a, b))
    return false;

  // Fast path:
  // return without allocating vector if there are no subregexps.
  switch (a->op()) {
    case kRegexpAlternate:
    case kRegexpConcat:
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
    case kRegexpCapture:
      break;

    default:
      return true;
  }

  // Committed to doing real work.
  // `pending` holds pairs of regexps (a first, then b) waiting to be
  // compared.  The regexps are only equal if all the pairs end up being
  // equal.
  std::vector<Regexp*> pending;

  while (true) {
    // Invariant: TopEqual(a, b) == true.
    Regexp* a_child;
    Regexp* b_child;
    switch (a->op()) {
      default:
        break;

      case kRegexpAlternate:
      case kRegexpConcat:
        for (int i = 0; i < a->nsub(); i++) {
          a_child = a->sub()[i];
          b_child = b->sub()[i];
          if (!TopEqual(a_child, b_child))
            return false;
          pending.push_back(a_child);
          pending.push_back(b_child);
        }
        break;

      case kRegexpStar:
      case kRegexpPlus:
      case kRegexpQuest:
      case kRegexpRepeat:
      case kRegexpCapture:
        a_child = a->sub()[0];
        b_child = b->sub()[0];
        if (!TopEqual(a_child, b_child))
          return false;
        // Pushing the pair and popping it again straight away would be
        // equivalent, but it is faster to step into the single child
        // directly and loop.
        a = a_child;
        b = b_child;
        continue;
    }

    size_t n = pending.size();
    if (n == 0)
      break;

    DCHECK_GE(n, 2);
    // Pop the most recently pushed pair; b was pushed last.
    b = pending.back();
    pending.pop_back();
    a = pending.back();
    pending.pop_back();
  }

  return true;
}
|
|
||||||
|
|
||||||
// Keep in sync with enum RegexpStatusCode in regexp.h
|
|
||||||
static const char *kErrorStrings[] = {
  "no error",                             // kRegexpSuccess
  "unexpected error",                     // kRegexpInternalError
  "invalid escape sequence",              // kRegexpBadEscape
  "invalid character class",              // kRegexpBadCharClass
  "invalid character class range",        // kRegexpBadCharRange
  "missing ]",                            // kRegexpMissingBracket
  "missing )",                            // kRegexpMissingParen
  "trailing \\",                          // kRegexpTrailingBackslash
  "no argument for repetition operator",  // kRegexpRepeatArgument
  "invalid repetition size",              // kRegexpRepeatSize
  "bad repetition operator",              // kRegexpRepeatOp
  "invalid perl operator",                // kRegexpBadPerlOp
  "invalid UTF-8",                        // kRegexpBadUTF8
  "invalid named capture group",          // kRegexpBadNamedCapture
};
|
|
||||||
|
|
||||||
// Maps a status code to its message in kErrorStrings.  Codes outside the
// table are reported with the kRegexpInternalError message.
string RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = !(code < 0 || code >= arraysize(kErrorStrings));
  return kErrorStrings[in_table ? code : kRegexpInternalError];
}
|
|
||||||
|
|
||||||
// Builds the full message: the code's text, followed by ": " and the
// offending piece of the regexp when one was recorded.
string RegexpStatus::Text() const {
  string text = CodeText(code_);
  if (!error_arg_.empty()) {
    text.append(": ");
    text.append(error_arg_.data(), error_arg_.size());
  }
  return text;
}
|
|
||||||
|
|
||||||
void RegexpStatus::Copy(const RegexpStatus& status) {
|
|
||||||
code_ = status.code_;
|
|
||||||
error_arg_ = status.error_arg_;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef int Ignored; // Walker<void> doesn't exist
|
|
||||||
|
|
||||||
// Walker subclass to count capturing parens in regexp.
|
|
||||||
class NumCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NumCapturesWalker() : ncapture_(0) {}

  // Number of kRegexpCapture nodes seen so far.
  int ncapture() { return ncapture_; }

  // Called before descending into re; bumps the counter on capture nodes.
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    const bool is_capture = (re->op() == kRegexpCapture);
    if (is_capture)
      ++ncapture_;
    return ignored;
  }

  // Not expected to run: callers use Walk, not WalkExponential.
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  int ncapture_;

  NumCapturesWalker(const NumCapturesWalker&) = delete;
  NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
};
|
|
||||||
|
|
||||||
int Regexp::NumCaptures() {
|
|
||||||
NumCapturesWalker w;
|
|
||||||
w.Walk(this, 0);
|
|
||||||
return w.ncapture();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Walker class to build map of named capture groups and their indices.
|
|
||||||
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
 public:
  NamedCapturesWalker() : map_(NULL) {}
  ~NamedCapturesWalker() { delete map_; }

  // Hands ownership of the name -> index map to the caller and resets the
  // walker.  Returns NULL if no named capture group was visited.
  std::map<string, int>* TakeMap() {
    std::map<string, int>* m = map_;
    map_ = NULL;
    return m;
  }

  // Called before descending into re; records named capture groups.
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() == kRegexpCapture && re->name() != NULL) {
      // Allocate map once we find a name.
      if (map_ == NULL)
        map_ = new std::map<string, int>;

      // Record first occurrence of each name.
      // (The rule is that if you have the same name
      // multiple times, only the leftmost one counts.)
      // insert() leaves an existing entry untouched, so a single lookup
      // implements "first one wins" without a separate find().
      map_->insert(std::make_pair(*re->name(), re->cap()));
    }
    return ignored;
  }

  // Not expected to run: callers use Walk, not WalkExponential.
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
    return ignored;
  }

 private:
  std::map<string, int>* map_;  // Owned until TakeMap() is called.

  NamedCapturesWalker(const NamedCapturesWalker&) = delete;
  NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
};
|
|
||||||
|
|
||||||
std::map<string, int>* Regexp::NamedCaptures() {
|
|
||||||
NamedCapturesWalker w;
|
|
||||||
w.Walk(this, 0);
|
|
||||||
return w.TakeMap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Walker class to build map from capture group indices to their names.
|
|
||||||
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
 public:
  CaptureNamesWalker() : map_(NULL) {}
  ~CaptureNamesWalker() { delete map_; }

  // Hands ownership of the index -> name map to the caller and resets the
  // walker.  Returns NULL if no named capture group was visited.
  std::map<int, string>* TakeMap() {
    std::map<int, string>* taken = map_;
    map_ = NULL;
    return taken;
  }

  // Called before descending into re; records the name of each named group.
  Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
    if (re->op() != kRegexpCapture || re->name() == NULL)
      return ignored;

    // The map is created lazily, on the first named group.
    if (map_ == NULL)
      map_ = new std::map<int, string>;

    (*map_)[re->cap()] = *re->name();
    return ignored;
  }

  // Not expected to run: callers use Walk, not WalkExponential.
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
    LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
    return ignored;
  }

 private:
  std::map<int, string>* map_;  // Owned until TakeMap() is called.

  CaptureNamesWalker(const CaptureNamesWalker&) = delete;
  CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
};
|
|
||||||
|
|
||||||
std::map<int, string>* Regexp::CaptureNames() {
|
|
||||||
CaptureNamesWalker w;
|
|
||||||
w.Walk(this, 0);
|
|
||||||
return w.TakeMap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Determines whether regexp matches must be anchored
|
|
||||||
// with a fixed string prefix. If so, returns the prefix and
|
|
||||||
// the regexp that remains after the prefix. The prefix might
|
|
||||||
// be ASCII case-insensitive.
|
|
||||||
bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) {
  // Accepted shape (no walker needed):
  //   1. one or more ^ (begin-text) anchors
  //   2. a literal char or literal string
  //   3. anything else
  // Outputs are reset first so a false return leaves them empty/NULL.
  prefix->clear();
  *foldcase = false;
  *suffix = NULL;
  if (op_ != kRegexpConcat)
    return false;

  // Skip the leading anchors; there must be at least one, and something
  // must follow them.
  Regexp** parts = this->sub();
  int pos = 0;
  for (; pos < nsub_ && parts[pos]->op_ == kRegexpBeginText; pos++) {
  }
  if (pos == 0 || pos >= nsub_)
    return false;

  Regexp* lit = parts[pos];
  if (lit->op_ == kRegexpLiteralString) {
    // Convert to string in proper encoding.
    if (lit->parse_flags() & Latin1) {
      prefix->resize(lit->nrunes_);
      for (int k = 0; k < lit->nrunes_; k++)
        (*prefix)[k] = static_cast<char>(lit->runes_[k]);
    } else {
      // UTF-8: reserve the worst case, encode in place, then trim.
      prefix->resize(lit->nrunes_ * UTFmax);
      char* out = &(*prefix)[0];
      for (int k = 0; k < lit->nrunes_; k++) {
        Rune r = lit->runes_[k];
        if (r < Runeself)
          *out++ = static_cast<char>(r);
        else
          out += runetochar(out, &r);
      }
      prefix->resize(out - &(*prefix)[0]);
    }
  } else if (lit->op_ == kRegexpLiteral) {
    if ((lit->parse_flags() & Latin1) || lit->rune_ < Runeself) {
      prefix->append(1, static_cast<char>(lit->rune_));
    } else {
      char buf[UTFmax];
      prefix->append(buf, runetochar(buf, &lit->rune_));
    }
  } else {
    return false;
  }
  *foldcase = (parts[pos]->parse_flags() & FoldCase) != 0;
  pos++;

  // Whatever follows the literal becomes the suffix.  The reused
  // subexpressions each get an extra reference before being handed to
  // Concat; with nothing left the suffix is an empty-match regexp.
  Regexp* rest;
  if (pos < nsub_) {
    for (int k = pos; k < nsub_; k++)
      parts[k]->Incref();
    rest = Concat(parts + pos, nsub_ - pos, parse_flags());
  } else {
    rest = new Regexp(kRegexpEmptyMatch, parse_flags());
  }
  *suffix = rest;
  return true;
}
|
|
||||||
|
|
||||||
// Character class builder is a balanced binary tree (STL set)
|
|
||||||
// containing non-overlapping, non-abutting RuneRanges.
|
|
||||||
// The less-than operator used in the tree treats two
|
|
||||||
// ranges as equal if they overlap at all, so that
|
|
||||||
// lookups for a particular Rune are possible.
|
|
||||||
|
|
||||||
// Starts with an empty class: no runes and no ASCII letters recorded
// in either case bitmap.
CharClassBuilder::CharClassBuilder() {
  lower_ = 0;
  upper_ = 0;
  nrunes_ = 0;
}
|
|
||||||
|
|
||||||
// Add lo-hi to the class; return whether class got bigger.
|
|
||||||
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
  if (hi < lo)
    return false;

  if (lo <= 'z' && hi >= 'A') {
    // The range touches the ASCII letters somewhere; record which
    // upper-case and lower-case letters it covers in the bitmaps.
    Rune first = std::max<Rune>(lo, 'A');
    Rune last = std::min<Rune>(hi, 'Z');
    if (first <= last)
      upper_ |= ((1 << (last - first + 1)) - 1) << (first - 'A');

    first = std::max<Rune>(lo, 'a');
    last = std::min<Rune>(hi, 'z');
    if (first <= last)
      lower_ |= ((1 << (last - first + 1)) - 1) << (first - 'a');
  }

  // Nothing to do if a single existing range already covers [lo, hi].
  {
    iterator covering = ranges_.find(RuneRange(lo, lo));
    if (covering != end() && covering->lo <= lo && hi <= covering->hi)
      return false;
  }

  // Absorb a range that touches or overlaps lo from the left.
  if (lo > 0) {
    iterator left = ranges_.find(RuneRange(lo-1, lo-1));
    if (left != end()) {
      lo = left->lo;
      if (left->hi > hi)
        hi = left->hi;
      nrunes_ -= left->hi - left->lo + 1;
      ranges_.erase(left);
    }
  }

  // Absorb a range that touches or overlaps hi from the right.
  if (hi < Runemax) {
    iterator right = ranges_.find(RuneRange(hi+1, hi+1));
    if (right != end()) {
      hi = right->hi;
      nrunes_ -= right->hi - right->lo + 1;
      ranges_.erase(right);
    }
  }

  // Remove every remaining range that overlaps [lo, hi].  Since the
  // neighbours at both ends are already gone and the set never holds
  // overlapping ranges, each one found lies entirely inside [lo, hi].
  for (iterator inner = ranges_.find(RuneRange(lo, hi));
       inner != end();
       inner = ranges_.find(RuneRange(lo, hi))) {
    nrunes_ -= inner->hi - inner->lo + 1;
    ranges_.erase(inner);
  }

  // Finally, add [lo, hi].
  nrunes_ += hi - lo + 1;
  ranges_.insert(RuneRange(lo, hi));
  return true;
}
|
|
||||||
|
|
||||||
// Adds every range of cc to this class via AddRange.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator cur = cc->begin();
  while (cur != cc->end()) {
    AddRange(cur->lo, cur->hi);
    ++cur;
  }
}
|
|
||||||
|
|
||||||
// Reports whether some stored range covers r.  The lookup key is the
// one-rune range [r, r].
bool CharClassBuilder::Contains(Rune r) {
  iterator hit = ranges_.find(RuneRange(r, r));
  return hit != end();
}
|
|
||||||
|
|
||||||
// Does the character class behave the same on A-Z as on a-z?
|
|
||||||
bool CharClassBuilder::FoldsASCII() {
|
|
||||||
return ((upper_ ^ lower_) & AlphaMask) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns a newly allocated builder holding the same ranges, rune count
// and ASCII case bitmaps as this one.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  dup->nrunes_ = nrunes_;
  dup->upper_ = upper_;
  dup->lower_ = lower_;
  iterator cur = begin();
  while (cur != end()) {
    dup->ranges_.insert(RuneRange(cur->lo, cur->hi));
    ++cur;
  }
  return dup;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Removes every rune greater than r from the class.
void CharClassBuilder::RemoveAbove(Rune r) {
  if (r >= Runemax)
    return;

  // Trim the lower-case bitmap: nothing survives below 'a', otherwise
  // drop the bits for letters after r.
  if (r < 'a')
    lower_ = 0;
  else if (r < 'z')
    lower_ &= AlphaMask >> ('z' - r);

  // Same for the upper-case bitmap.
  if (r < 'A')
    upper_ = 0;
  else if (r < 'Z')
    upper_ &= AlphaMask >> ('Z' - r);

  // Repeatedly pull out a range overlapping [r+1, Runemax].  A range
  // that starts at or before r is put back, cut off at r.
  iterator it = ranges_.find(RuneRange(r + 1, Runemax));
  while (it != end()) {
    RuneRange cut = *it;
    ranges_.erase(it);
    nrunes_ -= cut.hi - cut.lo + 1;
    if (cut.lo <= r) {
      cut.hi = r;
      ranges_.insert(cut);
      nrunes_ += cut.hi - cut.lo + 1;
    }
    it = ranges_.find(RuneRange(r + 1, Runemax));
  }
}
|
|
||||||
|
|
||||||
void CharClassBuilder::Negate() {
|
|
||||||
// Build up negation and then copy in.
|
|
||||||
// Could edit ranges in place, but C++ won't let me.
|
|
||||||
std::vector<RuneRange> v;
|
|
||||||
v.reserve(ranges_.size() + 1);
|
|
||||||
|
|
||||||
// In negation, first range begins at 0, unless
|
|
||||||
// the current class begins at 0.
|
|
||||||
iterator it = begin();
|
|
||||||
if (it == end()) {
|
|
||||||
v.push_back(RuneRange(0, Runemax));
|
|
||||||
} else {
|
|
||||||
int nextlo = 0;
|
|
||||||
if (it->lo == 0) {
|
|
||||||
nextlo = it->hi + 1;
|
|
||||||
++it;
|
|
||||||
}
|
|
||||||
for (; it != end(); ++it) {
|
|
||||||
v.push_back(RuneRange(nextlo, it->lo - 1));
|
|
||||||
nextlo = it->hi + 1;
|
|
||||||
}
|
|
||||||
if (nextlo <= Runemax)
|
|
||||||
v.push_back(RuneRange(nextlo, Runemax));
|
|
||||||
}
|
|
||||||
|
|
||||||
ranges_.clear();
|
|
||||||
for (size_t i = 0; i < v.size(); i++)
|
|
||||||
ranges_.insert(v[i]);
|
|
||||||
|
|
||||||
upper_ = AlphaMask & ~upper_;
|
|
||||||
lower_ = AlphaMask & ~lower_;
|
|
||||||
nrunes_ = Runemax+1 - nrunes_;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Character class is a sorted list of ranges.
|
|
||||||
// The ranges are allocated in the same block as the header,
|
|
||||||
// necessitating a special allocator and Delete method.
|
|
||||||
|
|
||||||
// Allocates one byte block holding the CharClass header followed by room
// for maxranges RuneRanges, and returns it as an empty class.
CharClass* CharClass::New(int maxranges) {
  uint8_t* block =
      new uint8_t[sizeof(CharClass) + maxranges*sizeof(RuneRange)];
  CharClass* cc = reinterpret_cast<CharClass*>(block);
  // The range array lives immediately after the header.
  cc->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof(CharClass));
  cc->nranges_ = 0;
  cc->nrunes_ = 0;
  cc->folds_ascii_ = false;
  return cc;
}
|
|
||||||
|
|
||||||
// Releases the block obtained in CharClass::New, which was allocated as
// a uint8_t array and so must be freed as one.
void CharClass::Delete() {
  delete[] reinterpret_cast<uint8_t*>(this);
}
|
|
||||||
|
|
||||||
// Returns a newly allocated class containing exactly the runes in
// [0, Runemax] that this class does not contain.
CharClass* CharClass::Negate() {
  // At most one more range than the original: the gaps between ranges
  // plus the two ends.
  CharClass* neg = CharClass::New(nranges_+1);
  neg->folds_ascii_ = folds_ascii_;
  neg->nrunes_ = Runemax + 1 - nrunes_;

  int count = 0;
  int gap_lo = 0;
  for (CharClass::iterator cur = begin(); cur != end(); ++cur) {
    // Emit the gap before this range, unless the range starts
    // exactly where the gap would.
    if (cur->lo != gap_lo)
      neg->ranges_[count++] = RuneRange(gap_lo, cur->lo - 1);
    gap_lo = cur->hi + 1;
  }
  if (gap_lo <= Runemax)
    neg->ranges_[count++] = RuneRange(gap_lo, Runemax);
  neg->nranges_ = count;
  return neg;
}
|
|
||||||
|
|
||||||
// Binary search over the range array for a range covering r.
bool CharClass::Contains(Rune r) {
  int first = 0;         // search window is ranges_[first, limit)
  int limit = nranges_;
  while (first < limit) {
    const int mid = first + (limit - first)/2;
    const RuneRange& probe = ranges_[mid];
    if (probe.hi < r) {
      first = mid + 1;   // r lies above this range
    } else if (r < probe.lo) {
      limit = mid;       // r lies below this range
    } else {
      return true;       // probe.lo <= r && r <= probe.hi
    }
  }
  return false;
}
|
|
||||||
|
|
||||||
// Produces a CharClass holding a copy of this builder's ranges (in set
// iteration order), rune count and ASCII-folding flag.
CharClass* CharClassBuilder::GetCharClass() {
  const int capacity = static_cast<int>(ranges_.size());
  CharClass* out = CharClass::New(capacity);
  int used = 0;
  for (iterator cur = begin(); cur != end(); ++cur, ++used)
    out->ranges_[used] = *cur;
  out->nranges_ = used;
  DCHECK_LE(used, static_cast<int>(ranges_.size()));
  out->nrunes_ = nrunes_;
  out->folds_ascii_ = FoldsASCII();
  return out;
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,652 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_REGEXP_H_
|
|
||||||
#define RE2_REGEXP_H_
|
|
||||||
|
|
||||||
// --- SPONSORED LINK --------------------------------------------------
|
|
||||||
// If you want to use this library for regular expression matching,
|
|
||||||
// you should use re2/re2.h, which provides a class RE2 that
|
|
||||||
// mimics the PCRE interface provided by PCRE's C++ wrappers.
|
|
||||||
// This header describes the low-level interface used to implement RE2
|
|
||||||
// and may change in backwards-incompatible ways from time to time.
|
|
||||||
// In contrast, RE2's interface will not.
|
|
||||||
// ---------------------------------------------------------------------
|
|
||||||
|
|
||||||
// Regular expression library: parsing, execution, and manipulation
|
|
||||||
// of regular expressions.
|
|
||||||
//
|
|
||||||
// Any operation that traverses the Regexp structures should be written
|
|
||||||
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
|
|
||||||
// regular expressions such as x++++++++++++++++++++... might cause recursive
|
|
||||||
// traversals to overflow the stack.
|
|
||||||
//
|
|
||||||
// It is the caller's responsibility to provide appropriate mutual exclusion
|
|
||||||
// around manipulation of the regexps. RE2 does this.
|
|
||||||
//
|
|
||||||
// PARSING
|
|
||||||
//
|
|
||||||
// Regexp::Parse parses regular expressions encoded in UTF-8.
|
|
||||||
// The default syntax is POSIX extended regular expressions,
|
|
||||||
// with the following changes:
|
|
||||||
//
|
|
||||||
// 1. Backreferences (optional in POSIX EREs) are not supported.
|
|
||||||
// (Supporting them precludes the use of DFA-based
|
|
||||||
// matching engines.)
|
|
||||||
//
|
|
||||||
// 2. Collating elements and collation classes are not supported.
|
|
||||||
// (No one has needed or wanted them.)
|
|
||||||
//
|
|
||||||
// The exact syntax accepted can be modified by passing flags to
|
|
||||||
// Regexp::Parse. In particular, many of the basic Perl additions
|
|
||||||
// are available. The flags are documented below (search for LikePerl).
|
|
||||||
//
|
|
||||||
// If parsed with the flag Regexp::Latin1, both the regular expression
|
|
||||||
// and the input to the matching routines are assumed to be encoded in
|
|
||||||
// Latin-1, not UTF-8.
|
|
||||||
//
|
|
||||||
// EXECUTION
|
|
||||||
//
|
|
||||||
// Once Regexp has parsed a regular expression, it provides methods
|
|
||||||
// to search text using that regular expression. These methods are
|
|
||||||
// implemented via calling out to other regular expression libraries.
|
|
||||||
// (Let's call them the sublibraries.)
|
|
||||||
//
|
|
||||||
// To call a sublibrary, Regexp does not simply prepare a
|
|
||||||
// string version of the regular expression and hand it to the
|
|
||||||
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
|
|
||||||
// corresponding internal representation used by the sublibrary.
|
|
||||||
// This has the drawback of needing to know the internal representation
|
|
||||||
// used by the sublibrary, but it has two important benefits:
|
|
||||||
//
|
|
||||||
// 1. The syntax and meaning of regular expressions is guaranteed
|
|
||||||
// to be that used by Regexp's parser, not the syntax expected
|
|
||||||
// by the sublibrary. Regexp might accept a restricted or
|
|
||||||
// expanded syntax for regular expressions as compared with
|
|
||||||
// the sublibrary. As long as Regexp can translate from its
|
|
||||||
// internal form into the sublibrary's, clients need not know
|
|
||||||
// exactly which sublibrary they are using.
|
|
||||||
//
|
|
||||||
// 2. The sublibrary parsers are bypassed. For whatever reason,
|
|
||||||
// sublibrary regular expression parsers often have security
|
|
||||||
// problems. For example, plan9grep's regular expression parser
|
|
||||||
// has a buffer overflow in its handling of large character
|
|
||||||
// classes, and PCRE's parser has had buffer overflow problems
|
|
||||||
// in the past. Security-team requires sandboxing of sublibrary
|
|
||||||
// regular expression parsers. Avoiding the sublibrary parsers
|
|
||||||
// avoids the sandbox.
|
|
||||||
//
|
|
||||||
// The execution methods we use now are provided by the compiled form,
|
|
||||||
// Prog, described in prog.h
|
|
||||||
//
|
|
||||||
// MANIPULATION
|
|
||||||
//
|
|
||||||
// Unlike other regular expression libraries, Regexp makes its parsed
|
|
||||||
// form accessible to clients, so that client code can analyze the
|
|
||||||
// parsed regular expressions.
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <map>
|
|
||||||
#include <set>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
|
|
||||||
enum RegexpOp {
  // --- Leaves ---------------------------------------------------------
  kRegexpNoMatch = 1,      // matches no strings
  kRegexpEmptyMatch,       // matches the empty string
  kRegexpLiteral,          // matches rune_
  kRegexpLiteralString,    // matches runes_

  // --- N-ary operators over sub_[0..nsub-1] ---------------------------
  kRegexpConcat,           // concatenation
  kRegexpAlternate,        // union

  // --- Repetition of sub_[0] ------------------------------------------
  kRegexpStar,             // zero or more times
  kRegexpPlus,             // one or more times
  kRegexpQuest,            // zero or one times
  kRegexpRepeat,           // at least min_ times, at most max_ times;
                           // max_ == -1 means no upper limit

  // Parenthesized (capturing) subexpression.  Index is cap_.
  // Optionally, capturing name is name_.
  kRegexpCapture,

  // --- Single-unit wildcards ------------------------------------------
  kRegexpAnyChar,          // any character
  kRegexpAnyByte,          // any byte [sic]

  // --- Empty-width assertions -----------------------------------------
  kRegexpBeginLine,        // beginning of line
  kRegexpEndLine,          // end of line
  kRegexpWordBoundary,     // word boundary "\b"
  kRegexpNoWordBoundary,   // not-a-word boundary "\B"
  kRegexpBeginText,        // beginning of text
  kRegexpEndText,          // end of text

  // Matches character class given by cc_.
  kRegexpCharClass,

  // Forces match of entire expression right now,
  // with match ID match_id_ (used by RE2::Set).
  kRegexpHaveMatch,

  // Alias for the largest operator value.
  kMaxRegexpOp = kRegexpHaveMatch,
};
|
|
||||||
|
|
||||||
// Keep in sync with string list in regexp.cc
|
|
||||||
enum RegexpStatusCode {
  kRegexpSuccess = 0,        // no error
  kRegexpInternalError,      // unexpected error

  // Everything below is a parse error.
  kRegexpBadEscape,          // bad escape sequence
  kRegexpBadCharClass,       // bad character class
  kRegexpBadCharRange,       // bad character class range
  kRegexpMissingBracket,     // missing closing ]
  kRegexpMissingParen,       // missing closing )
  kRegexpTrailingBackslash,  // backslash at end of regexp
  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
  kRegexpRepeatSize,         // bad repetition argument
  kRegexpRepeatOp,           // bad repetition operator
  kRegexpBadPerlOp,          // bad perl operator
  kRegexpBadUTF8,            // invalid UTF-8 in regexp
  kRegexpBadNamedCapture,    // bad named capture
};
|
|
||||||
|
|
||||||
// Error status for certain operations.
|
|
||||||
class RegexpStatus {
|
|
||||||
public:
|
|
||||||
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
|
||||||
~RegexpStatus() { delete tmp_; }
|
|
||||||
|
|
||||||
void set_code(RegexpStatusCode code) { code_ = code; }
|
|
||||||
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
|
||||||
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
|
||||||
RegexpStatusCode code() const { return code_; }
|
|
||||||
const StringPiece& error_arg() const { return error_arg_; }
|
|
||||||
bool ok() const { return code() == kRegexpSuccess; }
|
|
||||||
|
|
||||||
// Copies state from status.
|
|
||||||
void Copy(const RegexpStatus& status);
|
|
||||||
|
|
||||||
// Returns text equivalent of code, e.g.:
|
|
||||||
// "Bad character class"
|
|
||||||
static string CodeText(RegexpStatusCode code);
|
|
||||||
|
|
||||||
// Returns text describing error, e.g.:
|
|
||||||
// "Bad character class: [z-a]"
|
|
||||||
string Text() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
RegexpStatusCode code_; // Kind of error
|
|
||||||
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
|
||||||
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
|
||||||
|
|
||||||
RegexpStatus(const RegexpStatus&) = delete;
|
|
||||||
RegexpStatus& operator=(const RegexpStatus&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Compiled form; see prog.h
|
|
||||||
class Prog;
|
|
||||||
|
|
||||||
struct RuneRange {
|
|
||||||
RuneRange() : lo(0), hi(0) { }
|
|
||||||
RuneRange(int l, int h) : lo(l), hi(h) { }
|
|
||||||
Rune lo;
|
|
||||||
Rune hi;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Less-than on RuneRanges treats a == b if they overlap at all.
|
|
||||||
// This lets us look in a set to find the range covering a particular Rune.
|
|
||||||
struct RuneRangeLess {
|
|
||||||
bool operator()(const RuneRange& a, const RuneRange& b) const {
|
|
||||||
return a.hi < b.lo;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
class CharClassBuilder;
|
|
||||||
|
|
||||||
class CharClass {
|
|
||||||
public:
|
|
||||||
void Delete();
|
|
||||||
|
|
||||||
typedef RuneRange* iterator;
|
|
||||||
iterator begin() { return ranges_; }
|
|
||||||
iterator end() { return ranges_ + nranges_; }
|
|
||||||
|
|
||||||
int size() { return nrunes_; }
|
|
||||||
bool empty() { return nrunes_ == 0; }
|
|
||||||
bool full() { return nrunes_ == Runemax+1; }
|
|
||||||
bool FoldsASCII() { return folds_ascii_; }
|
|
||||||
|
|
||||||
bool Contains(Rune r);
|
|
||||||
CharClass* Negate();
|
|
||||||
|
|
||||||
private:
|
|
||||||
CharClass(); // not implemented
|
|
||||||
~CharClass(); // not implemented
|
|
||||||
static CharClass* New(int maxranges);
|
|
||||||
|
|
||||||
friend class CharClassBuilder;
|
|
||||||
|
|
||||||
bool folds_ascii_;
|
|
||||||
int nrunes_;
|
|
||||||
RuneRange *ranges_;
|
|
||||||
int nranges_;
|
|
||||||
|
|
||||||
CharClass(const CharClass&) = delete;
|
|
||||||
CharClass& operator=(const CharClass&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
class Regexp {
|
|
||||||
public:
|
|
||||||
|
|
||||||
// Flags for parsing. Can be ORed together.
|
|
||||||
enum ParseFlags {
|
|
||||||
NoParseFlags = 0,
|
|
||||||
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
|
|
||||||
Literal = 1<<1, // Treat s as literal string instead of a regexp.
|
|
||||||
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
|
|
||||||
// and [[:space:]] to match newline.
|
|
||||||
DotNL = 1<<3, // Allow . to match newline.
|
|
||||||
MatchNL = ClassNL | DotNL,
|
|
||||||
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
|
|
||||||
// end of text, not around embedded newlines.
|
|
||||||
// (Perl's default)
|
|
||||||
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
|
|
||||||
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
|
|
||||||
PerlClasses = 1<<7, // Allow Perl character classes like \d.
|
|
||||||
PerlB = 1<<8, // Allow Perl's \b and \B.
|
|
||||||
PerlX = 1<<9, // Perl extensions:
|
|
||||||
// non-capturing parens - (?: )
|
|
||||||
// non-greedy operators - *? +? ?? {}?
|
|
||||||
// flag edits - (?i) (?-i) (?i: )
|
|
||||||
// i - FoldCase
|
|
||||||
// m - !OneLine
|
|
||||||
// s - DotNL
|
|
||||||
// U - NonGreedy
|
|
||||||
// line ends: \A \z
|
|
||||||
// \Q and \E to disable/enable metacharacters
|
|
||||||
// (?P<name>expr) for named captures
|
|
||||||
// \C to match any single byte
|
|
||||||
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
|
|
||||||
// and \P{Han} for its negation.
|
|
||||||
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
|
|
||||||
// it explicitly.
|
|
||||||
NeverCapture = 1<<12, // Parse all parens as non-capturing.
|
|
||||||
|
|
||||||
// As close to Perl as we can get.
|
|
||||||
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
|
|
||||||
UnicodeGroups,
|
|
||||||
|
|
||||||
// Internal use only.
|
|
||||||
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
|
|
||||||
AllParseFlags = (1<<14)-1,
|
|
||||||
};
|
|
||||||
|
|
||||||
// Get. No set, Regexps are logically immutable once created.
|
|
||||||
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
|
||||||
int nsub() { return nsub_; }
|
|
||||||
bool simple() { return simple_ != 0; }
|
|
||||||
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
|
||||||
int Ref(); // For testing.
|
|
||||||
|
|
||||||
Regexp** sub() {
|
|
||||||
if(nsub_ <= 1)
|
|
||||||
return &subone_;
|
|
||||||
else
|
|
||||||
return submany_;
|
|
||||||
}
|
|
||||||
|
|
||||||
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
|
|
||||||
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
|
|
||||||
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
|
|
||||||
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
|
|
||||||
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
|
|
||||||
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
|
|
||||||
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
|
|
||||||
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
|
|
||||||
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
|
|
||||||
|
|
||||||
// Increments reference count, returns object as convenience.
|
|
||||||
Regexp* Incref();
|
|
||||||
|
|
||||||
// Decrements reference count and deletes this object if count reaches 0.
|
|
||||||
void Decref();
|
|
||||||
|
|
||||||
// Parses string s to produce regular expression, returned.
|
|
||||||
// Caller must release return value with re->Decref().
|
|
||||||
// On failure, sets *status (if status != NULL) and returns NULL.
|
|
||||||
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
|
|
||||||
RegexpStatus* status);
|
|
||||||
|
|
||||||
// Returns a _new_ simplified version of the current regexp.
|
|
||||||
// Does not edit the current regexp.
|
|
||||||
// Caller must release return value with re->Decref().
|
|
||||||
// Simplified means that counted repetition has been rewritten
|
|
||||||
// into simpler terms and all Perl/POSIX features have been
|
|
||||||
// removed. The result will capture exactly the same
|
|
||||||
// subexpressions the original did, unless formatted with ToString.
|
|
||||||
Regexp* Simplify();
|
|
||||||
friend class CoalesceWalker;
|
|
||||||
friend class SimplifyWalker;
|
|
||||||
|
|
||||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
|
||||||
// string representation of the simplified form. Returns true on success.
|
|
||||||
// Returns false and sets *status (if status != NULL) on parse error.
|
|
||||||
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
|
||||||
string* dst,
|
|
||||||
RegexpStatus* status);
|
|
||||||
|
|
||||||
// Returns the number of capturing groups in the regexp.
|
|
||||||
int NumCaptures();
|
|
||||||
friend class NumCapturesWalker;
|
|
||||||
|
|
||||||
// Returns a map from names to capturing group indices,
|
|
||||||
// or NULL if the regexp contains no named capture groups.
|
|
||||||
// The caller is responsible for deleting the map.
|
|
||||||
std::map<string, int>* NamedCaptures();
|
|
||||||
|
|
||||||
// Returns a map from capturing group indices to capturing group
|
|
||||||
// names or NULL if the regexp contains no named capture groups. The
|
|
||||||
// caller is responsible for deleting the map.
|
|
||||||
std::map<int, string>* CaptureNames();
|
|
||||||
|
|
||||||
// Returns a string representation of the current regexp,
|
|
||||||
// using as few parentheses as possible.
|
|
||||||
string ToString();
|
|
||||||
|
|
||||||
// Convenience functions. They consume the passed reference,
|
|
||||||
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
|
|
||||||
// They do not consume allocated arrays like subs or runes.
|
|
||||||
static Regexp* Plus(Regexp* sub, ParseFlags flags);
|
|
||||||
static Regexp* Star(Regexp* sub, ParseFlags flags);
|
|
||||||
static Regexp* Quest(Regexp* sub, ParseFlags flags);
|
|
||||||
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
|
|
||||||
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
|
|
||||||
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
|
|
||||||
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
|
|
||||||
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
|
|
||||||
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
|
|
||||||
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
|
|
||||||
static Regexp* HaveMatch(int match_id, ParseFlags flags);
|
|
||||||
|
|
||||||
// Like Alternate but does not factor out common prefixes.
|
|
||||||
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
|
|
||||||
|
|
||||||
// Debugging function. Returns string format for regexp
|
|
||||||
// that makes structure clear. Does NOT use regexp syntax.
|
|
||||||
string Dump();
|
|
||||||
|
|
||||||
// Helper traversal class, defined fully in walker-inl.h.
|
|
||||||
template<typename T> class Walker;
|
|
||||||
|
|
||||||
// Compile to Prog. See prog.h
|
|
||||||
// Reverse prog expects to be run over text backward.
|
|
||||||
// Construction and execution of prog will
|
|
||||||
// stay within approximately max_mem bytes of memory.
|
|
||||||
// If max_mem <= 0, a reasonable default is used.
|
|
||||||
Prog* CompileToProg(int64_t max_mem);
|
|
||||||
Prog* CompileToReverseProg(int64_t max_mem);
|
|
||||||
|
|
||||||
// Whether to expect this library to find exactly the same answer as PCRE
|
|
||||||
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
|
||||||
// obscure cases behave differently. Technically this is more a property
|
|
||||||
// of the Prog than the Regexp, but the computation is much easier to do
|
|
||||||
// on the Regexp. See mimics_pcre.cc for the exact conditions.
|
|
||||||
bool MimicsPCRE();
|
|
||||||
|
|
||||||
// Benchmarking function.
|
|
||||||
void NullWalk();
|
|
||||||
|
|
||||||
// Whether every match of this regexp must be anchored and
|
|
||||||
// begin with a non-empty fixed string (perhaps after ASCII
|
|
||||||
// case-folding). If so, returns the prefix and the sub-regexp that
|
|
||||||
// follows it.
|
|
||||||
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
|
|
||||||
// regardless of the return value.
|
|
||||||
bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
|
|
||||||
|
|
||||||
private:
|
|
||||||
// Constructor allocates vectors as appropriate for operator.
|
|
||||||
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
|
|
||||||
|
|
||||||
// Use Decref() instead of delete to release Regexps.
|
|
||||||
// This is private to catch deletes at compile time.
|
|
||||||
~Regexp();
|
|
||||||
void Destroy();
|
|
||||||
bool QuickDestroy();
|
|
||||||
|
|
||||||
// Helpers for Parse. Listed here so they can edit Regexps.
|
|
||||||
class ParseState;
|
|
||||||
|
|
||||||
friend class ParseState;
|
|
||||||
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
|
||||||
RegexpStatus* status);
|
|
||||||
|
|
||||||
// Helper for testing [sic].
|
|
||||||
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
|
|
||||||
|
|
||||||
// Computes whether Regexp is already simple.
|
|
||||||
bool ComputeSimple();
|
|
||||||
|
|
||||||
// Constructor that generates a Star, Plus or Quest,
|
|
||||||
// squashing the pair if sub is also a Star, Plus or Quest.
|
|
||||||
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
|
|
||||||
|
|
||||||
// Constructor that generates a concatenation or alternation,
|
|
||||||
// enforcing the limit on the number of subexpressions for
|
|
||||||
// a particular Regexp.
|
|
||||||
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
|
|
||||||
ParseFlags flags, bool can_factor);
|
|
||||||
|
|
||||||
// Returns the leading string that re starts with.
|
|
||||||
// The returned Rune* points into a piece of re,
|
|
||||||
// so it must not be used after the caller calls re->Decref().
|
|
||||||
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
|
|
||||||
|
|
||||||
// Removes the first n leading runes from the beginning of re.
|
|
||||||
// Edits re in place.
|
|
||||||
static void RemoveLeadingString(Regexp* re, int n);
|
|
||||||
|
|
||||||
// Returns the leading regexp in re's top-level concatenation.
|
|
||||||
// The returned Regexp* points at re or a sub-expression of re,
|
|
||||||
// so it must not be used after the caller calls re->Decref().
|
|
||||||
static Regexp* LeadingRegexp(Regexp* re);
|
|
||||||
|
|
||||||
// Removes LeadingRegexp(re) from re and returns the remainder.
|
|
||||||
// Might edit re in place.
|
|
||||||
static Regexp* RemoveLeadingRegexp(Regexp* re);
|
|
||||||
|
|
||||||
// Simplifies an alternation of literal strings by factoring out
|
|
||||||
// common prefixes.
|
|
||||||
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
|
||||||
friend class FactorAlternationImpl;
|
|
||||||
|
|
||||||
// Is a == b? Only efficient on regexps that have not been through
|
|
||||||
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
|
||||||
// take a long time. Do not call on such regexps, hence private.
|
|
||||||
static bool Equal(Regexp* a, Regexp* b);
|
|
||||||
|
|
||||||
// Allocate space for n sub-regexps.
|
|
||||||
void AllocSub(int n) {
|
|
||||||
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
|
|
||||||
if (n > 1)
|
|
||||||
submany_ = new Regexp*[n];
|
|
||||||
nsub_ = static_cast<uint16_t>(n);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add Rune to LiteralString
|
|
||||||
void AddRuneToString(Rune r);
|
|
||||||
|
|
||||||
// Swaps this with that, in place.
|
|
||||||
void Swap(Regexp *that);
|
|
||||||
|
|
||||||
// Operator. See description of operators above.
|
|
||||||
// uint8_t instead of RegexpOp to control space usage.
|
|
||||||
uint8_t op_;
|
|
||||||
|
|
||||||
// Is this regexp structure already simple
|
|
||||||
// (has it been returned by Simplify)?
|
|
||||||
// uint8_t instead of bool to control space usage.
|
|
||||||
uint8_t simple_;
|
|
||||||
|
|
||||||
// Flags saved from parsing and used during execution.
|
|
||||||
// (Only FoldCase is used.)
|
|
||||||
// uint16_t instead of ParseFlags to control space usage.
|
|
||||||
uint16_t parse_flags_;
|
|
||||||
|
|
||||||
// Reference count. Exists so that SimplifyRegexp can build
|
|
||||||
// regexp structures that are dags rather than trees to avoid
|
|
||||||
// exponential blowup in space requirements.
|
|
||||||
// uint16_t to control space usage.
|
|
||||||
// The standard regexp routines will never generate a
|
|
||||||
// ref greater than the maximum repeat count (kMaxRepeat),
|
|
||||||
// but even so, Incref and Decref consult an overflow map
|
|
||||||
// when ref_ reaches kMaxRef.
|
|
||||||
uint16_t ref_;
|
|
||||||
static const uint16_t kMaxRef = 0xffff;
|
|
||||||
|
|
||||||
// Subexpressions.
|
|
||||||
// uint16_t to control space usage.
|
|
||||||
// Concat and Alternate handle larger numbers of subexpressions
|
|
||||||
// by building concatenation or alternation trees.
|
|
||||||
// Other routines should call Concat or Alternate instead of
|
|
||||||
// filling in sub() by hand.
|
|
||||||
uint16_t nsub_;
|
|
||||||
static const uint16_t kMaxNsub = 0xffff;
|
|
||||||
union {
|
|
||||||
Regexp** submany_; // if nsub_ > 1
|
|
||||||
Regexp* subone_; // if nsub_ == 1
|
|
||||||
};
|
|
||||||
|
|
||||||
// Extra space for parse and teardown stacks.
|
|
||||||
Regexp* down_;
|
|
||||||
|
|
||||||
// Arguments to operator. See description of operators above.
|
|
||||||
union {
|
|
||||||
struct { // Repeat
|
|
||||||
int max_;
|
|
||||||
int min_;
|
|
||||||
};
|
|
||||||
struct { // Capture
|
|
||||||
int cap_;
|
|
||||||
string* name_;
|
|
||||||
};
|
|
||||||
struct { // LiteralString
|
|
||||||
int nrunes_;
|
|
||||||
Rune* runes_;
|
|
||||||
};
|
|
||||||
struct { // CharClass
|
|
||||||
// These two could be in separate union members,
|
|
||||||
// but it wouldn't save any space (there are other two-word structs)
|
|
||||||
// and keeping them separate avoids confusion during parsing.
|
|
||||||
CharClass* cc_;
|
|
||||||
CharClassBuilder* ccb_;
|
|
||||||
};
|
|
||||||
Rune rune_; // Literal
|
|
||||||
int match_id_; // HaveMatch
|
|
||||||
void *the_union_[2]; // as big as any other element, for memset
|
|
||||||
};
|
|
||||||
|
|
||||||
Regexp(const Regexp&) = delete;
|
|
||||||
Regexp& operator=(const Regexp&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
|
||||||
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
|
|
||||||
|
|
||||||
class CharClassBuilder {
|
|
||||||
public:
|
|
||||||
CharClassBuilder();
|
|
||||||
|
|
||||||
typedef RuneRangeSet::iterator iterator;
|
|
||||||
iterator begin() { return ranges_.begin(); }
|
|
||||||
iterator end() { return ranges_.end(); }
|
|
||||||
|
|
||||||
int size() { return nrunes_; }
|
|
||||||
bool empty() { return nrunes_ == 0; }
|
|
||||||
bool full() { return nrunes_ == Runemax+1; }
|
|
||||||
|
|
||||||
bool Contains(Rune r);
|
|
||||||
bool FoldsASCII();
|
|
||||||
bool AddRange(Rune lo, Rune hi); // returns whether class changed
|
|
||||||
CharClassBuilder* Copy();
|
|
||||||
void AddCharClass(CharClassBuilder* cc);
|
|
||||||
void Negate();
|
|
||||||
void RemoveAbove(Rune r);
|
|
||||||
CharClass* GetCharClass();
|
|
||||||
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
|
||||||
|
|
||||||
private:
|
|
||||||
static const uint32_t AlphaMask = (1<<26) - 1;
|
|
||||||
uint32_t upper_; // bitmap of A-Z
|
|
||||||
uint32_t lower_; // bitmap of a-z
|
|
||||||
int nrunes_;
|
|
||||||
RuneRangeSet ranges_;
|
|
||||||
|
|
||||||
CharClassBuilder(const CharClassBuilder&) = delete;
|
|
||||||
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Bitwise ops on ParseFlags produce ParseFlags.
|
|
||||||
// Union of two flag sets, computed on the underlying int values.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int merged = static_cast<int>(a) | static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(merged);
}
|
|
||||||
|
|
||||||
// Symmetric difference of two flag sets, computed on the underlying int
// values.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int toggled = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(toggled);
}
|
|
||||||
|
|
||||||
// Intersection of two flag sets, computed on the underlying int values.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int common = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(common);
}
|
|
||||||
|
|
||||||
// Complement of a flag set.  The inverted bits are masked with
// AllParseFlags because producing a value outside the enum's range has
// undefined behaviour.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
  const int inverted = ~static_cast<int>(a);
  const int mask = static_cast<int>(Regexp::AllParseFlags);
  return static_cast<Regexp::ParseFlags>(inverted & mask);
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_REGEXP_H_
|
|
@ -1,154 +0,0 @@
|
|||||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "re2/set.h"
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <memory>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
#include "re2/prog.h"
|
|
||||||
#include "re2/re2.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Builds an empty, not-yet-compiled set.  The caller's options are copied
// and capturing is then switched off on the copy.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false),
      size_(0) {
  options_.Copy(options);
  options_.set_never_capture(true);  // might unblock some optimisations
}
|
|
||||||
|
|
||||||
// Drops the reference held on every regexp still queued in elem_ and
// destroys the compiled program (if any).
RE2::Set::~Set() {
  for (const Elem& entry : elem_)
    entry.second->Decref();
  delete prog_;
}
|
|
||||||
|
|
||||||
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
|
||||||
if (compiled_) {
|
|
||||||
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
|
||||||
options_.ParseFlags());
|
|
||||||
RegexpStatus status;
|
|
||||||
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
|
||||||
if (re == NULL) {
|
|
||||||
if (error != NULL)
|
|
||||||
*error = status.Text();
|
|
||||||
if (options_.log_errors())
|
|
||||||
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Concatenate with match index and push on vector.
|
|
||||||
int n = static_cast<int>(elem_.size());
|
|
||||||
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
|
||||||
if (re->op() == kRegexpConcat) {
|
|
||||||
int nsub = re->nsub();
|
|
||||||
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
|
|
||||||
for (int i = 0; i < nsub; i++)
|
|
||||||
sub[i] = re->sub()[i]->Incref();
|
|
||||||
sub[nsub] = m;
|
|
||||||
re->Decref();
|
|
||||||
re = re2::Regexp::Concat(sub, nsub + 1, pf);
|
|
||||||
delete[] sub;
|
|
||||||
} else {
|
|
||||||
re2::Regexp* sub[2];
|
|
||||||
sub[0] = re;
|
|
||||||
sub[1] = m;
|
|
||||||
re = re2::Regexp::Concat(sub, 2, pf);
|
|
||||||
}
|
|
||||||
elem_.emplace_back(pattern.ToString(), re);
|
|
||||||
return n;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool RE2::Set::Compile() {
|
|
||||||
if (compiled_) {
|
|
||||||
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
compiled_ = true;
|
|
||||||
size_ = static_cast<int>(elem_.size());
|
|
||||||
|
|
||||||
// Sort the elements by their patterns. This is good enough for now
|
|
||||||
// until we have a Regexp comparison function. (Maybe someday...)
|
|
||||||
std::sort(elem_.begin(), elem_.end(),
|
|
||||||
[](const Elem& a, const Elem& b) -> bool {
|
|
||||||
return a.first < b.first;
|
|
||||||
});
|
|
||||||
|
|
||||||
re2::Regexp** sub = new re2::Regexp*[size_];
|
|
||||||
for (size_t i = 0; i < elem_.size(); i++)
|
|
||||||
sub[i] = elem_[i].second;
|
|
||||||
elem_.clear();
|
|
||||||
elem_.shrink_to_fit();
|
|
||||||
|
|
||||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
|
||||||
options_.ParseFlags());
|
|
||||||
re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
|
|
||||||
delete[] sub;
|
|
||||||
|
|
||||||
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
|
|
||||||
re->Decref();
|
|
||||||
return prog_ != NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
|
|
||||||
return Match(text, v, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
|
|
||||||
ErrorInfo* error_info) const {
|
|
||||||
if (!compiled_) {
|
|
||||||
LOG(DFATAL) << "RE2::Set::Match() called before compiling";
|
|
||||||
if (error_info != NULL)
|
|
||||||
error_info->kind = kNotCompiled;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
bool dfa_failed = false;
|
|
||||||
std::unique_ptr<SparseSet> matches;
|
|
||||||
if (v != NULL) {
|
|
||||||
matches.reset(new SparseSet(size_));
|
|
||||||
v->clear();
|
|
||||||
}
|
|
||||||
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
|
|
||||||
NULL, &dfa_failed, matches.get());
|
|
||||||
if (dfa_failed) {
|
|
||||||
if (options_.log_errors())
|
|
||||||
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
|
|
||||||
<< "bytemap range " << prog_->bytemap_range() << ", "
|
|
||||||
<< "list count " << prog_->list_count();
|
|
||||||
if (error_info != NULL)
|
|
||||||
error_info->kind = kOutOfMemory;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (ret == false) {
|
|
||||||
if (error_info != NULL)
|
|
||||||
error_info->kind = kNoError;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (v != NULL) {
|
|
||||||
if (matches->empty()) {
|
|
||||||
LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
|
|
||||||
if (error_info != NULL)
|
|
||||||
error_info->kind = kInconsistent;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
v->assign(matches->begin(), matches->end());
|
|
||||||
}
|
|
||||||
if (error_info != NULL)
|
|
||||||
error_info->kind = kNoError;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,80 +0,0 @@
|
|||||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_SET_H_
|
|
||||||
#define RE2_SET_H_
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include <utility>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include "re2/re2.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
class Prog;
|
|
||||||
class Regexp;
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// An RE2::Set represents a collection of regexps that can
|
|
||||||
// be searched for simultaneously.
|
|
||||||
class RE2::Set {
|
|
||||||
public:
|
|
||||||
enum ErrorKind {
|
|
||||||
kNoError = 0,
|
|
||||||
kNotCompiled, // The set is not compiled.
|
|
||||||
kOutOfMemory, // The DFA ran out of memory.
|
|
||||||
kInconsistent, // The result is inconsistent. This should never happen.
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ErrorInfo {
|
|
||||||
ErrorKind kind;
|
|
||||||
};
|
|
||||||
|
|
||||||
Set(const RE2::Options& options, RE2::Anchor anchor);
|
|
||||||
~Set();
|
|
||||||
|
|
||||||
// Adds pattern to the set using the options passed to the constructor.
|
|
||||||
// Returns the index that will identify the regexp in the output of Match(),
|
|
||||||
// or -1 if the regexp cannot be parsed.
|
|
||||||
// Indices are assigned in sequential order starting from 0.
|
|
||||||
// Errors do not increment the index; if error is not NULL, *error will hold
|
|
||||||
// the error message from the parser.
|
|
||||||
int Add(const StringPiece& pattern, string* error);
|
|
||||||
|
|
||||||
// Compiles the set in preparation for matching.
|
|
||||||
// Returns false if the compiler runs out of memory.
|
|
||||||
// Add() must not be called again after Compile().
|
|
||||||
// Compile() must be called before Match().
|
|
||||||
bool Compile();
|
|
||||||
|
|
||||||
// Returns true if text matches at least one of the regexps in the set.
|
|
||||||
// Fills v (if not NULL) with the indices of the matching regexps.
|
|
||||||
// Callers must not expect v to be sorted.
|
|
||||||
bool Match(const StringPiece& text, std::vector<int>* v) const;
|
|
||||||
|
|
||||||
// As above, but populates error_info (if not NULL) when none of the regexps
|
|
||||||
// in the set matched. This can inform callers when DFA execution fails, for
|
|
||||||
// example, because they might wish to handle that case differently.
|
|
||||||
bool Match(const StringPiece& text, std::vector<int>* v,
|
|
||||||
ErrorInfo* error_info) const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
typedef std::pair<string, re2::Regexp*> Elem;
|
|
||||||
|
|
||||||
RE2::Options options_;
|
|
||||||
RE2::Anchor anchor_;
|
|
||||||
std::vector<Elem> elem_;
|
|
||||||
re2::Prog* prog_;
|
|
||||||
bool compiled_;
|
|
||||||
int size_;
|
|
||||||
|
|
||||||
Set(const Set&) = delete;
|
|
||||||
Set& operator=(const Set&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_SET_H_
|
|
@ -1,658 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Rewrite POSIX and other features in re
|
|
||||||
// to use simple extended regular expression features.
|
|
||||||
// Also sort and simplify character classes.
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
#include "re2/walker-inl.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
|
||||||
// string representation of the simplified form. Returns true on success.
|
|
||||||
// Returns false and sets *error (if error != NULL) on error.
|
|
||||||
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
|
||||||
string* dst,
|
|
||||||
RegexpStatus* status) {
|
|
||||||
Regexp* re = Parse(src, flags, status);
|
|
||||||
if (re == NULL)
|
|
||||||
return false;
|
|
||||||
Regexp* sre = re->Simplify();
|
|
||||||
re->Decref();
|
|
||||||
if (sre == NULL) {
|
|
||||||
// Should not happen, since Simplify never fails.
|
|
||||||
LOG(ERROR) << "Simplify failed on " << src;
|
|
||||||
if (status) {
|
|
||||||
status->set_code(kRegexpInternalError);
|
|
||||||
status->set_error_arg(src);
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
*dst = sre->ToString();
|
|
||||||
sre->Decref();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Assuming the simple_ flags on the children are accurate,
|
|
||||||
// is this Regexp* simple?
|
|
||||||
bool Regexp::ComputeSimple() {
|
|
||||||
Regexp** subs;
|
|
||||||
switch (op_) {
|
|
||||||
case kRegexpNoMatch:
|
|
||||||
case kRegexpEmptyMatch:
|
|
||||||
case kRegexpLiteral:
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
case kRegexpBeginLine:
|
|
||||||
case kRegexpEndLine:
|
|
||||||
case kRegexpBeginText:
|
|
||||||
case kRegexpWordBoundary:
|
|
||||||
case kRegexpNoWordBoundary:
|
|
||||||
case kRegexpEndText:
|
|
||||||
case kRegexpAnyChar:
|
|
||||||
case kRegexpAnyByte:
|
|
||||||
case kRegexpHaveMatch:
|
|
||||||
return true;
|
|
||||||
case kRegexpConcat:
|
|
||||||
case kRegexpAlternate:
|
|
||||||
// These are simple as long as the subpieces are simple.
|
|
||||||
subs = sub();
|
|
||||||
for (int i = 0; i < nsub_; i++)
|
|
||||||
if (!subs[i]->simple())
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
case kRegexpCharClass:
|
|
||||||
// Simple as long as the char class is not empty, not full.
|
|
||||||
if (ccb_ != NULL)
|
|
||||||
return !ccb_->empty() && !ccb_->full();
|
|
||||||
return !cc_->empty() && !cc_->full();
|
|
||||||
case kRegexpCapture:
|
|
||||||
subs = sub();
|
|
||||||
return subs[0]->simple();
|
|
||||||
case kRegexpStar:
|
|
||||||
case kRegexpPlus:
|
|
||||||
case kRegexpQuest:
|
|
||||||
subs = sub();
|
|
||||||
if (!subs[0]->simple())
|
|
||||||
return false;
|
|
||||||
switch (subs[0]->op_) {
|
|
||||||
case kRegexpStar:
|
|
||||||
case kRegexpPlus:
|
|
||||||
case kRegexpQuest:
|
|
||||||
case kRegexpEmptyMatch:
|
|
||||||
case kRegexpNoMatch:
|
|
||||||
return false;
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
case kRegexpRepeat:
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Walker subclass used by Simplify.
|
|
||||||
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
|
||||||
// occurrences of that literal into repeats of that literal. It also works for
|
|
||||||
// char classes, any char and any byte.
|
|
||||||
// PostVisit creates the coalesced result, which should then be simplified.
|
|
||||||
class CoalesceWalker : public Regexp::Walker<Regexp*> {
|
|
||||||
public:
|
|
||||||
CoalesceWalker() {}
|
|
||||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
|
||||||
Regexp** child_args, int nchild_args);
|
|
||||||
virtual Regexp* Copy(Regexp* re);
|
|
||||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
|
||||||
|
|
||||||
private:
|
|
||||||
// These functions are declared inside CoalesceWalker so that
|
|
||||||
// they can edit the private fields of the Regexps they construct.
|
|
||||||
|
|
||||||
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
|
||||||
// the parse flags are consistent. (They will not be checked again later.)
|
|
||||||
static bool CanCoalesce(Regexp* r1, Regexp* r2);
|
|
||||||
|
|
||||||
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
|
||||||
// will be empty match and the coalesced op. In other cases, where part of a
|
|
||||||
// literal string was removed to be coalesced, the array elements afterwards
|
|
||||||
// will be the coalesced op and the remainder of the literal string.
|
|
||||||
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
|
|
||||||
|
|
||||||
CoalesceWalker(const CoalesceWalker&) = delete;
|
|
||||||
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Walker subclass used by Simplify.
|
|
||||||
// The simplify walk is purely post-recursive: given the simplified children,
|
|
||||||
// PostVisit creates the simplified result.
|
|
||||||
// The child_args are simplified Regexp*s.
|
|
||||||
class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
|
||||||
public:
|
|
||||||
SimplifyWalker() {}
|
|
||||||
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
|
||||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
|
||||||
Regexp** child_args, int nchild_args);
|
|
||||||
virtual Regexp* Copy(Regexp* re);
|
|
||||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
|
||||||
|
|
||||||
private:
|
|
||||||
// These functions are declared inside SimplifyWalker so that
|
|
||||||
// they can edit the private fields of the Regexps they construct.
|
|
||||||
|
|
||||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
|
||||||
// Caller must Decref return value when done with it.
|
|
||||||
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
|
|
||||||
|
|
||||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
|
||||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
|
||||||
// Caller must Decref return value when done with it.
|
|
||||||
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
|
|
||||||
Regexp::ParseFlags parse_flags);
|
|
||||||
|
|
||||||
// Simplifies a character class by expanding any named classes
|
|
||||||
// into rune ranges. Does not edit re. Does not consume ref to re.
|
|
||||||
// Caller must Decref return value when done with it.
|
|
||||||
static Regexp* SimplifyCharClass(Regexp* re);
|
|
||||||
|
|
||||||
SimplifyWalker(const SimplifyWalker&) = delete;
|
|
||||||
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Simplifies a regular expression, returning a new regexp.
|
|
||||||
// The new regexp uses traditional Unix egrep features only,
|
|
||||||
// plus the Perl (?:) non-capturing parentheses.
|
|
||||||
// Otherwise, no POSIX or Perl additions. The new regexp
|
|
||||||
// captures exactly the same subexpressions (with the same indices)
|
|
||||||
// as the original.
|
|
||||||
// Does not edit current object.
|
|
||||||
// Caller must Decref() return value when done with it.
|
|
||||||
|
|
||||||
Regexp* Regexp::Simplify() {
|
|
||||||
CoalesceWalker cw;
|
|
||||||
Regexp* cre = cw.Walk(this, NULL);
|
|
||||||
if (cre == NULL)
|
|
||||||
return cre;
|
|
||||||
SimplifyWalker sw;
|
|
||||||
Regexp* sre = sw.Walk(cre, NULL);
|
|
||||||
cre->Decref();
|
|
||||||
return sre;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define Simplify DontCallSimplify // Avoid accidental recursion
|
|
||||||
|
|
||||||
// Utility function for PostVisit implementations that compares re->sub() with
|
|
||||||
// child_args to determine whether any child_args changed. In the common case,
|
|
||||||
// where nothing changed, calls Decref() for all child_args and returns false,
|
|
||||||
// so PostVisit must return re->Incref(). Otherwise, returns true.
|
|
||||||
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
|
|
||||||
for (int i = 0; i < re->nsub(); i++) {
|
|
||||||
Regexp* sub = re->sub()[i];
|
|
||||||
Regexp* newsub = child_args[i];
|
|
||||||
if (newsub != sub)
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < re->nsub(); i++) {
|
|
||||||
Regexp* newsub = child_args[i];
|
|
||||||
newsub->Decref();
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Copying an argument just takes another reference on the same regexp.
Regexp* CoalesceWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
|
|
||||||
|
|
||||||
// Not expected to run: Simplify uses Walk, not WalkExponential.  Logs the
// unexpected call and returns a new reference to re.
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
  Regexp* unchanged = re->Incref();
  return unchanged;
}
|
|
||||||
|
|
||||||
Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  // Nodes without children are returned as-is (with a new reference).
  if (re->nsub() == 0)
    return re->Incref();

  // Only a concatenation is scanned for adjacent children that can merge.
  bool mergeable = false;
  if (re->op() == kRegexpConcat) {
    for (int k = 0; k + 1 < re->nsub(); k++) {
      if (CanCoalesce(child_args[k], child_args[k + 1])) {
        mergeable = true;
        break;
      }
    }
  }

  if (!mergeable) {
    // ChildArgsChanged() releases child_args itself when nothing changed.
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // At least one child was replaced: build a node of the same op around
    // the new children.
    Regexp* copy = new Regexp(re->op(), re->parse_flags());
    copy->AllocSub(re->nsub());
    Regexp** dst = copy->sub();
    for (int k = 0; k < re->nsub(); k++)
      dst[k] = child_args[k];

    // Repeats and captures carry extra data that must follow the copy.
    // (Neither branch applies to a concatenation.)
    if (re->op() == kRegexpRepeat) {
      copy->min_ = re->min();
      copy->max_ = re->max();
    } else if (re->op() == kRegexpCapture) {
      copy->cap_ = re->cap();
    }
    return copy;
  }

  // Merge every adjacent pair that qualifies, left to right.  DoCoalesce
  // rewrites the two slots in place.
  for (int k = 0; k + 1 < re->nsub(); k++) {
    if (CanCoalesce(child_args[k], child_args[k + 1]))
      DoCoalesce(&child_args[k], &child_args[k + 1]);
  }

  // Count the children that survive, i.e. everything except empty matches.
  int kept = 0;
  for (int k = 0; k < re->nsub(); k++) {
    if (child_args[k]->op() != kRegexpEmptyMatch)
      kept++;
  }

  // Build the new node from the surviving children, dropping (and
  // releasing) the empty matches.
  Regexp* merged = new Regexp(re->op(), re->parse_flags());
  merged->AllocSub(kept);
  Regexp** out = merged->sub();
  for (int k = 0; k < re->nsub(); k++) {
    Regexp* child = child_args[k];
    if (child->op() == kRegexpEmptyMatch)
      child->Decref();
    else
      *out++ = child;
  }
  return merged;
}
|
|
||||||
|
|
||||||
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
  // r1 has to be a star/plus/quest/repeat ...
  switch (r1->op()) {
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
      break;
    default:
      return false;
  }

  // ... of a literal, char class, any char or any byte.
  Regexp* atom = r1->sub()[0];
  switch (atom->op()) {
    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      break;
    default:
      return false;
  }

  // Case 1: r2 is a star/plus/quest/repeat of the same atom, and both
  // operators agree on greediness.
  const bool r2_is_repetition = r2->op() == kRegexpStar ||
                                r2->op() == kRegexpPlus ||
                                r2->op() == kRegexpQuest ||
                                r2->op() == kRegexpRepeat;
  if (r2_is_repetition &&
      Regexp::Equal(atom, r2->sub()[0]) &&
      ((r1->parse_flags() & Regexp::NonGreedy) ==
       (r2->parse_flags() & Regexp::NonGreedy)))
    return true;

  // Case 2: r2 is a single occurrence of that atom.
  if (Regexp::Equal(atom, r2))
    return true;

  // Case 3: r2 is a literal string starting with that literal, with the
  // same case-folding setting.
  return atom->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == atom->rune() &&
         ((atom->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase));
}
|
|
||||||
|
|
||||||
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
  Regexp* r1 = *r1ptr;
  Regexp* r2 = *r2ptr;

  // The merged node is always a repeat of r1's operand; its bounds are
  // filled in below.
  Regexp* merged = Regexp::Repeat(
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);

  // Step 1: seed the bounds from r1's operator.
  switch (r1->op()) {
    case kRegexpStar:
      merged->min_ = 0;
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_ = 1;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      merged->min_ = 0;
      merged->max_ = 1;
      break;

    case kRegexpRepeat:
      merged->min_ = r1->min();
      merged->max_ = r1->max();
      break;

    default:
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
      merged->Decref();
      return;
  }

  // Step 2: fold r2 into the bounds.  A max of -1 means "unbounded" and
  // stays that way.  `absorbed` records whether r2 was consumed entirely.
  bool absorbed = true;
  switch (r2->op()) {
    case kRegexpStar:
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_++;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpRepeat:
      merged->min_ += r2->min();
      if (r2->max() == -1)
        merged->max_ = -1;
      else if (merged->max() != -1)
        merged->max_ += r2->max();
      break;

    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      merged->min_++;
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpLiteralString: {
      Rune r = r1->sub()[0]->rune();
      // Count the leading runes of the string that equal r.  The first
      // rune is counted without being compared.
      int n = 1;
      while (n < r2->nrunes() && r2->runes()[n] == r)
        n++;
      merged->min_ += n;
      if (merged->max() != -1)
        merged->max_ += n;
      if (n != r2->nrunes()) {
        // Part of the string remains: the repeat goes in the first slot
        // and the leftover suffix in the second.
        *r1ptr = merged;
        *r2ptr = Regexp::LiteralString(
            &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
        absorbed = false;
      }
      break;
    }

    default:
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
      merged->Decref();
      return;
  }

  if (absorbed) {
    // r2 vanished into the repeat: leave an empty match in the first slot
    // and put the repeat in the second.
    *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
    *r2ptr = merged;
  }

  // Both slots were overwritten; release the nodes they used to hold.
  r1->Decref();
  r2->Decref();
}
|
|
||||||
|
|
||||||
Regexp* SimplifyWalker::Copy(Regexp* re) {
  // A "copy" is just another reference to the same node.
  Regexp* shared = re->Incref();
  return shared;
}
|
|
||||||
|
|
||||||
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  // Not expected to run: Simplify() drives this walker through Walk(),
  // not WalkExponential().  Report it and fall back to sharing the node.
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* shared = re->Incref();
  return shared;
}
|
|
||||||
|
|
||||||
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  // Not yet known to be simple: return NULL and leave *stop untouched.
  if (!re->simple())
    return NULL;

  // Already simple: set *stop and reuse the node itself.
  *stop = true;
  return re->Incref();
}
|
|
||||||
|
|
||||||
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    // Operators that are always simple: mark and share the node.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      re->simple_ = true;
      return re->Incref();

    // Simple exactly when all the pieces are simple.
    case kRegexpConcat:
    case kRegexpAlternate: {
      if (ChildArgsChanged(re, child_args)) {
        // Some child was rewritten: wrap the new children in a fresh node.
        Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
        rebuilt->AllocSub(re->nsub());
        Regexp** dst = rebuilt->sub();
        for (int k = 0; k < re->nsub(); k++)
          dst[k] = child_args[k];
        rebuilt->simple_ = true;
        return rebuilt;
      }
      // Children untouched (their references were already released).
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpCapture: {
      Regexp* inner = child_args[0];
      if (inner != re->sub()[0]) {
        // The body changed: rebuild the group, keeping its capture index.
        Regexp* rebuilt = new Regexp(kRegexpCapture, re->parse_flags());
        rebuilt->AllocSub(1);
        rebuilt->sub()[0] = inner;
        rebuilt->cap_ = re->cap();
        rebuilt->simple_ = true;
        return rebuilt;
      }
      inner->Decref();
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* inner = child_args[0];

      // Repeating the empty string still gives the empty string.
      if (inner->op() == kRegexpEmptyMatch)
        return inner;

      // Unchanged operand: the existing node is already the answer.
      if (inner == re->sub()[0]) {
        inner->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // Applying the same operator (with the same flags) twice is
      // idempotent, so the operand itself is the result.
      const bool same_operator = re->op() == inner->op() &&
                                 re->parse_flags() == inner->parse_flags();
      if (same_operator)
        return inner;

      Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
      rebuilt->AllocSub(1);
      rebuilt->sub()[0] = inner;
      rebuilt->simple_ = true;
      return rebuilt;
    }

    case kRegexpRepeat: {
      Regexp* inner = child_args[0];

      // Repeating the empty string still gives the empty string.
      if (inner->op() == kRegexpEmptyMatch)
        return inner;

      // SimplifyRepeat does not consume our reference to inner, so it is
      // released here.
      Regexp* expanded = SimplifyRepeat(inner, re->min_, re->max_,
                                        re->parse_flags());
      inner->Decref();
      expanded->simple_ = true;
      return expanded;
    }

    case kRegexpCharClass: {
      Regexp* reduced = SimplifyCharClass(re);
      reduced->simple_ = true;
      return reduced;
    }
  }

  // Reached only for an op that none of the cases above covers.
  LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
|
|
||||||
|
|
||||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
|
||||||
// Returns a new Regexp, handing the ref to the caller.
|
|
||||||
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
|
|
||||||
Regexp::ParseFlags parse_flags) {
|
|
||||||
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
|
|
||||||
re->AllocSub(2);
|
|
||||||
Regexp** subs = re->sub();
|
|
||||||
subs[0] = re1;
|
|
||||||
subs[1] = re2;
|
|
||||||
return re;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
|
||||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
|
||||||
// Caller must Decref return value when done with it.
|
|
||||||
// The result will *not* necessarily have the right capturing parens
|
|
||||||
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
|
||||||
// but in the Regexp* representation, both (x) are marked as $1.
|
|
||||||
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
|
||||||
Regexp::ParseFlags f) {
|
|
||||||
// x{n,} means at least n matches of x.
|
|
||||||
if (max == -1) {
|
|
||||||
// Special case: x{0,} is x*
|
|
||||||
if (min == 0)
|
|
||||||
return Regexp::Star(re->Incref(), f);
|
|
||||||
|
|
||||||
// Special case: x{1,} is x+
|
|
||||||
if (min == 1)
|
|
||||||
return Regexp::Plus(re->Incref(), f);
|
|
||||||
|
|
||||||
// General case: x{4,} is xxxx+
|
|
||||||
Regexp** nre_subs = new Regexp*[min];
|
|
||||||
for (int i = 0; i < min-1; i++)
|
|
||||||
nre_subs[i] = re->Incref();
|
|
||||||
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
|
||||||
Regexp* nre = Regexp::Concat(nre_subs, min, f);
|
|
||||||
delete[] nre_subs;
|
|
||||||
return nre;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Special case: (x){0} matches only empty string.
|
|
||||||
if (min == 0 && max == 0)
|
|
||||||
return new Regexp(kRegexpEmptyMatch, f);
|
|
||||||
|
|
||||||
// Special case: x{1} is just x.
|
|
||||||
if (min == 1 && max == 1)
|
|
||||||
return re->Incref();
|
|
||||||
|
|
||||||
// General case: x{n,m} means n copies of x and m copies of x?.
|
|
||||||
// The machine will do less work if we nest the final m copies,
|
|
||||||
// so that x{2,5} = xx(x(x(x)?)?)?
|
|
||||||
|
|
||||||
// Build leading prefix: xx. Capturing only on the last one.
|
|
||||||
Regexp* nre = NULL;
|
|
||||||
if (min > 0) {
|
|
||||||
Regexp** nre_subs = new Regexp*[min];
|
|
||||||
for (int i = 0; i < min; i++)
|
|
||||||
nre_subs[i] = re->Incref();
|
|
||||||
nre = Regexp::Concat(nre_subs, min, f);
|
|
||||||
delete[] nre_subs;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Build and attach suffix: (x(x(x)?)?)?
|
|
||||||
if (max > min) {
|
|
||||||
Regexp* suf = Regexp::Quest(re->Incref(), f);
|
|
||||||
for (int i = min+1; i < max; i++)
|
|
||||||
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
|
|
||||||
if (nre == NULL)
|
|
||||||
nre = suf;
|
|
||||||
else
|
|
||||||
nre = Concat2(nre, suf, f);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (nre == NULL) {
|
|
||||||
// Some degenerate case, like min > max, or min < max < 0.
|
|
||||||
// This shouldn't happen, because the parser rejects such regexps.
|
|
||||||
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
|
|
||||||
return new Regexp(kRegexpNoMatch, f);
|
|
||||||
}
|
|
||||||
|
|
||||||
return nre;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Simplifies a character class.
|
|
||||||
// Caller must Decref return value when done with it.
|
|
||||||
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
|
|
||||||
CharClass* cc = re->cc();
|
|
||||||
|
|
||||||
// Special cases
|
|
||||||
if (cc->empty())
|
|
||||||
return new Regexp(kRegexpNoMatch, re->parse_flags());
|
|
||||||
if (cc->full())
|
|
||||||
return new Regexp(kRegexpAnyChar, re->parse_flags());
|
|
||||||
|
|
||||||
return re->Incref();
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,65 +0,0 @@
|
|||||||
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
|
|
||||||
#include <ostream>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
|
|
||||||
|
|
||||||
// Copies up to n characters, starting at offset pos, into buf.
// Returns the number of characters copied (no terminator is written).
// A pos past the end copies nothing and returns 0.
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
                                         size_type pos) const {
  // size_type is unsigned: without this guard, size_ - pos would wrap to a
  // huge value for pos > size_ and memcpy would read far out of bounds.
  if (pos > size_)
    return 0;
  size_type ret = std::min(size_ - pos, n);
  // Skip the call entirely for an empty copy so memcpy never sees the NULL
  // data_ of a default-constructed StringPiece.
  if (ret > 0)
    memcpy(buf, data_ + pos, ret);
  return ret;
}
|
|
||||||
|
|
||||||
// Returns the piece starting at pos and spanning at most n characters.
// Both arguments are clamped to the available data.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = (pos > size_) ? size_ : pos;
  const size_type available = size_ - start;
  const size_type length = (n > available) ? available : n;
  return StringPiece(data_ + start, length);
}
|
|
||||||
|
|
||||||
// Returns the offset of the first occurrence of s at or after pos,
// or npos if there is none.
StringPiece::size_type StringPiece::find(const StringPiece& s,
                                         size_type pos) const {
  if (pos > size_)
    return npos;

  const_pointer haystack_end = data_ + size_;
  const_pointer hit = std::search(data_ + pos, haystack_end,
                                  s.data_, s.data_ + s.size_);
  const size_type offset = hit - data_;
  // std::search returns haystack_end on failure; a full match starting at
  // offset would then extend past the end.
  if (offset + s.size_ > size_)
    return npos;
  return offset;
}
|
|
||||||
|
|
||||||
// Returns the offset of the first occurrence of c at or after pos,
// or npos if there is none.
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
  // Also covers the empty piece: pos >= 0 always holds for size_ == 0.
  if (pos >= size_)
    return npos;

  const_pointer stop = data_ + size_;
  for (const_pointer cursor = data_ + pos; cursor != stop; ++cursor) {
    if (*cursor == c)
      return cursor - data_;
  }
  return npos;
}
|
|
||||||
|
|
||||||
// Returns the offset of the last occurrence of s that starts at or before
// pos, or npos if there is none.
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
                                          size_type pos) const {
  if (s.size_ > size_)
    return npos;
  // The empty string is found at the clamped position.
  if (s.size_ == 0)
    return std::min(size_, pos);

  // The latest admissible start is min(size_ - s.size_, pos); the searched
  // range therefore ends s.size_ characters after it.
  const size_type latest_start = std::min(size_ - s.size_, pos);
  const_pointer limit = data_ + latest_start + s.size_;
  const_pointer hit = std::find_end(data_, limit, s.data_, s.data_ + s.size_);
  if (hit == limit)
    return npos;
  return hit - data_;
}
|
|
||||||
|
|
||||||
// Returns the offset of the last occurrence of c at or before pos,
// or npos if there is none.  pos defaults to npos in the class
// declaration, meaning "search the whole piece".
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
  if (size_ <= 0) return npos;
  // i is one past the first index to examine.  Clamp pos *before* adding 1:
  // the previous std::min(pos + 1, size_) wrapped to 0 for pos == npos, so
  // the default-argument call never examined a single character and always
  // returned npos.
  size_type i = (pos < size_ - 1) ? pos + 1 : size_;
  while (i != 0) {
    if (data_[--i] == c) return i;
  }
  return npos;
}
|
|
||||||
|
|
||||||
// Writes the raw bytes of p to o using unformatted output.
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
  const char* bytes = p.data();
  o.write(bytes, p.size());
  return o;
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,190 +0,0 @@
|
|||||||
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_STRINGPIECE_H_
|
|
||||||
#define RE2_STRINGPIECE_H_
|
|
||||||
|
|
||||||
// A string-like object that points to a sized piece of memory.
|
|
||||||
//
|
|
||||||
// Functions or methods may use const StringPiece& parameters to accept either
|
|
||||||
// a "const char*" or a "string" value that will be implicitly converted to
|
|
||||||
// a StringPiece. The implicit conversion means that it is often appropriate
|
|
||||||
// to include this .h file in other files rather than forward-declaring
|
|
||||||
// StringPiece as would be appropriate for most other Google classes.
|
|
||||||
//
|
|
||||||
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
|
||||||
// conversions from "const char*" to "string" and back again.
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// Arghh! I wish C++ literals were "string".
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <iosfwd>
|
|
||||||
#include <iterator>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// A non-owning (pointer, length) view of a run of characters.
class StringPiece {
 public:
  // Standard container-style member types.
  typedef char value_type;
  typedef char* pointer;
  typedef const char* const_pointer;
  typedef char& reference;
  typedef const char& const_reference;
  typedef const char* const_iterator;
  typedef const_iterator iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef const_reverse_iterator reverse_iterator;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos = static_cast<size_type>(-1);

  // The single-argument constructors are deliberately non-explicit so that
  // a "const char*" or a "string" can be passed wherever a StringPiece is
  // expected.
  StringPiece() : data_(NULL), size_(0) {}
  StringPiece(const std::string& str)
      : data_(str.data()), size_(str.size()) {}
  // A NULL pointer yields an empty piece.
  StringPiece(const char* str)
      : data_(str), size_(str != NULL ? strlen(str) : 0) {}
  StringPiece(const char* str, size_type len) : data_(str), size_(len) {}

  // Iteration.
  const_iterator begin() const { return data_; }
  const_iterator end() const { return data_ + size_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(end());
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(begin());
  }

  // Capacity.
  size_type size() const { return size_; }
  size_type length() const { return size(); }
  bool empty() const { return size() == 0; }

  // Element access; operator[] does no bounds checking.
  const_reference operator[](size_type i) const { return *(data_ + i); }
  const_pointer data() const { return data_; }

  // Drops the first n characters; n is not checked against size().
  void remove_prefix(size_type n) {
    size_ -= n;
    data_ += n;
  }

  // Drops the last n characters; n is not checked against size().
  void remove_suffix(size_type n) { size_ -= n; }

  // Re-points the view at a NUL-terminated string (NULL gives an empty view).
  void set(const char* str) {
    set(str, str != NULL ? strlen(str) : 0);
  }

  // Re-points the view at len characters starting at str.
  void set(const char* str, size_type len) {
    data_ = str;
    size_ = len;
  }

  // Returns an owning copy of the viewed characters.
  std::string as_string() const {
    return std::string(data_, size_);
  }

  // Same as as_string().  Many other string-like interfaces call this
  // conversion "ToString", so both spellings are provided; as_string()
  // remains for existing code.
  std::string ToString() const { return as_string(); }

  // Replaces *target with the viewed characters.
  void CopyToString(std::string* target) const {
    target->assign(data_, size_);
  }

  // Appends the viewed characters to *target.
  void AppendToString(std::string* target) const {
    target->append(data_, size_);
  }

  // Defined out of line.
  size_type copy(char* buf, size_type n, size_type pos = 0) const;
  StringPiece substr(size_type pos = 0, size_type n = npos) const;

  // Three-way bytewise comparison: returns -1, 0 or 1.  When one piece is a
  // prefix of the other, the shorter one orders first.
  int compare(const StringPiece& x) const {
    const size_type common = std::min(size(), x.size());
    if (common > 0) {
      const int order = memcmp(data(), x.data(), common);
      if (order != 0) return order < 0 ? -1 : 1;
    }
    if (size() == x.size()) return 0;
    return size() < x.size() ? -1 : 1;
  }

  // Does "this" start with "x"?
  bool starts_with(const StringPiece& x) const {
    if (x.empty()) return true;
    if (size() < x.size()) return false;
    return memcmp(data(), x.data(), x.size()) == 0;
  }

  // Does "this" end with "x"?
  bool ends_with(const StringPiece& x) const {
    if (x.empty()) return true;
    if (size() < x.size()) return false;
    const_pointer tail = data() + (size() - x.size());
    return memcmp(tail, x.data(), x.size()) == 0;
  }

  // Does "this" contain "s" anywhere?
  bool contains(const StringPiece& s) const {
    const size_type where = find(s);
    return where != npos;
  }

  // Searches; defined out of line.
  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

 private:
  const_pointer data_;  // First viewed character (may be NULL when empty).
  size_type size_;      // Number of viewed characters.
};
|
|
||||||
|
|
||||||
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
|
||||||
StringPiece::size_type len = x.size();
|
|
||||||
if (len != y.size()) return false;
|
|
||||||
return x.data() == y.data() || len == 0 ||
|
|
||||||
memcmp(x.data(), y.data(), len) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
|
||||||
return !(x == y);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator<(const StringPiece& x, const StringPiece& y) {
|
|
||||||
StringPiece::size_type min_size = std::min(x.size(), y.size());
|
|
||||||
int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
|
|
||||||
return (r < 0) || (r == 0 && x.size() < y.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator>(const StringPiece& x, const StringPiece& y) {
|
|
||||||
return y < x;
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
|
|
||||||
return !(x > y);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
|
|
||||||
return !(x < y);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Allow StringPiece to be logged.
|
|
||||||
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_STRINGPIECE_H_
|
|
@ -1,351 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Format a regular expression structure as a string.
|
|
||||||
// Tested by parse_test.cc
|
|
||||||
|
|
||||||
#include <string.h>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/strutil.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
#include "re2/walker-inl.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Context precedence levels passed between ToStringWalker visits.
// The walker compares them numerically ("prec < PrecConcat" etc.) to decide
// whether a subexpression needs to be wrapped in (?: ).
enum {
  PrecAtom = 0,
  PrecUnary = 1,
  PrecConcat = 2,
  PrecAlternate = 3,
  PrecEmpty = 4,
  PrecParen = 5,
  PrecToplevel = 6,
};
|
|
||||||
|
|
||||||
// Helper function. See description below.
|
|
||||||
static void AppendCCRange(string* t, Rune lo, Rune hi);
|
|
||||||
|
|
||||||
// Walker to generate string in s_.
|
|
||||||
// The arg pointers are actually integers giving the
|
|
||||||
// context precedence.
|
|
||||||
// The child_args are always NULL.
|
|
||||||
class ToStringWalker : public Regexp::Walker<int> {
|
|
||||||
public:
|
|
||||||
explicit ToStringWalker(string* t) : t_(t) {}
|
|
||||||
|
|
||||||
virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
|
|
||||||
virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
|
||||||
int* child_args, int nchild_args);
|
|
||||||
virtual int ShortVisit(Regexp* re, int parent_arg) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
string* t_; // The string the walker appends to.
|
|
||||||
|
|
||||||
ToStringWalker(const ToStringWalker&) = delete;
|
|
||||||
ToStringWalker& operator=(const ToStringWalker&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
string Regexp::ToString() {
  // Upper bound handed to WalkExponential; if the walk stops early the
  // output is flagged as truncated.
  const int kMaxVisits = 100000;

  string rendered;
  ToStringWalker walker(&rendered);
  walker.WalkExponential(this, PrecToplevel, kMaxVisits);
  if (walker.stopped_early())
    rendered.append(" [truncated]");
  return rendered;
}
|
|
||||||
|
|
||||||
#define ToString DontCallToString // Avoid accidental recursion.
|
|
||||||
|
|
||||||
// Visits re before children are processed.
|
|
||||||
// Appends ( if needed and passes new precedence to children.
|
|
||||||
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
|
||||||
int prec = parent_arg;
|
|
||||||
int nprec = PrecAtom;
|
|
||||||
|
|
||||||
switch (re->op()) {
|
|
||||||
case kRegexpNoMatch:
|
|
||||||
case kRegexpEmptyMatch:
|
|
||||||
case kRegexpLiteral:
|
|
||||||
case kRegexpAnyChar:
|
|
||||||
case kRegexpAnyByte:
|
|
||||||
case kRegexpBeginLine:
|
|
||||||
case kRegexpEndLine:
|
|
||||||
case kRegexpBeginText:
|
|
||||||
case kRegexpEndText:
|
|
||||||
case kRegexpWordBoundary:
|
|
||||||
case kRegexpNoWordBoundary:
|
|
||||||
case kRegexpCharClass:
|
|
||||||
case kRegexpHaveMatch:
|
|
||||||
nprec = PrecAtom;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpConcat:
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
if (prec < PrecConcat)
|
|
||||||
t_->append("(?:");
|
|
||||||
nprec = PrecConcat;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpAlternate:
|
|
||||||
if (prec < PrecAlternate)
|
|
||||||
t_->append("(?:");
|
|
||||||
nprec = PrecAlternate;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpCapture:
|
|
||||||
t_->append("(");
|
|
||||||
if (re->cap() == 0)
|
|
||||||
LOG(DFATAL) << "kRegexpCapture cap() == 0";
|
|
||||||
if (re->name()) {
|
|
||||||
t_->append("?P<");
|
|
||||||
t_->append(*re->name());
|
|
||||||
t_->append(">");
|
|
||||||
}
|
|
||||||
nprec = PrecParen;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpStar:
|
|
||||||
case kRegexpPlus:
|
|
||||||
case kRegexpQuest:
|
|
||||||
case kRegexpRepeat:
|
|
||||||
if (prec < PrecUnary)
|
|
||||||
t_->append("(?:");
|
|
||||||
// The subprecedence here is PrecAtom instead of PrecUnary
|
|
||||||
// because PCRE treats two unary ops in a row as a parse error.
|
|
||||||
nprec = PrecAtom;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return nprec;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Appends the textual form of the single rune r to *t:
//   - ASCII regexp metacharacters are backslash-escaped;
//   - with foldcase set, a lowercase ASCII letter is written as the
//     two-character class [Xx];
//   - everything else is delegated to AppendCCRange.
static void AppendLiteral(string *t, Rune r, bool foldcase) {
  const bool is_meta =
      r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r) != NULL;
  if (is_meta) {
    const char escaped[2] = { '\\', static_cast<char>(r) };
    t->append(escaped, 2);
    return;
  }

  if (foldcase && 'a' <= r && r <= 'z') {
    // Uppercase first, then the original lowercase letter.
    const char upper = static_cast<char>(r - ('a' - 'A'));
    const char lower = static_cast<char>(r);
    const char klass[4] = { '[', upper, lower, ']' };
    t->append(klass, 4);
    return;
  }

  AppendCCRange(t, r, r);
}
|
|
||||||
|
|
||||||
// Visits re after children are processed.
|
|
||||||
// For childless regexps, all the work is done here.
|
|
||||||
// For regexps with children, append any unary suffixes or ).
|
|
||||||
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
|
||||||
int* child_args, int nchild_args) {
|
|
||||||
int prec = parent_arg;
|
|
||||||
switch (re->op()) {
|
|
||||||
case kRegexpNoMatch:
|
|
||||||
// There's no simple symbol for "no match", but
|
|
||||||
// [^0-Runemax] excludes everything.
|
|
||||||
t_->append("[^\\x00-\\x{10ffff}]");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpEmptyMatch:
|
|
||||||
// Append (?:) to make empty string visible,
|
|
||||||
// unless this is already being parenthesized.
|
|
||||||
if (prec < PrecEmpty)
|
|
||||||
t_->append("(?:)");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpLiteral:
|
|
||||||
AppendLiteral(t_, re->rune(),
|
|
||||||
(re->parse_flags() & Regexp::FoldCase) != 0);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpLiteralString:
|
|
||||||
for (int i = 0; i < re->nrunes(); i++)
|
|
||||||
AppendLiteral(t_, re->runes()[i],
|
|
||||||
(re->parse_flags() & Regexp::FoldCase) != 0);
|
|
||||||
if (prec < PrecConcat)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpConcat:
|
|
||||||
if (prec < PrecConcat)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpAlternate:
|
|
||||||
// Clumsy but workable: the children all appended |
|
|
||||||
// at the end of their strings, so just remove the last one.
|
|
||||||
if ((*t_)[t_->size()-1] == '|')
|
|
||||||
t_->erase(t_->size()-1);
|
|
||||||
else
|
|
||||||
LOG(DFATAL) << "Bad final char: " << t_;
|
|
||||||
if (prec < PrecAlternate)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpStar:
|
|
||||||
t_->append("*");
|
|
||||||
if (re->parse_flags() & Regexp::NonGreedy)
|
|
||||||
t_->append("?");
|
|
||||||
if (prec < PrecUnary)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpPlus:
|
|
||||||
t_->append("+");
|
|
||||||
if (re->parse_flags() & Regexp::NonGreedy)
|
|
||||||
t_->append("?");
|
|
||||||
if (prec < PrecUnary)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpQuest:
|
|
||||||
t_->append("?");
|
|
||||||
if (re->parse_flags() & Regexp::NonGreedy)
|
|
||||||
t_->append("?");
|
|
||||||
if (prec < PrecUnary)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpRepeat:
|
|
||||||
if (re->max() == -1)
|
|
||||||
t_->append(StringPrintf("{%d,}", re->min()));
|
|
||||||
else if (re->min() == re->max())
|
|
||||||
t_->append(StringPrintf("{%d}", re->min()));
|
|
||||||
else
|
|
||||||
t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
|
|
||||||
if (re->parse_flags() & Regexp::NonGreedy)
|
|
||||||
t_->append("?");
|
|
||||||
if (prec < PrecUnary)
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpAnyChar:
|
|
||||||
t_->append(".");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpAnyByte:
|
|
||||||
t_->append("\\C");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpBeginLine:
|
|
||||||
t_->append("^");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpEndLine:
|
|
||||||
t_->append("$");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpBeginText:
|
|
||||||
t_->append("(?-m:^)");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpEndText:
|
|
||||||
if (re->parse_flags() & Regexp::WasDollar)
|
|
||||||
t_->append("(?-m:$)");
|
|
||||||
else
|
|
||||||
t_->append("\\z");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpWordBoundary:
|
|
||||||
t_->append("\\b");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpNoWordBoundary:
|
|
||||||
t_->append("\\B");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpCharClass: {
|
|
||||||
if (re->cc()->size() == 0) {
|
|
||||||
t_->append("[^\\x00-\\x{10ffff}]");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
t_->append("[");
|
|
||||||
// Heuristic: show class as negated if it contains the
|
|
||||||
// non-character 0xFFFE.
|
|
||||||
CharClass* cc = re->cc();
|
|
||||||
if (cc->Contains(0xFFFE)) {
|
|
||||||
cc = cc->Negate();
|
|
||||||
t_->append("^");
|
|
||||||
}
|
|
||||||
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
|
|
||||||
AppendCCRange(t_, i->lo, i->hi);
|
|
||||||
if (cc != re->cc())
|
|
||||||
cc->Delete();
|
|
||||||
t_->append("]");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case kRegexpCapture:
|
|
||||||
t_->append(")");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kRegexpHaveMatch:
|
|
||||||
// There's no syntax accepted by the parser to generate
|
|
||||||
// this node (it is generated by RE2::Set) so make something
|
|
||||||
// up that is readable but won't compile.
|
|
||||||
t_->append("(?HaveMatch:%d)", re->match_id());
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the parent is an alternation, append the | for it.
|
|
||||||
if (prec == PrecAlternate)
|
|
||||||
t_->append("|");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Appends a rune for use in a character class to the string t.
|
|
||||||
static void AppendCCChar(string* t, Rune r) {
|
|
||||||
if (0x20 <= r && r <= 0x7E) {
|
|
||||||
if (strchr("[]^-\\", r))
|
|
||||||
t->append("\\");
|
|
||||||
t->append(1, static_cast<char>(r));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
switch (r) {
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
|
|
||||||
case '\r':
|
|
||||||
t->append("\\r");
|
|
||||||
return;
|
|
||||||
|
|
||||||
case '\t':
|
|
||||||
t->append("\\t");
|
|
||||||
return;
|
|
||||||
|
|
||||||
case '\n':
|
|
||||||
t->append("\\n");
|
|
||||||
return;
|
|
||||||
|
|
||||||
case '\f':
|
|
||||||
t->append("\\f");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (r < 0x100) {
|
|
||||||
StringAppendF(t, "\\x%02x", static_cast<int>(r));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
StringAppendF(t, "\\x{%x}", static_cast<int>(r));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Appends the character-class range lo-hi to t.  A range with lo > hi
// appends nothing; a range with lo == hi appends just that one rune;
// otherwise "lo-hi" is appended, each end spelled by AppendCCChar.
static void AppendCCRange(string* t, Rune lo, Rune hi) {
  if (lo <= hi) {
    AppendCCChar(t, lo);
    if (lo != hi) {
      t->append("-");
      AppendCCChar(t, hi);
    }
  }
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,297 +0,0 @@
|
|||||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
# Use of this source code is governed by a BSD-style
|
|
||||||
# license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
"""Parser for Unicode data files (as distributed by unicode.org)."""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import urllib2
|
|
||||||
|
|
||||||
# Directory or URL where Unicode tables reside.
|
|
||||||
_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd"
|
|
||||||
|
|
||||||
# Largest valid Unicode code value.
|
|
||||||
_RUNE_MAX = 0x10FFFF
|
|
||||||
|
|
||||||
|
|
||||||
class Error(Exception):
  """Base class for the exceptions defined by this Unicode-table module."""
|
|
||||||
|
|
||||||
|
|
||||||
class InputError(Error):
  """Raised when a Unicode value, range or table argument is invalid."""
|
|
||||||
|
|
||||||
|
|
||||||
def _UInt(s):
  """Parses a hexadecimal code point string ('263A' => 0x263a).

  Args:
    s: hexadecimal text, 4 to 6 characters long

  Returns:
    the code point as an integer

  Raises:
    InputError: s is not hexadecimal, has the wrong length, or lies
      outside 0.._RUNE_MAX.
  """
  try:
    code = int(s, 16)
  except ValueError:
    # Unparsable text is folded into the range check below.
    code = -1
  well_formed = 4 <= len(s) <= 6 and 0 <= code <= _RUNE_MAX
  if not well_formed:
    raise InputError("invalid Unicode value %s" % (s,))
  return code
|
|
||||||
|
|
||||||
|
|
||||||
def _URange(s):
  """Expands a code point or 'lo..hi' range string into its code points.

    '0001..0003' => [1, 2, 3].
    '0001' => [1].

  Args:
    s: a single code point, or two code points joined by '..'

  Returns:
    sequence of the code points covered, in increasing order

  Raises:
    InputError: s has more than one '..', an end point is invalid, or
      the first end point is not strictly below the second.
  """
  pieces = s.split("..")
  count = len(pieces)
  if count == 1:
    return [_UInt(pieces[0])]
  if count == 2:
    first, last = [_UInt(piece) for piece in pieces]
    if first < last:
      return range(first, last + 1)
  raise InputError("invalid Unicode range %s" % (s,))
|
|
||||||
|
|
||||||
|
|
||||||
def _UStr(v):
  """Formats a code point as upper-case hex with at least four digits.

    0x263a => '0x263A'.

  Args:
    v: code point to format

  Returns:
    the '0x'-prefixed hexadecimal string

  Raises:
    InputError: v lies outside 0.._RUNE_MAX.
  """
  out_of_range = v > _RUNE_MAX or v < 0
  if out_of_range:
    raise InputError("invalid Unicode value %s" % (v,))
  text = "0x%04X" % (v,)
  return text
|
|
||||||
|
|
||||||
|
|
||||||
def _ParseContinue(s):
|
|
||||||
"""Parses a Unicode continuation field.
|
|
||||||
|
|
||||||
These are of the form '<Name, First>' or '<Name, Last>'.
|
|
||||||
Instead of giving an explicit range in a single table entry,
|
|
||||||
some Unicode tables use two entries, one for the first
|
|
||||||
code value in the range and one for the last.
|
|
||||||
The first entry's description is '<Name, First>' instead of 'Name'
|
|
||||||
and the second is '<Name, Last>'.
|
|
||||||
|
|
||||||
'<Name, First>' => ('Name', 'First')
|
|
||||||
'<Name, Last>' => ('Name', 'Last')
|
|
||||||
'Anything else' => ('Anything else', None)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
s: continuation field string
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
pair: name and ('First', 'Last', or None)
|
|
||||||
"""
|
|
||||||
|
|
||||||
match = re.match("<(.*), (First|Last)>", s)
|
|
||||||
if match is not None:
|
|
||||||
return match.groups()
|
|
||||||
return (s, None)
|
|
||||||
|
|
||||||
|
|
||||||
def ReadUnicodeTable(filename, nfields, doline):
  """Generic Unicode table text file reader.

  The reader takes care of stripping out comments and also
  parsing the two different ways that the Unicode tables specify
  code ranges (using the .. notation and splitting the range across
  multiple lines).

  Each non-comment line in the table is expected to have the given
  number of fields.  The first field is known to be the Unicode value
  and the second field its description.

  The reader calls doline(codes, fields) for each entry in the table.
  If a line is malformed or doline raises an exception, the reader prints
  that exception, prefixed with the file name and line number, and
  re-raises it immediately; the rest of the file is not processed.

  Arguments:
    filename: the Unicode data file to read (a local path or an
      http:// / https:// URL), or a file-like object.
    nfields: the number of expected fields per line in that file.
    doline: the function to call for each table entry.

  Raises:
    InputError: nfields is invalid (must be >= 2), a line is malformed,
      or the file ends in the middle of a First/Last range.
  """

  if nfields < 2:
    raise InputError("invalid number of fields %d" % (nfields,))

  # A stream opened here is closed here; a file-like object handed in by
  # the caller stays under the caller's control.
  opened_here = isinstance(filename, str)
  if opened_here:
    if filename.startswith(("http://", "https://")):
      fil = urllib2.urlopen(filename)
    else:
      fil = open(filename, "r")
  else:
    fil = filename

  try:
    first = None        # first code in multiline range
    expect_last = None  # tag expected for "Last" line in multiline range
    lineno = 0          # current line number
    for line in fil:
      lineno += 1
      try:
        # Chop # comments and white space; ignore empty lines.
        sharp = line.find("#")
        if sharp >= 0:
          line = line[:sharp]
        line = line.strip()
        if not line:
          continue

        # Split fields on ";", chop more white space.
        # Must have the expected number of fields.
        fields = [s.strip() for s in line.split(";")]
        if len(fields) != nfields:
          raise InputError("wrong number of fields %d %d - %s" %
                           (len(fields), nfields, line))

        # The Unicode text files have two different ways
        # to list a Unicode range.  Either the first field is
        # itself a range (0000..FFFF), or the range is split
        # across two lines, with the second field noting
        # the continuation.
        codes = _URange(fields[0])
        (name, cont) = _ParseContinue(fields[1])

        if expect_last is not None:
          # If the last line gave the First code in a range,
          # this one had better give the Last one.
          if (len(codes) != 1 or codes[0] <= first or
              cont != "Last" or name != expect_last):
            raise InputError("expected Last line for %s" %
                             (expect_last,))
          codes = range(first, codes[0] + 1)
          first = None
          expect_last = None
          # Present the merged pair to doline as one ordinary range entry.
          fields[0] = "%04X..%04X" % (codes[0], codes[-1])
          fields[1] = name
        elif cont == "First":
          # Otherwise, if this is the First code in a range,
          # remember it and go to the next line.
          if len(codes) != 1:
            raise InputError("bad First line: range given")
          expect_last = name
          first = codes[0]
          continue

        doline(codes, fields)

      except Exception as e:
        # Report where the failure happened, then let it propagate.
        print("%s:%d: %s" % (filename, lineno, e))
        raise

    if expect_last is not None:
      raise InputError("expected Last line for %s; got EOF" %
                       (expect_last,))
  finally:
    if opened_here:
      fil.close()
|
|
||||||
|
|
||||||
|
|
||||||
def CaseGroups(unicode_dir=_UNICODE_DIR):
  """Returns the Unicode code groups equivalent under case folding.

  Only the common ("C") and simple ("S") entries of CaseFolding.txt are
  used.  Each group is a sorted list of code points, and the list of
  groups is sorted by first code point in the group.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    pair (togroup, groups): togroup is a dict mapping each folded
    (lowercase) code point to its group, and groups is the sorted list
    of those same group lists.
  """

  # Dict mapping lowercase code point to fold-equivalent group.
  togroup = {}

  def DoLine(codes, fields):
    """Process single CaseFolding.txt line, updating togroup."""
    (_, foldtype, lower, _) = fields
    if foldtype not in ("C", "S"):
      return
    lower = _UInt(lower)
    togroup.setdefault(lower, [lower]).extend(codes)

  ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)

  for g in togroup.values():
    g.sort()
  # sorted() instead of values().sort(): same result, and it does not
  # depend on values() returning a list.
  groups = sorted(togroup.values())
  return togroup, groups
|
|
||||||
|
|
||||||
|
|
||||||
def Scripts(unicode_dir=_UNICODE_DIR):
  """Reads Scripts.txt and groups its code points by script name.

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping each script name to the list of its code points
  """

  by_script = {}

  def DoLine(codes, fields):
    """Adds the codes of one Scripts.txt entry to their script's list."""
    unused_range, script_name = fields
    by_script.setdefault(script_name, []).extend(codes)

  table_path = unicode_dir + "/Scripts.txt"
  ReadUnicodeTable(table_path, 2, DoLine)
  return by_script
|
|
||||||
|
|
||||||
|
|
||||||
def Categories(unicode_dir=_UNICODE_DIR):
  """Reads UnicodeData.txt and groups its code points by category.

  Every code point is listed under its full category name (for example
  'Lu') and, when that name is longer than one character, also under its
  first letter ('L').

  Args:
    unicode_dir: Unicode data directory

  Returns:
    dict mapping each category name to the list of its code points
  """

  by_category = {}

  def DoLine(codes, fields):
    """Adds the codes of one UnicodeData.txt entry to by_category."""
    full_name = fields[2]
    keys = [full_name]
    if len(full_name) > 1:
      # Add codes from Lu into L, etc.
      keys.append(full_name[0])
    for key in keys:
      by_category.setdefault(key, []).extend(codes)

  table_path = unicode_dir + "/UnicodeData.txt"
  ReadUnicodeTable(table_path, 15, DoLine)
  return by_category
|
|
||||||
|
|
@ -1,558 +0,0 @@
|
|||||||
|
|
||||||
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
|
||||||
// make_unicode_casefold.py >unicode_casefold.cc
|
|
||||||
|
|
||||||
#include "re2/unicode_casefold.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
|
|
||||||
// 1295 groups, 2620 pairs, 343 ranges
|
|
||||||
const CaseFold unicode_casefold[] = {
|
|
||||||
{ 65, 90, 32 },
|
|
||||||
{ 97, 106, -32 },
|
|
||||||
{ 107, 107, 8383 },
|
|
||||||
{ 108, 114, -32 },
|
|
||||||
{ 115, 115, 268 },
|
|
||||||
{ 116, 122, -32 },
|
|
||||||
{ 181, 181, 743 },
|
|
||||||
{ 192, 214, 32 },
|
|
||||||
{ 216, 222, 32 },
|
|
||||||
{ 223, 223, 7615 },
|
|
||||||
{ 224, 228, -32 },
|
|
||||||
{ 229, 229, 8262 },
|
|
||||||
{ 230, 246, -32 },
|
|
||||||
{ 248, 254, -32 },
|
|
||||||
{ 255, 255, 121 },
|
|
||||||
{ 256, 303, EvenOdd },
|
|
||||||
{ 306, 311, EvenOdd },
|
|
||||||
{ 313, 328, OddEven },
|
|
||||||
{ 330, 375, EvenOdd },
|
|
||||||
{ 376, 376, -121 },
|
|
||||||
{ 377, 382, OddEven },
|
|
||||||
{ 383, 383, -300 },
|
|
||||||
{ 384, 384, 195 },
|
|
||||||
{ 385, 385, 210 },
|
|
||||||
{ 386, 389, EvenOdd },
|
|
||||||
{ 390, 390, 206 },
|
|
||||||
{ 391, 392, OddEven },
|
|
||||||
{ 393, 394, 205 },
|
|
||||||
{ 395, 396, OddEven },
|
|
||||||
{ 398, 398, 79 },
|
|
||||||
{ 399, 399, 202 },
|
|
||||||
{ 400, 400, 203 },
|
|
||||||
{ 401, 402, OddEven },
|
|
||||||
{ 403, 403, 205 },
|
|
||||||
{ 404, 404, 207 },
|
|
||||||
{ 405, 405, 97 },
|
|
||||||
{ 406, 406, 211 },
|
|
||||||
{ 407, 407, 209 },
|
|
||||||
{ 408, 409, EvenOdd },
|
|
||||||
{ 410, 410, 163 },
|
|
||||||
{ 412, 412, 211 },
|
|
||||||
{ 413, 413, 213 },
|
|
||||||
{ 414, 414, 130 },
|
|
||||||
{ 415, 415, 214 },
|
|
||||||
{ 416, 421, EvenOdd },
|
|
||||||
{ 422, 422, 218 },
|
|
||||||
{ 423, 424, OddEven },
|
|
||||||
{ 425, 425, 218 },
|
|
||||||
{ 428, 429, EvenOdd },
|
|
||||||
{ 430, 430, 218 },
|
|
||||||
{ 431, 432, OddEven },
|
|
||||||
{ 433, 434, 217 },
|
|
||||||
{ 435, 438, OddEven },
|
|
||||||
{ 439, 439, 219 },
|
|
||||||
{ 440, 441, EvenOdd },
|
|
||||||
{ 444, 445, EvenOdd },
|
|
||||||
{ 447, 447, 56 },
|
|
||||||
{ 452, 452, EvenOdd },
|
|
||||||
{ 453, 453, OddEven },
|
|
||||||
{ 454, 454, -2 },
|
|
||||||
{ 455, 455, OddEven },
|
|
||||||
{ 456, 456, EvenOdd },
|
|
||||||
{ 457, 457, -2 },
|
|
||||||
{ 458, 458, EvenOdd },
|
|
||||||
{ 459, 459, OddEven },
|
|
||||||
{ 460, 460, -2 },
|
|
||||||
{ 461, 476, OddEven },
|
|
||||||
{ 477, 477, -79 },
|
|
||||||
{ 478, 495, EvenOdd },
|
|
||||||
{ 497, 497, OddEven },
|
|
||||||
{ 498, 498, EvenOdd },
|
|
||||||
{ 499, 499, -2 },
|
|
||||||
{ 500, 501, EvenOdd },
|
|
||||||
{ 502, 502, -97 },
|
|
||||||
{ 503, 503, -56 },
|
|
||||||
{ 504, 543, EvenOdd },
|
|
||||||
{ 544, 544, -130 },
|
|
||||||
{ 546, 563, EvenOdd },
|
|
||||||
{ 570, 570, 10795 },
|
|
||||||
{ 571, 572, OddEven },
|
|
||||||
{ 573, 573, -163 },
|
|
||||||
{ 574, 574, 10792 },
|
|
||||||
{ 575, 576, 10815 },
|
|
||||||
{ 577, 578, OddEven },
|
|
||||||
{ 579, 579, -195 },
|
|
||||||
{ 580, 580, 69 },
|
|
||||||
{ 581, 581, 71 },
|
|
||||||
{ 582, 591, EvenOdd },
|
|
||||||
{ 592, 592, 10783 },
|
|
||||||
{ 593, 593, 10780 },
|
|
||||||
{ 594, 594, 10782 },
|
|
||||||
{ 595, 595, -210 },
|
|
||||||
{ 596, 596, -206 },
|
|
||||||
{ 598, 599, -205 },
|
|
||||||
{ 601, 601, -202 },
|
|
||||||
{ 603, 603, -203 },
|
|
||||||
{ 604, 604, 42319 },
|
|
||||||
{ 608, 608, -205 },
|
|
||||||
{ 609, 609, 42315 },
|
|
||||||
{ 611, 611, -207 },
|
|
||||||
{ 613, 613, 42280 },
|
|
||||||
{ 614, 614, 42308 },
|
|
||||||
{ 616, 616, -209 },
|
|
||||||
{ 617, 617, -211 },
|
|
||||||
{ 618, 618, 42308 },
|
|
||||||
{ 619, 619, 10743 },
|
|
||||||
{ 620, 620, 42305 },
|
|
||||||
{ 623, 623, -211 },
|
|
||||||
{ 625, 625, 10749 },
|
|
||||||
{ 626, 626, -213 },
|
|
||||||
{ 629, 629, -214 },
|
|
||||||
{ 637, 637, 10727 },
|
|
||||||
{ 640, 640, -218 },
|
|
||||||
{ 643, 643, -218 },
|
|
||||||
{ 647, 647, 42282 },
|
|
||||||
{ 648, 648, -218 },
|
|
||||||
{ 649, 649, -69 },
|
|
||||||
{ 650, 651, -217 },
|
|
||||||
{ 652, 652, -71 },
|
|
||||||
{ 658, 658, -219 },
|
|
||||||
{ 669, 669, 42261 },
|
|
||||||
{ 670, 670, 42258 },
|
|
||||||
{ 837, 837, 84 },
|
|
||||||
{ 880, 883, EvenOdd },
|
|
||||||
{ 886, 887, EvenOdd },
|
|
||||||
{ 891, 893, 130 },
|
|
||||||
{ 895, 895, 116 },
|
|
||||||
{ 902, 902, 38 },
|
|
||||||
{ 904, 906, 37 },
|
|
||||||
{ 908, 908, 64 },
|
|
||||||
{ 910, 911, 63 },
|
|
||||||
{ 913, 929, 32 },
|
|
||||||
{ 931, 931, 31 },
|
|
||||||
{ 932, 939, 32 },
|
|
||||||
{ 940, 940, -38 },
|
|
||||||
{ 941, 943, -37 },
|
|
||||||
{ 945, 945, -32 },
|
|
||||||
{ 946, 946, 30 },
|
|
||||||
{ 947, 948, -32 },
|
|
||||||
{ 949, 949, 64 },
|
|
||||||
{ 950, 951, -32 },
|
|
||||||
{ 952, 952, 25 },
|
|
||||||
{ 953, 953, 7173 },
|
|
||||||
{ 954, 954, 54 },
|
|
||||||
{ 955, 955, -32 },
|
|
||||||
{ 956, 956, -775 },
|
|
||||||
{ 957, 959, -32 },
|
|
||||||
{ 960, 960, 22 },
|
|
||||||
{ 961, 961, 48 },
|
|
||||||
{ 962, 962, EvenOdd },
|
|
||||||
{ 963, 965, -32 },
|
|
||||||
{ 966, 966, 15 },
|
|
||||||
{ 967, 968, -32 },
|
|
||||||
{ 969, 969, 7517 },
|
|
||||||
{ 970, 971, -32 },
|
|
||||||
{ 972, 972, -64 },
|
|
||||||
{ 973, 974, -63 },
|
|
||||||
{ 975, 975, 8 },
|
|
||||||
{ 976, 976, -62 },
|
|
||||||
{ 977, 977, 35 },
|
|
||||||
{ 981, 981, -47 },
|
|
||||||
{ 982, 982, -54 },
|
|
||||||
{ 983, 983, -8 },
|
|
||||||
{ 984, 1007, EvenOdd },
|
|
||||||
{ 1008, 1008, -86 },
|
|
||||||
{ 1009, 1009, -80 },
|
|
||||||
{ 1010, 1010, 7 },
|
|
||||||
{ 1011, 1011, -116 },
|
|
||||||
{ 1012, 1012, -92 },
|
|
||||||
{ 1013, 1013, -96 },
|
|
||||||
{ 1015, 1016, OddEven },
|
|
||||||
{ 1017, 1017, -7 },
|
|
||||||
{ 1018, 1019, EvenOdd },
|
|
||||||
{ 1021, 1023, -130 },
|
|
||||||
{ 1024, 1039, 80 },
|
|
||||||
{ 1040, 1071, 32 },
|
|
||||||
{ 1072, 1073, -32 },
|
|
||||||
{ 1074, 1074, 6222 },
|
|
||||||
{ 1075, 1075, -32 },
|
|
||||||
{ 1076, 1076, 6221 },
|
|
||||||
{ 1077, 1085, -32 },
|
|
||||||
{ 1086, 1086, 6212 },
|
|
||||||
{ 1087, 1088, -32 },
|
|
||||||
{ 1089, 1090, 6210 },
|
|
||||||
{ 1091, 1097, -32 },
|
|
||||||
{ 1098, 1098, 6204 },
|
|
||||||
{ 1099, 1103, -32 },
|
|
||||||
{ 1104, 1119, -80 },
|
|
||||||
{ 1120, 1122, EvenOdd },
|
|
||||||
{ 1123, 1123, 6180 },
|
|
||||||
{ 1124, 1153, EvenOdd },
|
|
||||||
{ 1162, 1215, EvenOdd },
|
|
||||||
{ 1216, 1216, 15 },
|
|
||||||
{ 1217, 1230, OddEven },
|
|
||||||
{ 1231, 1231, -15 },
|
|
||||||
{ 1232, 1327, EvenOdd },
|
|
||||||
{ 1329, 1366, 48 },
|
|
||||||
{ 1377, 1414, -48 },
|
|
||||||
{ 4256, 4293, 7264 },
|
|
||||||
{ 4295, 4295, 7264 },
|
|
||||||
{ 4301, 4301, 7264 },
|
|
||||||
{ 5024, 5103, 38864 },
|
|
||||||
{ 5104, 5109, 8 },
|
|
||||||
{ 5112, 5117, -8 },
|
|
||||||
{ 7296, 7296, -6254 },
|
|
||||||
{ 7297, 7297, -6253 },
|
|
||||||
{ 7298, 7298, -6244 },
|
|
||||||
{ 7299, 7299, -6242 },
|
|
||||||
{ 7300, 7300, EvenOdd },
|
|
||||||
{ 7301, 7301, -6243 },
|
|
||||||
{ 7302, 7302, -6236 },
|
|
||||||
{ 7303, 7303, -6181 },
|
|
||||||
{ 7304, 7304, 35266 },
|
|
||||||
{ 7545, 7545, 35332 },
|
|
||||||
{ 7549, 7549, 3814 },
|
|
||||||
{ 7680, 7776, EvenOdd },
|
|
||||||
{ 7777, 7777, 58 },
|
|
||||||
{ 7778, 7829, EvenOdd },
|
|
||||||
{ 7835, 7835, -59 },
|
|
||||||
{ 7838, 7838, -7615 },
|
|
||||||
{ 7840, 7935, EvenOdd },
|
|
||||||
{ 7936, 7943, 8 },
|
|
||||||
{ 7944, 7951, -8 },
|
|
||||||
{ 7952, 7957, 8 },
|
|
||||||
{ 7960, 7965, -8 },
|
|
||||||
{ 7968, 7975, 8 },
|
|
||||||
{ 7976, 7983, -8 },
|
|
||||||
{ 7984, 7991, 8 },
|
|
||||||
{ 7992, 7999, -8 },
|
|
||||||
{ 8000, 8005, 8 },
|
|
||||||
{ 8008, 8013, -8 },
|
|
||||||
{ 8017, 8017, 8 },
|
|
||||||
{ 8019, 8019, 8 },
|
|
||||||
{ 8021, 8021, 8 },
|
|
||||||
{ 8023, 8023, 8 },
|
|
||||||
{ 8025, 8025, -8 },
|
|
||||||
{ 8027, 8027, -8 },
|
|
||||||
{ 8029, 8029, -8 },
|
|
||||||
{ 8031, 8031, -8 },
|
|
||||||
{ 8032, 8039, 8 },
|
|
||||||
{ 8040, 8047, -8 },
|
|
||||||
{ 8048, 8049, 74 },
|
|
||||||
{ 8050, 8053, 86 },
|
|
||||||
{ 8054, 8055, 100 },
|
|
||||||
{ 8056, 8057, 128 },
|
|
||||||
{ 8058, 8059, 112 },
|
|
||||||
{ 8060, 8061, 126 },
|
|
||||||
{ 8064, 8071, 8 },
|
|
||||||
{ 8072, 8079, -8 },
|
|
||||||
{ 8080, 8087, 8 },
|
|
||||||
{ 8088, 8095, -8 },
|
|
||||||
{ 8096, 8103, 8 },
|
|
||||||
{ 8104, 8111, -8 },
|
|
||||||
{ 8112, 8113, 8 },
|
|
||||||
{ 8115, 8115, 9 },
|
|
||||||
{ 8120, 8121, -8 },
|
|
||||||
{ 8122, 8123, -74 },
|
|
||||||
{ 8124, 8124, -9 },
|
|
||||||
{ 8126, 8126, -7289 },
|
|
||||||
{ 8131, 8131, 9 },
|
|
||||||
{ 8136, 8139, -86 },
|
|
||||||
{ 8140, 8140, -9 },
|
|
||||||
{ 8144, 8145, 8 },
|
|
||||||
{ 8152, 8153, -8 },
|
|
||||||
{ 8154, 8155, -100 },
|
|
||||||
{ 8160, 8161, 8 },
|
|
||||||
{ 8165, 8165, 7 },
|
|
||||||
{ 8168, 8169, -8 },
|
|
||||||
{ 8170, 8171, -112 },
|
|
||||||
{ 8172, 8172, -7 },
|
|
||||||
{ 8179, 8179, 9 },
|
|
||||||
{ 8184, 8185, -128 },
|
|
||||||
{ 8186, 8187, -126 },
|
|
||||||
{ 8188, 8188, -9 },
|
|
||||||
{ 8486, 8486, -7549 },
|
|
||||||
{ 8490, 8490, -8415 },
|
|
||||||
{ 8491, 8491, -8294 },
|
|
||||||
{ 8498, 8498, 28 },
|
|
||||||
{ 8526, 8526, -28 },
|
|
||||||
{ 8544, 8559, 16 },
|
|
||||||
{ 8560, 8575, -16 },
|
|
||||||
{ 8579, 8580, OddEven },
|
|
||||||
{ 9398, 9423, 26 },
|
|
||||||
{ 9424, 9449, -26 },
|
|
||||||
{ 11264, 11310, 48 },
|
|
||||||
{ 11312, 11358, -48 },
|
|
||||||
{ 11360, 11361, EvenOdd },
|
|
||||||
{ 11362, 11362, -10743 },
|
|
||||||
{ 11363, 11363, -3814 },
|
|
||||||
{ 11364, 11364, -10727 },
|
|
||||||
{ 11365, 11365, -10795 },
|
|
||||||
{ 11366, 11366, -10792 },
|
|
||||||
{ 11367, 11372, OddEven },
|
|
||||||
{ 11373, 11373, -10780 },
|
|
||||||
{ 11374, 11374, -10749 },
|
|
||||||
{ 11375, 11375, -10783 },
|
|
||||||
{ 11376, 11376, -10782 },
|
|
||||||
{ 11378, 11379, EvenOdd },
|
|
||||||
{ 11381, 11382, OddEven },
|
|
||||||
{ 11390, 11391, -10815 },
|
|
||||||
{ 11392, 11491, EvenOdd },
|
|
||||||
{ 11499, 11502, OddEven },
|
|
||||||
{ 11506, 11507, EvenOdd },
|
|
||||||
{ 11520, 11557, -7264 },
|
|
||||||
{ 11559, 11559, -7264 },
|
|
||||||
{ 11565, 11565, -7264 },
|
|
||||||
{ 42560, 42570, EvenOdd },
|
|
||||||
{ 42571, 42571, -35267 },
|
|
||||||
{ 42572, 42605, EvenOdd },
|
|
||||||
{ 42624, 42651, EvenOdd },
|
|
||||||
{ 42786, 42799, EvenOdd },
|
|
||||||
{ 42802, 42863, EvenOdd },
|
|
||||||
{ 42873, 42876, OddEven },
|
|
||||||
{ 42877, 42877, -35332 },
|
|
||||||
{ 42878, 42887, EvenOdd },
|
|
||||||
{ 42891, 42892, OddEven },
|
|
||||||
{ 42893, 42893, -42280 },
|
|
||||||
{ 42896, 42899, EvenOdd },
|
|
||||||
{ 42902, 42921, EvenOdd },
|
|
||||||
{ 42922, 42922, -42308 },
|
|
||||||
{ 42923, 42923, -42319 },
|
|
||||||
{ 42924, 42924, -42315 },
|
|
||||||
{ 42925, 42925, -42305 },
|
|
||||||
{ 42926, 42926, -42308 },
|
|
||||||
{ 42928, 42928, -42258 },
|
|
||||||
{ 42929, 42929, -42282 },
|
|
||||||
{ 42930, 42930, -42261 },
|
|
||||||
{ 42931, 42931, 928 },
|
|
||||||
{ 42932, 42935, EvenOdd },
|
|
||||||
{ 43859, 43859, -928 },
|
|
||||||
{ 43888, 43967, -38864 },
|
|
||||||
{ 65313, 65338, 32 },
|
|
||||||
{ 65345, 65370, -32 },
|
|
||||||
{ 66560, 66599, 40 },
|
|
||||||
{ 66600, 66639, -40 },
|
|
||||||
{ 66736, 66771, 40 },
|
|
||||||
{ 66776, 66811, -40 },
|
|
||||||
{ 68736, 68786, 64 },
|
|
||||||
{ 68800, 68850, -64 },
|
|
||||||
{ 71840, 71871, 32 },
|
|
||||||
{ 71872, 71903, -32 },
|
|
||||||
{ 125184, 125217, 34 },
|
|
||||||
{ 125218, 125251, -34 },
|
|
||||||
};
|
|
||||||
const int num_unicode_casefold = 343;
|
|
||||||
|
|
||||||
// 1295 groups, 1325 pairs, 191 ranges
|
|
||||||
const CaseFold unicode_tolower[] = {
|
|
||||||
{ 65, 90, 32 },
|
|
||||||
{ 181, 181, 775 },
|
|
||||||
{ 192, 214, 32 },
|
|
||||||
{ 216, 222, 32 },
|
|
||||||
{ 256, 302, EvenOddSkip },
|
|
||||||
{ 306, 310, EvenOddSkip },
|
|
||||||
{ 313, 327, OddEvenSkip },
|
|
||||||
{ 330, 374, EvenOddSkip },
|
|
||||||
{ 376, 376, -121 },
|
|
||||||
{ 377, 381, OddEvenSkip },
|
|
||||||
{ 383, 383, -268 },
|
|
||||||
{ 385, 385, 210 },
|
|
||||||
{ 386, 388, EvenOddSkip },
|
|
||||||
{ 390, 390, 206 },
|
|
||||||
{ 391, 391, OddEven },
|
|
||||||
{ 393, 394, 205 },
|
|
||||||
{ 395, 395, OddEven },
|
|
||||||
{ 398, 398, 79 },
|
|
||||||
{ 399, 399, 202 },
|
|
||||||
{ 400, 400, 203 },
|
|
||||||
{ 401, 401, OddEven },
|
|
||||||
{ 403, 403, 205 },
|
|
||||||
{ 404, 404, 207 },
|
|
||||||
{ 406, 406, 211 },
|
|
||||||
{ 407, 407, 209 },
|
|
||||||
{ 408, 408, EvenOdd },
|
|
||||||
{ 412, 412, 211 },
|
|
||||||
{ 413, 413, 213 },
|
|
||||||
{ 415, 415, 214 },
|
|
||||||
{ 416, 420, EvenOddSkip },
|
|
||||||
{ 422, 422, 218 },
|
|
||||||
{ 423, 423, OddEven },
|
|
||||||
{ 425, 425, 218 },
|
|
||||||
{ 428, 428, EvenOdd },
|
|
||||||
{ 430, 430, 218 },
|
|
||||||
{ 431, 431, OddEven },
|
|
||||||
{ 433, 434, 217 },
|
|
||||||
{ 435, 437, OddEvenSkip },
|
|
||||||
{ 439, 439, 219 },
|
|
||||||
{ 440, 440, EvenOdd },
|
|
||||||
{ 444, 444, EvenOdd },
|
|
||||||
{ 452, 452, 2 },
|
|
||||||
{ 453, 453, OddEven },
|
|
||||||
{ 455, 455, 2 },
|
|
||||||
{ 456, 456, EvenOdd },
|
|
||||||
{ 458, 458, 2 },
|
|
||||||
{ 459, 475, OddEvenSkip },
|
|
||||||
{ 478, 494, EvenOddSkip },
|
|
||||||
{ 497, 497, 2 },
|
|
||||||
{ 498, 500, EvenOddSkip },
|
|
||||||
{ 502, 502, -97 },
|
|
||||||
{ 503, 503, -56 },
|
|
||||||
{ 504, 542, EvenOddSkip },
|
|
||||||
{ 544, 544, -130 },
|
|
||||||
{ 546, 562, EvenOddSkip },
|
|
||||||
{ 570, 570, 10795 },
|
|
||||||
{ 571, 571, OddEven },
|
|
||||||
{ 573, 573, -163 },
|
|
||||||
{ 574, 574, 10792 },
|
|
||||||
{ 577, 577, OddEven },
|
|
||||||
{ 579, 579, -195 },
|
|
||||||
{ 580, 580, 69 },
|
|
||||||
{ 581, 581, 71 },
|
|
||||||
{ 582, 590, EvenOddSkip },
|
|
||||||
{ 837, 837, 116 },
|
|
||||||
{ 880, 882, EvenOddSkip },
|
|
||||||
{ 886, 886, EvenOdd },
|
|
||||||
{ 895, 895, 116 },
|
|
||||||
{ 902, 902, 38 },
|
|
||||||
{ 904, 906, 37 },
|
|
||||||
{ 908, 908, 64 },
|
|
||||||
{ 910, 911, 63 },
|
|
||||||
{ 913, 929, 32 },
|
|
||||||
{ 931, 939, 32 },
|
|
||||||
{ 962, 962, EvenOdd },
|
|
||||||
{ 975, 975, 8 },
|
|
||||||
{ 976, 976, -30 },
|
|
||||||
{ 977, 977, -25 },
|
|
||||||
{ 981, 981, -15 },
|
|
||||||
{ 982, 982, -22 },
|
|
||||||
{ 984, 1006, EvenOddSkip },
|
|
||||||
{ 1008, 1008, -54 },
|
|
||||||
{ 1009, 1009, -48 },
|
|
||||||
{ 1012, 1012, -60 },
|
|
||||||
{ 1013, 1013, -64 },
|
|
||||||
{ 1015, 1015, OddEven },
|
|
||||||
{ 1017, 1017, -7 },
|
|
||||||
{ 1018, 1018, EvenOdd },
|
|
||||||
{ 1021, 1023, -130 },
|
|
||||||
{ 1024, 1039, 80 },
|
|
||||||
{ 1040, 1071, 32 },
|
|
||||||
{ 1120, 1152, EvenOddSkip },
|
|
||||||
{ 1162, 1214, EvenOddSkip },
|
|
||||||
{ 1216, 1216, 15 },
|
|
||||||
{ 1217, 1229, OddEvenSkip },
|
|
||||||
{ 1232, 1326, EvenOddSkip },
|
|
||||||
{ 1329, 1366, 48 },
|
|
||||||
{ 4256, 4293, 7264 },
|
|
||||||
{ 4295, 4295, 7264 },
|
|
||||||
{ 4301, 4301, 7264 },
|
|
||||||
{ 5112, 5117, -8 },
|
|
||||||
{ 7296, 7296, -6222 },
|
|
||||||
{ 7297, 7297, -6221 },
|
|
||||||
{ 7298, 7298, -6212 },
|
|
||||||
{ 7299, 7300, -6210 },
|
|
||||||
{ 7301, 7301, -6211 },
|
|
||||||
{ 7302, 7302, -6204 },
|
|
||||||
{ 7303, 7303, -6180 },
|
|
||||||
{ 7304, 7304, 35267 },
|
|
||||||
{ 7680, 7828, EvenOddSkip },
|
|
||||||
{ 7835, 7835, -58 },
|
|
||||||
{ 7838, 7838, -7615 },
|
|
||||||
{ 7840, 7934, EvenOddSkip },
|
|
||||||
{ 7944, 7951, -8 },
|
|
||||||
{ 7960, 7965, -8 },
|
|
||||||
{ 7976, 7983, -8 },
|
|
||||||
{ 7992, 7999, -8 },
|
|
||||||
{ 8008, 8013, -8 },
|
|
||||||
{ 8025, 8025, -8 },
|
|
||||||
{ 8027, 8027, -8 },
|
|
||||||
{ 8029, 8029, -8 },
|
|
||||||
{ 8031, 8031, -8 },
|
|
||||||
{ 8040, 8047, -8 },
|
|
||||||
{ 8072, 8079, -8 },
|
|
||||||
{ 8088, 8095, -8 },
|
|
||||||
{ 8104, 8111, -8 },
|
|
||||||
{ 8120, 8121, -8 },
|
|
||||||
{ 8122, 8123, -74 },
|
|
||||||
{ 8124, 8124, -9 },
|
|
||||||
{ 8126, 8126, -7173 },
|
|
||||||
{ 8136, 8139, -86 },
|
|
||||||
{ 8140, 8140, -9 },
|
|
||||||
{ 8152, 8153, -8 },
|
|
||||||
{ 8154, 8155, -100 },
|
|
||||||
{ 8168, 8169, -8 },
|
|
||||||
{ 8170, 8171, -112 },
|
|
||||||
{ 8172, 8172, -7 },
|
|
||||||
{ 8184, 8185, -128 },
|
|
||||||
{ 8186, 8187, -126 },
|
|
||||||
{ 8188, 8188, -9 },
|
|
||||||
{ 8486, 8486, -7517 },
|
|
||||||
{ 8490, 8490, -8383 },
|
|
||||||
{ 8491, 8491, -8262 },
|
|
||||||
{ 8498, 8498, 28 },
|
|
||||||
{ 8544, 8559, 16 },
|
|
||||||
{ 8579, 8579, OddEven },
|
|
||||||
{ 9398, 9423, 26 },
|
|
||||||
{ 11264, 11310, 48 },
|
|
||||||
{ 11360, 11360, EvenOdd },
|
|
||||||
{ 11362, 11362, -10743 },
|
|
||||||
{ 11363, 11363, -3814 },
|
|
||||||
{ 11364, 11364, -10727 },
|
|
||||||
{ 11367, 11371, OddEvenSkip },
|
|
||||||
{ 11373, 11373, -10780 },
|
|
||||||
{ 11374, 11374, -10749 },
|
|
||||||
{ 11375, 11375, -10783 },
|
|
||||||
{ 11376, 11376, -10782 },
|
|
||||||
{ 11378, 11378, EvenOdd },
|
|
||||||
{ 11381, 11381, OddEven },
|
|
||||||
{ 11390, 11391, -10815 },
|
|
||||||
{ 11392, 11490, EvenOddSkip },
|
|
||||||
{ 11499, 11501, OddEvenSkip },
|
|
||||||
{ 11506, 11506, EvenOdd },
|
|
||||||
{ 42560, 42604, EvenOddSkip },
|
|
||||||
{ 42624, 42650, EvenOddSkip },
|
|
||||||
{ 42786, 42798, EvenOddSkip },
|
|
||||||
{ 42802, 42862, EvenOddSkip },
|
|
||||||
{ 42873, 42875, OddEvenSkip },
|
|
||||||
{ 42877, 42877, -35332 },
|
|
||||||
{ 42878, 42886, EvenOddSkip },
|
|
||||||
{ 42891, 42891, OddEven },
|
|
||||||
{ 42893, 42893, -42280 },
|
|
||||||
{ 42896, 42898, EvenOddSkip },
|
|
||||||
{ 42902, 42920, EvenOddSkip },
|
|
||||||
{ 42922, 42922, -42308 },
|
|
||||||
{ 42923, 42923, -42319 },
|
|
||||||
{ 42924, 42924, -42315 },
|
|
||||||
{ 42925, 42925, -42305 },
|
|
||||||
{ 42926, 42926, -42308 },
|
|
||||||
{ 42928, 42928, -42258 },
|
|
||||||
{ 42929, 42929, -42282 },
|
|
||||||
{ 42930, 42930, -42261 },
|
|
||||||
{ 42931, 42931, 928 },
|
|
||||||
{ 42932, 42934, EvenOddSkip },
|
|
||||||
{ 43888, 43967, -38864 },
|
|
||||||
{ 65313, 65338, 32 },
|
|
||||||
{ 66560, 66599, 40 },
|
|
||||||
{ 66736, 66771, 40 },
|
|
||||||
{ 68736, 68786, 64 },
|
|
||||||
{ 71840, 71871, 32 },
|
|
||||||
{ 125184, 125217, 34 },
|
|
||||||
};
|
|
||||||
// NOTE(review): presumably the number of entries in the unicode_tolower
// table whose initializer ends just above -- confirm the count matches.
const int num_unicode_tolower = 191;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
|
|
@ -1,78 +0,0 @@
|
|||||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_UNICODE_CASEFOLD_H_
|
|
||||||
#define RE2_UNICODE_CASEFOLD_H_
|
|
||||||
|
|
||||||
// Unicode case folding tables.
|
|
||||||
|
|
||||||
// The Unicode case folding tables encode the mapping from one Unicode point
|
|
||||||
// to the next largest Unicode point with equivalent folding. The largest
|
|
||||||
// point wraps back to the first. For example, the tables map:
|
|
||||||
//
|
|
||||||
// 'A' -> 'a'
|
|
||||||
// 'a' -> 'A'
|
|
||||||
//
|
|
||||||
// 'K' -> 'k'
|
|
||||||
// 'k' -> 'K' (Kelvin symbol)
|
|
||||||
// 'K' -> 'K'
|
|
||||||
//
|
|
||||||
// Like everything Unicode, these tables are big. If we represent the table
|
|
||||||
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
|
|
||||||
// Most table entries look like the ones around them:
|
|
||||||
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
|
|
||||||
// Instead of listing all the pairs explicitly, we make a list of ranges
|
|
||||||
// and deltas, so that the table entries for 'A' through 'Z' can be represented
|
|
||||||
// as a single entry { 'A', 'Z', +32 }.
|
|
||||||
//
|
|
||||||
// In addition to blocks that map to each other (A-Z mapping to a-z)
|
|
||||||
// there are blocks of pairs that individually map to each other
|
|
||||||
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
|
|
||||||
// For those, the special delta value EvenOdd marks even/odd pairs
|
|
||||||
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
|
|
||||||
//
|
|
||||||
// In this form, the table has 274 entries, about 3kB. If we were to split
|
|
||||||
// the table into one for 16-bit codes and an overflow table for larger ones,
|
|
||||||
// we could get it down to about 1.5kB, but that's not worth the complexity.
|
|
||||||
//
|
|
||||||
// The grouped form also allows for efficient fold range calculations
|
|
||||||
// rather than looping one character at a time.
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Special values for CaseFold::delta.  Any other delta is a plain offset
// that is added to every rune in the [lo, hi] range.
enum {
  // Even/odd pairs: if the rune is even, add 1; if odd, subtract 1.
  EvenOdd = 1,
  // Odd/even pairs: the mirror image of EvenOdd.
  OddEven = -1,
  // NOTE(review): the two *Skip values are presumably the same pairings
  // applied only to every other rune of the range -- confirm in ApplyFold.
  EvenOddSkip = 1 << 30,
  OddEvenSkip = (1 << 30) + 1,
};
|
|
||||||
|
|
||||||
struct CaseFold {
|
|
||||||
Rune lo;
|
|
||||||
Rune hi;
|
|
||||||
int32_t delta;
|
|
||||||
};
|
|
||||||
|
|
||||||
extern const CaseFold unicode_casefold[];
|
|
||||||
extern const int num_unicode_casefold;
|
|
||||||
|
|
||||||
extern const CaseFold unicode_tolower[];
|
|
||||||
extern const int num_unicode_tolower;
|
|
||||||
|
|
||||||
// Returns the CaseFold* in the tables that contains rune.
|
|
||||||
// If rune is not in the tables, returns the first CaseFold* after rune.
|
|
||||||
// If rune is larger than any value in the tables, returns NULL.
|
|
||||||
extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
|
|
||||||
|
|
||||||
// Returns the result of applying the fold f to the rune r.
|
|
||||||
extern Rune ApplyFold(const CaseFold *f, Rune r);
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_UNICODE_CASEFOLD_H_
|
|
File diff suppressed because it is too large
Load Diff
@ -1,67 +0,0 @@
|
|||||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_UNICODE_GROUPS_H_
|
|
||||||
#define RE2_UNICODE_GROUPS_H_
|
|
||||||
|
|
||||||
// Unicode character groups.
|
|
||||||
|
|
||||||
// The codes get split into ranges of 16-bit codes
|
|
||||||
// and ranges of 32-bit codes. It would be simpler
|
|
||||||
// to use only 32-bit ranges, but these tables are large
|
|
||||||
// enough to warrant extra care.
|
|
||||||
//
|
|
||||||
// Using just 32-bit ranges gives 27 kB of data.
|
|
||||||
// Adding 16-bit ranges gives 18 kB of data.
|
|
||||||
// Adding an extra table of 16-bit singletons would reduce
|
|
||||||
// to 16.5 kB of data but make the data harder to use;
|
|
||||||
// we don't bother.
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// A range of 16-bit codes, stored as its two endpoints.
struct URange16 {
  uint16_t lo;  // low end of the range
  uint16_t hi;  // high end of the range
};
|
|
||||||
|
|
||||||
struct URange32
|
|
||||||
{
|
|
||||||
Rune lo;
|
|
||||||
Rune hi;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct UGroup
|
|
||||||
{
|
|
||||||
const char *name;
|
|
||||||
int sign; // +1 for [abc], -1 for [^abc]
|
|
||||||
const URange16 *r16;
|
|
||||||
int nr16;
|
|
||||||
const URange32 *r32;
|
|
||||||
int nr32;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Named by property or script name (e.g., "Nd", "N", "Han").
|
|
||||||
// Negated groups are not included.
|
|
||||||
extern const UGroup unicode_groups[];
|
|
||||||
extern const int num_unicode_groups;
|
|
||||||
|
|
||||||
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
|
|
||||||
// Negated groups are included.
|
|
||||||
extern const UGroup posix_groups[];
|
|
||||||
extern const int num_posix_groups;
|
|
||||||
|
|
||||||
// Named by Perl name (e.g., "\\d", "\\D").
|
|
||||||
// Negated groups are included.
|
|
||||||
extern const UGroup perl_groups[];
|
|
||||||
extern const int num_perl_groups;
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_UNICODE_GROUPS_H_
|
|
@ -1,344 +0,0 @@
|
|||||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_VARIADIC_FUNCTION_H_
|
|
||||||
#define RE2_VARIADIC_FUNCTION_H_
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
template <typename Result, typename Param0, typename Param1, typename Arg,
|
|
||||||
Result (*Func)(Param0, Param1, const Arg* const [], int count)>
|
|
||||||
class VariadicFunction2 {
|
|
||||||
public:
|
|
||||||
Result operator()(Param0 p0, Param1 p1) const {
|
|
||||||
return Func(p0, p1, 0, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
|
|
||||||
const Arg* const args[] = { &a0 };
|
|
||||||
return Func(p0, p1, args, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1 };
|
|
||||||
return Func(p0, p1, args, 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2 };
|
|
||||||
return Func(p0, p1, args, 3);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3 };
|
|
||||||
return Func(p0, p1, args, 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
|
|
||||||
return Func(p0, p1, args, 5);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
|
|
||||||
return Func(p0, p1, args, 6);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
|
|
||||||
return Func(p0, p1, args, 7);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
|
|
||||||
return Func(p0, p1, args, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
|
|
||||||
return Func(p0, p1, args, 9);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9 };
|
|
||||||
return Func(p0, p1, args, 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10 };
|
|
||||||
return Func(p0, p1, args, 11);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11 };
|
|
||||||
return Func(p0, p1, args, 12);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12 };
|
|
||||||
return Func(p0, p1, args, 13);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13 };
|
|
||||||
return Func(p0, p1, args, 14);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14 };
|
|
||||||
return Func(p0, p1, args, 15);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15 };
|
|
||||||
return Func(p0, p1, args, 16);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
|
|
||||||
return Func(p0, p1, args, 17);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
|
|
||||||
return Func(p0, p1, args, 18);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
|
|
||||||
return Func(p0, p1, args, 19);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
|
|
||||||
return Func(p0, p1, args, 20);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
|
|
||||||
&a20 };
|
|
||||||
return Func(p0, p1, args, 21);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21 };
|
|
||||||
return Func(p0, p1, args, 22);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22 };
|
|
||||||
return Func(p0, p1, args, 23);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23 };
|
|
||||||
return Func(p0, p1, args, 24);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24 };
|
|
||||||
return Func(p0, p1, args, 25);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25 };
|
|
||||||
return Func(p0, p1, args, 26);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
|
||||||
const Arg& a26) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25, &a26 };
|
|
||||||
return Func(p0, p1, args, 27);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
|
||||||
const Arg& a26, const Arg& a27) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27 };
|
|
||||||
return Func(p0, p1, args, 28);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
|
||||||
const Arg& a26, const Arg& a27, const Arg& a28) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
|
|
||||||
return Func(p0, p1, args, 29);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
|
||||||
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
|
|
||||||
return Func(p0, p1, args, 30);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
|
||||||
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
|
|
||||||
const Arg& a30) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
|
|
||||||
return Func(p0, p1, args, 31);
|
|
||||||
}
|
|
||||||
|
|
||||||
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
|
|
||||||
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
|
|
||||||
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
|
|
||||||
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
|
|
||||||
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
|
|
||||||
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
|
|
||||||
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
|
|
||||||
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
|
|
||||||
const Arg& a30, const Arg& a31) const {
|
|
||||||
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
|
|
||||||
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
|
|
||||||
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
|
|
||||||
return Func(p0, p1, args, 32);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_VARIADIC_FUNCTION_H_
|
|
@ -1,248 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_WALKER_INL_H_
|
|
||||||
#define RE2_WALKER_INL_H_
|
|
||||||
|
|
||||||
// Helper class for traversing Regexps without recursion.
|
|
||||||
// Clients should declare their own subclasses that override
|
|
||||||
// the PreVisit and PostVisit methods, which are called before
|
|
||||||
// and after visiting the subexpressions.
|
|
||||||
|
|
||||||
// Not quite the Visitor pattern, because (among other things)
|
|
||||||
// the Visitor pattern is recursive.
|
|
||||||
|
|
||||||
#include <stack>
|
|
||||||
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "re2/regexp.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
template<typename T> struct WalkState;
|
|
||||||
|
|
||||||
// Non-recursive traversal helper for Regexp trees.  Subclasses override the
// virtual hooks below; T is the value type threaded through the traversal.
template<typename T> class Regexp::Walker {
 public:
  Walker();
  virtual ~Walker();

  // Hook invoked before re's children are visited.
  // Ownership of the returned value passes to the caller.  That value is
  // later handed to PostVisit as pre_arg and to the children's PreVisit and
  // PostVisit calls as parent_arg.  For the top-most Regexp, parent_arg is
  // the argument that was given to the walk.
  // Setting *stop to true prevents descent into the children; the walk then
  // treats PreVisit's return value as if PostVisit had returned it.
  // The default implementation returns parent_arg.
  virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);

  // Hook invoked after re's children have been visited.
  // pre_arg is what PreVisit returned; child_args holds the values returned
  // by the children's PostVisit calls.
  // PostVisit takes ownership of pre_arg and of the Ts in *child_args, but
  // not of the child_args storage itself, and passes ownership of its return
  // value to the caller.
  // The default implementation simply returns pre_arg.
  virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
                      T* child_args, int nchild_args);

  // Hook invoked to duplicate a T when Walk notices that more than one
  // child is the same re.
  virtual T Copy(T arg);

  // Hook for a "quick visit" of re that skips its children.  It is only
  // used once the visit budget is exhausted and the walk is being aborted
  // as quickly as possible, so it should return something that still makes
  // sense to the parent PostVisits that remain to be run.
  // It is (hopefully) only reached through WalkExponential, but every
  // client must implement it, just in case.
  virtual T ShortVisit(Regexp* re, T parent_arg) = 0;

  // Walks re.  top_arg is supplied as parent_arg to the PreVisit and
  // PostVisit of re itself.  Returns what PostVisit returned for re.
  T Walk(Regexp* re, T top_arg);

  // Same as Walk except that Copy is never used.  On cross-linked Regexps
  // such as those produced by Simplify this can take exponential time, so
  // the walk visits at most max_visits nodes and is then cut short.
  // When that happens, ShortVisit(re) is called for the regexps that cannot
  // be fully visited, in place of PreVisit/PostVisit.
  T WalkExponential(Regexp* re, T top_arg, int max_visits);

  // Empties the stack.  This should never be needed, because Walk always
  // starts and finishes with an empty stack.
  // Logs DFATAL when the stack is not already empty.
  void Reset();

  // Reports whether the walk was cut off.
  bool stopped_early() { return stopped_early_; }

 private:
  // Traversal state shared by the whole walk.
  std::stack<WalkState<T> >* stack_;
  bool stopped_early_;
  int max_visits_;

  T WalkInternal(Regexp* re, T top_arg, bool use_copy);

  // Not copyable.
  Walker(const Walker&) = delete;
  Walker& operator=(const Walker&) = delete;
};
|
|
||||||
|
|
||||||
// Default PreVisit: leaves *stop untouched and hands parent_arg straight back.
template<typename T>
T Regexp::Walker<T>::PreVisit(Regexp* re, T parent_arg, bool* stop) {
  return parent_arg;
}
|
|
||||||
|
|
||||||
// Default PostVisit: ignores the children's results and returns pre_arg.
template<typename T>
T Regexp::Walker<T>::PostVisit(Regexp* re, T parent_arg, T pre_arg,
                               T* child_args, int nchild_args) {
  return pre_arg;
}
|
|
||||||
|
|
||||||
// Default Copy: returns its argument unchanged.
template<typename T>
T Regexp::Walker<T>::Copy(T arg) {
  return arg;
}
|
|
||||||
|
|
||||||
// State about a single level in the traversal.
|
|
||||||
template<typename T> struct WalkState {
|
|
||||||
WalkState<T>(Regexp* re, T parent)
|
|
||||||
: re(re),
|
|
||||||
n(-1),
|
|
||||||
parent_arg(parent),
|
|
||||||
child_args(NULL) { }
|
|
||||||
|
|
||||||
Regexp* re; // The regexp
|
|
||||||
int n; // The index of the next child to process; -1 means need to PreVisit
|
|
||||||
T parent_arg; // Accumulated arguments.
|
|
||||||
T pre_arg;
|
|
||||||
T child_arg; // One-element buffer for child_args.
|
|
||||||
T* child_args;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Allocates the (initially empty) traversal stack and clears the
// stopped-early flag.  max_visits_ is not set here; Walk and
// WalkExponential assign it before each traversal.
template<typename T>
Regexp::Walker<T>::Walker()
    : stack_(new std::stack<WalkState<T> >),
      stopped_early_(false) {
}
|
|
||||||
|
|
||||||
// Drains any frames still on the stack via Reset(), then frees the stack
// object allocated by the constructor.
template<typename T>
Regexp::Walker<T>::~Walker() {
  Reset();
  delete stack_;
}
|
|
||||||
|
|
||||||
// Clears the stack. Should never be necessary, since
|
|
||||||
// Walk always enters and exits with an empty stack.
|
|
||||||
// Logs DFATAL if stack is not already clear.
|
|
||||||
template<typename T> void Regexp::Walker<T>::Reset() {
|
|
||||||
if (stack_ && stack_->size() > 0) {
|
|
||||||
LOG(DFATAL) << "Stack not empty.";
|
|
||||||
while (stack_->size() > 0) {
|
|
||||||
delete stack_->top().child_args;
|
|
||||||
stack_->pop();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Iterative traversal shared by Walk and WalkExponential.
//
// An explicit stack of WalkState frames replaces recursion.  A frame with
// n == -1 has not been pre-visited yet; otherwise n is the index of the next
// child to process.  When use_copy is true, a child that is the same Regexp
// as its immediate predecessor reuses that sibling's result through Copy()
// instead of being walked again.  Returns the value produced for the
// top-level re, or top_arg when re is NULL.
template<typename T>
T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, bool use_copy) {
  Reset();

  if (re == NULL) {
    LOG(DFATAL) << "Walk NULL";
    return top_arg;
  }

  stack_->push(WalkState<T>(re, top_arg));

  for (;;) {
    T result;
    WalkState<T>* frame = &stack_->top();
    Regexp* node = frame->re;
    switch (frame->n) {
      case -1: {
        // First time this frame is seen: charge the visit budget.
        if (--max_visits_ < 0) {
          stopped_early_ = true;
          result = ShortVisit(node, frame->parent_arg);
          break;
        }
        bool stop = false;
        frame->pre_arg = PreVisit(node, frame->parent_arg, &stop);
        if (stop) {
          // PreVisit asked not to descend; its value stands in for PostVisit's.
          result = frame->pre_arg;
          break;
        }
        frame->n = 0;
        frame->child_args = NULL;
        // One child uses the in-frame buffer; several get a heap array.
        if (node->nsub_ == 1)
          frame->child_args = &frame->child_arg;
        else if (node->nsub_ > 1)
          frame->child_args = new T[node->nsub_];
        FALLTHROUGH_INTENDED;
      }
      default: {
        if (node->nsub_ > 0) {
          Regexp** children = node->sub();
          if (frame->n < node->nsub_) {
            const bool same_as_previous =
                frame->n > 0 && children[frame->n - 1] == children[frame->n];
            if (use_copy && same_as_previous) {
              frame->child_args[frame->n] =
                  Copy(frame->child_args[frame->n - 1]);
              frame->n++;
            } else {
              stack_->push(WalkState<T>(children[frame->n], frame->pre_arg));
            }
            continue;
          }
        }

        // Every child has been handled; finish this node.
        result = PostVisit(node, frame->parent_arg, frame->pre_arg,
                           frame->child_args, frame->n);
        if (node->nsub_ > 1)
          delete[] frame->child_args;
        break;
      }
    }

    // The top frame is finished: pop it and record its result in the parent
    // frame, or return the result if that was the last frame.
    stack_->pop();
    if (stack_->size() == 0)
      return result;
    frame = &stack_->top();
    if (frame->child_args != NULL)
      frame->child_args[frame->n] = result;
    else
      frame->child_arg = result;
    frame->n++;
  }
}
|
|
||||||
|
|
||||||
// Walks re with a fixed visit budget, allowing results of identical
// adjacent children to be produced with Copy() instead of a second walk.
template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
  // Without the exponential walking behavior,
  // this budget should be more than enough for any
  // regexp, and yet not enough to get us in trouble
  // as far as CPU time.
  const int kVisitBudget = 1000000;
  const bool kUseCopy = true;

  max_visits_ = kVisitBudget;
  return WalkInternal(re, top_arg, kUseCopy);
}
|
|
||||||
|
|
||||||
// Walks re without the Copy() shortcut, so every child is visited even when
// it is identical to its left sibling.  The caller supplies the visit budget.
template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
                                                          int max_visits) {
  const bool kUseCopy = false;

  max_visits_ = max_visits;
  return WalkInternal(re, top_arg, kUseCopy);
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_WALKER_INL_H_
|
|
@ -1,160 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <chrono>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/flags.h"
|
|
||||||
#include "util/benchmark.h"
|
|
||||||
#include "re2/re2.h"
|
|
||||||
|
|
||||||
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
#define snprintf _snprintf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
using testing::Benchmark;
|
|
||||||
|
|
||||||
static Benchmark* benchmarks[10000];
|
|
||||||
static int nbenchmarks;
|
|
||||||
|
|
||||||
void Benchmark::Register() {
|
|
||||||
benchmarks[nbenchmarks] = this;
|
|
||||||
if(lo < 1)
|
|
||||||
lo = 1;
|
|
||||||
if(hi < lo)
|
|
||||||
hi = lo;
|
|
||||||
nbenchmarks++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns the current std::chrono::steady_clock reading, in nanoseconds
// since that clock's epoch.
static int64_t nsec() {
  const std::chrono::steady_clock::time_point now =
      std::chrono::steady_clock::now();
  const std::chrono::nanoseconds elapsed =
      std::chrono::duration_cast<std::chrono::nanoseconds>(
          now.time_since_epoch());
  return elapsed.count();
}
|
|
||||||
|
|
||||||
static int64_t bytes;
|
|
||||||
static int64_t ns;
|
|
||||||
static int64_t t0;
|
|
||||||
static int64_t items;
|
|
||||||
|
|
||||||
void SetBenchmarkBytesProcessed(int64_t x) {
|
|
||||||
bytes = x;
|
|
||||||
}
|
|
||||||
|
|
||||||
void StopBenchmarkTiming() {
|
|
||||||
if(t0 != 0)
|
|
||||||
ns += nsec() - t0;
|
|
||||||
t0 = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void StartBenchmarkTiming() {
|
|
||||||
if(t0 == 0)
|
|
||||||
t0 = nsec();
|
|
||||||
}
|
|
||||||
|
|
||||||
void SetBenchmarkItemsProcessed(int n) {
|
|
||||||
items = n;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Placeholder: memory usage reporting is not implemented, so this does
// nothing.
void BenchmarkMemoryUsage() {
  // TODO(rsc): Implement.
  return;
}
|
|
||||||
|
|
||||||
// Reports the CPU count used for thread ranges.  Always 1 here.
int NumCPUs() {
  const int kCpus = 1;
  return kCpus;
}
|
|
||||||
|
|
||||||
// Runs benchmark b once with n iterations (and size siz for ranged
// benchmarks), leaving the measured time in ns and any reported counters in
// bytes/items.  Aborts if the benchmark has no function attached.
static void runN(Benchmark *b, int n, int siz) {
  // Fresh counters for this run.
  bytes = 0;
  items = 0;
  ns = 0;

  // Start the clock; the benchmark body may stop and restart it.
  t0 = nsec();

  if(b->fn) {
    b->fn(n);
  } else if(b->fnr) {
    b->fnr(n, siz);
  } else {
    fprintf(stderr, "%s: missing function\n", b->name);
    abort();
  }

  // Account for the final stretch if the timer is still running.
  if(t0 != 0) {
    const int64_t finish = nsec();
    ns += finish - t0;
  }
}
|
|
||||||
|
|
||||||
// Rounds an iteration count to one of the values 1eX, 2eX or 5eX:
// returns 2*base if n < 2*base, else 5*base if n < 5*base, else 10*base,
// where base is the largest power of ten with base*10 >= n (at least 1).
//
// The arithmetic is done in long long because base*10 overflowed int for
// n above 1e9.  A result that does not fit in int is clamped to the
// largest int value.
static int round(int n) {
  // Largest int value, derived without needing <limits.h>.
  const long long kMaxInt =
      static_cast<long long>(static_cast<unsigned int>(-1) >> 1);

  long long base = 1;
  while(base*10 < n)
    base *= 10;

  long long rounded;
  if(n < 2*base)
    rounded = 2*base;
  else if(n < 5*base)
    rounded = 5*base;
  else
    rounded = 10*base;

  if(rounded > kMaxInt)
    rounded = kMaxInt;
  return static_cast<int>(rounded);
}
|
|
||||||
|
|
||||||
// Runs benchmark b at size siz and prints one result line to stdout.
//
// The iteration count starts at 1 and is grown until a run takes at least
// one second or the count reaches 1e9.  Only nthread == 1 is supported;
// any other value returns without running anything.
//
// The next count is computed in 64-bit arithmetic and capped at 1e9 before
// rounding: the old int expression 100*last overflowed once last exceeded
// about 21 million, and counts above 1e9 overflowed inside round().
void RunBench(Benchmark* b, int nthread, int siz) {
  // TODO(rsc): Threaded benchmarks.
  if(nthread != 1)
    return;

  // One second in nanoseconds; also the cap on the iteration count.
  const int64_t kBillion = 1000000000;

  // run once in case it's expensive
  int n = 1;
  runN(b, n, siz);
  while(ns < kBillion && n < kBillion) {
    const int64_t last = n;
    const int64_t ns_per_op = ns/n;

    // Estimate how many iterations fit into one second.
    int64_t next = (ns_per_op == 0) ? kBillion : kBillion / ns_per_op;

    // Overshoot by 50%, but grow at most 100x and at least by one.
    next = std::max(last+1, std::min(next + next/2, 100*last));
    next = std::min(next, kBillion);

    n = round(static_cast<int>(next));
    runN(b, n, siz);
  }

  char mb[100];
  char suf[100];
  mb[0] = '\0';
  suf[0] = '\0';
  if(ns > 0 && bytes > 0)
    snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
  if(b->fnr || b->lo != b->hi) {
    // Size suffix for ranged benchmarks, in M, K or plain units.
    if(siz >= (1<<20))
      snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
    else if(siz >= (1<<10))
      snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
    else
      snprintf(suf, sizeof suf, "/%d", siz);
  }
  printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
  fflush(stdout);
}
|
|
||||||
|
|
||||||
static int match(const char* name, int argc, const char** argv) {
|
|
||||||
if(argc == 1)
|
|
||||||
return 1;
|
|
||||||
for(int i = 1; i < argc; i++)
|
|
||||||
if(RE2::PartialMatch(name, argv[i]))
|
|
||||||
return 1;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, const char** argv) {
|
|
||||||
for(int i = 0; i < nbenchmarks; i++) {
|
|
||||||
Benchmark* b = benchmarks[i];
|
|
||||||
if(match(b->name, argc, argv))
|
|
||||||
for(int j = b->threadlo; j <= b->threadhi; j++)
|
|
||||||
for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1)
|
|
||||||
RunBench(b, j, k);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_BENCHMARK_H_
|
|
||||||
#define UTIL_BENCHMARK_H_
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
namespace testing {
|
|
||||||
// Description of one registered benchmark.
//
// Exactly one of fn / fnr is set by the constructors: fn takes an iteration
// count, fnr takes an iteration count and a size taken from [lo, hi].
struct Benchmark {
  const char* name;       // Name printed in the results.
  void (*fn)(int);        // Plain benchmark function, or 0.
  void (*fnr)(int, int);  // Ranged benchmark function, or 0.
  int lo;                 // Smallest size.
  int hi;                 // Largest size.
  int threadlo;           // Smallest thread count.
  int threadhi;           // Largest thread count.

  // Defined out of line.
  void Register();

  // Plain benchmark.
  Benchmark(const char* name, void (*f)(int)) {
    Clear(name);
    fn = f;
    Register();
  }

  // Ranged benchmark with sizes from l to h.
  Benchmark(const char* name, void (*f)(int, int), int l, int h) {
    Clear(name);
    fnr = f;
    lo = l;
    hi = h;
    Register();
  }

  // Sets the name and zeroes every other field.
  void Clear(const char* n) {
    name = n;
    fn = 0;
    fnr = 0;
    lo = 0;
    hi = 0;
    threadlo = 0;
    threadhi = 0;
  }

  // Sets the thread-count range; returns this so the call can be chained
  // onto the registration expression.
  Benchmark* ThreadRange(int lo, int hi) {
    threadlo = lo;
    threadhi = hi;
    return this;
  }
};
|
|
||||||
} // namespace testing
|
|
||||||
|
|
||||||
void SetBenchmarkBytesProcessed(int64_t);
|
|
||||||
void StopBenchmarkTiming();
|
|
||||||
void StartBenchmarkTiming();
|
|
||||||
void BenchmarkMemoryUsage();
|
|
||||||
void SetBenchmarkItemsProcessed(int);
|
|
||||||
|
|
||||||
int NumCPUs();
|
|
||||||
|
|
||||||
#define BENCHMARK(f) \
|
|
||||||
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
|
|
||||||
|
|
||||||
#define BENCHMARK_RANGE(f, lo, hi) \
|
|
||||||
::testing::Benchmark* _benchmark_##f = \
|
|
||||||
(new ::testing::Benchmark(#f, f, lo, hi))
|
|
||||||
|
|
||||||
#endif // UTIL_BENCHMARK_H_
|
|
@ -1,29 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_FLAGS_H_
|
|
||||||
#define UTIL_FLAGS_H_
|
|
||||||
|
|
||||||
// Simplified version of Google's command line flags.
|
|
||||||
// Does not support parsing the command line.
|
|
||||||
// If you want to do that, see
|
|
||||||
// https://gflags.github.io/gflags/
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#define DEFINE_flag(type, name, deflt, desc) \
|
|
||||||
namespace re2 { type FLAGS_##name = deflt; }
|
|
||||||
|
|
||||||
#define DECLARE_flag(type, name) \
|
|
||||||
namespace re2 { extern type FLAGS_##name; }
|
|
||||||
|
|
||||||
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
|
|
||||||
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc)
|
|
||||||
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
|
|
||||||
|
|
||||||
#define DECLARE_bool(name) DECLARE_flag(bool, name)
|
|
||||||
#define DECLARE_int32(name) DECLARE_flag(int32_t, name)
|
|
||||||
#define DECLARE_string(name) DECLARE_flag(string, name)
|
|
||||||
|
|
||||||
#endif // UTIL_FLAGS_H_
|
|
@ -1,21 +0,0 @@
|
|||||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
// Entry point for libFuzzer.
|
|
||||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
uint8_t data[32];
|
|
||||||
for (int i = 0; i < 32; i++) {
|
|
||||||
for (int j = 0; j < 32; j++) {
|
|
||||||
data[j] = random() & 0xFF;
|
|
||||||
}
|
|
||||||
LLVMFuzzerTestOneInput(data, 32);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1,109 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_LOGGING_H_
|
|
||||||
#define UTIL_LOGGING_H_
|
|
||||||
|
|
||||||
// Simplified version of Google's logging.
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <ostream>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
// Debug-only checking.
|
|
||||||
#define DCHECK(condition) assert(condition)
|
|
||||||
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
|
|
||||||
#define DCHECK_NE(val1, val2) assert((val1) != (val2))
|
|
||||||
#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
|
|
||||||
#define DCHECK_LT(val1, val2) assert((val1) < (val2))
|
|
||||||
#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
|
|
||||||
#define DCHECK_GT(val1, val2) assert((val1) > (val2))
|
|
||||||
|
|
||||||
// Always-on checking
|
|
||||||
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
|
|
||||||
#define CHECK_LT(x, y) CHECK((x) < (y))
|
|
||||||
#define CHECK_GT(x, y) CHECK((x) > (y))
|
|
||||||
#define CHECK_LE(x, y) CHECK((x) <= (y))
|
|
||||||
#define CHECK_GE(x, y) CHECK((x) >= (y))
|
|
||||||
#define CHECK_EQ(x, y) CHECK((x) == (y))
|
|
||||||
#define CHECK_NE(x, y) CHECK((x) != (y))
|
|
||||||
|
|
||||||
#define LOG_INFO LogMessage(__FILE__, __LINE__)
|
|
||||||
#define LOG_WARNING LogMessage(__FILE__, __LINE__)
|
|
||||||
#define LOG_ERROR LogMessage(__FILE__, __LINE__)
|
|
||||||
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
|
|
||||||
#define LOG_QFATAL LOG_FATAL
|
|
||||||
|
|
||||||
// It seems that one of the Windows header files defines ERROR as 0.
|
|
||||||
#ifdef _WIN32
|
|
||||||
#define LOG_0 LOG_INFO
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef NDEBUG
|
|
||||||
#define LOG_DFATAL LOG_ERROR
|
|
||||||
#else
|
|
||||||
#define LOG_DFATAL LOG_FATAL
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LOG(severity) LOG_ ## severity.stream()
|
|
||||||
|
|
||||||
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
|
|
||||||
|
|
||||||
// Accumulates one log line in memory and writes it to stderr.
//
// The line starts with "file:line: ".  It is written by Flush(), which the
// destructor calls unless Flush() has already been called.
class LogMessage {
 public:
  LogMessage(const char* file, int line)
    : flushed_(false) {
    stream() << file << ":" << line << ": ";
  }

  // Appends a newline and writes the accumulated text to stderr.
  void Flush() {
    stream() << "\n";
    // Qualified as std::string so that this header does not depend on a
    // using-declaration provided by some other header.
    std::string s = str_.str();
    size_t n = s.size();
    if (fwrite(s.data(), 1, n, stderr) < n) {}  // shut up gcc
    flushed_ = true;
  }

  ~LogMessage() {
    if (!flushed_) {
      Flush();
    }
  }

  // Stream that callers append the message text to.
  std::ostream& stream() { return str_; }

 private:
  bool flushed_;            // True once Flush() has run.
  std::ostringstream str_;  // Text accumulated so far.

  LogMessage(const LogMessage&) = delete;
  LogMessage& operator=(const LogMessage&) = delete;
};
|
|
||||||
|
|
||||||
// Silence "destructor never returns" warning for ~LogMessageFatal().
|
|
||||||
// Since this is a header file, push and then pop to limit the scope.
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning(push)
|
|
||||||
#pragma warning(disable: 4722)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
class LogMessageFatal : public LogMessage {
|
|
||||||
public:
|
|
||||||
LogMessageFatal(const char* file, int line)
|
|
||||||
: LogMessage(file, line) {}
|
|
||||||
~LogMessageFatal() {
|
|
||||||
Flush();
|
|
||||||
abort();
|
|
||||||
}
|
|
||||||
private:
|
|
||||||
LogMessageFatal(const LogMessageFatal&) = delete;
|
|
||||||
LogMessageFatal& operator=(const LogMessageFatal&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning(pop)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // UTIL_LOGGING_H_
|
|
@ -1,41 +0,0 @@
|
|||||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_MIX_H_
|
|
||||||
#define UTIL_MIX_H_
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <limits>
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// Silence "truncation of constant value" warning for kMul in 32-bit mode.
|
|
||||||
// Since this is a header file, push and then pop to limit the scope.
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning(push)
|
|
||||||
#pragma warning(disable: 4309)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Incremental hash combiner over size_t values.
//
// Each Mix(val) multiplies the running hash by a fixed odd constant,
// rotates it left by 19 bits and adds val.
class HashMix {
 public:
  // Starts from hash value 1.
  HashMix() : hash_(1) {}

  // Starts from val + 83.
  explicit HashMix(size_t val) : hash_(val + 83) {}

  // Folds val into the running hash.
  void Mix(size_t val) {
    static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
    const int kWidth = std::numeric_limits<size_t>::digits;
    const int kRotate = 19;

    const size_t scaled = hash_ * kMul;
    const size_t rotated =
        (scaled << kRotate) | (scaled >> (kWidth - kRotate));
    hash_ = rotated + val;
  }

  // Current hash value.
  size_t get() const { return hash_; }

 private:
  size_t hash_;
};
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#pragma warning(pop)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_MIX_H_
|
|
@ -1,131 +0,0 @@
|
|||||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_MUTEX_H_
|
|
||||||
#define UTIL_MUTEX_H_
|
|
||||||
|
|
||||||
/*
|
|
||||||
* A simple mutex wrapper, supporting locks and read-write locks.
|
|
||||||
* You should assume the locks are *not* re-entrant.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#if !defined(_WIN32)
|
|
||||||
#ifndef _POSIX_C_SOURCE
|
|
||||||
#define _POSIX_C_SOURCE 200809L
|
|
||||||
#endif
|
|
||||||
#include <unistd.h>
|
|
||||||
#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
|
|
||||||
#define MUTEX_IS_PTHREAD_RWLOCK
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
|
|
||||||
#include <pthread.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
typedef pthread_rwlock_t MutexType;
|
|
||||||
#else
|
|
||||||
#include <mutex>
|
|
||||||
typedef std::mutex MutexType;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
class Mutex {
|
|
||||||
public:
|
|
||||||
inline Mutex();
|
|
||||||
inline ~Mutex();
|
|
||||||
inline void Lock(); // Block if needed until free then acquire exclusively
|
|
||||||
inline void Unlock(); // Release a lock acquired via Lock()
|
|
||||||
// Note that on systems that don't support read-write locks, these may
|
|
||||||
// be implemented as synonyms to Lock() and Unlock(). So you can use
|
|
||||||
// these for efficiency, but don't use them anyplace where being able
|
|
||||||
// to do shared reads is necessary to avoid deadlock.
|
|
||||||
inline void ReaderLock(); // Block until free or shared then acquire a share
|
|
||||||
inline void ReaderUnlock(); // Release a read share of this Mutex
|
|
||||||
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
|
|
||||||
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
|
|
||||||
|
|
||||||
private:
|
|
||||||
MutexType mutex_;
|
|
||||||
|
|
||||||
// Catch the error of writing Mutex when intending MutexLock.
|
|
||||||
Mutex(Mutex *ignored);
|
|
||||||
|
|
||||||
Mutex(const Mutex&) = delete;
|
|
||||||
Mutex& operator=(const Mutex&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
|
|
||||||
|
|
||||||
#define SAFE_PTHREAD(fncall) \
|
|
||||||
do { \
|
|
||||||
if ((fncall) != 0) abort(); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
|
|
||||||
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
|
|
||||||
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
|
|
||||||
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
|
||||||
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
|
|
||||||
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
|
||||||
|
|
||||||
#undef SAFE_PTHREAD
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
Mutex::Mutex() { }
|
|
||||||
Mutex::~Mutex() { }
|
|
||||||
void Mutex::Lock() { mutex_.lock(); }
|
|
||||||
void Mutex::Unlock() { mutex_.unlock(); }
|
|
||||||
void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex.
|
|
||||||
void Mutex::ReaderUnlock() { Unlock(); }
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------
|
|
||||||
// Some helper classes
|
|
||||||
|
|
||||||
// MutexLock(mu) acquires mu when constructed and releases it when destroyed.
|
|
||||||
class MutexLock {
|
|
||||||
public:
|
|
||||||
explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
|
|
||||||
~MutexLock() { mu_->Unlock(); }
|
|
||||||
private:
|
|
||||||
Mutex * const mu_;
|
|
||||||
|
|
||||||
MutexLock(const MutexLock&) = delete;
|
|
||||||
MutexLock& operator=(const MutexLock&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
|
|
||||||
class ReaderMutexLock {
|
|
||||||
public:
|
|
||||||
explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); }
|
|
||||||
~ReaderMutexLock() { mu_->ReaderUnlock(); }
|
|
||||||
private:
|
|
||||||
Mutex * const mu_;
|
|
||||||
|
|
||||||
ReaderMutexLock(const ReaderMutexLock&) = delete;
|
|
||||||
ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
class WriterMutexLock {
|
|
||||||
public:
|
|
||||||
explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); }
|
|
||||||
~WriterMutexLock() { mu_->WriterUnlock(); }
|
|
||||||
private:
|
|
||||||
Mutex * const mu_;
|
|
||||||
|
|
||||||
WriterMutexLock(const WriterMutexLock&) = delete;
|
|
||||||
WriterMutexLock& operator=(const WriterMutexLock&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
|
|
||||||
#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
|
|
||||||
#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
|
|
||||||
#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_MUTEX_H_
|
|
File diff suppressed because it is too large
Load Diff
@ -1,680 +0,0 @@
|
|||||||
// Copyright 2003-2010 Google Inc. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_PCRE_H_
|
|
||||||
#define UTIL_PCRE_H_
|
|
||||||
|
|
||||||
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
|
|
||||||
// The main changes are the addition of the HitLimit method and
|
|
||||||
// compilation as PCRE in namespace re2.
|
|
||||||
|
|
||||||
// C++ interface to the pcre regular-expression library. PCRE supports
|
|
||||||
// Perl-style regular expressions (with extensions like \d, \w, \s,
|
|
||||||
// ...).
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// REGEXP SYNTAX:
|
|
||||||
//
|
|
||||||
// This module uses the pcre library and hence supports its syntax
|
|
||||||
// for regular expressions:
|
|
||||||
//
|
|
||||||
// http://www.google.com/search?q=pcre
|
|
||||||
//
|
|
||||||
// The syntax is pretty similar to Perl's. For those not familiar
|
|
||||||
// with Perl's regular expressions, here are some examples of the most
|
|
||||||
// commonly used extensions:
|
|
||||||
//
|
|
||||||
// "hello (\\w+) world" -- \w matches a "word" character
|
|
||||||
// "version (\\d+)" -- \d matches a digit
|
|
||||||
// "hello\\s+world" -- \s matches any whitespace character
|
|
||||||
// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary
|
|
||||||
// "(?i)hello" -- (?i) turns on case-insensitive matching
|
|
||||||
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// MATCHING INTERFACE:
|
|
||||||
//
|
|
||||||
// The "FullMatch" operation checks that supplied text matches a
|
|
||||||
// supplied pattern exactly.
|
|
||||||
//
|
|
||||||
// Example: successful match
|
|
||||||
// CHECK(PCRE::FullMatch("hello", "h.*o"));
|
|
||||||
//
|
|
||||||
// Example: unsuccessful match (requires full match):
|
|
||||||
// CHECK(!PCRE::FullMatch("hello", "e"));
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// UTF-8 AND THE MATCHING INTERFACE:
|
|
||||||
//
|
|
||||||
// By default, pattern and text are plain text, one byte per character.
|
|
||||||
// The UTF8 flag, passed to the constructor, causes both pattern
|
|
||||||
// and string to be treated as UTF-8 text, still a byte stream but
|
|
||||||
// potentially multiple bytes per character. In practice, the text
|
|
||||||
// is likelier to be UTF-8 than the pattern, but the match returned
|
|
||||||
// may depend on the UTF8 flag, so always use it when matching
|
|
||||||
// UTF8 text. E.g., "." will match one byte normally but with UTF8
|
|
||||||
// set may match up to three bytes of a multi-byte character.
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
// PCRE re(utf8_pattern, PCRE::UTF8);
|
|
||||||
// CHECK(PCRE::FullMatch(utf8_string, re));
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// MATCHING WITH SUB-STRING EXTRACTION:
|
|
||||||
//
|
|
||||||
// You can supply extra pointer arguments to extract matched subpieces.
|
|
||||||
//
|
|
||||||
// Example: extracts "ruby" into "s" and 1234 into "i"
|
|
||||||
// int i;
|
|
||||||
// string s;
|
|
||||||
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
|
|
||||||
//
|
|
||||||
// Example: fails because string cannot be stored in integer
|
|
||||||
// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i));
|
|
||||||
//
|
|
||||||
// Example: fails because there aren't enough sub-patterns:
|
|
||||||
// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s));
|
|
||||||
//
|
|
||||||
// Example: does not try to extract any extra sub-patterns
|
|
||||||
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
|
|
||||||
//
|
|
||||||
// Example: does not try to extract into NULL
|
|
||||||
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
|
|
||||||
//
|
|
||||||
// Example: integer overflow causes failure
|
|
||||||
// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// PARTIAL MATCHES
|
|
||||||
//
|
|
||||||
// You can use the "PartialMatch" operation when you want the pattern
|
|
||||||
// to match any substring of the text.
|
|
||||||
//
|
|
||||||
// Example: simple search for a string:
|
|
||||||
// CHECK(PCRE::PartialMatch("hello", "ell"));
|
|
||||||
//
|
|
||||||
// Example: find first number in a string
|
|
||||||
// int number;
|
|
||||||
// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number));
|
|
||||||
// CHECK_EQ(number, 100);
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// PRE-COMPILED REGULAR EXPRESSIONS
|
|
||||||
//
|
|
||||||
// PCRE makes it easy to use any string as a regular expression, without
|
|
||||||
// requiring a separate compilation step.
|
|
||||||
//
|
|
||||||
// If speed is of the essence, you can create a pre-compiled "PCRE"
|
|
||||||
// object from the pattern and use it multiple times. If you do so,
|
|
||||||
// you can typically parse text faster than with sscanf.
|
|
||||||
//
|
|
||||||
// Example: precompile pattern for faster matching:
|
|
||||||
// PCRE pattern("h.*o");
|
|
||||||
// while (ReadLine(&str)) {
|
|
||||||
// if (PCRE::FullMatch(str, pattern)) ...;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// SCANNING TEXT INCREMENTALLY
|
|
||||||
//
|
|
||||||
// The "Consume" operation may be useful if you want to repeatedly
|
|
||||||
// match regular expressions at the front of a string and skip over
|
|
||||||
// them as they match. This requires use of the "StringPiece" type,
|
|
||||||
// which represents a sub-range of a real string.
|
|
||||||
//
|
|
||||||
// Example: read lines of the form "var = value" from a string.
|
|
||||||
// string contents = ...; // Fill string somehow
|
|
||||||
// StringPiece input(contents); // Wrap a StringPiece around it
|
|
||||||
//
|
|
||||||
// string var;
|
|
||||||
// int value;
|
|
||||||
// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
|
|
||||||
// ...;
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// Each successful call to "Consume" will set "var/value", and also
|
|
||||||
// advance "input" so it points past the matched text. Note that if the
|
|
||||||
// regular expression matches an empty string, input will advance
|
|
||||||
// by 0 bytes. If the regular expression being used might match
|
|
||||||
// an empty string, the loop body must check for this case and either
|
|
||||||
// advance the string or break out of the loop.
|
|
||||||
//
|
|
||||||
// The "FindAndConsume" operation is similar to "Consume" but does not
|
|
||||||
// anchor your match at the beginning of the string. For example, you
|
|
||||||
// could extract all words from a string by repeatedly calling
|
|
||||||
// PCRE::FindAndConsume(&input, "(\\w+)", &word)
|
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------
|
|
||||||
// PARSING HEX/OCTAL/C-RADIX NUMBERS
|
|
||||||
//
|
|
||||||
// By default, if you pass a pointer to a numeric value, the
|
|
||||||
// corresponding text is interpreted as a base-10 number. You can
|
|
||||||
// instead wrap the pointer with a call to one of the operators Hex(),
|
|
||||||
// Octal(), or CRadix() to interpret the text in another base. The
|
|
||||||
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
|
|
||||||
// prefixes, but defaults to base-10.
|
|
||||||
//
|
|
||||||
// Example:
|
|
||||||
// int a, b, c, d;
|
|
||||||
// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
|
|
||||||
// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d));
|
|
||||||
// will leave 64 in a, b, c, and d.
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
|
|
||||||
#ifdef USEPCRE
|
|
||||||
#include <pcre.h>
|
|
||||||
namespace re2 {
|
|
||||||
const bool UsingPCRE = true;
|
|
||||||
} // namespace re2
|
|
||||||
#else
|
|
||||||
struct pcre; // opaque
|
|
||||||
namespace re2 {
|
|
||||||
const bool UsingPCRE = false;
|
|
||||||
} // namespace re2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
class PCRE_Options;
|
|
||||||
|
|
||||||
// Interface for regular expression matching. Also corresponds to a
|
|
||||||
// pre-compiled regular expression.  A "PCRE" object is safe for
|
|
||||||
// concurrent use by multiple threads.
|
|
||||||
class PCRE {
|
|
||||||
public:
|
|
||||||
// We convert user-passed pointers into special Arg objects
|
|
||||||
class Arg;
|
|
||||||
|
|
||||||
// Marks end of arg list.
|
|
||||||
// ONLY USE IN OPTIONAL ARG DEFAULTS.
|
|
||||||
// DO NOT PASS EXPLICITLY.
|
|
||||||
static Arg no_more_args;
|
|
||||||
|
|
||||||
// Options are same value as those in pcre. We provide them here
|
|
||||||
// to avoid users needing to include pcre.h and also to isolate
|
|
||||||
// users from pcre should we change the underlying library.
|
|
||||||
// Only those needed by Google programs are exposed here to
|
|
||||||
// avoid collision with options employed internally by regexp.cc
|
|
||||||
// Note that some options have equivalents that can be specified in
|
|
||||||
// the regexp itself. For example, prefixing your regexp with
|
|
||||||
// "(?s)" has the same effect as the PCRE_DOTALL option.
|
|
||||||
enum Option {
|
|
||||||
None = 0x0000,
|
|
||||||
UTF8 = 0x0800, // == PCRE_UTF8
|
|
||||||
EnabledCompileOptions = UTF8,
|
|
||||||
EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag
|
|
||||||
};
|
|
||||||
|
|
||||||
// We provide implicit conversions from strings so that users can
|
|
||||||
// pass in a string or a "const char*" wherever an "PCRE" is expected.
|
|
||||||
PCRE(const char* pattern);
|
|
||||||
PCRE(const char* pattern, Option option);
|
|
||||||
PCRE(const string& pattern);
|
|
||||||
PCRE(const string& pattern, Option option);
|
|
||||||
PCRE(const char *pattern, const PCRE_Options& re_option);
|
|
||||||
PCRE(const string& pattern, const PCRE_Options& re_option);
|
|
||||||
|
|
||||||
~PCRE();
|
|
||||||
|
|
||||||
// The string specification for this PCRE. E.g.
|
|
||||||
// PCRE re("ab*c?d+");
|
|
||||||
// re.pattern(); // "ab*c?d+"
|
|
||||||
const string& pattern() const { return pattern_; }
|
|
||||||
|
|
||||||
// If PCRE could not be created properly, returns an error string.
|
|
||||||
// Else returns the empty string.
|
|
||||||
const string& error() const { return *error_; }
|
|
||||||
|
|
||||||
// Whether the PCRE has hit a match limit during execution.
|
|
||||||
// Not thread safe. Intended only for testing.
|
|
||||||
// If hitting match limits is a problem,
|
|
||||||
// you should be using PCRE2 (re2/re2.h)
|
|
||||||
// instead of checking this flag.
|
|
||||||
bool HitLimit();
|
|
||||||
void ClearHitLimit();
|
|
||||||
|
|
||||||
/***** The useful part: the matching interface *****/
|
|
||||||
|
|
||||||
// Matches "text" against "pattern". If pointer arguments are
|
|
||||||
// supplied, copies matched sub-patterns into them.
|
|
||||||
//
|
|
||||||
// You can pass in a "const char*" or a "string" for "text".
|
|
||||||
// You can pass in a "const char*" or a "string" or a "PCRE" for "pattern".
|
|
||||||
//
|
|
||||||
// The provided pointer arguments can be pointers to any scalar numeric
|
|
||||||
// type, or one of:
|
|
||||||
// string (matched piece is copied to string)
|
|
||||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
|
||||||
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
|
|
||||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
|
||||||
//
|
|
||||||
// Returns true iff all of the following conditions are satisfied:
|
|
||||||
// a. "text" matches "pattern" exactly
|
|
||||||
// b. The number of matched sub-patterns is >= number of supplied pointers
|
|
||||||
// c. The "i"th argument has a suitable type for holding the
|
|
||||||
// string captured as the "i"th sub-pattern. If you pass in
|
|
||||||
// NULL for the "i"th argument, or pass fewer arguments than
|
|
||||||
// number of sub-patterns, "i"th captured sub-pattern is
|
|
||||||
// ignored.
|
|
||||||
//
|
|
||||||
// CAVEAT: An optional sub-pattern that does not exist in the
|
|
||||||
// matched string is assigned the empty string. Therefore, the
|
|
||||||
// following will return false (because the empty string is not a
|
|
||||||
// valid number):
|
|
||||||
// int number;
|
|
||||||
// PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
|
||||||
struct FullMatchFunctor {
|
|
||||||
bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
|
|
||||||
const Arg& ptr1 = no_more_args,
|
|
||||||
const Arg& ptr2 = no_more_args,
|
|
||||||
const Arg& ptr3 = no_more_args,
|
|
||||||
const Arg& ptr4 = no_more_args,
|
|
||||||
const Arg& ptr5 = no_more_args,
|
|
||||||
const Arg& ptr6 = no_more_args,
|
|
||||||
const Arg& ptr7 = no_more_args,
|
|
||||||
const Arg& ptr8 = no_more_args,
|
|
||||||
const Arg& ptr9 = no_more_args,
|
|
||||||
const Arg& ptr10 = no_more_args,
|
|
||||||
const Arg& ptr11 = no_more_args,
|
|
||||||
const Arg& ptr12 = no_more_args,
|
|
||||||
const Arg& ptr13 = no_more_args,
|
|
||||||
const Arg& ptr14 = no_more_args,
|
|
||||||
const Arg& ptr15 = no_more_args,
|
|
||||||
const Arg& ptr16 = no_more_args) const;
|
|
||||||
};
|
|
||||||
|
|
||||||
static const FullMatchFunctor FullMatch;
|
|
||||||
|
|
||||||
// Exactly like FullMatch(), except that "pattern" is allowed to match
|
|
||||||
// a substring of "text".
|
|
||||||
struct PartialMatchFunctor {
|
|
||||||
bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
|
|
||||||
const Arg& ptr1 = no_more_args,
|
|
||||||
const Arg& ptr2 = no_more_args,
|
|
||||||
const Arg& ptr3 = no_more_args,
|
|
||||||
const Arg& ptr4 = no_more_args,
|
|
||||||
const Arg& ptr5 = no_more_args,
|
|
||||||
const Arg& ptr6 = no_more_args,
|
|
||||||
const Arg& ptr7 = no_more_args,
|
|
||||||
const Arg& ptr8 = no_more_args,
|
|
||||||
const Arg& ptr9 = no_more_args,
|
|
||||||
const Arg& ptr10 = no_more_args,
|
|
||||||
const Arg& ptr11 = no_more_args,
|
|
||||||
const Arg& ptr12 = no_more_args,
|
|
||||||
const Arg& ptr13 = no_more_args,
|
|
||||||
const Arg& ptr14 = no_more_args,
|
|
||||||
const Arg& ptr15 = no_more_args,
|
|
||||||
const Arg& ptr16 = no_more_args) const;
|
|
||||||
};
|
|
||||||
|
|
||||||
static const PartialMatchFunctor PartialMatch;
|
|
||||||
|
|
||||||
// Like FullMatch() and PartialMatch(), except that pattern has to
|
|
||||||
// match a prefix of "text", and "input" is advanced past the matched
|
|
||||||
// text. Note: "input" is modified iff this routine returns true.
|
|
||||||
struct ConsumeFunctor {
|
|
||||||
bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args
|
|
||||||
const Arg& ptr1 = no_more_args,
|
|
||||||
const Arg& ptr2 = no_more_args,
|
|
||||||
const Arg& ptr3 = no_more_args,
|
|
||||||
const Arg& ptr4 = no_more_args,
|
|
||||||
const Arg& ptr5 = no_more_args,
|
|
||||||
const Arg& ptr6 = no_more_args,
|
|
||||||
const Arg& ptr7 = no_more_args,
|
|
||||||
const Arg& ptr8 = no_more_args,
|
|
||||||
const Arg& ptr9 = no_more_args,
|
|
||||||
const Arg& ptr10 = no_more_args,
|
|
||||||
const Arg& ptr11 = no_more_args,
|
|
||||||
const Arg& ptr12 = no_more_args,
|
|
||||||
const Arg& ptr13 = no_more_args,
|
|
||||||
const Arg& ptr14 = no_more_args,
|
|
||||||
const Arg& ptr15 = no_more_args,
|
|
||||||
const Arg& ptr16 = no_more_args) const;
|
|
||||||
};
|
|
||||||
|
|
||||||
static const ConsumeFunctor Consume;
|
|
||||||
|
|
||||||
// Like Consume(..), but does not anchor the match at the beginning of the
|
|
||||||
// string. That is, "pattern" need not start its match at the beginning of
|
|
||||||
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
|
|
||||||
// word in "s" and stores it in "word".
|
|
||||||
struct FindAndConsumeFunctor {
|
|
||||||
bool operator ()(StringPiece* input, const PCRE& pattern,
|
|
||||||
const Arg& ptr1 = no_more_args,
|
|
||||||
const Arg& ptr2 = no_more_args,
|
|
||||||
const Arg& ptr3 = no_more_args,
|
|
||||||
const Arg& ptr4 = no_more_args,
|
|
||||||
const Arg& ptr5 = no_more_args,
|
|
||||||
const Arg& ptr6 = no_more_args,
|
|
||||||
const Arg& ptr7 = no_more_args,
|
|
||||||
const Arg& ptr8 = no_more_args,
|
|
||||||
const Arg& ptr9 = no_more_args,
|
|
||||||
const Arg& ptr10 = no_more_args,
|
|
||||||
const Arg& ptr11 = no_more_args,
|
|
||||||
const Arg& ptr12 = no_more_args,
|
|
||||||
const Arg& ptr13 = no_more_args,
|
|
||||||
const Arg& ptr14 = no_more_args,
|
|
||||||
const Arg& ptr15 = no_more_args,
|
|
||||||
const Arg& ptr16 = no_more_args) const;
|
|
||||||
};
|
|
||||||
|
|
||||||
static const FindAndConsumeFunctor FindAndConsume;
|
|
||||||
|
|
||||||
// Replace the first match of "pattern" in "str" with "rewrite".
|
|
||||||
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
|
||||||
// used to insert text matching corresponding parenthesized group
|
|
||||||
// from the pattern. \0 in "rewrite" refers to the entire matching
|
|
||||||
// text. E.g.,
|
|
||||||
//
|
|
||||||
// string s = "yabba dabba doo";
|
|
||||||
// CHECK(PCRE::Replace(&s, "b+", "d"));
|
|
||||||
//
|
|
||||||
// will leave "s" containing "yada dabba doo"
|
|
||||||
//
|
|
||||||
// Returns true if the pattern matches and a replacement occurs,
|
|
||||||
// false otherwise.
|
|
||||||
static bool Replace(string *str,
|
|
||||||
const PCRE& pattern,
|
|
||||||
const StringPiece& rewrite);
|
|
||||||
|
|
||||||
// Like Replace(), except replaces all occurrences of the pattern in
|
|
||||||
// the string with the rewrite. Replacements are not subject to
|
|
||||||
// re-matching. E.g.,
|
|
||||||
//
|
|
||||||
// string s = "yabba dabba doo";
|
|
||||||
// CHECK(PCRE::GlobalReplace(&s, "b+", "d"));
|
|
||||||
//
|
|
||||||
// will leave "s" containing "yada dada doo"
|
|
||||||
//
|
|
||||||
// Returns the number of replacements made.
|
|
||||||
static int GlobalReplace(string *str,
|
|
||||||
const PCRE& pattern,
|
|
||||||
const StringPiece& rewrite);
|
|
||||||
|
|
||||||
// Like Replace, except that if the pattern matches, "rewrite"
|
|
||||||
// is copied into "out" with substitutions. The non-matching
|
|
||||||
// portions of "text" are ignored.
|
|
||||||
//
|
|
||||||
// Returns true iff a match occurred and the extraction happened
|
|
||||||
// successfully; if no match occurs, the string is left unaffected.
|
|
||||||
static bool Extract(const StringPiece &text,
|
|
||||||
const PCRE& pattern,
|
|
||||||
const StringPiece &rewrite,
|
|
||||||
string *out);
|
|
||||||
|
|
||||||
// Check that the given @p rewrite string is suitable for use with
|
|
||||||
// this PCRE. It checks that:
|
|
||||||
// * The PCRE has enough parenthesized subexpressions to satisfy all
|
|
||||||
// of the \N tokens in @p rewrite, and
|
|
||||||
// * The @p rewrite string doesn't have any syntax errors
|
|
||||||
// ('\' followed by anything besides [0-9] and '\').
|
|
||||||
// Making this test will guarantee that "replace" and "extract"
|
|
||||||
// operations won't LOG(ERROR) or fail because of a bad rewrite
|
|
||||||
// string.
|
|
||||||
// @param rewrite The proposed rewrite string.
|
|
||||||
// @param error An error message is recorded here, iff we return false.
|
|
||||||
// Otherwise, it is unchanged.
|
|
||||||
// @return true, iff @p rewrite is suitable for use with the PCRE.
|
|
||||||
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
|
|
||||||
|
|
||||||
// Returns a copy of 'unquoted' with all potentially meaningful
|
|
||||||
// regexp characters backslash-escaped. The returned string, used
|
|
||||||
// as a regular expression, will exactly match the original string.
|
|
||||||
// For example,
|
|
||||||
// 1.5-2.0?
|
|
||||||
// becomes:
|
|
||||||
// 1\.5\-2\.0\?
|
|
||||||
static string QuoteMeta(const StringPiece& unquoted);
|
|
||||||
|
|
||||||
/***** Generic matching interface (not so nice to use) *****/
|
|
||||||
|
|
||||||
// Type of match (TODO: Should be restructured as an Option)
|
|
||||||
enum Anchor {
|
|
||||||
UNANCHORED, // No anchoring
|
|
||||||
ANCHOR_START, // Anchor at start only
|
|
||||||
ANCHOR_BOTH, // Anchor at start and end
|
|
||||||
};
|
|
||||||
|
|
||||||
// General matching routine. Stores the length of the match in
|
|
||||||
// "*consumed" if successful.
|
|
||||||
bool DoMatch(const StringPiece& text,
|
|
||||||
Anchor anchor,
|
|
||||||
size_t* consumed,
|
|
||||||
const Arg* const* args, int n) const;
|
|
||||||
|
|
||||||
// Return the number of capturing subpatterns, or -1 if the
|
|
||||||
// regexp wasn't valid on construction.
|
|
||||||
int NumberOfCapturingGroups() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
void Init(const char* pattern, Option option, int match_limit,
|
|
||||||
int stack_limit, bool report_errors);
|
|
||||||
|
|
||||||
// Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
|
|
||||||
// pairs of integers for the beginning and end positions of matched
|
|
||||||
// text. The first pair corresponds to the entire matched text;
|
|
||||||
// subsequent pairs correspond, in order, to parentheses-captured
|
|
||||||
// matches. Returns the number of pairs (one more than the number of
|
|
||||||
// the last subpattern with a match) if matching was successful
|
|
||||||
// and zero if the match failed.
|
|
||||||
// I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching
|
|
||||||
// against "foo", "bar", and "baz" respectively.
|
|
||||||
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
|
|
||||||
// But the values for all subpattern are filled in into "vec".
|
|
||||||
int TryMatch(const StringPiece& text,
|
|
||||||
size_t startpos,
|
|
||||||
Anchor anchor,
|
|
||||||
bool empty_ok,
|
|
||||||
int *vec,
|
|
||||||
int vecsize) const;
|
|
||||||
|
|
||||||
// Append the "rewrite" string, with backslash subsitutions from "text"
|
|
||||||
// and "vec", to string "out".
|
|
||||||
bool Rewrite(string *out,
|
|
||||||
const StringPiece &rewrite,
|
|
||||||
const StringPiece &text,
|
|
||||||
int *vec,
|
|
||||||
int veclen) const;
|
|
||||||
|
|
||||||
// internal implementation for DoMatch
|
|
||||||
bool DoMatchImpl(const StringPiece& text,
|
|
||||||
Anchor anchor,
|
|
||||||
size_t* consumed,
|
|
||||||
const Arg* const args[],
|
|
||||||
int n,
|
|
||||||
int* vec,
|
|
||||||
int vecsize) const;
|
|
||||||
|
|
||||||
// Compile the regexp for the specified anchoring mode
|
|
||||||
pcre* Compile(Anchor anchor);
|
|
||||||
|
|
||||||
string pattern_;
|
|
||||||
Option options_;
|
|
||||||
pcre* re_full_; // For full matches
|
|
||||||
pcre* re_partial_; // For partial matches
|
|
||||||
const string* error_; // Error indicator (or empty string)
|
|
||||||
bool report_errors_; // Silences error logging if false
|
|
||||||
int match_limit_; // Limit on execution resources
|
|
||||||
int stack_limit_; // Limit on stack resources (bytes)
|
|
||||||
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
|
|
||||||
|
|
||||||
PCRE(const PCRE&) = delete;
|
|
||||||
PCRE& operator=(const PCRE&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
|
|
||||||
// "extra" options. The only extras are match_limit, which limits
|
|
||||||
// the CPU time of a match, and stack_limit, which limits the
|
|
||||||
// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default
|
|
||||||
// that should not cause too many problems in production code.
|
|
||||||
// If PCRE hits a limit during a match, it may return a false negative,
|
|
||||||
// but (hopefully) it won't crash.
|
|
||||||
//
|
|
||||||
// NOTE: If you are handling regular expressions specified by
|
|
||||||
// (external or internal) users, rather than hard-coded ones,
|
|
||||||
// you should be using PCRE2, which uses an alternate implementation
|
|
||||||
// that avoids these issues. See http://go/re2quick.
|
|
||||||
class PCRE_Options {
|
|
||||||
public:
|
|
||||||
// constructor
|
|
||||||
PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {}
|
|
||||||
// accessors
|
|
||||||
PCRE::Option option() const { return option_; }
|
|
||||||
void set_option(PCRE::Option option) {
|
|
||||||
option_ = option;
|
|
||||||
}
|
|
||||||
int match_limit() const { return match_limit_; }
|
|
||||||
void set_match_limit(int match_limit) {
|
|
||||||
match_limit_ = match_limit;
|
|
||||||
}
|
|
||||||
int stack_limit() const { return stack_limit_; }
|
|
||||||
void set_stack_limit(int stack_limit) {
|
|
||||||
stack_limit_ = stack_limit;
|
|
||||||
}
|
|
||||||
|
|
||||||
// If the regular expression is malformed, an error message will be printed
|
|
||||||
// iff report_errors() is true. Default: true.
|
|
||||||
bool report_errors() const { return report_errors_; }
|
|
||||||
void set_report_errors(bool report_errors) {
|
|
||||||
report_errors_ = report_errors;
|
|
||||||
}
|
|
||||||
private:
|
|
||||||
PCRE::Option option_;
|
|
||||||
int match_limit_;
|
|
||||||
int stack_limit_;
|
|
||||||
bool report_errors_;
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/***** Implementation details *****/
|
|
||||||
|
|
||||||
// Hex/Octal/Binary?
|
|
||||||
|
|
||||||
// Adapter that lets any type T exposing
// "bool ParseFrom(const char*, size_t)" be used as a match destination.
template <class T>
class _PCRE_MatchObject {
 public:
  // Forwards the n bytes at str to T::ParseFrom on the object behind
  // dest.  A NULL dest means there is nothing to fill in, which counts
  // as success.
  static inline bool Parse(const char* str, size_t n, void* dest) {
    if (dest != NULL) {
      return static_cast<T*>(dest)->ParseFrom(str, n);
    }
    return true;
  }
};
|
|
||||||
|
|
||||||
// A type-erased match destination: an untyped pointer (arg_) paired with
// the parser function (parser_) that knows how to fill it in from the
// bytes of a captured sub-pattern.
class PCRE::Arg {
 public:
  // Empty constructor so we can declare arrays of PCRE::Arg
  Arg();

  // Constructor specially designed for NULL arguments
  Arg(void*);

  // Signature shared by all parsers: convert the n bytes at str and
  // store the result through dest; the bool result reports success.
  typedef bool (*Parser)(const char* str, size_t n, void* dest);

// Type-specific parsers
// For each (type, name) pair this generates two constructors: one that
// binds the default parser "name", and one that accepts a caller-chosen
// parser for the same pointer type.
#define MAKE_PARSER(type, name) \
  Arg(type* p) : arg_(p), parser_(name) {} \
  Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}

  MAKE_PARSER(char,               parse_char);
  MAKE_PARSER(signed char,        parse_schar);
  MAKE_PARSER(unsigned char,      parse_uchar);
  MAKE_PARSER(float,              parse_float);
  MAKE_PARSER(double,             parse_double);
  MAKE_PARSER(string,             parse_string);
  MAKE_PARSER(StringPiece,        parse_stringpiece);

  MAKE_PARSER(short,              parse_short);
  MAKE_PARSER(unsigned short,     parse_ushort);
  MAKE_PARSER(int,                parse_int);
  MAKE_PARSER(unsigned int,       parse_uint);
  MAKE_PARSER(long,               parse_long);
  MAKE_PARSER(unsigned long,      parse_ulong);
  MAKE_PARSER(long long,          parse_longlong);
  MAKE_PARSER(unsigned long long, parse_ulonglong);

#undef MAKE_PARSER

  // Generic constructor
  // NOTE(review): only declared here; no definition is visible in this
  // part of the file.
  template <class T> Arg(T*, Parser parser);
  // Generic constructor template
  // Any other pointer type is parsed through T::ParseFrom via
  // _PCRE_MatchObject<T>.
  template <class T> Arg(T* p)
      : arg_(p), parser_(_PCRE_MatchObject<T>::Parse) {
  }

  // Parse the data
  bool Parse(const char* str, size_t n) const;

 private:
  void*         arg_;     // Destination handed to parser_ as "dest"
  Parser        parser_;  // Conversion routine for this destination

  static bool parse_null          (const char* str, size_t n, void* dest);
  static bool parse_char          (const char* str, size_t n, void* dest);
  static bool parse_schar         (const char* str, size_t n, void* dest);
  static bool parse_uchar         (const char* str, size_t n, void* dest);
  static bool parse_float         (const char* str, size_t n, void* dest);
  static bool parse_double        (const char* str, size_t n, void* dest);
  static bool parse_string        (const char* str, size_t n, void* dest);
  static bool parse_stringpiece   (const char* str, size_t n, void* dest);

// For each integer kind this declares a private parse_<name> and
// parse_<name>_radix, followed by public parse_<name>_hex,
// parse_<name>_octal and parse_<name>_cradix.  The macro switches the
// access level, so declarations after an expansion are public.
#define DECLARE_INTEGER_PARSER(name) \
 private: \
  static bool parse_##name(const char* str, size_t n, void* dest); \
  static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
                                   int radix); \
 \
 public: \
  static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
  static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
  static bool parse_##name##_cradix(const char* str, size_t n, void* dest)

  DECLARE_INTEGER_PARSER(short);
  DECLARE_INTEGER_PARSER(ushort);
  DECLARE_INTEGER_PARSER(int);
  DECLARE_INTEGER_PARSER(uint);
  DECLARE_INTEGER_PARSER(long);
  DECLARE_INTEGER_PARSER(ulong);
  DECLARE_INTEGER_PARSER(longlong);
  DECLARE_INTEGER_PARSER(ulonglong);

#undef DECLARE_INTEGER_PARSER

};
|
|
||||||
|
|
||||||
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
|
||||||
inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
|
||||||
|
|
||||||
inline bool PCRE::Arg::Parse(const char* str, size_t n) const {
|
|
||||||
return (*parser_)(str, n, arg_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// This part of the parser, appropriate only for ints, deals with bases
//
// For every integer type listed below, the macro defines three free
// functions, Hex(ptr), Octal(ptr) and CRadix(ptr).  Each returns a
// PCRE::Arg that pairs ptr with the matching parse_<name>_hex,
// parse_<name>_octal or parse_<name>_cradix parser, so a caller can
// request a non-default base for one particular capture.
#define MAKE_INTEGER_PARSER(type, name) \
  inline PCRE::Arg Hex(type* ptr) { \
    return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
  } \
  inline PCRE::Arg Octal(type* ptr) { \
    return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
  } \
  inline PCRE::Arg CRadix(type* ptr) { \
    return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
  }

MAKE_INTEGER_PARSER(short,              short);
MAKE_INTEGER_PARSER(unsigned short,     ushort);
MAKE_INTEGER_PARSER(int,                int);
MAKE_INTEGER_PARSER(unsigned int,       uint);
MAKE_INTEGER_PARSER(long,               long);
MAKE_INTEGER_PARSER(unsigned long,      ulong);
MAKE_INTEGER_PARSER(long long,          longlong);
MAKE_INTEGER_PARSER(unsigned long long, ulonglong);

#undef MAKE_INTEGER_PARSER
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_PCRE_H_
|
|
@ -1,260 +0,0 @@
|
|||||||
/*
|
|
||||||
* The authors of this software are Rob Pike and Ken Thompson.
|
|
||||||
* Copyright (c) 2002 by Lucent Technologies.
|
|
||||||
* Permission to use, copy, modify, and distribute this software for any
|
|
||||||
* purpose without fee is hereby granted, provided that this entire notice
|
|
||||||
* is included in all copies of any software which is or includes a copy
|
|
||||||
* or modification of this software and in all copies of the supporting
|
|
||||||
* documentation for such software.
|
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
|
||||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdarg.h>
|
|
||||||
#include <string.h>
|
|
||||||
|
|
||||||
#include "util/utf.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
enum
|
|
||||||
{
|
|
||||||
Bit1 = 7,
|
|
||||||
Bitx = 6,
|
|
||||||
Bit2 = 5,
|
|
||||||
Bit3 = 4,
|
|
||||||
Bit4 = 3,
|
|
||||||
Bit5 = 2,
|
|
||||||
|
|
||||||
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
|
|
||||||
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
|
|
||||||
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
|
|
||||||
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
|
|
||||||
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
|
|
||||||
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
|
|
||||||
|
|
||||||
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
|
|
||||||
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
|
|
||||||
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
|
|
||||||
Rune4 = (1<<(Bit4+3*Bitx))-1,
|
|
||||||
/* 0001 1111 1111 1111 1111 1111 */
|
|
||||||
|
|
||||||
Maskx = (1<<Bitx)-1, /* 0011 1111 */
|
|
||||||
Testx = Maskx ^ 0xFF, /* 1100 0000 */
|
|
||||||
|
|
||||||
Bad = Runeerror,
|
|
||||||
};
|
|
||||||
|
|
||||||
int
|
|
||||||
chartorune(Rune *rune, const char *str)
|
|
||||||
{
|
|
||||||
int c, c1, c2, c3;
|
|
||||||
long l;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* one character sequence
|
|
||||||
* 00000-0007F => T1
|
|
||||||
*/
|
|
||||||
c = *(unsigned char*)str;
|
|
||||||
if(c < Tx) {
|
|
||||||
*rune = c;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* two character sequence
|
|
||||||
* 0080-07FF => T2 Tx
|
|
||||||
*/
|
|
||||||
c1 = *(unsigned char*)(str+1) ^ Tx;
|
|
||||||
if(c1 & Testx)
|
|
||||||
goto bad;
|
|
||||||
if(c < T3) {
|
|
||||||
if(c < T2)
|
|
||||||
goto bad;
|
|
||||||
l = ((c << Bitx) | c1) & Rune2;
|
|
||||||
if(l <= Rune1)
|
|
||||||
goto bad;
|
|
||||||
*rune = l;
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* three character sequence
|
|
||||||
* 0800-FFFF => T3 Tx Tx
|
|
||||||
*/
|
|
||||||
c2 = *(unsigned char*)(str+2) ^ Tx;
|
|
||||||
if(c2 & Testx)
|
|
||||||
goto bad;
|
|
||||||
if(c < T4) {
|
|
||||||
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
|
|
||||||
if(l <= Rune2)
|
|
||||||
goto bad;
|
|
||||||
*rune = l;
|
|
||||||
return 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* four character sequence (21-bit value)
|
|
||||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
|
||||||
*/
|
|
||||||
c3 = *(unsigned char*)(str+3) ^ Tx;
|
|
||||||
if (c3 & Testx)
|
|
||||||
goto bad;
|
|
||||||
if (c < T5) {
|
|
||||||
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
|
|
||||||
if (l <= Rune3)
|
|
||||||
goto bad;
|
|
||||||
*rune = l;
|
|
||||||
return 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Support for 5-byte or longer UTF-8 would go here, but
|
|
||||||
* since we don't have that, we'll just fall through to bad.
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
* bad decoding
|
|
||||||
*/
|
|
||||||
bad:
|
|
||||||
*rune = Bad;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
runetochar(char *str, const Rune *rune)
|
|
||||||
{
|
|
||||||
/* Runes are signed, so convert to unsigned for range check. */
|
|
||||||
unsigned long c;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* one character sequence
|
|
||||||
* 00000-0007F => 00-7F
|
|
||||||
*/
|
|
||||||
c = *rune;
|
|
||||||
if(c <= Rune1) {
|
|
||||||
str[0] = static_cast<char>(c);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* two character sequence
|
|
||||||
* 0080-07FF => T2 Tx
|
|
||||||
*/
|
|
||||||
if(c <= Rune2) {
|
|
||||||
str[0] = T2 | static_cast<char>(c >> 1*Bitx);
|
|
||||||
str[1] = Tx | (c & Maskx);
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* If the Rune is out of range, convert it to the error rune.
|
|
||||||
* Do this test here because the error rune encodes to three bytes.
|
|
||||||
* Doing it earlier would duplicate work, since an out of range
|
|
||||||
* Rune wouldn't have fit in one or two bytes.
|
|
||||||
*/
|
|
||||||
if (c > Runemax)
|
|
||||||
c = Runeerror;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* three character sequence
|
|
||||||
* 0800-FFFF => T3 Tx Tx
|
|
||||||
*/
|
|
||||||
if (c <= Rune3) {
|
|
||||||
str[0] = T3 | static_cast<char>(c >> 2*Bitx);
|
|
||||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
|
||||||
str[2] = Tx | (c & Maskx);
|
|
||||||
return 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* four character sequence (21-bit value)
|
|
||||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
|
||||||
*/
|
|
||||||
str[0] = T4 | static_cast<char>(c >> 3*Bitx);
|
|
||||||
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
|
||||||
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
|
||||||
str[3] = Tx | (c & Maskx);
|
|
||||||
return 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
runelen(Rune rune)
|
|
||||||
{
|
|
||||||
char str[10];
|
|
||||||
|
|
||||||
return runetochar(str, &rune);
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
fullrune(const char *str, int n)
|
|
||||||
{
|
|
||||||
if (n > 0) {
|
|
||||||
int c = *(unsigned char*)str;
|
|
||||||
if (c < Tx)
|
|
||||||
return 1;
|
|
||||||
if (n > 1) {
|
|
||||||
if (c < T3)
|
|
||||||
return 1;
|
|
||||||
if (n > 2) {
|
|
||||||
if (c < T4 || n > 3)
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
int
|
|
||||||
utflen(const char *s)
|
|
||||||
{
|
|
||||||
int c;
|
|
||||||
long n;
|
|
||||||
Rune rune;
|
|
||||||
|
|
||||||
n = 0;
|
|
||||||
for(;;) {
|
|
||||||
c = *(unsigned char*)s;
|
|
||||||
if(c < Runeself) {
|
|
||||||
if(c == 0)
|
|
||||||
return n;
|
|
||||||
s++;
|
|
||||||
} else
|
|
||||||
s += chartorune(&rune, s);
|
|
||||||
n++;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
char*
|
|
||||||
utfrune(const char *s, Rune c)
|
|
||||||
{
|
|
||||||
long c1;
|
|
||||||
Rune r;
|
|
||||||
int n;
|
|
||||||
|
|
||||||
if(c < Runesync) /* not part of utf sequence */
|
|
||||||
return strchr((char*)s, c);
|
|
||||||
|
|
||||||
for(;;) {
|
|
||||||
c1 = *(unsigned char*)s;
|
|
||||||
if(c1 < Runeself) { /* one byte rune */
|
|
||||||
if(c1 == 0)
|
|
||||||
return 0;
|
|
||||||
if(c1 == c)
|
|
||||||
return (char*)s;
|
|
||||||
s++;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
n = chartorune(&r, s);
|
|
||||||
if(r == c)
|
|
||||||
return (char*)s;
|
|
||||||
s += n;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,526 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_SPARSE_ARRAY_H_
|
|
||||||
#define UTIL_SPARSE_ARRAY_H_
|
|
||||||
|
|
||||||
// DESCRIPTION
|
|
||||||
//
|
|
||||||
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
|
||||||
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
|
|
||||||
// fast iteration through the elements in the array and fast clearing
|
|
||||||
// of the array. The array has a concept of certain elements being
|
|
||||||
// uninitialized (having no value).
|
|
||||||
//
|
|
||||||
// Insertion and deletion are constant time operations.
|
|
||||||
//
|
|
||||||
// Allocating the array is a constant time operation
|
|
||||||
// when memory allocation is a constant time operation.
|
|
||||||
//
|
|
||||||
// Clearing the array is a constant time operation (unusual!).
|
|
||||||
//
|
|
||||||
// Iterating through the array is an O(n) operation, where n
|
|
||||||
// is the number of items in the array (not O(m)).
|
|
||||||
//
|
|
||||||
// The array iterator visits entries in the order they were first
|
|
||||||
// inserted into the array. It is safe to add items to the array while
|
|
||||||
// using an iterator: the iterator will visit indices added to the array
|
|
||||||
// during the iteration, but will not re-visit indices whose values
|
|
||||||
// change after visiting. Thus SparseArray can be a convenient
|
|
||||||
// implementation of a work queue.
|
|
||||||
//
|
|
||||||
// The SparseArray implementation is NOT thread-safe. It is up to the
|
|
||||||
// caller to make sure only one thread is accessing the array. (Typically
|
|
||||||
// these arrays are temporary values and used in situations where speed is
|
|
||||||
// important.)
|
|
||||||
//
|
|
||||||
// The SparseArray interface does not present all the usual STL bells and
|
|
||||||
// whistles.
|
|
||||||
//
|
|
||||||
// Implemented with reference to Briggs & Torczon, An Efficient
|
|
||||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
|
||||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
|
||||||
//
|
|
||||||
// Briggs & Torczon popularized this technique, but it had been known
|
|
||||||
// long before their paper. They point out that Aho, Hopcroft, and
|
|
||||||
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
|
|
||||||
// 1986 Programming Pearls both hint at the technique in exercises to the
|
|
||||||
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
|
|
||||||
// exercise 8).
|
|
||||||
//
|
|
||||||
// Briggs & Torczon describe a sparse set implementation. I have
|
|
||||||
// trivially generalized it to create a sparse array (actually the original
|
|
||||||
// target of the AHU and Bentley exercises).
|
|
||||||
|
|
||||||
// IMPLEMENTATION
|
|
||||||
//
|
|
||||||
// SparseArray is an array dense_ and an array sparse_, both of size max_size_.
|
|
||||||
// At any point, the number of elements in the sparse array is size_.
|
|
||||||
//
|
|
||||||
// The array dense_ contains the size_ elements in the sparse array (with
|
|
||||||
// their indices),
|
|
||||||
// in the order that the elements were first inserted. This array is dense:
|
|
||||||
// the size_ pairs are dense_[0] through dense_[size_-1].
|
|
||||||
//
|
|
||||||
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
|
|
||||||
// For indices present in the array, dense_[sparse_[i]].index_ == i.
|
|
||||||
// For indices not present in the array, sparse_ can contain any value at all,
|
|
||||||
// perhaps outside the range [0, size_) but perhaps not.
|
|
||||||
//
|
|
||||||
// The lax requirement on sparse_ values makes clearing the array very easy:
|
|
||||||
// set size_ to 0. Lookups are slightly more complicated.
|
|
||||||
// An index i has a value in the array if and only if:
|
|
||||||
// sparse_[i] is in [0, size_) AND
|
|
||||||
// dense_[sparse_[i]].index_ == i.
|
|
||||||
// If both these properties hold, only then it is safe to refer to
|
|
||||||
// dense_[sparse_[i]].value_
|
|
||||||
// as the value associated with index i.
|
|
||||||
//
|
|
||||||
// To insert a new entry, set sparse_[i] to size_,
|
|
||||||
// initialize dense_[size_], and then increment size_.
|
|
||||||
//
|
|
||||||
// Deletion of specific values from the array is implemented by
|
|
||||||
// swapping dense_[size_-1] and the dense_ being deleted and then
|
|
||||||
// updating the appropriate sparse_ entries.
|
|
||||||
//
|
|
||||||
// To make the sparse array as efficient as possible for non-primitive types,
|
|
||||||
// elements may or may not be destroyed when they are deleted from the sparse
|
|
||||||
// array through a call to erase(), erase_existing() or resize(). They
|
|
||||||
// immediately become inaccessible, but they are only guaranteed to be
|
|
||||||
// destroyed when the SparseArray destructor is called.
|
|
||||||
//
|
|
||||||
// A moved-from SparseArray will be empty.
|
|
||||||
|
|
||||||
// Doing this simplifies the logic below.
|
|
||||||
#ifndef __has_feature
|
|
||||||
#define __has_feature(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
#if __has_feature(memory_sanitizer)
|
|
||||||
#include <sanitizer/msan_interface.h>
|
|
||||||
#endif
|
|
||||||
#include <algorithm>
|
|
||||||
#include <memory>
|
|
||||||
#include <type_traits>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// SparseArray<Value> maps int indices in [0, max_size()) to Values using a
// pair of arrays: sparse_ (index -> position in dense_) and dense_ (the
// stored IndexValue pairs, packed at the front).  Only the first size_
// entries of dense_ are considered live.
template<typename Value>
class SparseArray {
 public:
  SparseArray();
  explicit SparseArray(int max_size);
  ~SparseArray();

  // The (index, value) pair type that iterators point at.
  class IndexValue;
  static_assert(std::is_trivially_destructible<IndexValue>::value,
                "IndexValue must be trivially destructible");

  using value_type = IndexValue;
  using iterator = IndexValue*;
  using const_iterator = const IndexValue*;

  SparseArray(const SparseArray& src);
  SparseArray(SparseArray&& src) /*noexcept*/;

  SparseArray& operator=(const SparseArray& src);
  SparseArray& operator=(SparseArray&& src) /*noexcept*/;

  // Returns the i-th live pair in dense_ order (asserts 0 <= i < size()).
  const IndexValue& iv(int i) const;

  // Number of live entries.
  int size() const { return size_; }

  // Nonzero when there are no live entries.
  int empty() const { return size_ == 0; }

  // Iteration covers the live prefix of dense_.
  iterator begin() { return dense_.get(); }
  iterator end() { return begin() + size_; }

  const_iterator begin() const { return dense_.get(); }
  const_iterator end() const { return begin() + size_; }

  // Changes the maximum size of the array; invalidates all iterators.
  void resize(int max_size);

  // Valid indices are [0, max_size()).
  int max_size() const { return max_size_; }

  // Forgets every entry by resetting the live count.
  void clear() { size_ = 0; }

  // True when index i currently has a value.
  bool has_index(int i) const;

  // Orders pairs by index, so that
  //   std::sort(arr.begin(), arr.end(), arr.less);
  // makes later iterations visit indices in increasing order.
  static bool less(const IndexValue& a, const IndexValue& b);

 public:
  // Stores v at index i, creating the entry if it does not exist yet.
  iterator set(int i, const Value& v) {
    return SetInternal(true, i, v);
  }
  iterator set(int i, Value&& v) {  // NOLINT
    return SetInternal(true, i, std::move(v));
  }

  // Inserts v unless its index is already present.  The bool in the
  // result is true when a new entry was created.
  std::pair<iterator, bool> insert(const value_type& v) {
    return InsertInternal(v);
  }
  std::pair<iterator, bool> insert(value_type&& v) {  // NOLINT
    return InsertInternal(std::move(v));
  }

  // Builds a value_type from args and inserts it as insert() would.
  template <typename... Args>
  std::pair<iterator, bool> emplace(Args&&... args) {  // NOLINT
    return InsertInternal(value_type(std::forward<Args>(args)...));
  }

  // Returns an iterator to the entry for index i, or end() if absent.
  iterator find(int i) {
    return has_index(i) ? dense_.get() + sparse_[i] : end();
  }

  const_iterator find(int i) const {
    return has_index(i) ? dense_.get() + sparse_[i] : end();
  }

  // Overwrites the value at index i.
  // Fast but unsafe: only use if has_index(i) is true.
  iterator set_existing(int i, const Value& v) {
    return SetExistingInternal(i, v);
  }
  iterator set_existing(int i, Value&& v) {  // NOLINT
    return SetExistingInternal(i, std::move(v));
  }

  // Creates the entry for index i and stores v in it.
  // Fast but unsafe: only use if has_index(i) is false.
  iterator set_new(int i, const Value& v) {
    return SetInternal(false, i, v);
  }
  iterator set_new(int i, Value&& v) {  // NOLINT
    return SetInternal(false, i, std::move(v));
  }

  // Reads the value at index i.
  // Fast but unsafe: only use if has_index(i) is true.
  const Value& get_existing(int i) const;

  // Erasing while iterating is in general NOT safe.  The one supported
  // pattern is erasing the pair the iterator currently points at and then
  // stepping the iterator back, so that the loop increment re-examines
  // the slot that erase refilled:
  //
  //   for (i = m.begin(); i != m.end(); ++i) {
  //     if (ShouldErase(i->index(), i->value())) {
  //       m.erase(i->index());
  //       --i;
  //     }
  //   }
  //
  // Any other erase (or clear) during iteration may let iterators walk
  // past the end of the array.

  // Removes index i if it is present.
  void erase(int i);

  // Removes index i.
  // Fast but unsafe: only use if has_index(i) is true.
  void erase_existing(int i);

 private:
  // Shared implementation of insert()/emplace().
  template <typename U>
  std::pair<iterator, bool> InsertInternal(U&& v) {
    DebugCheckInvariants();
    if (has_index(v.index_)) {
      iterator existing = dense_.get() + sparse_[v.index_];
      DebugCheckInvariants();
      return std::pair<iterator, bool>(existing, false);
    }
    iterator added =
        set_new(std::forward<U>(v).index_, std::forward<U>(v).second);
    DebugCheckInvariants();
    return std::pair<iterator, bool>(added, true);
  }

  // Shared implementation of set()/set_new().  When allow_overwrite is
  // false the caller promises that i is not present yet.
  template <typename U>
  iterator SetInternal(bool allow_overwrite, int i, U&& v) {  // NOLINT
    DebugCheckInvariants();
    if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
      assert(false && "illegal index");
      // The index is out of range, so nothing is stored.  begin() is
      // returned rather than end() so that a caller who dereferences the
      // result does not touch an invalid pointer.
      return begin();
    }
    if (!allow_overwrite)
      assert(!has_index(i));
    if (!allow_overwrite || !has_index(i))
      create_index(i);
    return set_existing(i, std::forward<U>(v));  // NOLINT
  }

  // Shared implementation of set_existing().
  template <typename U>
  iterator SetExistingInternal(int i, U&& v) {  // NOLINT
    DebugCheckInvariants();
    assert(has_index(i));
    iterator slot = dense_.get() + sparse_[i];
    slot->value() = std::forward<U>(v);
    DebugCheckInvariants();
    return slot;
  }

  // Appends index i to the live entries without setting its value.
  // Only use if has_index(i) is known to be false; helper for the
  // setters above.
  void create_index(int i);

  // Asserts basic invariants on size_, max_size_ and sparse_.
  void DebugCheckInvariants() const;

  // Prepares sparse_[min, max) for memory checkers: unpoisons it under
  // MemorySanitizer, or fills it with a marker value when
  // RE2_ON_VALGRIND is defined.  Otherwise does nothing.
  void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
    __msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
    for (int k = min; k < max; k++) {
      sparse_[k] = 0xababababU;
    }
#endif
  }

  int size_ = 0;
  int max_size_ = 0;
  std::unique_ptr<int[]> sparse_;
  std::unique_ptr<IndexValue[]> dense_;
};
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
SparseArray<Value>::SparseArray() = default;
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
SparseArray<Value>::SparseArray(const SparseArray& src)
|
|
||||||
: size_(src.size_),
|
|
||||||
max_size_(src.max_size_),
|
|
||||||
sparse_(new int[max_size_]),
|
|
||||||
dense_(new IndexValue[max_size_]) {
|
|
||||||
std::copy_n(src.sparse_.get(), max_size_, sparse_.get());
|
|
||||||
std::copy_n(src.dense_.get(), max_size_, dense_.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT
|
|
||||||
: size_(src.size_),
|
|
||||||
max_size_(src.max_size_),
|
|
||||||
sparse_(std::move(src.sparse_)),
|
|
||||||
dense_(std::move(src.dense_)) {
|
|
||||||
src.size_ = 0;
|
|
||||||
src.max_size_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
|
|
||||||
size_ = src.size_;
|
|
||||||
max_size_ = src.max_size_;
|
|
||||||
std::unique_ptr<int[]> a(new int[max_size_]);
|
|
||||||
std::copy_n(src.sparse_.get(), src.max_size_, a.get());
|
|
||||||
sparse_ = std::move(a);
|
|
||||||
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size_]);
|
|
||||||
std::copy_n(src.dense_.get(), src.max_size_, b.get());
|
|
||||||
dense_ = std::move(b);
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
SparseArray<Value>& SparseArray<Value>::operator=(
|
|
||||||
SparseArray&& src) /*noexcept*/ { // NOLINT
|
|
||||||
size_ = src.size_;
|
|
||||||
max_size_ = src.max_size_;
|
|
||||||
sparse_ = std::move(src.sparse_);
|
|
||||||
dense_ = std::move(src.dense_);
|
|
||||||
// clear out the source
|
|
||||||
src.size_ = 0;
|
|
||||||
src.max_size_ = 0;
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
|
||||||
template<typename Value>
|
|
||||||
class SparseArray<Value>::IndexValue {
|
|
||||||
friend class SparseArray;
|
|
||||||
public:
|
|
||||||
typedef int first_type;
|
|
||||||
typedef Value second_type;
|
|
||||||
|
|
||||||
IndexValue() {}
|
|
||||||
IndexValue(int i, const Value& v) : index_(i), second(v) {}
|
|
||||||
IndexValue(int i, Value&& v) : index_(i), second(std::move(v)) {}
|
|
||||||
|
|
||||||
int index() const { return index_; }
|
|
||||||
|
|
||||||
Value& value() /*&*/ { return second; }
|
|
||||||
const Value& value() const /*&*/ { return second; }
|
|
||||||
//Value&& value() /*&&*/ { return std::move(second); } // NOLINT
|
|
||||||
|
|
||||||
private:
|
|
||||||
int index_;
|
|
||||||
|
|
||||||
public:
|
|
||||||
// Provide the data in the 'second' member so that the utilities
|
|
||||||
// in map-util work.
|
|
||||||
// TODO(billydonahue): 'second' is public for short-term compatibility.
|
|
||||||
// Users will be transitioned to using value() accessor.
|
|
||||||
Value second;
|
|
||||||
};
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
const typename SparseArray<Value>::IndexValue&
|
|
||||||
SparseArray<Value>::iv(int i) const {
|
|
||||||
assert(i >= 0);
|
|
||||||
assert(i < size_);
|
|
||||||
return dense_[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Change the maximum size of the array.
|
|
||||||
// Invalidates all iterators.
|
|
||||||
template<typename Value>
|
|
||||||
void SparseArray<Value>::resize(int max_size) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
if (max_size > max_size_) {
|
|
||||||
std::unique_ptr<int[]> a(new int[max_size]);
|
|
||||||
if (sparse_) {
|
|
||||||
std::copy_n(sparse_.get(), max_size_, a.get());
|
|
||||||
}
|
|
||||||
sparse_ = std::move(a);
|
|
||||||
|
|
||||||
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]);
|
|
||||||
if (dense_) {
|
|
||||||
std::copy_n(dense_.get(), max_size_, b.get());
|
|
||||||
}
|
|
||||||
dense_ = std::move(b);
|
|
||||||
|
|
||||||
MaybeInitializeMemory(max_size_, max_size);
|
|
||||||
}
|
|
||||||
max_size_ = max_size;
|
|
||||||
if (size_ > max_size_)
|
|
||||||
size_ = max_size_;
|
|
||||||
DebugCheckInvariants();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check whether index i is in the array.
|
|
||||||
template<typename Value>
|
|
||||||
bool SparseArray<Value>::has_index(int i) const {
|
|
||||||
assert(i >= 0);
|
|
||||||
assert(i < max_size_);
|
|
||||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// Unsigned comparison avoids checking sparse_[i] < 0.
|
|
||||||
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
|
||||||
dense_[sparse_[i]].index_ == i;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
const Value& SparseArray<Value>::get_existing(int i) const {
|
|
||||||
assert(has_index(i));
|
|
||||||
return dense_[sparse_[i]].second;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
void SparseArray<Value>::erase(int i) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
if (has_index(i))
|
|
||||||
erase_existing(i);
|
|
||||||
DebugCheckInvariants();
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
void SparseArray<Value>::erase_existing(int i) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
assert(has_index(i));
|
|
||||||
int di = sparse_[i];
|
|
||||||
if (di < size_ - 1) {
|
|
||||||
dense_[di] = std::move(dense_[size_ - 1]);
|
|
||||||
sparse_[dense_[di].index_] = di;
|
|
||||||
}
|
|
||||||
size_--;
|
|
||||||
DebugCheckInvariants();
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
void SparseArray<Value>::create_index(int i) {
|
|
||||||
assert(!has_index(i));
|
|
||||||
assert(size_ < max_size_);
|
|
||||||
sparse_[i] = size_;
|
|
||||||
dense_[size_].index_ = i;
|
|
||||||
size_++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs an empty array able to hold indices in [0, max_size).
template<typename Value>
SparseArray<Value>::SparseArray(int max_size)
    : sparse_(new int[max_size]),
      dense_(new IndexValue[max_size]) {
  // size_ and max_size_ are still 0 here from their in-class defaults.
  MaybeInitializeMemory(0, max_size);
  max_size_ = max_size;
  DebugCheckInvariants();
}
|
|
||||||
|
|
||||||
// The arrays are released by their unique_ptr members; the destructor
// itself only re-checks the invariants.
template<typename Value>
SparseArray<Value>::~SparseArray() {
  DebugCheckInvariants();
}
|
|
||||||
|
|
||||||
// Asserts that size_ lies in [0, max_size_] and that sparse_ is allocated
// whenever the array is non-empty.
template<typename Value>
void SparseArray<Value>::DebugCheckInvariants() const {
  assert(size_ >= 0);
  assert(max_size_ >= size_);
  assert(size_ == 0 || sparse_ != nullptr);
}
|
|
||||||
|
|
||||||
// Comparison function for sorting.
|
|
||||||
// Strict ordering of pairs by index, for use with std::sort.
template<typename Value>
bool SparseArray<Value>::less(const IndexValue& a, const IndexValue& b) {
  return b.index_ > a.index_;
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_SPARSE_ARRAY_H_
|
|
@ -1,266 +0,0 @@
|
|||||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_SPARSE_SET_H_
|
|
||||||
#define UTIL_SPARSE_SET_H_
|
|
||||||
|
|
||||||
// DESCRIPTION
|
|
||||||
//
|
|
||||||
// SparseSet(m) is a set of integers in [0, m).
|
|
||||||
// It requires sizeof(int)*m memory, but it provides
|
|
||||||
// fast iteration through the elements in the set and fast clearing
|
|
||||||
// of the set.
|
|
||||||
//
|
|
||||||
// Insertion and deletion are constant time operations.
|
|
||||||
//
|
|
||||||
// Allocating the set is a constant time operation
|
|
||||||
// when memory allocation is a constant time operation.
|
|
||||||
//
|
|
||||||
// Clearing the set is a constant time operation (unusual!).
|
|
||||||
//
|
|
||||||
// Iterating through the set is an O(n) operation, where n
|
|
||||||
// is the number of items in the set (not O(m)).
|
|
||||||
//
|
|
||||||
// The set iterator visits entries in the order they were first
|
|
||||||
// inserted into the set. It is safe to add items to the set while
|
|
||||||
// using an iterator: the iterator will visit indices added to the set
|
|
||||||
// during the iteration, but will not re-visit indices whose values
|
|
||||||
// change after visiting. Thus SparseSet can be a convenient
|
|
||||||
// implementation of a work queue.
|
|
||||||
//
|
|
||||||
// The SparseSet implementation is NOT thread-safe. It is up to the
|
|
||||||
// caller to make sure only one thread is accessing the set. (Typically
|
|
||||||
// these sets are temporary values and used in situations where speed is
|
|
||||||
// important.)
|
|
||||||
//
|
|
||||||
// The SparseSet interface does not present all the usual STL bells and
|
|
||||||
// whistles.
|
|
||||||
//
|
|
||||||
// Implemented with reference to Briggs & Torczon, An Efficient
|
|
||||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
|
||||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
|
||||||
//
|
|
||||||
// This is a specialization of sparse array; see sparse_array.h.
|
|
||||||
|
|
||||||
// IMPLEMENTATION
|
|
||||||
//
|
|
||||||
// See sparse_array.h for implementation details.
|
|
||||||
|
|
||||||
// Doing this simplifies the logic below.
|
|
||||||
#ifndef __has_feature
|
|
||||||
#define __has_feature(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
#if __has_feature(memory_sanitizer)
|
|
||||||
#include <sanitizer/msan_interface.h>
|
|
||||||
#endif
|
|
||||||
#include <algorithm>
|
|
||||||
#include <memory>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// SparseSetT is a set of ints in [0, max_size()), stored as two int
// arrays: sparse_ (member -> position in dense_) and dense_ (the members,
// packed at the front).  Only the first size_ entries of dense_ are
// live.
template<typename Value>
class SparseSetT {
 public:
  SparseSetT();
  explicit SparseSetT(int max_size);
  ~SparseSetT();

  using iterator = int*;
  using const_iterator = const int*;

  // Number of members.
  int size() const { return size_; }

  // Nonzero when the set has no members.
  int empty() const { return size_ == 0; }

  // Iteration covers the live prefix of dense_.
  iterator begin() { return dense_.get(); }
  iterator end() { return begin() + size_; }

  const_iterator begin() const { return dense_.get(); }
  const_iterator end() const { return begin() + size_; }

  // Changes the maximum size of the set; invalidates all iterators.
  void resize(int max_size);

  // Valid members are [0, max_size()).
  int max_size() const { return max_size_; }

  // Empties the set by resetting the live count.
  void clear() { size_ = 0; }

  // True when i is a member.
  bool contains(int i) const;

  // Plain integer ordering, so that
  //   std::sort(arr.begin(), arr.end(), arr.less);
  // makes later iterations visit members in increasing order.
  static bool less(int a, int b);

 public:
  // Adds i to the set if it is not already a member.
  iterator insert(int i) {
    return InsertInternal(true, i);
  }

  // Adds i to the set.
  // Fast but unsafe: only use if contains(i) is false.
  iterator insert_new(int i) {
    return InsertInternal(false, i);
  }

 private:
  // Shared implementation of insert()/insert_new().  When allow_existing
  // is false the caller promises that i is not a member yet.
  iterator InsertInternal(bool allow_existing, int i) {
    DebugCheckInvariants();
    if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
      assert(false && "illegal index");
      // The value is out of range, so nothing is inserted.  begin() is
      // returned rather than end() so that a caller who dereferences the
      // result does not touch an invalid pointer.
      return begin();
    }
    if (!allow_existing)
      assert(!contains(i));
    if (!allow_existing || !contains(i))
      create_index(i);
    DebugCheckInvariants();
    return dense_.get() + sparse_[i];
  }

  // Appends i to the live members.
  // Only use if contains(i) is known to be false; helper for the
  // insert functions above.
  void create_index(int i);

  // Asserts basic invariants on size_, max_size_ and sparse_.
  void DebugCheckInvariants() const;

  // Prepares sparse_[min, max) for memory checkers: unpoisons it under
  // MemorySanitizer, or fills it with a marker value when
  // RE2_ON_VALGRIND is defined.  Otherwise does nothing.
  void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
    __msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
    for (int k = min; k < max; k++) {
      sparse_[k] = 0xababababU;
    }
#endif
  }

  int size_ = 0;
  int max_size_ = 0;
  std::unique_ptr<int[]> sparse_;
  std::unique_ptr<int[]> dense_;
};
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
SparseSetT<Value>::SparseSetT() = default;
|
|
||||||
|
|
||||||
// Change the maximum size of the set.
|
|
||||||
// Invalidates all iterators.
|
|
||||||
template<typename Value>
|
|
||||||
void SparseSetT<Value>::resize(int max_size) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
if (max_size > max_size_) {
|
|
||||||
std::unique_ptr<int[]> a(new int[max_size]);
|
|
||||||
if (sparse_) {
|
|
||||||
std::copy_n(sparse_.get(), max_size_, a.get());
|
|
||||||
}
|
|
||||||
sparse_ = std::move(a);
|
|
||||||
|
|
||||||
std::unique_ptr<int[]> b(new int[max_size]);
|
|
||||||
if (dense_) {
|
|
||||||
std::copy_n(dense_.get(), max_size_, b.get());
|
|
||||||
}
|
|
||||||
dense_ = std::move(b);
|
|
||||||
|
|
||||||
MaybeInitializeMemory(max_size_, max_size);
|
|
||||||
}
|
|
||||||
max_size_ = max_size;
|
|
||||||
if (size_ > max_size_)
|
|
||||||
size_ = max_size_;
|
|
||||||
DebugCheckInvariants();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check whether index i is in the set.
|
|
||||||
template<typename Value>
|
|
||||||
bool SparseSetT<Value>::contains(int i) const {
|
|
||||||
assert(i >= 0);
|
|
||||||
assert(i < max_size_);
|
|
||||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// Unsigned comparison avoids checking sparse_[i] < 0.
|
|
||||||
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
|
||||||
dense_[sparse_[i]] == i;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
void SparseSetT<Value>::create_index(int i) {
|
|
||||||
assert(!contains(i));
|
|
||||||
assert(size_ < max_size_);
|
|
||||||
sparse_[i] = size_;
|
|
||||||
dense_[size_] = i;
|
|
||||||
size_++;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Constructs an empty set able to hold members in [0, max_size).
template<typename Value>
SparseSetT<Value>::SparseSetT(int max_size)
    : sparse_(new int[max_size]),
      dense_(new int[max_size]) {
  // size_ and max_size_ are still 0 here from their in-class defaults.
  MaybeInitializeMemory(0, max_size);
  max_size_ = max_size;
  DebugCheckInvariants();
}
|
|
||||||
|
|
||||||
// The arrays are released by their unique_ptr members; the destructor
// itself only re-checks the invariants.
template<typename Value>
SparseSetT<Value>::~SparseSetT() {
  DebugCheckInvariants();
}
|
|
||||||
|
|
||||||
// Asserts that size_ lies in [0, max_size_] and that sparse_ is allocated
// whenever the set is non-empty.
template<typename Value>
void SparseSetT<Value>::DebugCheckInvariants() const {
  assert(size_ >= 0);
  assert(max_size_ >= size_);
  assert(size_ == 0 || sparse_ != nullptr);
}
|
|
||||||
|
|
||||||
// Comparison function for sorting.
|
|
||||||
// Strict integer ordering, for use with std::sort.
template<typename Value>
bool SparseSetT<Value>::less(int a, int b) {
  return b > a;
}
|
|
||||||
|
|
||||||
typedef SparseSetT<void> SparseSet;
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_SPARSE_SET_H_
|
|
@ -1,164 +0,0 @@
|
|||||||
// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include <stdarg.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
#include "util/strutil.h"
|
|
||||||
|
|
||||||
#ifdef _WIN32
|
|
||||||
#define snprintf _snprintf
|
|
||||||
#define vsnprintf _vsnprintf
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// CEscapeString()
|
|
||||||
// Copies 'src' to 'dest', escaping dangerous characters using
|
|
||||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
|
||||||
// Returns the number of bytes written to 'dest' (not including the \0)
|
|
||||||
// or (size_t)-1 if there was insufficient space.
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// Writes a C-escaped, NUL-terminated copy of src[0, src_len) into
// dest[0, dest_len).  Returns the number of bytes written excluding the
// terminating NUL, or (size_t)-1 when dest is too small.
static size_t CEscapeString(const char* src, size_t src_len,
                            char* dest, size_t dest_len) {
  size_t out = 0;

  for (size_t pos = 0; pos < src_len; pos++) {
    // Every byte needs at least room for a two-character escape.
    if (dest_len - out < 2)
      return (size_t)-1;

    const unsigned char ch = src[pos];

    // Bytes with a dedicated backslash escape map to the character that
    // follows the backslash; 0 means "no dedicated escape".
    char letter = 0;
    switch (ch) {
      case '\n': letter = 'n';  break;
      case '\r': letter = 'r';  break;
      case '\t': letter = 't';  break;
      case '\"': letter = '\"'; break;
      case '\'': letter = '\''; break;
      case '\\': letter = '\\'; break;
    }

    if (letter != 0) {
      dest[out++] = '\\';
      dest[out++] = letter;
    } else if (ch < ' ' || ch > '~') {
      // Outside the printable ASCII range: emit a backslash followed by
      // exactly three octal digits.  snprintf also writes a NUL, hence
      // the need for five bytes of room.
      if (dest_len - out < 5)
        return (size_t)-1;
      snprintf(dest + out, 5, "\\%03o", ch);
      out += 4;
    } else {
      dest[out++] = ch;
    }
  }

  // The terminating NUL needs a byte too, but is not counted in the
  // return value.
  if (dest_len - out < 1)
    return (size_t)-1;

  dest[out] = '\0';
  return out;
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// CEscape()
|
|
||||||
// Copies 'src' to result, escaping dangerous characters using
|
|
||||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// Returns a copy of src with dangerous characters replaced by C-style
// escape sequences (see CEscapeString).
//
// The scratch buffer is a string rather than a raw new[] allocation, so
// it is released on every path, including when an allocation throws, and
// the result is returned without copying it into a second string.
string CEscape(const StringPiece& src) {
  // Each input byte expands to at most four output bytes ("\ooo"); one
  // more byte is needed for the NUL that CEscapeString appends.
  const size_t dest_len = src.size() * 4 + 1;
  string dest(dest_len, '\0');
  const size_t used = CEscapeString(src.data(), src.size(),
                                    &dest[0], dest_len);
  // With dest_len sized as above CEscapeString always has enough room.
  // Were it ever to report (size_t)-1, resize() would throw
  // std::length_error, just as constructing a string of that length did
  // before.
  dest.resize(used);
  return dest;
}
|
|
||||||
|
|
||||||
// Rewrites *prefix in place: trailing '\xff' bytes are removed and the
// last remaining byte, if any, is incremented.  A string that is empty
// or consists only of '\xff' bytes becomes the empty string.
void PrefixSuccessor(string* prefix) {
  // Count how many leading bytes survive once trailing '\xff' bytes are
  // stripped.  Comparing against a char literal avoids signed/unsigned
  // trouble.
  string::size_type keep = prefix->size();
  while (keep > 0 && (*prefix)[keep - 1] == '\xff') {
    keep--;
  }
  prefix->resize(keep);

  if (keep > 0) {
    ++(*prefix)[keep - 1];
  }
}
|
|
||||||
|
|
||||||
// Appends the printf-style expansion of format/ap to *dst.
//
// A 1024-byte stack buffer is tried first.  If the output does not fit,
// a heap buffer is used: sized exactly when vsnprintf reports the needed
// length, or repeatedly doubled when it only reports failure (negative
// result).
//
// The heap buffer is a string, so it is released even if dst->append
// throws.  Doubling stops once the buffer has reached 1 GiB, because
// doubling again would overflow int; in that case nothing is appended.
static void StringAppendV(string* dst, const char* format, va_list ap) {
  // Largest buffer size, in bytes, that may still be doubled as an int.
  const int kMaxDoublingLength = 1 << 30;

  char space[1024];

  // vsnprintf may consume the va_list it is given, so every attempt
  // works on a fresh copy and leaves 'ap' itself untouched.
  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
    // It fit in the stack buffer.
    dst->append(space, result);
    return;
  }

  int length = sizeof(space);
  while (true) {
    if (result < 0) {
      // The required size is unknown: try a buffer twice as large,
      // unless that would overflow.
      if (length >= kMaxDoublingLength)
        return;
      length *= 2;
    } else {
      // We need exactly "result+1" characters.
      length = result + 1;
    }
    string buf(length, '\0');

    va_copy(backup_ap, ap);
    result = vsnprintf(&buf[0], length, format, backup_ap);
    va_end(backup_ap);

    if ((result >= 0) && (result < length)) {
      // It fit.
      dst->append(buf, 0, result);
      return;
    }
  }
}
|
|
||||||
|
|
||||||
// Formats the printf-style arguments and returns the text as a new string.
string StringPrintf(const char* format, ...) {
  string formatted;

  va_list args;
  va_start(args, format);
  StringAppendV(&formatted, format, args);
  va_end(args);

  return formatted;
}
|
|
||||||
|
|
||||||
// Replaces the contents of *dst with the printf-style formatting of the
// arguments.
//
// The text is built in a temporary and swapped in afterwards, so arguments
// that point into *dst (e.g. dst->c_str()) are still valid while they are
// being formatted, and *dst keeps its old value if formatting throws.
void SStringPrintf(string* dst, const char* format, ...) {
  string formatted;

  va_list ap;
  va_start(ap, format);
  StringAppendV(&formatted, format, ap);
  va_end(ap);

  dst->swap(formatted);
}
|
|
||||||
|
|
||||||
// Appends the printf-style formatting of the arguments to the end of *dst,
// keeping whatever *dst already holds.
void StringAppendF(string* dst, const char* format, ...) {
  va_list args;
  va_start(args, format);
  StringAppendV(dst, format, args);
  va_end(args);
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,23 +0,0 @@
|
|||||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_STRUTIL_H_
|
|
||||||
#define UTIL_STRUTIL_H_
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
string CEscape(const StringPiece& src);
|
|
||||||
void PrefixSuccessor(string* prefix);
|
|
||||||
string StringPrintf(const char* format, ...);
|
|
||||||
void SStringPrintf(string* dst, const char* format, ...);
|
|
||||||
void StringAppendF(string* dst, const char* format, ...);
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_STRUTIL_H_
|
|
@ -1,34 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#ifndef _WIN32
|
|
||||||
#include <sys/resource.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "util/test.h"
|
|
||||||
|
|
||||||
// Flag "test_tmpdir": temp directory for tests, defaulting to /var/tmp.
// NOTE(review): DEFINE_string presumably comes from util/flags.h (pulled in
// through util/test.h) - confirm.
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
|
||||||
|
|
||||||
// One registered test case: the function to run and the name printed
// before it runs.
struct Test {
  void (*fn)(void);
  const char *name;
};

// Fixed-capacity registry filled by RegisterTest() and walked by main().
// ntests is the number of slots of tests[] currently in use.
static int ntests = 0;
static Test tests[10000];
|
|
||||||
|
|
||||||
void RegisterTest(void (*fn)(void), const char *name) {
|
|
||||||
tests[ntests].fn = fn;
|
|
||||||
tests[ntests++].name = name;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
|
||||||
for (int i = 0; i < ntests; i++) {
|
|
||||||
printf("%s\n", tests[i].name);
|
|
||||||
tests[i].fn();
|
|
||||||
}
|
|
||||||
printf("PASS\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
@ -1,46 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_TEST_H_
|
|
||||||
#define UTIL_TEST_H_
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/flags.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
|
|
||||||
// TEST(x, y) { ... } defines a test function named x##y.
// The macro forward-declares the function, creates a global TestRegisterer
// that registers it under the name "x.y", and then opens the definition so
// the braces that follow the macro become the function body.
#define TEST(x, y)                          \
  void x##y(void);                          \
  TestRegisterer r##x##y(x##y, #x "." #y);  \
  void x##y(void)
|
|
||||||
|
|
||||||
// Adds a test function and its printable name to the list of tests to run.
void RegisterTest(void (*)(void), const char*);
|
|
||||||
|
|
||||||
class TestRegisterer {
|
|
||||||
public:
|
|
||||||
TestRegisterer(void (*fn)(void), const char *s) {
|
|
||||||
RegisterTest(fn, s);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
// The EXPECT_* assertions are plain aliases for the corresponding CHECK
// macros.
// TODO(rsc): Do a better job.
#define EXPECT_TRUE CHECK
#define EXPECT_FALSE(x) CHECK(!(x))
#define EXPECT_EQ CHECK_EQ
#define EXPECT_LT CHECK_LT
#define EXPECT_LE CHECK_LE
#define EXPECT_GT CHECK_GT
#define EXPECT_GE CHECK_GE
|
|
||||||
|
|
||||||
namespace testing {

// Stand-in heap-growth counter: it measures nothing.  The constructor
// argument is ignored, both growth queries always report 0 and Reset()
// does nothing.
class MallocCounter {
 public:
  static const int THIS_THREAD_ONLY = 0;

  MallocCounter(int) {}

  void Reset() {}
  long long HeapGrowth() { return 0; }
  long long PeakHeapGrowth() { return 0; }
};

}  // namespace testing
|
|
||||||
|
|
||||||
#endif // UTIL_TEST_H_
|
|
@ -1,44 +0,0 @@
|
|||||||
/*
|
|
||||||
* The authors of this software are Rob Pike and Ken Thompson.
|
|
||||||
* Copyright (c) 2002 by Lucent Technologies.
|
|
||||||
* Permission to use, copy, modify, and distribute this software for any
|
|
||||||
* purpose without fee is hereby granted, provided that this entire notice
|
|
||||||
* is included in all copies of any software which is or includes a copy
|
|
||||||
* or modification of this software and in all copies of the supporting
|
|
||||||
* documentation for such software.
|
|
||||||
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
|
|
||||||
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
|
|
||||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
|
||||||
*
|
|
||||||
* This file and rune.cc have been converted to compile as C++ code
|
|
||||||
* in name space re2.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef UTIL_UTF_H_
|
|
||||||
#define UTIL_UTF_H_
|
|
||||||
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
namespace re2 {

// A Unicode code point.  Code-point values in Unicode 4.0 are 21 bits wide.
typedef signed int Rune;

enum {
  UTFmax    = 4,         // maximum bytes per rune
  Runesync  = 0x80,      // values below this cannot be part of a UTF sequence
  Runeself  = 0x80,      // below this, rune and UTF sequence are the same
  Runeerror = 0xFFFD,    // decoding error in UTF
  Runemax   = 0x10FFFF,  // maximum rune value
};

// Rune <-> UTF conversion helpers.
// NOTE(review): the implementations are not visible from this header
// (presumably rune.cc); confirm return-value conventions there.
int runetochar(char* s, const Rune* r);
int chartorune(Rune* r, const char* s);
int fullrune(const char* s, int n);
int utflen(const char* s);
char* utfrune(const char*, Rune);

}  // namespace re2
|
|
||||||
|
|
||||||
#endif // UTIL_UTF_H_
|
|
@ -1,22 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef UTIL_UTIL_H_
|
|
||||||
#define UTIL_UTIL_H_
|
|
||||||
|
|
||||||
// TODO(junyer): Get rid of this.
|
|
||||||
#include <string>
|
|
||||||
using std::string;
|
|
||||||
|
|
||||||
// Element count of a fixed-size array, as an int.  The argument must be an
// actual array; the macro does not reject a pointer.
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))

// Marks a deliberate switch-case fall-through.  Expands to an empty
// statement unless a definition was supplied before this point.
#if !defined(FALLTHROUGH_INTENDED)
#define FALLTHROUGH_INTENDED do { } while (0)
#endif

// Expands to nothing unless a definition was supplied before this point.
#if !defined(NO_THREAD_SAFETY_ANALYSIS)
#define NO_THREAD_SAFETY_ANALYSIS
#endif
|
|
||||||
|
|
||||||
#endif // UTIL_UTIL_H_
|
|
1
contrib/re2
vendored
Submodule
1
contrib/re2
vendored
Submodule
@ -0,0 +1 @@
|
|||||||
|
Subproject commit 7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0
|
@ -1,50 +1,23 @@
|
|||||||
# Sources compiled into the re2 and re2_st libraries.
#
# The stand-alone driver sources (util/test.cc, util/benchmark.cc and
# util/fuzz.cc) are intentionally not listed: util/test.cc defines main()
# and the test registry, which must not be part of a library that is linked
# into other executables.
# NOTE(review): util/pcre.cc is kept as before; confirm whether the library
# itself needs it.
set (re2_sources
    ./re2/bitstate.cc
    ./re2/compile.cc
    ./re2/dfa.cc
    ./re2/filtered_re2.cc
    ./re2/mimics_pcre.cc
    ./re2/nfa.cc
    ./re2/onepass.cc
    ./re2/parse.cc
    ./re2/perl_groups.cc
    ./re2/prefilter.cc
    ./re2/prefilter_tree.cc
    ./re2/prog.cc
    ./re2/re2.cc
    ./re2/regexp.cc
    ./re2/set.cc
    ./re2/simplify.cc
    ./re2/stringpiece.cc
    ./re2/tostring.cc
    ./re2/unicode_casefold.cc
    ./re2/unicode_groups.cc
    ./util/pcre.cc
    ./util/rune.cc
    ./util/strutil.cc
)
|
|
||||||
|
|
||||||
# Building re2 which is thread-safe and re2_st which is not.
|
# Building re2 which is thread-safe and re2_st which is not.
|
||||||
# re2 changes its state during matching of regular expression, e.g. creates temporary DFA.
|
# re2 changes its state during matching of regular expression, e.g. creates temporary DFA.
|
||||||
# It uses RWLock to process the same regular expression object from different threads.
|
# It uses RWLock to process the same regular expression object from different threads.
|
||||||
# To avoid redundant locking in some cases, we also build a non-thread-safe version of the library (re2_st).
|
# To avoid redundant locking in some cases, we also build a non-thread-safe version of the library (re2_st).
|
||||||
|
|
||||||
add_library (re2 ${re2_sources})
|
set (RE2_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/re2/)
|
||||||
add_library (re2_st ${re2_sources})
|
|
||||||
|
|
||||||
target_compile_definitions (re2 PRIVATE NDEBUG)
|
get_target_property (RE2_SOURCES_ re2 SOURCES)
|
||||||
|
foreach (src ${RE2_SOURCES_})
|
||||||
|
list(APPEND RE2_ST_SOURCES ${RE2_SOURCE_DIR}/${src})
|
||||||
|
endforeach ()
|
||||||
|
|
||||||
|
add_library (re2_st ${RE2_ST_SOURCES})
|
||||||
target_compile_definitions (re2_st PRIVATE NDEBUG NO_THREADS re2=re2_st)
|
target_compile_definitions (re2_st PRIVATE NDEBUG NO_THREADS re2=re2_st)
|
||||||
|
target_include_directories (re2_st PRIVATE . PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${RE2_SOURCE_DIR})
|
||||||
target_include_directories (re2 PUBLIC .)
|
|
||||||
target_include_directories (re2_st PRIVATE . PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
|
|
||||||
|
|
||||||
file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/re2_st)
|
file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/re2_st)
|
||||||
foreach (FILENAME filtered_re2.h re2.h set.h stringpiece.h variadic_function.h)
|
foreach (FILENAME filtered_re2.h re2.h set.h stringpiece.h)
|
||||||
add_custom_command (OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/re2_st/${FILENAME}"
|
add_custom_command (OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/re2_st/${FILENAME}"
|
||||||
COMMAND ${CMAKE_COMMAND} -DSOURCE_FILENAME="${CMAKE_CURRENT_SOURCE_DIR}/re2/${FILENAME}"
|
COMMAND ${CMAKE_COMMAND} -DSOURCE_FILENAME="${RE2_SOURCE_DIR}/re2/${FILENAME}"
|
||||||
-DTARGET_FILENAME="${CMAKE_CURRENT_BINARY_DIR}/re2_st/${FILENAME}"
|
-DTARGET_FILENAME="${CMAKE_CURRENT_BINARY_DIR}/re2_st/${FILENAME}"
|
||||||
-P "${CMAKE_CURRENT_SOURCE_DIR}/re2_transform.cmake"
|
-P "${CMAKE_CURRENT_SOURCE_DIR}/re2_transform.cmake"
|
||||||
COMMENT "Creating ${FILENAME} for re2_st library.")
|
COMMENT "Creating ${FILENAME} for re2_st library.")
|
Loading…
Reference in New Issue
Block a user